In [None]:
# In project 4 I used linear gradient descent and got an underfitted model with a cost of 0.79.
# This time I'll try to do better by using a neural network and evaluating diagnostics.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.lines import Line2D
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.regularizers import l2
import numpy as np

In [None]:
def dataframe_to_xy(dataframe):
    """Split a DataFrame into a feature matrix and a target vector.

    The first column is treated as the target; every remaining column is
    treated as a feature.

    Args:
        dataframe: Object exposing ``to_numpy()`` that yields a 2-D array.

    Returns:
        Tuple ``(X, y)`` where ``X`` holds all columns after the first and
        ``y`` holds the first column.
    """
    values = dataframe.to_numpy()
    target, features = values[:, 0], values[:, 1:]
    return features, target

In [None]:
# Load the dataset from the Stata file 'binary.dta' (path is relative to the working directory)
xy_dataframe = pd.read_stata('binary.dta') # https://stats.oarc.ucla.edu/stata/dae/logistic-regression/

In [None]:
# Quick sanity check of the loaded frame before converting it to arrays
xy_dataframe.info()

In [None]:
# First column becomes the target y; the remaining columns become the feature matrix X
X, y = dataframe_to_xy(xy_dataframe)

In [None]:
# Peek at the first sample: its feature row, then its label
print(X[0])
print(y[0])

In [None]:
# Split the data into training / cross-validation / test sets (60% / 20% / 20%).
# random_state is fixed in both calls so the same split is produced on every run.

# Get 60% of the dataset as the training set. Put the remaining 40% in temporary variables: x_ and y_.
x_train, x_, y_train, y_ = train_test_split(X, y, test_size=0.40, random_state=1)

# Split the 40% subset above into two: one half for cross validation and the other for the test set
x_cv, x_test, y_cv, y_test = train_test_split(x_, y_, test_size=0.50, random_state=1)

# Delete temporary variables
del x_, y_

In [None]:
# Inspect the training set: its shape, then the first sample and its label
print(x_train.shape)
print(x_train[0]) # note the first value is several orders larger than the others, so I should normalize
print(y_train[0])

In [None]:
def draw_scatter_plot_3d(x1, x2, x3, y, x1_label='X axis', x2_label='Y axis', x3_label='Z axis', y_labels=('0', '1')):
    """Draw a 3-D scatter plot of three features, coloured by a binary label.

    Points whose label equals 0 are drawn in red; every other point is blue.

    Args:
        x1, x2, x3: Coordinates of each point along the x, y and z axes.
        y: Iterable of labels, one per point.
        x1_label, x2_label, x3_label: Axis titles.
        y_labels: Two legend captions; index 0 describes the red (label 0)
            points and index 1 the blue points. The default is an immutable
            tuple so it cannot be altered between calls; lists are still
            accepted.
    """
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111, projection='3d')
    colors = ['red' if label == 0 else 'blue' for label in y]
    ax.scatter(x1, x2, x3, c=colors, s=50, alpha=0.8)
    ax.set_xlabel(x1_label)
    ax.set_ylabel(x2_label)
    ax.set_zlabel(x3_label)
    # Proxy artists: the scatter uses per-point colours, so the legend entries
    # are built by hand instead of being derived from the plotted collection.
    legend_elements = [
        Line2D([0], [0], marker='o', color='w', label=y_labels[0], markerfacecolor='red', markersize=8),
        Line2D([0], [0], marker='o', color='w', label=y_labels[1], markerfacecolor='blue', markersize=8)
    ]
    ax.legend(handles=legend_elements, loc='best')
    plt.show()

In [None]:
# Plot the whole (unscaled) dataset in feature space, coloured by the target label
draw_scatter_plot_3d(X[:, 0], X[:, 1], X[:, 2], y, 'GRE', 'GPA', 'Rank', ['Not Admitted', 'Admitted'])

In [None]:
# Scale the features
# The scaler is fitted on the training set only and then reused unchanged for the
# cross-validation and test sets, so held-out data does not influence the scaling.
scaler_linear = StandardScaler()
x_train_scaled = scaler_linear.fit_transform(x_train)
x_cv_scaled = scaler_linear.transform(x_cv)
x_test_scaled = scaler_linear.transform(x_test)

In [None]:
# Compute a baseline for accuracy
# i.e. the share of the most frequent class among the training labels
unique, counts = np.unique(y_train, return_counts=True)
print(dict(zip(unique, counts)))
baseline_accuracy = counts.max() / counts.sum()
print("Majority baseline:", baseline_accuracy)

In [None]:
# Test a few different models
# Seed Python, NumPy and TensorFlow before any layer is created so that weight
# initialization (and the shuffling done later during fit) is repeatable from one
# "Restart & Run All" to the next. The value 1 matches the random_state used for
# the data splits.
tf.keras.utils.set_random_seed(1)

# Every candidate takes 3 input features and ends in a single sigmoid unit;
# they differ only in the number and width of the hidden ReLU layers.
model_1 = Sequential(
    [
        tf.keras.Input(shape=(3,)),
        Dense(3, activation = 'relu'),
        Dense(1, activation = 'sigmoid')
    ],
    name='model_1'
)
model_2 = Sequential(
    [
        tf.keras.Input(shape=(3,)),
        Dense(25, activation = 'relu'),
        Dense(15, activation = 'relu'),
        Dense(1, activation = 'sigmoid')
    ],
    name='model_2'
)
model_3 = Sequential(
    [
        tf.keras.Input(shape=(3,)),
        Dense(50, activation = 'relu'),
        Dense(25, activation = 'relu'),
        Dense(15, activation = 'relu'),
        Dense(1, activation = 'sigmoid')
    ],
    name='model_3'
)
model_4 = Sequential(
    [
        tf.keras.Input(shape=(3,)),
        Dense(35, activation = 'relu'),
        Dense(25, activation = 'relu'),
        Dense(15, activation = 'relu'),
        Dense(5, activation = 'relu'),
        Dense(1, activation = 'sigmoid')
    ],
    name='model_4'
)
# Order matters: later cells index into this list by position
models = [model_1, model_2, model_3, model_4]

In [None]:
# Compile, train and evaluate every candidate architecture.
# Initialize lists that will contain the accuracies and training histories for each model
# (entries are appended in the same order as `models`)
train_accuracies = []
cv_accuracies = []
histories = []

# Loop over each model
for model in models:
    # Setup the loss and optimizer
    # from_logits=False because the output layer already applies a sigmoid
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        metrics=[
            tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.5),
            tf.keras.metrics.AUC(name='auc')
        ]
    )
    print(f"Training {model.name}...")

    # Train the model
    # The CV set is passed as validation data so per-epoch val_* metrics are recorded in the history
    history = model.fit(
        x_train_scaled, 
        y_train,
        validation_data=(x_cv_scaled, y_cv),
        epochs=200,
        verbose=0
    )
    histories.append(history)
    print("Done!")

    # Final accuracy of the trained model on the training set
    train_results = model.evaluate(x_train_scaled, y_train, verbose=0, return_dict=True)
    train_accuracies.append(train_results['accuracy'])
    
    # Final accuracy on the cross-validation set; this is the figure used later for model selection
    cv_results = model.evaluate(x_cv_scaled, y_cv, verbose=0, return_dict=True)
    cv_accuracies.append(cv_results['accuracy'])
    print(model.name, cv_results)

In [None]:
# Show which metric names were recorded in the first model's training history
histories[0].history.keys()

In [None]:
def plot_history(history):
    """Plot the training and validation curves recorded for one training run.

    Two figures are shown one after the other: loss per epoch, then accuracy
    per epoch with the notebook-level ``baseline_accuracy`` drawn as a dotted
    red reference line.

    Args:
        history: Object whose ``history`` mapping contains the keys 'loss',
            'val_loss', 'accuracy' and 'val_accuracy'.
    """
    curves = history.history

    # Loss figure
    _, loss_ax = plt.subplots(figsize=(8, 4))
    loss_ax.plot(curves['loss'], label='Training loss')
    loss_ax.plot(curves['val_loss'], label='Validation loss')
    loss_ax.set_title('Loss over epochs')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('BinaryCrossentropy Loss')
    loss_ax.legend()
    plt.show()

    # Accuracy figure, with the majority-class baseline for reference
    _, acc_ax = plt.subplots(figsize=(8, 4))
    acc_ax.plot(curves['accuracy'], label='Training accuracy')
    acc_ax.plot(curves['val_accuracy'], label='Validation accuracy')
    acc_ax.axhline(y=baseline_accuracy, color='r', linestyle=':', label='Baseline accuracy')
    acc_ax.set_title('Accuracy over epochs')
    acc_ax.set_xlabel('Epoch')
    acc_ax.set_ylabel('Accuracy')
    acc_ax.legend()
    plt.show()

In [None]:
# Show the loss and accuracy curves of every architecture, in training order
for i, history in enumerate(histories):
    print(f'Model {i+1}')
    plot_history(history)

In [None]:
# Select the architecture with the highest cross-validation accuracy
# (np.argmax returns the first index when several models tie)
model_num = np.argmax(cv_accuracies)
model = models[model_num]
print(f'{model.name} has the best accuracy: {cv_accuracies[model_num]}')

In [None]:
# Model 4 has the best accuracy, but its loss graph shows validation loss rising while training loss falls - indicating overfitting.
# We could reduce the model's complexity (fewer layers/units), or add regularization.
# Since the other models are already smaller and have worse accuracy (aside from model 1), let's try adding regularization.

In [None]:
# Sweep L2 regularization strengths on the model_4 architecture.
# Initialize lists that will contain the accuracies, histories and models for each lambda
# (entries are appended in the same order as `reg_params`)
reg_train_accuracies = []
reg_cv_accuracies = []
reg_histories = []
reg_params = [1, 0.5, 0.2, 0.1, 0.05, 0.02, 0.01, 0.005, 0.002, 0.001]
reg_models = []

# Loop over each regularization strength
for lambda_ in reg_params:
    # Build the model
    # A fresh model per lambda; the L2 penalty is applied to the hidden layers' kernels only,
    # the output layer is left unregularized
    model = Sequential(
        [
            tf.keras.Input(shape=(3,)),
            Dense(35, activation = 'relu', kernel_regularizer=l2(lambda_)),
            Dense(25, activation = 'relu', kernel_regularizer=l2(lambda_)),
            Dense(15, activation = 'relu', kernel_regularizer=l2(lambda_)),
            Dense(5, activation = 'relu', kernel_regularizer=l2(lambda_)),
            Dense(1, activation = 'sigmoid')
        ],
        name='model_4'
    )
    reg_models.append(model)
    
    # Setup the loss and optimizer
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        metrics=[
            tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.5),
            tf.keras.metrics.AUC(name='auc')
        ]
    )
    print(f"Training {model.name} with lambda {lambda_}...")

    # Train the model
    # NOTE(review): 100 epochs here versus 200 in the unregularized sweep - confirm this is intentional
    history = model.fit(
        x_train_scaled, 
        y_train,
        validation_data=(x_cv_scaled, y_cv),
        epochs=100,
        verbose=0
    )
    reg_histories.append(history)
    print("Done!")

    # Final accuracy of the trained model on the training set
    train_results = model.evaluate(x_train_scaled, y_train, verbose=0, return_dict=True)
    reg_train_accuracies.append(train_results['accuracy'])
    
    # Final accuracy on the cross-validation set; used afterwards to pick the best lambda
    cv_results = model.evaluate(x_cv_scaled, y_cv, verbose=0, return_dict=True)
    reg_cv_accuracies.append(cv_results['accuracy'])
    print(lambda_, cv_results)

In [None]:
# Select the regularization strength whose model scored the highest cross-validation accuracy
# (np.argmax returns the first index when several tie)
reg_num = np.argmax(reg_cv_accuracies)
model = reg_models[reg_num]
# Report the accuracy of the regularized model that was just selected, not the
# best accuracy from the earlier unregularized architecture sweep.
print(f'{model.name} with lambda {reg_params[reg_num]} has the best accuracy: {reg_cv_accuracies[reg_num]}')

In [None]:
# Show the loss and accuracy curves for every regularization strength, in training order
for i, history in enumerate(reg_histories):
    print(f'Model {i+1} with lambda {reg_params[i]}')
    plot_history(history)

In [None]:
# Evaluate on the held-out test set.
# `model` is whatever the most recently run selection cell bound it to
# (in top-to-bottom order, the best regularized model).
test_results = model.evaluate(x_test_scaled, y_test, verbose=0, return_dict=True)
print(test_results)

In [None]:
# Model outputs for the test set, thresholded at 0.5 into hard 0/1 labels
# (outputs below 0.5 become 0, everything else becomes 1)
prediction = model.predict(x_test_scaled)
fx = [0 if p < 0.5 else 1 for p in prediction]

In [None]:
def draw_scatter_plot_3d_with_fx(x1, x2, x3, y, fx, x1_label='X axis', x2_label='Y axis', x3_label='Z axis'):
    """Overlay predicted labels on the actual labels in one 3-D scatter plot.

    Actual labels are drawn as large, faint circles (red for 0, blue
    otherwise); predictions are drawn on top as small triangles (pink for 0,
    light blue otherwise).

    Args:
        x1, x2, x3: Coordinates of each point along the x, y and z axes.
        y: Iterable of actual labels, one per point.
        fx: Iterable of predicted labels, one per point.
        x1_label, x2_label, x3_label: Axis titles.
    """
    def pick_colors(labels, zero_color, other_color):
        # A label equal to 0 gets zero_color; anything else gets other_color
        return [zero_color if label == 0 else other_color for label in labels]

    figure = plt.figure(figsize=(8, 6))
    axes = figure.add_subplot(111, projection='3d')

    # Ground truth first, then the predictions layered over the same coordinates
    axes.scatter(x1, x2, x3, c=pick_colors(y, 'red', 'blue'), s=50, alpha=0.4, label='Actual')
    axes.scatter(x1, x2, x3, c=pick_colors(fx, 'pink', 'lightblue'), s=20, alpha=0.8, label='Predicted', marker='^')

    axes.set_xlabel(x1_label)
    axes.set_ylabel(x2_label)
    axes.set_zlabel(x3_label)

    # (marker, caption, face colour) for each hand-built legend entry
    legend_spec = [
        ('o', 'Actual 0', 'red'),
        ('o', 'Actual 1', 'blue'),
        ('^', 'Predicted 0', 'pink'),
        ('^', 'Predicted 1', 'lightblue'),
    ]
    proxies = [
        Line2D([0], [0], marker=marker, color='w', label=caption, markerfacecolor=face, markersize=8)
        for marker, caption, face in legend_spec
    ]
    axes.legend(handles=proxies, loc='best')
    plt.show()

In [None]:
# Overlay actual and predicted labels on the unscaled test features
draw_scatter_plot_3d_with_fx(x_test[:, 0], x_test[:, 1], x_test[:, 2], y_test, fx, 'GRE', 'GPA', 'Rank')

In [None]:
# Compare test accuracy against the majority-class baseline computed from the training labels
accuracy_improvement = test_results['accuracy'] - baseline_accuracy
# Double quotes outside, single quotes inside: reusing the same quote character
# within an f-string replacement field is a SyntaxError before Python 3.12.
print(f"{test_results['accuracy']:.3f} is a {accuracy_improvement:.3f} improvement over the baseline {baseline_accuracy:.3f}")

In [None]:
# 0.725 is markedly better than the 0.21 accuracy I got in project 4