In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.lines import Line2D
import numpy as np

In [None]:
def dataframe_to_xy(dataframe):
    """Split a DataFrame into a feature matrix and a target vector.

    The first column is taken as the target; every remaining column is a feature.

    Args:
        dataframe: Object exposing ``to_numpy()`` that yields a 2-D array.

    Returns:
        tuple: ``(X, y)`` where ``X`` holds columns 1..end and ``y`` holds column 0.
    """
    values = dataframe.to_numpy()
    target, features = values[:, 0], values[:, 1:]
    return features, target

In [None]:
# Load the dataset from a Stata file in the working directory.
# Source: UCLA OARC logistic-regression tutorial,
# https://stats.oarc.ucla.edu/stata/dae/logistic-regression/
xy_dataframe = pd.read_stata('binary.dta')

In [None]:
# Quick sanity check of the loaded frame: columns, dtypes and non-null counts.
xy_dataframe.info()

In [None]:
# Column 0 becomes the target vector y; the remaining columns become the feature matrix X.
X, y = dataframe_to_xy(xy_dataframe)

In [None]:
# Majority-class baseline: the accuracy reached by always predicting the most
# frequent label. Computed over the full label vector y.
class_values, class_counts = np.unique(y, return_counts=True)
class_distribution = dict(zip(class_values, class_counts))
print(class_distribution)
baseline_accuracy = class_counts.max() / class_counts.sum()
print("Majority baseline:", baseline_accuracy)

In [None]:
# 60 / 20 / 20 split into training, cross-validation and test sets.
# Step 1: hold out 40% of the rows; the remaining 60% form the training set.
x_train, x_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.40, random_state=1)

# Step 2: cut the held-out 40% in half, one half for cross validation and one for testing.
x_cv, x_test, y_cv, y_test = train_test_split(x_holdout, y_holdout, test_size=0.50, random_state=1)

# The hold-out arrays were only an intermediate step; drop them.
del x_holdout, y_holdout

In [None]:
def draw_scatter_plot_3d(x1, x2, x3, y, x1_label='X axis', x2_label='Y axis', x3_label='Z axis', y_labels=('0', '1')):
    """Show a 3-D scatter plot of three features, coloured by a binary label.

    Args:
        x1, x2, x3: Coordinates of the points along the three axes.
        y: One label per point. Points whose label equals 0 are drawn red;
            any other label is drawn blue.
        x1_label, x2_label, x3_label: Captions for the three axes.
        y_labels: Two legend captions; index 0 names the red (label == 0)
            points and index 1 names the blue points. The default is an
            immutable tuple so it cannot be mutated across calls.
    """
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111, projection='3d')
    colors = ['red' if label == 0 else 'blue' for label in y]
    ax.scatter(x1, x2, x3, c=colors, s=50, alpha=0.8)
    ax.set_xlabel(x1_label)
    ax.set_ylabel(x2_label)
    ax.set_zlabel(x3_label)
    # Proxy handles: the scatter is a single artist, so the two-class legend
    # is built by hand from marker-only Line2D objects.
    legend_elements = [
        Line2D([0], [0], marker='o', color='w', label=y_labels[0], markerfacecolor='red', markersize=8),
        Line2D([0], [0], marker='o', color='w', label=y_labels[1], markerfacecolor='blue', markersize=8)
    ]
    ax.legend(handles=legend_elements, loc='best')
    plt.show()

In [None]:
# Visualise the three feature columns, coloured by the target label.
# NOTE(review): the axis names assume the feature columns are ordered
# GRE, GPA, Rank - confirm against the xy_dataframe.info() output.
draw_scatter_plot_3d(X[:, 0], X[:, 1], X[:, 2], y, 'GRE', 'GPA', 'Rank', ['Not Admitted', 'Admitted'])

In [None]:
# Candidate values for the one-at-a-time hyperparameter sweeps below.
# Each list is swept on its own while the other parameters keep their defaults.
min_samples_split_list = [2, 10, 20, 30, 50, 100, 200]
# None is passed through to RandomForestClassifier(max_depth=None), i.e. no explicit depth limit.
max_depth_list = [2, 4, 6, 8, 16, 32, None]
# Used as n_estimators (number of trees in the forest).
num_trees_list = [1, 2, 5, 10, 50, 100, 500]

In [None]:
# Sweep min_samples_split (all other hyperparameters at their defaults) and
# record train / validation accuracy for every candidate.
accuracy_list_train = []
accuracy_list_cv = []
for min_samples_split in min_samples_split_list:
    # Random forests are randomized; pin the seed (same value as the data
    # split) so the curve, and the choice made from it, is reproducible.
    model = RandomForestClassifier(min_samples_split=min_samples_split, random_state=1).fit(x_train, y_train)
    # accuracy_score expects (y_true, y_pred).
    accuracy_list_train.append(accuracy_score(y_train, model.predict(x_train)))
    accuracy_list_cv.append(accuracy_score(y_cv, model.predict(x_cv)))

In [None]:
# Train vs. validation accuracy for each min_samples_split candidate.
# Candidates sit at evenly spaced positions (their list index); the tick
# labels show the actual parameter values.
fig, ax = plt.subplots()
ax.plot(accuracy_list_train, label='Train')
ax.plot(accuracy_list_cv, label='Validation')
ax.set_title('Train x Validation metrics')
ax.set_xlabel('min_samples_split')
ax.set_ylabel('accuracy')
ax.set_xticks(range(len(min_samples_split_list)))
ax.set_xticklabels([str(value) for value in min_samples_split_list])
ax.legend()
plt.show()  # render the figure without echoing the Legend object's repr

In [None]:
# min_samples_split=30 looks like a good choice: it gives good validation accuracy without overfitting or sacrificing training accuracy.

In [None]:
# Sweep max_depth (all other hyperparameters at their defaults) and record
# train / validation accuracy for every candidate.
accuracy_list_train = []
accuracy_list_cv = []
for max_depth in max_depth_list:
    # Random forests are randomized; pin the seed (same value as the data
    # split) so the curve, and the choice made from it, is reproducible.
    model = RandomForestClassifier(max_depth=max_depth, random_state=1).fit(x_train, y_train)
    # accuracy_score expects (y_true, y_pred).
    accuracy_list_train.append(accuracy_score(y_train, model.predict(x_train)))
    accuracy_list_cv.append(accuracy_score(y_cv, model.predict(x_cv)))

In [None]:
# Train vs. validation accuracy for each max_depth candidate.
# Candidates sit at evenly spaced positions (their list index); the tick
# labels show the actual parameter values.
fig, ax = plt.subplots()
ax.plot(accuracy_list_train, label='Train')
ax.plot(accuracy_list_cv, label='Validation')
ax.set_title('Train x Validation metrics')
ax.set_xlabel('max_depth')
ax.set_ylabel('accuracy')
ax.set_xticks(range(len(max_depth_list)))
# Stringify the labels so the unlimited-depth candidate shows up as "None"
# instead of being handed to Matplotlib as a None label.
ax.set_xticklabels([str(value) for value in max_depth_list])
ax.legend()
plt.show()  # render the figure without echoing the Legend object's repr

In [None]:
# max_depth=4 gives the best validation accuracy without overfitting.

In [None]:
# Sweep the number of trees (all other hyperparameters at their defaults) and
# record train / validation accuracy for every candidate.
accuracy_list_train = []
accuracy_list_cv = []
for num_trees in num_trees_list:
    # Random forests are randomized; pin the seed (same value as the data
    # split) so the curve, and the choice made from it, is reproducible.
    model = RandomForestClassifier(n_estimators=num_trees, random_state=1).fit(x_train, y_train)
    # accuracy_score expects (y_true, y_pred).
    accuracy_list_train.append(accuracy_score(y_train, model.predict(x_train)))
    accuracy_list_cv.append(accuracy_score(y_cv, model.predict(x_cv)))

In [None]:
# Train vs. validation accuracy for each forest size.
# Candidates sit at evenly spaced positions (their list index); the tick
# labels show the actual parameter values.
fig, ax = plt.subplots()
ax.plot(accuracy_list_train, label='Train')
ax.plot(accuracy_list_cv, label='Validation')
ax.set_title('Train x Validation metrics')
ax.set_xlabel('number of trees')
ax.set_ylabel('accuracy')
ax.set_xticks(range(len(num_trees_list)))
ax.set_xticklabels([str(value) for value in num_trees_list])
ax.legend()
plt.show()  # render the figure without echoing the Legend object's repr

In [None]:
# It appears that every number of trees overfits, so I may as well keep the default of 100, which also has the highest validation accuracy.

In [None]:
# Final model: combines the values picked from the three sweeps.
# The seed is pinned (same value as the data split) so the metrics reported
# for this model are reproducible across runs.
random_forest_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=4,
    min_samples_split=30,
    random_state=1,
).fit(x_train, y_train)

In [None]:
# Accuracy of the tuned forest on each split. The test-set predictions are
# kept in `fx` because the prediction-overlay plot further down reuses them.
fx = random_forest_model.predict(x_test)
split_predictions = (
    ('train', random_forest_model.predict(x_train), y_train),
    ('cv', random_forest_model.predict(x_cv), y_cv),
    ('test', fx, y_test),
)
for split_name, predicted, actual in split_predictions:
    print(f"Metrics {split_name}:\n\tAccuracy score: {accuracy_score(predicted, actual):.4f}")

In [None]:
# In the end, the model could not beat the majority-class baseline accuracy.
# Surprisingly, even though the data is tabular, the neural network from project 8 performed better.

In [None]:
def draw_scatter_plot_3d_with_fx(x1, x2, x3, y, fx, x1_label='X axis', x2_label='Y axis', x3_label='Z axis'):
    """Show a 3-D scatter plot with predictions overlaid on the actual labels.

    Every point is drawn twice at the same coordinates: once as a large,
    faint circle coloured by its actual label and once as a small triangle
    coloured by its predicted label.

    Args:
        x1, x2, x3: Coordinates of the points along the three axes.
        y: Actual labels; 0 is drawn red, anything else blue.
        fx: Predicted labels; 0 is drawn pink, anything else light blue.
        x1_label, x2_label, x3_label: Captions for the three axes.
    """
    figure = plt.figure(figsize=(8, 6))
    axes = figure.add_subplot(111, projection='3d')

    # Layer 1: ground truth.
    actual_colors = ['red' if value == 0 else 'blue' for value in y]
    axes.scatter(x1, x2, x3, c=actual_colors, s=50, alpha=0.4, label='Actual')

    # Layer 2: model output, drawn on top of the same coordinates.
    predicted_colors = ['pink' if value == 0 else 'lightblue' for value in fx]
    axes.scatter(x1, x2, x3, c=predicted_colors, s=20, alpha=0.8, label='Predicted', marker='^')

    axes.set_xlabel(x1_label)
    axes.set_ylabel(x2_label)
    axes.set_zlabel(x3_label)

    # (marker, caption, face colour) for each hand-built legend entry.
    legend_spec = (
        ('o', 'Actual 0', 'red'),
        ('o', 'Actual 1', 'blue'),
        ('^', 'Predicted 0', 'pink'),
        ('^', 'Predicted 1', 'lightblue'),
    )
    handles = [
        Line2D([0], [0], marker=shape, color='w', label=caption, markerfacecolor=face, markersize=8)
        for shape, caption, face in legend_spec
    ]
    axes.legend(handles=handles, loc='best')
    plt.show()

In [None]:
# Overlay the test-set predictions (triangles) on the actual test labels (circles).
# NOTE(review): the axis names assume the feature columns are ordered
# GRE, GPA, Rank - confirm against the xy_dataframe.info() output.
draw_scatter_plot_3d_with_fx(x_test[:, 0], x_test[:, 1], x_test[:, 2], y_test, fx, 'GRE', 'GPA', 'Rank')