In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import pickle
from collections import Counter
from sklearn.metrics import precision_recall_curve, roc_auc_score, confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from scipy.stats.mstats import gmean
import math
import datetime as dt
import matplotlib
# Set the default matplotlib font size to 24 via the 'font' rc group.
font = {'size': 24}
matplotlib.rc('font', **font)
import matplotlib.pyplot as plt
import seaborn as sns
import re


# Load the full dataset, the pickled predictor list, and the downsampled
# training sample that the model is fitted on.
df_new = pd.read_stata("../degree_completion_1/df_new.dta")
# NOTE: unpickling can execute arbitrary code -- only load this file from a
# trusted source. The context manager closes the file handle deterministically.
with open("../degree_completion_1/predictors_rf2.p", "rb") as predictors_file:
    predictors = pickle.load(predictors_file)
train_df = pd.read_stata("train_df_downsampled.dta")
# Split the full dataset on the `valid` flag: rows with valid == 1 become the
# evaluation set, rows with valid == 0 the original training set.
test_df = df_new[df_new.valid == 1]
train_df_original = df_new[df_new.valid == 0]

In [2]:
def calc_cw(y):
    """Compute per-class weights for model fitting.

    Each class receives the weight ``sqrt(n_max / n_class)``, where ``n_max``
    is the count of the most frequent class in ``y`` and ``n_class`` is the
    count of the class itself. The most frequent class therefore gets weight
    1.0 and rarer classes get proportionally larger weights.

    Parameters
    ----------
    y : iterable of hashable
        Class labels of the training sample.

    Returns
    -------
    dict
        Mapping from class label to its weight (``np.float32``). An empty
        ``y`` yields an empty dict.
    """
    cw = Counter(y)
    if not cw:
        # Nothing to weight; avoids indexing into an empty frequency table.
        return {}
    # The largest class count is the same for every class, so compute it once
    # instead of re-sorting the counter for each entry.
    max_count = max(cw.values())
    return {k: np.sqrt(max_count / v, dtype=np.float32) for k, v in cw.items()}

In [3]:
# Random forest hyperparameters: maximum tree depth, number of trees, and
# number of features considered per split. (Named "optimal"; presumably chosen
# by a tuning step that is not shown in this notebook -- confirm.)
optimal_d, optimal_n, optimal_nf = 16, 100, 11

# Entropy-split forest with a fixed seed, using all cores; class weights are
# derived from the label frequencies of the downsampled training sample.
rf = RandomForestClassifier(
    criterion="entropy",
    n_estimators=optimal_n,
    max_depth=optimal_d,
    max_features=optimal_nf,
    class_weight=calc_cw(train_df["grad_6years"]),
    n_jobs=-1,
    random_state=0
)
rf.fit(train_df.loc[:, predictors], train_df["grad_6years"])

RandomForestClassifier(bootstrap=True, class_weight={0.0: 1.0, 1.0: 1.500234},
            criterion='entropy', max_depth=16, max_features=11,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [4]:
# Predicted scores: probability of the positive class (column 1 of
# predict_proba) for the validation set and for the original training set.
y_test_pred_rf = rf.predict_proba(test_df.loc[:,predictors])[:,1]
y_train_pred_rf = rf.predict_proba(train_df_original.loc[:,predictors])[:,1]

# Persist the scores and the matching true labels. Each file is written inside
# a context manager so the handle is flushed and closed as soon as the dump
# finishes, rather than whenever the file object happens to be collected.
for _pickle_path, _payload in (
    ("y_test_pred_rf.p", y_test_pred_rf),
    ("y_test.p", list(test_df.grad_6years)),
    ("y_train_pred_rf.p", y_train_pred_rf),
    ("y_train.p", list(train_df_original.grad_6years)),
):
    with open(_pickle_path, "wb") as _pickle_file:
        pickle.dump(_payload, _pickle_file)

# Report the area under the ROC curve on both sets, rounded to 4 decimals.
print("Random Forest:")
print("Validation AUC = {}".format(round(roc_auc_score(test_df.grad_6years, y_test_pred_rf),4)))
print("Training AUC = {}".format(round(roc_auc_score(train_df_original.grad_6years, y_train_pred_rf),4)))

Random Forest:
Validation AUC = 0.8964
Training AUC = 0.9095


In [6]:
# Derive one race label per validation row from the indicator columns.
# Precedence follows the order of `race_labels`: the first indicator equal to 1
# wins, and rows with no indicator set are labelled "mi".
# np.select evaluates the conditions in order and takes the first match, which
# reproduces an if/elif chain without a per-row Python loop.
race_labels = ["white", "afam", "hisp", "asian", "other"]
race_column = np.select(
    [np.asarray(test_df[label] == 1) for label in race_labels],
    race_labels,
    default="mi",
)

# Drop the rows without a race label from the labels, the predicted scores and
# the true outcomes, using one shared mask so the three arrays stay aligned.
known_race = race_column != "mi"
pred_y = np.array(y_test_pred_rf)[known_race]
test_y = np.array(test_df.grad_6years)[known_race]
race_column = race_column[known_race]
print(len(race_column), len(pred_y), len(test_y))

# Save the aligned (race, predicted score, true outcome) triples.
pred_score_by_race = pd.DataFrame({'race_column': race_column, 'pred_y': pred_y, 'test_y': test_y})
pred_score_by_race.to_csv("pred_score_by_race.csv", index=False)

62049 62049 62049
