In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import pickle
from collections import Counter
from sklearn.metrics import precision_recall_curve, roc_auc_score, confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from scipy.stats.mstats import gmean
import math
import datetime as dt
import matplotlib
font = {'size': 24}
matplotlib.rc('font', **font)
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
# Read the full sample from the Stata file stored under ../degree_completion_1.
# Later cells split this frame into training and validation rows using its `valid` column.
df_new = pd.read_stata("../degree_completion_1/df_new.dta")

In [3]:
# Load the collection of predictor column labels used to select model features.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
# The context manager closes the file handle, which a bare open() call would leak.
with open("../degree_completion_1/predictors.p", "rb") as predictors_file:
    predictors = pickle.load(predictors_file)

In [4]:
# Partition the sample on the `valid` flag: rows with valid == 0 are used for
# training, rows with valid == 1 are held out (named "test" throughout).
train_df_all = df_new.loc[df_new["valid"] == 0]
test_df_all = df_new.loc[df_new["valid"] == 1]

In [5]:
def calc_cw(y):
    """Compute per-class weights for fitting a model on an imbalanced outcome.

    Each class is weighted by ``sqrt(n_most_common / n_class)``: the weight is
    inversely proportional to the square root of the class frequency in ``y``,
    and the most frequent class gets weight 1.

    Parameters
    ----------
    y : iterable of hashable
        Class labels of the training sample.

    Returns
    -------
    dict
        Maps each distinct label in ``y`` to its weight (``np.float32``).
        Empty when ``y`` is empty.
    """
    cw = Counter(y)
    if not cw:
        # Nothing to weight; matches the original behaviour for empty input.
        return {}
    # Size of the largest class, computed once instead of re-sorting the
    # counter via most_common() for every class.
    top_count = max(cw.values())
    class_weight = {k: np.sqrt(top_count / v, dtype=np.float32) for k, v in cw.items()}
    return class_weight

In [6]:
# Per-subgroup random-forest hyperparameters, stored in the order
# [max_depth, n_estimators, max_features]. How these values were selected is
# not shown in this notebook.
params_dict = {'white': [16, 140, 10],
               'afam': [15, 120, 11],
               'hisp': [13, 160, 10],
               'asian': [11,100,12],
               'other': [13,140,6]}

# Fit one forest per subgroup and export its feature importances, ranked from
# most to least important, to fi_<subgroup>.csv.
for r in ['white', 'afam', 'hisp', 'asian', 'other']:

    # Restrict the training/validation samples to rows flagged for this subgroup
    print(r)
    train_df = train_df_all[train_df_all[r] == 1]
    test_df = test_df_all[test_df_all[r] == 1]
    # Save the `vccsid` identifiers of each split
    train_df.loc[:,['vccsid']].to_stata("train_id_{}.dta".format(r), write_index=False)
    test_df.loc[:,['vccsid']].to_stata("test_id_{}.dta".format(r), write_index=False)
    print(train_df.shape,test_df.shape)

    optimal_d, optimal_n, optimal_nf = params_dict[r]
    rf = RandomForestClassifier(n_estimators=optimal_n, criterion="entropy",
                                max_depth=optimal_d,
                                random_state=0, n_jobs=-1, max_features=optimal_nf,
                                class_weight = calc_cw(train_df.grad_6years))
    rf.fit(train_df.loc[:,predictors], train_df.grad_6years)

    # `feature_importances_` is a computed property in scikit-learn, so read it
    # once and reuse a single descending sort order for both output columns
    # instead of recomputing the importances and the argsort repeatedly.
    importances = rf.feature_importances_
    order = np.argsort(importances)[::-1]
    xx = np.array(predictors)[order]  # predictor names, most important first
    yy = importances[order]           # matching importance values
    pd.DataFrame({'predictor':xx, 'fi':yy}).to_csv("fi_{}.csv".format(r), index=False)

white
(179919, 354) (32288, 354)
afam
(82784, 354) (16796, 354)
hisp
(25338, 354) (6370, 354)
asian
(18395, 354) (3625, 354)
other
(9690, 354) (2970, 354)


In [11]:
# Per-subgroup random-forest hyperparameters, stored in the order
# [max_depth, n_estimators, max_features].
params_dict = {'white': [16, 140, 10],
               'afam': [15, 120, 11],
               'hisp': [13, 160, 10],
               'asian': [11,100,12],
               'other': [13,140,6]}

# Fit one forest per subgroup, score both splits, persist the scores and
# report training/validation AUC.
for r in ['white', 'afam', 'hisp', 'asian', 'other']:

    # Restrict the training/validation samples to rows flagged for this subgroup
    print(r)
    train_df = train_df_all[train_df_all[r] == 1]
    test_df = test_df_all[test_df_all[r] == 1]
    print(train_df.shape,test_df.shape)

    optimal_d, optimal_n, optimal_nf = params_dict[r]
    rf = RandomForestClassifier(n_estimators=optimal_n, criterion="entropy",
                                max_depth=optimal_d,
                                random_state=0, n_jobs=-1, max_features=optimal_nf,
                                class_weight = calc_cw(train_df.grad_6years))
    rf.fit(train_df.loc[:,predictors], train_df.grad_6years)

    # Predicted probability of the positive class (column 1 of predict_proba)
    y_test_pred_rf = rf.predict_proba(test_df.loc[:,predictors])[:,1]
    y_train_pred_rf = rf.predict_proba(train_df.loc[:,predictors])[:,1]

    # Persist scores and true labels. Each file is written inside a context
    # manager so the handle is flushed and closed deterministically instead of
    # being left to the garbage collector.
    pickle_outputs = [
        ("y_test_pred_rf_{}.p".format(r), y_test_pred_rf),
        ("y_test_{}.p".format(r), list(test_df.grad_6years)),
        ("y_train_pred_rf_{}.p".format(r), y_train_pred_rf),
        ("y_train_{}.p".format(r), list(train_df.grad_6years)),
    ]
    for pickle_path, payload in pickle_outputs:
        with open(pickle_path, "wb") as pickle_file:
            pickle.dump(payload, pickle_file)

    print("Random Forest:")
    print("Validation AUC = {}".format(round(roc_auc_score(test_df.grad_6years, y_test_pred_rf),4)))
    print("Training AUC = {}".format(round(roc_auc_score(train_df.grad_6years, y_train_pred_rf),4)))

    # One row per validation observation: subgroup label, predicted score, true outcome
    pred_score_by_race = pd.DataFrame({'race_column': [r]*test_df.shape[0], 'pred_y': list(y_test_pred_rf), 'test_y': list(test_df.grad_6years)})
    pred_score_by_race.to_csv("pred_score_by_race_{}.csv".format(r), index=False)

white
(179919, 354) (32288, 354)
Random Forest:
Validation AUC = 0.8984
Training AUC = 0.9354
afam
(82784, 354) (16796, 354)
Random Forest:
Validation AUC = 0.8834
Training AUC = 0.9314
hisp
(25338, 354) (6370, 354)
Random Forest:
Validation AUC = 0.8937
Training AUC = 0.9415
asian
(18395, 354) (3625, 354)
Random Forest:
Validation AUC = 0.8821
Training AUC = 0.9378
other
(9690, 354) (2970, 354)
Random Forest:
Validation AUC = 0.8852
Training AUC = 0.9499


In [12]:
# Read the per-subgroup score files and stack them, in subgroup order, into a
# single combined CSV.
df_list = [
    pd.read_csv("pred_score_by_race_{}.csv".format(r))
    for r in ['white', 'afam', 'hisp', 'asian', 'other']
]
final = pd.concat(df_list)
final.to_csv("pred_score_by_race.csv", index=False)