In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import pickle
from collections import Counter
from sklearn.metrics import precision_recall_curve, roc_auc_score, confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from scipy.stats.mstats import gmean
import math
import datetime as dt
import matplotlib
font = {'size': 24}
matplotlib.rc('font', **font)
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
# Load the analysis sample (Stata file) and the pickled `predictors` object,
# which is used further down to select the model's input columns.
df_new = pd.read_stata("../degree_completion_1/df_new.dta")
# Open the pickle through a context manager so the file handle is closed as soon
# as loading finishes instead of being left to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open("../degree_completion_1/predictors.p", "rb") as predictors_file:
    predictors = pickle.load(predictors_file)

In [3]:
# Student -> college lookup; only the merge key and the college name are kept.
program = pd.read_csv("../degree_completion_6/college_program_race.csv")
program = program.loc[:,['vccsid', 'college']]

# Attach `college` to the analysis sample. The guard makes this cell safe to
# re-run: without it a second execution would merge again and pandas would
# rename the clashing columns to `college_x` / `college_y`, breaking every
# later `.college` filter.
if 'college' not in df_new.columns:
    # how='right' keeps every row of df_new, even when no college record matches.
    df_new = program.merge(df_new, on=['vccsid'], how='right')

# Rows with valid == 0 form the training sample; rows with valid == 1 form the
# held-out sample that is reported as "Validation" further down.
train_df_all = df_new[df_new.valid == 0]
test_df_all = df_new[df_new.valid == 1]

In [4]:
def calc_cw(y):
    """Compute class weights inversely proportional to the square root of class frequency.

    A class observed ``n_k`` times receives the weight ``sqrt(n_max / n_k)``,
    where ``n_max`` is the count of the most frequent class. The most frequent
    class therefore gets weight 1 and rarer classes get larger weights.

    Parameters
    ----------
    y : iterable of hashable labels
        The target values of the training sample.

    Returns
    -------
    dict
        Maps each distinct label in ``y`` to its weight (``np.float32``).
        An empty ``y`` yields an empty dict.
    """
    counts = Counter(y)
    # Computed once up front: the largest class count is the same for every
    # label, so there is no need to re-sort the counter inside the comprehension.
    # `default=0` keeps the empty-input case returning {} instead of raising.
    largest = max(counts.values(), default=0)
    return {label: np.sqrt(largest / n, dtype=np.float32) for label, n in counts.items()}

In [5]:
# Per-college random-forest hyperparameters, stored as
# [max_depth, n_estimators, max_features] (this is the order they are unpacked in below).
params_dict = {'JSRCC': [12, 120, 10],
               'JTCC': [12, 120, 8],
               'NVCC': [14, 120, 10],
               'TCC': [13, 120, 13],
               'TNCC': [12, 160, 9]}
# Maps the value found in the `college` column to the short code used in output file names.
cname = {'Northern_Virginia': "NVCC", "Tidewater": "TCC", "J._Sargeant_Reynolds": "JSRCC", "Thomas_Nelson": "TNCC", "John_Tyler": "JTCC"}

# Race indicator columns in priority order: when several indicators equal 1 for
# the same row, the first one listed here wins.
_RACE_PRIORITY = ['white', 'afam', 'hisp', 'asian', 'other']


def _dump_pickle(obj, path):
    """Pickle `obj` to `path`, closing the file handle once the write finishes."""
    with open(path, "wb") as fh:
        pickle.dump(obj, fh)


def _race_labels(df):
    """Return one race label per row of `df` as a NumPy string array.

    The label is the name of the first column in `_RACE_PRIORITY` whose value
    equals 1 for that row, or "mi" when none of the indicators equals 1.
    """
    conditions = [(df[col] == 1).to_numpy() for col in _RACE_PRIORITY]
    # np.select returns the choice of the first condition that is True, which
    # reproduces an if/elif chain over the indicators without a Python-level row loop.
    return np.select(conditions, _RACE_PRIORITY, default="mi")


for r2 in ['J._Sargeant_Reynolds', 'John_Tyler', 'Northern_Virginia', 'Tidewater', 'Thomas_Nelson']:

    # Load the training/validation sample for this college
    print(r2)
    r = cname[r2]
    train_df = train_df_all[train_df_all.college == r2]
    test_df = test_df_all[test_df_all.college == r2]
    print(train_df.shape,test_df.shape)

    # Persist the student ids that make up each sample
    train_df.loc[:,['vccsid']].to_stata("train_id_{}.dta".format(r), write_index=False)
    test_df.loc[:,['vccsid']].to_stata("test_id_{}.dta".format(r), write_index=False)

    # Fit a class-weighted random forest with this college's hyperparameters
    optimal_d, optimal_n, optimal_nf = params_dict[r]
    rf = RandomForestClassifier(n_estimators=optimal_n, criterion="entropy",
                                max_depth=optimal_d,
                                random_state=0, n_jobs=-1, max_features=optimal_nf,
                                class_weight = calc_cw(train_df.grad_6years))
    # Slice the predictor matrices once; each is reused for fitting and/or scoring.
    X_train = train_df.loc[:,predictors]
    X_test = test_df.loc[:,predictors]
    rf.fit(X_train, train_df.grad_6years)

    # Predicted scores: column 1 of predict_proba is the probability of the second
    # class in rf.classes_ (presumably grad_6years == 1 -- confirm the label coding).
    y_test_pred_rf = rf.predict_proba(X_test)[:,1]
    y_train_pred_rf = rf.predict_proba(X_train)[:,1]
    _dump_pickle(y_test_pred_rf, "y_test_pred_rf_{}.p".format(r))
    _dump_pickle(list(test_df.grad_6years), "y_test_{}.p".format(r))
    _dump_pickle(y_train_pred_rf, "y_train_pred_rf_{}.p".format(r))
    _dump_pickle(list(train_df.grad_6years), "y_train_{}.p".format(r))
    print("Random Forest:")
    print("Validation AUC = {}".format(round(roc_auc_score(test_df.grad_6years, y_test_pred_rf),4)))
    print("Training AUC = {}".format(round(roc_auc_score(train_df.grad_6years, y_train_pred_rf),4)))

    # Label every validation row with its race, then drop rows with no indicator set ("mi")
    race_column = _race_labels(test_df)
    has_race = race_column != "mi"
    pred_y = np.asarray(y_test_pred_rf)[has_race]
    test_y = np.asarray(test_df.grad_6years)[has_race]
    race_column = race_column[has_race]
    print(len(race_column), len(pred_y), len(test_y))

    # Save the predicted score and the observed outcome alongside the race label
    pred_score_by_race = pd.DataFrame({'race_column': race_column, 'pred_y': pred_y, 'test_y': test_y})
    pred_score_by_race.to_csv("pred_score_by_race_{}.csv".format(r), index=False)

J._Sargeant_Reynolds
(23333, 355) (4248, 355)
Random Forest:
Validation AUC = 0.8997
Training AUC = 0.9359
4248 4248 4248
John_Tyler
(13918, 355) (2839, 355)
Random Forest:
Validation AUC = 0.8864
Training AUC = 0.9456
2839 2839 2839
Northern_Virginia
(84954, 355) (18246, 355)
Random Forest:
Validation AUC = 0.8955
Training AUC = 0.9316
18246 18246 18246
Tidewater
(59596, 355) (11489, 355)
Random Forest:
Validation AUC = 0.8712
Training AUC = 0.9163
11489 11489 11489
Thomas_Nelson
(20788, 355) (4120, 355)
Random Forest:
Validation AUC = 0.8609
Training AUC = 0.9169
4120 4120 4120
