This script uses the base model to generate predicted scores on subgroup-specific samples (e.g., college-specific or first-term-specific samples). These subgroup-specific scores are used in subsequent steps to compare algorithmic biases.

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import pickle
from collections import Counter
from sklearn.metrics import precision_recall_curve, roc_auc_score, confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from scipy.stats.mstats import gmean
import math
import datetime as dt
import matplotlib
font = {'size': 24}
matplotlib.rc('font', **font)
import matplotlib.pyplot as plt
import seaborn as sns
import re


# Load the full sample and the pickled collection of predictor column names.
df_new = pd.read_stata("full_data_truncated_enlarged_new.dta")
# Open the pickle through a context manager so the file handle is closed
# as soon as the object has been read (the bare open() call leaked it).
with open("predictors_rf2.p", "rb") as predictors_file:
    predictors = pickle.load(predictors_file)


def _fill_not_enrolled(frame, columns, flag_column):
    """Overwrite `columns` with a sentinel on rows where `flag_column` != 1.

    For each column the sentinel is (column minimum - 1), i.e. a value
    strictly below every value observed in that column, so these rows stay
    distinguishable from real measurements. `frame` is modified in place.

    The row mask is computed once per flag column and applied with a boolean
    `.loc` assignment instead of a row-wise `DataFrame.apply`, which ran a
    Python-level loop over every row for every column.
    """
    # `!= 1` (rather than `== 0`) also selects rows whose flag is missing.
    not_enrolled = frame[flag_column] != 1
    for column in columns:
        sentinel = np.min(frame[column]) - 1
        frame.loc[not_enrolled, column] = sentinel


# Columns gated by the `enrolled_pre` flag.
l1 = ['coll_lvl_cred_earn', 'prop_comp_pre', 'cum_gpa_pre']
_fill_not_enrolled(df_new, l1, 'enrolled_pre')

# Columns gated by the `enrolled_nsc` flag.
l2 = ['admrate', 'gradrate', 'satvr25', 'satvr75', 'satmt25', 'satmt75', 'satwr25', 'satwr75', 'nsc_coll_type_1', 'nsc_coll_type_2', 'nsc_coll_type_3', 'nsc_coll_type_4', 'nsc_coll_type_5', 'nsc_coll_type_6', 'nsc_coll_type_7', 'nsc_coll_type_8']
_fill_not_enrolled(df_new, l2, 'enrolled_nsc')

# Term-level columns: one copy per term (su/fa/sp) and index 1..6, each gated
# by the matching `enrolled_<term><i>` flag.
l3 = ['degree_seeking', 'term_cred_att', 'term_gpa', 'prop_comp', 'withdrawn_prop_comp', 'lvl2_prop_comp', 'dev_prop_comp', 'repeat']
for t in ['su', 'fa', 'sp']:
    for i in range(1, 7):
        term = t + str(i)
        _fill_not_enrolled(df_new, [pp + "_" + term for pp in l3], 'enrolled_' + term)

# Term-level NSC columns, gated by the matching `enrolled_nsc_<term><i>` flag.
l4 = ['enrl_intensity_nsc']
for t in ['su', 'fa', 'sp']:
    for i in range(1, 7):
        term = t + str(i)
        _fill_not_enrolled(df_new, [pp + "_" + term for pp in l4], 'enrolled_nsc_' + term)

# Year-level columns, gated by the matching `enrolled_yr<i>` flag.
l5 = ['grants', 'sub_loans', 'unsub_loans', 'others']
for i in range(1, 7):
    _fill_not_enrolled(df_new, [pp + "_yr" + str(i) for pp in l5], 'enrolled_yr' + str(i))

# Remove the flag columns from the predictor list now that the sentinel
# values carry the same information.
# NOTE(review): the `enrolled_yr<i>` flags are not listed here, so they stay
# in `predictors` if they were there to begin with - confirm this is intended.
to_drop = ['enrolled_pre', 'enrolled_nsc'] + ['enrolled_' + t + str(i) for t in ['su', 'fa', 'sp'] for i in range(1,7)] + ['enrolled_nsc_' + t + str(i) for t in ['su', 'fa', 'sp'] for i in range(1,7)]
to_drop_lookup = set(to_drop)  # built once instead of once per predictor
predictors = [p for p in predictors if p not in to_drop_lookup]
print(len(predictors))

# Partition on the `valid` flag: rows with valid == 0 form the training
# sample, rows with valid == 1 form the validation sample.
train_df = df_new.loc[df_new["valid"] == 0]
test_df = df_new.loc[df_new["valid"] == 1]

294


In [2]:
# Save the preprocessed frame without its index; the subgroup cells below
# reload it from "df_new.dta" before merging in their grouping variable.
df_new.to_stata("df_new.dta", write_index=False)

In [3]:
# Save the filtered predictor list. The context manager guarantees the file is
# flushed and closed; the bare open() call left that to garbage collection.
with open("predictors.p", "wb") as predictors_out:
    pickle.dump(predictors, predictors_out)

In [7]:
def calc_cw(y):
    """Return a class-weight dictionary for model fitting.

    Each distinct label in `y` gets the weight sqrt(n_max / n_label), where
    n_label is how often that label occurs in `y` and n_max is the count of
    the most frequent label. The most frequent label therefore has weight 1.0
    and rarer labels get weights inversely proportional to the square root of
    their frequency.

    Parameters
    ----------
    y : iterable of hashable labels (e.g. the training outcome column).

    Returns
    -------
    dict mapping each label to its weight as a numpy float32; empty when `y`
    is empty.
    """
    counts = Counter(y)
    if not counts:
        return {}
    # Hoisted: the original re-sorted all counts (most_common) for every label.
    largest = max(counts.values())
    return {label: np.sqrt(largest / n, dtype=np.float32) for label, n in counts.items()}

In [8]:
# Hyper-parameters of the base random forest.
# NOTE(review): the "optimal" values are presumably the result of an earlier
# tuning step that is not part of this file - confirm their origin.
optimal_d = 16    # passed as max_depth
optimal_n = 120   # passed as n_estimators
optimal_nf = 12   # passed as max_features

# Class weights are derived from the label frequencies of the training sample.
rf_class_weight = calc_cw(train_df["grad_6years"])
rf = RandomForestClassifier(
    n_estimators=optimal_n,
    criterion="entropy",
    max_depth=optimal_d,
    max_features=optimal_nf,
    class_weight=rf_class_weight,
    random_state=0,
    n_jobs=-1,
)
# Fit on the training rows, using only the filtered predictor columns.
rf.fit(train_df[predictors], train_df["grad_6years"])

RandomForestClassifier(bootstrap=True,
            class_weight={0.0: 1.0, 1.0: 1.3931639}, criterion='entropy',
            max_depth=16, max_features=12, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=120, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [5]:
#### first-term-specific and returning-specific
# Build a per-student indicator: first_ind = 1 when the `available*` columns
# (searched from the 11th column onwards) sum to at most 1, and 0 otherwise.
# NOTE(review): presumably each `available*` column flags one observed term,
# so first_ind == 1 would mean "at most one term observed" - confirm.
first_ind = pd.read_stata("../degree_completion_1/full_data_truncated_enlarged_new.dta")
availability_cols = [
    name
    for name in first_ind.columns[10:]
    if name.startswith("available") and name != "available_sum"
]
first_ind["available_sum"] = 0
for name in availability_cols:
    first_ind["available_sum"] = first_ind["available_sum"] + first_ind[name]
first_ind["first_ind"] = (first_ind["available_sum"] <= 1).astype(int)
# Keep only the student id and the indicator for the later merge.
first_ind = first_ind[["vccsid", "first_ind"]]

In [9]:
# Score the base model separately on the two `first_ind` subsets of the
# validation sample and save one score file per subset.
df_new = pd.read_stata("df_new.dta")
df_new = df_new.merge(first_ind, on=['vccsid'], how='left')
test_df_all = df_new[df_new.valid == 1]
for r in [0,1]:
    test_df = test_df_all[test_df_all.first_ind == r]
    # Column 1 of predict_proba is the score for the second class in rf.classes_.
    y_test_pred_rf = rf.predict_proba(test_df.loc[:,predictors])[:,1]
    print("Random Forest:")
    print("Validation AUC = {}".format(round(roc_auc_score(test_df.grad_6years, y_test_pred_rf),4)))

    # Derive one race label per row from the indicator columns. np.select
    # keeps the first matching condition, so the priority order is
    # white > afam > hisp > asian > other; rows matching none are "mi".
    # This replaces a Python loop that did up to five .iloc lookups per row.
    race_labels = ["white", "afam", "hisp", "asian", "other"]
    race_column = np.select(
        [(test_df[label] == 1).values for label in race_labels],
        race_labels,
        default="mi",
    )
    pred_y = np.array(y_test_pred_rf)
    test_y = np.array(test_df.grad_6years)
    # Drop rows without any race indicator from all three aligned arrays.
    has_race = race_column != "mi"
    pred_y = pred_y[has_race]
    test_y = test_y[has_race]
    race_column = race_column[has_race]
    print(len(race_column), len(pred_y), len(test_y))

    pred_score_by_race = pd.DataFrame({'race_column': race_column, 'pred_y': pred_y, 'test_y': test_y})
    pred_score_by_race.to_csv("../degree_completion_5/full/pred_score_by_race_{}.csv".format(r), index=False)

Random Forest:
Validation AUC = 0.8935
47371 47371 47371
Random Forest:
Validation AUC = 0.8781
14678 14678 14678


In [12]:
# Score the base model separately on each degree-level (`deglvl`) subset of
# the validation sample and save one score file per subset.
df_new = pd.read_stata("df_new.dta")
program = pd.read_stata("../degree_completion_9/student_deglvl.dta")
# Right merge: keep every row of df_new, attaching `deglvl` where available.
df_new = program.merge(df_new, on=['vccsid'], how='right')
test_df_all = df_new[df_new.valid == 1]
for r in ['AA&S', 'AAS', 'CERT', 'CSC']:
    test_df = test_df_all[test_df_all.deglvl == r]
    # Column 1 of predict_proba is the score for the second class in rf.classes_.
    y_test_pred_rf = rf.predict_proba(test_df.loc[:,predictors])[:,1]
    print("Random Forest:")
    print("Validation AUC = {}".format(round(roc_auc_score(test_df.grad_6years, y_test_pred_rf),4)))

    # Derive one race label per row from the indicator columns. np.select
    # keeps the first matching condition, so the priority order is
    # white > afam > hisp > asian > other; rows matching none are "mi".
    # This replaces a Python loop that did up to five .iloc lookups per row.
    race_labels = ["white", "afam", "hisp", "asian", "other"]
    race_column = np.select(
        [(test_df[label] == 1).values for label in race_labels],
        race_labels,
        default="mi",
    )
    pred_y = np.array(y_test_pred_rf)
    test_y = np.array(test_df.grad_6years)
    # Drop rows without any race indicator from all three aligned arrays.
    has_race = race_column != "mi"
    pred_y = pred_y[has_race]
    test_y = test_y[has_race]
    race_column = race_column[has_race]
    print(len(race_column), len(pred_y), len(test_y))

    pred_score_by_race = pd.DataFrame({'race_column': race_column, 'pred_y': pred_y, 'test_y': test_y})
    pred_score_by_race.to_csv("../degree_completion_9/full/pred_score_by_race_{}.csv".format(r), index=False)

Random Forest:
Validation AUC = 0.9059
35978 35978 35978
Random Forest:
Validation AUC = 0.9064
13985 13985 13985
Random Forest:
Validation AUC = 0.896
2822 2822 2822
Random Forest:
Validation AUC = 0.85
5390 5390 5390


In [13]:
# Score the base model separately on five curriculum subsets of the
# validation sample and save one score file per subset.
df_new = pd.read_stata("df_new.dta")
program = pd.read_csv("../degree_completion_6/college_program_race.csv")
# Combined key "<curr>_<degree_lvl>", e.g. "213_A".
program.loc[:,'curr_degree_lvl'] = program.curr.astype(str) + "_" + program.degree_lvl
program = program.loc[:,['vccsid', 'curr_degree_lvl']]
# Right merge: keep every row of df_new, attaching the key where available.
df_new = program.merge(df_new, on=['vccsid'], how='right')
# Keep only the five curriculum/degree-level combinations of interest. The
# explicit copy makes the column assignment below act on an independent
# frame instead of a filtered slice (avoids chained-assignment ambiguity).
selected_keys = ["213_A", "697_A", "699_A", "880_A", "882_A"]
df_new = df_new[df_new.curr_degree_lvl.isin(selected_keys)].copy()
# Reduce the key to its first three characters (the curriculum code).
df_new['curr_degree_lvl'] = df_new.curr_degree_lvl.str[:3]
df_new = df_new.rename(columns = {'curr_degree_lvl': 'curr'})
test_df_all = df_new[df_new.valid == 1]
for r in ['699', '213', '880', '882', '697']:
    test_df = test_df_all[test_df_all.curr == r]
    # Column 1 of predict_proba is the score for the second class in rf.classes_.
    y_test_pred_rf = rf.predict_proba(test_df.loc[:,predictors])[:,1]
    print("Random Forest:")
    print("Validation AUC = {}".format(round(roc_auc_score(test_df.grad_6years, y_test_pred_rf),4)))

    # Derive one race label per row from the indicator columns. np.select
    # keeps the first matching condition, so the priority order is
    # white > afam > hisp > asian > other; rows matching none are "mi".
    # This replaces a Python loop that did up to five .iloc lookups per row.
    race_labels = ["white", "afam", "hisp", "asian", "other"]
    race_column = np.select(
        [(test_df[label] == 1).values for label in race_labels],
        race_labels,
        default="mi",
    )
    pred_y = np.array(y_test_pred_rf)
    test_y = np.array(test_df.grad_6years)
    # Drop rows without any race indicator from all three aligned arrays.
    has_race = race_column != "mi"
    pred_y = pred_y[has_race]
    test_y = test_y[has_race]
    race_column = race_column[has_race]
    print(len(race_column), len(pred_y), len(test_y))

    pred_score_by_race = pd.DataFrame({'race_column': race_column, 'pred_y': pred_y, 'test_y': test_y})
    pred_score_by_race.to_csv("../degree_completion_6/full/pred_score_by_race_{}.csv".format(r), index=False)

Random Forest:
Validation AUC = 0.8891
7222 7222 7222
Random Forest:
Validation AUC = 0.9046
5030 5030 5030
Random Forest:
Validation AUC = 0.8885
4229 4229 4229
Random Forest:
Validation AUC = 0.9109
4557 4557 4557
Random Forest:
Validation AUC = 0.9284
3963 3963 3963


In [14]:
# Score the base model separately on each college subset of the validation
# sample and save one score file per college.
df_new = pd.read_stata("df_new.dta")
program = pd.read_csv("../degree_completion_6/college_program_race.csv")
program = program.loc[:,['vccsid', 'college']]
# Right merge: keep every row of df_new, attaching `college` where available.
df_new = program.merge(df_new, on=['vccsid'], how='right')
test_df_all = df_new[df_new.valid == 1]

# NOTE(review): params_dict is not read anywhere in this cell; every subset
# is scored with the shared base model `rf`. It looks like per-college
# hyper-parameters kept from a college-specific script - confirm before removing.
params_dict = {'JSRCC': [12, 120, 10],
               'JTCC': [12, 120, 8],
               'NVCC': [14, 120, 10],
               'TCC': [13, 120, 13],
               'TNCC': [12, 160, 9]}
# Maps the `college` value in the data to the abbreviation used in file names.
cname = {'Northern_Virginia': "NVCC", "Tidewater": "TCC", "J._Sargeant_Reynolds": "JSRCC", "Thomas_Nelson": "TNCC", "John_Tyler": "JTCC"}
for r2 in ['J._Sargeant_Reynolds', 'John_Tyler', 'Northern_Virginia', 'Tidewater', 'Thomas_Nelson']:
    # Select this college's rows of the validation sample.
    print(r2)
    r = cname[r2]
    test_df = test_df_all[test_df_all.college == r2]
    # Column 1 of predict_proba is the score for the second class in rf.classes_.
    y_test_pred_rf = rf.predict_proba(test_df.loc[:,predictors])[:,1]
    print("Random Forest:")
    print("Validation AUC = {}".format(round(roc_auc_score(test_df.grad_6years, y_test_pred_rf),4)))

    # Derive one race label per row from the indicator columns. np.select
    # keeps the first matching condition, so the priority order is
    # white > afam > hisp > asian > other; rows matching none are "mi".
    # This replaces a Python loop that did up to five .iloc lookups per row.
    race_labels = ["white", "afam", "hisp", "asian", "other"]
    race_column = np.select(
        [(test_df[label] == 1).values for label in race_labels],
        race_labels,
        default="mi",
    )
    pred_y = np.array(y_test_pred_rf)
    test_y = np.array(test_df.grad_6years)
    # Drop rows without any race indicator from all three aligned arrays.
    has_race = race_column != "mi"
    pred_y = pred_y[has_race]
    test_y = test_y[has_race]
    race_column = race_column[has_race]
    print(len(race_column), len(pred_y), len(test_y))

    pred_score_by_race = pd.DataFrame({'race_column': race_column, 'pred_y': pred_y, 'test_y': test_y})
    pred_score_by_race.to_csv("../degree_completion_7/full/pred_score_by_race_{}.csv".format(r), index=False)

J._Sargeant_Reynolds
Random Forest:
Validation AUC = 0.9079
4248 4248 4248
John_Tyler
Random Forest:
Validation AUC = 0.8992
2839 2839 2839
Northern_Virginia
Random Forest:
Validation AUC = 0.8978
18246 18246 18246
Tidewater
Random Forest:
Validation AUC = 0.8749
11489 11489 11489
Thomas_Nelson
Random Forest:
Validation AUC = 0.8705
4120 4120 4120
