In [1]:
import pickle
import pandas as pd
import numpy as np
from scipy.stats.mstats import gmean
import sklearn
import statsmodels.api as sm
from statsmodels.tools import add_constant
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod import families
from sklearn.metrics import precision_recall_curve, roc_auc_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
import seaborn as sns
import matplotlib.pyplot as plt
import re

fpath = "/Users/ys8mz/Box Sync/Predictive Models of College Completion (VCCS)/intermediate_files"

In [2]:
df = pd.read_stata(fpath + "/full_data_truncated.dta")

In [3]:
predictors = list(df.columns)[10:]
print(len(predictors))
impute_list_1 = set(["prop_comp_pre","cum_gpa_pre"])
impute_list_2 = set([t1+"_"+t2+str(t3) for t1 in ["term_gpa", "prop_comp", "lvl2_prop_comp", "dev_prop_comp"] for t2 in ["fa", "sp", "su"] for t3 in range(1,7,1)])
impute_list_3 = set(["cum_gpa", "lvl2_prop_comp", "dev_prop_comp", "prop_comp", "prop_comp_sd", "withdrawn_prop_comp_sd"])
impute_list_4 = set(["admrate", "gradrate", "satvr25", "satvr75", "satmt25", "satmt75", "satwr25", "satwr75"])

331


#### (0) Create the predictor name crosswalk

In [1]:
term_predictors = [p for p in predictors if re.search("_(yr|sp|su|fa)[1-6]$", p)]
non_term_predictors = np.setdiff1d(predictors, term_predictors)
term_predictors_prototype = np.unique([p[:-4] for p in term_predictors if p.endswith("_fa1")])
yr_predictors_prototype = np.unique([p[:-4] for p in term_predictors if p.endswith("_yr1")])

In [113]:
print(yr_predictors_prototype)
yr_predictors = [p+y for y in ["_yr{}".format(i) for i in range(1,7,1)] for p in yr_predictors_prototype]

['grants' 'others' 'sub_loans' 'unsub_loans']


In [114]:
yr_predictors_prototype_2 = ['logarithm of total grants received in',
                             'logarithm of other aids received in',
                             'logarithm of subsidized loans received in',
                             'logarithm of unsubsidized loans received in']
yr_predictors_2 = [p+y for y in [" year {}".format(i) for i in range(1,7,1)] for p in yr_predictors_prototype_2]

In [115]:
print(term_predictors_prototype)
term_predictors = [p + "_{0}{1}".format(t,i) for i in range(1,7,1) for t in ['fa','sp','su'] for p in term_predictors_prototype]

['available' 'degree_seeking' 'dev_prop_comp' 'enrl_intensity_nsc'
 'enrolled' 'enrolled_nsc' 'lvl2_prop_comp' 'pell_0' 'pell_1' 'prop_comp'
 'repeat' 'term_cred_att' 'term_gpa' 'withdrawn_prop_comp']


In [116]:
term_dict = {1: '1st', 2:'2nd', 3:'3rd', 4:'4th', 5:'5th', 6:'6th',
             'su': 'summer', 'sp': 'spring', 'fa': 'fall'}
term_predictors_prototype_2 = ['indicator for data availability in',
                               'indicator for whether student is in degree-seeking status in',
                               'proportion of attempted credits of developmental courses in',
                               'total enrollment intensity in non-VCCS institutions in',
                               'indicator for whether student is actively enrolled in VCCS in',
                               'indicator for whether student is actively enrolled in non-VCCS institutions in',
                               'proprotion of attempted credits of 2XX level courses in',
                               'indicator for not Pell-eligibile',
                               'indicator for Pell-eligible',
                               'proportion of earned credits among attempted credits in',
                               'indicator for whether student repeated a previously taken course in',
                               'number of credit hours attempted in',
                               'term GPA in',
                               'proportion of withdrawn credits among attempted credits']
term_predictors_2 = [p + " the {0} {1} term".format(term_dict[i], term_dict[t]) for i in range(1,7,1) for t in ['fa','sp','su'] for p in term_predictors_prototype_2]

In [117]:
print(non_term_predictors)

['admrate' 'afam' 'age_entry' 'coll_lvl_cred_earn' 'college_entropy'
 'cum_gpa' 'cum_gpa_pre' 'dev_prop_comp' 'dual_ind' 'enrl_intensity_trend'
 'enrolled_nsc' 'enrolled_pre' 'gpa_trend' 'gradrate' 'hisp'
 'lvl2_prop_comp' 'male' 'nsc_coll_type_1' 'nsc_coll_type_2'
 'nsc_coll_type_3' 'nsc_coll_type_4' 'nsc_coll_type_5' 'nsc_coll_type_6'
 'nsc_coll_type_7' 'nsc_coll_type_8' 'nsc_terms' 'num_nsc_coll' 'other'
 'pell_0_ind' 'pell_1_ind' 'phe_1' 'phe_2' 'phe_3' 'phe_4' 'phe_5' 'phe_6'
 'phe_7' 'pre_nsc_terms' 'pre_num_nsc_coll' 'program_chng_ind' 'prop_comp'
 'prop_comp_pre' 'prop_comp_sd' 'repeat_ind' 'satmt25' 'satmt75' 'satvr25'
 'satvr75' 'satwr25' 'satwr75' 'seamless_enrollee_0' 'seamless_enrollee_1'
 'white' 'withdrawn_prop_comp' 'withdrawn_prop_comp_sd']


In [118]:
non_term_predictors_2 = ['(weighted) average of admission rates of all non-VCCS institutions attended',
                         'indicator for African American',
                         'age at initial enrollment at VCCS',
                         'number of cumulative college-level credit hours earned prior to initial enrollment at VCCS',
                         'negative of logarithm of the maximum proportion of cumulative credits attempted at one VCCS institution',
                         'Cumulative GPA through the end of observation window',
                         'Cumulative GPA prior to initial enrollment term at VCCS',
                         'overall proportion of attempted credits of developmental courses',
                         'indicator for dual enrollment prior to initial enrollment term',
                         'slope of term-level number of credits attempted through the end of observation window',
                         'indicator for whether student was ever enrolled in any non-VCCS institutions since initial enrollment term',
                         'indicator for whether student was ever enrolled in VCCS prior to initial enrollment term',
                         'slope of term GPA through the end of observation window',
                         '(weighted) average of graduation rates of all non-VCCS institutions attended',
                         'indicator for Hispanic',
                         'overall proprotion of attempted credits of 2XX level courses',
                         'indicator for male',
                         'indicator for two-year, private, out-of-state',
                         'indicator for two-year, private, in-state',
                         'indicator for two-year, public, out-of-state',
                         'indicator for two-year, public, in-state',
                         'indicator for four-year, private, out-of-state',
                         'indicator for four-year, private, in-state',
                         'indicator for four-year, public, out-of-state',
                         'indicator for four-year, public, in-state',
                         'number of terms in which student was enrolled in non-VCCS institutions since initial enrollment term',
                         'number of non-VCCS institutions in which student was enrolled since initial enrollment term',
                         'indicator for other race/ethnicity',
                         'indicator for never pell-eligible',
                         'indicator for ever pell-eligible',
                         'indicator for highest parental education being less than high school',
                         'indicator for highest parental education being having attended high school',
                         'indicator for highest parental education being having graduated from high school',
                         'indicator for highest parental education being having attended college',
                         "indicator for highest parental education being having earned Associate's degree",
                         "indicator for highest parental education being having earned Bachelor's degree",
                         "indicator for highest parental education being having earned Post-Bachelor's degree",
                         'number of terms in which student was enrolled in non-VCCS institutions prior to initial enrollment term',
                         'number of non-VCCS institutions in which student was enrolled prior to initial enrollment term',
                         'indicator for whether student changed degree/major program pursued',
                         'overall proportion of earned credits among attempted credits since initial enrollment term',
                         'overall proportion of earned credits among attempted credits prior to initial enrollment term',
                         'standard deviation of term-level proportion of earned credits among attempted credits since initial enrollment term',
                         'indicator for whether student has ever repeated a previously taken course',
                         '(weighted) average of the 1st quartiles of SAT math scores of all non-VCCS institutions attended',
                         '(weighted) average of the 3rd quartiles of SAT math scores of all non-VCCS institutions attended',
                         '(weighted) average of the 1st quartiles of SAT verbal scores of all non-VCCS institutions attended',
                         '(weighted) average of the 3rd quartiles of SAT verbal scores of all non-VCCS institutions attended',
                         '(weighted) average of the 1st quartiles of SAT writing scores of all non-VCCS institutions attended',
                         '(weighted) average of the 3rd quartiles of SAT writing scores of all non-VCCS institutions attended',
                         'indicator for not a seamless enrollee',
                         'indicator for seamless enrollee',
                         'indicator for White',
                         'overall proportion of withdrawn credits among attempted credits since initial enrollment term',
                         'standard deviation of term-level proportion of withdrawn credits among attempted credits since initial enrollment term']

In [119]:
predictors_1 = list(non_term_predictors) + term_predictors + yr_predictors
predictors_2 = list(non_term_predictors_2) + term_predictors_2 + yr_predictors_2

In [124]:
pd.DataFrame({'predictor_name':predictors_1,
              'predictor_meaning':predictors_2}).loc[:,['predictor_name', 'predictor_meaning']].to_csv(fpath + "predictor_name_crosswalk.csv", index=False)

#### (1) Data pre-processing: missing value imputation

In [90]:
train_df = df[df.valid == 0]
test_df = df[df.valid == 1]
print(train_df.shape,test_df.shape)

(298139, 341) (33115, 341)


In [91]:
# degree completion rate of validation sample
sum(train_df.grad_6years)/train_df.shape[0]

0.342028382734228

In [92]:
# degree completion rate of validation sample
sum(test_df.grad_6years)/test_df.shape[0]

0.34144647440736825

In [93]:
def impute(train, test):
    for p in impute_list_1:
        avg_p = np.nanmean(train[train.enrolled_pre == 1][p])
        train.loc[:,p] = train.loc[:,p].apply(lambda x: avg_p if pd.isnull(x) else x)
        test.loc[:,p] = test.loc[:,p].apply(lambda x: avg_p if pd.isnull(x) else x)
    for p in impute_list_3:
        avg_p = np.nanmean(train[p])
        train.loc[:,p] = train.loc[:,p].apply(lambda x: avg_p if pd.isnull(x) else x)
        test.loc[:,p] = test.loc[:,p].apply(lambda x: avg_p if pd.isnull(x) else x)
    for p in impute_list_2:
        suffix = p[-3:]
        avg_p = np.nanmean(train[train["enrolled_" + suffix] == 1][p])
        train.loc[:,p] = train.loc[:,p].apply(lambda x: avg_p if pd.isnull(x) else x)
        test.loc[:,p] = test.loc[:,p].apply(lambda x: avg_p if pd.isnull(x) else x)
    for p in impute_list_4:
        avg_p = np.nanmean(train[train["enrolled_nsc"] == 1][p])
        train.loc[:,p] = train.loc[:,p].apply(lambda x: avg_p if pd.isnull(x) else x)
        test.loc[:,p] = test.loc[:,p].apply(lambda x: avg_p if pd.isnull(x) else x)
    return train, test                    

In [94]:
train_df_new, test_df_new = impute(train_df, test_df)
X_train = train_df_new.loc[:,predictors]
y_train = train_df_new.grad_6years
X_test = test_df_new.loc[:,predictors]
y_test = test_df_new.grad_6years

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [96]:
results_dir = "C:\\Users\\ys8mz\\Box Sync\\Predictive Models of College Completion (VCCS)\\evaluation_results\\truncated_predictors\\"

#### (2) Run the basic version of logistic regression

In [99]:
lr = sm.Logit(y_train, add_constant(X_train,prepend=True)).fit()

Optimization terminated successfully.
         Current function value: 0.405821
         Iterations 9


In [101]:
y_test_pred_lr = list(lr.predict(add_constant(X_test, prepend=True)))
print("Logistic Regression:")
print("AUC = {}".format(round(roc_auc_score(y_test, y_test_pred_lr),4)))

Logistic Regression:
AUC = 0.8837


In [102]:
# save the model object and predicted scores on the validation sample to local disk
pickle.dump(lr, open(results_dir + "/lr.p", "wb"))
pickle.dump(list(y_test_pred_lr), open(fpath + "/y_test_pred_lr.p", "wb"))

In [15]:
# Save the coefficients along with standard errors and p-values of the Logit model to disk
# (Used for generating Appendix Table A6 of the paper)
lr_summary = pd.DataFrame({'coef': lr.params, 'std_err': lr.bse, 'p-values': lr.pvalues}, index=lr.params.index).loc[:,['coef','std_err','p-values']]
lr_summary.to_csv(results_dir + "lr_summary.csv", index=True)

In [16]:
def find_optimal_threshold(p,r,t):
    to_drop = np.union1d(np.where(pd.isnull(p[:-1]) == True)[0], np.where(pd.isnull(r[:-1]) == True)[0])
    to_drop = np.union1d(to_drop, np.where(pd.isnull(t) == True)[0])
    to_keep = np.setdiff1d(np.array(list(range(len(p)-1))), to_drop)
    p,r,t = p[to_keep],r[to_keep],t[to_keep]
    f1 = 2*p*r/(p+r)
    best_t = t[np.argmax(f1)]
    best_t
    return best_t

In [17]:
def cross_validation_lr(train):
    threshold_list = []
    auc_list = []
    k_fold =  StratifiedKFold(n_splits = 10, random_state = 12345, shuffle=True)
    for train_indices, test_indices in k_fold.split(train, train.grad_6years):
        train_part = train.iloc[train_indices,:]
        test_part = train.iloc[test_indices,:]
        train_part_new, test_part_new = impute(train_part, test_part)
        X_1 = train_part_new.loc[:,predictors]
        y_1 = train_part_new.grad_6years
        X_2 = test_part_new.loc[:,predictors]
        y_2 = test_part_new.grad_6years
        model = sm.Logit(y_1, add_constant(X_1,prepend=True)).fit()
        p,r,t = precision_recall_curve(y_2, model.predict(add_constant(X_2, prepend=True)))
        auc = roc_auc_score(y_2, model.predict(add_constant(X_2, prepend=True)))
        threshold_list.append(find_optimal_threshold(p,r,t))
        auc_list.append(auc)
    print(threshold_list)
    print(np.mean(auc_list), np.std(auc_list, ddof=1))
    return gmean(threshold_list)                                  

In [18]:
best_threshold = cross_validation_lr(train_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Optimization terminated successfully.
         Current function value: 0.405406
         Iterations 9


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Optimization terminated successfully.
         Current function value: 0.406092
         Iterations 9


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


         Current function value: 0.405902
         Iterations: 35


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Optimization terminated successfully.
         Current function value: 0.406007
         Iterations 9


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Optimization terminated successfully.
         Current function value: 0.405737
         Iterations 9


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Optimization terminated successfully.
         Current function value: 0.405229
         Iterations 9


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Optimization terminated successfully.
         Current function value: 0.405290
         Iterations 9


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Optimization terminated successfully.
         Current function value: 0.405583
         Iterations 9


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Optimization terminated successfully.
         Current function value: 0.406413
         Iterations 9


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Optimization terminated successfully.
         Current function value: 0.405819
         Iterations 9
[0.39001663305004247, 0.41422582916072803, 0.384215963324511, 0.4021071752982359, 0.38682440811220364, 0.40116498914814935, 0.3725739464670538, 0.3842004988610529, 0.4234501045583369, 0.9969173080745926]
0.8810053844794516 0.002035635902280419


In [20]:
print(best_threshold)

0.43344163040148626

In [140]:
def create_confusion_matrix(y_test_pred, threshold, fname):
    cm_arr = confusion_matrix(y_test, np.where(y_test_pred > threshold, 1, 0))
    cm_df = pd.DataFrame(cm_arr, columns=['Pred_0','Pred_1'], index=['Real_0', 'Real_1'])
    cm_df.loc[:,''] = cm_df.sum(axis=1)
    cm_df.loc['',:] = cm_df.sum(axis=0)
    print(cm_df)
    print("")
    p1 = cm_df.iloc[1,1]/cm_df.iloc[2,1]
    r1 = cm_df.iloc[1,1]/cm_df.iloc[1,2]
    p0 = cm_df.iloc[0,0]/cm_df.iloc[2,0]
    r0 = cm_df.iloc[0,0]/cm_df.iloc[0,2]    
    print("F1 score = {}".format(round(2*p1*r1/(p1+r1),4)))    
    cm_df.to_csv(results_dir + fname + ".csv")
    return p1,r1,p0,r0,round(2*p1*r1/(p1+r1),4)

In [141]:
print("F1 threshold = {}:\n".format(str(round(best_threshold,3))))
pr_lr = create_confusion_matrix(y_test_pred_lr, best_threshold, "LR1_cm1")

F1 threshold = 0.433:

         Pred_0   Pred_1         
Real_0  18789.0   3019.0  21808.0
Real_1   2854.0   8453.0  11307.0
        21643.0  11472.0  33115.0

F1 score = 0.7422


In [150]:
num_of_0 = int(round((1-np.mean(train_df.grad_6years))*len(y_test)))
y_test_pred_binary = np.ones(len(y_test))
y_test_pred_binary[np.argsort(y_test_pred_lr)[:num_of_0]] = 0
alternative_threshold = y_test_pred_lr[np.argsort(y_test_pred_lr)[num_of_0]]
print("Alternative threshold = {}:\n".format(str(round(alternative_threshold,3))))
pr2_lr = create_confusion_matrix(y_test_pred_binary, best_threshold, "LR1_cm2")

Alternative threshold = 0.44:

         Pred_0   Pred_1         
Real_0  18873.0   2935.0  21808.0
Real_1   2916.0   8391.0  11307.0
        21789.0  11326.0  33115.0

F1 score = 0.7415


In [161]:
precision_recall_df = pd.DataFrame([(best_threshold,)+pr_lr,(alternative_threshold,)+pr2_lr]).round(4)
precision_recall_df.index = ['F1','Same_Graduation_Rate']
precision_recall_df.columns = ['threshold','precision_1','recall_1','precision_0','recall_0','f1_score']
precision_recall_df.to_csv(results_dir + "LR1_precision_recall.csv", index=True)

#### (3) Run the basic version of OLS

In [104]:
ols = sm.OLS(y_train, add_constant(X_train,prepend=True)).fit()

In [105]:
y_test_pred_ols = list(ols.predict(add_constant(X_test, prepend=True)))
print("OLS:")
print("AUC = {}".format(round(roc_auc_score(y_test, y_test_pred_ols),4)))

OLS:
AUC = 0.8795


In [106]:
pickle.dump(ols, open(results_dir + "/ols.p", "wb"))
pickle.dump(list(y_test_pred_ols), open(fpath + "/y_test_pred_ols.p", "wb"))

In [122]:
# Save the coefficients along with standard errors and p-values of the OLS model to disk
# (Used for generating Appendix Table A6 of the paper)
ols_summary = pd.DataFrame({'coef': ols.params, 'std_err': ols.bse, 'p-values': ols.pvalues}, index=ols.params.index).loc[:,['coef','std_err','p-values']]
ols_summary.to_csv(results_dir + "ols_summary.csv", index=True)

In [126]:
def cross_validation_ols(train):
    threshold_list = []
    auc_list = []
    k_fold =  StratifiedKFold(n_splits = 10, random_state = 12345, shuffle=True)
    for train_indices, test_indices in k_fold.split(train, train.grad_6years):
        train_part = train.iloc[train_indices,:]
        test_part = train.iloc[test_indices,:]
        train_part_new, test_part_new = impute(train_part, test_part)
        X_1 = train_part_new.loc[:,predictors]
        y_1 = train_part_new.grad_6years
        X_2 = test_part_new.loc[:,predictors]
        y_2 = test_part_new.grad_6years
        model = sm.OLS(y_1, add_constant(X_1,prepend=True)).fit()
        p,r,t = precision_recall_curve(y_2, model.predict(add_constant(X_2, prepend=True)))
        auc = roc_auc_score(y_2, model.predict(add_constant(X_2, prepend=True)))
        threshold_list.append(find_optimal_threshold(p,r,t))
        auc_list.append(auc)
    print(np.mean(auc_list), np.std(auc_list, ddof=1))
    return gmean(threshold_list)   

In [127]:
best_threshold_3 = cross_validation_ols(train_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/

0.8762703659572899 0.0023470017735945396


In [129]:
print(best_threshold_3)

0.46918787407941415

In [162]:
print("F1 threshold = {}:\n".format(str(round(best_threshold_3,3))))
pr_ols = create_confusion_matrix(y_test_pred_ols, best_threshold_3, "OLS_cm1")

F1 threshold = 0.469:

         Pred_0   Pred_1         
Real_0  19062.0   2746.0  21808.0
Real_1   3250.0   8057.0  11307.0
        22312.0  10803.0  33115.0

F1 score = 0.7288


In [163]:
num_of_0 = int(round((1-np.mean(train_df.grad_6years))*len(y_test)))
y_test_pred_binary = np.ones(len(y_test))
y_test_pred_binary[np.argsort(y_test_pred_ols)[:num_of_0]] = 0
alternative_threshold_3 = y_test_pred_ols[np.argsort(y_test_pred_ols)[num_of_0]]
print("Alternative threshold = {}:\n".format(str(round(alternative_threshold_3,3))))
pr2_ols = create_confusion_matrix(y_test_pred_binary, best_threshold_3, "OLS_cm2")

Alternative threshold = 0.452:

         Pred_0   Pred_1         
Real_0  18786.0   3022.0  21808.0
Real_1   3003.0   8304.0  11307.0
        21789.0  11326.0  33115.0

F1 score = 0.7338


In [164]:
precision_recall_df_3 = pd.DataFrame([(best_threshold_3,)+pr_ols,(alternative_threshold_3,)+pr2_ols]).round(4)
precision_recall_df_3.index = ['F1','Same_Graduation_Rate']
precision_recall_df_3.columns = ['threshold','precision_1','recall_1','precision_0','recall_0','f1_score']
precision_recall_df_3.to_csv(results_dir + "OLS_precision_recall.csv", index=True)

#### (4) Evaluation results by subgroups -- used for generating Figure 9 of the paper

In [1]:
test_df.loc[:,"non_first_gen"] = test_df.phe_4 + test_df.phe_5 + test_df.phe_6 + test_df.phe_7
test_df.loc[:,'first_gen'] = test_df.phe_1 + test_df.phe_2 + test_df.phe_3

In [2]:
def eval_subgroup(y_pred,sg,bt):
    if sg == 'female':
        y_real_sg = np.array(y_test)[np.array(test_df.male) == 0]
        y_pred_sg = np.array(y_pred)[np.array(test_df.male) == 0]
    elif sg == 'pell_eligible':
        y_real_sg = np.array(y_test)[np.array(test_df.pell_1_ind) == 1]
        y_pred_sg = np.array(y_pred)[np.array(test_df.pell_1_ind) == 1] 
    elif sg == "not_pell_eligible":
        y_real_sg = np.array(y_test)[np.array(test_df.pell_0_ind) == 1]
        y_pred_sg = np.array(y_pred)[np.array(test_df.pell_0_ind) == 1]
    elif sg == 'non_filers':
        y_real_sg = np.array(y_test)[np.array(test_df.pell_1_ind == 0) & np.array(test_df.pell_0_ind == 0)]
        y_pred_sg = np.array(y_pred)[np.array(test_df.pell_1_ind == 0) & np.array(test_df.pell_0_ind == 0)] 
    elif sg == "not_pell_eligible":
        y_real_sg = np.array(y_test)[np.array(test_df.pell_ind) == 0]
        y_pred_sg = np.array(y_pred)[np.array(test_df.pell_ind) == 0]
    elif sg == "age>=25":
        y_real_sg = np.array(y_test)[np.array(test_df.age_entry) >= 25]
        y_pred_sg = np.array(y_pred)[np.array(test_df.age_entry) >= 25]
    elif sg == "age<25":
        y_real_sg = np.array(y_test)[np.array(test_df.age_entry) < 25]
        y_pred_sg = np.array(y_pred)[np.array(test_df.age_entry) < 25]
    elif sg == "first_gen":
        y_real_sg = np.array(y_test)[np.array(test_df.first_gen) == 1]
        y_pred_sg = np.array(y_pred)[np.array(test_df.first_gen) == 1]
    elif sg == "non_first_gen":
        y_real_sg = np.array(y_test)[np.array(test_df.non_first_gen) == 1]
        y_pred_sg = np.array(y_pred)[np.array(test_df.non_first_gen) == 1]
    elif sg == "unknown_first_gen":
        y_real_sg = np.array(y_test)[np.array(test_df.first_gen == 0) & np.array(test_df.non_first_gen == 0)]
        y_pred_sg = np.array(y_pred)[np.array(test_df.first_gen == 0) & np.array(test_df.non_first_gen == 0)]
    else:
        y_real_sg = np.array(y_test)[np.array(test_df[sg]) == 1]
        y_pred_sg = np.array(y_pred)[np.array(test_df[sg]) == 1]        
    N = len(y_real_sg)
    d_rate = round(np.mean(y_real_sg), 4)
    d_rate_2 = round(np.mean(np.where(y_pred_sg>bt,1,0)), 4)
    avg_score = round(np.mean(y_pred_sg), 4)
    auc = round(roc_auc_score(y_real_sg,y_pred_sg), 4)
    p = round(precision_score(y_real_sg, np.where(y_pred_sg>bt,1,0)), 4)
    r = round(recall_score(y_real_sg, np.where(y_pred_sg>bt,1,0)), 4)
    p2 = round(precision_score(y_real_sg, np.where(y_pred_sg>bt,1,0),pos_label=0), 4)
    r2 = round(recall_score(y_real_sg, np.where(y_pred_sg>bt,1,0), pos_label=0), 4)
    f1_1 = round(2*p*r/(p+r),4)
    f1_0 = round(2*p2*r2/(p2+r2),4)
    return (sg,N,d_rate,d_rate_2,avg_score,auc,p,r,p2,r2,f1_1,f1_0)

In [3]:
sgs = ['male', 'female', 'white', 'afam', 'hisp', 'other', 'pell_eligible', 'not_pell_eligible', 'non_filers', 'age>=25', 'age<25', 'first_gen', 'non_first_gen', 'unknown_first_gen']
eval_results_1 = []
for sg in sgs:
    eval_results_1.append(eval_subgroup(y_test_pred_lr,sg,best_threshold))
eval_1_df = pd.DataFrame(eval_results_1, columns = ['subgroup', 'N', 'deg_compl_rate_real', 'deg_compl_rate_pred', 'avg_pred_score', 'c-statistic', 'precision_1', 'recall_1', 'precision_0', 'recall_0', 'f1_score_1', 'f1_score_0'])
eval_1_df.to_csv(results_dir + "LR1_subgroups.csv", index=False)

eval_results_3 = []
for sg in sgs:
    eval_results_3.append(eval_subgroup(y_test_pred_ols,sg,best_threshold_3))
eval_3_df = pd.DataFrame(eval_results_3, columns = ['subgroup', 'N', 'deg_compl_rate_real', 'deg_compl_rate_pred', 'avg_pred_score', 'c-statistic', 'precision_1', 'recall_1', 'precision_0', 'recall_0', 'f1_score_1', 'f1_score_0'])
eval_3_df.to_csv(results_dir + "OLS_subgroups.csv", index=False)