In [20]:
import pickle
import pandas as pd
import numpy as np
from scipy.stats.mstats import gmean
import sklearn
import statsmodels.api as sm
from statsmodels.tools import add_constant
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod import families
from sklearn.metrics import precision_recall_curve, roc_auc_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
import seaborn as sns
import matplotlib.pyplot as plt
fpath = "/Users/ys8mz/Box Sync/Predictive Models of College Completion (VCCS)/intermediate_files"

In [21]:
df1 = pd.read_stata(fpath + "/full_data_6yr.dta")
df1 = df1[df1.valid == 0]
df1.loc[:,'first_gen_0'] = df1.phe_1 + df1.phe_2 + df1.phe_3
df1.loc[:,'first_gen_1'] = df1.phe_4 + df1.phe_5 + df1.phe_6 + df1.phe_7
for v in ['term_cred_att_', 'enrolled_']:
    df1.loc[:,v+"sum"] = 0
    for t1 in ['sp','su','fa']:
        for t2 in range(1,7):
            t = t1+str(t2)
            df1.loc[:,v+"sum"] = df1.loc[:,v+"sum"] + df1.loc[:,v+t]
df1.loc[:,'avg_cred_att'] = df1.term_cred_att_sum / df1.enrolled_sum
df1.loc[:,'pct_enrolled'] = df1.enrolled_sum / 18

In [19]:
df2 = pd.read_stata(fpath + "/full_data_truncated.dta")
df2 = df2[df2.valid == 1]
df2.loc[:,'first_gen_0'] = df2.phe_1 + df2.phe_2 + df2.phe_3
df2.loc[:,'first_gen_1'] = df2.phe_4 + df2.phe_5 + df2.phe_6 + df2.phe_7
for v in ['term_cred_att_', 'enrolled_', 'available_']:
    df2.loc[:,v+"sum"] = 0
    for t1 in ['sp','su','fa']:
        for t2 in range(1,7):
            t = t1+str(t2)
            df2.loc[:,v+"sum"] = df2.loc[:,v+"sum"] + df2.loc[:,v+t]
df2.loc[:,'avg_cred_att'] = df2.term_cred_att_sum / df2.enrolled_sum
df2.loc[:,'pct_enrolled'] = df2.enrolled_sum / df2.available_sum

In [24]:
df = pd.concat([df1,df2], join='inner')

##### **Note: This model variant only includes 13 predictors, which are all simple non-term-specific predictors

In [26]:
predictors = ['male', 'afam', 'white', 'hisp', 'other', 'first_gen_0', 'first_gen_1', 'cum_gpa', 'pct_enrolled', 'avg_cred_att', 'prop_comp', 'pell_0_ind', 'pell_1_ind']
print(len(predictors))
impute_list_3 = set(["cum_gpa", "prop_comp"])

13


In [27]:
train_df = df[df.valid == 0]
test_df = df[df.valid == 1]
print(train_df.shape,test_df.shape)

(300144, 329) (33115, 329)


In [28]:
# degree completion rate of validation sample
sum(train_df.grad_6years)/train_df.shape[0]

0.34642371661602434

In [29]:
# degree completion rate of validation sample
sum(test_df.grad_6years)/test_df.shape[0]

0.34144647440736825

In [30]:
def impute(train, test):
    for p in impute_list_3:
        avg_p = np.nanmean(train[p])
        train.loc[:,p] = train.loc[:,p].apply(lambda x: avg_p if pd.isnull(x) else x)
        test.loc[:,p] = test.loc[:,p].apply(lambda x: avg_p if pd.isnull(x) else x)
    return train, test                    

In [31]:
train_df_new, test_df_new = impute(train_df, test_df)
X_train = train_df_new.loc[:,predictors]
y_train = train_df_new.grad_6years
X_test = test_df_new.loc[:,predictors]
y_test = test_df_new.grad_6years

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [32]:
results_dir = "C:\\Users\\ys8mz\\Box Sync\\Predictive Models of College Completion (VCCS)\\evaluation_results\\truncated_simple_predictors_2\\no_truncation\\"

#### (1) Run the logistic regression model

In [33]:
lr = sm.Logit(y_train, add_constant(X_train,prepend=True)).fit()

Optimization terminated successfully.
         Current function value: 0.464414
         Iterations 7


In [34]:
y_test_pred_lr = list(lr.predict(add_constant(X_test, prepend=True)))
print("Logistic Regression:")
print("AUC = {}".format(round(roc_auc_score(y_test, y_test_pred_lr),4)))

Logistic Regression:
AUC = 0.7837


In [35]:
# save the model object and predicted scores on the validation sample to local disk
# pickle.dump(lr, open(results_dir + "/lr.p", "wb"))
pickle.dump(list(y_test_pred_lr), open(results_dir + "/y_test_pred_lr.p", "wb"))

In [37]:
def find_optimal_threshold(p,r,t):
    to_drop = np.union1d(np.where(pd.isnull(p[:-1]) == True)[0], np.where(pd.isnull(r[:-1]) == True)[0])
    to_drop = np.union1d(to_drop, np.where(pd.isnull(t) == True)[0])
    to_keep = np.setdiff1d(np.array(list(range(len(p)-1))), to_drop)
    p,r,t = p[to_keep],r[to_keep],t[to_keep]
    f1 = 2*p*r/(p+r)
    best_t = t[np.argmax(f1)]
    best_t
    return best_t

In [38]:
def cross_validation_lr(train):
    threshold_list = []
    auc_list = []
    k_fold =  StratifiedKFold(n_splits = 10, random_state = 12345, shuffle=True)
    for train_indices, test_indices in k_fold.split(train, train.grad_6years):
        train_part = train.iloc[train_indices,:]
        test_part = train.iloc[test_indices,:]
        train_part_new, test_part_new = impute(train_part, test_part)
        X_1 = train_part_new.loc[:,predictors]
        y_1 = train_part_new.grad_6years
        X_2 = test_part_new.loc[:,predictors]
        y_2 = test_part_new.grad_6years
        model = sm.Logit(y_1, add_constant(X_1,prepend=True)).fit()
        p,r,t = precision_recall_curve(y_2, model.predict(add_constant(X_2, prepend=True)))
        auc = roc_auc_score(y_2, model.predict(add_constant(X_2, prepend=True)))
        threshold_list.append(find_optimal_threshold(p,r,t))
        auc_list.append(auc)
    print(np.mean(auc_list), np.std(auc_list, ddof=1))
    return gmean(threshold_list)                                  

In [39]:
best_threshold = cross_validation_lr(train_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Optimization terminated successfully.
         Current function value: 0.463995
         Iterations 7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Optimization terminated successfully.
         Current function value: 0.464610
         Iterations 7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Optimization terminated successfully.
         Current function value: 0.464341
         Iterations 7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Optimization terminated successfully.
         Current function value: 0.464399
         Iterations 7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Optimization terminated successfully.
         Current function value: 0.464130
         Iterations 7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Optimization terminated successfully.
         Current function value: 0.464420
         Iterations 7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Optimization terminated successfully.
         Current function value: 0.464256
         Iterations 7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Optimization terminated successfully.
         Current function value: 0.464723
         Iterations 7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Optimization terminated successfully.
         Current function value: 0.464545
         Iterations 7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Optimization terminated successfully.
         Current function value: 0.464703
         Iterations 7
0.8536536098566442 0.0014135856744095852


In [40]:
best_threshold

0.7172511739044277

In [42]:
def create_confusion_matrix(y_test_pred, threshold, fname):
    cm_arr = confusion_matrix(y_test, np.where(y_test_pred > threshold, 1, 0))
    cm_df = pd.DataFrame(cm_arr, columns=['Pred_0','Pred_1'], index=['Real_0', 'Real_1'])
    cm_df.loc[:,''] = cm_df.sum(axis=1)
    cm_df.loc['',:] = cm_df.sum(axis=0)
    print(cm_df)
    print("")
    p1 = cm_df.iloc[1,1]/cm_df.iloc[2,1]
    r1 = cm_df.iloc[1,1]/cm_df.iloc[1,2]
    p0 = cm_df.iloc[0,0]/cm_df.iloc[2,0]
    r0 = cm_df.iloc[0,0]/cm_df.iloc[0,2]    
    print("F1 score = {}".format(round(2*p1*r1/(p1+r1),4)))    
    cm_df.to_csv(results_dir + fname + ".csv")
    return p1,r1,p0,r0,round(2*p1*r1/(p1+r1),4)

In [43]:
print("F1 threshold = {}:\n".format(str(round(best_threshold,3))))
pr_lr = create_confusion_matrix(np.array(y_test_pred_lr), best_threshold, "LR1_cm1")

F1 threshold = 0.717:

         Pred_0   Pred_1         
Real_0  12604.0   9204.0  21808.0
Real_1   1759.0   9548.0  11307.0
        14363.0  18752.0  33115.0

F1 score = 0.6353


In [44]:
num_of_0 = int(round((1-np.mean(train_df.grad_6years))*len(y_test)))
y_test_pred_binary = np.ones(len(y_test))
y_test_pred_binary[np.argsort(y_test_pred_lr)[:num_of_0]] = 0
alternative_threshold = y_test_pred_lr[np.argsort(y_test_pred_lr)[num_of_0]]
print("Alternative threshold = {}:\n".format(str(round(alternative_threshold,3))))
pr2_lr = create_confusion_matrix(y_test_pred_binary, best_threshold, "LR1_cm2")

Alternative threshold = 0.871:

         Pred_0   Pred_1         
Real_0  17191.0   4617.0  21808.0
Real_1   4452.0   6855.0  11307.0
        21643.0  11472.0  33115.0

F1 score = 0.6019


In [45]:
precision_recall_df = pd.DataFrame([(best_threshold,)+pr_lr,(alternative_threshold,)+pr2_lr]).round(4)
precision_recall_df.index = ['F1','Same_Graduation_Rate']
precision_recall_df.columns = ['threshold','precision_1','recall_1','precision_0','recall_0','f1_score']
precision_recall_df.to_csv(results_dir + "LR1_precision_recall.csv", index=True)

#### (2) Run the OLS model

In [48]:
ols = sm.OLS(y_train, add_constant(X_train,prepend=True)).fit()

In [49]:
y_test_pred_ols = list(ols.predict(add_constant(X_test, prepend=True)))
print("OLS:")
print("AUC = {}".format(round(roc_auc_score(y_test, y_test_pred_ols),4)))

OLS:
AUC = 0.7611


In [50]:
# save the model object and predicted scores on the validation sample to local disk
# pickle.dump(lr, open(results_dir + "/ols.p", "wb"))
pickle.dump(list(y_test_pred_lr), open(results_dir + "/y_test_pred_ols.p", "wb"))

In [52]:
def cross_validation_ols(train):
    threshold_list = []
    auc_list = []
    k_fold =  StratifiedKFold(n_splits = 10, random_state = 12345, shuffle=True)
    for train_indices, test_indices in k_fold.split(train, train.grad_6years):
        train_part = train.iloc[train_indices,:]
        test_part = train.iloc[test_indices,:]
        train_part_new, test_part_new = impute(train_part, test_part)
        X_1 = train_part_new.loc[:,predictors]
        y_1 = train_part_new.grad_6years
        X_2 = test_part_new.loc[:,predictors]
        y_2 = test_part_new.grad_6years
        model = sm.OLS(y_1, add_constant(X_1,prepend=True)).fit()
        p,r,t = precision_recall_curve(y_2, model.predict(add_constant(X_2, prepend=True)))
        auc = roc_auc_score(y_2, model.predict(add_constant(X_2, prepend=True)))
        threshold_list.append(find_optimal_threshold(p,r,t))
        auc_list.append(auc)
    print(np.mean(auc_list), np.std(auc_list, ddof=1))
    return gmean(threshold_list)   

In [53]:
best_threshold_3 = cross_validation_ols(train_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/

0.8390112933977552 0.0013042093073024413


In [54]:
best_threshold_3

0.7180976195816408

In [56]:
print("F1 threshold = {}:\n".format(str(round(best_threshold_3,3))))
pr_ols = create_confusion_matrix(np.array(y_test_pred_ols), best_threshold_3, "OLS_cm1")

F1 threshold = 0.718:

         Pred_0   Pred_1         
Real_0  12927.0   8881.0  21808.0
Real_1   2315.0   8992.0  11307.0
        15242.0  17873.0  33115.0

F1 score = 0.6163


In [57]:
num_of_0 = int(round((1-np.mean(train_df.grad_6years))*len(y_test)))
y_test_pred_binary = np.ones(len(y_test))
y_test_pred_binary[np.argsort(y_test_pred_ols)[:num_of_0]] = 0
alternative_threshold_3 = y_test_pred_ols[np.argsort(y_test_pred_ols)[num_of_0]]
print("Alternative threshold = {}:\n".format(str(round(alternative_threshold_3,3))))
pr2_ols = create_confusion_matrix(y_test_pred_binary, best_threshold_3, "OLS_cm2")

Alternative threshold = 0.832:

         Pred_0   Pred_1         
Real_0  16946.0   4862.0  21808.0
Real_1   4697.0   6610.0  11307.0
        21643.0  11472.0  33115.0

F1 score = 0.5804


In [58]:
precision_recall_df_3 = pd.DataFrame([(best_threshold_3,)+pr_ols,(alternative_threshold_3,)+pr2_ols]).round(4)
precision_recall_df_3.index = ['F1','Same_Graduation_Rate']
precision_recall_df_3.columns = ['threshold','precision_1','recall_1','precision_0','recall_0','f1_score']
precision_recall_df_3.to_csv(results_dir + "OLS_precision_recall.csv", index=True)