In [29]:
import pandas as pd
import numpy as np
from scipy.stats.mstats import gmean
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, precision_recall_curve, confusion_matrix, precision_score, recall_score
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
import pickle

import os

# Directory locations. The defaults are the original hard-coded Box Sync folders;
# set VCCS_INTERMEDIATE_DIR / VCCS_RESULTS_DIR to run the notebook on another
# machine without editing code.
# fpath is joined as `fpath + "/name"`, so it carries no trailing separator;
# results_dir is joined as `results_dir + "name"`, so it must end with one.
fpath = os.environ.get(
    "VCCS_INTERMEDIATE_DIR",
    "/Users/ys8mz/Box Sync/Predictive Models of College Completion (VCCS)/intermediate_files",
)
results_dir = os.environ.get(
    "VCCS_RESULTS_DIR",
    "/Users/ys8mz/Box Sync/Predictive Models of College Completion (VCCS)/evaluation_results/truncated_predictors/",
)

In [4]:
# Load full_data_truncated.dta (Stata format) from the fpath directory into a DataFrame.
df = pd.read_stata(fpath + "/full_data_truncated.dta")

In [5]:
# Every column from position 10 onward is treated as a model predictor
# (the recorded run printed 331).
predictors = list(df.columns.values[10:])
print(len(predictors))
# Columns whose missing values impute() replaces with a training-set mean.
# Each list is paired with a different reference population inside impute():
#   list 1 -> mean over training rows with enrolled_pre == 1
impute_list_1 = set(["prop_comp_pre","cum_gpa_pre"])
#   list 2 -> term-level columns named <stem>_<fa|sp|su><1..6>; mean over training
#             rows whose matching "enrolled_<term><n>" flag equals 1
impute_list_2 = set([t1+"_"+t2+str(t3) for t1 in ["term_gpa", "prop_comp", "lvl2_prop_comp", "dev_prop_comp"] for t2 in ["fa", "sp", "su"] for t3 in range(1,7,1)])
#   list 3 -> mean over all training rows
impute_list_3 = set(["cum_gpa", "lvl2_prop_comp", "dev_prop_comp", "prop_comp", "prop_comp_sd", "withdrawn_prop_comp_sd"])
#   list 4 -> mean over training rows with enrolled_nsc == 1
impute_list_4 = set(["admrate", "gradrate", "satvr25", "satvr75", "satmt25", "satmt75", "satwr25", "satwr75"])

331


In [6]:
# Split on the `valid` flag: rows with valid == 0 form the training set,
# rows with valid == 1 form the held-out test set.
train_df = df[df.valid == 0]
test_df = df[df.valid == 1]
# Sanity check of the split sizes (the recorded run shows 298139 vs 33115 rows).
print(train_df.shape,test_df.shape)

(298139, 341) (33115, 341)


In [7]:
def impute(train_original, test_original):
    """Fill missing predictor values with means estimated on the training data.

    Both inputs are copied, so the callers' frames are left untouched. Every
    fill value is computed from ``train`` only and then applied to both
    ``train`` and ``test``, so no information flows from the test rows into
    the imputation.

    The reference population for the mean depends on which module-level list
    the column belongs to:

    * ``impute_list_1`` -- training rows with ``enrolled_pre == 1``
    * ``impute_list_3`` -- all training rows
    * ``impute_list_2`` -- training rows whose ``"enrolled_" + <last 3 chars
      of the column name>`` flag equals 1 (e.g. ``term_gpa_fa1`` uses
      ``enrolled_fa1``)
    * ``impute_list_4`` -- training rows with ``enrolled_nsc == 1``

    Parameters
    ----------
    train_original : pandas.DataFrame
        Data used to estimate the fill values.
    test_original : pandas.DataFrame
        Data that receives the same fill values.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Imputed copies of ``train_original`` and ``test_original``.
    """
    train = train_original.copy()
    test = test_original.copy()

    def _fill_with_train_mean(column, reference_values):
        # NaN-aware mean of the reference population, applied to both frames.
        # fillna is the vectorized equivalent of mapping each element through
        # "mean if null else value".
        fill_value = np.nanmean(reference_values)
        train[column] = train[column].fillna(fill_value)
        test[column] = test[column].fillna(fill_value)

    # The list order (1, 3, 2, 4) is kept from the original implementation.
    for p in impute_list_1:
        _fill_with_train_mean(p, train.loc[train["enrolled_pre"] == 1, p])
    for p in impute_list_3:
        _fill_with_train_mean(p, train[p])
    for p in impute_list_2:
        # The last three characters identify the term, e.g. "fa1".
        suffix = p[-3:]
        _fill_with_train_mean(p, train.loc[train["enrolled_" + suffix] == 1, p])
    for p in impute_list_4:
        _fill_with_train_mean(p, train.loc[train["enrolled_nsc"] == 1, p])
    return train, test

In [8]:
# Impute the full training / test split, then separate predictors from the outcome.
train_df_new, test_df_new = impute(train_df, test_df)
X_train = train_df_new.loc[:,predictors]
y_train = train_df_new.grad_6years
X_test = test_df_new.loc[:,predictors]
y_test = test_df_new.grad_6years

# NOTE(review): later cells fit models on `X_train_new`, which no visible cell
# defines. It is reconstructed here with the same step create_cv_folds() applies
# to each fold (MinMaxScaler fitted on the training part only) -- confirm this
# matches how the original results were produced.
full_scaler = MinMaxScaler()
X_train_new = full_scaler.fit_transform(X_train)
X_test_new = full_scaler.transform(X_test)

In [7]:
def create_cv_folds(n_folds, rs, train_df):
    """Build imputed, min-max-scaled cross-validation folds.

    The rows of ``train_df`` are split into ``n_folds`` shuffled folds that are
    stratified on ``grad_6years`` (seeded with ``rs``). Within each fold the
    imputation means and the scaler are fitted on the training part only and
    then applied to the held-out part.

    Returns a list with one ``(X_fit, y_fit, X_holdout, y_holdout)`` tuple per
    fold, where the X entries are the scaled predictor arrays and the y entries
    are the ``grad_6years`` Series.
    """
    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=rs)
    folds = []
    for fit_idx, holdout_idx in splitter.split(train_df, train_df.grad_6years):
        # Impute with statistics from the fitting rows only.
        fit_part, holdout_part = impute(
            train_df.iloc[fit_idx, :],
            train_df.iloc[holdout_idx, :],
        )
        X_fit_raw = fit_part.loc[:, predictors]
        y_fit = fit_part.grad_6years
        X_holdout_raw = holdout_part.loc[:, predictors]
        y_holdout = holdout_part.grad_6years
        # Scale with min / max learned from the fitting rows only.
        fold_scaler = MinMaxScaler()
        X_fit = fold_scaler.fit_transform(X_fit_raw)
        X_holdout = fold_scaler.transform(X_holdout_raw)
        folds.append((X_fit, y_fit, X_holdout, y_holdout))
    return folds

In [8]:
# 10 stratified folds of the training set, shuffled with random seed 12345.
cv_folds = create_cv_folds(10, 12345, train_df)

  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


In [9]:
# Persist each fold as fold_1.p ... fold_N.p (1-based numbering) under fpath.
for i,f in enumerate(cv_folds):
    # The context manager closes (and therefore flushes) every file; the bare
    # open() used before left one unclosed handle per fold.
    with open(fpath + "/fold_{}.p".format(i+1), "wb") as fold_file:
        pickle.dump(f, fold_file)

### Run the following part after executing the script "Lasso_Classifier_CV.py"

#### (1) The optimal set of predictors (a total of 147) identified by Lasso feature selection

In [17]:
def find_optimal_threshold(p,r,t):
    """Return the threshold in ``t`` that maximises the F1 score.

    Parameters
    ----------
    p, r : array-like
        Precision and recall values. Both are expected to hold one more
        element than ``t``; the trailing element has no threshold and is
        dropped.
    t : array-like
        Decision thresholds aligned with ``p[:-1]`` and ``r[:-1]``.

    Returns
    -------
    The element of ``t`` with the highest F1 = 2*p*r / (p + r). Positions
    where precision, recall or threshold is missing are ignored, and positions
    where p + r == 0 get F1 = 0. The latter would otherwise be 0/0 = NaN, and
    ``np.argmax`` returns the position of a NaN, so a useless threshold could
    be reported as the best one.

    Raises
    ------
    ValueError
        If no position has a valid precision, recall and threshold.
    """
    p = np.asarray(p, dtype=float)[:-1]
    r = np.asarray(r, dtype=float)[:-1]
    t = np.asarray(t)
    valid = ~(np.isnan(p) | np.isnan(r) | pd.isnull(t))
    p, r, t = p[valid], r[valid], t[valid]
    if t.size == 0:
        raise ValueError("no valid (precision, recall, threshold) triple to choose from")
    denom = p + r
    # Divide only where the denominator is positive; everywhere else F1 stays 0.
    f1 = np.divide(2 * p * r, denom, out=np.zeros(denom.shape, dtype=float), where=denom > 0)
    return t[np.argmax(f1)]

In [20]:
# Fit the L1-penalised logit (C fixed at 0.01153) on each CV fold and record, per
# fold, the validation AUC and the F1-maximising probability threshold.
auc_list = []
threshold_list = []
for indx,(X_1,y_1,X_2,y_2) in enumerate(cv_folds):
    print(indx)
    lasso_cv = LogisticRegression(penalty='l1', C=0.01153, solver="saga", max_iter=10000)
    lasso_cv.fit(X_1,y_1)
    # Score the held-out part once and reuse it for both metrics; previously
    # predict_proba was evaluated twice per fold.
    y_2_score = lasso_cv.predict_proba(X_2)[:,1]
    p,r,t = precision_recall_curve(y_2, y_2_score)
    auc = roc_auc_score(y_2, y_2_score)
    threshold_list.append(find_optimal_threshold(p,r,t))
    auc_list.append(auc)
# Mean and sample standard deviation (ddof=1) of the per-fold AUCs.
print(np.mean(auc_list), np.std(auc_list, ddof=1))
# The final threshold is the geometric mean of the per-fold optimal thresholds.
best_threshold = gmean(threshold_list)

0
1
2
3
4
5
6
7
8
9
0.8769782028442551 0.0022235769382606225


NameError: name 'threshold' is not defined

In [21]:
# Show the geometric-mean threshold computed from the CV folds.
print(best_threshold)

0.43858621973241146


In [1]:
def selected_predictors(lasso_m):
    """List the predictors retained by a fitted L1-penalised model.

    A predictor is retained when its coefficient in ``lasso_m.coef_[0]`` is
    non-zero. In addition, an indicator column whose own coefficient is zero is
    retained when a column it qualifies is non-zero:

    * ``enrolled_pre`` -- if ``cum_gpa_pre`` or ``prop_comp_pre`` is non-zero
    * ``enrolled_nsc_<sfx>`` -- if ``enrl_intensity_nsc_<sfx>`` is non-zero
    * other ``enrolled_<sfx>`` / ``available_<sfx>`` -- if any term-level column
      ``<stem>_<sfx>`` is non-zero

    where ``_<sfx>`` is the last four characters of the indicator name.

    Parameters
    ----------
    lasso_m : fitted estimator
        Must expose ``coef_`` whose first row is aligned with the module-level
        ``predictors`` list.

    Returns
    -------
    list
        Retained predictor names, in the order they appear in ``predictors``.
        The previous implementation returned them in set-iteration order.
    """
    coefs = np.asarray(lasso_m.coef_[0])
    names = np.array(predictors)
    zero_predictors = set(names[coefs == 0])
    nonzero_predictors = set(names[coefs != 0])

    # Term-level column stems that an enrolled_/available_ indicator accompanies.
    term_stems = ['withdrawn_prop_comp', 'repeat', 'pell_0', 'pell_1', 'degree_seeking',
                  'term_cred_att', 'term_gpa', 'prop_comp', 'lvl2_prop_comp', 'dev_prop_comp']

    kept = set(nonzero_predictors)
    for p in zero_predictors:
        if p == 'enrolled_pre':
            # Fix: this compared against 'prop_com_pre', a name that never
            # matches the 'prop_comp_pre' column, so that condition was dead.
            if any(e in nonzero_predictors for e in ['cum_gpa_pre', 'prop_comp_pre']):
                kept.add(p)
        elif p.startswith("enrolled_nsc_"):
            suffix = p[-4:]
            if ('enrl_intensity_nsc' + suffix) in nonzero_predictors:
                kept.add(p)
        elif p.startswith("enrolled_") or p.startswith("available_"):
            suffix = p[-4:]
            if any((e + suffix) in nonzero_predictors for e in term_stems):
                kept.add(p)
    return [name for name in predictors if name in kept]

In [14]:
# NOTE(review): `lasso_1` is not defined in any cell shown in this notebook; it must
# already exist in the kernel (presumably the final fitted Lasso model -- confirm).
lasso_selected_predictors = selected_predictors(lasso_1)
# Write next to the other intermediate files via `fpath` (the same directory the
# fold pickles use) instead of a separate hard-coded Windows path, and close the
# file deterministically with a context manager.
with open(fpath + "/lasso_selected_predictors.p", "wb") as predictors_file:
    pickle.dump(lasso_selected_predictors, predictors_file)

In [15]:
# Number of predictors retained by the selection step (the recorded run shows 147).
len(lasso_selected_predictors)

147

#### (2) Find how model performance changes with the number of predictors (used to generate Figure A18 of the paper)

In [2]:
# Hand-entered mapping {C value: count}. Presumably the number of selected predictors
# obtained in earlier runs for these C values -- TODO confirm the source.
num_of_predictors_dict_0 = {0.0001: 14, 0.001: 50, 0.01: 139, 0.1: 235, 1: 306, 100: 330}

In [9]:
# For each inverse-regularisation strength C in the grid, fit an L1-penalised logit
# on the training predictors and record how many predictors selected_predictors() keeps.
num_of_predictors_dict_1 = {}
# Grid: a coarse sweep (0.02-0.09), a finer one (0.011-0.019), a finest one
# (0.0111-0.0119), plus 0.01153 -- the C value also used in the CV threshold loop.
grid_val =\
[0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09] +\
[0.011,0.012,0.013,0.014,0.015,0.016,0.017,0.018,0.019] +\
[0.0111,0.0112,0.0113,0.0114,0.0115,0.0116,0.0117,0.0118,0.0119] +\
[0.01153]
for C_val in grid_val:
    print("C parameter value = {}:".format(C_val))
    lasso_test = LogisticRegression(penalty='l1', C=C_val, solver="saga", max_iter=10000)
    lasso_test.fit(X_train_new, y_train)
    selected = selected_predictors(lasso_test)
    print(len(selected))
    # Keyed by C so the counts can later be combined with the other dictionaries.
    num_of_predictors_dict_1[C_val] = len(selected)

C parameter value = 0.02:
161
C parameter value = 0.03:
179
C parameter value = 0.04:
194
C parameter value = 0.05:
202
C parameter value = 0.06:
214
C parameter value = 0.07:
218
C parameter value = 0.08:
223
C parameter value = 0.09:
231
C parameter value = 0.011:
146
C parameter value = 0.012:
147
C parameter value = 0.013:
150
C parameter value = 0.014:
151
C parameter value = 0.015:
151
C parameter value = 0.016:
153
C parameter value = 0.017:
154
C parameter value = 0.018:
156
C parameter value = 0.019:
160
C parameter value = 0.0111:
146
C parameter value = 0.0112:
146
C parameter value = 0.0113:
147
C parameter value = 0.0114:
147
C parameter value = 0.0115:
147
C parameter value = 0.0116:
147
C parameter value = 0.0117:
147
C parameter value = 0.0118:
147
C parameter value = 0.0119:
147
C parameter value = 0.01153:
147


In [None]:
# Same C -> predictor-count sweep as the other grid cells, here for C in [2, 4].
# NOTE(review): the output recorded beneath this cell lists many C values other than
# 2 and 4 and stops mid-run, so it appears to come from an earlier version of this loop.
num_of_predictors_dict_2 = {}
for C_val in [2, 4]:
    print("C parameter value = {}:".format(C_val))
    lasso_test = LogisticRegression(penalty='l1', C=C_val, solver="saga", max_iter=10000)
    lasso_test.fit(X_train_new, y_train)
    selected = selected_predictors(lasso_test)
    print(len(selected))
    num_of_predictors_dict_2[C_val] = len(selected)

C parameter value = 2.5e-05:
0
C parameter value = 5e-05:
0
C parameter value = 7.5e-05:
10
C parameter value = 0.00015:
14
C parameter value = 0.0002:
22
C parameter value = 0.0003:
26
C parameter value = 0.0005:
36
C parameter value = 0.00075:
45
C parameter value = 0.0015:
67
C parameter value = 0.002:
68
C parameter value = 0.0025:
76
C parameter value = 0.003:
86
C parameter value = 0.0035:
90
C parameter value = 0.004:
95
C parameter value = 0.0045:
105
C parameter value = 0.005:
108
C parameter value = 0.006:
113
C parameter value = 0.007:
122
C parameter value = 0.008:
128
C parameter value = 0.009:
134
C parameter value = 0.023:
168
C parameter value = 0.026:
175
C parameter value = 0.035:
186
C parameter value = 0.055:
209
C parameter value = 0.15:
254
C parameter value = 0.2:
266
C parameter value = 0.25:
272
C parameter value = 0.3:
277
C parameter value = 0.4:
285
C parameter value = 0.5:
296
C parameter value = 0.6:
298
C parameter value = 0.7:
300
C parameter value = 0.8

In [10]:
# Additional C values for the C -> predictor-count sweep; the list is not sorted and
# mixes very small (5.5e-5) and large (4) values.
num_of_predictors_dict_3 = {}
for C_val in [0.9,2,4,5.5e-5, 6e-5, 6.5e-5, 7e-5, 0.000175, 0.0004, 0.0006, 0.0011, 0.0012, 0.0013, 0.00275, 0.00425, 0.12, 0.135, 0.17]:
    print("C parameter value = {}:".format(C_val))
    # No random_state is passed to the estimator.
    lasso_test = LogisticRegression(penalty='l1', C=C_val, solver="saga", max_iter=10000)
    lasso_test.fit(X_train_new, y_train)
    selected = selected_predictors(lasso_test)
    print(len(selected))
    num_of_predictors_dict_3[C_val] = len(selected)

C parameter value = 0.9:
303
C parameter value = 2:
321
C parameter value = 4:
328
C parameter value = 5.5e-05:
3
C parameter value = 6e-05:
6
C parameter value = 6.5e-05:
9
C parameter value = 7e-05:
9
C parameter value = 0.000175:
19
C parameter value = 0.0004:
31
C parameter value = 0.0006:
42
C parameter value = 0.0011:
54
C parameter value = 0.0012:
62
C parameter value = 0.0013:
59
C parameter value = 0.00275:
82
C parameter value = 0.00425:
99
C parameter value = 0.12:
248
C parameter value = 0.135:
253
C parameter value = 0.17:
259


In [12]:
# Further C values (1.2, 1.5, 0.45, 0.11) for the C -> predictor-count sweep.
num_of_predictors_dict_4 = {}
for C_val in [1.2, 1.5, 0.45, 0.11]:
    print("C parameter value = {}:".format(C_val))
    lasso_test = LogisticRegression(penalty='l1', C=C_val, solver="saga", max_iter=10000)
    lasso_test.fit(X_train_new, y_train)
    selected = selected_predictors(lasso_test)
    print(len(selected))
    num_of_predictors_dict_4[C_val] = len(selected)

C parameter value = 1.2:
311
C parameter value = 1.5:
317
C parameter value = 0.45:
286
C parameter value = 0.11:
243


In [13]:
# One more point (C = 0.475) for the C -> predictor-count sweep.
num_of_predictors_dict_5 = {}
for C_val in [0.475]:
    print("C parameter value = {}:".format(C_val))
    lasso_test = LogisticRegression(penalty='l1', C=C_val, solver="saga", max_iter=10000)
    lasso_test.fit(X_train_new, y_train)
    selected = selected_predictors(lasso_test)
    print(len(selected))
    num_of_predictors_dict_5[C_val] = len(selected)

C parameter value = 0.475:
287


#### (3) Feature ranking using RFE, for logit and OLS

In [8]:
# NOTE(review): this re-binds `results_dir`, replacing the value assigned in the first
# cell with a Windows-style absolute path; every CSV written below goes to this location.
results_dir = "C:\\Users\\ys8mz\\Box Sync\\Predictive Models of College Completion (VCCS)\\evaluation_results\\truncated_predictors\\"

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
import datetime as dt

In [11]:
# Rank all predictors with recursive feature elimination around a logistic regression:
# step=1 drops one feature per round and n_features_to_select=1 keeps eliminating until
# a single feature is left, so `ranking_` holds a rank for every predictor.
lr_estimator = LogisticRegression(solver="saga", max_iter=10000)
selector = RFE(lr_estimator, n_features_to_select=1, step=1)
# Timestamps before and after the fit document how long the elimination takes.
print(dt.datetime.now())
selector.fit(X_train_new, y_train)
print(dt.datetime.now())
# One row per predictor, sorted by ascending rank, written as CSV to results_dir.
df1 = pd.DataFrame({'predictor_name':predictors, 'ranking':selector.ranking_}).sort_values(['ranking'])
df1.to_csv(results_dir + "reduced_lr_feature_ranking.csv", index=False)

2020-03-17 22:17:13.637940
2020-03-17 22:38:34.030930


In [12]:
# Same recursive-feature-elimination ranking, with ordinary least squares as the estimator.
ols_estimator = LinearRegression()
selector_3 = RFE(ols_estimator, n_features_to_select=1, step=1)
# Timestamps before and after the fit document the run time.
print(dt.datetime.now())
selector_3.fit(X_train_new, y_train)
print(dt.datetime.now())
# One row per predictor, sorted by ascending rank, written as CSV to results_dir.
df3 = pd.DataFrame({'predictor_name':predictors, 'ranking':selector_3.ranking_}).sort_values(['ranking'])
df3.to_csv(results_dir + "reduced_ols_feature_ranking.csv", index=False)

2020-03-17 22:38:34.049918
2020-03-17 22:40:58.618356


In [13]:
# Same recursive-feature-elimination ranking, with a logistic regression that uses
# class_weight="balanced".
lr2_estimator = LogisticRegression(solver="saga", max_iter=10000, class_weight="balanced")
selector_2 = RFE(lr2_estimator, n_features_to_select=1, step=1)
# Timestamps before and after the fit document the run time.
print(dt.datetime.now())
selector_2.fit(X_train_new, y_train)
print(dt.datetime.now())
# One row per predictor, sorted by ascending rank, written as CSV to results_dir.
df2 = pd.DataFrame({'predictor_name':predictors, 'ranking':selector_2.ranking_}).sort_values(['ranking'])
df2.to_csv(results_dir + "reduced_lr2_feature_ranking.csv", index=False)

2020-03-17 22:40:58.637346
2020-03-17 23:02:22.586946


In [9]:
# NOTE(review): this re-creates `lr_estimator` and `selector` with the same settings as
# the earlier logistic-regression RFE cell; the recorded timestamps of the cells that
# follow (2020-03-15/16) pre-date that cell's run (2020-03-17).
lr_estimator = LogisticRegression(solver="saga", max_iter=10000)
selector = RFE(lr_estimator, n_features_to_select=1, step=1)

In [10]:
# Fit the logistic-regression RFE selector, printing timestamps before and after
# (the recorded run took about 17.5 hours).
print(dt.datetime.now())
selector.fit(X_train_new, y_train)
print(dt.datetime.now())

2020-03-15 09:16:27.172666
2020-03-16 02:46:04.836714


In [30]:
# Save the logistic-regression RFE ranking (one row per predictor, ascending rank).
# This re-binds `df1` and writes lr_feature_ranking.csv, a different file from
# reduced_lr_feature_ranking.csv written earlier in the notebook.
df1 = pd.DataFrame({'predictor_name':predictors, 'ranking':selector.ranking_}).sort_values(['ranking'])
df1.to_csv(results_dir + "lr_feature_ranking.csv", index=False)

In [11]:
# Re-create the OLS-based RFE selector (same settings as the earlier OLS RFE cell).
ols_estimator = LinearRegression()
selector_3 = RFE(ols_estimator, n_features_to_select=1, step=1)

In [12]:
# Fit the OLS RFE selector, printing timestamps before and after
# (the recorded run took about 13.5 minutes).
print(dt.datetime.now())
selector_3.fit(X_train_new, y_train)
print(dt.datetime.now())

2020-03-16 02:46:05.226761
2020-03-16 02:59:37.192593


In [31]:
# Save the OLS RFE ranking (one row per predictor, ascending rank).
# This re-binds `df3` and writes ols_feature_ranking.csv, a different file from
# reduced_ols_feature_ranking.csv written earlier in the notebook.
df3 = pd.DataFrame({'predictor_name':predictors, 'ranking':selector_3.ranking_}).sort_values(['ranking'])
df3.to_csv(results_dir + "ols_feature_ranking.csv", index=False)