#### This script attempts to provide a more "apples-to-apples" comparison of model performance between the two outcome definitions

In [337]:
import pickle
import pandas as pd
import numpy as np
from scipy.stats.mstats import gmean
import sklearn
import statsmodels.formula.api as sm
from statsmodels.tools import add_constant
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod import families
from sklearn.metrics import precision_recall_curve, roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold, StratifiedKFold
import seaborn as sns
import matplotlib.pyplot as plt
import random
# NOTE(review): these are absolute, user-specific paths; the notebook will not run
# elsewhere unless they are edited.
# Directory holding the Stata outcome files read below. It has no trailing slash,
# so file names are appended to it with a leading "/".
fpath = "/Users/ys8mz/Box Sync/Predictive Models of College Completion (VCCS)/intermediate_files"
# Results directory for the first model variant (labelled "base" in the tables written
# below). Unlike `fpath`, it ends with a trailing slash.
fpath_1 = "/Users/ys8mz/Box Sync/Predictive Models of College Completion (VCCS)/evaluation_results/truncated_predictors/cleaned_results/"
# Results directory for the second model variant (labelled "no_nsc" below). The CSV
# outputs produced by this notebook are also written here. Ends with a trailing slash.
fpath_2 = "/Users/ys8mz/Box Sync/Predictive Models of College Completion (VCCS)/evaluation_results/truncated_new_5/cleaned_results/"

In [32]:
# Only the student id, the `valid` flag and the two outcome fields are needed from each file.
_outcome_cols = ['vccsid', 'valid', 'grad_6years', 'first_degree_strm']

# Original outcome definition: keep rows flagged valid == 1 (presumably the
# validation sample -- confirm), then drop the flag since it is constant afterwards.
df_old = (
    pd.read_stata(fpath + "/full_data_truncated.dta")[_outcome_cols]
    .loc[lambda frame: frame.valid == 1]
    .drop(columns=['valid'])
)

# Alternative outcome definition: same columns, same filter.
df_new = (
    pd.read_stata(fpath + "/full_data_truncated_alternative.dta")[_outcome_cols]
    .loc[lambda frame: frame.valid == 1]
    .drop(columns=['valid'])
)

#### Focus on the performance comparison of observations where the outcome is the same according to both definitions (either the student earned the 1st degree at VCCS, or the student never earned any degree during the 6-year window)

In [43]:
# Put both outcome definitions side by side per student.
# After the merge, the `_x` suffix marks the original file and `_y` the alternative file.
merged_outcomes = df_old.merge(df_new, on=['vccsid'], how='left')

# Step 1: keep students whose 6-year graduation flag agrees under both definitions.
same_outcome = merged_outcomes[merged_outcomes['grad_6years_x'] == merged_outcomes['grad_6years_y']]
print(same_outcome.shape)

# Step 2: additionally require the first-degree term to agree, or to be missing
# in the original file (no degree recorded there).
term_agrees = (same_outcome['first_degree_strm_x'] == same_outcome['first_degree_strm_y']).values
term_missing = same_outcome['first_degree_strm_x'].isnull().values
focus_group = same_outcome[term_agrees | term_missing]
print(focus_group.shape)

# Keep only the id and the (shared) outcome flag, exposed as `y_real`.
y = focus_group[['vccsid', 'grad_6years_x']].rename(columns={'grad_6years_x': 'y_real'})

(29679, 5)
(29636, 5)


In [377]:
# Proportion of focus-group students whose outcome flag is 1, i.e. the share who
# earned their first degree at VCCS (the rest never earned a degree in the window).
y['y_real'].mean()

0.26413822174072266

In [44]:
# Predicted scores per model variant. The student id arrives in a column that
# pandas reads as "Unnamed: 0", so it is renamed to `vccsid` before joining.
pred_base = pd.read_csv(fpath_1 + "all_pred_scores.csv").rename(columns={"Unnamed: 0": "vccsid"})
pred_alt = pd.read_csv(fpath_2 + "all_pred_scores.csv").rename(columns={"Unnamed: 0": "vccsid"})

# Inner join with the focus-group labels: only students present in `y` are kept,
# and each row gains the shared outcome `y_real`.
score_1 = pd.merge(pred_base, y, how='inner', on=['vccsid'])
score_2 = pd.merge(pred_alt, y, how='inner', on=['vccsid'])

In [47]:
# C-statistic (ROC AUC) of each model on the focus group, for the original-definition
# scores (score_1) versus the alternative-definition scores (score_2).
r_list = []
for model_name in ['OLS','Logit','RF','XGBoost']:
    auc_base = roc_auc_score(score_1['y_real'], score_1[model_name])
    auc_alt = roc_auc_score(score_2['y_real'], score_2[model_name])
    result_row = (model_name, auc_base, auc_alt)
    r_list.append(result_row)
    print(result_row)

# Persist the comparison, rounded to four decimals, next to the alternative model's results.
cstat_table = pd.DataFrame(r_list, columns=['model','base_cstat','no_nsc_cstat']).round(4)
cstat_table.to_csv(fpath_2 + "/new_cstat_comparison.csv", index=False)
# Per the printed results, the alternative-definition models slightly outperform the base models.

('OLS', 0.9112265183703853, 0.9132358817567428)
('Logit', 0.9130524950457206, 0.9228889765317495)
('RF', 0.9202382385306466, 0.9262694274573918)
('XGBoost', 0.9334251263688, 0.9437862631968842)


#### Compare the mean balanced F1 score for non-graduation for the two outcome definitions through random sampling

In [59]:
# Predicted scores for each variant; the id column is read as "Unnamed: 0" and renamed.
old_pred_scores = pd.read_csv(fpath_1 + "all_pred_scores.csv").rename(columns={"Unnamed: 0": "vccsid"})
new_pred_scores = pd.read_csv(fpath_2 + "all_pred_scores.csv").rename(columns={"Unnamed: 0": "vccsid"})

# Unlike the focus-group analysis, each variant keeps its OWN outcome definition here:
# outcomes from df_old pair with the base scores, outcomes from df_new with the alternative scores.
df_old_score = pd.merge(df_old, old_pred_scores, how='inner', on=['vccsid'])
df_new_score = pd.merge(df_new, new_pred_scores, how='inner', on=['vccsid'])

In [63]:
# Split each scored frame by outcome so both classes can be resampled separately:
# suffix _1 = grad_6years == 1, suffix _0 = grad_6years == 0.
df_old_score_1, df_old_score_0 = (
    df_old_score[df_old_score['grad_6years'] == outcome] for outcome in (1, 0)
)
df_new_score_1, df_new_score_0 = (
    df_new_score[df_new_score['grad_6years'] == outcome] for outcome in (1, 0)
)

In [335]:
# From each variant's metrics file take the 1st column (`model`, used as the join key)
# and the 3rd column (presumably the classification threshold -- confirm against the file).
base_metrics = pd.read_csv(fpath_1 + "/main_eval_metrics.csv").iloc[:, [0, 2]]
alt_metrics = pd.read_csv(fpath_2 + "/main_eval_metrics.csv").iloc[:, [0, 2]]
t_df = base_metrics.merge(alt_metrics, how='inner', on=['model'])

# Map each model name to a tuple: (value from the base file, value from the alternative file).
thresholds = {
    t_df.iloc[row, 0]: tuple(t_df.iloc[row, 1:])
    for row in range(t_df.shape[0])
}

In [347]:
# Bootstrap comparison of the F1 score for the non-graduation class (pos_label=0).
# Each iteration draws a class-balanced resample (equal numbers of graduates and
# non-graduates, with replacement) from each outcome definition and scores all four models.
#
# Fixes relative to the earlier version of this cell:
#   * DataFrame.sample draws from NumPy's RNG, not the stdlib `random` module, so
#     `random.seed` alone left the resampling unseeded. A dedicated RandomState is
#     now passed to every `sample` call, making the run reproducible.
#   * The alternative-definition predictions are now cut at the alternative model's
#     own threshold (thresholds[m][1]) instead of the base model's (thresholds[m][0]).
random.seed(4321)  # kept for any code relying on the stdlib RNG; has no effect on DataFrame.sample
rng = np.random.RandomState(4321)

n = 5000            # number of bootstrap iterations
n_per_class = 5000  # rows drawn per outcome class in every resample
model_names = ['OLS','Logit','RF','XGBoost']

# One tuple per iteration, ordered (OLS base, OLS alt, Logit base, Logit alt, ...).
f1_list = []
for i in range(n):
    if i % 200 == 0:
        print(i)  # coarse progress indicator
    fl = []
    df_old_score_sample = pd.concat([
        df_old_score_1.sample(n_per_class, replace=True, random_state=rng),
        df_old_score_0.sample(n_per_class, replace=True, random_state=rng),
    ])
    df_new_score_sample = pd.concat([
        df_new_score_1.sample(n_per_class, replace=True, random_state=rng),
        df_new_score_0.sample(n_per_class, replace=True, random_state=rng),
    ])
    for m in model_names:
        # thresholds[m] is (base threshold, alternative threshold).
        base_pred = np.where(df_old_score_sample[m] > thresholds[m][0], 1, 0)
        alt_pred = np.where(df_new_score_sample[m] > thresholds[m][1], 1, 0)
        fl.append(f1_score(df_old_score_sample.grad_6years, base_pred, pos_label=0))
        fl.append(f1_score(df_new_score_sample.grad_6years, alt_pred, pos_label=0))
    f1_list.append(tuple(fl))

0
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400
3600
3800
4000
4200
4400
4600
4800


In [358]:
# Collapse the bootstrap draws: rows are iterations, columns are the eight
# (model, variant) combinations in the order they were appended.
f1_draws = np.array(f1_list)

f1_summary = pd.DataFrame(
    {
        # Each model appears twice in a row: once for 'base', once for 'no_nsc'.
        'model': [name for name in ['OLS', 'Logit', 'RF', 'XGBoost'] for _ in range(2)],
        'variant': ['base','no_nsc'] * 4,
        'mean_f1_score_0': f1_draws.mean(axis=0),
        'std': f1_draws.std(axis=0),
    },
    columns=['model','variant','mean_f1_score_0','std'],
)

# Save the rounded summary alongside the alternative model's results.
f1_summary.round(4).to_csv(fpath_2 + "/adjusted_f1_score_0.csv", index=False)

##### A sample comparison for XGBoost models:

In [317]:
# One class-balanced resample (with replacement) under the original outcome definition.
old_grad_draw = df_old_score_1.sample(n=5000, replace=True)
old_nongrad_draw = df_old_score_0.sample(n=5000, replace=True)
df_old_score_sample = pd.concat([old_grad_draw, old_nongrad_draw])

# Binarise the XGBoost score at a hard-coded cutoff (presumably the base model's
# threshold from main_eval_metrics.csv -- confirm), then score the non-graduation class.
xgb_old_pred = np.where(df_old_score_sample['XGBoost'] > 0.3853, 1, 0)
f1_score(df_old_score_sample['grad_6years'], xgb_old_pred, pos_label=0)

0.8345588235294118

In [327]:
# One class-balanced resample (with replacement) under the alternative outcome definition.
new_grad_draw = df_new_score_1.sample(n=5000, replace=True)
new_nongrad_draw = df_new_score_0.sample(n=5000, replace=True)
df_new_score_sample = pd.concat([new_grad_draw, new_nongrad_draw])

# Binarise the XGBoost score at a hard-coded cutoff (presumably the alternative model's
# threshold from main_eval_metrics.csv -- confirm), then score the non-graduation class.
xgb_new_pred = np.where(df_new_score_sample['XGBoost'] > 0.3849, 1, 0)
f1_score(df_new_score_sample['grad_6years'], xgb_new_pred, pos_label=0)

0.8558918713116315