This script is used to find out how model performance changes with number of predictors for the admin-only, non-first-term model.

In [1]:
import pickle
import pandas as pd
import numpy as np
from collections import Counter
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve, roc_auc_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import KFold, StratifiedKFold
from scipy.stats.mstats import gmean
import seaborn as sns
import matplotlib.pyplot as plt

results_dir = "~\\Box Sync\\Clickstream\\evaluation_results\\full\\"

In [2]:
sn_dict = {"Blue Ridge": "BRCC",
           "Central Virginia": "CVCC",
           "Dabney S. Lancaster": "DSLCC",
           "Danville": "DCC",
           "Eastern Shore": "ESCC",
           "Germanna": "GCC",
           'J. Sargeant Reynolds': "JSRCC",
           'John Tyler': "JTCC",
           "Lord Fairfax": "LFCC",
           "Mountain Empire": "MECC",
           "New River": "NRCC",
           "Northern Virginia": "NVCC",
           "Patrick Henry": "PHCC",
           "Paul D. Camp": "PDCCC",
           "Piedmont Virginia": "PVCC",
           "Rappahannock": "RCC",
           "Southside Virginia": "SSVCC",
           "Southwest Virginia": "SWVCC",
           "Thomas Nelson": "TNCC",
           "Tidewater": "TCC",
           "Virginia Highlands": "VHCC",
           "Virginia Western": "VWCC",
           "Wytheville": "WCC"}

In [3]:
df0 = pd.read_stata("~\\Box Sync\\Clickstream\\data\\full\\LMS_data_final.dta").loc[:,['vccsid','strm', 'college', 'course','section','grade']]
df1 = pd.read_csv("~\\Box Sync\\Clickstream\\data\\full\\course_specific_predictors_new.csv")
df2 = pd.read_csv("~\\Box Sync\\Clickstream\\data\\full\\term_specific_predictors_new.csv")
for v in [int(e) for e in np.unique(df2.cip) if e != 0]:
    df2.loc[:,'cip_'+str(v)] = (df2.cip == v).astype(int)
for v in [int(e) for e in np.unique(df2.degree_level) if e != 4]:
    df2.loc[:,'degree_level_'+str(v)] = (df2.degree_level == v).astype(int)
df2 = df2.drop(['cip', 'degree_level'], axis=1)
df3 = pd.read_csv("~\\Box Sync\\Clickstream\\data\\full\\cluster_specific_predictors.csv")
df4 = pd.read_stata("~\\Box Sync\\Clickstream\\data\\full\\instructor_related_predictors.dta")
df5 = df0.iloc[:,:5].copy()
df5.loc[:,'college_new'] = df5.college.apply(lambda x: sn_dict[x])
for sn in [e for e in sn_dict.values() if e != "BRCC"]:
    df5.loc[:,'college_'+sn] = (df5.college_new == sn).astype(int)
df5 = df5.drop(['college_new'], axis=1)
df = df0.merge(df1, how='inner', on=['vccsid','strm','college','course','section'])\
.merge(df2, how='inner', on=['vccsid','strm'])\
.merge(df3, how='inner', on=['vccsid','strm','college','course','section'])\
.merge(df4, how='inner', on=['vccsid','strm','college','course','section'])\
.merge(df5, how='inner', on=['vccsid','strm','college','course','section'])
predictors = list(df.columns)[6:]
len(predictors)

279

In [4]:
assert pd.isnull(df).any().any() == False

In [5]:
df.shape

(969025, 285)

In [6]:
train_df = df[df.strm != 2212]
test_df = df[df.strm == 2212]
original_test_grade = np.array(test_df.grade)
train_df.loc[:,'grade'] = train_df.apply(lambda x: 1 if x.loc['grade'] in {'A','B','C'} else 0, axis=1)
test_df.loc[:,'grade'] = test_df.apply(lambda x: 1 if x.loc['grade'] in {'A','B','C'} else 0, axis=1)
print(train_df.shape,test_df.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


(698361, 285) (270664, 285)


In [7]:
# Number of unique students in the sample
print(len(np.unique(train_df.vccsid)), len(np.unique(test_df.vccsid)))

164245 87022


In [8]:
print(len(np.unique(train_df.course)), len(np.unique(test_df.course)))

2246 1989


In [9]:
# Number of unique college x course observations in the sample
print(train_df.loc[:,['college', 'course']].drop_duplicates().shape[0],
      test_df.loc[:,['college', 'course']].drop_duplicates().shape[0])

7061 6037


In [10]:
# Total number of unique college x course observations in the entire sample (training + test)
print(pd.concat([train_df.loc[:,['college', 'course']], test_df.loc[:,['college', 'course']]]).drop_duplicates().shape[0])

9098


In [11]:
len(np.union1d(np.unique(train_df.course), np.unique(test_df.course)))

2639

In [12]:
# raw_predictors = {\
# 10: 'pct_withdrawn, crnt_enrl_intensity, cum_gpa, term_gpa_1, has_term_gpa_1, past_avg_grade, has_past_avg_grade, avg_g, term_gpa_2, has_term_gpa_2',
# 20: 'overall_prop_comp, avg_g_concurrent, has_avg_g_concurrent, gpa_trend, cum_cred_earn, prop_comp_sd, section_size, enrl_intensity_trend, enrl_intensity, age',
# 30: 'pct_stopped, lvl2_share, pct_dev, num_of_prior_terms, online_share, eve_share, HUM_SOC_grade, HUM_SOC, HUM_HUM_grade, HUM_HUM',
# 40: 'prereq_grade, has_prereq_grade, pct_incomplete, SOC_SOC_grade, SOC_SOC, full_time, HUM_SCI_grade, HUM_SCI, online_ind, summer_ind',
# 50: 'tenure, college_NVCC, SOC_HUM_grade, SOC_HUM, HUM_MTH_grade, HUM_MTH, SOC_SCI_grade, SOC_SCI, lvl2_ind, cip_24',
# 60: 'ever_dual, degree_level_1, SCI_SCI_grade, SCI_SCI, MTH_SCI_grade, MTH_SCI, MTH_SOC_grade, MTH_SOC, MTH_HUM_grade, MTH_HUM',
# 70: 'HUM_EGR_grade, HUM_EGR, has_repeat_grade, repeat_grade, college_TCC, HUM_MED_grade, HUM_MED, EGR_EGR_grade, EGR_EGR, degree_level_2',
# 80: 'SOC_MTH_grade, SOC_MTH, SCI_SOC_grade, SCI_SOC, SCI_MED_grade, SCI_MED, MTH_MTH_grade, MTH_MTH, EGR_SOC_grade, EGR_SOC',
# 90: 'cip_52, EGR_HUM_grade, EGR_HUM, SCI_HUM_grade, SCI_HUM, MED_MED_grade, MED_MED, cip_30, MTH_EGR_grade, MTH_EGR',
# 100: 'SOC_EGR_grade, SOC_EGR, SOC_MED_grade, SOC_MED, eve_ind, EGR_SCI_grade, EGR_SCI, MED_SCI_grade, MED_SCI, dev',
# 110: 'MED_SOC_grade, MED_SOC, cip_11, MED_HUM_grade, MED_HUM, SCI_MTH_grade, SCI_MTH, cip_45, cip_51, college_JSRCC',
# 120: 'EGR_MTH_grade, EGR_MTH, HUM_BUS_grade, HUM_BUS, college_TNCC, college_JTCC, college_GCC, degree_level_3, MTH_MED_grade, MTH_MED',
# 130: 'HUM_ART_grade, HUM_ART, BUS_BUS_grade, BUS_BUS, ART_SOC_grade, ART_SOC, SOC_BUS_grade, SOC_BUS, ART_SCI_grade, ART_SCI',
# 140: 'BUS_SOC_grade, BUS_SOC, college_LFCC, SCI_EGR_grade, SCI_EGR, BUS_HUM_grade, BUS_HUM, ART_HUM_grade, ART_HUM, college_VWCC',
# 150: 'MTH_BUS_grade, MTH_BUS, cip_14, MED_EGR_grade, MED_EGR, college_PVCC, MED_MTH_grade, MED_MTH, EGR_BUS_grade, EGR_BUS',
# 160: 'SOC_ART_grade, SOC_ART, BUS_SCI_grade, BUS_SCI, FLA_HUM_grade, FLA_HUM, cip_43, BUS_EGR_grade, BUS_EGR, college_NRCC',
# 170: 'ART_EGR_grade, ART_EGR, ART_MTH_grade, ART_MTH, BUS_MTH_grade, BUS_MTH, EGR_MED_grade, EGR_MED, FLA_SCI_grade, FLA_SCI',
# 180: 'FLA_SOC_grade, FLA_SOC, MTH_ART_grade, MTH_ART, college_SWVCC, cip_50, college_CVCC, cip_99, college_PHCC, college_DCC',
# 190: 'SCI_BUS_grade, SCI_BUS, OCC_OCC_grade, OCC_OCC, ART_ART_grade, ART_ART, cip_15, college_SSVCC, FLA_MTH_grade, FLA_MTH',
# 200: 'MED_BUS_grade, MED_BUS, college_MECC, college_RCC, college_VHCC, college_WCC, EGR_ART_grade, EGR_ART, HUM_OCC_grade, HUM_OCC',
# 210: 'SCI_ART_grade, SCI_ART, FLA_EGR_grade, FLA_EGR, OCC_HUM_grade, OCC_HUM, cip_19, HUM_FLA_grade, HUM_FLA, cip_47',
# 220: 'ART_BUS_grade, ART_BUS, MED_ART_grade, MED_ART, OCC_EGR_grade, OCC_EGR, ART_MED_grade, ART_MED, SOC_FLA_grade, SOC_FLA',
# 230: 'BUS_MED_grade, BUS_MED, OCC_SOC_grade, OCC_SOC, college_PDCCC, cip_42, OCC_SCI_grade, OCC_SCI, MTH_OCC_grade, MTH_OCC',
# 240: 'MTH_FLA_grade, MTH_FLA, BUS_ART_grade, BUS_ART, SOC_OCC_grade, SOC_OCC, EGR_OCC_grade, EGR_OCC, college_DSLCC, college_ESCC',
# 250: 'OCC_MTH_grade, OCC_MTH, FLA_BUS_grade, FLA_BUS, FLA_MED_grade, FLA_MED, SCI_FLA_grade, SCI_FLA, FLA_FLA_grade, FLA_FLA',
# 259: 'cip_48, EGR_FLA_grade, EGR_FLA, FLA_ART_grade, FLA_ART, MED_FLA_grade, MED_FLA, SCI_OCC_grade, SCI_OCC',
# 269: 'OCC_MED_grade, OCC_MED, MED_OCC_grade, MED_OCC, OCC_BUS_grade, OCC_BUS, ART_FLA_grade, ART_FLA, OCC_ART_grade, OCC_ART',
# 279: 'BUS_FLA_grade, BUS_FLA, BUS_OCC_grade, BUS_OCC, ART_OCC_grade, ART_OCC, FLA_OCC_grade, FLA_OCC, OCC_FLA_grade, OCC_FLA'}

In [13]:
raw_predictors = {\
5: 'pct_withdrawn, crnt_enrl_intensity, cum_gpa, term_gpa_1, has_term_gpa_1',
15: 'past_avg_grade, has_past_avg_grade, avg_g, term_gpa_2, has_term_gpa_2, overall_prop_comp, avg_g_concurrent, has_avg_g_concurrent, gpa_trend, cum_cred_earn',
25: 'prop_comp_sd, section_size, enrl_intensity_trend, enrl_intensity, age, pct_stopped, lvl2_share, pct_dev, num_of_prior_terms, online_share',
35: 'eve_share, HUM_SOC_grade, HUM_SOC, HUM_HUM_grade, HUM_HUM, prereq_grade, has_prereq_grade, pct_incomplete, SOC_SOC_grade, SOC_SOC',
45: 'full_time, HUM_SCI_grade, HUM_SCI, online_ind, summer_ind, tenure, college_NVCC, SOC_HUM_grade, SOC_HUM, lvl2_ind'}

In [14]:
clean_predictors = {}
for k,v in raw_predictors.items():
    clean_predictors[k] = [p.strip() for p in v.split(",")]
num_of_predictors = sorted(list(clean_predictors.keys()))
clean_predictors_2 = {}
for k,v in clean_predictors.items():
    clean_predictors_2[k] = []
    for n in num_of_predictors:
        if n <= k:
            clean_predictors_2[k] += clean_predictors[n]
    assert len(clean_predictors_2[k]) == k

In [15]:
def create_cv_folds(train, predictors, n_fold = 5):
    folds = []
    k_fold = StratifiedKFold(n_splits = n_fold, random_state = 12345, shuffle=True)
    for train_indices, test_indices in k_fold.split(train, train.grade):
        train_part = train.iloc[train_indices,:]
        test_part = train.iloc[test_indices,:]
        X_1 = train_part.loc[:,predictors]
        y_1 = train_part.grade
        X_2 = test_part.loc[:,predictors]
        y_2 = test_part.grade
        folds.append([(X_1.copy(),y_1.copy()),(X_2.copy(),y_2.copy())])
    return folds

In [16]:
def cross_validation_RF(rf_model, folds):
    auc_by_fold = []
    for f in folds:
        X_1 = f[0][0]
        y_1 = f[0][1]
        X_2 = f[1][0]
        y_2 = f[1][1]
        rf_model.fit(X_1,y_1)
        y_2_pred = rf_model.predict_proba(X_2)[:,1]
        auc_by_fold.append(roc_auc_score(y_2,y_2_pred))
    return round(np.mean(auc_by_fold),4)  

In [17]:
def calc_cw(y):
    # Calculate the weight of each letter grade to be used in the modeling fitting procedure: the weight is inversely proportional to the square root of the frequency of the letter grade in the training sample
    cw = Counter(y)
    class_weight = {k:np.sqrt(cw.most_common()[0][-1]/v, dtype=np.float32) for k,v in cw.items()}
    return class_weight # The output is a dictionary mapping letter grade to the corresponding weight

In [18]:
cstat_by_num = []
for n in sorted(list(clean_predictors_2.keys())):
    print("n = {}:\n".format(n))
    predictors_list = clean_predictors_2[n]
    five_folds = create_cv_folds(train_df, predictors_list)
    auc_by_d=[]
    for d in range(5,36):
        rf = RandomForestClassifier(n_estimators=200, criterion="entropy", 
                                    max_depth=d,
                                    random_state=0, n_jobs=20, max_features="auto",
                                    class_weight = calc_cw(train_df.grade))
        auc = cross_validation_RF(rf, five_folds)
        auc_by_d.append(auc)
        print("max_depth = {0}: {1}".format(d, auc))
        if d > 5:
            if auc - auc_by_d[-2] < 0.001:
                break
            else:
                best_d = d
        else:
            best_d = d
    rf = RandomForestClassifier(n_estimators=200, criterion="entropy",
                                max_depth=best_d,
                                random_state=0, n_jobs=-1, max_features="auto",
                                class_weight = calc_cw(train_df.grade))
    rf.fit(train_df.loc[:,predictors_list], train_df.grade)
    cstat_by_num.append(round(roc_auc_score(test_df.grade, rf.predict_proba(test_df.loc[:,predictors_list])[:,1]),4))
    print("\nC-statistic = {}".format(cstat_by_num[-1]))
    print("\n\n\n")

n = 5:

max_depth = 5: 0.7997
max_depth = 6: 0.8017
max_depth = 7: 0.8037
max_depth = 8: 0.8055
max_depth = 9: 0.8074
max_depth = 10: 0.8094
max_depth = 11: 0.8118
max_depth = 12: 0.8147
max_depth = 13: 0.818
max_depth = 14: 0.8217
max_depth = 15: 0.8257
max_depth = 16: 0.8298
max_depth = 17: 0.8336
max_depth = 18: 0.837
max_depth = 19: 0.8399
max_depth = 20: 0.8418
max_depth = 21: 0.8431
max_depth = 22: 0.8438

C-statistic = 0.8216




n = 15:

max_depth = 5: 0.8104
max_depth = 6: 0.815
max_depth = 7: 0.8188
max_depth = 8: 0.8225
max_depth = 9: 0.8256
max_depth = 10: 0.8283
max_depth = 11: 0.831
max_depth = 12: 0.8334
max_depth = 13: 0.836
max_depth = 14: 0.8389
max_depth = 15: 0.8418
max_depth = 16: 0.845
max_depth = 17: 0.848
max_depth = 18: 0.851
max_depth = 19: 0.8541
max_depth = 20: 0.857
max_depth = 21: 0.8593
max_depth = 22: 0.8613
max_depth = 23: 0.863
max_depth = 24: 0.8646
max_depth = 25: 0.8657
max_depth = 26: 0.8667
max_depth = 27: 0.8675

C-statistic = 0.8437




n = 25:


In [19]:
cstat_by_num

[0.8216, 0.8437, 0.8513, 0.8529, 0.854]