This script is used to find out how model performance changes with the number of predictors for the admin-only, first-term model.

In [1]:
import pickle
import pandas as pd
import numpy as np
from collections import Counter
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve, roc_auc_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import KFold, StratifiedKFold
from scipy.stats.mstats import gmean
import seaborn as sns
import matplotlib.pyplot as plt

# Path string (Windows-style, relative to the user's home via "~"); presumably where
# evaluation results are written.
# NOTE(review): not referenced anywhere else in the visible part of this notebook — confirm it is still needed.
results_dir = "~\\Box Sync\\Clickstream\\evaluation_results\\first\\"

In [2]:
# Lookup from a college's full name to its short abbreviation.
# Insertion order matters: the per-college indicator columns are created
# by iterating over the values of this dict.
sn_dict = {
    "Blue Ridge": "BRCC",
    "Central Virginia": "CVCC",
    "Dabney S. Lancaster": "DSLCC",
    "Danville": "DCC",
    "Eastern Shore": "ESCC",
    "Germanna": "GCC",
    "J. Sargeant Reynolds": "JSRCC",
    "John Tyler": "JTCC",
    "Lord Fairfax": "LFCC",
    "Mountain Empire": "MECC",
    "New River": "NRCC",
    "Northern Virginia": "NVCC",
    "Patrick Henry": "PHCC",
    "Paul D. Camp": "PDCCC",
    "Piedmont Virginia": "PVCC",
    "Rappahannock": "RCC",
    "Southside Virginia": "SSVCC",
    "Southwest Virginia": "SWVCC",
    "Thomas Nelson": "TNCC",
    "Tidewater": "TCC",
    "Virginia Highlands": "VHCC",
    "Virginia Western": "VWCC",
    "Wytheville": "WCC",
}

In [3]:
# Outcome table: keep only the identifying keys and the grade column.
df0 = pd.read_stata("~\\Box Sync\\Clickstream\\data\\first\\LMS_data_final.dta")
df0 = df0.loc[:, ['vccsid', 'strm', 'college', 'course', 'section', 'grade']]

df1 = pd.read_csv("~\\Box Sync\\Clickstream\\data\\first\\course_specific_predictors_new.csv")
df2 = pd.read_csv("~\\Box Sync\\Clickstream\\data\\first\\term_specific_predictors_new.csv")

# Expand `cip` and `degree_level` into 0/1 indicator columns. One value of each
# (cip == 0, degree_level == 4) gets no indicator — presumably the reference category.
cip_values = [int(e) for e in np.unique(df2.cip) if e != 0]
for cip_value in cip_values:
    df2['cip_' + str(cip_value)] = (df2.cip == cip_value).astype(int)
degree_values = [int(e) for e in np.unique(df2.degree_level) if e != 4]
for degree_value in degree_values:
    df2['degree_level_' + str(degree_value)] = (df2.degree_level == degree_value).astype(int)
df2 = df2.drop(columns=['cip', 'degree_level'])

df4 = pd.read_stata("~\\Box Sync\\Clickstream\\data\\first\\instructor_related_predictors.dta")

# College indicators, keyed on the same five identifying columns as df0.
# "BRCC" gets no indicator column.
df5 = df0.iloc[:, :5].copy()
df5['college_new'] = df5.college.apply(lambda full_name: sn_dict[full_name])
for abbrev in sn_dict.values():
    if abbrev == "BRCC":
        continue
    df5['college_' + abbrev] = (df5.college_new == abbrev).astype(int)
df5 = df5.drop(columns=['college_new'])

# Inner-join every predictor table onto the outcome table.
course_keys = ['vccsid', 'strm', 'college', 'course', 'section']
df = (
    df0.merge(df1, how='inner', on=course_keys)
       .merge(df2, how='inner', on=['vccsid', 'strm'])
       .merge(df4, how='inner', on=course_keys)
       .merge(df5, how='inner', on=course_keys)
)

# Everything after the first six columns (five keys + grade) is a predictor.
predictors = df.columns[6:].tolist()
len(predictors)

59

In [4]:
# Sanity check: the merged table must contain no missing values.
assert not df.isnull().any().any()

In [5]:
# (rows, columns) of the merged table
df.shape

(204853, 65)

In [6]:
# Hold out term 2212 as the test set; every other term is used for training.
# Explicit .copy() gives each subset its own data, so re-coding `grade` below
# no longer writes into a slice of `df` (the source of SettingWithCopyWarning).
train_df = df[df.strm != 2212].copy()
test_df = df[df.strm == 2212].copy()

# Keep the untouched test-set grade values before they are binarised.
original_test_grade = np.array(test_df.grade)

# Binary outcome: 1 for grades A/B/C, 0 for anything else.
# Vectorised `isin` replaces the row-wise `apply`, which was slow on ~200k rows.
passing_grades = ['A', 'B', 'C']
train_df['grade'] = train_df['grade'].isin(passing_grades).astype(int)
test_df['grade'] = test_df['grade'].isin(passing_grades).astype(int)
print(train_df.shape,test_df.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


(181673, 65) (23180, 65)


In [7]:
# Distinct student IDs (vccsid) in the training and test samples
print(train_df.vccsid.nunique(), test_df.vccsid.nunique())

63603 8196


In [8]:
# Distinct `course` values in the training and test samples
print(train_df.course.nunique(), test_df.course.nunique())

1399 966


In [9]:
# Distinct (college, course) pairs in the training and test samples
print(len(train_df[['college', 'course']].drop_duplicates()),
      len(test_df[['college', 'course']].drop_duplicates()))

4457 2615


In [10]:
# Distinct (college, course) pairs across the training and test samples combined
college_course_all = pd.concat([train_df[['college', 'course']],
                                test_df[['college', 'course']]])
print(len(college_course_all.drop_duplicates()))

5013


In [11]:
# Distinct `course` values across the training and test samples combined
len(set(train_df.course) | set(test_df.course))

1556

In [12]:
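# Inactive earlier grouping (steps of ten, 59 predictors in total), kept commented out for reference;
# the active `raw_predictors` definition is in the following cell.
# raw_predictors = {\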
# 10: 'crnt_enrl_intensity, avg_g_concurrent, has_avg_g_concurrent, past_avg_grade, has_past_avg_grade, avg_g, section_size, age, lvl2_share, online_share',
# 20: 'eve_share, summer_ind, full_time, tenure, dev, degree_level_1, college_NVCC, cip_24, online_ind, college_TCC',
# 30: 'lvl2_ind, cip_52, degree_level_2, cip_30, cip_11, eve_ind, college_JSRCC, cip_51, college_JTCC, degree_level_3',
# 40: 'college_GCC, cip_45, college_TNCC, cip_14, college_LFCC, cip_50, cip_43, college_VWCC, college_CVCC, cip_99',
# 50: 'college_NRCC, college_SWVCC, college_PVCC, cip_15, cip_47, college_PHCC, cip_42, college_VHCC, cip_19, college_DCC',
# 59: 'college_MECC, college_RCC, college_PDCCC, college_WCC, college_SSVCC, cip_48, college_DSLCC, cip_46, college_ESCC'}

In [20]:
# Predictor names, grouped by the model size at which each group is introduced.
# Each value is a comma-separated string of column names; each key is the cumulative
# predictor count once that group and all smaller-keyed groups are included.
raw_predictors = {
    5: 'crnt_enrl_intensity, avg_g_concurrent, has_avg_g_concurrent, past_avg_grade, has_past_avg_grade',
    15: 'avg_g, section_size, age, lvl2_share, online_share, eve_share, summer_ind, full_time, tenure, dev',
    25: 'degree_level_1, college_NVCC, cip_24, online_ind, college_TCC, lvl2_ind, cip_52, degree_level_2, cip_30, cip_11',
    35: 'eve_ind, college_JSRCC, cip_51, college_JTCC, degree_level_3, college_GCC, cip_45, college_TNCC, cip_14, college_LFCC',
    45: 'cip_50, cip_43, college_VWCC, college_CVCC, cip_99, college_NRCC, college_SWVCC, college_PVCC, cip_15, cip_47',
}

In [21]:
# Split each comma-separated string into a list of trimmed predictor names.
clean_predictors = {
    size: [name.strip() for name in names.split(",")]
    for size, names in raw_predictors.items()
}
num_of_predictors = sorted(clean_predictors)

# Cumulative predictor lists: the entry for size k holds every group whose key is <= k,
# concatenated in ascending key order. Each list must contain exactly k names.
clean_predictors_2 = {}
cumulative = []
for size in num_of_predictors:
    cumulative = cumulative + clean_predictors[size]  # new list each step, no aliasing
    clean_predictors_2[size] = cumulative
    assert len(clean_predictors_2[size]) == size

In [22]:
def create_cv_folds(train, predictors, n_fold = 5):
    """Split `train` into stratified cross-validation folds.

    Rows are shuffled with a fixed seed (12345) and stratified on `train.grade`.

    Parameters
    ----------
    train : DataFrame with a `grade` column and all columns named in `predictors`.
    predictors : list of column names used as features.
    n_fold : number of folds (default 5).

    Returns
    -------
    list with one entry per fold, each of the form
    [(X_fit, y_fit), (X_holdout, y_holdout)]; all pieces are independent copies.
    """
    splitter = StratifiedKFold(n_splits=n_fold, random_state=12345, shuffle=True)
    folds = []
    for fit_idx, holdout_idx in splitter.split(train, train.grade):
        fit_rows = train.iloc[fit_idx, :]
        holdout_rows = train.iloc[holdout_idx, :]
        fit_pair = (fit_rows.loc[:, predictors].copy(), fit_rows.grade.copy())
        holdout_pair = (holdout_rows.loc[:, predictors].copy(), holdout_rows.grade.copy())
        folds.append([fit_pair, holdout_pair])
    return folds

In [23]:
def cross_validation_RF(rf_model, folds):
    """Return the mean ROC AUC of `rf_model` across the given folds, rounded to 4 decimals.

    Parameters
    ----------
    rf_model : estimator exposing `fit(X, y)` and `predict_proba(X)`; it is re-fitted
        in place on every fold.
    folds : sequence of [(X_fit, y_fit), (X_holdout, y_holdout)] entries, as produced
        by `create_cv_folds`.
    """
    fold_aucs = []
    for (X_fit, y_fit), (X_holdout, y_holdout) in folds:
        rf_model.fit(X_fit, y_fit)
        # Score with the second column of predict_proba.
        holdout_scores = rf_model.predict_proba(X_holdout)[:, 1]
        fold_aucs.append(roc_auc_score(y_holdout, holdout_scores))
    mean_auc = np.mean(fold_aucs)
    return round(mean_auc, 4)

In [24]:
def calc_cw(y):
    """Compute class weights inversely proportional to the square root of class frequency.

    The weight of a label is sqrt(count of the most frequent label / count of this label),
    so the most frequent label gets weight 1.0 and rarer labels get larger weights.

    Parameters
    ----------
    y : iterable of hashable labels (e.g. the training outcome column).

    Returns
    -------
    dict mapping each distinct label to its weight (np.float32). Empty input
    yields an empty dict.
    """
    counts = Counter(y)
    if not counts:
        return {}
    # Computed once: the original re-sorted the counter for every label.
    largest_count = max(counts.values())
    return {label: np.sqrt(largest_count / count, dtype=np.float32)
            for label, count in counts.items()}

In [25]:
# For each predictor-set size: pick the random-forest depth by 5-fold CV on the training
# sample, refit on the full training sample, and record the test-set C-statistic (ROC AUC).

DEPTH_CANDIDATES = range(2, 36)   # max_depth values tried, in increasing order
MIN_AUC_GAIN = 0.001              # stop deepening once CV AUC improves by less than this

# The class weights depend only on the training labels, so compute them once
# instead of once per candidate model.
grade_class_weight = calc_cw(train_df.grade)

cstat_by_num = []
for n in sorted(list(clean_predictors_2.keys())):
    print("n = {}:\n".format(n))
    predictors_list = clean_predictors_2[n]
    five_folds = create_cv_folds(train_df, predictors_list)
    auc_by_d = []
    best_d = None
    for d in DEPTH_CANDIDATES:
        # max_features="sqrt" is what the deprecated "auto" meant for classifiers;
        # "auto" is rejected by newer scikit-learn releases.
        rf = RandomForestClassifier(n_estimators=200, criterion="entropy",
                                    max_depth=d,
                                    random_state=0, n_jobs=20, max_features="sqrt",
                                    class_weight=grade_class_weight)
        auc = cross_validation_RF(rf, five_folds)
        auc_by_d.append(auc)
        print("max_depth = {0}: {1}".format(d, auc))
        # Early stopping: keep the previous depth if this one did not improve the
        # CV AUC by at least MIN_AUC_GAIN. The first candidate is always accepted.
        if len(auc_by_d) > 1 and auc - auc_by_d[-2] < MIN_AUC_GAIN:
            break
        best_d = d
    # Refit at the selected depth on the full training sample and score the held-out term.
    rf = RandomForestClassifier(n_estimators=200, criterion="entropy",
                                max_depth=best_d,
                                random_state=0, n_jobs=-1, max_features="sqrt",
                                class_weight=grade_class_weight)
    rf.fit(train_df.loc[:,predictors_list], train_df.grade)
    test_scores = rf.predict_proba(test_df.loc[:,predictors_list])[:,1]
    cstat_by_num.append(round(roc_auc_score(test_df.grade, test_scores),4))
    print("\nC-statistic = {}".format(cstat_by_num[-1]))
    print("\n\n\n")

n = 5:

max_depth = 2: 0.6678
max_depth = 3: 0.676
max_depth = 4: 0.6817
max_depth = 5: 0.6848
max_depth = 6: 0.6875
max_depth = 7: 0.6908
max_depth = 8: 0.694
max_depth = 9: 0.6973
max_depth = 10: 0.7005
max_depth = 11: 0.7032
max_depth = 12: 0.7059
max_depth = 13: 0.7084
max_depth = 14: 0.7103
max_depth = 15: 0.7118
max_depth = 16: 0.713
max_depth = 17: 0.7135

C-statistic = 0.6913




n = 15:

max_depth = 2: 0.6876
max_depth = 3: 0.7044
max_depth = 4: 0.7119
max_depth = 5: 0.7204
max_depth = 6: 0.725
max_depth = 7: 0.7292
max_depth = 8: 0.7335
max_depth = 9: 0.7375
max_depth = 10: 0.7411
max_depth = 11: 0.7447
max_depth = 12: 0.7482
max_depth = 13: 0.7518
max_depth = 14: 0.7552
max_depth = 15: 0.7583
max_depth = 16: 0.7608
max_depth = 17: 0.7638
max_depth = 18: 0.7657
max_depth = 19: 0.7681
max_depth = 20: 0.7695
max_depth = 21: 0.7709
max_depth = 22: 0.7721
max_depth = 23: 0.7727

C-statistic = 0.7212




n = 25:

max_depth = 2: 0.6885
max_depth = 3: 0.7031
max_depth = 4: 0.7142
ma

In [27]:
# Test-set C-statistics, one per predictor-set size, in ascending order of n
cstat_by_num

[0.6913, 0.7212, 0.7247, 0.7267, 0.7268]