This notebook examines how model performance changes with the number of predictors for the LMS-only, first-term model.

In [1]:
import pickle
import pandas as pd
import numpy as np
from collections import Counter
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve, roc_auc_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import KFold, StratifiedKFold
from scipy.stats.mstats import gmean
import seaborn as sns
import matplotlib.pyplot as plt

# Directory for evaluation results (Windows-style path under the user's home directory).
# NOTE(review): not referenced by any of the cells visible here.
results_dir = "~\\Box Sync\\Clickstream\\evaluation_results\\first\\"

In [2]:
# Load the LMS dataset from its Stata file.
df = pd.read_stata("~\\Box Sync\\Clickstream\\data\\first\\LMS_data_final.dta")
# Candidate predictors: every column from position 5 onward except "grade",
# which is the column used as the prediction target further down.
predictors = [col for col in df.columns[5:] if col != "grade"]
len(predictors)

21

In [3]:
# Sanity check: the loaded frame must not contain any missing values.
assert not pd.isnull(df).any().any()

In [4]:
# Display the (rows, columns) dimensions of the loaded data.
df.shape

(204853, 27)

In [5]:
# Hold out term 2212 as the test set; all other terms form the training set.
# Explicit copies make both frames independent of `df`, so the grade recoding
# below no longer writes into a slice (which raised SettingWithCopyWarning).
train_df = df[df.strm != 2212].copy()
test_df = df[df.strm == 2212].copy()

# Keep the un-recoded grades of the test term before they are binarised.
original_test_grade = np.array(test_df.grade)

# Recode the target: 1 for grades A, B or C, 0 for anything else.
# Vectorised `isin` replaces the row-by-row `apply`, giving the same 0/1 values.
passing_grades = ['A', 'B', 'C']
train_df['grade'] = train_df['grade'].isin(passing_grades).astype('int64')
test_df['grade'] = test_df['grade'].isin(passing_grades).astype('int64')
print(train_df.shape,test_df.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


(181673, 27) (23180, 27)


In [6]:
# Predictor tiers as comma-separated column names. Each key is the cumulative
# number of predictors once that tier and all smaller-keyed tiers are combined.
raw_predictors = {
    5: 'tot_click_cnt_qrt1, tot_time_qrt1, disc_reply_cnt_qtr1, assign_sub_cnt_qtr1, has_assign_sub_cnt_qtr1',
    10: 'disc_post_cnt_qtr1, on_time_assign_share_qtr1, has_on_time_assign_share_qtr1, has_concurrent_qtr1, tot_click_cnt_qrt1c',
    15: 'avg_depth_post_qtr1, tot_time_qrt1c, avg_word_tot_qtr1, irreg_session_len_qrt1, avg_session_len_qrt1',
    21: 'irreg_session_len_qrt1c, assign_sub_cnt_qtr1c, avg_session_len_qrt1c, on_time_assign_share_qtr1c, has_on_time_assign_share_qtr1c, has_assign_sub_cnt_qtr1c',
}

In [7]:
# Split every comma-separated tier string into a list of trimmed column names.
clean_predictors = {
    size: [name.strip() for name in names.split(",")]
    for size, names in raw_predictors.items()
}
num_of_predictors = sorted(clean_predictors)

# For each tier key, concatenate (in ascending key order) all tiers whose key
# does not exceed it, then verify the combined list has exactly `key` entries.
clean_predictors_2 = {}
for size in clean_predictors:
    cumulative = []
    for tier in num_of_predictors:
        if tier <= size:
            cumulative.extend(clean_predictors[tier])
    clean_predictors_2[size] = cumulative
    assert len(clean_predictors_2[size]) == size

In [8]:
def create_cv_folds(train, predictors, n_fold = 5):
    """Split `train` into stratified cross-validation folds.

    The split is stratified on `train.grade`, shuffled, and seeded with a
    fixed random_state (12345).

    Parameters
    ----------
    train : DataFrame with a `grade` column and the columns named in `predictors`.
    predictors : column labels to keep in the feature frames.
    n_fold : number of folds (default 5).

    Returns
    -------
    list with one entry per fold, each of the form
    `[(X_fit, y_fit), (X_val, y_val)]`; all four objects are copies.
    """
    splitter = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=12345)

    def _features_and_target(positions):
        # Select rows by position, then return independent copies of X and y.
        part = train.iloc[positions, :]
        return part.loc[:, predictors].copy(), part.grade.copy()

    return [
        [_features_and_target(fit_rows), _features_and_target(val_rows)]
        for fit_rows, val_rows in splitter.split(train, train.grade)
    ]

In [9]:
def cross_validation_RF(rf_model, folds):
    """Return the mean ROC AUC of `rf_model` across `folds`, rounded to 4 decimals.

    Parameters
    ----------
    rf_model : estimator exposing `fit` and `predict_proba`; it is refitted
        in place on every fold.
    folds : iterable of `[(X_fit, y_fit), (X_val, y_val)]` entries.

    Returns
    -------
    Mean of the per-fold AUCs, rounded to four decimal places.
    """
    fold_scores = []
    for fold in folds:
        fit_part, val_part = fold[0], fold[1]
        rf_model.fit(fit_part[0], fit_part[1])
        # Score with the second column of the predict_proba output.
        val_scores = rf_model.predict_proba(val_part[0])[:, 1]
        fold_scores.append(roc_auc_score(val_part[1], val_scores))
    return round(np.mean(fold_scores), 4)

In [10]:
def calc_cw(y):
    """Compute a class weight for every distinct label in `y`.

    A label's weight is sqrt(n_max / n_label), where n_label is how often the
    label occurs in `y` and n_max is the count of the most frequent label.
    The most frequent label therefore gets weight 1 and rarer labels get
    larger weights.

    Returns
    -------
    dict mapping each label to its weight (np.float32); empty if `y` is empty.
    """
    counts = Counter(y)
    if not counts:
        return {}
    # Largest class size, looked up once rather than once per label.
    largest = max(counts.values())
    return {label: np.sqrt(largest / n, dtype=np.float32) for label, n in counts.items()}

In [11]:
# Class weights depend only on the training labels, so compute them once
# instead of once per forest that gets built.
class_weights = calc_cw(train_df.grade)

# Test-set C-statistic for each predictor-set size, in ascending size order.
cstat_by_num = []
for n in sorted(list(clean_predictors_2.keys())):
    print("n = {}:\n".format(n))
    predictors_list = clean_predictors_2[n]
    five_folds = create_cv_folds(train_df, predictors_list)

    # Depth search: increase max_depth until the cross-validated AUC improves
    # by less than 0.001 over the previous depth; best_d is the last depth
    # that was accepted before that happened.
    auc_by_d = []
    best_d = None
    for d in range(2,36):
        # max_features="sqrt" is what "auto" meant for classifiers; "auto" was
        # deprecated in scikit-learn 1.1 and removed in 1.3.
        rf = RandomForestClassifier(n_estimators=200, criterion="entropy",
                                    max_depth=d,
                                    random_state=0, n_jobs=20, max_features="sqrt",
                                    class_weight = class_weights)
        auc = cross_validation_RF(rf, five_folds)
        auc_by_d.append(auc)
        print("max_depth = {0}: {1}".format(d, auc))
        if d > 2 and auc - auc_by_d[-2] < 0.001:
            break
        best_d = d

    # Refit on the full training set at the selected depth and score the
    # held-out test set.
    rf = RandomForestClassifier(n_estimators=200, criterion="entropy",
                                max_depth=best_d,
                                random_state=0, n_jobs=-1, max_features="sqrt",
                                class_weight = class_weights)
    rf.fit(train_df.loc[:,predictors_list], train_df.grade)
    test_scores = rf.predict_proba(test_df.loc[:,predictors_list])[:,1]
    cstat_by_num.append(round(roc_auc_score(test_df.grade, test_scores),4))
    print("\nC-statistic = {}".format(cstat_by_num[-1]))
    print("\n\n\n")

n = 5:

max_depth = 2: 0.7024
max_depth = 3: 0.7082
max_depth = 4: 0.7104
max_depth = 5: 0.7121
max_depth = 6: 0.713

C-statistic = 0.7297




n = 10:

max_depth = 2: 0.7157
max_depth = 3: 0.7217
max_depth = 4: 0.7292
max_depth = 5: 0.7363
max_depth = 6: 0.7416
max_depth = 7: 0.7459
max_depth = 8: 0.7488
max_depth = 9: 0.7512
max_depth = 10: 0.7527
max_depth = 11: 0.7538
max_depth = 12: 0.7546

C-statistic = 0.7683




n = 15:

max_depth = 2: 0.7158
max_depth = 3: 0.7212
max_depth = 4: 0.7293
max_depth = 5: 0.7365
max_depth = 6: 0.7415
max_depth = 7: 0.7459
max_depth = 8: 0.7495
max_depth = 9: 0.7519
max_depth = 10: 0.7541
max_depth = 11: 0.7557
max_depth = 12: 0.7571
max_depth = 13: 0.758

C-statistic = 0.773




n = 21:

max_depth = 2: 0.7151
max_depth = 3: 0.7223
max_depth = 4: 0.7297
max_depth = 5: 0.7366
max_depth = 6: 0.7419
max_depth = 7: 0.7459
max_depth = 8: 0.7497
max_depth = 9: 0.7524
max_depth = 10: 0.7547
max_depth = 11: 0.7565
max_depth = 12: 0.7582
max_depth = 13: 0.7595

In [12]:
# Test-set C-statistics, one per predictor-set size in ascending order of size.
cstat_by_num

[0.7297, 0.7683, 0.773, 0.7755]