This notebook examines how model performance changes with the number of predictors for the LMS-only, non-first-term model.

In [12]:
import pickle
import pandas as pd
import numpy as np
from collections import Counter
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve, roc_auc_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import KFold, StratifiedKFold
from scipy.stats.mstats import gmean
import seaborn as sns
import matplotlib.pyplot as plt

# Directory prefix for the CSV files this notebook writes (Windows-style path under the user's home directory).
results_dir = "~\\Box Sync\\Clickstream\\evaluation_results\\full\\"

In [13]:
# Load the LMS dataset. The candidate predictors are all columns after the
# first five, excluding the outcome column "grade".
df = pd.read_csv("~\\Box Sync\\Clickstream\\data\\full\\LMS_data_final_full_new.csv")
predictors = [col for col in df.columns[5:] if col != "grade"]
len(predictors)

50

In [52]:
# Sanity check: the loaded data must not contain any missing values.
assert not df.isnull().any().any()

In [53]:
# Display the (rows, columns) dimensions of the loaded data.
df.shape

(969025, 56)

In [5]:
# Hold out the rows with strm == 2212 (presumably the most recent term code --
# confirm) as the test set; every other row is used for training.
# .copy() gives each split its own data, so the relabeling below no longer
# writes into a slice of `df` (which raised SettingWithCopyWarning).
train_df = df[df.strm != 2212].copy()
test_df = df[df.strm == 2212].copy()
# Keep the original test-set grades before they are binarized.
original_test_grade = np.array(test_df.grade)
# Binary outcome: 1 for grades A/B/C, 0 otherwise. A vectorized isin replaces
# the row-wise apply and yields a proper integer column.
passing_grades = ['A', 'B', 'C']
train_df['grade'] = train_df['grade'].isin(passing_grades).astype(int)
test_df['grade'] = test_df['grade'].isin(passing_grades).astype(int)
print(train_df.shape,test_df.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


(698361, 56) (270664, 56)


In [6]:
# raw_predictors = {\
# 10: 'tot_click_cnt_qrt1, tot_time_qrt1, assign_sub_cnt_qtr1, has_assign_sub_cnt_qtr1, disc_post_cnt_qtr1, disc_reply_cnt_qtr1, on_time_assign_share_qtr1, has_on_time_assign_share_qtr1, avg_depth_post_qtr1, avg_word_tot_qtr1',
# 20: 'has_concurrent_qtr1, tot_click_cnt_qrt1c, tot_time_qrt1c, prior_disc_post_cnt, irreg_session_len_qrt1, prior_has_full, prior_tot_click_cnt, prior_tot_act_day_cnt, prior_disc_reply_cnt, avg_session_len_qrt1',
# 30: 'prior_tot_act_wk_cnt, assign_sub_cnt_qtr1c, has_assign_sub_cnt_qtr1c, irreg_session_len_qrt1c, prior_on_time_assign_share, prior_has_on_time_assign_share, avg_session_len_qrt1c, prior_has_qtr1, prior_tot_click_cnt_qrt1, prior_tot_time',
# 40: 'prior_tot_session_cnt, prior_disc_reply_cnt_qtr1, prior_assign_sub_cnt, prior_has_assign_sub_cnt, prior_disc_post_cnt_qtr1, prior_tot_time_qrt1, prior_avg_word_tot, prior_avg_depth_post, prior_avg_depth_post_qtr1, prior_avg_word_tot_qtr1',
# 50: 'prior_on_time_assign_share_qtr1, prior_irreg_session_len, prior_avg_session_len, prior_irreg_session_len_qrt1, prior_avg_session_len_qrt1, prior_assign_sub_cnt_qtr1, on_time_assign_share_qtr1c, has_on_time_assign_share_qtr1c, prior_has_on_time_assign_share_qtr1, prior_has_assign_sub_cnt_qtr1'}

In [74]:
# Predictor groups as comma-separated column names. Each key is the cumulative
# number of predictors once that group and all groups with smaller keys are
# included (the next cell asserts this), so the keys define the model sizes
# that are compared: 5, 15, 25, 35 and 45 predictors.
raw_predictors = {\
5: 'tot_click_cnt_qrt1, tot_time_qrt1, assign_sub_cnt_qtr1, has_assign_sub_cnt_qtr1, disc_post_cnt_qtr1',
15: 'disc_reply_cnt_qtr1, on_time_assign_share_qtr1, has_on_time_assign_share_qtr1, avg_depth_post_qtr1, has_concurrent_qtr1, tot_click_cnt_qrt1c, avg_word_tot_qtr1, tot_time_qrt1c, prior_has_full, prior_disc_post_cnt',
25: 'irreg_session_len_qrt1, prior_tot_click_cnt, prior_tot_act_day_cnt, prior_disc_reply_cnt, avg_session_len_qrt1, prior_tot_act_wk_cnt, assign_sub_cnt_qtr1c, has_assign_sub_cnt_qtr1c, irreg_session_len_qrt1c, avg_session_len_qrt1c',
35: 'prior_on_time_assign_share, prior_has_on_time_assign_share, prior_has_qtr1, prior_tot_click_cnt_qrt1, prior_tot_time, prior_tot_session_cnt, prior_disc_reply_cnt_qtr1, prior_assign_sub_cnt, prior_has_assign_sub_cnt, prior_disc_post_cnt_qtr1',
45: 'prior_tot_time_qrt1, prior_avg_word_tot, prior_avg_depth_post, prior_avg_depth_post_qtr1, prior_avg_word_tot_qtr1, prior_on_time_assign_share_qtr1, prior_has_on_time_assign_share_qtr1, prior_irreg_session_len, prior_avg_session_len, prior_irreg_session_len_qrt1'}

In [75]:
# Split each comma-separated group into a list of stripped column names.
clean_predictors = {
    size: [name.strip() for name in group.split(",")]
    for size, group in raw_predictors.items()
}
num_of_predictors = sorted(clean_predictors)

# For every size, concatenate the groups of all sizes up to and including it,
# in ascending key order, and check that the key equals the cumulative count.
clean_predictors_2 = {}
for size in clean_predictors:
    cumulative = []
    for smaller_size in num_of_predictors:
        if smaller_size <= size:
            cumulative.extend(clean_predictors[smaller_size])
    clean_predictors_2[size] = cumulative
    assert len(cumulative) == size

In [76]:
def create_cv_folds(train, predictors, n_fold = 5):
    """Split `train` into stratified cross-validation folds.

    The split is stratified on `train.grade`, shuffled, and seeded with a
    fixed random_state (12345) so the folds are reproducible.

    Parameters:
        train: DataFrame containing the predictor columns and a `grade` column.
        predictors: column labels to keep in the feature matrices.
        n_fold: number of folds (default 5).

    Returns:
        A list with one entry per fold, each of the form
        [(X_fit, y_fit), (X_val, y_val)], where every element is an
        independent copy of the corresponding rows of `train`.
    """
    splitter = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=12345)

    def _features_and_target(row_positions):
        # Positional row selection, then label-based column selection.
        part = train.iloc[row_positions, :]
        return (part.loc[:, predictors].copy(), part.grade.copy())

    return [
        [_features_and_target(fit_rows), _features_and_target(val_rows)]
        for fit_rows, val_rows in splitter.split(train, train.grade)
    ]

In [77]:
def cross_validation_RF(rf_model, folds):
    """Return the mean validation AUC of `rf_model` across `folds`.

    Parameters:
        rf_model: estimator exposing `fit` and `predict_proba`; it is refit
            on the fitting part of every fold.
        folds: sequence of [(X_fit, y_fit), (X_val, y_val)] entries, as
            produced by `create_cv_folds`.

    Returns:
        The average ROC AUC over the folds, rounded to 4 decimals.
    """
    fold_scores = []
    for (X_fit, y_fit), (X_val, y_val) in folds:
        rf_model.fit(X_fit, y_fit)
        # Column 1 of predict_proba is the probability of the second class.
        positive_proba = rf_model.predict_proba(X_val)[:, 1]
        fold_scores.append(roc_auc_score(y_val, positive_proba))
    return round(np.mean(fold_scores), 4)

In [78]:
def calc_cw(y):
    """Compute class weights inversely proportional to the square root of class frequency.

    The most frequent class gets weight 1.0; every other class gets
    sqrt(count of the most frequent class / count of that class).

    Parameters:
        y: iterable of hashable class labels (e.g. the training outcome column).

    Returns:
        dict mapping each distinct label to its weight (numpy float32).
        An empty input yields an empty dict.
    """
    cw = Counter(y)
    # The largest class count is the same for every label, so compute it once
    # instead of re-sorting the counter inside the comprehension.
    max_count = max(cw.values(), default=0)
    return {label: np.sqrt(max_count / count, dtype=np.float32) for label, count in cw.items()}

In [79]:
# For each predictor-set size: tune max_depth with 5-fold cross-validation on
# the training data, refit on the full training data with the chosen depth,
# and record the C-statistic (ROC AUC) on the held-out test data.

# The class weights depend only on the training labels, so compute them once
# rather than for every model that is built.
grade_class_weight = calc_cw(train_df.grade)
cstat_by_num = []
for n in sorted(clean_predictors_2):
    print("n = {}:\n".format(n))
    predictors_list = clean_predictors_2[n]
    five_folds = create_cv_folds(train_df, predictors_list)
    auc_by_d=[]
    # Increase max_depth from 9 and stop as soon as the cross-validated AUC
    # improves by less than 0.001 over the previous depth; best_d is the last
    # depth that still cleared that threshold (or 9 if none did).
    for d in range(9,36):
        # max_features="sqrt" is what "auto" meant for classifiers; "auto" is
        # deprecated and removed in newer scikit-learn releases.
        rf = RandomForestClassifier(n_estimators=200, criterion="entropy",
                                    max_depth=d,
                                    random_state=0, n_jobs=20, max_features="sqrt",
                                    class_weight = grade_class_weight)
        auc = cross_validation_RF(rf, five_folds)
        auc_by_d.append(auc)
        print("max_depth = {0}: {1}".format(d, auc))
        if d > 9 and auc - auc_by_d[-2] < 0.001:
            break
        best_d = d
    # Refit on the full training data with the selected depth and evaluate on
    # the held-out test data.
    rf = RandomForestClassifier(n_estimators=200, criterion="entropy",
                                max_depth=best_d,
                                random_state=0, n_jobs=-1, max_features="sqrt",
                                class_weight = grade_class_weight)
    rf.fit(train_df.loc[:,predictors_list], train_df.grade)
    test_proba = rf.predict_proba(test_df.loc[:,predictors_list])[:,1]
    cstat_by_num.append(round(roc_auc_score(test_df.grade, test_proba),4))
    print("\nC-statistic = {}".format(cstat_by_num[-1]))
    print("\n\n\n")

n = 5:

max_depth = 9: 0.7082
max_depth = 10: 0.7088

C-statistic = 0.7052




n = 15:

max_depth = 9: 0.7445
max_depth = 10: 0.7467
max_depth = 11: 0.7483
max_depth = 12: 0.7498
max_depth = 13: 0.7507

C-statistic = 0.7621




n = 25:

max_depth = 9: 0.7461
max_depth = 10: 0.7483
max_depth = 11: 0.7503
max_depth = 12: 0.752
max_depth = 13: 0.7533
max_depth = 14: 0.7544
max_depth = 15: 0.7552

C-statistic = 0.7706




n = 35:

max_depth = 9: 0.745
max_depth = 10: 0.7474
max_depth = 11: 0.7499
max_depth = 12: 0.7517
max_depth = 13: 0.7535
max_depth = 14: 0.7549
max_depth = 15: 0.7562
max_depth = 16: 0.7572
max_depth = 17: 0.7581

C-statistic = 0.7759




n = 45:

max_depth = 9: 0.7447
max_depth = 10: 0.7475
max_depth = 11: 0.7499
max_depth = 12: 0.7519
max_depth = 13: 0.7539
max_depth = 14: 0.7557
max_depth = 15: 0.757
max_depth = 16: 0.7584
max_depth = 17: 0.7598
max_depth = 18: 0.7607

C-statistic = 0.7773






In [80]:
# Display the held-out C-statistics, one per predictor-set size in ascending order of size.
cstat_by_num

[0.7052, 0.7621, 0.7706, 0.7759, 0.7773]

#### LMS predictor comparisons by in-person vs. online, for the five courses

In [21]:
# Load the five-course LMS Stata file; missing values are replaced with 0.
lms_5courses = pd.read_stata("~\\Box Sync\\Clickstream\\data\\full\\LMS_data_five_courses.dta").fillna(0)

In [22]:
# Attach the online indicator to the five-course LMS data.
# The indicator is read from both the "first" and the "full" course-level files.
online_ind_df1 = pd.read_csv("~\\Box Sync\\Clickstream\\data\\first\\course_specific_predictors_new.csv")
online_ind_df2 = pd.read_csv("~\\Box Sync\\Clickstream\\data\\full\\course_specific_predictors_new.csv")
# The two files do not have aligned columns, so pass sort=False explicitly to
# avoid pandas' FutureWarning about the changing default. Column order does
# not matter here because the needed columns are selected by name right after.
online_ind_df = pd.concat([online_ind_df1, online_ind_df2], sort=False)
online_ind_df = online_ind_df.loc[:,['vccsid', 'strm', 'college', 'course', 'section', 'online_ind']]
# Default (inner) merge on the student/term/college/course/section keys.
lms_5courses = lms_5courses.merge(online_ind_df, on=['vccsid', 'strm', 'college', 'course', 'section'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [23]:
# Keep only the rows belonging to the five courses compared in this section.
five_courses = ['ENG_111','ENG_112', 'BIO_101', 'MTH_154', 'MTH_161']
new_df = df[df.course.isin(five_courses)]
new_df.shape

(120941, 56)

In [24]:
# Keep only the identifying keys from the modeling data and pull in the
# five-course LMS columns (including online_ind) through an inner merge.
course_keys = ['vccsid', 'strm', 'course', 'section']
new_df = new_df.loc[:, course_keys].merge(lms_5courses, on=course_keys, how='inner')
new_df.shape

(120941, 18)

In [27]:
# Summarize every numeric column (mean, std, median) separately for each
# course and delivery mode, producing one row per predictor/course/mode.
sub_list = []
for course_name in ['ENG_111','ENG_112', 'BIO_101', 'MTH_154', 'MTH_161']:
    course_rows = new_df[new_df.course == course_name]
    for online_flag, mode_label in [(0, "in-person"), (1, "online")]:
        described = course_rows[course_rows.online_ind == online_flag].describe()
        stats = described.drop(['strm', 'online_ind'], axis=1).loc[['mean', 'std', '50%'], :]
        # Transpose so that each predictor becomes a row.
        summary = stats.T.reset_index().rename(columns={'index':'predictor', '50%': 'median'})
        summary.loc[:,'course'] = course_name
        summary.loc[:,'mode'] = mode_label
        sub_list.append(summary)
stacked = pd.concat(sub_list)
stacked = stacked.loc[:,['predictor', 'course', 'mode', 'mean', 'median', 'std']]
sub_merged = stacked.sort_values(['predictor', 'course', 'mode'])

In [40]:
# Restrict the summary to the four predictors reported in the comparison table.
selected_predictors = ['tot_click_cnt_qrt1', 'tot_time_qrt1',
                       'avg_word_tot_qtr1', 'has_assign_sub_cnt_qtr1']
sub_merged_new = sub_merged[sub_merged.predictor.isin(selected_predictors)]

In [48]:
# For each statistic, build a wide table with one row per course and mode and
# one column per selected predictor.
all_p = []
for stat_name in ['mean', 'median', 'std']:
    per_mode = []
    for mode_label in ["in-person", "online"]:
        mode_rows = sub_merged_new[sub_merged_new.loc[:,'mode'] == mode_label]
        wide = mode_rows.pivot(index='course', columns='predictor', values=stat_name).reset_index()
        wide.loc[:,"mode"] = mode_label
        per_mode.append(wide)
    stat_table = pd.concat(per_mode).sort_values(['course', 'mode'])
    stat_table = stat_table.loc[:,['course','mode','tot_click_cnt_qrt1','tot_time_qrt1','avg_word_tot_qtr1','has_assign_sub_cnt_qtr1']]
    all_p.append(stat_table)

In [53]:
# Join the mean, median and std tables on course and mode.
merged_stats = all_p[0].merge(all_p[1], on=['course','mode']).merge(all_p[2], on=['course', 'mode'])
# Reorder the columns by position: course, mode, then for each of the four
# predictors its columns from the first, second and third table side by side.
column_order = [0,1,2,6,10,3,7,11,4,8,12,5,9,13]
merged_stats.iloc[:,column_order].round(4).to_csv(results_dir + "lms_predictors_comparison_2.csv", index=False)

In [73]:
# Save the full per-predictor summary (every predictor, course and mode), rounded to 4 decimals.
sub_merged.round(4).to_csv(results_dir + "lms_predictor_comparison.csv", index=False)