This script applies the models which only include the early-term lms predictors constructed using the non-first-term observations to the 5 selected courses at VCCS, and generates the key evaluation metrics for each course-specific validation sample (including both first-term and non-first-term ones).

In [1]:
import pickle
import pandas as pd
import numpy as np
from collections import Counter
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve, roc_auc_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import KFold, StratifiedKFold
from scipy.stats.mstats import gmean
import seaborn as sns
import matplotlib.pyplot as plt

results_dir = "~\\Box Sync\\Clickstream\\evaluation_results\\full\\updated\\"
results_dir_2 = "~\\Box Sync\\Clickstream\\evaluation_results\\first\\updated\\"

In [2]:
sn_dict = {"Blue Ridge": "BRCC",
           "Central Virginia": "CVCC",
           "Dabney S. Lancaster": "DSLCC",
           "Danville": "DCC",
           "Eastern Shore": "ESCC",
           "Germanna": "GCC",
           'J. Sargeant Reynolds': "JSRCC",
           'John Tyler': "JTCC",
           "Lord Fairfax": "LFCC",
           "Mountain Empire": "MECC",
           "New River": "NRCC",
           "Northern Virginia": "NVCC",
           "Patrick Henry": "PHCC",
           "Paul D. Camp": "PDCCC",
           "Piedmont Virginia": "PVCC",
           "Rappahannock": "RCC",
           "Southside Virginia": "SSVCC",
           "Southwest Virginia": "SWVCC",
           "Thomas Nelson": "TNCC",
           "Tidewater": "TCC",
           "Virginia Highlands": "VHCC",
           "Virginia Western": "VWCC",
           "Wytheville": "WCC"}

In [3]:
df = pd.read_csv("~\\Box Sync\\Clickstream\\data\\full\\updated\\LMS_data_final_full_new.csv")
predictors = [e for e in list(df.columns)[5:] if e != "grade"]
len(predictors)

50

In [4]:
assert pd.isnull(df).any().any() == False

In [5]:
df.shape

(969025, 56)

In [6]:
train_df = df[df.strm != 2212]
test_df = df[df.strm == 2212]
original_test_grade = np.array(test_df.grade)
train_df.loc[:,'grade'] = train_df.apply(lambda x: 1 if x.loc['grade'] in {'A','B','C'} else 0, axis=1)
test_df.loc[:,'grade'] = test_df.apply(lambda x: 1 if x.loc['grade'] in {'A','B','C'} else 0, axis=1)
print(train_df.shape,test_df.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


(698361, 56) (270664, 56)


In [7]:
predictor_category = pd.read_csv("~\\Box Sync\\Clickstream\\evaluation_results\\first\\updated\\predictor_category_table.csv")
predictor_df = pd.DataFrame({'predictor': predictors}).merge(predictor_category, how='inner', on=['predictor'])
predictor_df = predictor_df[predictor_df.predictor_subcategory.apply(lambda x: x.startswith("Early-term"))]
predictors = list(predictor_df.predictor)
len(predictors)

21

In [8]:
def calc_cw(y):
    # Calculate the weight of each letter grade to be used in the modeling fitting procedure: the weight is inversely proportional to the square root of the frequency of the letter grade in the training sample
    cw = Counter(y)
    class_weight = {k:np.sqrt(cw.most_common()[0][-1]/v, dtype=np.float32) for k,v in cw.items()}
    return class_weight # The output is a dictionary mapping letter grade to the corresponding weight

In [9]:
rf = RandomForestClassifier(n_estimators=140, criterion="entropy",
                            max_depth=14,
                            random_state=0, n_jobs=-1, max_features=4,
                            class_weight = calc_cw(train_df.grade))
rf.fit(train_df.loc[:,predictors], train_df.grade)

RandomForestClassifier(bootstrap=True, class_weight={0: 1.9044098, 1: 1.0},
            criterion='entropy', max_depth=14, max_features=4,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=140, n_jobs=-1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [10]:
for c in ['ENG_111', 'ENG_112', 'BIO_101', 'MTH_154', 'MTH_161']:
    test_df_sub = test_df[test_df.course == c]
    original = list(original_test_grade[test_df.course == c])
    y_test_pred = list(rf.predict_proba(test_df_sub.loc[:,predictors])[:,1])
    y_test_real = list(test_df_sub.grade)
    y_df = pd.DataFrame({'pred_y': y_test_pred, 'real_y': y_test_real, 'grade': original,
                         'first_ind': [0]*len(y_test_pred)})
    y_df.to_csv(results_dir + "{}_lms_only_subcategory1.csv".format(c), index=False)

In [11]:
y_test_pred_rf = rf.predict_proba(test_df.loc[:,predictors])[:,1]
best_threshold = np.sort(y_test_pred_rf)[int(len(y_test_pred_rf) * (1-np.mean(train_df.grade)))-1]
print(best_threshold)

0.5981450657528227


In [12]:
best_threshold_2 = 0.6122792382152739

In [13]:
def create_confusion_matrix(y_real, y_pred, y_pred_2, original_test_grade, best_threshold, best_threshold_2, fname):
    cm_arr = confusion_matrix(y_real, np.concatenate([np.where(y_pred > best_threshold, 1, 0), np.where(y_pred_2 > best_threshold_2, 1, 0)]))
    cm_df = pd.DataFrame(cm_arr, columns=['Pred_DFW','Pred_ABC'], index=['Actual_DFW', 'Actual_ABC'])
    cm_df.loc[:,''] = cm_df.sum(axis=1)
    cm_df.loc['',:] = cm_df.sum(axis=0)
    p1 = cm_df.iloc[1,1]/cm_df.iloc[2,1]
    r1 = cm_df.iloc[1,1]/cm_df.iloc[1,2]
    p0 = cm_df.iloc[0,0]/cm_df.iloc[2,0]
    r0 = cm_df.iloc[0,0]/cm_df.iloc[0,2] 
    y_pred_bin = np.concatenate([np.where(y_pred > best_threshold, 1, 0), np.where(y_pred_2 > best_threshold_2, 1, 0)])
    cm_dict = {}
    cm_dict['Pred_DFW'] = Counter(original_test_grade[np.where(y_pred_bin==0)[0]])
    cm_dict['Pred_ABC'] = Counter(original_test_grade[np.where(y_pred_bin==1)[0]])
    new_cm = pd.DataFrame.from_dict(cm_dict, orient='index').T.loc[['W','F','D','C','B','A'],['Pred_DFW','Pred_ABC']]
    new_cm.index = ["Actual_"+e for e in new_cm.index]
    new_cm.loc[:,''] = new_cm.sum(axis=1)
    new_cm.loc['',:] = new_cm.sum(axis=0)
    new_cm.to_csv(results_dir + fname + "_6x2.csv")
    return round(p1,4),round(r1,4),round(p0,4),round(r0,4),round(2*p1*r1/(p1+r1),4),round(2*p0*r0/(p0+r0),4)

In [14]:
for c in ['ENG_111', 'ENG_112', 'BIO_101', 'MTH_154', 'MTH_161']:
    t1 = pd.read_csv(results_dir + "{}_lms_only_subcategory1.csv".format(c))
    t2 = pd.read_csv(results_dir_2 + "{}_lms_only.csv".format(c))
    t = pd.concat([t1, t2])
    print(c)
    print("AUC = {}".format(round(roc_auc_score(t.real_y, t.pred_y),4)))
    pr_rf = create_confusion_matrix(np.array(t.real_y), np.array(t1.pred_y), np.array(t2.pred_y), t.grade, best_threshold, best_threshold_2, "lms_only_subcategory1_applied_to_{}_cm".format(c))
    print(pr_rf)

ENG_111
AUC = 0.8052
(0.7391, 0.8879, 0.7688, 0.5432, 0.8067, 0.6366)
ENG_112
AUC = 0.7962
(0.8454, 0.8749, 0.5953, 0.5351, 0.8599, 0.5636)
BIO_101
AUC = 0.7594
(0.7988, 0.8974, 0.6311, 0.4371, 0.8453, 0.5165)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]



MTH_154
AUC = 0.7162
(0.6971, 0.871, 0.6676, 0.4064, 0.7744, 0.5053)
MTH_161
AUC = 0.689
(0.6566, 0.882, 0.691, 0.3638, 0.7528, 0.4767)


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]
