This script runs the random forest model using the full set of predictors, for all first-term courses. It also creates the data set that contains all predicted scores for all first-term observations.

In [1]:
import os
import pickle
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from scipy.stats.mstats import gmean
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import precision_recall_curve, roc_auc_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import KFold, StratifiedKFold

# Directory used by later cells to write result tables (CSV) and to read the
# predictor category table.
results_dir = "~\\Box Sync\\Clickstream\\evaluation_results\\first\\updated\\"

In [2]:
# Maps each college's full name (as stored in the `college` column) to a short
# abbreviation; the abbreviations are used to name the college indicator columns.
sn_dict = {"Blue Ridge": "BRCC",
           "Central Virginia": "CVCC",
           "Dabney S. Lancaster": "DSLCC",
           "Danville": "DCC",
           "Eastern Shore": "ESCC",
           "Germanna": "GCC",
           'J. Sargeant Reynolds': "JSRCC",
           'John Tyler': "JTCC",
           "Lord Fairfax": "LFCC",
           "Mountain Empire": "MECC",
           "New River": "NRCC",
           "Northern Virginia": "NVCC",
           "Patrick Henry": "PHCC",
           "Paul D. Camp": "PDCCC",
           "Piedmont Virginia": "PVCC",
           "Rappahannock": "RCC",
           "Southside Virginia": "SSVCC",
           "Southwest Virginia": "SWVCC",
           "Thomas Nelson": "TNCC",
           "Tidewater": "TCC",
           "Virginia Highlands": "VHCC",
           "Virginia Western": "VWCC",
           "Wytheville": "WCC"}

In [3]:
# Load the source tables: base records (df0), course-specific predictors (df1)
# and term-specific predictors (df2).
df0 = pd.read_stata("~\\Box Sync\\Clickstream\\data\\first\\updated\\LMS_data_final.dta")
df1 = pd.read_csv("~\\Box Sync\\Clickstream\\data\\first\\updated\\course_specific_predictors_new.csv")
df2 = pd.read_csv("~\\Box Sync\\Clickstream\\data\\first\\updated\\term_specific_predictors_new.csv")
# One 0/1 indicator column per distinct cip value, skipping 0
# (presumably the omitted reference category -- TODO confirm).
for v in [int(e) for e in np.unique(df2.cip) if e != 0]:
    df2.loc[:,'cip_'+str(v)] = (df2.cip == v).astype(int)
# Same for degree_level, skipping 4 (presumably the reference category -- TODO confirm).
for v in [int(e) for e in np.unique(df2.degree_level) if e != 4]:
    df2.loc[:,'degree_level_'+str(v)] = (df2.degree_level == v).astype(int)
# The raw categorical columns are replaced by their indicator columns.
df2 = df2.drop(['cip', 'degree_level'], axis=1)
df4 = pd.read_stata("~\\Box Sync\\Clickstream\\data\\first\\updated\\instructor_related_predictors.dta")
# df5 holds the key columns plus one 0/1 indicator per college abbreviation.
df5 = df0.loc[:,['vccsid','strm','college','course','section']].copy()
# Raises KeyError if a college name is missing from sn_dict.
df5.loc[:,'college_new'] = df5.college.apply(lambda x: sn_dict[x])
# "BRCC" gets no indicator column (presumably the reference college -- TODO confirm).
for sn in [e for e in sn_dict.values() if e != "BRCC"]:
    df5.loc[:,'college_'+sn] = (df5.college_new == sn).astype(int)
df5 = df5.drop(['college_new'], axis=1)
# Inner-join everything; df2 is joined on student and term only, the other
# tables on the full five-column key.
df = df0.merge(df1, how='inner', on=['vccsid','strm','college','course','section'])\
.merge(df2, how='inner', on=['vccsid','strm'])\
.merge(df4, how='inner', on=['vccsid','strm','college','course','section'])\
.merge(df5, how='inner', on=['vccsid','strm','college','course','section'])
# Predictors are every column after the first five, except "grade".
# NOTE(review): assumes the first five columns of df are the key columns -- confirm
# against the column order of LMS_data_final.dta.
predictors = [e for e in list(df.columns)[5:] if e != "grade"]
len(predictors)

80

In [4]:
# The merged frame must not contain any missing value in any column.
assert not df.isnull().any().any()

In [5]:
df.shape

(204853, 86)

In [6]:
# Term 2212 is held out as the test set; every other term is used for training.
# .copy() makes both frames independent of df, so the column assignments below
# do not raise SettingWithCopyWarning and can never write through to df.
train_df = df[df.strm != 2212].copy()
test_df = df[df.strm == 2212].copy()
# Keep the original letter grades before the column is overwritten with the binary
# label; both arrays are needed later when the prediction data sets are built.
original_train_grade = np.array(train_df.grade)
original_test_grade = np.array(test_df.grade)
# Binary outcome: 1 if the grade is A, B or C, otherwise 0 (vectorised instead of a row-wise apply).
train_df['grade'] = train_df.grade.isin(['A','B','C']).astype(int)
test_df['grade'] = test_df.grade.isin(['A','B','C']).astype(int)
print(train_df.shape,test_df.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


(181673, 86) (23180, 86)


#### Fine-tune and fit RF models

In [7]:
def create_cv_folds(train, n_fold = 5):
    """Split `train` into stratified cross-validation folds.

    Stratification is on `train.grade`; the split is shuffled with a fixed
    random state (12345). The feature columns are taken from the module-level
    `predictors` list.

    Returns a list with one entry per fold, each of the form
    [(X_fit, y_fit), (X_holdout, y_holdout)], where every element is a copy.
    """
    splitter = StratifiedKFold(n_splits = n_fold, random_state = 12345, shuffle=True)
    folds = []
    for fit_idx, holdout_idx in splitter.split(train, train.grade):
        fit_part = train.iloc[fit_idx,:]
        holdout_part = train.iloc[holdout_idx,:]
        fit_pair = (fit_part.loc[:,predictors].copy(), fit_part.grade.copy())
        holdout_pair = (holdout_part.loc[:,predictors].copy(), holdout_part.grade.copy())
        folds.append([fit_pair, holdout_pair])
    return folds

In [8]:
# Build the cross-validation folds once (default n_fold=5) so every grid search reuses the same splits.
five_folds = create_cv_folds(train_df)

In [9]:
def cross_validation_RF(rf_model, folds):
    """Return the mean hold-out AUC of `rf_model` over `folds`, rounded to 4 decimals.

    `folds` is a sequence of [(X_fit, y_fit), (X_holdout, y_holdout)] entries.
    The model is re-fitted in place on each fold's fitting part and scored on
    the hold-out part using the predicted probability of class 1.
    """
    fold_aucs = []
    for (X_fit, y_fit), (X_holdout, y_holdout) in folds:
        rf_model.fit(X_fit, y_fit)
        holdout_scores = rf_model.predict_proba(X_holdout)[:,1]
        fold_aucs.append(roc_auc_score(y_holdout, holdout_scores))
    return round(np.mean(fold_aucs),4)

In [7]:
def calc_cw(y):
    """Compute class weights inversely proportional to the square root of class frequency.

    The most frequent label gets weight 1.0; every other label gets
    sqrt(count_of_most_frequent_label / count_of_label), stored as np.float32.

    Parameters
    ----------
    y : iterable of hashable labels (e.g. the 0/1 outcome column).

    Returns
    -------
    dict mapping each distinct label to its weight; an empty dict when `y` is empty.
    """
    label_counts = Counter(y)
    if not label_counts:
        # No observations, hence no classes to weight.
        return {}
    # The largest count is the same for every label, so compute it once.
    max_count = max(label_counts.values())
    return {label: np.sqrt(max_count/count, dtype=np.float32) for label, count in label_counts.items()}

In [11]:
### Using grid search to find the optimal maximum tree depth
# The candidate grid is defined once so the loop and the plot always use the same x values.
depth_grid = range(11,31)
# Class weights depend only on the training labels, so compute them once instead of per candidate.
cw_train = calc_cw(train_df.grade)
auc_by_d=[]
for d in depth_grid:
    # max_features="sqrt" is what "auto" meant for classifiers; "auto" is
    # deprecated/removed in recent scikit-learn releases.
    rf = RandomForestClassifier(n_estimators=200, criterion="entropy",
                                max_depth=d,
                                random_state=0, n_jobs=20, max_features="sqrt",
                                class_weight = cw_train)
    auc = cross_validation_RF(rf, five_folds)
    auc_by_d.append(auc)
    print("Max_depth =", d)
    print("Mean CV AUC:", auc)
    print("")
plt.plot(depth_grid,auc_by_d)
plt.xlabel("Maximum Depth")
plt.ylabel("AUC")
plt.show()

Max_depth = 11
Mean CV AUC: 0.8104

Max_depth = 12
Mean CV AUC: 0.8136

Max_depth = 13
Mean CV AUC: 0.8165

Max_depth = 14
Mean CV AUC: 0.8188

Max_depth = 15
Mean CV AUC: 0.821

Max_depth = 16
Mean CV AUC: 0.8228

Max_depth = 17
Mean CV AUC: 0.8239

Max_depth = 18
Mean CV AUC: 0.8248

Max_depth = 19
Mean CV AUC: 0.826



KeyboardInterrupt: 

In [12]:
### Using grid search to find the optimal number of estimators (trees)
# The candidate grid is defined once so the loop and the plot always use the same x values.
n_tree_grid = range(100,320,20)
# Class weights depend only on the training labels, so compute them once instead of per candidate.
cw_train = calc_cw(train_df.grade)
auc_by_n = []
for n in n_tree_grid:
    # max_features="sqrt" is what "auto" meant for classifiers; "auto" is
    # deprecated/removed in recent scikit-learn releases.
    rf = RandomForestClassifier(n_estimators=n, criterion="entropy",
                                max_depth=17,
                                random_state=0, n_jobs=-1, max_features="sqrt",
                                class_weight = cw_train)
    auc = cross_validation_RF(rf, five_folds)
    auc_by_n.append(auc)
    print("Number of Trees =", n)
    print("Mean CV AUC:", auc)
    print("")
plt.plot(n_tree_grid, auc_by_n)
plt.xlabel("Number of Trees")
plt.ylabel("AUC")
plt.show()

Number of Trees = 100
Mean CV AUC: 0.8228

Number of Trees = 120
Mean CV AUC: 0.8231

Number of Trees = 140
Mean CV AUC: 0.8232

Number of Trees = 160
Mean CV AUC: 0.8235



KeyboardInterrupt: 

In [13]:
### Using grid search to find the optimal maximum number of features considered per split
auc_by_nf = []
# Upper end of the grid: twice the square root of the number of predictors, rounded down.
max_nf = int(np.floor(2*np.sqrt(len(predictors))))
# The candidate grid is defined once so the loop and the plot always use the same x values.
nf_grid = range(2,max_nf+1)
# Class weights depend only on the training labels, so compute them once instead of per candidate.
cw_train = calc_cw(train_df.grade)
for nf in nf_grid:
    rf = RandomForestClassifier(n_estimators=120, criterion="entropy",
                                max_depth=17,
                                random_state=0, n_jobs=-1, max_features=nf,
                                class_weight = cw_train)
    auc = cross_validation_RF(rf, five_folds)
    auc_by_nf.append(auc)
    print("Max_features =", nf)
    print("Mean CV AUC:", auc)
    print("")
plt.plot(nf_grid, auc_by_nf)
plt.xlabel("Maximum Number of Features")
plt.ylabel("AUC")
plt.show()

Max_features = 2
Mean CV AUC: 0.8076

Max_features = 3
Mean CV AUC: 0.8145

Max_features = 4
Mean CV AUC: 0.8177

Max_features = 5
Mean CV AUC: 0.82

Max_features = 6
Mean CV AUC: 0.8215

Max_features = 7
Mean CV AUC: 0.8224

Max_features = 8
Mean CV AUC: 0.8231

Max_features = 9
Mean CV AUC: 0.8232

Max_features = 10
Mean CV AUC: 0.8235



KeyboardInterrupt: 

In [8]:
# Final model, fitted on the whole training set with fixed hyperparameters
# (120 trees, depth 17, 8 features per split) -- presumably the values picked
# from the cross-validation searches; confirm before changing them.
rf = RandomForestClassifier(n_estimators=120, criterion="entropy",
                            max_depth=17,
                            random_state=0, n_jobs=-1, max_features=8,
                            class_weight = calc_cw(train_df.grade))
rf.fit(train_df.loc[:,predictors], train_df.grade)

RandomForestClassifier(bootstrap=True, class_weight={0: 1.6198435, 1: 1.0},
            criterion='entropy', max_depth=17, max_features=8,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=120, n_jobs=-1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [9]:
# Score each split exactly once (predicted probability of class 1) and reuse the
# test scores for the AUC instead of running predict_proba on the test set twice.
y_test_pred_rf = rf.predict_proba(test_df.loc[:,predictors])[:,1]
y_train_pred_rf = rf.predict_proba(train_df.loc[:,predictors])[:,1]
print("Random Forest:")
print("AUC = {}".format(round(roc_auc_score(test_df.grade, y_test_pred_rf),4)))

Random Forest:
AUC = 0.8245


#### Create the data set for storing the predicted scores of admin+lms, admin_only, lms_only RF models

In [None]:
# Share of 0-labels in the training data; each threshold is the score found at
# that quantile of the sorted predictions, so about that share is predicted 0.
zero_share_train = 1-np.mean(train_df.grade)
test_cut_pos = int(len(y_test_pred_rf) * zero_share_train)-1
best_threshold = np.sort(y_test_pred_rf)[test_cut_pos]
y_test_pred_rf_bin = np.where(y_test_pred_rf > best_threshold, 1, 0)
train_cut_pos = int(len(y_train_pred_rf) * zero_share_train)-1
threshold_train = np.sort(y_train_pred_rf)[train_cut_pos]
y_train_pred_rf_bin = np.where(y_train_pred_rf > threshold_train, 1, 0)

In [None]:
# Test-set output table: keys, predicted score, predicted binary outcome,
# original letter grade, then all predictors.
test_key_cols = ['vccsid', 'strm', 'college', 'course', 'section']
test_result_cols = ['predicted_score_full_predictor', 'predicted_binary_outcome_full_predictor', 'actual_grade']
info_df_test = test_df.loc[:, test_key_cols + predictors].assign(
    actual_grade=original_test_grade,
    predicted_binary_outcome_full_predictor=y_test_pred_rf_bin,
    predicted_score_full_predictor=y_test_pred_rf,
)
# Fix the final column order explicitly.
info_df_test = info_df_test.loc[:, test_key_cols + test_result_cols + predictors]

In [None]:
# Load the LMS table without its grade column, so the merges that use lms_1 do
# not bring in a second grade column.
lms_1 = pd.read_stata("~\\Box Sync\\Clickstream\\data\\first\\updated\\LMS_data_updated.dta").drop(['grade'], axis=1)

In [None]:
# Left-join the LMS columns onto the test table. The shapes printed before and
# after let the reader check that the row count did not change (it would grow
# if lms_1 had duplicate keys).
print(info_df_test.shape)
info_df_test = info_df_test.merge(lms_1, on=['vccsid', 'strm', 'college', 'course', 'section'], how='left')
print(info_df_test.shape)

In [None]:
# Tidy the merge suffixes: "<name>_x" becomes "<name>" and "<name>_y" becomes "<name>_raw".
test_suffix_renames = {}
for cn in info_df_test.columns.values:
    if cn.endswith("_x"):
        test_suffix_renames[cn] = cn[:-2]
    if cn.endswith("_y"):
        test_suffix_renames[cn] = cn[:-1] + "raw"
info_df_test.rename(columns = test_suffix_renames, inplace=True)

In [None]:
# Training-set output table: keys, predicted score, predicted binary outcome,
# original letter grade, then all predictors.
info_df_train = train_df.loc[:,['vccsid', 'strm', 'college', 'course', 'section'] + predictors]
# train_df.grade has been overwritten with the 0/1 label, so the letter grades are
# read back from df (which still holds them) for exactly the training rows.
info_df_train.loc[:,'actual_grade'] = np.array(df.loc[train_df.index, 'grade'])
info_df_train.loc[:,'predicted_binary_outcome_full_predictor'] = y_train_pred_rf_bin
info_df_train.loc[:,'predicted_score_full_predictor'] = y_train_pred_rf
info_df_train = info_df_train.loc[:, ['vccsid', 'strm', 'college', 'course', 'section', 'predicted_score_full_predictor', 'predicted_binary_outcome_full_predictor', 'actual_grade'] + predictors]

In [None]:
# Left-join the LMS columns onto the training table. The shapes printed before
# and after let the reader check that the row count did not change (it would
# grow if lms_1 had duplicate keys).
print(info_df_train.shape)
info_df_train = info_df_train.merge(lms_1, on=['vccsid', 'strm', 'college', 'course', 'section'], how='left')
print(info_df_train.shape)

In [None]:
# Tidy the merge suffixes: "<name>_x" becomes "<name>" and "<name>_y" becomes "<name>_raw".
train_suffix_renames = {}
for cn in info_df_train.columns.values:
    if cn.endswith("_x"):
        train_suffix_renames[cn] = cn[:-2]
    if cn.endswith("_y"):
        train_suffix_renames[cn] = cn[:-1] + "raw"
info_df_train.rename(columns = train_suffix_renames, inplace=True)

In [None]:
# Flag which split each row came from (0 = training rows, 1 = held-out test rows),
# then stack the two tables. The concatenated frame keeps each part's own index.
info_df_train.loc[:,'is_validation'] = 0
info_df_test.loc[:,'is_validation'] = 1
info_df = pd.concat([info_df_train, info_df_test])

In [None]:
# Attach the outputs stored in the admin-only and LMS-only CSV files. Both joins
# use merge's default (inner), so the printed shapes show whether rows were lost.
print(info_df.shape)
info_merge_keys = ['vccsid', 'strm', 'college', 'course', 'section']
admin_only_df = pd.read_csv("~\\Box Sync\\Clickstream\\data\\first\\updated\\first_info_df_admin_only.csv")
lms_only_df = pd.read_csv("~\\Box Sync\\Clickstream\\data\\first\\updated\\first_info_df_lms_only.csv")
info_df = info_df.merge(admin_only_df, on=info_merge_keys)
info_df = info_df.merge(lms_only_df, on=info_merge_keys)
print(info_df.shape)

In [None]:
# Column order of the final data set: keys, split flag and all predicted
# scores/outcomes first, then every remaining column in its existing order.
var_list_1 = [
    'vccsid', 'strm', 'college', 'course', 'section', 'is_validation',
    'predicted_score_full_predictor', 'predicted_binary_outcome_full_predictor',
    'predicted_score_admin_only', 'predicted_binary_outcome_admin_only',
    'predicted_score_lms_only', 'predicted_binary_outcome_lms_only',
]
leading_names = set(var_list_1)
var_list_2 = [col for col in info_df.columns.values if col not in leading_names]
var_list = var_list_1 + var_list_2
info_df = info_df.loc[:,var_list]

In [None]:
# Persist the combined train + test data set, without the index column.
info_df.to_csv("~\\Box Sync\\Clickstream\\data\\first\\updated\\first_term_dataset.csv", index=False)

#### Compare online vs. in-person courses

In [13]:
online_ind_df = pd.read_stata("~\\Box Sync\\Clickstream\\data\\first\\updated\\updated_online_ind.dta")
# Merge once and read both indicators from the same result (the original ran the
# identical merge twice).
test_with_mode = test_df.merge(online_ind_df, how='inner', on=['vccsid','strm','college','course','section'])
# The indicators are used afterwards as positional masks over test_df and the test
# predictions, so the inner merge must keep every test row exactly once.
assert len(test_with_mode) == len(test_df), "online/in-person indicators do not line up with test_df"
# The "_y" suffix selects the online_ind column coming from online_ind_df; the
# suffix implies test_df carries a column of the same name.
online_ind = test_with_mode.online_ind_y
inperson_ind = test_with_mode.inperson_ind

In [14]:
print("Random Forest:")
# Test-set AUC restricted to rows whose online indicator equals 1.
online_mask = np.array(online_ind == 1)
# Label corrected from the misspelled "Onine".
print("Online AUC = {}".format(round(roc_auc_score(test_df[online_mask].grade, np.array(y_test_pred_rf)[online_mask]),4)))

Random Forest:
Onine AUC = 0.8235


In [15]:
print("Random Forest:")
# Test-set AUC restricted to rows whose in-person indicator equals 1.
inperson_mask = np.array(inperson_ind == 1)
inperson_auc = roc_auc_score(test_df[inperson_mask].grade, np.array(y_test_pred_rf)[inperson_mask])
print("In-person AUC = {}".format(round(inperson_auc,4)))

Random Forest:
In-person AUC = 0.8085


In [12]:
# Feature-importance table, most important predictor first.
fi_df = pd.DataFrame({"predictor": predictors, "feature_importance": rf.feature_importances_},
                     columns=['predictor', 'feature_importance'])
fi_df = fi_df.sort_values(['feature_importance'], ascending=False)
# Normalised rank: position in the sorted table divided by the number of predictors.
n_ranked = fi_df.shape[0]
fi_df.loc[:,'feature_ranking'] = np.arange(1, n_ranked + 1) / n_ranked

In [13]:
# Save predictor names with their importance scores (the normalised ranking is not exported).
fi_df.loc[:,['predictor', 'feature_importance']].to_csv(results_dir + "feature_importance.csv", index=False)

In [1]:
# Per-course row count and AUC for five selected courses, computed from pred_df.
# NOTE(review): pred_df (with columns course, real_y, pred_y) is not defined
# anywhere in the visible part of this notebook, so this cell fails on a fresh
# kernel -- confirm where pred_df comes from or remove the cell.
for c in ['ENG_111', 'ENG_112', 'MTH_154', 'MTH_161', 'BIO_101']:
    pred_new = pred_df[pred_df.course == c]
    print(c, pred_new.shape[0])
    print("AUC = {}".format(round(roc_auc_score(pred_new.real_y, pred_new.pred_y), 4)))
    print("")


ENG_111 8979
AUC = 0.8535

ENG_112 11471
AUC = 0.8652

MTH_154 5738
AUC = 0.8189

MTH_161 4080
AUC = 0.7994

BIO_101 8881
AUC = 0.8552



#### Performance over all observations (non-first-term + first-term)

In [14]:
# Built-in open() does not expand "~" (unlike the pandas readers used elsewhere in
# this notebook), so the home directory is expanded explicitly.
full_results_dir = os.path.expanduser("~\\Box Sync\\Clickstream\\evaluation_results\\full\\")
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this project.
# The with-blocks close the file handles, which the original left open.
with open(full_results_dir + "y_test_pred_full.p", "rb") as pred_file:
    y_test_pred_full = pickle.load(pred_file)
with open(full_results_dir + "y_test_full.p", "rb") as label_file:
    y_test_full = pickle.load(label_file)

In [15]:
# AUC over this notebook's test rows plus the labels/scores loaded from the
# "full" pickles. The stored test scores are reused instead of running
# predict_proba again, and list() is applied to the loaded objects so the
# concatenation also works if they are arrays rather than lists.
round(roc_auc_score(list(test_df.grade) + list(y_test_full), list(y_test_pred_rf) + list(y_test_pred_full)),4)

0.8684

In [20]:
# Test-set AUC for each of five selected courses.
for course_code in ['ENG_111', 'ENG_112', 'MTH_154', 'MTH_161', 'BIO_101']:
    course_rows = test_df[test_df.course == course_code]
    course_scores = rf.predict_proba(course_rows.loc[:,predictors])[:,1]
    print(course_code)
    print("AUC = {}".format(round(roc_auc_score(course_rows.grade, course_scores),4)))
    print("")

ENG_111
AUC = 0.812

ENG_112
AUC = 0.8279

MTH_154
AUC = 0.7841

MTH_161
AUC = 0.7844

BIO_101
AUC = 0.8461



In [21]:
print("ABC vs. DF")
# Positions of the test rows whose original letter grade is not "W".
non_w_positions = np.where(np.array(original_test_grade) != "W")[0]
abc_df_auc = roc_auc_score(np.array(test_df.grade)[non_w_positions],
                           rf.predict_proba(test_df.loc[:,predictors])[non_w_positions,1])
print("AUC = {}".format(round(abc_df_auc,4)))

ABC vs. DF
AUC = 0.7955


#### Feature importance

In [24]:
# Feature-importance table sorted from most to least important predictor.
# NOTE(review): this repeats an earlier cell that builds the same fi_df.
fi_df = pd.DataFrame({"feature_importance": rf.feature_importances_, "predictor": predictors})\
.loc[:,['predictor', 'feature_importance']].sort_values(['feature_importance'], ascending=False)
# Normalised rank: position in the sorted table divided by the number of predictors.
fi_df.loc[:,'feature_ranking'] = np.arange(1, fi_df.shape[0] + 1) / fi_df.shape[0]

In [25]:
# Attach category information from the predictor category table; the left join
# keeps every predictor, including ones the table does not list.
cw_df = pd.read_csv(results_dir + "predictor_category_table.csv")
fi_df = fi_df.merge(cw_df, on=['predictor'], how='left')

In [26]:
# Predictors with no entry in the category table get the default labels.
fi_df.loc[:,'predictor_category'] = fi_df.predictor_category.fillna("Admin")
fi_df.loc[:,'predictor_subcategory'] = fi_df.predictor_subcategory.fillna("Non-course-specific academic records")

In [27]:
# Group predictors by name pattern. Both "_qtr1" and "_qrt1" spellings occur in
# the column names, so both are matched.
early_lms = [name for name in predictors
             if name.endswith(("_qtr1", "_qrt1")) and name != "has_concurrent_qtr1"]
concurrent_lms = [name for name in predictors
                  if name.endswith(("_qtr1c", "_qrt1c")) or name == 'has_concurrent_qtr1']
all_lms = early_lms + concurrent_lms
# Split the LMS predictors into assignment, discussion and remaining ones.
assign_lms = [name for name in all_lms if "assign" in name]
disc_lms = [name for name in all_lms if any(token in name for token in ("disc", "word", "post"))]
assign_or_disc = set(assign_lms) | set(disc_lms)
click_lms = [name for name in all_lms if name not in assign_or_disc]
# Everything that is not an LMS predictor.
lms_names = set(all_lms)
all_admin = [name for name in predictors if name not in lms_names]

In [28]:
# Keep only the part of the subcategory label before " & ".
fi_df.loc[:,'predictor_subcategory'] = fi_df.predictor_subcategory.apply(lambda x: x.split(" & ")[0])
# Top 30 rows of the sorted table, re-ranked 1..30, rounded to 3 decimals and
# given the column names and order used in the exported file.
fi_df_top30 = (
    fi_df.iloc[:30,:]
    .drop(['feature_ranking'], axis=1)
    .assign(feature_ranking=np.arange(1,31))
    .round(3)
    .rename(columns = {'feature_importance': 'feature_importance_score',
                       'feature_ranking': 'ranking',
                       'predictor_category': 'predictor_type'})
    .loc[:,['predictor', 'predictor_type', 'predictor_subcategory', 'ranking', 'feature_importance_score']]
)
fi_df_top30.to_csv(results_dir + "top30_predictors.csv", index=False)
fi_df_top30

Unnamed: 0,predictor,predictor_type,predictor_subcategory,ranking,feature_importance_score
0,tot_click_cnt_qrt1,LMS,Early-term,1,0.109
1,crnt_enrl_intensity,Admin,Non-course-specific academic records,2,0.079
2,tot_time_qrt1,LMS,Early-term,3,0.073
3,disc_reply_cnt_qtr1,LMS,Early-term,4,0.049
4,avg_g,Admin,Course-specific,5,0.048
5,disc_post_cnt_qtr1,LMS,Early-term,6,0.047
6,assign_sub_cnt_qtr1,LMS,Early-term,7,0.044
7,past_avg_grade,Admin,Instructor-related,8,0.04
8,avg_depth_post_qtr1,LMS,Early-term,9,0.037
9,tot_click_cnt_qrt1c,LMS,Early-term concurrent,10,0.035


In [29]:
# fi_results = []
# for c in ['early_lms', 'concurrent_lms', 'click_lms', 'assign_lms', 'disc_lms', 'all_lms', 'all_admin']:
#     l = fi_df.merge(pd.DataFrame({'predictor': eval(c)}), how='inner', on=['predictor']).feature_ranking
#     fi_results.append((c, len(l), l.iloc[0], l.mean()))
# fi_results

In [30]:
# category_dict = {'early_lms': "Early-term LMS",
#                  'concurrent_lms': 'Early-term concurrent LMS',
#                  'historical_early_lms': 'Historical early-term LMS',
#                  'historical_full_lms': 'Historical full-term LMS',
#                  'click_lms': 'LMS -- clicks/session/time',
#                  'assign_lms': 'LMS -- assignments',
#                  'disc_lms': 'LMS -- discussion forums',
#                  'all_lms': 'All LMS',
#                  'all_admin': 'All admin'}
# fi_df = pd.DataFrame(fi_results, columns=['predictor_category', 'number_of_predictors', 'highest_normalized_ranking', 'average_normalized_ranking']).round(3)
# fi_df.loc[:,'predictor_category'] = fi_df.predictor_category.apply(lambda x: category_dict[x])
# fi_df.to_csv(results_dir + "normalized_feature_ranking.csv", index=False)
# fi_df

In [31]:
# Category-level summary (fi_part1) and subcategory-level summary (fi_part2) of the
# normalised feature rankings.
# An explicit copy replaces the positional slice fi_df.iloc[:,:-1]: that slice may
# be a view, in which case the relabelling below would also change fi_df, and it
# silently depended on which column happened to be last. The extra column kept by
# the copy is not used by the aggregation.
fi_part1 = fi_df.copy()
fi_part1.loc[:,'predictor_category'] = fi_part1.predictor_category.apply(lambda x: "All " + x)
fi_part2 = fi_df.copy()
fi_part2.loc[:,'predictor_subcategory'] = fi_part2.predictor_subcategory.apply(lambda x: x.split(" & ")[0])
fi_part2.loc[:,'predictor_subcategory'] = fi_part2.predictor_category + " -- " + fi_part2.predictor_subcategory
# fi_part3 = fi_df.copy()
# fi_part3.loc[:,'predictor_subcategory'] = fi_part3.predictor_subcategory.apply(lambda x: x.split(" & ")[-1])
# fi_part3 = fi_part3[fi_part3.predictor_category == "LMS"]
# fi_part3.loc[:,'predictor_subcategory'] = fi_part3.predictor_category + " -- " + fi_part3.predictor_subcategory
# 'first' gives each group's best ranking only because fi_df is ordered from most
# to least important predictor.
fi_part2 = fi_part2.groupby(['predictor_subcategory']).agg({'feature_ranking': ['count','first','mean']}).reset_index()
fi_part2.columns = ['predictor_subcategory', 'number_of_predictors', 'highest_normalized_ranking', 'average_normalized_ranking']
# fi_part3 = fi_part3.groupby(['predictor_subcategory']).agg({'feature_ranking': ['count','first','mean']}).reset_index()
# fi_part3.columns = ['predictor_subcategory', 'number_of_predictors', 'highest_normalized_ranking', 'average_normalized_ranking']
# fi_part3
fi_part1 = fi_part1.groupby(['predictor_category']).agg({'feature_ranking': ['count','first','mean']}).reset_index()
# The category summary reuses the subcategory column names so both parts can be stacked.
fi_part1.columns = ['predictor_subcategory', 'number_of_predictors', 'highest_normalized_ranking', 'average_normalized_ranking']

In [33]:
# Stack the subcategory-level rows on top of the category-level rows and export
# the table rounded to 3 decimals.
fi_all_parts = pd.concat([fi_part2, fi_part1])
fi_all_parts.round(3).to_csv(results_dir + "normalized_feature_ranking.csv", index=False)

In [34]:
fi_all_parts

Unnamed: 0,predictor_subcategory,number_of_predictors,highest_normalized_ranking,average_normalized_ranking
0,Admin -- Course-specific,30,0.0625,0.654583
1,Admin -- Demographic,1,0.2375,0.2375
2,Admin -- Instructor-related,4,0.1,0.3625
3,Admin -- Non-course-specific academic records,24,0.025,0.595313
4,LMS -- Early-term,12,0.0125,0.164583
5,LMS -- Early-term concurrent,9,0.125,0.323611
0,All Admin,59,0.025,0.603602
1,All LMS,21,0.0125,0.232738


#### Confusion matrices

In [None]:
# Classification threshold: the test-set score found at the quantile equal to the
# share of 0-labels in the training data, so about that share of test rows scores
# at or below it.
best_threshold = np.sort(y_test_pred_rf)[int(len(y_test_pred_rf) * (1-np.mean(train_df.grade)))-1]

In [None]:
def create_confusion_matrix(y_test_pred, threshold, fname):
    """Build, print and save confusion matrices for thresholded predictions.

    Reads the module-level `y_test` (binary labels), `original_test_grade`
    (letter grades) and `results_dir`.

    Parameters
    ----------
    y_test_pred : array of predicted scores for the test observations.
    threshold : scores strictly above this value are classified as 1 (A/B/C).
    fname : file name stem; writes `<fname>.csv` (2x2 matrix with totals) and
        `<fname>_6x2.csv` (letter grade by predicted class, with totals) to
        `results_dir`.

    Returns
    -------
    tuple of (precision A/B/C, recall A/B/C, precision D/F/W, recall D/F/W,
    F1 A/B/C, F1 D/F/W), each rounded to 4 decimals.
    """
    # Binarise once with the threshold that was passed in. The original built the
    # 6x2 table from the global best_threshold, so the two tables could disagree
    # whenever a different threshold was supplied.
    y_test_pred_bin = np.where(y_test_pred > threshold, 1, 0)
    # labels=[0, 1] keeps the matrix 2x2 even if one class is never predicted or observed.
    cm_arr = confusion_matrix(y_test, y_test_pred_bin, labels=[0, 1])
    cm_df = pd.DataFrame(cm_arr, columns=['Pred_DFW','Pred_ABC'], index=['Actual_DFW', 'Actual_ABC'])
    # Append row totals (last column) and column totals (last row).
    cm_df.loc[:,''] = cm_df.sum(axis=1)
    cm_df.loc['',:] = cm_df.sum(axis=0)
    print(cm_df)
    print("")
    # Precision divides by the column total, recall by the row total.
    p1 = cm_df.iloc[1,1]/cm_df.iloc[2,1]
    r1 = cm_df.iloc[1,1]/cm_df.iloc[1,2]
    p0 = cm_df.iloc[0,0]/cm_df.iloc[2,0]
    r0 = cm_df.iloc[0,0]/cm_df.iloc[0,2]
    f1_abc = round(2*p1*r1/(p1+r1),4)
    f1_dfw = round(2*p0*r0/(p0+r0),4)
    print("F1 score for A/B/C = {}".format(f1_abc))
    print("F1 score for D/F/W = {}".format(f1_dfw))
    cm_df.to_csv(results_dir + fname + ".csv")
    # 6x2 table: count of each original letter grade within each predicted class.
    cm_dict = {}
    cm_dict['Pred_DFW'] = Counter(original_test_grade[np.where(y_test_pred_bin==0)[0]])
    cm_dict['Pred_ABC'] = Counter(original_test_grade[np.where(y_test_pred_bin==1)[0]])
    new_cm = pd.DataFrame.from_dict(cm_dict, orient='index').T.loc[['W','F','D','C','B','A'],['Pred_DFW','Pred_ABC']]
    new_cm.index = ["Actual_"+e for e in new_cm.index]
    new_cm.loc[:,''] = new_cm.sum(axis=1)
    new_cm.loc['',:] = new_cm.sum(axis=0)
    new_cm.to_csv(results_dir + fname + "_6x2.csv")
    return round(p1,4),round(r1,4),round(p0,4),round(r0,4),f1_abc,f1_dfw

In [2]:
# y_test is read as a module-level name inside create_confusion_matrix, so it must
# be defined before the call.
y_test = np.array(test_df.grade)
print("F1 threshold = {}:\n".format(str(round(best_threshold,4))))
# pr_rf holds the six metrics returned by create_confusion_matrix; the matrices
# are written to results_dir under the "RF_1_full_cm" stem.
pr_rf = create_confusion_matrix(y_test_pred_rf, best_threshold, "RF_1_full_cm")


F1 threshold = 0.5942:

            Pred_DFW  Pred_ABC         
Actual_DFW    4131.0    2614.0   6745.0
Actual_ABC    2265.0   14170.0  16435.0
              6396.0   16784.0  23180.0

F1 score for A/B/C = 0.8531
F1 score for D/F/W = 0.6287



In [3]:
# Show the metrics returned by create_confusion_matrix; they are stored in pr_rf
# (the name pr_df is not defined in this notebook).
pr_rf

(0.8443, 0.8622, 0.6459, 0.6125, 0.8531, 0.6287)
