This script fits the random forest model for the course MTH 154, using the full set of predictors.

In [1]:
import pickle
import pandas as pd
import numpy as np
from collections import Counter
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve, roc_auc_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import KFold, StratifiedKFold
from scipy.stats.mstats import gmean
import seaborn as sns
import matplotlib.pyplot as plt

# Directory that result CSVs are written to and the predictor crosswalk is read from.
results_dir = "~\\Box Sync\\Clickstream\\evaluation_results\\full\\"

In [2]:
# Maps the college names found in the `college` column to the short codes
# that are used to name the college dummy columns (`college_<code>`).
sn_dict = {"Blue Ridge": "BRCC",
           "Central Virginia": "CVCC",
           "Dabney S. Lancaster": "DSLCC",
           "Danville": "DCC",
           "Eastern Shore": "ESCC",
           "Germanna": "GCC",
           'J. Sargeant Reynolds': "JSRCC",
           'John Tyler': "JTCC",
           "Lord Fairfax": "LFCC",
           "Mountain Empire": "MECC",
           "New River": "NRCC",
           "Northern Virginia": "NVCC",
           "Patrick Henry": "PHCC",
           "Paul D. Camp": "PDCCC",
           "Piedmont Virginia": "PVCC",
           "Rappahannock": "RCC",
           "Southside Virginia": "SSVCC",
           "Southwest Virginia": "SWVCC",
           "Thomas Nelson": "TNCC",
           "Tidewater": "TCC",
           "Virginia Highlands": "VHCC",
           "Virginia Western": "VWCC",
           "Wytheville": "WCC"}

In [3]:
# Load the predictor tables stored under data\full and inner-join them on the
# student / term / course-section keys.
df0 = pd.read_csv("~\\Box Sync\\Clickstream\\data\\full\\LMS_data_final_full_new.csv")
df1 = pd.read_csv("~\\Box Sync\\Clickstream\\data\\full\\course_specific_predictors_new.csv")
df2 = pd.read_csv("~\\Box Sync\\Clickstream\\data\\full\\term_specific_predictors_new.csv")
df3 = pd.read_csv("~\\Box Sync\\Clickstream\\data\\full\\cluster_specific_predictors.csv")
# Keep the join keys plus only those cluster columns whose name ends in "MTH" or "MTH_grade".
df3 = df3.loc[:,['vccsid','strm','college','course','section'] + [e for e in df3.columns.values if e.endswith("MTH") or e.endswith("MTH_grade")]]
df4 = pd.read_stata("~\\Box Sync\\Clickstream\\data\\full\\instructor_related_predictors.dta")
df = df0.merge(df1, how='inner', on=['vccsid','strm','college','course','section'])\
.merge(df2, how='inner', on=['vccsid','strm'])\
.merge(df3, how='inner', on=['vccsid','strm','college','course','section'])\
.merge(df4, how='inner', on=['vccsid','strm','college','course','section'])
# Restrict to MTH 154. The explicit copy makes the result an independent frame,
# so adding `first_ind` below never writes to a slice of the merged table.
df = df[df.course == "MTH_154"].copy()
# Rows loaded from data\full are flagged with first_ind = 0.
df.loc[:,'first_ind'] = 0

In [4]:
# Load the predictor tables stored under data\first and inner-join them on the
# same keys as the full sample (there is no cluster-specific table here).
df0 = pd.read_stata("~\\Box Sync\\Clickstream\\data\\first\\LMS_data_final.dta")
df1 = pd.read_csv("~\\Box Sync\\Clickstream\\data\\first\\course_specific_predictors_new.csv")
df2 = pd.read_csv("~\\Box Sync\\Clickstream\\data\\first\\term_specific_predictors_new.csv")
df4 = pd.read_stata("~\\Box Sync\\Clickstream\\data\\first\\instructor_related_predictors.dta")
# NOTE(review): df5 holds only the key columns of df0, so the final merge adds no
# columns; it is a no-op when the keys are unique in df0 -- confirm and drop if so.
df5 = df0.loc[:,['vccsid','strm','college','course','section']].copy()
df_first = df0.merge(df1, how='inner', on=['vccsid','strm','college','course','section'])\
.merge(df2, how='inner', on=['vccsid','strm'])\
.merge(df4, how='inner', on=['vccsid','strm','college','course','section'])\
.merge(df5, how='inner', on=['vccsid','strm','college','course','section'])
# Restrict to MTH 154. The explicit copy makes the result an independent frame,
# so adding `first_ind` below never writes to a slice of the merged table.
df_first = df_first[df_first.course == "MTH_154"].copy()
# Rows loaded from data\first are flagged with first_ind = 1.
df_first.loc[:,'first_ind'] = 1

In [5]:
# Stack the two samples; columns that exist in only one of them are filled with 0.
# sort=True pins the alphabetical column order that pandas applied implicitly when
# this was first run, so the column order (and therefore the order of `predictors`)
# does not change with the pandas version.
df = pd.concat([df, df_first], axis=0, join='outer', sort=True).fillna(0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [6]:
# Number of rows contributed by each source sample (first_ind = 0 vs. 1).
Counter(df.first_ind)

Counter({0: 18892, 1: 6283})

In [7]:
# One-hot encode the categorical predictors. One level of each variable is left
# out as the reference category: cip == 0, degree_level == 4 and college "BRCC".
cip_levels = [int(code) for code in np.unique(df.cip) if code != 0]
for v in cip_levels:
    df.loc[:, 'cip_' + str(v)] = (df.cip == v).astype(int)

degree_levels = [int(code) for code in np.unique(df.degree_level) if code != 4]
for v in degree_levels:
    df.loc[:, 'degree_level_' + str(v)] = (df.degree_level == v).astype(int)

df = df.drop(['cip', 'degree_level'], axis=1)

# Translate college names to short codes, build the dummies, then discard the helper column.
df.loc[:, 'college_new'] = df.college.apply(lambda name: sn_dict[name])
college_codes = [code for code in sn_dict.values() if code != "BRCC"]
for sn in college_codes:
    df.loc[:, 'college_' + sn] = (df.college_new == sn).astype(int)
df = df.drop(['college_new'], axis=1)

In [8]:
# Summarise the numeric columns and list those whose mean is exactly 0.
test = df.describe().T
test[test['mean'] == 0]

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
has_prereq_grade,25175.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lvl2_ind,25175.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
prereq_grade,25175.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Remove the three columns that are zero for every row, then treat every remaining
# column except the outcome (`grade`) and the identifier keys as a predictor.
df = df.drop(['lvl2_ind', 'has_prereq_grade', 'prereq_grade'], axis=1)
predictors = [e for e in list(df.columns) if e not in {"grade",'vccsid','strm','college','course','section'}]
len(predictors)

148

In [10]:
# The model cannot handle missing values: fail fast if any cell is still null.
assert not pd.isnull(df).any().any()

In [11]:
# Rows x columns of the modelling table (predictors + outcome + identifier keys).
df.shape

(25175, 154)

In [12]:
# Term 2212 is held out as the test set; all other terms form the training set.
# Explicit copies make both frames independent of `df`, so recoding the outcome
# below never writes to a slice (the source of SettingWithCopyWarning).
train_df = df[df.strm != 2212].copy()
test_df = df[df.strm == 2212].copy()
# Keep the letter grades of the test rows for the 6x2 confusion matrices.
original_test_grade = np.array(test_df.grade)
# Binary outcome: 1 for A/B/C, 0 for everything else.
# Plain column assignment replaces the column, so it ends up with an integer
# dtype regardless of how the installed pandas handles `.loc[:, col] = ...`.
passing_grades = ['A', 'B', 'C']
train_df['grade'] = train_df.grade.isin(passing_grades).astype(int)
test_df['grade'] = test_df.grade.isin(passing_grades).astype(int)
print(train_df.shape,test_df.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


(19437, 154) (5738, 154)


In [13]:
# Attach the delivery-mode indicator columns to the test set.
online_ind_df_1 = pd.read_stata("~\\Box Sync\\Clickstream\\data\\full\\updated_online_ind.dta")
online_ind_df_2 = pd.read_stata("~\\Box Sync\\Clickstream\\data\\first\\updated_online_ind.dta")
online_ind_df = pd.concat([online_ind_df_1, online_ind_df_2])
n_test_rows_before_merge = test_df.shape[0]
test_df = test_df.merge(online_ind_df, how='inner', on=['vccsid','strm','college','course','section'])
# `original_test_grade` was captured before this merge and is later indexed by
# test_df row position, so the merge must neither drop nor duplicate rows.
assert test_df.shape[0] == n_test_rows_before_merge, \
    "merging the online indicator changed the number of test rows"

In [14]:
# Distribution of the `inperson_ind` flag in the test set.
# NOTE(review): later cells filter on `online_ind` instead -- confirm which
# indicator columns updated_online_ind.dta actually provides.
Counter(test_df.inperson_ind)

Counter({0.0: 5572, 1.0: 166})

In [15]:
# Number of test rows that have both inperson_ind == 1 and first_ind == 1.
test_df[np.array(test_df.inperson_ind == 1) & np.array(test_df.first_ind == 1)].shape[0]

19

In [13]:
def create_cv_folds(train, n_fold = 5):
    """Split `train` into stratified cross-validation folds.

    The split is stratified on `train.grade`, shuffled with a fixed seed
    (12345), and restricted to the columns listed in the module-level
    `predictors`.

    Returns a list with one entry per fold, each of the form
    [(X_fit, y_fit), (X_holdout, y_holdout)], where every element is an
    independent copy of the corresponding rows of `train`.
    """
    splitter = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=12345)

    def _features_and_target(row_positions):
        # Positional row selection, then the predictor columns and the outcome.
        subset = train.iloc[row_positions, :]
        return (subset.loc[:, predictors].copy(), subset.grade.copy())

    return [
        [_features_and_target(fit_rows), _features_and_target(holdout_rows)]
        for fit_rows, holdout_rows in splitter.split(train, train.grade)
    ]

In [14]:
# Build the folds once so every hyperparameter setting is scored on the same splits.
five_folds = create_cv_folds(train_df)

In [15]:
def cross_validation_RF(rf_model, folds):
    """Return the mean cross-validated AUC of `rf_model`, rounded to 4 decimals.

    `folds` is a sequence of [(X_fit, y_fit), (X_holdout, y_holdout)] entries.
    For each entry the model is refitted on the first pair and scored on the
    second using the predicted probability of class 1.
    """
    fold_scores = []
    for (X_fit, y_fit), (X_holdout, y_holdout) in folds:
        rf_model.fit(X_fit, y_fit)
        holdout_prob = rf_model.predict_proba(X_holdout)[:, 1]
        fold_scores.append(roc_auc_score(y_holdout, holdout_prob))
    return round(np.mean(fold_scores), 4)

In [16]:
def calc_cw(y):
    """Derive per-class weights for model fitting from the label frequencies in `y`.

    Each class is weighted by sqrt(count of the most frequent class / its own
    count), stored as float32. The most frequent class therefore gets weight 1
    and rarer classes get larger weights.

    Returns a dict mapping each class label to its weight (empty if `y` is empty).
    """
    counts = Counter(y)
    largest = max(counts.values(), default=0)
    return {label: np.sqrt(largest / count, dtype=np.float32)
            for label, count in counts.items()}

In [17]:
### Using grid search to find the optimal maximum tree depth
# Each depth is scored by the mean AUC over the pre-built folds in `five_folds`.
auc_by_d=[]
for d in range(2,26):
    # max_features="sqrt" is the explicit spelling of what "auto" meant for
    # classifiers; "auto" is deprecated and rejected by newer scikit-learn releases.
    rf = RandomForestClassifier(n_estimators=200, criterion="entropy",
                                max_depth=d,
                                random_state=0, n_jobs=20, max_features="sqrt",
                                class_weight = calc_cw(train_df.grade))
    auc = cross_validation_RF(rf, five_folds)
    auc_by_d.append(auc)
    print("Max_depth =", d)
    print("Mean CV AUC:", auc)
    print("")
plt.plot(range(2,26),auc_by_d)
plt.xlabel("Maximum Depth")
plt.ylabel("AUC")
plt.show()

Max_depth = 2
Mean CV AUC: 0.7379

Max_depth = 3
Mean CV AUC: 0.7479

Max_depth = 4
Mean CV AUC: 0.7568

Max_depth = 5
Mean CV AUC: 0.7654

Max_depth = 6
Mean CV AUC: 0.7713

Max_depth = 7
Mean CV AUC: 0.7778

Max_depth = 8
Mean CV AUC: 0.7816

Max_depth = 9
Mean CV AUC: 0.7862

Max_depth = 10
Mean CV AUC: 0.7892

Max_depth = 11
Mean CV AUC: 0.7931

Max_depth = 12
Mean CV AUC: 0.7953

Max_depth = 13
Mean CV AUC: 0.7962

Max_depth = 14
Mean CV AUC: 0.7976

Max_depth = 15
Mean CV AUC: 0.7985

Max_depth = 16
Mean CV AUC: 0.7993

Max_depth = 17
Mean CV AUC: 0.8011

Max_depth = 18
Mean CV AUC: 0.8029

Max_depth = 19
Mean CV AUC: 0.8026



KeyboardInterrupt: 

In [18]:
### Using grid search to find the optimal number of estimators (trees)
# Each forest size is scored by the mean AUC over the pre-built folds in `five_folds`.
auc_by_n = []
for n in range(100,320,20):
    # max_features="sqrt" is the explicit spelling of what "auto" meant for
    # classifiers; "auto" is deprecated and rejected by newer scikit-learn releases.
    rf = RandomForestClassifier(n_estimators=n, criterion="entropy",
                                max_depth=12,
                                random_state=0, n_jobs=-1, max_features="sqrt",
                                class_weight = calc_cw(train_df.grade))
    auc = cross_validation_RF(rf, five_folds)
    auc_by_n.append(auc)
    print("Number of Trees =", n)
    print("Mean CV AUC:", auc)
    print("")
plt.plot(range(100,320,20), auc_by_n)
plt.xlabel("Number of Trees")
plt.ylabel("AUC")
plt.show()

Number of Trees = 100
Mean CV AUC: 0.7942

Number of Trees = 120
Mean CV AUC: 0.7946

Number of Trees = 140
Mean CV AUC: 0.7944

Number of Trees = 160
Mean CV AUC: 0.7948

Number of Trees = 180
Mean CV AUC: 0.7953



KeyboardInterrupt: 

In [19]:
### Using grid search to find the optimal maximum number of features considered per split
# Candidates run from 2 up to twice the square root of the number of predictors.
max_nf = int(np.floor(2*np.sqrt(len(predictors))))
nf_grid = range(2, max_nf+1)
auc_by_nf = []
for nf in nf_grid:
    rf = RandomForestClassifier(
        n_estimators=120,
        max_depth=12,
        max_features=nf,
        criterion="entropy",
        class_weight=calc_cw(train_df.grade),
        random_state=0,
        n_jobs=-1,
    )
    auc = cross_validation_RF(rf, five_folds)
    auc_by_nf.append(auc)
    print("Max_features =", nf)
    print("Mean CV AUC:", auc)
    print("")
plt.plot(nf_grid, auc_by_nf)
plt.xlabel("Maximum Number of Features")
plt.ylabel("AUC")
plt.show()

Max_features = 2
Mean CV AUC: 0.7614

Max_features = 3
Mean CV AUC: 0.7705

Max_features = 4
Mean CV AUC: 0.7757

Max_features = 5
Mean CV AUC: 0.7775

Max_features = 6
Mean CV AUC: 0.7828

Max_features = 7
Mean CV AUC: 0.784

Max_features = 8
Mean CV AUC: 0.7877

Max_features = 9
Mean CV AUC: 0.7897

Max_features = 10
Mean CV AUC: 0.7911

Max_features = 11
Mean CV AUC: 0.7917

Max_features = 12
Mean CV AUC: 0.7946

Max_features = 13
Mean CV AUC: 0.7946

Max_features = 14
Mean CV AUC: 0.7949

Max_features = 15
Mean CV AUC: 0.7967

Max_features = 16
Mean CV AUC: 0.7969

Max_features = 17
Mean CV AUC: 0.7975



KeyboardInterrupt: 

In [17]:
# Fit the final forest on the complete training set with the selected hyperparameters.
rf = RandomForestClassifier(
    n_estimators=120,
    max_depth=12,
    max_features=12,
    criterion="entropy",
    class_weight=calc_cw(train_df.grade),
    random_state=0,
    n_jobs=-1,
)
rf.fit(train_df[predictors], train_df.grade)

RandomForestClassifier(bootstrap=True, class_weight={0: 1.3534144, 1: 1.0},
            criterion='entropy', max_depth=12, max_features=12,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=120, n_jobs=-1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [15]:
# Score the whole test set once and reuse the probabilities for the AUC.
y_test_pred_rf = rf.predict_proba(test_df.loc[:, predictors])[:, 1]
auc_all = round(roc_auc_score(test_df.grade, y_test_pred_rf), 4)
print("Random Forest:")
print("AUC = {}".format(auc_all))

Random Forest:
AUC = 0.8143


In [16]:
# AUC on the test rows with first_ind == 0; the subset is scored once and reused.
nonfirst_test_df = test_df[test_df.first_ind == 0]
y_test_pred_rf_nonfirst = rf.predict_proba(nonfirst_test_df.loc[:, predictors])[:, 1]
auc_nonfirst = round(roc_auc_score(nonfirst_test_df.grade, y_test_pred_rf_nonfirst), 4)
print("Random Forest:")
print("AUC = {}".format(auc_nonfirst))

Random Forest:
AUC = 0.8218


In [17]:
# AUC on the test rows with first_ind == 1; the subset is scored once and reused.
first_test_df = test_df[test_df.first_ind == 1]
y_test_pred_rf_first = rf.predict_proba(first_test_df.loc[:, predictors])[:, 1]
auc_first = round(roc_auc_score(first_test_df.grade, y_test_pred_rf_first), 4)
print("Random Forest:")
print("AUC = {}".format(auc_first))

Random Forest:
AUC = 0.7709


In [18]:
# AUC on the test rows with online_ind == 1.
# The predictions are stored under their own name: the original assignment reused
# `y_test_pred_rf_first`, overwriting the first_ind == 1 predictions that the
# confusion-matrix cell for "RF_MTH154_first_cm" depends on.
online_test_df = test_df[test_df.online_ind == 1]
y_test_pred_rf_online = rf.predict_proba(online_test_df.loc[:,predictors])[:,1]
print("Random Forest:")
print("Online AUC = {}".format(round(roc_auc_score(online_test_df.grade, y_test_pred_rf_online),4)))

Random Forest:
Online AUC = 0.8224


In [19]:
# AUC on the test rows with online_ind == 0.
# The predictions are stored under their own name: the original assignment reused
# `y_test_pred_rf_first`, overwriting the first_ind == 1 predictions that the
# confusion-matrix cell for "RF_MTH154_first_cm" depends on.
inperson_test_df = test_df[test_df.online_ind == 0]
y_test_pred_rf_inperson = rf.predict_proba(inperson_test_df.loc[:,predictors])[:,1]
print("Random Forest:")
print("In-person AUC = {}".format(round(roc_auc_score(inperson_test_df.grade, y_test_pred_rf_inperson),4)))

Random Forest:
In-person AUC = 0.8135


In [18]:
# print("ABC vs. DF")
# print("AUC = {}".format(round(roc_auc_score(np.array(test_df.grade)[np.where(np.array(original_test_grade) != "W")[0]], 
#                                             rf.predict_proba(test_df.loc[:,predictors])[np.where(np.array(original_test_grade) != "W")[0],1]),4)))

In [19]:
# def find_optimal_threshold(p,r,t):
#     to_drop = np.union1d(np.where(pd.isnull(p[:-1]) == True)[0], np.where(pd.isnull(r[:-1]) == True)[0])
#     to_drop = np.union1d(to_drop, np.where(pd.isnull(t) == True)[0])
#     to_keep = np.setdiff1d(np.array(list(range(len(p)-1))), to_drop)
#     p,r,t = p[to_keep],r[to_keep],t[to_keep]
#     to_keep_2 = np.where(t < 0.8)[0]
#     p,r,t = p[to_keep_2],r[to_keep_2],t[to_keep_2]
#     f1 = 2*p*r/(p+r)
#     best_t = t[np.argmax(f1)]
#     best_t
#     return best_t

# def cross_validation(train, model):
#     threshold_list = []
#     auc_list = []
#     k_fold =  StratifiedKFold(n_splits = 10, random_state = 54321, shuffle=True)
#     for train_indices, test_indices in k_fold.split(train, train.grade):
#         train_part = train.iloc[train_indices,:]
#         test_part = train.iloc[test_indices,:]
#         X_1 = train_part.loc[:,predictors]
#         y_1 = train_part.grade
#         X_2 = test_part.loc[:,predictors]
#         y_2 = test_part.grade
#         model.fit(X_1,y_1)
#         p,r,t = precision_recall_curve(1-np.array(y_2), model.predict_proba(X_2)[:,0])
#         threshold_list.append(1-find_optimal_threshold(p,r,t))
#         auc = roc_auc_score(y_2, model.predict_proba(X_2)[:,1])
#         auc_list.append(auc)
#     print(threshold_list)
#     print(np.mean(auc_list), np.std(auc_list, ddof=1))
#     return gmean(threshold_list)

In [20]:
# best_threshold = cross_validation(train_df,rf)

In [21]:
# best_threshold

In [22]:
# Choose the cutoff as a quantile of the test-set scores: the share of test rows
# scoring at or below it matches the share of 0-labelled rows in the training set.
train_zero_share = 1-np.mean(train_df.grade)
cutoff_position = int(len(y_test_pred_rf) * train_zero_share)-1
best_threshold = np.sort(y_test_pred_rf)[cutoff_position]

In [23]:
def create_confusion_matrix_old(y_test_pred, threshold, fname):
    """Build, print and save the confusion matrices for the whole test set.

    Reads the module-level `y_test` (binary outcome), `original_test_grade`
    (letter grades, same row order) and `results_dir`.

    Parameters
    ----------
    y_test_pred : array-like
        Predicted probability of class 1 (A/B/C) for every test row.
    threshold : float
        Rows whose probability is strictly above this value are predicted A/B/C.
    fname : str
        File stem; writes `<fname>.csv` (2x2 matrix with totals) and
        `<fname>_6x2.csv` (letter grade by predicted class) under `results_dir`.

    Returns
    -------
    tuple
        (precision_ABC, recall_ABC, precision_DFW, recall_DFW, F1_ABC, F1_DFW),
        each rounded to 4 decimals.
    """
    # Binarise once with the caller's threshold and use it for both matrices
    # (the 6x2 matrix previously used the global `best_threshold` instead).
    y_test_pred_bin = np.where(y_test_pred > threshold, 1, 0)
    # labels=[0, 1] keeps the matrix 2x2 even if one class never occurs.
    cm_arr = confusion_matrix(y_test, y_test_pred_bin, labels=[0, 1])
    cm_df = pd.DataFrame(cm_arr, columns=['Pred_DFW','Pred_ABC'], index=['Actual_DFW', 'Actual_ABC'])
    # Append row and column totals under an empty label.
    cm_df.loc[:,''] = cm_df.sum(axis=1)
    cm_df.loc['',:] = cm_df.sum(axis=0)
    print(cm_df)
    print("")
    # Precision = cell / column total, recall = cell / row total.
    p1 = cm_df.iloc[1,1]/cm_df.iloc[2,1]
    r1 = cm_df.iloc[1,1]/cm_df.iloc[1,2]
    p0 = cm_df.iloc[0,0]/cm_df.iloc[2,0]
    r0 = cm_df.iloc[0,0]/cm_df.iloc[0,2]
    f1_abc = round(2*p1*r1/(p1+r1),4)
    f1_dfw = round(2*p0*r0/(p0+r0),4)
    print("F1 score for A/B/C = {}".format(f1_abc))
    print("F1 score for D/F/W = {}".format(f1_dfw))
    cm_df.to_csv(results_dir + fname + ".csv")
    # 6x2 matrix: original letter grade by predicted class.
    cm_dict = {}
    cm_dict['Pred_DFW'] = Counter(original_test_grade[np.where(y_test_pred_bin==0)[0]])
    cm_dict['Pred_ABC'] = Counter(original_test_grade[np.where(y_test_pred_bin==1)[0]])
    # reindex (unlike .loc with a label list) does not fail when a grade is absent.
    new_cm = pd.DataFrame.from_dict(cm_dict, orient='index').T\
        .reindex(index=['W','F','D','C','B','A'], columns=['Pred_DFW','Pred_ABC'])
    new_cm.index = ["Actual_"+e for e in new_cm.index]
    new_cm.loc[:,''] = new_cm.sum(axis=1)
    new_cm.loc['',:] = new_cm.sum(axis=0)
    new_cm.to_csv(results_dir + fname + "_6x2.csv")
    return round(p1,4),round(r1,4),round(p0,4),round(r0,4),f1_abc,f1_dfw

In [24]:
# Confusion matrices for the whole test set.
# `y_test` is read as a global by create_confusion_matrix_old.
# NOTE(review): the printed label says "F1 threshold", but `best_threshold` is
# taken from a quantile of the test scores, not from an F1 search.
y_test = np.array(test_df.grade)
print("F1 threshold = {}:\n".format(str(round(best_threshold,4))))
pr_rf = create_confusion_matrix_old(y_test_pred_rf, best_threshold, "RF_MTH154_all_cm")
print(pr_rf)

F1 threshold = 0.5291:

            Pred_DFW  Pred_ABC        
Actual_DFW    1386.0     848.0  2234.0
Actual_ABC     640.0    2864.0  3504.0
              2026.0    3712.0  5738.0

F1 score for A/B/C = 0.7938
F1 score for D/F/W = 0.6507
(0.7716, 0.8174, 0.6841, 0.6204, 0.7938, 0.6507)


In [25]:
def create_confusion_matrix(y_test_pred, threshold, fname, ind = 0):
    """Build, print and save the confusion matrices for one `first_ind` subset.

    Reads the module-level `y_test` (binary outcome), `original_test_grade`
    (letter grades), `test_df` (for the `first_ind` column) and `results_dir`;
    all three data objects must share the same row order.

    Parameters
    ----------
    y_test_pred : array-like
        Predicted probability of class 1 (A/B/C) for the test rows with
        `first_ind == ind`, in test_df row order.
    threshold : float
        Rows whose probability is strictly above this value are predicted A/B/C.
    fname : str
        File stem; writes `<fname>.csv` (2x2 matrix with totals) and
        `<fname>_6x2.csv` (letter grade by predicted class) under `results_dir`.
    ind : int, default 0
        Value of `first_ind` that selects the subset.

    Returns
    -------
    tuple
        (precision_ABC, recall_ABC, precision_DFW, recall_DFW, F1_ABC, F1_DFW),
        each rounded to 4 decimals.
    """
    subset_mask = np.array(test_df.first_ind == ind)
    # Binarise once with the caller's threshold and use it for both matrices
    # (the 6x2 matrix previously used the global `best_threshold` instead).
    y_test_pred_bin = np.where(y_test_pred > threshold, 1, 0)
    # labels=[0, 1] keeps the matrix 2x2 even if one class never occurs.
    cm_arr = confusion_matrix(y_test[subset_mask], y_test_pred_bin, labels=[0, 1])
    cm_df = pd.DataFrame(cm_arr, columns=['Pred_DFW','Pred_ABC'], index=['Actual_DFW', 'Actual_ABC'])
    # Append row and column totals under an empty label.
    cm_df.loc[:,''] = cm_df.sum(axis=1)
    cm_df.loc['',:] = cm_df.sum(axis=0)
    print(cm_df)
    print("")
    # Precision = cell / column total, recall = cell / row total.
    p1 = cm_df.iloc[1,1]/cm_df.iloc[2,1]
    r1 = cm_df.iloc[1,1]/cm_df.iloc[1,2]
    p0 = cm_df.iloc[0,0]/cm_df.iloc[2,0]
    r0 = cm_df.iloc[0,0]/cm_df.iloc[0,2]
    f1_abc = round(2*p1*r1/(p1+r1),4)
    f1_dfw = round(2*p0*r0/(p0+r0),4)
    print("F1 score for A/B/C = {}".format(f1_abc))
    print("F1 score for D/F/W = {}".format(f1_dfw))
    cm_df.to_csv(results_dir + fname + ".csv")
    # 6x2 matrix: original letter grade by predicted class, within the subset.
    subset_grades = original_test_grade[subset_mask]
    cm_dict = {}
    cm_dict['Pred_DFW'] = Counter(subset_grades[np.where(y_test_pred_bin==0)[0]])
    cm_dict['Pred_ABC'] = Counter(subset_grades[np.where(y_test_pred_bin==1)[0]])
    # reindex (unlike .loc with a label list) does not fail when a grade is absent.
    new_cm = pd.DataFrame.from_dict(cm_dict, orient='index').T\
        .reindex(index=['W','F','D','C','B','A'], columns=['Pred_DFW','Pred_ABC'])
    new_cm.index = ["Actual_"+e for e in new_cm.index]
    new_cm.loc[:,''] = new_cm.sum(axis=1)
    new_cm.loc['',:] = new_cm.sum(axis=0)
    new_cm.to_csv(results_dir + fname + "_6x2.csv")
    return round(p1,4),round(r1,4),round(p0,4),round(r0,4),f1_abc,f1_dfw

In [26]:
# Confusion matrices for the test rows with first_ind == 0 (the default `ind`).
# `y_test` is read as a global by create_confusion_matrix.
y_test = np.array(test_df.grade)
print("F1 threshold = {}:\n".format(str(round(best_threshold,4))))
pr_rf = create_confusion_matrix(y_test_pred_rf_nonfirst, best_threshold, "RF_MTH154_full_cm")

F1 threshold = 0.5291:

            Pred_DFW  Pred_ABC        
Actual_DFW    1211.0     740.0  1951.0
Actual_ABC     554.0    2560.0  3114.0
              1765.0    3300.0  5065.0

F1 score for A/B/C = 0.7983
F1 score for D/F/W = 0.6518


In [27]:
# (precision_ABC, recall_ABC, precision_DFW, recall_DFW, F1_ABC, F1_DFW) for first_ind == 0.
pr_rf

(0.7758, 0.8221, 0.6861, 0.6207, 0.7983, 0.6518)

In [28]:
# Confusion matrices for the test rows with first_ind == 1.
# `y_test_pred_rf_first` must hold the predictions for exactly those rows.
print("F1 threshold = {}:\n".format(str(round(best_threshold,4))))
pr_rf = create_confusion_matrix(y_test_pred_rf_first, best_threshold, "RF_MTH154_first_cm", 1)

F1 threshold = 0.5291:

            Pred_DFW  Pred_ABC       
Actual_DFW     175.0     108.0  283.0
Actual_ABC      86.0     304.0  390.0
               261.0     412.0  673.0

F1 score for A/B/C = 0.7581
F1 score for D/F/W = 0.6434


In [29]:
# (precision_ABC, recall_ABC, precision_DFW, recall_DFW, F1_ABC, F1_DFW) for first_ind == 1.
pr_rf

(0.7379, 0.7795, 0.6705, 0.6184, 0.7581, 0.6434)

#### Feature importance

In [15]:
# Rank the predictors by the forest's feature importance (most important first).
fi_df = pd.DataFrame(
    {"feature_importance": rf.feature_importances_, "predictor": predictors},
    columns=['predictor', 'feature_importance'],
)
fi_df = fi_df.sort_values(['feature_importance'], ascending=False)
# Normalised rank in (0, 1]: position in the ordering divided by the number of predictors.
n_ranked = fi_df.shape[0]
fi_df.loc[:, 'feature_ranking'] = np.arange(1, n_ranked + 1) / n_ranked
# Attach category / subcategory labels from the crosswalk (columns 0, 3 and 4 of the file).
cw_df = pd.read_csv(results_dir + "predictor_crosswalk.csv").iloc[:, [0, 3, 4]]
fi_df = fi_df.merge(cw_df, on=['predictor'], how='left')
# Predictors missing from the crosswalk fall back to these default labels.
fi_df.loc[:, 'predictor_category'] = fi_df.predictor_category.fillna("Admin")
fi_df.loc[:, 'predictor_subcategory'] = fi_df.predictor_subcategory.fillna("Non-course-specific academic records")

In [17]:
# Keep only the part of the subcategory label that precedes " & ".
fi_df.loc[:, 'predictor_subcategory'] = fi_df.predictor_subcategory.apply(lambda label: label.split(" & ")[0])
# Top 30 predictors with an integer ranking (1-30), rounded scores and export-friendly column names.
fi_df_top30 = (
    fi_df.iloc[:30, :]
    .drop(['feature_ranking'], axis=1)
    .assign(feature_ranking=np.arange(1, 31))
    .round(3)
    .rename(columns={'feature_importance': 'feature_importance_score',
                     'feature_ranking': 'ranking',
                     'predictor_category': 'predictor_type'})
    .loc[:, ['predictor', 'predictor_type', 'predictor_subcategory', 'ranking', 'feature_importance_score']]
)
fi_df_top30.to_csv(results_dir + "top30_predictors_MTH154.csv", index=False)
fi_df_top30

Unnamed: 0,predictor,predictor_type,predictor_subcategory,ranking,feature_importance_score
0,pct_withdrawn,Admin,Non-course-specific academic records,1,0.071
1,cum_gpa,Admin,Non-course-specific academic records,2,0.059
2,tot_click_cnt_qrt1,LMS,Early-term,3,0.058
3,term_gpa_1,Admin,Non-course-specific academic records,4,0.045
4,crnt_enrl_intensity,Admin,Non-course-specific academic records,5,0.045
5,tot_time_qrt1,LMS,Early-term,6,0.04
6,term_gpa_2,Admin,Non-course-specific academic records,7,0.029
7,overall_prop_comp,Admin,Non-course-specific academic records,8,0.023
8,assign_sub_cnt_qtr1,LMS,Early-term,9,0.023
9,HUM_MTH_grade,Admin,Course-subject-specific,10,0.021


In [18]:
# Category-level summary ("All <category>"). Work on an independent copy of all
# columns but the last, so relabelling the category never writes to a slice of fi_df.
# NOTE(review): assumes the last column of fi_df is predictor_subcategory -- confirm
# against the column order of predictor_crosswalk.csv.
fi_part1 = fi_df.iloc[:,:-1].copy()
fi_part1.loc[:,'predictor_category'] = fi_part1.predictor_category.apply(lambda x: "All " + x)
# Subcategory-level summary, labelled "<category> -- <subcategory>".
fi_part2 = fi_df.copy()
fi_part2.loc[:,'predictor_subcategory'] = fi_part2.predictor_subcategory.apply(lambda x: x.split(" & ")[0])
fi_part2.loc[:,'predictor_subcategory'] = fi_part2.predictor_category + " -- " + fi_part2.predictor_subcategory
# 'min' gives the best (smallest) normalised rank in each group without relying
# on fi_df still being sorted by importance, which 'first' silently required.
fi_part2 = fi_part2.groupby(['predictor_subcategory']).agg({'feature_ranking': ['count','min','mean']}).reset_index()
fi_part2.columns = ['predictor_subcategory', 'number_of_predictors', 'highest_normalized_ranking', 'average_normalized_ranking']
fi_part1 = fi_part1.groupby(['predictor_category']).agg({'feature_ranking': ['count','min','mean']}).reset_index()
fi_part1.columns = ['predictor_subcategory', 'number_of_predictors', 'highest_normalized_ranking', 'average_normalized_ranking']
# Subcategory rows first, then the category-level rows.
fi_all_parts = pd.concat([fi_part2, fi_part1])
fi_all_parts.round(3).to_csv(results_dir + "normalized_feature_ranking_MTH154.csv", index=False)
fi_all_parts

Unnamed: 0,predictor_subcategory,number_of_predictors,highest_normalized_ranking,average_normalized_ranking
0,Admin -- Course-specific,31,0.162162,0.736704
1,Admin -- Course-subject-specific,20,0.067568,0.493581
2,Admin -- Demographic,1,0.155405,0.155405
3,Admin -- Instructor-related,4,0.128378,0.459459
4,Admin -- Non-course-specific academic records,42,0.006757,0.533301
5,LMS -- Early-term,12,0.02027,0.175113
6,LMS -- Early-term concurrent,9,0.074324,0.322072
7,LMS -- Prior early-term,13,0.25,0.451143
8,LMS -- Prior full-term,16,0.222973,0.408361
0,All Admin,98,0.006757,0.582667
