This script computes the feature importance of predictors and perform comparison of top 10 predictors from different model variants.

In [18]:
import numpy as np
import pandas as pd
from scipy import stats
import pickle
from collections import Counter
from sklearn.metrics import precision_recall_curve, roc_auc_score, confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from scipy.stats.mstats import gmean
import math

In [19]:
df_new = pd.read_stata("df_new.dta")
predictors = pickle.load(open("predictors.p", "rb"))

In [20]:
def calc_cw(y):
    # Calculate the weight of each letter grade to be used in the modeling fitting procedure: the weight is inversely proportional to the square root of the frequency of the letter grade in the training sample
    cw = Counter(y)
    class_weight = {k:np.sqrt(cw.most_common()[0][-1]/v, dtype=np.float32) for k,v in cw.items()}
    return class_weight # The output is a dictionary mapping letter grade to the corresponding weight

In [21]:
train_df = df_new[df_new.valid == 0]
test_df = df_new[df_new.valid == 1]

In [22]:
optimal_d = 16
optimal_n = 120
optimal_nf = 12
rf = RandomForestClassifier(n_estimators=optimal_n, criterion="entropy",
                            max_depth=optimal_d,
                            random_state=0, n_jobs=-1, max_features=optimal_nf,
                            class_weight = calc_cw(train_df.grad_6years))
rf.fit(train_df.loc[:,predictors], train_df.grad_6years)

RandomForestClassifier(bootstrap=True,
            class_weight={0.0: 1.0, 1.0: 1.3931639}, criterion='entropy',
            max_depth=16, max_features=12, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=120, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [34]:
xx = np.array(predictors)[np.argsort(rf.feature_importances_)[::-1]]
yy = rf.feature_importances_[np.argsort(rf.feature_importances_)[::-1]]
fi_base = pd.DataFrame({'predictor':xx, 'fi_base':yy})

In [24]:
train_df_afam = train_df[train_df.afam == 1]
test_df_afam = test_df[test_df.afam == 1]

In [25]:
optimal_d = 15
optimal_n = 120
optimal_nf = 11
rf_afam = RandomForestClassifier(n_estimators=optimal_n, criterion="entropy",
                            max_depth=optimal_d,
                            random_state=0, n_jobs=-1, max_features=optimal_nf,
                            class_weight = calc_cw(train_df.grad_6years))
rf_afam.fit(train_df_afam.loc[:,predictors], train_df_afam.grad_6years)

RandomForestClassifier(bootstrap=True,
            class_weight={0.0: 1.0, 1.0: 1.3931639}, criterion='entropy',
            max_depth=15, max_features=11, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=120, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [35]:
xx = np.array(predictors)[np.argsort(rf_afam.feature_importances_)[::-1]]
yy = rf_afam.feature_importances_[np.argsort(rf_afam.feature_importances_)[::-1]]
fi_afam = pd.DataFrame({'predictor':xx, 'fi_afam':yy})

In [27]:
train_df_white = train_df[train_df.white == 1]
test_df_white = test_df[test_df.white == 1]

In [28]:
optimal_d = 15
optimal_n = 120
optimal_nf = 11
rf_white = RandomForestClassifier(n_estimators=optimal_n, criterion="entropy",
                            max_depth=optimal_d,
                            random_state=0, n_jobs=-1, max_features=optimal_nf,
                            class_weight = calc_cw(train_df.grad_6years))
rf_white.fit(train_df_white.loc[:,predictors], train_df_white.grad_6years)

RandomForestClassifier(bootstrap=True,
            class_weight={0.0: 1.0, 1.0: 1.3931639}, criterion='entropy',
            max_depth=15, max_features=11, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=120, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [36]:
xx = np.array(predictors)[np.argsort(rf_white.feature_importances_)[::-1]]
yy = rf_white.feature_importances_[np.argsort(rf_white.feature_importances_)[::-1]]
fi_white = pd.DataFrame({'predictor':xx, 'fi_white':yy})

In [37]:
fi_all = fi_base.merge(fi_afam, on=['predictor'], how='inner').merge(fi_white, on=['predictor'], how='inner')

In [50]:
from scipy.stats import rankdata
for p in ['fi_base', 'fi_afam', 'fi_white']:
    fi_all.loc[:, p+"_rank"] = rankdata(fi_all[p])

In [53]:
afam_rmse = np.sqrt(np.sum((fi_all.fi_base - fi_all.fi_afam)**2))
white_rmse = np.sqrt(np.sum((fi_all.fi_base - fi_all.fi_white)**2))
print(afam_rmse, white_rmse)
afam_rmse_rank = np.sqrt(np.sum((fi_all.fi_base_rank - fi_all.fi_afam_rank)**2))
white_rmse_rank = np.sqrt(np.sum((fi_all.fi_base_rank - fi_all.fi_white_rank)**2))
print(afam_rmse_rank/fi_all.shape[0], white_rmse_rank/fi_all.shape[0])

0.025446081110612133 0.014417848999630981
1.1319429450539957 0.6739502165903136


In [42]:
fi_base_2 = pd.read_csv("D:\\Yifeng -- Project Work\\ys8mz_sandbox\\bias_analyses_3h\\fi_base.csv")
fi_base_2 = fi_base_2.rename(columns = {'fi':'fi_base'})
fi_afam_2 = pd.read_csv("D:\\Yifeng -- Project Work\\ys8mz_sandbox\\bias_analyses_3h\\fi_afam.csv")
fi_afam_2 = fi_afam_2.rename(columns = {'fi':'fi_afam'})
fi_white_2 = pd.read_csv("D:\\Yifeng -- Project Work\\ys8mz_sandbox\\bias_analyses_3h\\fi_white.csv")
fi_white_2 = fi_white_2.rename(columns = {'fi':'fi_white'})
fi_all_2 = fi_base_2.merge(fi_afam_2, on=['predictor'], how='inner').merge(fi_white_2, on=['predictor'], how='inner')

In [47]:
from scipy.stats import rankdata
for p in ['fi_base', 'fi_afam', 'fi_white']:
    fi_all_2.loc[:, p+"_rank"] = rankdata(fi_all_2[p])

In [54]:
afam_rmse_2 = np.sqrt(np.sum((fi_all_2.fi_base - fi_all_2.fi_afam)**2))
white_rmse_2 = np.sqrt(np.sum((fi_all_2.fi_base - fi_all_2.fi_white)**2))
print(afam_rmse_2, white_rmse_2)
afam_rmse_2_rank = np.sqrt(np.sum((fi_all_2.fi_base_rank - fi_all_2.fi_afam_rank)**2))
white_rmse_2_rank = np.sqrt(np.sum((fi_all_2.fi_base_rank - fi_all_2.fi_white_rank)**2))
print(afam_rmse_2_rank/fi_all_2.shape[0], white_rmse_2_rank/fi_all_2.shape[0])

0.013336820972155604 0.006247435684610987
0.8026433379149043 0.4425616263926167


In [55]:
list(fi_all_2.predictor)

['crnt_enrl_intensity',
 'cum_gpa',
 'avg_g',
 'term_gpa_1',
 'past_avg_grade',
 'avg_g_concurrent',
 'median_income_households',
 'perc_below_pov',
 'overall_prop_comp',
 'section_size',
 'distance',
 'age',
 'term_gpa_2',
 'first_strm',
 'cum_cred_earn',
 'gpa_trend',
 'prop_comp_sd',
 'pct_withdrawn',
 'lvl2_share',
 'enrl_intensity',
 'enrl_intensity_trend',
 'online_share',
 'pct_stopped',
 'pct_dev',
 'eve_share',
 'num_of_prior_terms',
 'online_ind',
 'full_time',
 'tenure',
 'SOC_SOC_grade',
 'HUM_SOC_grade',
 'male',
 'HUM_HUM_grade',
 'pct_incomplete',
 'firstgen_0',
 'pell_ever_0',
 'pell_target_1',
 'summer_ind',
 'lvl2_ind',
 'firstgen_1',
 'HUM_MTH_grade',
 'degree_level_1',
 'SOC_HUM_grade',
 'cip_24',
 'college_NVCC',
 'dev',
 'prereq_grade',
 'repeat_grade',
 'MTH_SOC_grade',
 'HUM_MED_grade',
 'college_TCC',
 'MTH_HUM_grade',
 'pell_target_0',
 'ever_dual',
 'degree_level_2',
 'SCI_SOC_grade',
 'HUM_SCI_grade',
 'eve_ind',
 'cip_52',
 'SOC_MED_grade',
 'MTH_MTH_grade'

In [7]:
xx = np.array(predictors)[np.argsort(rf.feature_importances_)[::-1]]
yy = rf.feature_importances_[np.argsort(rf.feature_importances_)[::-1]]
top10 = list(pd.DataFrame({'predictor':xx, 'fi':yy}).iloc[:10,:].predictor)

In [11]:
xx[10], yy[10]

('term_cred_att_sp1', 0.02123789629262577)

In [12]:
pd.DataFrame({'predictor':xx, 'fi':yy}).iloc[:10,:].to_csv("top10_predictors_without_race.csv", index=False)

In [24]:
top10_predictors = list(pd.DataFrame({'predictor':xx, 'fi':yy}).iloc[:10,:].predictor)

In [32]:
df2 = pd.read_stata("degree_completion_full_sample_orig_RF.dta")
df2 = df2.merge(test_df.loc[:,['vccsid','white','afam']])

In [33]:
top10_new = [p for p in top10 if p.endswith("1")]
top10_new

['prop_comp_sp1', 'term_gpa_fa1', 'term_gpa_sp1', 'prop_comp_fa1']

In [34]:
from statsmodels.stats.weightstats import CompareMeans,DescrStatsW

In [42]:
rows = []
for pp in top10_predictors:
    if pp in top10_new:
        p = pp + "_orig"
    else:
        p = pp
    x = np.array(df2.loc[:,p])
    white_indices = np.where(df2.white == 1)[0]
    nonwhite_indices = np.where(df2['afam'] == 1)[0]
    x_1 = x[white_indices]
    x_1 = x_1[~pd.isnull(x_1)]
    x_2 = x[nonwhite_indices]
    x_2 = x_2[~pd.isnull(x_2)]
    test_result = CompareMeans(DescrStatsW(x_1), DescrStatsW(x_2)).ztest_ind(alternative='two-sided', usevar='unequal')
    p_vals = str(test_result[1]).split("e")
    if len(p_vals) == 1:
        new_p_val = str(round(float(p_vals[0]),4))
    else:
        new_p_val = str(round(float(p_vals[0]),4)) + "e" + p_vals[1]
    white_mean = round(np.mean(x_1), 4)
    diff_in_mean = round(np.mean(x_2) - np.mean(x_1),4)
    if pp in top10_new:
        x_1 = x[white_indices]
        x_1 = pd.isnull(x_1).astype(int)
        x_2 = x[nonwhite_indices]
        x_2 = pd.isnull(x_2).astype(int)
        test_result = CompareMeans(DescrStatsW(x_1), DescrStatsW(x_2)).ztest_ind(alternative='two-sided', usevar='unequal')
        p_vals = str(test_result[1]).split("e")
        if len(p_vals) == 1:
            new_p_val_2 = str(round(float(p_vals[0]),4))
        else:
            new_p_val_2 = str(round(float(p_vals[0]),4)) + "e" + p_vals[1]
        white_mean_2 = round(np.mean(x_1), 4)
        diff_in_mean_2 = round(np.mean(x_2) - np.mean(x_1),4)
        rows.append((pp, white_mean, diff_in_mean, new_p_val, white_mean_2, diff_in_mean_2, new_p_val_2))
    else:
        rows.append((pp, white_mean, diff_in_mean, new_p_val))
table = pd.DataFrame(rows[:], columns=['predictor', "white_mean", 'diff_in_mean', 'p_value', 'white_mi_perc', 'diff_in_mi_perc', 'p_value_mi'])
table.loc[:,'p_value'] = table.p_value.astype(float).round(4)
table.loc[:,'p_value_mi'] = table.p_value_mi.astype(float).round(4)
table.round(4).to_csv("key_predictor_difference_{}.csv".format('afam'), index=False)