In [19]:
import numpy as np
import pandas as pd
from scipy import stats
import pickle
from collections import Counter
from sklearn.metrics import precision_recall_curve, roc_auc_score, confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from scipy.stats.mstats import gmean
import math

In [20]:
# Predictor columns compared across groups in the summary table built below.
# NOTE(review): the name says "top10" but the list holds nine entries — confirm
# whether a tenth predictor was dropped on purpose.
top10_predictors = [
    'crnt_enrl_intensity',
    'cum_gpa',
    'avg_g',
    'term_gpa_1',
    'past_avg_grade',
    'avg_g_concurrent',
    'median_income_households',
    'perc_below_pov',
    'overall_prop_comp',
]

In [21]:
# Subset of the predictors for which the comparison cell reads the
# "<name>_orig" column (instead of "<name>") and additionally compares the
# share of missing values between groups.
top10_new = [
    'cum_gpa',
    'term_gpa_1',
    'past_avg_grade',
    'avg_g_concurrent',
    'overall_prop_comp',
]

In [22]:
# Load the predictor table from a Stata file located in the working directory.
# The comparison cell further down reads the `white` and `afam` indicator
# columns plus one column per predictor from this frame.
df2 = pd.read_stata("top10_predictors_table.dta")

In [23]:
from statsmodels.stats.weightstats import CompareMeans,DescrStatsW

In [24]:
# Build a table comparing each key predictor between white students and the
# comparison group, and write it to CSV.
#
# For every predictor the table reports the white-group mean, the
# (comparison group - white) difference in means, and the two-sided p-value of
# an unequal-variance two-sample z-test. For predictors listed in `top10_new`
# the values are read from the "<name>_orig" column and the same three
# statistics are also reported for a 0/1 "value is missing" indicator.

# Indicator column in df2 selecting the comparison group; also used in the
# output file name so the two can never drift apart.
comparison_group = 'afam'


def _group_mean_comparison(white_values, group_values):
    """Return (white mean, group-minus-white mean difference, p-value).

    The p-value is the two-sided result of statsmodels' two-sample z-test
    with unequal variances. Each number is rounded to 4 decimals exactly
    once, starting from the full-precision value; rounding the mantissa of
    the printed p-value first and then rounding again can shift the result
    (e.g. 4.99996e-05 -> "5.0e-05" -> 0.0001 instead of 0.0).
    """
    test_result = CompareMeans(DescrStatsW(white_values), DescrStatsW(group_values)).ztest_ind(alternative='two-sided', usevar='unequal')
    white_mean = np.mean(white_values)
    return (
        round(white_mean, 4),
        round(np.mean(group_values) - white_mean, 4),
        round(float(test_result[1]), 4),
    )


# Group membership does not depend on the predictor, so compute it once.
is_white = np.array(df2['white'] == 1)
in_group = np.array(df2[comparison_group] == 1)

rows = []
for pp in top10_predictors:
    tracks_missingness = pp in top10_new
    column = pp + "_orig" if tracks_missingness else pp
    x = np.array(df2.loc[:, column])
    white_values = x[is_white]
    group_values = x[in_group]

    # Compare the observed (non-missing) values of the predictor.
    value_stats = _group_mean_comparison(
        white_values[~pd.isnull(white_values)],
        group_values[~pd.isnull(group_values)],
    )

    if tracks_missingness:
        # Compare the share of missing values via a 0/1 missing indicator.
        missing_stats = _group_mean_comparison(
            pd.isnull(white_values).astype(int),
            pd.isnull(group_values).astype(int),
        )
    else:
        # Keep every row the same width instead of relying on pandas to pad
        # ragged tuples (which fails when no row carries the extra columns).
        missing_stats = (np.nan, np.nan, np.nan)

    rows.append((pp,) + value_stats + missing_stats)

table = pd.DataFrame(rows, columns=['predictor', "white_mean", 'diff_in_mean', 'p_value', 'white_mi_perc', 'diff_in_mi_perc', 'p_value_mi'])
table.round(4).to_csv("key_predictor_difference_{}.csv".format(comparison_group), index=False)