In [18]:
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import rankdata
from sklearn.metrics import confusion_matrix
import pickle
from sklearn.metrics import roc_auc_score
import matplotlib
font = {'size': 24}
matplotlib.rc('font', **font)
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter

In [19]:
# Load the prediction-score table from a Stata .dta file. The cells below read
# its `pred_y`, `test_y` and `race_column` columns.
df = pd.read_stata("../degree_completion_s2/pred_score_by_race.dta")

In [20]:
def two_side_z_test(n1, p1, n2, p2):
    """Return the two-sided p-value of a z-test comparing two proportions.

    The standard error is unpooled and divides each group's variance by
    ``n - 1``:  ``sqrt(p2*(1-p2)/(n2-1) + p1*(1-p1)/(n1-1))``.

    Parameters
    ----------
    n1, n2 : number
        Sample sizes of the two groups (each must be greater than 1, since
        ``n - 1`` is used as a divisor).
    p1, p2 : number
        Observed proportions in the two groups.

    Returns
    -------
    float
        ``2 * P(Z >= |z|)`` for a standard normal ``Z``.
    """
    z = (p2-p1)/np.sqrt(p2*(1-p2)/(n2-1)+p1*(1-p1)/(n1-1))
    # Use the survival function rather than 1 - cdf: for |z| above roughly 8
    # the cdf rounds to exactly 1.0 and the subtraction would return 0.0,
    # losing the (tiny but non-zero) tail probability.
    return 2*stats.norm.sf(np.abs(z))

In [21]:
# Collect score, outcome and race into a fresh frame, then shuffle all rows
# (frac=1) with a fixed seed so the shuffled order is reproducible.
new_pred_real = pd.DataFrame(
    {
        'pred_score': list(df['pred_y']),
        'real_y': list(df['test_y']),
        'race': list(df['race_column']),
    }
).sample(frac=1, random_state=54321)

In [22]:
# Store the rank of each raw score in `pred_y`. method='ordinal' gives every row
# a distinct rank, with tied scores ranked in order of appearance -- presumably
# why the rows were shuffled when the frame was built (confirm with the author).
new_pred_real.loc[:, 'pred_y'] = rankdata(new_pred_real.pred_score, method='ordinal')

In [23]:
# --- 50 equal-count bins (2-percentile steps) ---------------------------------
# `pred_y` holds ranks, so the bins are cut on the rank scale with outer edges
# 0 and N+1; every rank then falls inside some (left, right] interval.
# (Closing the bins with an upper edge of 1 cannot work on ranks: the edge list
# would not be increasing and pd.cut raises ValueError.)
rank_first = new_pred_real.pred_y.rank(method='first')
edges_50 = [0] + list(np.percentile(rank_first, np.arange(2,100,2))) + [new_pred_real.shape[0]+1]
new_pred_real.loc[:,'pred_y_binned'] = pd.cut(rank_first, bins=edges_50)

# --- 10 equal-count bins (deciles) --------------------------------------------
# First try cutting `pred_y` itself, padding the outer edges by 1e-3 so the
# minimum and maximum are included; if pd.cut rejects the edges (ValueError),
# fall back to cutting the method='first' ranks with outer edges 0 and N+1.
try:
    edges_10 = [min(new_pred_real.pred_y) - 1e-3] + list(np.percentile(new_pred_real.pred_y, np.arange(10,100,10))) + [max(new_pred_real.pred_y) + 1e-3]
    new_pred_real.loc[:,'pred_y_binned_2'] = pd.cut(new_pred_real.pred_y, bins=edges_10)
except ValueError:
    rank_first = new_pred_real.pred_y.rank(method='first')
    edges_10 = [0] + list(np.percentile(rank_first, np.arange(10,100,10))) + [new_pred_real.shape[0]+1]
    new_pred_real.loc[:,'pred_y_binned_2'] = pd.cut(rank_first, bins=edges_10)

# Map each decile interval (in ascending order) to its midpoint percentile:
# 5, 15, ..., 95.
pct_dict = {e:(10*indx+5) for indx, e in enumerate(sorted(Counter(new_pred_real.pred_y_binned_2).keys()))}

In [24]:
# Express the outcome in percent. The guard keeps the cell idempotent: without
# it, re-running the cell would multiply an already-scaled column by 100 again.
if new_pred_real.real_y.max() <= 1:
    new_pred_real.loc[:,'real_y'] = new_pred_real.real_y * 100
for r in ['afam']:
    print(r)
    # Rows belonging to the reference group ('white') or the focal group `r`.
    in_pair = new_pred_real.race.isin(['white', r])

    # ---- Scatter layer: mean outcome per 2-percentile bin and race ----
    new_sub = new_pred_real[in_pair].copy()
    new_sub = new_sub.groupby(['pred_y_binned', 'race']).agg({'real_y':'mean'}).reset_index()
    # 0/1 indicator column named after the focal group; used only for sorting.
    new_sub.loc[:,r] = (new_sub.race == r).astype(int)
    new_sub = new_sub.sort_values([r, 'pred_y_binned'])
    print(new_sub.shape[0])
    # Bin k (0-based) is plotted at percentile 2k+1, i.e. 1, 3, ..., 99. Deriving
    # it from the bin's category code keeps each row aligned with its own bin
    # even when the grouped frame does not have exactly 50 rows per race.
    new_sub.loc[:,'pred_score_percentile'] = new_sub.pred_y_binned.cat.codes.astype(float) * 2 + 1
    new_sub = new_sub.rename(columns={'real_y':'share_of_actual_ABC'}).drop(['pred_y_binned'], axis=1)

    sns.set_style(style = "darkgrid")
    fig, ax = plt.subplots(1,1, figsize=(24,16.5))
    sns.scatterplot(x="pred_score_percentile", y="share_of_actual_ABC", hue='race', hue_order = ['white', r],
                    data=new_sub,
                    palette = ['C0','C2'], marker="x", ax=ax, s=150, alpha=0.7, linewidth = 3)
    # Pick the legend handles by their hue label instead of by position, so the
    # result is the same whether or not the legend carries a leading title entry.
    handles, labels = ax.get_legend_handles_labels()
    handle_by_label = dict(zip(labels, handles))
    ax.legend(handles=[handle_by_label['white'], handle_by_label[r]], labels=['White', 'Black'], fontsize='40', markerscale=3)
    plt.xticks(np.linspace(0,100,11),fontsize=36)
    plt.yticks(np.linspace(0,100,11),fontsize=36)
    
    # ---- Line layer: decile means with 95% confidence bars, from row-level data ----
    new_sub = new_pred_real[in_pair].copy()
    new_sub.loc[:,'pred_score_percentile_new'] = new_sub.pred_y_binned_2.apply(lambda x: pct_dict[x])
    # Seed NumPy's global RNG ahead of the confidence-interval computation.
    # NOTE(review): some seaborn versions bootstrap with their own generator
    # unless `seed=` is passed to lineplot -- confirm this call still pins the
    # error bars in the environment used.
    np.random.seed(4321)
    sns.lineplot(data=new_sub, x="pred_score_percentile_new", y="real_y", hue='race', hue_order = ['white', r],
                 err_style="bars", err_kws = {'capsize': 8, 'elinewidth':3, 'capthick':2}, 
                 ci=95, ax=ax, linewidth = 6,
                 palette = ['C0','C2'], legend=False,
                 marker=".", markersize=36)
    plt.xlabel("Enrollment Intensity Percentile", fontsize=40, labelpad=16)
    plt.ylabel("% of Students Who Earn Degree or Certificate in 6 Years", fontsize=36, labelpad=16)
    plt.savefig("../degree_completion_s2/degree_completion_enrollment_intensity_model.png")

afam
100
