In [1]:
import numpy as np
import pandas as pd
import os

import statsmodels.api as sm

In [2]:
# Every entry of `state_level_data` is kept except archives (.zip),
# documentation (.pdf) and the macOS `.DS_Store` metadata file.
state_list = [
    entry
    for entry in os.listdir('state_level_data')
    if not ('.zip' in entry or '.pdf' in entry or entry == '.DS_Store')
]

In [20]:
def get_state_df(state, clf_result=None): 
    """Load one state's person-level csv and build the regression design table.

    Parameters
    ----------
    state : str
        Two-letter state code; it is lower-cased to locate the folder
        ``state_level_data/csv_p<state>/``.
    clf_result : pandas.DataFrame, optional
        Table joined onto the state data through its ``'SOC Code'`` column.
        When omitted, ``clf_result.csv`` is read from the working directory
        (first column used as the index).

    Returns
    -------
    df : pandas.DataFrame
        Rows with complete data after the join, with ``RAC1P``, ``SEX`` and
        ``SCHL`` dummy-encoded (first level dropped), an ``AGEP_quad`` column
        (``AGEP`` squared) and one ``'SEX_2 * <race dummy>'`` interaction
        column per race dummy.
    var_list : list of str
        Race dummies, then sex-by-race interactions, then schooling dummies.

    Raises
    ------
    FileNotFoundError
        If the state folder contains no ``.csv`` file.
    """
    if clf_result is None:
        # Loaded lazily: a default evaluated in the signature would read the
        # file once at definition time and fail there if it were missing.
        clf_result = pd.read_csv('clf_result.csv', index_col=0)

    state_dir = f'state_level_data/csv_p{state.lower()}/'
    csv_files = [name for name in os.listdir(state_dir) if name.lower().endswith('.csv')]
    if not csv_files:
        raise FileNotFoundError(f'No csv file found for {state} in {state_dir}')
    if len(csv_files) > 1:
        # Same choice as before: the last csv listed is the one that is used.
        print(f'More than 1 csv file exist for {state}.')

    keep_cols = ['AGEP', 'SEX', 'SOCP', 'RAC1P', 'PWGTP', 'SCHL']
    # Only the needed columns are parsed; SOCP is forced to str because it is
    # sliced as text below and all-numeric codes would otherwise load as ints.
    df = pd.read_csv(f'{state_dir}{csv_files[-1]}', usecols=keep_cols, dtype={'SOCP': str})
    df = df[keep_cols].dropna()

    # Insert a dash after the first two characters so the code matches the
    # 'SOC Code' format used by clf_result (e.g. '151131' -> '15-1131').
    df['SOCP'] = df['SOCP'].str[:2] + '-' + df['SOCP'].str[2:]

    # Left join followed by dropna removes rows without a match in clf_result.
    df = df.merge(clf_result, left_on='SOCP', right_on='SOC Code', how='left').dropna(axis=0)

    # dtype is pinned so the dummies stay numeric (uint8) on every pandas
    # version; newer releases default to bool.
    df = pd.get_dummies(
        df,
        columns=['RAC1P', 'SEX', 'SCHL'],
        prefix=['RAC1P', 'SEX', 'SCHL'],
        drop_first=True,
        dtype=np.uint8,
    )
    df['AGEP_quad'] = df['AGEP'] ** 2

    RAC_list = [col for col in df.columns if col.startswith('RAC1P_')]
    SCHL_list = [col for col in df.columns if col.startswith('SCHL_')]

    RAC_list_interaction = []
    for race in RAC_list: 
        interaction = f'SEX_2 * {race}'
        df[interaction] = df['SEX_2'] * df[race]
        RAC_list_interaction.append(interaction)
    return df, RAC_list + RAC_list_interaction + SCHL_list

In [24]:
def state_level_gender_gap(state): 
    """Return the fitted coefficient on ``SEX_2`` for one state.

    The table from ``get_state_df(state)`` is used to fit a weighted least
    squares regression of ``log(pred_prob)`` on a constant, ``SEX_2``,
    ``AGEP``, ``AGEP_quad`` and every column named in the returned variable
    list, with the ``PWGTP`` column as observation weights.

    Parameters
    ----------
    state : str
        State code forwarded unchanged to ``get_state_df``.

    Returns
    -------
    float
        Point estimate of the ``SEX_2`` coefficient. Because the model
        includes sex-by-race interaction terms, this is the effect for the
        omitted (first) race category.
    """
    df, var_list = get_state_df(state)
    X_reg = sm.add_constant(df[['SEX_2', 'AGEP', 'AGEP_quad'] + var_list])
    # NOTE(review): log() yields -inf if any pred_prob is exactly 0 — confirm
    # that clf_result never contains such values.
    log_pred_prob = np.log(df['pred_prob'])
    # WLS documents weights as a 1-D array, so pass the column itself rather
    # than an (n, 1) frame.
    weights = df['PWGTP'].values
    wls_fit = sm.WLS(log_pred_prob, X_reg, weights=weights).fit()
    # The confidence interval is symmetric about the estimate, so its midpoint
    # (used previously) is just the coefficient; read it directly.
    return wls_fit.params['SEX_2']

In [25]:
# Run the regression once per state folder. The state code is taken from the
# last two characters of each folder name, and progress is echoed as we go.
state_level_reg_gender_result = {}
for state in (folder_name[-2:] for folder_name in state_list):
    gender_gap = state_level_gender_gap(state)
    state_level_reg_gender_result[state] = gender_gap
    print('Completed ' + state)

Completed mi
Completed mn
Completed al
Completed nh
Completed ky
Completed ak
Completed ma
Completed mo
Completed mt
Completed ms
Completed ga
Completed ut
Completed sc
Completed sd
Completed va
Completed vt
Completed tx
Completed wy
Completed ri
Completed wv
Completed fl
Completed ct
Completed ia
Completed ca
Completed co
Completed in
Completed ok
Completed ny
Completed az
Completed de
Completed hi
Completed ne
Completed ks
Completed md
Completed dc
Completed ar
Completed nv
Completed me
Completed nj
Completed nm
Completed nd
Completed nc
Completed pa
Completed pr
Completed tn
Completed wa
Completed wi
Completed la
Completed il
Completed or
Completed oh
Completed id


In [27]:
import plotly.express as px

# Parallel lists for the map: upper-cased state codes and, in the same order,
# the value stored for each state in `state_level_reg_gender_result`.
state_list_plot = [code.upper() for code in state_level_reg_gender_result]
value_list_plot = list(state_level_reg_gender_result.values())

# Choropleth over US states, coloured by the per-state value.
fig = px.choropleth(
    locations=state_list_plot,
    locationmode="USA-states",
    color=value_list_plot,
    scope="usa",
)
# Optional centred title, currently disabled:
# fig.update_layout(
#     title={
#         'text': 'Gender Gaps in STEM Engagement in the US (2018)', 
#         'y':0.95,
#         'x':0.5,
#         'xanchor': 'center',
#         'yanchor': 'top'
#     })
fig.show()

In [32]:
# Per-state values as a table, ordered alphabetically by state code.
coef_table = pd.DataFrame({'state': state_list_plot, 'coef': value_list_plot})
coef_table.sort_values(by='state')

Unnamed: 0,state,coef
5,AK,0.105701
2,AL,0.047565
35,AR,0.149072
28,AZ,0.109227
23,CA,0.057969
24,CO,0.084865
21,CT,0.141742
34,DC,0.116631
29,DE,0.141635
20,FL,0.104873
