# Hypothesis 2 — Education & Diabetes Prevention

In [None]:
# Guard: without any non-missing education value there is nothing to plot.
if not base_df['education'].notna().any():
    raise ValueError("Education is all NA — rebuild mapping in Cell 0 first.")

# `prevalence` is a helper defined elsewhere in the notebook; the columns read
# below ('education', 'prevalence', 'ci_lo', 'ci_hi') are what this cell relies on.
prev_edu = prevalence(base_df, 'education')

# Plotly wants error bars as distances from the point estimate, not as the
# absolute interval bounds, so convert both sides of the interval.
err_above = prev_edu['ci_hi'] - prev_edu['prevalence']
err_below = prev_edu['prevalence'] - prev_edu['ci_lo']

fig = (
    px.scatter(
        prev_edu,
        x='education',
        y='prevalence',
        error_y=err_above,
        error_y_minus=err_below,
        title='Diabetes Prevalence by Education (95% CI)',
    )
    .update_traces(mode='lines+markers')              # join the points with a line
    .update_yaxes(range=[0, 1], tickformat=".0%")     # fixed 0–100% axis
)
fig.show()





In [None]:
# Education gradient in diabetes prevalence, drawn as one line per income group.
# The whole cell is a no-op when base_df has no 'income' column.
if 'income' in base_df.columns:
    # A numeric income with more than 8 distinct values is cut into quartile
    # bands for readability; anything else is used as the grouping key as-is.
    income_is_continuous = (
        pd.api.types.is_numeric_dtype(base_df['income'])
        and base_df['income'].nunique() > 8
    )
    inc = 'income_band' if income_is_continuous else 'income'
    if income_is_continuous:
        # NOTE: this adds the 'income_band' column to base_df itself.
        base_df['income_band'] = pd.qcut(
            base_df['income'], 4, labels=['Q1', 'Q2', 'Q3', 'Q4']
        )

    # Mean of the outcome column within each education × income cell.
    cell_means = base_df.groupby(['education', inc])['diabetes_binary'].mean()
    tmp = cell_means.reset_index(name='prevalence')

    fig = px.line(
        tmp,
        x='education',
        y='prevalence',
        color=inc,
        markers=True,
        title='Diabetes Prevalence by Education, stratified by Income',
    )
    fig.update_yaxes(range=[0, 1], tickformat=".0%")
    fig.show()





In [None]:
def _plot_share_by_education(var, title):
    """Show a 100%-stacked bar chart of `var` within each education level.

    Counts the rows of ``base_df`` in every education × `var` cell, converts
    the counts to within-education shares, and renders them with plotly.

    Args:
        var: name of a column in ``base_df`` to break each education bar by.
        title: chart title, used verbatim.
    """
    counts = (base_df.groupby(['education', var]).size()
                     .reset_index(name='n'))
    # Share of each cell within its education level (each bar sums to 1).
    counts['pct'] = counts['n'] / counts.groupby('education')['n'].transform('sum')
    fig = px.bar(counts, x='education', y='pct', color=var, barmode='stack',
                 title=title)
    fig.update_yaxes(range=[0, 1], tickformat=".0%")
    fig.show()


# Healthcare-access indicators; columns absent from base_df are skipped.
for access_var in ['anyhealthcare', 'nodocbccost']:
    if access_var in base_df.columns:
        _plot_share_by_education(
            access_var,
            f'{access_var} distribution by Education (100% stacked)')

# Health-behaviour indicators; same chart, title-cased variable name.
for beh in ['smoker', 'physactivity']:
    if beh in base_df.columns:
        _plot_share_by_education(
            beh,
            f'{beh.title()} distribution by Education (100% stacked)')

























In [None]:
# Logistic regression of diabetes status on the numeric education score,
# adjusted for whichever of the candidate controls exist in base_df.
controls = [c for c in ['age', 'sex', 'bmi', 'income', 'anyhealthcare', 'nodocbccost',
                        'smoker', 'physactivity', 'hvyalcoholconsump', 'fruits', 'veggies']
            if c in base_df.columns]

# Drop rows missing the outcome or the main predictor before fitting.
mdata = base_df.dropna(subset=['diabetes_binary', 'education_num'])

# education_num always enters; an empty controls list leaves it as the only term.
formula = 'diabetes_binary ~ ' + ' + '.join(['education_num', *controls])
model = smf.logit(formula, data=mdata).fit(disp=False)
print(model.summary())

# ORs
# A bare `or_tab` expression in the middle of a cell is not rendered by the
# notebook (only a cell's final expression is), so print the table explicitly.
or_tab = odds_ratio_table(model)
print(or_tab)

# Marginal effects
mfx = model.get_margeff(at='overall', method='dydx')
print(mfx.summary())

                           Logit Regression Results                           
Dep. Variable:        diabetes_binary   No. Observations:                70692
Model:                          Logit   Df Residuals:                    70679
Method:                           MLE   Df Model:                           12
Date:                Sat, 01 Nov 2025   Pseudo R-squ.:                  0.1716
Time:                        22:03:40   Log-Likelihood:                -40593.
converged:                       True   LL-Null:                       -49000.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept              -4.1240      0.081    -51.225      0.000      -4.282      -3.966
sex[T.Male]             0.3296      0.018     18.696      0.000       0.295       0.364
smoker[T.Yes]   