In [None]:

# Model specification reused for every cohort fit: High_ADI_Placement regressed on
# gender (reference level "F"), grouped primary specialty (reference level
# "Primary Care"), top20_institution and Log_Total_Population.
formula_high_adi = 'High_ADI_Placement ~ C(gndr, Treatment(reference="F")) + \
           C(pri_spec_grouped, Treatment(reference="Primary Care")) + \
           top20_institution + Log_Total_Population'

# --- 2015 cohort: fit the logit model and show the coefficient summary ---
model_high_adi_2015 = smf.logit(formula=formula_high_adi, data=cohort_2015)
result_high_adi_2015 = model_high_adi_2015.fit()
print("Logistic Regression Results for High ADI Placement (2015 Cohort):")
print(result_high_adi_2015.summary())

# Coefficients and their confidence bounds are on the log-odds scale; exponentiating
# the whole table at once turns them into odds ratios with matching interval bounds.
params_2015 = result_high_adi_2015.params
ci_bounds_2015 = result_high_adi_2015.conf_int()
ci_bounds_2015.columns = ['2.5%', '97.5%']
conf_2015 = np.exp(ci_bounds_2015.assign(Odds_Ratio=params_2015))
print("\nOdds Ratios and 95% Confidence Intervals for 2015 Cohort:")
print(conf_2015)




# --- 2020 cohort: same model specification (formula_high_adi), different data ---
model_high_adi_2020 = smf.logit(formula=formula_high_adi, data=cohort_2020)
result_high_adi_2020 = model_high_adi_2020.fit()
print("\nLogistic Regression Results for High ADI Placement (2020 Cohort):")
print(result_high_adi_2020.summary())

# Coefficients and their confidence bounds are on the log-odds scale; exponentiating
# the whole table at once turns them into odds ratios with matching interval bounds.
params_2020 = result_high_adi_2020.params
ci_bounds_2020 = result_high_adi_2020.conf_int()
ci_bounds_2020.columns = ['2.5%', '97.5%']
conf_2020 = np.exp(ci_bounds_2020.assign(Odds_Ratio=params_2020))
print("\nOdds Ratios and 95% Confidence Intervals for 2020 Cohort:")
print(conf_2020)


Logistic regression by specialization (2015 cohort)

In [None]:
# Per-specialization logistic regressions on the 2015 cohort.
#
# For every value of 'pri_spec_grouped' a separate logit model of
# High_ADI_Placement is fitted on that specialization's rows, and its summary
# and odds-ratio table are printed. Specializations that are too small, or
# whose model cannot be built or fitted, are reported and skipped.

# Minimum number of rows a specialization needs before a model is attempted.
MIN_SPEC_SAMPLE_SIZE = 50

# 'pri_spec_grouped' is left out of the formula because it is constant within
# each subset. The formula does not depend on the loop variable, so it is
# defined once here.
formula_spec = 'High_ADI_Placement ~ C(gndr, Treatment(reference="F")) + top20_institution + Log_Total_Population'

# List of unique specializations
specializations = cohort_2015['pri_spec_grouped'].unique()

# Iterate over each specialization
for spec in specializations:
    # Filter data for the specialization
    data_spec = cohort_2015[cohort_2015['pri_spec_grouped'] == spec]

    # Ensure sufficient sample size
    if len(data_spec) < MIN_SPEC_SAMPLE_SIZE:
        print(f"Skipping {spec} due to insufficient data.")
        continue

    # Build AND fit inside the try: constructing the model already evaluates
    # the formula against the subset, so a design-matrix problem (for example
    # the reference level "F" not occurring in this subset) is raised there,
    # before fit() is ever called. Catching it here skips just this
    # specialization instead of aborting the whole loop.
    try:
        model_spec = smf.logit(formula=formula_spec, data=data_spec)
        result_spec = model_spec.fit()
    except Exception as e:
        print(f"Could not fit model for {spec}: {e}")
        continue

    # Print the summary
    print(f"\nLogistic Regression Results for {spec} (2015 Cohort):")
    print(result_spec.summary())

    # Calculate odds ratios: exponentiate the log-odds coefficients and their
    # confidence bounds.
    params_spec = result_spec.params
    conf_spec = result_spec.conf_int()
    conf_spec.columns = ['2.5%', '97.5%']
    odds_ratios_spec = pd.DataFrame({
        'Variable': params_spec.index,
        'OR': np.exp(params_spec),
        '2.5%': np.exp(conf_spec['2.5%']),
        '97.5%': np.exp(conf_spec['97.5%'])
    })
    print("\nOdds Ratios and 95% Confidence Intervals:")
    print(odds_ratios_spec)


In [None]:
from scipy.stats import fisher_exact

# State-level association between 'top20_institution' and 'High_ADI_Placement',
# computed separately for the 2015 and 2020 cohorts.

# Column order of every state-level results frame. Passing it explicitly keeps
# the frame (and the sort on 'odds_ratio') valid even when no state qualifies.
STATE_RESULT_COLUMNS = ['State', 'odds_ratio', 'ci_lower', 'ci_upper', 'p_value', 'n_top20', 'n_total']

# Number of lowest-odds-ratio states reported per cohort; the printed heading
# is derived from this value so the label and the table cannot drift apart.
N_LOWEST_STATES = 10


def _state_level_odds_ratios(cohort, states):
    """Compute a per-state odds ratio of 'top20_institution' vs 'High_ADI_Placement'.

    Parameters
    ----------
    cohort : pandas.DataFrame
        Must contain the columns 'State', 'top20_institution' and
        'High_ADI_Placement'.
    states : iterable
        State values to evaluate (compared against cohort['State']).

    Returns
    -------
    list of dict
        One record per state whose cross-tabulation is exactly 2x2, with the
        keys listed in STATE_RESULT_COLUMNS. States where either variable
        takes a single value (or that have no rows) are omitted.
    """
    records = []
    for state in states:
        # Subset data for the state
        state_data = cohort[cohort['State'] == state]

        # Contingency table for 'top20_institution' vs. 'High_ADI_Placement'
        contingency_table = pd.crosstab(state_data['top20_institution'], state_data['High_ADI_Placement'])

        # Skip states that do not have both levels of both variables
        if contingency_table.shape != (2, 2):
            continue

        # statsmodels provides the odds ratio and its confidence interval
        ct = sm.stats.Table2x2(contingency_table.values)
        ci_low, ci_upp = ct.oddsratio_confint()
        # Fisher's Exact Test provides the p-value
        _, p_value = fisher_exact(contingency_table)

        records.append({
            'State': state,
            'odds_ratio': ct.oddsratio,
            'ci_lower': ci_low,
            'ci_upper': ci_upp,
            'p_value': p_value,
            # presumably a 0/1 indicator, so the sum is a count - TODO confirm
            'n_top20': state_data['top20_institution'].sum(),
            'n_total': len(state_data)
        })
    return records


# 2015 Cohort
states_2015 = cohort_2015['State'].unique()
state_results_2015 = _state_level_odds_ratios(cohort_2015, states_2015)

# Build the results frame and sort it by odds ratio (ascending)
state_results_df_2015 = pd.DataFrame(state_results_2015, columns=STATE_RESULT_COLUMNS)
state_results_df_2015 = state_results_df_2015.sort_values('odds_ratio')

# States with the lowest odds ratios
top10_states_lowest_odds_2015 = state_results_df_2015.head(N_LOWEST_STATES)

print(f"Top {N_LOWEST_STATES} States with Lowest Odds Ratios (2015 Cohort):")
print(top10_states_lowest_odds_2015[STATE_RESULT_COLUMNS])

# For 2020 Cohort
print("\nPerforming State-Level Analysis for 2020 Cohort...")

states_2020 = cohort_2020['State'].unique()
state_results_2020 = _state_level_odds_ratios(cohort_2020, states_2020)

# Build the results frame and sort it by odds ratio (ascending)
state_results_df_2020 = pd.DataFrame(state_results_2020, columns=STATE_RESULT_COLUMNS)
state_results_df_2020 = state_results_df_2020.sort_values('odds_ratio')

# States with the lowest odds ratios
top10_states_lowest_odds_2020 = state_results_df_2020.head(N_LOWEST_STATES)

print(f"Top {N_LOWEST_STATES} States with Lowest Odds Ratios (2020 Cohort):")
print(top10_states_lowest_odds_2020[STATE_RESULT_COLUMNS])