In [1]:
import os
import pandas as pd
import numpy as np

import statsmodels.api as sm
import statsmodels.formula.api as smf

# Pandas display settings: a limit of None removes truncation, so every
# column and every row of a displayed frame is rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
# Directory that holds the input data. Set the EHEALTH_DATA_DIR environment
# variable to point at your own copy; when it is unset, the original author's
# local path is used as the fallback so existing setups keep working.
DATA_DIR = os.environ.get(
    'EHEALTH_DATA_DIR',
    'E:\\OneDrive\\Documents\\GitHub\\eHealthEquity\\Research\\Data Source',
)
# Switch the working directory so files can be opened by bare file name.
# os.chdir raises an OSError subclass if DATA_DIR does not exist.
os.chdir(DATA_DIR)

In [23]:
# Read the merged CSV from the current working directory.
df = pd.read_csv('acs5y_2021_brfss_2021_merged.csv')

# Identifier columns carried along with the measures.
index_list = ['state', 'county']

# Columns prefixed 'brfss_', excluding names that end in 'crdprv', 'upper'
# or 'lower' (presumably crude-prevalence and confidence-bound columns --
# TODO confirm against the data dictionary).
brfss_list = [
    name
    for name in df.columns
    if name.startswith('brfss_')
    and not name.endswith(('crdprv', 'upper', 'lower'))
]

# Columns prefixed 'pct_'.
acs_list = [name for name in df.columns if name.startswith('pct_')]

# Final column order: identifiers, then BRFSS columns, then 'pct_' columns.
all_columns = [*index_list, *brfss_list, *acs_list]

# Keep only the selected columns (this rebinds `df`).
df = df[all_columns]

# Feature Engineering

Unnamed: 0,state,county,brfss_access2_ageadjprv,brfss_checkup_ageadjprv,brfss_csmoking_ageadjprv,brfss_depression_ageadjprv,brfss_diabetes_ageadjprv,brfss_ghlth_ageadjprv,brfss_mhlth_ageadjprv,brfss_obesity_ageadjprv,pct_ed_lt9,pct_ed_9_12,pct_ed_hs,pct_ed_sc,pct_ed_asc,pct_ed_b,pct_ed_gr,pct_age_gte65,pct_race_white,pct_race_black,pct_race_aian,pct_race_asian,pct_race_nhopi,pct_race_other,pct_eth_hisp,pct_sex_male,pct_sex_female,pct_occ_unemp,pct_occ_mgt,pct_occ_svc,pct_occ_sales,pct_occ_nat_res,pct_occ_prod,pct_hlth_unins,pct_ses_pov,pct_tp_veh_0,pct_tp_veh_1,pct_tp_veh_2,pct_tp_veh_3,pct_dg_bb_int,edu_bach
0,Alaska,Aleutians East,16.9,60.7,18.6,13.1,12.6,19.1,12.9,31.3,6.1,9.2,41.6,21.9,5.4,10.6,5.4,11.7,14.6,2.1,44.1,20.9,0.3,6.5,10.5,59.1,40.9,3.5,23.9,13.2,11.7,12.7,38.6,40.0,13.4,24.0,37.2,28.0,10.8,57.5,16.0


# Linear Regression Assumptions Check
1. Linearity between the dependent variable and the independent variables
2. No or little multicollinearity between the independent variables
3. No autocorrelation in the residuals (e.g. the data is not time-series data, where consecutive errors tend to be correlated)
4. Homoscedasticity (i.e. the variance of the error terms is similar across all values of the independent variables)
5. Residuals are normally distributed
6. No endogeneity (i.e. the independent variables are not correlated with the error terms)

Linear regression is susceptible to outliers and multicollinearity. 
Multicollinearity is when two or more independent variables are highly correlated with one another.
Multicollinearity can be detected by looking at the correlation matrix or by using VIF (Variance Inflation Factor) scores.
VIF scores start at 1 (no correlation with the other independent variables); scores up to about 2.5 are generally considered okay, while scores greater than 10 represent an issue that needs to be addressed.

- Ensure that the independent variables are not correlated with the error terms
- Check for outliers
- Check for normality of the residuals
- Check for homoscedasticity

In [8]:
# Variable roles for the regression. Each role is a list of column names so
# the lists can be concatenated and used directly for column selection.
dv = ['brfss_diabetes_ageadjprv']   # dependent variable
iv = ['pct_dg_bb_int']              # independent variable of interest

# NOTE(review): in the sample row shown earlier in the notebook the pct_ed_*
# shares and the pct_tp_veh_* shares each sum to roughly 100, so including
# every category may be close to collinear with an intercept -- confirm with
# VIF before interpreting coefficients.
covariates = [
    # BRFSS columns
    'brfss_access2_ageadjprv',
    'brfss_checkup_ageadjprv',
    'brfss_csmoking_ageadjprv',
    'brfss_depression_ageadjprv',
    'brfss_mhlth_ageadjprv',
    'brfss_obesity_ageadjprv',
    # pct_ed_* columns
    'pct_ed_lt9',
    'pct_ed_9_12',
    'pct_ed_hs',
    'pct_ed_sc',
    'pct_ed_asc',
    'pct_ed_b',
    'pct_ed_gr',
    # pct_age_* column
    'pct_age_gte65',
    # pct_occ_* columns
    'pct_occ_unemp',
    'pct_occ_mgt',
    'pct_occ_svc',
    'pct_occ_sales',
    'pct_occ_nat_res',
    'pct_occ_prod',
    # pct_hlth_* and pct_ses_* columns
    'pct_hlth_unins',
    'pct_ses_pov',
    # pct_tp_veh_* columns
    'pct_tp_veh_0',
    'pct_tp_veh_1',
    'pct_tp_veh_2',
    'pct_tp_veh_3',
]

In [14]:
# Response: selected with the `dv` list of column names.
y = df[dv]
# Design matrix: the `iv` and `covariates` columns, with a constant
# (intercept) column added by statsmodels.
X = sm.add_constant(df[iv + covariates])

In [26]:
# Ordinary least squares fit of y on X (a multiple regression: one outcome,
# several predictors). The summary2() table is the cell's displayed output.
model = sm.OLS(endog=y, exog=X).fit()
model.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.921
Dependent Variable:,brfss_diabetes_ageadjprv,AIC:,5953.5961
Date:,2023-09-16 05:45,BIC:,6122.4749
No. Observations:,3076,Log-Likelihood:,-2948.8
Df Model:,27,F-statistic:,1336.0
Df Residuals:,3048,Prob (F-statistic):,0.0
R-squared:,0.922,Scale:,0.40195

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
const,43.7139,30.7552,1.4214,0.1553,-16.5891,104.0170
pct_dg_bb_int,-0.0188,0.0024,-7.6988,0.0000,-0.0236,-0.0140
brfss_access2_ageadjprv,0.1013,0.0042,24.0779,0.0000,0.0931,0.1095
brfss_checkup_ageadjprv,0.0624,0.0035,17.7686,0.0000,0.0555,0.0692
brfss_csmoking_ageadjprv,-0.0070,0.0082,-0.8611,0.3893,-0.0231,0.0090
brfss_depression_ageadjprv,-0.1494,0.0069,-21.7570,0.0000,-0.1629,-0.1359
brfss_mhlth_ageadjprv,0.3383,0.0163,20.7911,0.0000,0.3064,0.3702
brfss_obesity_ageadjprv,0.1029,0.0044,23.4859,0.0000,0.0943,0.1115
pct_ed_lt9,-0.0560,0.1496,-0.3743,0.7082,-0.3493,0.2373

0,1,2,3
Omnibus:,107.158,Durbin-Watson:,1.284
Prob(Omnibus):,0.0,Jarque-Bera (JB):,130.446
Skew:,0.404,Prob(JB):,0.0
Kurtosis:,3.604,Condition No.:,403528.0


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices