In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import math
import seaborn as sns

custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params); np.random.seed(0)
%matplotlib inline
tqdm.pandas()

import warnings
warnings.filterwarnings('ignore')

In [2]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Load & Pre-process dataset

In [12]:
# Read the FIPS identifier columns as strings so their leading zeros
# (e.g. "01001") are preserved instead of being parsed as integers.
bg_access = pd.read_csv(
    '../data/bg_fp_regression.csv',
    dtype={'bg_fips': str, 'bg_county': str},
)
bg_access

Unnamed: 0,bg_fips,address,distance_mi,transit_time,walking_time,ADI_NATRANK,ADI_STATERNK,bg_state,bg_county,status,access
0,010010201001,"203 N Court St, Prattville, AL 36067",0.835864,20.08,20.08,73.0,5,AL,01001,Metro,20.08
1,010010201002,"203 N Court St, Prattville, AL 36067",1.684913,37.30,37.30,62.0,3,AL,01001,Metro,37.30
2,010010202001,"203 N Court St, Prattville, AL 36067",1.065305,20.33,20.33,83.0,7,AL,01001,Metro,20.33
3,010010202002,"203 N Court St, Prattville, AL 36067",0.327409,8.40,8.40,87.0,7,AL,01001,Metro,8.40
4,010010203001,"203 N Court St, Prattville, AL 36067",1.367486,42.63,42.63,73.0,5,AL,01001,Metro,42.63
...,...,...,...,...,...,...,...,...,...,...,...
239775,560430002003,,,,,47.0,5,WY,56043,Nonmetro,
239776,560459511002,,,,,49.0,6,WY,56045,Nonmetro,
239777,560459513001,,,,,85.0,10,WY,56045,Nonmetro,
239778,560459513002,,,,,70.0,9,WY,56045,Nonmetro,


In [13]:
# Summary statistics for the numeric columns of bg_access.
# NOTE(review): in the recorded output the maxima of walking_time and access
# sit far above their 75th percentiles, i.e. the times are heavily right-skewed.
bg_access.describe()

Unnamed: 0,distance_mi,transit_time,walking_time,ADI_NATRANK,access
count,238536.0,166589.0,237842.0,233887.0,237934.0
mean,2.566915,28.230644,85.222355,50.115509,78.712034
std,3.181968,29.585616,371.591318,28.7413,336.442417
min,0.000197,0.0,0.0,1.0,0.0
25%,0.681729,13.28,19.98,25.0,17.25
50%,1.435726,23.6,42.35,50.0,35.22
75%,3.074115,38.5,90.43,75.0,80.48
max,24.990148,2631.88,28848.12,100.0,28848.12


# 1. Linear Regression

## (1) ~ ADI

- The ADI coefficient is 0.611.
    - Each one-point increase in the national ADI rank (more disadvantaged) is associated with a 0.611 increase in access time by transit or walking, i.e. lower accessibility.

In [15]:
# Regress access time on the national ADI rank alone. No family is passed,
# and the summary reports a Gaussian family with identity link.
adi_lm = smf.glm(
    formula="access ~ ADI_NATRANK",
    data=bg_access,
)
adi_res = adi_lm.fit()
adi_res.summary()

0,1,2,3
Dep. Variable:,access,No. Observations:,232337.0
Model:,GLM,Df Residuals:,232335.0
Model Family:,Gaussian,Df Model:,1.0
Link Function:,Identity,Scale:,113940.0
Method:,IRLS,Log-Likelihood:,-1682300.0
Date:,"Fri, 19 May 2023",Deviance:,26473000000.0
Time:,21:28:35,Pearson chi2:,26500000000.0
No. Iterations:,3,Pseudo R-squ. (CS):,0.002697
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,48.0173,1.406,34.145,0.000,45.261,50.773
ADI_NATRANK,0.6106,0.024,25.047,0.000,0.563,0.658


## (2) ~ ADI + Rural

In [16]:
# Same specification as the ADI-only model, plus `status` entered as a
# categorical term via C(status).
adi_rural_lm = smf.glm(
    formula="access ~ ADI_NATRANK + C(status)",
    data=bg_access,
)
adi_rural_res = adi_rural_lm.fit()
adi_rural_res.summary()

0,1,2,3
Dep. Variable:,access,No. Observations:,232336.0
Model:,GLM,Df Residuals:,232333.0
Model Family:,Gaussian,Df Model:,2.0
Link Function:,Identity,Scale:,112360.0
Method:,IRLS,Log-Likelihood:,-1680600.0
Date:,"Fri, 19 May 2023",Deviance:,26106000000.0
Time:,21:28:58,Pearson chi2:,26100000000.0
No. Iterations:,3,Pseudo R-squ. (CS):,0.01664
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,53.4745,1.400,38.203,0.000,50.731,56.218
C(status)[T.Nonmetro],112.7195,1.974,57.110,0.000,108.851,116.588
ADI_NATRANK,0.1237,0.026,4.821,0.000,0.073,0.174
