In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import math
import seaborn as sns

custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params); np.random.seed(0)
%matplotlib inline
tqdm.pandas()

import warnings
warnings.filterwarnings('ignore')

In [2]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Load & Pre-process dataset

In [3]:
# Metro block-group access table. The FIPS identifier columns are read as
# strings so that codes with a leading zero (e.g. "0401...") are kept intact.
metro_bg_access = pd.read_csv(
    '../data/metro_fp_regression.csv',
    dtype={'bg_fips': str, 'bg_county': str},
)
metro_bg_access

Unnamed: 0,bg_fips,address,distance_mi,transit_time,walking_time,driving_distance_km,ADI_NATRANK,ADI_STATERNK,bg_state,bg_county,status,access,category
0,040130405022,"360 W. Yavapai Street Wickenburg, AZ 85390",0.540367,,14.700000,1.285,40.0,4,AZ,04013,Metro,14.700000,High access
1,040130405024,"360 W. Yavapai Street Wickenburg, AZ 85390",0.188818,,5.830000,0.482,64.0,7,AZ,04013,Metro,5.830000,High access
2,040159501021,"280 W Township Ave, Colorado City, AZ 86021",1.230661,,8.500000,0.774,67.0,8,AZ,04015,Metro,8.500000,High access
3,050690010001,"500 S Main St Pine Bluff, AR 71601",0.236860,,6.083333,0.474,,GQ,AR,05069,Metro,6.083333,High access
4,120210111071,"101 S Copeland Ave, Everglades City, FL 34139",0.611384,,8.230000,1.464,54.0,6,FL,12021,Metro,8.230000,High access
...,...,...,...,...,...,...,...,...,...,...,...,...,...
198762,560250016031,"4301 Casper Mountain Rd, Casper, WY 82601",2.872862,,95.750000,7.84,23.0,1,WY,56025,Metro,95.750000,Low access
198763,560250017003,"4250 Poison Spider Road MIlls, WY 82644",4.112575,,106.980000,9.361,39.0,4,WY,56025,Metro,106.980000,Low access
198764,560250018011,"411 S. Walsh Dr. Casper, WY 82609",7.722157,,169.970000,30.529,18.0,1,WY,56025,Metro,169.970000,Low access
198765,560250018012,"4301 Casper Mountain Rd, Casper, WY 82601",15.406593,,539.400000,55.932,18.0,1,WY,56025,Metro,539.400000,Low access


In [4]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns of the metro table.
metro_bg_access.describe()

Unnamed: 0,distance_mi,transit_time,walking_time,ADI_NATRANK,access
count,198579.0,153558.0,198068.0,193841.0,198153.0
mean,2.011049,28.131101,66.696565,45.748691,59.30231
std,2.29021,29.577986,303.883043,28.246172,249.845871
min,0.000197,0.0,0.0,1.0,0.0
25%,0.629761,13.18,18.55,21.0,15.95
50%,1.278648,23.43,37.92,43.0,31.05
75%,2.498006,38.27,74.22,69.0,61.67
max,24.971662,2631.88,28824.68,100.0,28783.73


In [5]:
# NHGIS 2020 block-group extract. GEOCODE is read as a string so it can be
# joined against the zero-padded `bg_fips` keys of the access tables.
df = pd.read_csv(
    "../data/nhgis0001_ds258_2020_blck_grp.csv",
    encoding='ISO-8859-1',
    dtype={'GEOCODE': str},
)
print(df)

                GISJOIN  YEAR STUSAB                  GEOID       GEOCODE  \
0       G01000100201001  2020     AL  1500000US010010201001  010010201001   
1       G01000100201002  2020     AL  1500000US010010201002  010010201002   
2       G01000100202001  2020     AL  1500000US010010202001  010010202001   
3       G01000100202002  2020     AL  1500000US010010202002  010010202002   
4       G01000100203001  2020     AL  1500000US010010203001  010010203001   
...                 ...   ...    ...                    ...           ...   
242330  G72015307506011  2020     PR  1500000US721537506011  721537506011   
242331  G72015307506012  2020     PR  1500000US721537506012  721537506012   
242332  G72015307506013  2020     PR  1500000US721537506013  721537506013   
242333  G72015307506021  2020     PR  1500000US721537506021  721537506021   
242334  G72015307506022  2020     PR  1500000US721537506022  721537506022   

        REGIONA  DIVISIONA        STATE  STATEA           COUNTY  ...  \
0 

In [6]:
# Keep every block group whose GEOCODE does not start with '72'
# (the rows printed above with STUSAB == 'PR' carry that prefix).
bg = df[~df['GEOCODE'].astype(str).str.startswith('72')]

In [7]:
# Attach the NHGIS block-group attributes to each metro access row.
# A left join keeps every access row even when no GEOCODE matches.
metro_merged = pd.merge(
    metro_bg_access,
    bg,
    how='left',
    left_on='bg_fips',
    right_on='GEOCODE',
)
print(metro_merged)

             bg_fips                                        address  \
0       040130405022     360 W. Yavapai Street Wickenburg, AZ 85390   
1       040130405024     360 W. Yavapai Street Wickenburg, AZ 85390   
2       040159501021    280 W Township Ave, Colorado City, AZ 86021   
3       050690010001             500 S Main St Pine Bluff, AR 71601   
4       120210111071  101 S Copeland Ave, Everglades City, FL 34139   
...              ...                                            ...   
198762  560250016031      4301 Casper Mountain Rd, Casper, WY 82601   
198763  560250017003        4250 Poison Spider Road MIlls, WY 82644   
198764  560250018011              411 S. Walsh Dr. Casper, WY 82609   
198765  560250018012      4301 Casper Mountain Rd, Casper, WY 82601   
198766  560250018022                 4976 Paige St Casper, WY 82604   

        distance_mi  transit_time  walking_time driving_distance_km  \
0          0.540367           NaN     14.700000               1.285   
1    

# 1. Linear Regression

## (1) ~ ADI

- The coefficient: 0.1254 (ADI_NATRANK, see the fitted summary below)
    - If ADI increases (more disadvantaged), then the travel time by transit or walking increases (less accessibility).

In [8]:
# Metro model: access regressed on the national ADI rank.
# No family is passed, so statsmodels uses its default (the summary below
# reports Gaussian family with identity link).
metro_adi_lm = smf.glm(
    formula="access ~ ADI_NATRANK",
    data=metro_bg_access,
)
metro_adi_res = metro_adi_lm.fit()
metro_adi_res.summary()

0,1,2,3
Dep. Variable:,access,No. Observations:,193388.0
Model:,GLM,Df Residuals:,193386.0
Model Family:,Gaussian,Df Model:,1.0
Link Function:,identity,Scale:,63007.0
Method:,IRLS,Log-Likelihood:,-1343000.0
Date:,"Wed, 10 Apr 2024",Deviance:,12185000000.0
Time:,00:55:22,Pearson chi2:,12200000000.0
No. Iterations:,3,Pseudo R-squ. (CS):,0.0001991
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,53.3974,1.087,49.145,0.000,51.268,55.527
ADI_NATRANK,0.1254,0.020,6.205,0.000,0.086,0.165


In [9]:
# Metro model with AREALAND added as a second regressor.
# AREALAND is not a column of the access table shown earlier, so this fit
# uses the merged frame that carries the NHGIS columns.
metro_adi_lm2 = smf.glm(
    formula="access ~ ADI_NATRANK + AREALAND",
    data=metro_merged,
)
metro_adi_res2 = metro_adi_lm2.fit()
metro_adi_res2.summary()

0,1,2,3
Dep. Variable:,access,No. Observations:,193388.0
Model:,GLM,Df Residuals:,193385.0
Model Family:,Gaussian,Df Model:,2.0
Link Function:,identity,Scale:,61761.0
Method:,IRLS,Log-Likelihood:,-1341000.0
Date:,"Wed, 10 Apr 2024",Deviance:,11944000000.0
Time:,00:55:34,Pearson chi2:,11900000000.0
No. Iterations:,3,Pseudo R-squ. (CS):,0.02018
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,49.9532,1.077,46.376,0.000,47.842,52.064
ADI_NATRANK,0.0706,0.020,3.525,0.000,0.031,0.110
AREALAND,5.61e-07,8.98e-09,62.468,0.000,5.43e-07,5.79e-07


## (2) ~ ADI + Rural

In [10]:
# Rural (status "Nonmetro") block-group access table, loaded the same way as
# the metro one: FIPS identifier columns stay strings to keep leading zeros.
rural_bg_access = pd.read_csv(
    '../data/rural_fp_regression.csv',
    dtype={'bg_fips': str, 'bg_county': str},
)
rural_bg_access

Unnamed: 0,bg_fips,address,distance_mi,transit_time,walking_time,driving_distance_km,ADI_NATRANK,ADI_STATERNK,bg_state,bg_county,status,access,category
0,010059501001,"111 Jackson Street Eufaula, AL 36072",4.921180,,126.470000,10.197,76.0,5,AL,01005,Nonmetro,6.336120,High access
1,010059501002,"111 Jackson Street Eufaula, AL 36072",11.686954,,272.920000,22.485,94.0,9,AL,01005,Nonmetro,13.971527,Medium access
2,010059502001,"11 East College Street Clayton, AL 36016",5.718206,,117.570000,9.295,95.0,9,AL,01005,Nonmetro,5.775643,High access
3,010059502002,"276 AL-239, Clayton, AL 36016",4.960451,,147.216667,13.644,,GQ,AL,01005,Nonmetro,8.477986,High access
4,010059502003,"11 East College Street Clayton, AL 36016",3.365542,,105.870000,8.608,90.0,8,AL,01005,Nonmetro,5.348762,High access
...,...,...,...,...,...,...,...,...,...,...,...,...,...
41008,560430002003,,,,,69.685,47.0,5,WY,56043,Nonmetro,43.300238,Low access
41009,560459511002,,,,,93.009,49.0,6,WY,56045,Nonmetro,57.793095,Low access
41010,560459513001,,,,,63.563,85.0,10,WY,56045,Nonmetro,39.496205,Low access
41011,560459513002,,,,,57.73,70.0,9,WY,56045,Nonmetro,35.871748,Low access


In [11]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns of the rural table.
rural_bg_access.describe()

Unnamed: 0,distance_mi,transit_time,walking_time,ADI_NATRANK,access
count,39957.0,13031.0,39774.0,40046.0,40578.0
mean,5.329469,29.403673,177.477754,71.252909,9.478568
std,5.020971,29.651358,596.340135,20.580527,31.025795
min,0.002277,0.02,0.02,1.0,0.001243
25%,1.281108,14.43,36.07,58.0,1.95328
50%,3.805173,25.97,106.94,75.0,5.740847
75%,7.957177,41.22,217.565,88.0,11.810399
max,24.990148,1019.63,28848.12,100.0,1535.684251


In [12]:
# Attach the NHGIS block-group attributes to each rural access row.
# A left join keeps every access row even when no GEOCODE matches.
rural_merged = pd.merge(
    rural_bg_access,
    bg,
    how='left',
    left_on='bg_fips',
    right_on='GEOCODE',
)
print(rural_merged)

            bg_fips                                   address  distance_mi  \
0      010059501001      111 Jackson Street Eufaula, AL 36072     4.921180   
1      010059501002      111 Jackson Street Eufaula, AL 36072    11.686954   
2      010059502001  11 East College Street Clayton, AL 36016     5.718206   
3      010059502002             276 AL-239, Clayton, AL 36016     4.960451   
4      010059502003  11 East College Street Clayton, AL 36016     3.365542   
...             ...                                       ...          ...   
41008  560430002003                                       NaN          NaN   
41009  560459511002                                       NaN          NaN   
41010  560459513001                                       NaN          NaN   
41011  560459513002                                       NaN          NaN   
41012  560459513003                                       NaN          NaN   

       transit_time  walking_time driving_distance_km  ADI_NATR

In [13]:
# Rural model: access regressed on the national ADI rank.
# No family is passed, so statsmodels uses its default (the summary below
# reports Gaussian family with identity link).
rural_adi_lm = smf.glm(
    formula="access ~ ADI_NATRANK",
    data=rural_bg_access,
)
rural_adi_res = rural_adi_lm.fit()
rural_adi_res.summary()

0,1,2,3
Dep. Variable:,access,No. Observations:,39720.0
Model:,GLM,Df Residuals:,39718.0
Model Family:,Gaussian,Df Model:,1.0
Link Function:,identity,Scale:,979.25
Method:,IRLS,Log-Likelihood:,-193130.0
Date:,"Wed, 10 Apr 2024",Deviance:,38894000.0
Time:,00:57:19,Pearson chi2:,38900000.0
No. Iterations:,3,Pseudo R-squ. (CS):,5.236e-06
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,9.7194,0.568,17.122,0.000,8.607,10.832
ADI_NATRANK,-0.0035,0.008,-0.456,0.648,-0.018,0.012


In [16]:
# Rural model with AREALAND added as a second regressor; fitted on the merged
# frame, which carries the NHGIS columns alongside the access columns.
rural_adi_lm2 = smf.glm(
    formula="access ~ ADI_NATRANK + AREALAND",
    data=rural_merged,
)
rural_adi_res2 = rural_adi_lm2.fit()
rural_adi_res2.summary()

0,1,2,3
Dep. Variable:,access,No. Observations:,39720.0
Model:,GLM,Df Residuals:,39717.0
Model Family:,Gaussian,Df Model:,2.0
Link Function:,identity,Scale:,959.14
Method:,IRLS,Log-Likelihood:,-192720.0
Date:,"Sun, 08 Oct 2023",Deviance:,38094000.0
Time:,17:14:06,Pearson chi2:,38100000.0
No. Iterations:,3,Pseudo R-squ. (CS):,0.02078
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,7.6138,0.567,13.440,0.000,6.503,8.724
ADI_NATRANK,0.0086,0.008,1.137,0.256,-0.006,0.023
AREALAND,1.068e-08,3.7e-10,28.874,0.000,9.96e-09,1.14e-08
