In [28]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import os as os
import libpysal as ps
import spreg

In [6]:
# Directory that holds the project's data files. Set the NYCONNECT_DATA_DIR
# environment variable to point at your own copy; when it is unset, the
# original author's local path is used as the fallback.
# The working directory is changed so that the relative "./..." paths used
# when loading the datasets resolve against this folder.
DATA_DIR = os.environ.get(
    "NYCONNECT_DATA_DIR",
    'C:\\Users\\wesch\\OneDrive\\20 Spring Summer Urban Science Intensive\\nyconnect\\data',
)
os.chdir(DATA_DIR)

In [113]:
# load datasets
# All paths are relative to the current working directory.

## 2010 census tract geometries (shapefile)
census_tracts = gpd.read_file(
    "./2010_Census_Tracts/geo_export_0ca1f0a1-f242-497a-9f86-91b88fc18647.shp"
)

## ACS 2018 tables (CSV): internet subscription, household income, race counts
subscription = pd.read_csv(
    filepath_or_buffer="./ACS_Internet_Subscription/subscription_acs_2018.csv"
)
income = pd.read_csv(
    filepath_or_buffer="./Demographics/Income/household_income_acs2018.csv"
)
race = pd.read_csv(
    filepath_or_buffer="./Demographics/Basic Count/tract_race_acs2018.csv"
)

The [methodology](http://darribas.org/gds_scipy16/ipynb_md/08_spatial_regression.html) is adapted from the terrific notebooks published by Sergio Rey and Dani Arribas-Bel, and from the PySAL [notebooks](http://pysal.org/notebooks/lib/libpysal/weights.html) page.

In [114]:
# data cleaning
## Race: express each group's count as a percentage of the tract total.
## Maps output column name -> source count column in the raw table.
_race_count_cols = {
    "White": "White",
    "Black": "Black",
    "Native": "Native",
    "Asian": "Asian",
    "PacificIslander": "Pacific Islander",
    "Other": "Other",
    "TwoOrMore": "Two or More",
}
race = race.assign(**{
    share_col: race[count_col] / race["Total"] * 100
    for share_col, count_col in _race_count_cols.items()
})
# Keep the identifier columns, the total, and the percentage columns only.
race = race.loc[:, ['city', 'tract', 'county', "BoroCTLbl", 'Total', *_race_count_cols]]

## Subscription: express each subscription type as a percentage of the total.
_subscription_cols = ("Mobile_Dependent", "Wired_Broadband", "No_Internet")
subscription = subscription.assign(**{
    col: subscription[col] / subscription["Total"] * 100
    for col in _subscription_cols
})

## Missing values (including 0/0 results from the divisions above) become 0.
race, income, subscription = (
    frame.fillna(0) for frame in (race, income, subscription)
)

## Centroid coordinates of each census tract.
_tract_centroids = census_tracts.geometry.centroid
census_tracts = census_tracts.assign(longitude=_tract_centroids.x,
                                     latitude=_tract_centroids.y)

## Combine income, race, subscription and tract geometry into one frame.
## Both `race` and `subscription` carry a "Total" column, so the second merge
## suffixes them as Total_x / Total_y; they are given descriptive names here.
_tract_keys = ["city", "tract", "county", "BoroCTLbl"]
renamed_cols = {
    'Total_x': 'PopulationCount',
    'Total_y' : 'HouseholdCount'}
indep_vars = (
    income
    .merge(race, on=_tract_keys)
    .merge(subscription, on=_tract_keys)
    .merge(census_tracts.loc[:, ["BoroCTLbl", "longitude", "latitude", "geometry"]],
           on=["BoroCTLbl"])
    .rename(columns=renamed_cols)
)


In [107]:
# Discard every row that still has at least one missing value.
indep_vars = indep_vars.dropna(axis=0, how="any")

In [119]:
from libpysal.weights import Queen
from libpysal.weights import KNN

# Queen contiguity weights built from the tract geometries in `indep_vars`.
# NOTE(review): the recorded output reports 3 islands (tracts with no
# neighbours) and 6 disconnected components; confirm that leaving the islands
# unconnected is acceptable for the spatial models fitted with these weights.
weights = Queen.from_dataframe(indep_vars)

# "r" = row-standardised weights.
weights.transform = "r"



 There are 6 disconnected components.
 There are 3 islands with ids: 338, 1099, 2164.


In [116]:
# Regressors for the models: household count and income-bracket columns,
# followed by population count and the race percentages.
# NOTE(review): the race columns are percentages of the same total, so if the
# seven groups are exhaustive they sum to ~100 and are near-collinear with the
# regression constant -- consider dropping a reference category. TODO confirm.
_income_cols = [
    'Households (HH) Count',
    'HH 0-10k', 'HH 10k-15k', 'HH 15k-25k', 'HH 25k-35k', 'HH 35k-50k',
    'HH 50k-75k', 'HH 75k-100k', 'HH 100k-150k', 'HH 150k-200k',
    'HH 200k-UP',
]
_race_cols = [
    'PopulationCount', 'White', 'Black', 'Native', 'Asian',
    'PacificIslander', 'Other', 'TwoOrMore',
]
indep_vars2 = indep_vars[_income_cols + _race_cols]

In [120]:
# Baseline (non-spatial) OLS of the wired-broadband percentage on the income
# and race regressors. The weights are supplied together with spat_diag=True
# so that the summary also reports spatial-dependence diagnostics.
# name_x / name_y label the report with the real column names (the same labels
# the GM_Lag model uses) instead of generic placeholders such as "dep_var".
m1 = spreg.OLS(y = np.array(indep_vars.loc[:,["Wired_Broadband"]]), 
               x = np.array(indep_vars2),
               w = weights,
               spat_diag = True,
               name_x = indep_vars2.columns.tolist(),
               name_y = "WiredBroadband_Pct")

In [121]:
# Show the OLS regression report; it is printed so the text table renders line by line.
print(m1.summary)

REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :     unknown
Dependent Variable  :     dep_var                Number of Observations:        2165
Mean dependent var  :     68.6500                Number of Variables   :          20
S.D. dependent var  :     16.2861                Degrees of Freedom    :        2145
R-squared           :      0.7615
Adjusted R-squared  :      0.7594
Sum squared residual:  136910.445                F-statistic           :    360.3947
Sigma-square        :      63.828                Prob(F-statistic)     :           0
S.E. of regression  :       7.989                Log likelihood        :   -7561.028
Sigma-square ML     :      63.238                Akaike info criterion :   15162.057
S.E of regression ML:      7.9522                Schwarz criterion     :   15275.660

-----------------------------------------------------------------------------

In [122]:
# List the regressor column names, in the order they are passed as `x` to the models.
indep_vars2.columns

Index(['Households (HH) Count', 'HH 0-10k', 'HH 10k-15k', 'HH 15k-25k',
       'HH 25k-35k', 'HH 35k-50k', 'HH 50k-75k', 'HH 75k-100k', 'HH 100k-150k',
       'HH 150k-200k', 'HH 200k-UP', 'PopulationCount', 'White', 'Black',
       'Native', 'Asian', 'PacificIslander', 'Other', 'TwoOrMore'],
      dtype='object')

From the above, we can conclude that the race variables are not statistically significant predictors of wired broadband subscription. Income is a much better predictor.  
At the bottom of the summary object are four statistics for spatial dependence. They test whether the residuals of the regression are spatially correlated, against the null hypothesis that they are randomly distributed over the study area (NYC's census tracts in this case). Since the tests reject the null hypothesis that the error terms are randomly distributed in space, the standard OLS assumption of independent (spatially uncorrelated) errors is violated. Thus, OLS on its own is not particularly well suited to this problem.

In [123]:
# Spatial-lag model of the wired-broadband percentage, fitted on the same
# regressors and spatial weights as the OLS model. Variable names are passed
# so the summary table is labelled with the real column names.
m2 = spreg.GM_Lag(
    y=indep_vars[["Wired_Broadband"]].to_numpy(),
    x=indep_vars2.to_numpy(),
    w=weights,
    spat_diag=True,
    name_x=list(indep_vars2.columns),
    name_y="WiredBroadband_Pct",
)

In [124]:
# Show the spatial-lag regression report; it is printed so the text table renders line by line.
print(m2.summary)

REGRESSION
----------
SUMMARY OF OUTPUT: SPATIAL TWO STAGE LEAST SQUARES
--------------------------------------------------
Data set            :     unknown
Weights matrix      :     unknown
Dependent Variable  :WiredBroadband_Pct                Number of Observations:        2165
Mean dependent var  :     68.6500                Number of Variables   :          21
S.D. dependent var  :     16.2861                Degrees of Freedom    :        2144
Pseudo R-squared    :      0.7705
Spatial Pseudo R-squared:  0.7614

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     z-Statistic     Probability
------------------------------------------------------------------------------------
            CONSTANT      -3.4318433       1.8892969      -1.8164658       0.0692989
Households (HH) Count       0.0033963       0.0005068       6.7015949       0.0000000
            HH 0-10k       0.4917318       0.0424810

The results are broadly similar to the OLS model, and we can see that the spatially lagged dependent variable is significant. This means that there is some form of spatial interaction between census tracts that produces spatial autocorrelation in wired broadband subscription. Even if we are not entirely convinced that a regression model can predict the rates of household broadband subscription, we want to see whether the lagged model reduces the mean squared error (MSE).

In [125]:
# Observed wired-broadband percentage, kept as a one-column DataFrame.
y = indep_vars[["Wired_Broadband"]]

In [126]:
from sklearn.metrics import mean_squared_error as mse

# In-sample mean squared error of each fitted model against the observed
# values; `predy` is flattened to one dimension before scoring.
fitted_models = {'OLS': m1, 'Lag': m2}
mses = pd.Series({
    label: mse(y, model.predy.ravel())
    for label, model in fitted_models.items()
})

# Lowest error first.
mses.sort_values()

Lag    60.852850
OLS    63.238081
dtype: float64