In [1]:
import geopandas as gpd 
import pandas as pd 

In [2]:
sidewalks_nyc = gpd.read_parquet('/share/ju/sidewalk_utils/data/nyc/claustrophobia/nyc_sidewalks_claustrophobia.parquet')

In [3]:
collisions_nyc = pd.read_csv('/share/ju/sidewalk_utils/data/nyc/Motor_Vehicle_Collisions_-_Crashes_20240915.csv', engine='pyarrow')

In [4]:
# drop na latitude and longitude
collisions_nyc = collisions_nyc.dropna(subset=['LATITUDE', 'LONGITUDE'])

In [5]:
collisions_nyc = gpd.GeoDataFrame(
    collisions_nyc, 
    geometry=gpd.points_from_xy(collisions_nyc['LONGITUDE'], collisions_nyc['LATITUDE']),
    crs='EPSG:4326'
).to_crs('EPSG:2263')

In [6]:
# spatial join collisions with sidewalks
collisions_with_sidewalks = gpd.sjoin_nearest(
    collisions_nyc, 
    sidewalks_nyc, 
    how='left', 
    max_distance=50,  # Adjust as needed
    distance_col='distance_to_sidewalk'
)

In [7]:
# drop rows with na distance_to_sidewalk
collisions_with_sidewalks = collisions_with_sidewalks.dropna(subset=['distance_to_sidewalk'])

In [8]:
collisions_nyc.columns

Index(['CRASH DATE', 'CRASH TIME', 'BOROUGH', 'ZIP CODE', 'LATITUDE',
       'LONGITUDE', 'LOCATION', 'ON STREET NAME', 'CROSS STREET NAME',
       'OFF STREET NAME', 'NUMBER OF PERSONS INJURED',
       'NUMBER OF PERSONS KILLED', 'NUMBER OF PEDESTRIANS INJURED',
       'NUMBER OF PEDESTRIANS KILLED', 'NUMBER OF CYCLIST INJURED',
       'NUMBER OF CYCLIST KILLED', 'NUMBER OF MOTORIST INJURED',
       'NUMBER OF MOTORIST KILLED', 'CONTRIBUTING FACTOR VEHICLE 1',
       'CONTRIBUTING FACTOR VEHICLE 2', 'CONTRIBUTING FACTOR VEHICLE 3',
       'CONTRIBUTING FACTOR VEHICLE 4', 'CONTRIBUTING FACTOR VEHICLE 5',
       'COLLISION_ID', 'VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2',
       'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4', 'VEHICLE TYPE CODE 5',
       'geometry'],
      dtype='object')

In [9]:
# count the number of collisions per sidewalk, the number of persons injured per sidewalk, and the number of persons killed per sidewalk
collisions_per_sidewalk = collisions_with_sidewalks.groupby('index_right').agg(
    total_collisions=('CRASH DATE', 'count'),
    total_persons_injured=('NUMBER OF PERSONS INJURED', 'sum'),
    total_persons_killed=('NUMBER OF PERSONS KILLED', 'sum')
).reset_index()


In [10]:
# merge the counts with the sidewalks GeoDataFrame
del collisions_with_sidewalks

sidewalks_nyc = sidewalks_nyc.merge(
    collisions_per_sidewalk, 
    left_index=True, 
    right_on='index_right', 
    how='left'
).fillna({
    'total_collisions': 0,
    'total_persons_injured': 0,
    'total_persons_killed': 0
}).reset_index(drop=True)


In [11]:
# drop rows with missing claustrophobia_95th or claustrohpobia_median
sidewalks_nyc = sidewalks_nyc.dropna(subset=['claustrophobia_95th', 'claustrophobia_median'])

In [13]:
# scale the claustrophobia metrics to a 0-100 range
sidewalks_nyc['claustrophobia_95th'] *= 100 
sidewalks_nyc['claustrophobia_median'] *= 100

In [14]:
# regress claustrophobia on the number of collisions, persons injured, and persons killed
import statsmodels.api as sm
X = sidewalks_nyc[['total_collisions', 'total_persons_injured', 'total_persons_killed']]
X = sm.add_constant(X)  # Adds a constant term to the predictor
y = sidewalks_nyc['claustrophobia_median']

model = sm.OLS(y, X).fit()
print(model.summary())

                              OLS Regression Results                             
Dep. Variable:     claustrophobia_median   R-squared:                       0.002
Model:                               OLS   Adj. R-squared:                  0.002
Method:                    Least Squares   F-statistic:                     641.3
Date:                   Tue, 03 Jun 2025   Prob (F-statistic):               0.00
Time:                           14:32:07   Log-Likelihood:            -4.7214e+06
No. Observations:                1017120   AIC:                         9.443e+06
Df Residuals:                    1017116   BIC:                         9.443e+06
Df Model:                              3                                         
Covariance Type:               nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------


In [15]:
from pysal.model import spreg 
from pysal.lib import weights 
from pysal.explore import esda 
from scipy import stats 
import statsmodels.formula.api as sm

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
explanatory_vars = ['claustrophobia_median']
dep_vars = ['total_collisions']

data = sidewalks_nyc[explanatory_vars + dep_vars + ['geometry']].copy()

w = weights.KNN.from_dataframe(data, k=8)
w.transform = 'R'
w

 There are 708 disconnected components.
  W.__init__(self, neighbors, id_order=ids, **kwargs)


<libpysal.weights.distance.KNN at 0x7f6a618e9a30>

In [29]:
m1 = spreg.OLS(data[explanatory_vars].values, data[dep_vars].values, name_y=dep_vars, name_x=explanatory_vars, w=w)

In [30]:
print(m1.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :     unknown
Dependent Variable  :['total_collisions']                Number of Observations:     1017120
Mean dependent var  :      7.6388                Number of Variables   :           2
S.D. dependent var  :     25.1269                Degrees of Freedom    :     1017118
R-squared           :      0.0014
Adjusted R-squared  :      0.0014
Sum squared residual: 6.41295e+08                F-statistic           :   1390.3942
Sigma-square        :     630.502                Prob(F-statistic)     :  4.133e-304
S.E. of regression  :      25.110                Log likelihood        :-4721670.145
Sigma-square ML     :     630.501                Akaike info criterion : 9443344.289
S.E of regression ML:     25.1098                Schwarz criterion     : 9443367.954

----------------------------------------------------

In [38]:
lagged = data.assign(w_claustrophobia_median=weights.spatial_lag.lag_spatial(w, data['claustrophobia_median']))

In [39]:
lagged

Unnamed: 0,claustrophobia_median,total_collisions,geometry,w_claustrophobia_median
119,2.275649,0.0,"POLYGON ((917337.741 121366.645, 917337.741 12...",3.124896
120,2.319788,1.0,"POLYGON ((917328.038 121407.339, 917328.038 12...",3.076496
121,2.813901,0.0,"POLYGON ((917369.525 121427.689, 917369.525 12...",3.057615
122,2.749218,0.0,"POLYGON ((917409.292 121438.156, 917409.292 12...",3.019423
123,2.687442,0.0,"POLYGON ((917449.06 121448.623, 917449.06 1213...",3.027145
...,...,...,...,...
1328293,1.445994,0.0,"POLYGON ((1060434.037 218565.077, 1060434.037 ...",1.850419
1328296,36.139474,0.0,"POLYGON ((1060726.096 218565.689, 1060726.096 ...",1.929731
1328297,1.610776,0.0,"POLYGON ((1060697.09 218526.181, 1060697.09 21...",6.245819
1328298,1.568526,11.0,"POLYGON ((1060668.084 218486.672, 1060668.084 ...",6.064507


In [41]:
extended_vars = explanatory_vars + ['w_claustrophobia_median']
m2 = spreg.OLS(lagged[['total_collisions']].values, lagged[extended_vars].values, name_y='total_collisions', name_x=extended_vars, w=w)
print(m2.summary)

REGRESSION RESULTS
------------------

SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :     unknown
Dependent Variable  :total_collisions                Number of Observations:     1017120
Mean dependent var  :      1.9148                Number of Variables   :           3
S.D. dependent var  :     12.7571                Degrees of Freedom    :     1017117
R-squared           :      0.0015
Adjusted R-squared  :      0.0015
Sum squared residual: 1.65276e+08                F-statistic           :    781.6881
Sigma-square        :     162.495                Prob(F-statistic)     :           0
S.E. of regression  :      12.747                Log likelihood        :-4032127.523
Sigma-square ML     :     162.494                Akaike info criterion : 8064261.046
S.E of regression ML:     12.7473                Schwarz criterion     : 8064296.543

--------------------------------------------------------