In [1]:
import numpy as np
import pandas as pd
import pyarrow as pa
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('NYC_Wi-Fi_Hotspot_Locations.csv', keep_default_na=True, delimiter=',', skipinitialspace=True)

In [3]:
df


Unnamed: 0,OBJECTID,Borough,Type,Provider,Name,Location,Latitude,Longitude,X,Y,...,Neighborhood Tabulation Area (NTA),Council Distrcit,Postcode,BoroCD,Census Tract,BCTCB2010,BIN,BBL,DOITT_ID,"Location (Lat, Long)"
0,10604,4,Limited Free,SPECTRUM,Baisley Pond Park,Park Perimeter,40.674860,-73.784120,1.044132e+06,185219.892077,...,Springfield Gardens North,28,11434,412,294,294,0,0,1408,"(40.6748599999, -73.7841200005)"
1,10555,4,Limited Free,SPECTRUM,Kissena Park,Park Perimeter,40.747560,-73.818150,1.034638e+06,211685.217755,...,Flushing,20,11355,407,845,845,0,0,1359,"(40.7475599996, -73.8181499997)"
2,12370,3,Free,Transit Wireless,Grand St (L),Grand St (L),40.711926,-73.940670,1.000698e+06,198655.908840,...,East Williamsburg,34,11206,301,495,495,0,0,1699,"(40.7119259997, -73.9406699994)"
3,9893,3,Free,Downtown Brooklyn,,125 Court St.,40.689985,-73.991995,9.864700e+05,190656.680416,...,Brooklyn Heights-Cobble Hill,33,11201,302,9,9,3388736,3002777501,298,"(40.6899850001, -73.9919950004)"
4,10169,1,Free,Transit Wireless,Lexington Av-63 St (F),Lexington Av-63 St (F),40.764630,-73.966115,9.936366e+05,217853.888161,...,Upper East Side-Carnegie Hill,4,10065,108,120,120,0,0,599,"(40.7646300002, -73.9661150001)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3314,10872,3,Limited Free,SPECTRUM,Carroll Park,Court off Smith St between Carrol St and 1st P...,40.680630,-73.995382,9.855309e+05,187248.314202,...,Carroll Gardens-Columbia Street-Red Hook,39,11231,306,77,77,3007547,3004490015,1338,"(40.6806299998, -73.9953819995)"
3315,12026,2,Free,LinkNYC - Citybridge,bx-01-138789,312 WILLIS AVENUE,40.810896,-73.921434,1.005999e+06,234718.294065,...,Mott Haven-Port Morris,8,10454,201,39,39,2000335,2022850010,4113,"(40.8108964904, -73.9214341701)"
3316,12063,3,Free,LinkNYC - Citybridge,bk-01-143982,32 GRAHAM AVENUE,40.701930,-73.942239,1.000265e+06,195013.901033,...,East Williamsburg,34,11206,301,491,491,3071609,3031200000,3018,"(40.7019303441, -73.9422392381)"
3317,12066,3,Free,LinkNYC - Citybridge,bk-17-126527,1339 FLATBUSH AVENUE,40.638560,-73.953603,9.971268e+05,171924.271359,...,Erasmus,45,11226,317,790,790,3120360,3052110060,3021,"(40.6385596088, -73.9536032378)"


In [4]:
df.shape

(3319, 29)

In [5]:
# check duplicated rows
print('Number of duplicate (excluding original) rows is:', df.duplicated().sum())
print('Number of duplicate rows (including first) in the table is:', df[df.duplicated(keep=False)].shape[0])
# Show duplicate row data that can be dropped
df[df.duplicated(keep=False)]

Number of duplicate (excluding original) rows is: 0
Number of duplicate rows (including first) in the table is: 0


Unnamed: 0,OBJECTID,Borough,Type,Provider,Name,Location,Latitude,Longitude,X,Y,...,Neighborhood Tabulation Area (NTA),Council Distrcit,Postcode,BoroCD,Census Tract,BCTCB2010,BIN,BBL,DOITT_ID,"Location (Lat, Long)"


In [6]:
df = df[['Name', 'Borough Name', 'Type', 'Longitude', 'Latitude']]

In [7]:
df

Unnamed: 0,Name,Borough Name,Type,Longitude,Latitude
0,Baisley Pond Park,Queens,Limited Free,-73.784120,40.674860
1,Kissena Park,Queens,Limited Free,-73.818150,40.747560
2,Grand St (L),Brooklyn,Free,-73.940670,40.711926
3,,Brooklyn,Free,-73.991995,40.689985
4,Lexington Av-63 St (F),Manhattan,Free,-73.966115,40.764630
...,...,...,...,...,...
3314,Carroll Park,Brooklyn,Limited Free,-73.995382,40.680630
3315,bx-01-138789,Bronx,Free,-73.921434,40.810896
3316,bk-01-143982,Brooklyn,Free,-73.942239,40.701930
3317,bk-17-126527,Brooklyn,Free,-73.953603,40.638560


In [8]:
without_long_lat = df[df['Longitude'].isnull() | df['Latitude'].isnull()]

without_long_lat

Unnamed: 0,Name,Borough Name,Type,Longitude,Latitude


In [9]:
# check the data type
df.dtypes

Name             object
Borough Name     object
Type             object
Longitude       float64
Latitude        float64
dtype: object

In [10]:
# check null value
df.isnull().sum()

Name            230
Borough Name      0
Type              0
Longitude         0
Latitude          0
dtype: int64

In [11]:
df[df['Name'].isnull()]

Unnamed: 0,Name,Borough Name,Type,Longitude,Latitude
3,,Brooklyn,Free,-73.991995,40.689985
6,,Manhattan,Free,-73.941693,40.812944
12,,Manhattan,Free,-73.939510,40.810260
52,,Brooklyn,Free,-73.989635,40.690883
53,,Brooklyn,Free,-73.983025,40.692723
...,...,...,...,...,...
3149,,Brooklyn,Free,-73.989635,40.690883
3160,,Manhattan,Free,-74.006637,40.744408
3192,,Manhattan,Free,-73.941112,40.815114
3193,,Manhattan,Free,-73.941262,40.813713


In [12]:
df['Name'] = df['Name'].fillna(df['Borough Name'] + ' wifi')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Name'] = df['Name'].fillna(df['Borough Name'] + ' wifi')


In [13]:
df

Unnamed: 0,Name,Borough Name,Type,Longitude,Latitude
0,Baisley Pond Park,Queens,Limited Free,-73.784120,40.674860
1,Kissena Park,Queens,Limited Free,-73.818150,40.747560
2,Grand St (L),Brooklyn,Free,-73.940670,40.711926
3,Brooklyn wifi,Brooklyn,Free,-73.991995,40.689985
4,Lexington Av-63 St (F),Manhattan,Free,-73.966115,40.764630
...,...,...,...,...,...
3314,Carroll Park,Brooklyn,Limited Free,-73.995382,40.680630
3315,bx-01-138789,Bronx,Free,-73.921434,40.810896
3316,bk-01-143982,Brooklyn,Free,-73.942239,40.701930
3317,bk-17-126527,Brooklyn,Free,-73.953603,40.638560


In [14]:
import geopandas as gpd
from shapely.geometry import Point, Polygon

In [15]:
# change 'Longitude' and 'Latitude' into GeoDataFrame
df['Coordinates'] = list(zip(df.Longitude, df.Latitude))
df['Coordinates'] = df['Coordinates'].apply(Point)

gdf_hotspot = gpd.GeoDataFrame(df, geometry='Coordinates')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Coordinates'] = list(zip(df.Longitude, df.Latitude))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Coordinates'] = df['Coordinates'].apply(Point)


In [16]:
gdf_hotspot

Unnamed: 0,Name,Borough Name,Type,Longitude,Latitude,Coordinates
0,Baisley Pond Park,Queens,Limited Free,-73.784120,40.674860,POINT (-73.78412 40.67486)
1,Kissena Park,Queens,Limited Free,-73.818150,40.747560,POINT (-73.81815 40.74756)
2,Grand St (L),Brooklyn,Free,-73.940670,40.711926,POINT (-73.94067 40.71193)
3,Brooklyn wifi,Brooklyn,Free,-73.991995,40.689985,POINT (-73.99200 40.68999)
4,Lexington Av-63 St (F),Manhattan,Free,-73.966115,40.764630,POINT (-73.96612 40.76463)
...,...,...,...,...,...,...
3314,Carroll Park,Brooklyn,Limited Free,-73.995382,40.680630,POINT (-73.99538 40.68063)
3315,bx-01-138789,Bronx,Free,-73.921434,40.810896,POINT (-73.92143 40.81090)
3316,bk-01-143982,Brooklyn,Free,-73.942239,40.701930,POINT (-73.94224 40.70193)
3317,bk-17-126527,Brooklyn,Free,-73.953603,40.638560,POINT (-73.95360 40.63856)


In [17]:
# Read the geojson file
taxi_zone = gpd.read_file('NYC Taxi Zones.geojson')
taxi_zone

Unnamed: 0,shape_area,objectid,shape_leng,location_id,zone,borough,geometry
0,0.0007823067885,1,0.116357453189,1,Newark Airport,EWR,"MULTIPOLYGON (((-74.18445 40.69500, -74.18449 ..."
1,0.00486634037837,2,0.43346966679,2,Jamaica Bay,Queens,"MULTIPOLYGON (((-73.82338 40.63899, -73.82277 ..."
2,0.000314414156821,3,0.0843411059012,3,Allerton/Pelham Gardens,Bronx,"MULTIPOLYGON (((-73.84793 40.87134, -73.84725 ..."
3,0.000111871946192,4,0.0435665270921,4,Alphabet City,Manhattan,"MULTIPOLYGON (((-73.97177 40.72582, -73.97179 ..."
4,0.000497957489363,5,0.0921464898574,5,Arden Heights,Staten Island,"MULTIPOLYGON (((-74.17422 40.56257, -74.17349 ..."
...,...,...,...,...,...,...,...
258,0.000168611097013,256,0.0679149669603,256,Williamsburg (South Side),Brooklyn,"MULTIPOLYGON (((-73.95834 40.71331, -73.95681 ..."
259,0.000394552487366,259,0.126750305191,259,Woodlawn/Wakefield,Bronx,"MULTIPOLYGON (((-73.85107 40.91037, -73.85207 ..."
260,0.000422345326907,260,0.133514154636,260,Woodside,Queens,"MULTIPOLYGON (((-73.90175 40.76078, -73.90147 ..."
261,0.0000343423231652,261,0.0271204563616,261,World Trade Center,Manhattan,"MULTIPOLYGON (((-74.01333 40.70503, -74.01327 ..."


In [18]:
gdf_taxi_zone = gpd.GeoDataFrame(taxi_zone, geometry='geometry')
gdf_taxi_zone

Unnamed: 0,shape_area,objectid,shape_leng,location_id,zone,borough,geometry
0,0.0007823067885,1,0.116357453189,1,Newark Airport,EWR,"MULTIPOLYGON (((-74.18445 40.69500, -74.18449 ..."
1,0.00486634037837,2,0.43346966679,2,Jamaica Bay,Queens,"MULTIPOLYGON (((-73.82338 40.63899, -73.82277 ..."
2,0.000314414156821,3,0.0843411059012,3,Allerton/Pelham Gardens,Bronx,"MULTIPOLYGON (((-73.84793 40.87134, -73.84725 ..."
3,0.000111871946192,4,0.0435665270921,4,Alphabet City,Manhattan,"MULTIPOLYGON (((-73.97177 40.72582, -73.97179 ..."
4,0.000497957489363,5,0.0921464898574,5,Arden Heights,Staten Island,"MULTIPOLYGON (((-74.17422 40.56257, -74.17349 ..."
...,...,...,...,...,...,...,...
258,0.000168611097013,256,0.0679149669603,256,Williamsburg (South Side),Brooklyn,"MULTIPOLYGON (((-73.95834 40.71331, -73.95681 ..."
259,0.000394552487366,259,0.126750305191,259,Woodlawn/Wakefield,Bronx,"MULTIPOLYGON (((-73.85107 40.91037, -73.85207 ..."
260,0.000422345326907,260,0.133514154636,260,Woodside,Queens,"MULTIPOLYGON (((-73.90175 40.76078, -73.90147 ..."
261,0.0000343423231652,261,0.0271204563616,261,World Trade Center,Manhattan,"MULTIPOLYGON (((-74.01333 40.70503, -74.01327 ..."


In [19]:
# Ensure both GeoDataFrames are using the same CRS
gdf_hotspot = gdf_hotspot.set_crs("EPSG:4326")
gdf_taxi_zone = gdf_taxi_zone.set_crs("EPSG:4326")

# Perform spatial join with the 'predicate' parameter
result = gpd.sjoin(gdf_hotspot, gdf_taxi_zone, how="inner", predicate='intersects')

In [20]:
result

Unnamed: 0,Name,Borough Name,Type,Longitude,Latitude,Coordinates,index_right,shape_area,objectid,shape_leng,location_id,zone,borough
0,Baisley Pond Park,Queens,Limited Free,-73.784120,40.674860,POINT (-73.78412 40.67486),217,0.000281293736407,218,0.0837008281049,218,Springfield Gardens North,Queens
1283,Baisley Pond Park,Queens,Limited Free,-73.786780,40.672680,POINT (-73.78678 40.67268),217,0.000281293736407,218,0.0837008281049,218,Springfield Gardens North,Queens
2322,Baisley Pond Park,Queens,Limited Free,-73.785170,40.673180,POINT (-73.78517 40.67318),217,0.000281293736407,218,0.0837008281049,218,Springfield Gardens North,Queens
2894,Rochdale Village,Queens,Free,-73.770590,40.672875,POINT (-73.77059 40.67288),217,0.000281293736407,218,0.0837008281049,218,Springfield Gardens North,Queens
3071,Baisley Pond Park,Queens,Limited Free,-73.782540,40.676090,POINT (-73.78254 40.67609),217,0.000281293736407,218,0.0837008281049,218,Springfield Gardens North,Queens
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2795,Parkchester,Bronx,Free,-73.858355,40.833736,POINT (-73.85836 40.83374),181,0.0000910644156732,182,0.0393606369479,182,Parkchester,Bronx
2824,Hollis,Queens,Free,-73.762304,40.720168,POINT (-73.76230 40.72017),135,0.000423696152789,131,0.116547120922,131,Jamaica Estates,Queens
2903,Bayside,Queens,Free,-73.768362,40.760126,POINT (-73.76836 40.76013),20,0.000871889446182,16,0.141291873771,16,Bayside,Queens
2926,Maspeth,Queens,Free,-73.893105,40.726995,POINT (-73.89310 40.72700),157,0.000354370128323,157,0.131300217777,157,Maspeth,Queens


In [21]:
result = result[['Name', 'Longitude', 'Latitude', 'Type', 'zone', "location_id", "borough" ]]
result

Unnamed: 0,Name,Longitude,Latitude,Type,zone,location_id,borough
0,Baisley Pond Park,-73.784120,40.674860,Limited Free,Springfield Gardens North,218,Queens
1283,Baisley Pond Park,-73.786780,40.672680,Limited Free,Springfield Gardens North,218,Queens
2322,Baisley Pond Park,-73.785170,40.673180,Limited Free,Springfield Gardens North,218,Queens
2894,Rochdale Village,-73.770590,40.672875,Free,Springfield Gardens North,218,Queens
3071,Baisley Pond Park,-73.782540,40.676090,Limited Free,Springfield Gardens North,218,Queens
...,...,...,...,...,...,...,...
2795,Parkchester,-73.858355,40.833736,Free,Parkchester,182,Bronx
2824,Hollis,-73.762304,40.720168,Free,Jamaica Estates,131,Queens
2903,Bayside,-73.768362,40.760126,Free,Bayside,16,Queens
2926,Maspeth,-73.893105,40.726995,Free,Maspeth,157,Queens


In [22]:
# check null value
result.isnull().sum()

Name           0
Longitude      0
Latitude       0
Type           0
zone           0
location_id    0
borough        0
dtype: int64

# the code below used for model

In [23]:
# Rename the column
result.rename(columns={'Type': 'Wifi Type'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result.rename(columns={'Type': 'Wifi Type'}, inplace=True)


In [24]:
result

Unnamed: 0,Name,Longitude,Latitude,Wifi Type,zone,location_id,borough
0,Baisley Pond Park,-73.784120,40.674860,Limited Free,Springfield Gardens North,218,Queens
1283,Baisley Pond Park,-73.786780,40.672680,Limited Free,Springfield Gardens North,218,Queens
2322,Baisley Pond Park,-73.785170,40.673180,Limited Free,Springfield Gardens North,218,Queens
2894,Rochdale Village,-73.770590,40.672875,Free,Springfield Gardens North,218,Queens
3071,Baisley Pond Park,-73.782540,40.676090,Limited Free,Springfield Gardens North,218,Queens
...,...,...,...,...,...,...,...
2795,Parkchester,-73.858355,40.833736,Free,Parkchester,182,Bronx
2824,Hollis,-73.762304,40.720168,Free,Jamaica Estates,131,Queens
2903,Bayside,-73.768362,40.760126,Free,Bayside,16,Queens
2926,Maspeth,-73.893105,40.726995,Free,Maspeth,157,Queens


In [25]:
result['Wifi Type'] = result['Wifi Type'].str.replace('^(.*)$', r'\1 Wifi', regex=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['Wifi Type'] = result['Wifi Type'].str.replace('^(.*)$', r'\1 Wifi', regex=True)


In [26]:
result

Unnamed: 0,Name,Longitude,Latitude,Wifi Type,zone,location_id,borough
0,Baisley Pond Park,-73.784120,40.674860,Limited Free Wifi,Springfield Gardens North,218,Queens
1283,Baisley Pond Park,-73.786780,40.672680,Limited Free Wifi,Springfield Gardens North,218,Queens
2322,Baisley Pond Park,-73.785170,40.673180,Limited Free Wifi,Springfield Gardens North,218,Queens
2894,Rochdale Village,-73.770590,40.672875,Free Wifi,Springfield Gardens North,218,Queens
3071,Baisley Pond Park,-73.782540,40.676090,Limited Free Wifi,Springfield Gardens North,218,Queens
...,...,...,...,...,...,...,...
2795,Parkchester,-73.858355,40.833736,Free Wifi,Parkchester,182,Bronx
2824,Hollis,-73.762304,40.720168,Free Wifi,Jamaica Estates,131,Queens
2903,Bayside,-73.768362,40.760126,Free Wifi,Bayside,16,Queens
2926,Maspeth,-73.893105,40.726995,Free Wifi,Maspeth,157,Queens


In [27]:
result = result[['Wifi Type', 'location_id']]
result

Unnamed: 0,Wifi Type,location_id
0,Limited Free Wifi,218
1283,Limited Free Wifi,218
2322,Limited Free Wifi,218
2894,Free Wifi,218
3071,Limited Free Wifi,218
...,...,...
2795,Free Wifi,182
2824,Free Wifi,131
2903,Free Wifi,16
2926,Free Wifi,157


In [28]:
result['Wifi Type'] = result['Wifi Type'].astype(str)
result['location_id'] = result['location_id'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['Wifi Type'] = result['Wifi Type'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['location_id'] = result['location_id'].astype(str)


In [29]:
df_combined = result.groupby(['location_id', 'Wifi Type']).size().reset_index(name='Counts')

df_combined

Unnamed: 0,location_id,Wifi Type,Counts
0,10,Free Wifi,1
1,10,Limited Free Wifi,16
2,100,Free Wifi,21
3,101,Free Wifi,1
4,102,Free Wifi,1
...,...,...,...
304,94,Free Wifi,9
305,95,Free Wifi,30
306,97,Free Wifi,46
307,98,Free Wifi,2


In [30]:
df_combined.to_csv("basic_hotspot_df.csv", index=False)