In [1]:
# importing libraries
import pandas as pd
import numpy as np
from sodapy import Socrata
import os

In [2]:
# Dataset id and host of the Socrata resource queried throughout this notebook.
socrata_dataset_identifier = 'erm2-nwe9'
socrata_domain = 'data.cityofnewyork.us'

# An app token is optional. To supply one, export it in the shell before
# starting Jupyter (or add the line to your .bashrc):
#   $ export SODAPY_APPTOKEN=<token>
# When the variable is unset, socrata_token is None.
socrata_token = os.getenv("SODAPY_APPTOKEN")

In [3]:
# Build the Socrata client for the configured domain. socrata_token is None
# when SODAPY_APPTOKEN is unset. No timeout is passed here, unlike the later
# query cells, which rebuild the client with timeout=1000.
client = Socrata(socrata_domain, socrata_token)

In [4]:
# Fetch the dataset's metadata, then list the display name of every column.
metadata = client.get_metadata(socrata_dataset_identifier)
[column['name'] for column in metadata['columns']]

['Unique Key',
 'Created Date',
 'Closed Date',
 'Agency',
 'Agency Name',
 'Complaint Type',
 'Descriptor',
 'Location Type',
 'Incident Zip',
 'Incident Address',
 'Street Name',
 'Cross Street 1',
 'Cross Street 2',
 'Intersection Street 1',
 'Intersection Street 2',
 'Address Type',
 'City',
 'Landmark',
 'Facility Type',
 'Status',
 'Due Date',
 'Resolution Description',
 'Resolution Action Updated Date',
 'Community Board',
 'BBL',
 'Borough',
 'X Coordinate (State Plane)',
 'Y Coordinate (State Plane)',
 'Open Data Channel Type',
 'Park Facility Name',
 'Park Borough',
 'Vehicle Type',
 'Taxi Company Borough',
 'Taxi Pick Up Location',
 'Bridge Highway Name',
 'Bridge Highway Direction',
 'Road Ramp',
 'Bridge Highway Segment',
 'Latitude',
 'Longitude',
 'Location',
 'Zip Codes',
 'Community Districts',
 'Borough Boundaries',
 'City Council Districts',
 'Police Precincts']

In [5]:
# Pick the metadata entry for the 'Complaint Type' column (first match)
# and display it.
meta_amount = [
    column
    for column in metadata['columns']
    if column['name'] == 'Complaint Type'
][0]
meta_amount

{'id': 354922035,
 'name': 'Complaint Type',
 'dataTypeName': 'text',
 'description': 'This is the first level of a hierarchy identifying the topic of the incident or condition. Complaint Type may have a corresponding Descriptor (below) or may stand alone.',
 'fieldName': 'complaint_type',
 'position': 6,
 'renderTypeName': 'text',
 'tableColumnId': 1567792,
 'width': 268,
 'cachedContents': {'largest': 'ZTESTINT',
  'non_null': '25093886',
  'null': '0',
  'top': [{'item': 'Noise - Residential', 'count': '2272076'},
   {'item': 'HEAT/HOT WATER', 'count': '1469657'},
   {'item': 'Illegal Parking', 'count': '1154532'},
   {'item': 'Blocked Driveway', 'count': '1063932'},
   {'item': 'Street Condition', 'count': '1028708'},
   {'item': 'Street Light Condition', 'count': '990689'},
   {'item': 'HEATING', 'count': '887869'},
   {'item': 'PLUMBING', 'count': '753700'},
   {'item': 'Water System', 'count': '692428'},
   {'item': 'Noise - Street/Sidewalk', 'count': '688245'},
   {'item': 'Gen

In [6]:
# Field names of every column, i.e. the identifiers the queries below use.
[column['fieldName'] for column in metadata['columns']]

['unique_key',
 'created_date',
 'closed_date',
 'agency',
 'agency_name',
 'complaint_type',
 'descriptor',
 'location_type',
 'incident_zip',
 'incident_address',
 'street_name',
 'cross_street_1',
 'cross_street_2',
 'intersection_street_1',
 'intersection_street_2',
 'address_type',
 'city',
 'landmark',
 'facility_type',
 'status',
 'due_date',
 'resolution_description',
 'resolution_action_updated_date',
 'community_board',
 'bbl',
 'borough',
 'x_coordinate_state_plane',
 'y_coordinate_state_plane',
 'open_data_channel_type',
 'park_facility_name',
 'park_borough',
 'vehicle_type',
 'taxi_company_borough',
 'taxi_pick_up_location',
 'bridge_highway_name',
 'bridge_highway_direction',
 'road_ramp',
 'bridge_highway_segment',
 'latitude',
 'longitude',
 'location',
 ':@computed_region_efsh_h5xi',
 ':@computed_region_f5dn_yrer',
 ':@computed_region_yeji_bk3q',
 ':@computed_region_92fq_4b7q',
 ':@computed_region_sbqj_enih']

In [7]:
# Count complaints per complaint type across the whole dataset.
# The client is built from the notebook's config: socrata_token is passed as
# the app token (it is None when SODAPY_APPTOKEN is unset), and timeout=1000
# replaces the client default for this aggregation (presumably seconds;
# confirm against the sodapy docs).
client = Socrata(socrata_domain, socrata_token, timeout=1000)

# The explicit LIMIT is far above the number of distinct complaint types, so
# the result is not cut off by the API's default row limit.
query = """
SELECT 
    complaint_type, count(complaint_type)
GROUP BY 
    complaint_type
ORDER BY 
    count(complaint_type) DESC
LIMIT 
    1000000
"""

# Returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
results = client.get(socrata_dataset_identifier, query=query)

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)

print(results_df.shape)
results_df.head(10)

(445, 2)


Unnamed: 0,complaint_type,count_complaint_type
0,Noise - Residential,2272076
1,HEAT/HOT WATER,1469657
2,Illegal Parking,1154532
3,Blocked Driveway,1063932
4,Street Condition,1028708
5,Street Light Condition,990689
6,HEATING,887869
7,PLUMBING,753700
8,Water System,692428
9,Noise - Street/Sidewalk,688245


In [8]:
# (rows, columns) of the complaint-type counts: one row per complaint type.
results_df.shape

(445, 2)

In [9]:
# Count complaints per descriptor, restricted to descriptors that contain
# 'flood' (case-insensitive via LOWER).
# The client is built from the notebook's config: socrata_token is passed as
# the app token (it is None when SODAPY_APPTOKEN is unset).
client = Socrata(socrata_domain, socrata_token, timeout=1000)

# The explicit LIMIT is far above the number of matching descriptors, so the
# result is not cut off by the API's default row limit.
query = """
SELECT 
    descriptor, count(descriptor)
WHERE 
    LOWER(descriptor) LIKE '%flood%'
GROUP BY 
    descriptor
ORDER BY 
    count(descriptor) DESC
LIMIT 
    1000000
"""

# Returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
results = client.get(socrata_dataset_identifier, query=query)

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)

print(results_df.shape)
results_df

(11, 2)


Unnamed: 0,descriptor,count_descriptor
0,Catch Basin Clogged/Flooding (Use Comments) (SC),90181
1,Street Flooding (SJ),27691
2,Flood Light Lamp Out,5982
3,Highway Flooding (SH),2841
4,Flood Light Lamp Cycling,2515
5,Ready NY - Flooding,271
6,Flood Light Lamp Dayburning,205
7,Flood Light Lamp Missing,192
8,Flood Light Lamp Dim,177
9,RAIN GARDEN FLOODING (SRGFLD),80


In [10]:
# Download every record whose descriptor contains 'flood' (case-insensitive
# via LOWER), with all columns.
# The client is built from the notebook's config: socrata_token is passed as
# the app token (it is None when SODAPY_APPTOKEN is unset).
client = Socrata(socrata_domain, socrata_token, timeout=1000)

# The explicit LIMIT must stay above the number of matching rows; otherwise
# the download would be silently truncated.
query = """
SELECT 
    *
WHERE 
    LOWER(descriptor) LIKE '%flood%'
LIMIT 
    1000000
"""

# Returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
results = client.get(socrata_dataset_identifier, query=query)

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)

print(results_df.shape)
results_df.head()

(130183, 34)


Unnamed: 0,unique_key,created_date,closed_date,agency,agency_name,complaint_type,descriptor,cross_street_1,cross_street_2,intersection_street_1,...,incident_zip,city,bbl,x_coordinate_state_plane,y_coordinate_state_plane,latitude,longitude,location,location_type,landmark
0,31137202,2015-07-22T18:37:00.000,2015-07-23T12:30:00.000,DEP,Department of Environmental Protection,Sewer,Catch Basin Clogged/Flooding (Use Comments) (SC),SPRINGFIELD BLVD,113 AVE,113 AVE,...,,,,,,,,,,
1,31142407,2015-07-22T09:27:00.000,2015-08-01T09:55:00.000,DEP,Department of Environmental Protection,Sewer,Catch Basin Clogged/Flooding (Use Comments) (SC),GRAND CENTRAL PKWY,164 PL,GRAND CENTRAL PKWY,...,,,,,,,,,,
2,31145751,2015-07-23T10:45:00.000,2015-07-29T10:45:00.000,DEP,Department of Environmental Protection,Sewer,Catch Basin Clogged/Flooding (Use Comments) (SC),PARK DRIVE E,JEWEL AVE,PARK DRIVE E,...,,,,,,,,,,
3,31164693,2015-07-25T07:22:00.000,2015-07-29T15:05:00.000,DEP,Department of Environmental Protection,Sewer,Catch Basin Clogged/Flooding (Use Comments) (SC),WEST END DR,GLENWOOD ST,GLENWOOD ST,...,,,,,,,,,,
4,31176641,2015-07-27T12:41:00.000,2015-07-29T12:15:00.000,DEP,Department of Environmental Protection,Sewer,Catch Basin Clogged/Flooding (Use Comments) (SC),125 ST,109 AVE,125 ST,...,,,,,,,,,,


In [11]:
# Tally the downloaded rows by descriptor to see which 'flood' descriptors
# are present.
results_df['descriptor'].value_counts()

Catch Basin Clogged/Flooding (Use Comments) (SC)    90181
Street Flooding (SJ)                                27691
Flood Light Lamp Out                                 5982
Highway Flooding (SH)                                2841
Flood Light Lamp Cycling                             2515
Ready NY - Flooding                                   271
Flood Light Lamp Dayburning                           205
Flood Light Lamp Missing                              192
Flood Light Lamp Dim                                  177
RAIN GARDEN FLOODING (SRGFLD)                          80
Flooded                                                48
Name: descriptor, dtype: int64

In [12]:
# Keep only the 'Street Flooding (SJ)' complaints; every other descriptor
# is dropped from results_df.
results_df = results_df[results_df['descriptor'].eq('Street Flooding (SJ)')]

In [13]:
# Confirm that only the 'Street Flooding (SJ)' descriptor remains after the filter.
results_df['descriptor'].value_counts()

Street Flooding (SJ)    27691
Name: descriptor, dtype: int64

In [14]:
# Check which complaint types the remaining street-flooding rows belong to.
results_df['complaint_type'].value_counts()

Sewer    27691
Name: complaint_type, dtype: int64

In [15]:
# (rows, columns) of the filtered street-flooding frame.
results_df.shape

(27691, 34)

In [16]:
# Preview the first rows; the index keeps its labels from the unfiltered frame.
results_df.head()

Unnamed: 0,unique_key,created_date,closed_date,agency,agency_name,complaint_type,descriptor,cross_street_1,cross_street_2,intersection_street_1,...,incident_zip,city,bbl,x_coordinate_state_plane,y_coordinate_state_plane,latitude,longitude,location,location_type,landmark
25,31202778,2015-07-30T15:13:00.000,2015-07-31T12:50:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),141 ST,143 ST,,...,,,,,,,,,,
36,31256677,2015-08-06T09:20:00.000,2015-08-06T12:20:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),W 254 ST,INDEPENDENCE AVE,W 254 ST,...,,,,,,,,,,
41,31290216,2015-08-11T19:12:00.000,2015-08-13T13:19:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),122 AVE,LAKEVIEW LN,LAKEVIEW LN,...,,,,,,,,,,
58,49798109,2021-02-16T17:58:00.000,2021-02-17T09:15:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),CARLTON BLVD,ARDEN AVE,,...,10312.0,STATEN ISLAND,5057120007.0,932687.0,139594.0,40.54967903401289,-74.18554127312115,"{'latitude': '40.54967903401289', 'longitude':...",,
69,49833607,2021-02-21T16:45:00.000,2021-02-21T16:52:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),HILLSIDE AVE,89 AVE,,...,11427.0,Queens Village,4106690136.0,1052502.0,203304.0,40.72443591923869,-73.75376185859332,"{'latitude': '40.724435919238694', 'longitude'...",,


# Dropping records created after 2020

In [17]:
# Report the row count and the earliest / latest created_date values.
print(f'Number of total records: {len(results_df)}')
print(f'Min: {results_df.created_date.min()}')
print(f'Max: {results_df.created_date.max()}')

Number of total records: 27691
Min: 2010-01-02T08:26:00.000
Max: 2021-03-08T18:49:00.000


In [19]:
# Preview the most recent complaints created before 2021. created_date holds
# ISO-8601 strings, so the plain string comparison orders them chronologically.
(
    results_df
    .loc[results_df.created_date < '2021']
    .sort_values(by='created_date', ascending=False)
    .head()
)

Unnamed: 0,unique_key,created_date,closed_date,agency,agency_name,complaint_type,descriptor,cross_street_1,cross_street_2,intersection_street_1,...,incident_zip,city,bbl,x_coordinate_state_plane,y_coordinate_state_plane,latitude,longitude,location,location_type,landmark
119665,48542220,2020-12-31T15:41:00.000,2021-01-01T00:20:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),FOCH BLVD,120 AVE,,...,11420,South Ozone Park,4116990057.0,1038500,186000,40.67703755925495,-73.80441718054371,"{'latitude': '40.67703755925495', 'longitude':...",,
124228,48536430,2020-12-31T14:49:00.000,2021-01-04T10:15:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),20 AVE,20 RD,,...,11357,Whitestone,4046700029.0,1035682,223771,40.78072630540092,-73.81428794578581,"{'latitude': '40.78072630540092', 'longitude':...",,
3799,48539361,2020-12-31T14:03:00.000,2021-01-02T11:25:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),72 ST,7 AVE,,...,11228,BROOKLYN,3059120001.0,979584,168255,40.62849640806448,-74.01680967626773,"{'latitude': '40.62849640806448', 'longitude':...",,
3802,48543132,2020-12-31T13:48:00.000,2020-12-31T14:50:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),,,RIVERSIDE DRIVE,...,10032,NEW YORK,,999557,245700,40.84105168954552,-73.9446789892306,"{'latitude': '40.841051689545516', 'longitude'...",,
2093,48536441,2020-12-31T13:10:00.000,2021-01-03T10:45:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),E 31 ST,E 32 ST,,...,11234,BROOKLYN,3084750001.0,1001868,161232,40.60920344739991,-73.93654793950026,"{'latitude': '40.609203447399906', 'longitude'...",,


In [20]:
# Drop complaints created in 2021 or later (string comparison on the
# ISO-8601 created_date values), then report what is left.
results_df = results_df[results_df['created_date'] < '2021']

print(results_df.shape[0])
print(f'Min: {results_df.created_date.min()}')
print(f'Max: {results_df.created_date.max()}')

27428
Min: 2010-01-02T08:26:00.000
Max: 2020-12-31T15:41:00.000


In [21]:
# Write the filtered street-flooding complaints to a CSV file; index=False
# leaves the DataFrame index out of the file.
results_df.to_csv('../data-raw/raw_street_flooding_complaints.csv', index=False)

# List the contents of the data folder (IPython %ls magic) to confirm the
# file was written.
%ls ../data-raw/

311_SR_Data_Dictionary_2018.xlsx    nynta_metadata.pdf
Centerline.pdf                      raw-street-flooding-data.csv
README.md                           raw-streets-clipped.json
nybb_metadata.pdf                   raw_street_flooding_complaints.csv


# Briefly reviewing which descriptors fall under complaint_type='Sewer'

In [22]:
# Count complaints per descriptor within complaint_type='Sewer'.
# The client is built from the notebook's config: socrata_token is passed as
# the app token (it is None when SODAPY_APPTOKEN is unset).
client = Socrata(socrata_domain, socrata_token, timeout=1000)

# No LIMIT clause here, so the API's default row limit applies; that is
# sufficient for the small number of descriptors under this complaint type.
query = """
SELECT 
    descriptor, count(descriptor)
WHERE 
    complaint_type='Sewer'
GROUP BY 
    descriptor
ORDER BY 
    count(descriptor) DESC
"""

# Returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
results = client.get(socrata_dataset_identifier, query=query)

# Convert to pandas DataFrame.
# NOTE: this rebinds results_df, so the street-flooding rows written to CSV
# earlier are no longer held in this variable.
results_df = pd.DataFrame.from_records(results)

# Show the shape, then the full table of descriptor counts.
print(results_df.shape)
results_df

(27, 2)


Unnamed: 0,descriptor,count_descriptor
0,Sewer Backup (Use Comments) (SA),149981
1,Catch Basin Clogged/Flooding (Use Comments) (SC),90181
2,Catch Basin Sunken/Damaged/Raised (SC1),28632
3,Street Flooding (SJ),27691
4,Manhole Cover Broken/Making Noise (SB),19780
5,Manhole Cover Missing (Emergency) (SA3),17649
6,Sewer Odor (SA2),15453
7,Defective/Missing Curb Piece (SC4),8526
8,Manhole Overflow (Use Comments) (SA1),6928
9,Catch Basin Search (SC2),4154
