# Querying 311 Street Flooding Complaints 
Querying 311 Street Flooding Complaints data from NYC Open Data using the Python library sodapy  
Mark Bauer

In [13]:
# importing libraries
import pandas as pd
import numpy as np
from sodapy import Socrata
import os

In [14]:
# NYC Open Data domain and the 311 dataset id used throughout this notebook
socrata_domain = 'data.cityofnewyork.us'
socrata_dataset_identifier = 'erm2-nwe9'

# An app token is optional. To supply one, run the following in the terminal
# before starting Jupyter (or add it to your .bashrc):
#   $ export SODAPY_APPTOKEN=<token>
# If the variable is not set, socrata_token is None.
socrata_token = os.getenv("SODAPY_APPTOKEN")

In [15]:
# open a client for the NYC Open Data domain (token may be None)
client = Socrata(socrata_domain, socrata_token)

# fetch the metadata of the 311 dataset and report its type and item count
metadata = client.get_metadata(socrata_dataset_identifier)
print(
    'type: {}'.format(type(metadata)),
    'count of items: {}'.format(len(metadata)),
    sep='\n',
)



type: <class 'dict'>
count of items: 41


In [16]:
# print every top-level key of the metadata dictionary, one per line
for key in metadata:
    print(key)

id
name
assetType
attribution
averageRating
category
createdAt
description
displayType
downloadCount
hideFromCatalog
hideFromDataJson
indexUpdatedAt
newBackend
numberOfComments
oid
provenance
publicationAppendEnabled
publicationDate
publicationGroup
publicationStage
rowClass
rowIdentifierColumnId
rowsUpdatedAt
rowsUpdatedBy
tableId
totalTimesRated
viewCount
viewLastModified
viewType
approvals
clientContext
columns
grants
metadata
owner
query
rights
tableAuthor
tags
flags


In [24]:
# inspect the 'columns' entry of the metadata: report its container type,
# then display its first element
column_records = metadata['columns']
print('type: {}'.format(type(column_records)))
column_records[0]

type: <class 'list'>


{'id': 354922030,
 'name': 'Unique Key',
 'dataTypeName': 'text',
 'description': 'Unique identifier of a Service Request (SR) in the open data set\n',
 'fieldName': 'unique_key',
 'position': 1,
 'renderTypeName': 'text',
 'tableColumnId': 1567787,
 'width': 220,
 'cachedContents': {'largest': '52892093',
  'non_null': '27382103',
  'null': '0',
  'top': [{'item': '10693408', 'count': '1'},
   {'item': '10836749', 'count': '1'},
   {'item': '10836967', 'count': '1'},
   {'item': '11051177', 'count': '1'},
   {'item': '11413576', 'count': '1'},
   {'item': '11463895', 'count': '1'},
   {'item': '11463896', 'count': '1'},
   {'item': '11464334', 'count': '1'},
   {'item': '11464394', 'count': '1'},
   {'item': '11464467', 'count': '1'},
   {'item': '11464508', 'count': '1'},
   {'item': '11464509', 'count': '1'},
   {'item': '11464521', 'count': '1'},
   {'item': '11464567', 'count': '1'},
   {'item': '11464572', 'count': '1'},
   {'item': '11464639', 'count': '1'},
   {'item': '1146484

In [5]:
# display names of all columns described in the metadata
[column['name'] for column in metadata['columns']]

['Unique Key',
 'Created Date',
 'Closed Date',
 'Agency',
 'Agency Name',
 'Complaint Type',
 'Descriptor',
 'Location Type',
 'Incident Zip',
 'Incident Address',
 'Street Name',
 'Cross Street 1',
 'Cross Street 2',
 'Intersection Street 1',
 'Intersection Street 2',
 'Address Type',
 'City',
 'Landmark',
 'Facility Type',
 'Status',
 'Due Date',
 'Resolution Description',
 'Resolution Action Updated Date',
 'Community Board',
 'BBL',
 'Borough',
 'X Coordinate (State Plane)',
 'Y Coordinate (State Plane)',
 'Open Data Channel Type',
 'Park Facility Name',
 'Park Borough',
 'Vehicle Type',
 'Taxi Company Borough',
 'Taxi Pick Up Location',
 'Bridge Highway Name',
 'Bridge Highway Direction',
 'Road Ramp',
 'Bridge Highway Segment',
 'Latitude',
 'Longitude',
 'Location',
 'Zip Codes',
 'Community Districts',
 'Borough Boundaries',
 'City Council Districts',
 'Police Precincts']

In [25]:
# pull the metadata record(s) whose display name is 'Complaint Type'
# and show the first match
meta_amount = list(
    filter(lambda column: column['name'] == 'Complaint Type', metadata['columns'])
)
meta_amount[0]

{'id': 354922035,
 'name': 'Complaint Type',
 'dataTypeName': 'text',
 'description': 'This is the first level of a hierarchy identifying the topic of the incident or condition. Complaint Type may have a corresponding Descriptor (below) or may stand alone.',
 'fieldName': 'complaint_type',
 'position': 6,
 'renderTypeName': 'text',
 'tableColumnId': 1567792,
 'width': 268,
 'cachedContents': {'largest': 'ZTESTINT',
  'non_null': '27382103',
  'null': '0',
  'top': [{'item': 'Noise - Residential', 'count': '2575599'},
   {'item': 'HEAT/HOT WATER', 'count': '1589587'},
   {'item': 'Illegal Parking', 'count': '1424426'},
   {'item': 'Blocked Driveway', 'count': '1188263'},
   {'item': 'Street Condition', 'count': '1090368'},
   {'item': 'Street Light Condition', 'count': '1035400'},
   {'item': 'HEATING', 'count': '887869'},
   {'item': 'Noise - Street/Sidewalk', 'count': '863368'},
   {'item': 'PLUMBING', 'count': '780618'},
   {'item': 'Water System', 'count': '739604'},
   {'item': 'UN

In [26]:
# pull the metadata record(s) whose display name is 'Descriptor'
# and show the first match
meta_amount = list(
    filter(lambda column: column['name'] == 'Descriptor', metadata['columns'])
)
meta_amount[0]

{'id': 354922036,
 'name': 'Descriptor',
 'dataTypeName': 'text',
 'description': 'This is  associated to the Complaint Type, and provides further detail on the incident or condition. Descriptor values are dependent on the Complaint Type, and are not always required in SR. \n',
 'fieldName': 'descriptor',
 'position': 7,
 'renderTypeName': 'text',
 'tableColumnId': 1567793,
 'width': 220,
 'cachedContents': {'largest': 'Zoning - Non-Conforming/Illegal Vehicle Storage',
  'non_null': '27312254',
  'null': '69849',
  'top': [{'item': 'Loud Music/Party', 'count': '2779118'},
   {'item': 'ENTIRE BUILDING', 'count': '1036777'},
   {'item': 'No Access', 'count': '887751'},
   {'item': 'HEAT', 'count': '868960'},
   {'item': 'Street Light Out', 'count': '766221'},
   {'item': 'Banging/Pounding', 'count': '696263'},
   {'item': 'Pothole', 'count': '660455'},
   {'item': 'APARTMENT ONLY', 'count': '552810'},
   {'item': 'Loud Talking', 'count': '407296'},
   {'item': 'Blocked Hydrant', 'count':

In [27]:
# Re-open the client with a long timeout (timeout=1000) for the queries below.
# The app token comes from the SODAPY_APPTOKEN environment variable; when it
# is unset, socrata_token is None and the client is unauthenticated, which
# only works with public data sets.
client = Socrata(socrata_domain, socrata_token, timeout=1000)

# SoQL: number of records per complaint type, most frequent first.
# LIMIT is set very high, presumably so that no groups are cut off.
query = """
SELECT
    complaint_type,
    count(complaint_type)
GROUP BY
    complaint_type
ORDER BY
    count(complaint_type) DESC
LIMIT
    10000000
"""

# Returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
results = client.get(socrata_dataset_identifier, query=query)

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)

print('shape of data: {}'.format(results_df.shape))
results_df.head(10)



shape of data: (484, 2)


Unnamed: 0,complaint_type,count_complaint_type
0,Noise - Residential,2894862
1,HEAT/HOT WATER,1761608
2,Illegal Parking,1739076
3,Blocked Driveway,1308581
4,Street Condition,1156665
5,Street Light Condition,1080817
6,Request Large Bulky Item Collection,1073753
7,Noise - Street/Sidewalk,996948
8,HEATING,887869
9,PLUMBING,841251


In [33]:
# SoQL: number of records per descriptor, most frequent first.
# Reuses the `client` already opened earlier in the notebook instead of
# instantiating another identical connection.
query = """
SELECT
    descriptor,
    count(descriptor)
GROUP BY
    descriptor
ORDER BY
    count(descriptor) DESC
LIMIT
    10000000
"""

# sodapy returns a list of dictionaries; convert it to a DataFrame
results = client.get(socrata_dataset_identifier, query=query)
results_df = pd.DataFrame.from_records(results)

print('shape of data: {}'.format(results_df.shape))
results_df.head(10)



shape of data: (1879, 2)


Unnamed: 0,descriptor,count_descriptor
0,Loud Music/Party,3167047
1,ENTIRE BUILDING,1148320
2,Request Large Bulky Item Collection,1073753
3,No Access,978109
4,HEAT,868960
5,Street Light Out,798217
6,Banging/Pounding,770000
7,Pothole,702652
8,APARTMENT ONLY,613288
9,Blocked Hydrant,484508


In [34]:
# SoQL: complaint types whose name contains 'flood' (case-insensitive),
# with record counts. Reuses the `client` already opened earlier in the
# notebook instead of instantiating another identical connection.
# If nothing matches, the resulting DataFrame is empty with shape (0, 0).
query = """
SELECT
    complaint_type,
    count(complaint_type)
WHERE
    LOWER(complaint_type) LIKE '%flood%'
GROUP BY
    complaint_type
ORDER BY
    count(complaint_type) DESC
LIMIT
    10000000
"""

results = client.get(socrata_dataset_identifier, query=query)
results_df = pd.DataFrame.from_records(results)

print('shape of data: {}'.format(results_df.shape))
results_df



shape of data: (0, 0)


In [35]:
# SoQL: descriptors whose text contains 'flood' (case-insensitive), with
# record counts, most frequent first. Reuses the `client` already opened
# earlier in the notebook instead of instantiating another identical connection.
query = """
SELECT
    descriptor,
    count(descriptor)
WHERE
    LOWER(descriptor) LIKE '%flood%'
GROUP BY
    descriptor
ORDER BY
    count(descriptor) DESC
LIMIT
    10000000
"""

results = client.get(socrata_dataset_identifier, query=query)
results_df = pd.DataFrame.from_records(results)

print('shape of data: {}'.format(results_df.shape))
results_df



shape of data: (11, 2)


Unnamed: 0,descriptor,count_descriptor
0,Catch Basin Clogged/Flooding (Use Comments) (SC),104004
1,Street Flooding (SJ),33854
2,Flood Light Lamp Out,6271
3,Highway Flooding (SH),3022
4,Flood Light Lamp Cycling,2549
5,Ready NY - Flooding,271
6,Flood Light Lamp Dayburning,216
7,Flood Light Lamp Missing,203
8,Flood Light Lamp Dim,181
9,RAIN GARDEN FLOODING (SRGFLD),152


In [36]:
# SoQL: every column of every record whose descriptor contains 'flood'
# (case-insensitive). Reuses the `client` already opened earlier in the
# notebook (created with timeout=1000) instead of instantiating another
# identical connection.
query = """
SELECT
    *
WHERE
    LOWER(descriptor) LIKE '%flood%'
LIMIT
    10000000
"""

# sodapy returns a list of dictionaries; convert it to a DataFrame
results = client.get(socrata_dataset_identifier, query=query)
results_df = pd.DataFrame.from_records(results)

print('shape of data: {}'.format(results_df.shape))
results_df.head()



shape of data: (150776, 34)


Unnamed: 0,unique_key,created_date,agency,agency_name,complaint_type,descriptor,incident_zip,incident_address,street_name,cross_street_1,...,location,intersection_street_1,intersection_street_2,closed_date,resolution_description,resolution_action_updated_date,location_type,landmark,facility_type,due_date
0,55649663,2022-10-09T19:05:00.000,DEP,Department of Environmental Protection,Sewer,Catch Basin Clogged/Flooding (Use Comments) (SC),10473,1880 SEWARD AVENUE,SEWARD AVENUE,UNDERHILL AVE,...,"{'latitude': '40.819625544642555', 'longitude'...",,,,,,,,,
1,55651611,2022-10-09T17:50:00.000,DEP,Department of Environmental Protection,Sewer,Catch Basin Clogged/Flooding (Use Comments) (SC),11223,370 AVENUE U,AVENUE U,E 1 ST,...,"{'latitude': '40.59728277829299', 'longitude':...",,,,,,,,,
2,55651612,2022-10-09T17:38:00.000,DEP,Department of Environmental Protection,Sewer,Catch Basin Clogged/Flooding (Use Comments) (SC),11372,77-01 34 AVENUE,34 AVENUE,77 ST,...,"{'latitude': '40.753265794630934', 'longitude'...",,,,,,,,,
3,55652555,2022-10-09T17:17:00.000,DEP,Department of Environmental Protection,Sewer,Catch Basin Clogged/Flooding (Use Comments) (SC),11235,1402 AVENUE Y,AVENUE Y,E 14 ST,...,"{'latitude': '40.589993280458884', 'longitude'...",,,,,,,,,
4,55649664,2022-10-09T16:32:00.000,DEP,Department of Environmental Protection,Sewer,Catch Basin Clogged/Flooding (Use Comments) (SC),11233,352 MALCOLM X BOULEVARD,MALCOLM X BOULEVARD,BAINBRIDGE ST,...,"{'latitude': '40.680813005063555', 'longitude'...",,,,,,,,,


In [13]:
# number of downloaded records per descriptor value
results_df.descriptor.value_counts()

Catch Basin Clogged/Flooding (Use Comments) (SC)    103401
Street Flooding (SJ)                                 33666
Flood Light Lamp Out                                  6260
Highway Flooding (SH)                                 3014
Flood Light Lamp Cycling                              2548
Ready NY - Flooding                                    271
Flood Light Lamp Dayburning                            216
Flood Light Lamp Missing                               202
Flood Light Lamp Dim                                   181
RAIN GARDEN FLOODING (SRGFLD)                          152
Flooded                                                 53
Name: descriptor, dtype: int64

In [37]:
# keep only the records whose descriptor is exactly 'Street Flooding (SJ)'
is_street_flooding = results_df['descriptor'].eq('Street Flooding (SJ)')
flooding_df = results_df.loc[is_street_flooding]

flooding_df.head()

Unnamed: 0,unique_key,created_date,agency,agency_name,complaint_type,descriptor,incident_zip,incident_address,street_name,cross_street_1,...,location,intersection_street_1,intersection_street_2,closed_date,resolution_description,resolution_action_updated_date,location_type,landmark,facility_type,due_date
9,55652561,2022-10-09T14:05:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),11209,340 97 STREET,97 STREET,BARWELL TER,...,"{'latitude': '40.6152602290429', 'longitude': ...",,,,,,,,,
16,55644720,2022-10-08T16:11:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),11004,264-04 82 AVENUE,82 AVENUE,264 ST,...,"{'latitude': '40.74200818993101', 'longitude':...",,,,,,,,,
18,55648664,2022-10-08T11:47:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),10025,32 WEST 96 STREET,WEST 96 STREET,CENTRAL PARK W,...,"{'latitude': '40.79204737253363', 'longitude':...",,,,,,,,,
24,55636579,2022-10-07T17:44:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),11204,,,,...,"{'latitude': '40.61084382425589', 'longitude':...",WEST 4 STREET,AVENUE O,,,,,,,
31,55632767,2022-10-07T14:20:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),11230,1314 ELM AVENUE,ELM AVENUE,E 13 ST,...,"{'latitude': '40.61727050046426', 'longitude':...",,,,,,,,,


In [38]:
# sanity check: only the street flooding descriptor should remain
flooding_df.loc[:, 'descriptor'].value_counts()

Street Flooding (SJ)    33854
Name: descriptor, dtype: int64

In [39]:
# which complaint types the street flooding records belong to
flooding_df.loc[:, 'complaint_type'].value_counts()

Sewer    33854
Name: complaint_type, dtype: int64

# Briefly reviewing which descriptors fall under complaint_type='Sewer'

In [40]:
# SoQL: record counts per descriptor within complaint_type 'Sewer', most
# frequent first. Reuses the `client` already opened earlier in the notebook
# instead of instantiating another identical connection.
query = """
SELECT
    descriptor,
    count(descriptor)
WHERE
    complaint_type='Sewer'
GROUP BY
    descriptor
ORDER BY
    count(descriptor) DESC
LIMIT
    10000000
"""
results = client.get(socrata_dataset_identifier, query=query)
results_df = pd.DataFrame.from_records(results)

# show the full result (equivalent to head(len(results_df)))
print(results_df.shape)
results_df



(30, 2)


Unnamed: 0,descriptor,count_descriptor
0,Sewer Backup (Use Comments) (SA),176862
1,Catch Basin Clogged/Flooding (Use Comments) (SC),104004
2,Street Flooding (SJ),33854
3,Catch Basin Sunken/Damaged/Raised (SC1),31287
4,Manhole Cover Broken/Making Noise (SB),19801
5,Manhole Cover Missing (Emergency) (SA3),19501
6,Sewer Odor (SA2),17849
7,Defective/Missing Curb Piece (SC4),9077
8,Manhole Overflow (Use Comments) (SA1),8374
9,Catch Basin Search (SC2),4156


# Dropping records created after the year 2020

In [41]:
# report the current number of street flooding records and the range of
# their created_date values
record_count = len(flooding_df)
created_dates = flooding_df['created_date']

print('Number of total records: {:,}\n'.format(record_count))
print('min date:', created_dates.min())
print('max date:', created_dates.max())

Number of total records: 33,854

min date: 2010-01-02T08:26:00.000
max date: 2022-10-09T14:05:00.000


In [42]:
# preview the latest records with created_date below '2021'
# (flooding_df itself is not modified here)
created_before_2021 = flooding_df['created_date'] < '2021'
flooding_df.loc[created_before_2021].sort_values('created_date', ascending=False).head()

Unnamed: 0,unique_key,created_date,agency,agency_name,complaint_type,descriptor,incident_zip,incident_address,street_name,cross_street_1,...,location,intersection_street_1,intersection_street_2,closed_date,resolution_description,resolution_action_updated_date,location_type,landmark,facility_type,due_date
19927,48542220,2020-12-31T15:41:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),11420,117-17 135 STREET,135 STREET,FOCH BLVD,...,"{'latitude': '40.67703755925495', 'longitude':...",,,2021-01-01T00:20:00.000,Please call 311 for further information. If yo...,2021-01-01T00:20:00.000,,,,
19929,48536430,2020-12-31T14:49:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),11357,20-24 150 STREET,150 STREET,20 AVE,...,"{'latitude': '40.78072630540092', 'longitude':...",,,2021-01-04T10:15:00.000,The Department of Environment Protection inspe...,2021-01-04T10:15:00.000,,,,
19932,48539361,2020-12-31T14:03:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),11228,7223 8 AVENUE,8 AVENUE,72 ST,...,"{'latitude': '40.62849640806448', 'longitude':...",,,2021-01-02T11:25:00.000,The Department of Environmental Protection has...,2021-01-02T11:25:00.000,,,,
19933,48543132,2020-12-31T13:48:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),10032,,,,...,"{'latitude': '40.841051689545516', 'longitude'...",RIVERSIDE DRIVE,WEST 165 STREET,2020-12-31T14:50:00.000,Please call 311 for further information. If yo...,2020-12-31T14:50:00.000,,,,
19934,48536441,2020-12-31T13:10:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),11234,3123 FILLMORE AVENUE,FILLMORE AVENUE,E 31 ST,...,"{'latitude': '40.609203447399906', 'longitude'...",,,2021-01-03T10:45:00.000,The Department of Environmental Protection ins...,2021-01-03T10:45:00.000,,,,


In [43]:
# keep only the records with created_date below '2021', then re-check the
# record count and date range
created_before_2021 = flooding_df['created_date'] < '2021'
flooding_df = flooding_df.loc[created_before_2021]

created_dates = flooding_df['created_date']
print('Number of total records: {:,}\n'.format(len(flooding_df)))
print('min date:', created_dates.min())
print('max date:', created_dates.max())

Number of total records: 27,902

min date: 2010-01-02T08:26:00.000
max date: 2020-12-31T15:41:00.000


In [44]:
# writing output file as a csv
flooding_df.to_csv('data/street-flooding-complaints.csv', index=False)

# listing items in data folder
%ls data/

README.md                           raw_streets_clipped.json
[34mdata-dictionaries[m[m/                  street-flooding-complaints.csv
raw_street_flooding_complaints.csv  streets-clipped.json
