# Querying 311 Street Flooding Complaints Using Sodapy 

Mark Bauer

In [1]:
# importing libraries
import pandas as pd
import numpy as np
from sodapy import Socrata
import os

In [2]:
# An app token is optional. To supply one, export it in the shell before
# starting the notebook (or add the line to your .bashrc):
# $ export SODAPY_APPTOKEN=<token>
# When the variable is not set, the token is None.
socrata_token = os.getenv("SODAPY_APPTOKEN")

# NYC Open Data domain and the id of the 311 dataset queried in this notebook
socrata_dataset_identifier = 'erm2-nwe9'
socrata_domain = 'data.cityofnewyork.us'

# Understanding the sodapy client

In [3]:
# open a client against the configured domain and fetch the dataset's metadata
client = Socrata(socrata_domain, socrata_token)
metadata = client.get_metadata(socrata_dataset_identifier)

# report what kind of object came back and how many entries it holds
metadata_type = type(metadata)
metadata_size = len(metadata)
print('type: {}'.format(metadata_type))
print('count of items: {}'.format(metadata_size))



type: <class 'dict'>
count of items: 41


In [4]:
# list the top-level keys of the metadata dict, one per line
for field in metadata:
    print(field)

id
name
assetType
attribution
averageRating
category
createdAt
description
displayType
downloadCount
hideFromCatalog
hideFromDataJson
indexUpdatedAt
newBackend
numberOfComments
oid
provenance
publicationAppendEnabled
publicationDate
publicationGroup
publicationStage
rowClass
rowIdentifierColumnId
rowsUpdatedAt
rowsUpdatedBy
tableId
totalTimesRated
viewCount
viewLastModified
viewType
approvals
clientContext
columns
grants
metadata
owner
query
rights
tableAuthor
tags
flags


In [5]:
# look closer at the 'columns' entry: show its container type and size,
# then display its first element
columns_meta = metadata['columns']
print('type: {}'.format(type(columns_meta)))
print('length: {}'.format(len(columns_meta)))
columns_meta[0]

type: <class 'list'>
length: 46


{'id': 354922030,
 'name': 'Unique Key',
 'dataTypeName': 'text',
 'description': 'Unique identifier of a Service Request (SR) in the open data set\n',
 'fieldName': 'unique_key',
 'position': 1,
 'renderTypeName': 'text',
 'tableColumnId': 1567787,
 'width': 220,
 'cachedContents': {'largest': '52892093',
  'non_null': '27382103',
  'null': '0',
  'top': [{'item': '10693408', 'count': '1'},
   {'item': '10836749', 'count': '1'},
   {'item': '10836967', 'count': '1'},
   {'item': '11051177', 'count': '1'},
   {'item': '11413576', 'count': '1'},
   {'item': '11463895', 'count': '1'},
   {'item': '11463896', 'count': '1'},
   {'item': '11464334', 'count': '1'},
   {'item': '11464394', 'count': '1'},
   {'item': '11464467', 'count': '1'},
   {'item': '11464508', 'count': '1'},
   {'item': '11464509', 'count': '1'},
   {'item': '11464521', 'count': '1'},
   {'item': '11464567', 'count': '1'},
   {'item': '11464572', 'count': '1'},
   {'item': '11464639', 'count': '1'},
   {'item': '1146484

In [6]:
# display name of every column, in the order the metadata lists them
[column['name'] for column in metadata['columns']]

['Unique Key',
 'Created Date',
 'Closed Date',
 'Agency',
 'Agency Name',
 'Complaint Type',
 'Descriptor',
 'Location Type',
 'Incident Zip',
 'Incident Address',
 'Street Name',
 'Cross Street 1',
 'Cross Street 2',
 'Intersection Street 1',
 'Intersection Street 2',
 'Address Type',
 'City',
 'Landmark',
 'Facility Type',
 'Status',
 'Due Date',
 'Resolution Description',
 'Resolution Action Updated Date',
 'Community Board',
 'BBL',
 'Borough',
 'X Coordinate (State Plane)',
 'Y Coordinate (State Plane)',
 'Open Data Channel Type',
 'Park Facility Name',
 'Park Borough',
 'Vehicle Type',
 'Taxi Company Borough',
 'Taxi Pick Up Location',
 'Bridge Highway Name',
 'Bridge Highway Direction',
 'Road Ramp',
 'Bridge Highway Segment',
 'Latitude',
 'Longitude',
 'Location',
 'Zip Codes',
 'Community Districts',
 'Borough Boundaries',
 'City Council Districts',
 'Police Precincts']

In [7]:
# API field name of every column, in the order the metadata lists them
[column['fieldName'] for column in metadata['columns']]

['unique_key',
 'created_date',
 'closed_date',
 'agency',
 'agency_name',
 'complaint_type',
 'descriptor',
 'location_type',
 'incident_zip',
 'incident_address',
 'street_name',
 'cross_street_1',
 'cross_street_2',
 'intersection_street_1',
 'intersection_street_2',
 'address_type',
 'city',
 'landmark',
 'facility_type',
 'status',
 'due_date',
 'resolution_description',
 'resolution_action_updated_date',
 'community_board',
 'bbl',
 'borough',
 'x_coordinate_state_plane',
 'y_coordinate_state_plane',
 'open_data_channel_type',
 'park_facility_name',
 'park_borough',
 'vehicle_type',
 'taxi_company_borough',
 'taxi_pick_up_location',
 'bridge_highway_name',
 'bridge_highway_direction',
 'road_ramp',
 'bridge_highway_segment',
 'latitude',
 'longitude',
 'location',
 ':@computed_region_efsh_h5xi',
 ':@computed_region_f5dn_yrer',
 ':@computed_region_yeji_bk3q',
 ':@computed_region_92fq_4b7q',
 ':@computed_region_sbqj_enih']

In [8]:
# pull the metadata entry whose display name is 'Complaint Type' and show it
meta_amount = [
    column
    for column in metadata['columns']
    if column['name'] == 'Complaint Type'
]
meta_amount[0]

{'id': 354922035,
 'name': 'Complaint Type',
 'dataTypeName': 'text',
 'description': 'This is the first level of a hierarchy identifying the topic of the incident or condition. Complaint Type may have a corresponding Descriptor (below) or may stand alone.',
 'fieldName': 'complaint_type',
 'position': 6,
 'renderTypeName': 'text',
 'tableColumnId': 1567792,
 'width': 268,
 'cachedContents': {'largest': 'ZTESTINT',
  'non_null': '27382103',
  'null': '0',
  'top': [{'item': 'Noise - Residential', 'count': '2575599'},
   {'item': 'HEAT/HOT WATER', 'count': '1589587'},
   {'item': 'Illegal Parking', 'count': '1424426'},
   {'item': 'Blocked Driveway', 'count': '1188263'},
   {'item': 'Street Condition', 'count': '1090368'},
   {'item': 'Street Light Condition', 'count': '1035400'},
   {'item': 'HEATING', 'count': '887869'},
   {'item': 'Noise - Street/Sidewalk', 'count': '863368'},
   {'item': 'PLUMBING', 'count': '780618'},
   {'item': 'Water System', 'count': '739604'},
   {'item': 'UN

In [9]:
# pull the metadata entry whose display name is 'Descriptor' and show it
meta_amount = [
    column
    for column in metadata['columns']
    if column['name'] == 'Descriptor'
]
meta_amount[0]

{'id': 354922036,
 'name': 'Descriptor',
 'dataTypeName': 'text',
 'description': 'This is  associated to the Complaint Type, and provides further detail on the incident or condition. Descriptor values are dependent on the Complaint Type, and are not always required in SR. \n',
 'fieldName': 'descriptor',
 'position': 7,
 'renderTypeName': 'text',
 'tableColumnId': 1567793,
 'width': 220,
 'cachedContents': {'largest': 'Zoning - Non-Conforming/Illegal Vehicle Storage',
  'non_null': '27312254',
  'null': '69849',
  'top': [{'item': 'Loud Music/Party', 'count': '2779118'},
   {'item': 'ENTIRE BUILDING', 'count': '1036777'},
   {'item': 'No Access', 'count': '887751'},
   {'item': 'HEAT', 'count': '868960'},
   {'item': 'Street Light Out', 'count': '766221'},
   {'item': 'Banging/Pounding', 'count': '696263'},
   {'item': 'Pothole', 'count': '660455'},
   {'item': 'APARTMENT ONLY', 'count': '552810'},
   {'item': 'Loud Talking', 'count': '407296'},
   {'item': 'Blocked Hydrant', 'count':

# Preview and explore the dataset
Group and count 311 complaints by `complaint_type`

In [10]:
# practice query using the sodapy client and basic query format

# SoQL: count complaints per complaint_type, most frequent first.
# The row limit is manually forced to a high value that includes ~all rows.
query = """
SELECT 
    complaint_type, 
    count(complaint_type)   
GROUP BY 
    complaint_type   
ORDER BY 
    count(complaint_type) DESC
LIMIT
    10000000
"""

# Use the domain and dataset id configured near the top of the notebook
# rather than repeating the literals, and let the context manager close
# the client once the request has finished.
with Socrata(socrata_domain, socrata_token, timeout=1000) as client:
    # Returned as JSON from API / converted to Python list of
    # dictionaries by sodapy
    results = client.get(socrata_dataset_identifier, query=query)

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)

print('shape of data: {}'.format(results_df.shape))
results_df.head(10)



shape of data: (484, 2)


Unnamed: 0,complaint_type,count_complaint_type
0,Noise - Residential,2908269
1,HEAT/HOT WATER,1773144
2,Illegal Parking,1758381
3,Blocked Driveway,1315567
4,Street Condition,1159061
5,Street Light Condition,1083353
6,Request Large Bulky Item Collection,1073753
7,Noise - Street/Sidewalk,1002924
8,HEATING,887869
9,PLUMBING,843818


Group and count 311 complaints by `descriptor`

In [11]:
# SoQL: count complaints per descriptor, most frequent first.
# The high LIMIT is there so that ~all grouped rows come back.
query = """
SELECT 
    descriptor, 
    count(descriptor)   
GROUP BY 
    descriptor    
ORDER BY 
    count(descriptor) DESC
LIMIT
    10000000    
"""

# Use the domain and dataset id configured near the top of the notebook
# rather than repeating the literals, and let the context manager close
# the client once the request has finished.
with Socrata(socrata_domain, socrata_token, timeout=1000) as client:
    results = client.get(socrata_dataset_identifier, query=query)

# list of dicts returned by sodapy -> DataFrame
results_df = pd.DataFrame.from_records(results)

print('shape of data: {}'.format(results_df.shape))
results_df.head(10)



shape of data: (1879, 2)


Unnamed: 0,descriptor,count_descriptor
0,Loud Music/Party,3182410
1,ENTIRE BUILDING,1156393
2,Request Large Bulky Item Collection,1073753
3,No Access,983330
4,HEAT,868960
5,Street Light Out,799978
6,Banging/Pounding,774305
7,Pothole,703719
8,APARTMENT ONLY,616751
9,Blocked Hydrant,490203


Group and count by `complaint_type` where the complaint type contains the word `flood`

In [12]:
# SoQL: count complaints per complaint_type, restricted to complaint types
# whose lower-cased text contains 'flood'.
query = """
SELECT 
    complaint_type, 
    count(complaint_type)
WHERE 
    LOWER(complaint_type) LIKE '%flood%'   
GROUP BY 
    complaint_type
ORDER BY 
    count(complaint_type) DESC
LIMIT
    10000000    
"""

# Use the domain and dataset id configured near the top of the notebook
# rather than repeating the literals, and let the context manager close
# the client once the request has finished.
with Socrata(socrata_domain, socrata_token, timeout=1000) as client:
    results = client.get(socrata_dataset_identifier, query=query)

# list of dicts returned by sodapy -> DataFrame
results_df = pd.DataFrame.from_records(results)

print('shape of data: {}'.format(results_df.shape))
results_df



shape of data: (0, 0)


Group and count by `descriptor` where the descriptor contains the word `flood`

In [13]:
# SoQL: count complaints per descriptor, restricted to descriptors
# whose lower-cased text contains 'flood'.
query = """
SELECT 
    descriptor, 
    count(descriptor) 
WHERE 
    LOWER(descriptor) LIKE '%flood%' 
GROUP BY 
    descriptor  
ORDER BY 
    count(descriptor) DESC
LIMIT
    10000000    
"""

# Use the domain and dataset id configured near the top of the notebook
# rather than repeating the literals, and let the context manager close
# the client once the request has finished.
with Socrata(socrata_domain, socrata_token, timeout=1000) as client:
    results = client.get(socrata_dataset_identifier, query=query)

# list of dicts returned by sodapy -> DataFrame
results_df = pd.DataFrame.from_records(results)

print('shape of data: {}'.format(results_df.shape))
results_df



shape of data: (11, 2)


Unnamed: 0,descriptor,count_descriptor
0,Catch Basin Clogged/Flooding (Use Comments) (SC),104223
1,Street Flooding (SJ),33930
2,Flood Light Lamp Out,6280
3,Highway Flooding (SH),3022
4,Flood Light Lamp Cycling,2555
5,Ready NY - Flooding,271
6,Flood Light Lamp Dayburning,217
7,Flood Light Lamp Missing,203
8,Flood Light Lamp Dim,181
9,RAIN GARDEN FLOODING (SRGFLD),152


Select all rows where `descriptor` has the word `flood` in it

In [14]:
# SoQL: fetch every column of every row whose lower-cased descriptor
# contains 'flood'. The high LIMIT is there so that ~all matching rows
# come back.
query = """
SELECT 
    *
WHERE 
    LOWER(descriptor) LIKE '%flood%'
LIMIT
    10000000
"""

# Use the domain and dataset id configured near the top of the notebook
# rather than repeating the literals, and let the context manager close
# the client once the request has finished.
with Socrata(socrata_domain, socrata_token, timeout=1000) as client:
    results = client.get(socrata_dataset_identifier, query=query)

# list of dicts returned by sodapy -> DataFrame; later cells derive
# flooding_df from this frame
results_df = pd.DataFrame.from_records(results)

print('shape of data: {}'.format(results_df.shape))
results_df.head()



shape of data: (151087, 34)


Unnamed: 0,unique_key,created_date,agency,agency_name,complaint_type,descriptor,incident_zip,incident_address,street_name,cross_street_1,...,location,intersection_street_1,intersection_street_2,closed_date,resolution_description,resolution_action_updated_date,location_type,landmark,facility_type,due_date
0,55804797,2022-10-25T21:20:00.000,DEP,Department of Environmental Protection,Sewer,Catch Basin Clogged/Flooding (Use Comments) (SC),11249,717 WYTHE AVENUE,WYTHE AVENUE,WILLIAMSBURG ST E,...,"{'latitude': '40.70103817290395', 'longitude':...",,,,,,,,,
1,55801297,2022-10-25T20:32:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),11237,1365 HALSEY STREET,HALSEY STREET,IRVING AVE,...,"{'latitude': '40.695338363340475', 'longitude'...",,,,,,,,,
2,55803640,2022-10-25T19:05:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),10029,232 EAST 103 STREET,EAST 103 STREET,DEAD END,...,"{'latitude': '40.78895133215942', 'longitude':...",,,,,,,,,
3,55803633,2022-10-25T18:20:00.000,DEP,Department of Environmental Protection,Sewer,Catch Basin Clogged/Flooding (Use Comments) (SC),11365,48-04 190 STREET,190 STREET,48 AVE,...,"{'latitude': '40.74913937432443', 'longitude':...",,,,,,,,,
4,55804798,2022-10-25T15:29:00.000,DEP,Department of Environmental Protection,Sewer,Catch Basin Clogged/Flooding (Use Comments) (SC),11225,1009 PRESIDENT STREET,PRESIDENT STREET,FRANKLIN AVE,...,"{'latitude': '40.668769635876465', 'longitude'...",,,,,,,,,


In [15]:
# number of fetched rows per distinct descriptor value
results_df['descriptor'].value_counts()

Catch Basin Clogged/Flooding (Use Comments) (SC)    104223
Street Flooding (SJ)                                 33930
Flood Light Lamp Out                                  6280
Highway Flooding (SH)                                 3022
Flood Light Lamp Cycling                              2555
Ready NY - Flooding                                    271
Flood Light Lamp Dayburning                            217
Flood Light Lamp Missing                               203
Flood Light Lamp Dim                                   181
RAIN GARDEN FLOODING (SRGFLD)                          152
Flooded                                                 53
Name: descriptor, dtype: int64

In [16]:
# keep only the rows whose descriptor is exactly 'Street Flooding (SJ)'
# and renumber them from zero
is_street_flooding = results_df['descriptor'] == 'Street Flooding (SJ)'
flooding_df = results_df.loc[is_street_flooding].reset_index(drop=True)

flooding_df.head()

Unnamed: 0,unique_key,created_date,agency,agency_name,complaint_type,descriptor,incident_zip,incident_address,street_name,cross_street_1,...,location,intersection_street_1,intersection_street_2,closed_date,resolution_description,resolution_action_updated_date,location_type,landmark,facility_type,due_date
0,55801297,2022-10-25T20:32:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),11237,1365 HALSEY STREET,HALSEY STREET,IRVING AVE,...,"{'latitude': '40.695338363340475', 'longitude'...",,,,,,,,,
1,55803640,2022-10-25T19:05:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),10029,232 EAST 103 STREET,EAST 103 STREET,DEAD END,...,"{'latitude': '40.78895133215942', 'longitude':...",,,,,,,,,
2,55803639,2022-10-25T12:17:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),10462,,,,...,"{'latitude': '40.8446812959928', 'longitude': ...",MATTHEWS AVENUE,VAN NEST AVENUE,2022-10-25T12:29:00.000,The status of this Service Request is currentl...,2022-10-25T12:29:00.000,,,,
3,55798929,2022-10-25T11:44:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),11204,296 AVENUE P,AVENUE P,W 4 ST,...,"{'latitude': '40.608367771259786', 'longitude'...",,,,,,,,,
4,55798930,2022-10-25T11:17:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),10312,11 NOEL STREET,NOEL STREET,WOODS OF ARDEN RD,...,"{'latitude': '40.53294926239822', 'longitude':...",,,,,,,,,


In [17]:
# sanity check: rows per descriptor value in the filtered frame
flooding_df['descriptor'].value_counts()

Street Flooding (SJ)    33930
Name: descriptor, dtype: int64

In [18]:
# rows per complaint_type value in the filtered frame
flooding_df['complaint_type'].value_counts()

Sewer    33930
Name: complaint_type, dtype: int64

Briefly reviewing which descriptors appear where `complaint_type == 'Sewer'`

In [19]:
# SoQL: count complaints per descriptor within complaint_type 'Sewer',
# most frequent first.
query = """
SELECT 
    descriptor, 
    count(descriptor) 
WHERE 
    complaint_type='Sewer'  
GROUP BY 
    descriptor  
ORDER BY 
    count(descriptor) DESC
LIMIT
    10000000 
"""

# Use the domain and dataset id configured near the top of the notebook
# rather than repeating the literals, and let the context manager close
# the client once the request has finished.
with Socrata(socrata_domain, socrata_token, timeout=1000) as client:
    results = client.get(socrata_dataset_identifier, query=query)

# Stored under its own name so that results_df (the flood-related rows that
# flooding_df is derived from) is not overwritten by this unrelated aggregate;
# otherwise re-running the flooding_df cell would silently pick up this table.
sewer_descriptors_df = pd.DataFrame.from_records(results)

print(sewer_descriptors_df.shape)
sewer_descriptors_df.head(len(sewer_descriptors_df))



(30, 2)


Unnamed: 0,descriptor,count_descriptor
0,Sewer Backup (Use Comments) (SA),177256
1,Catch Basin Clogged/Flooding (Use Comments) (SC),104223
2,Street Flooding (SJ),33930
3,Catch Basin Sunken/Damaged/Raised (SC1),31363
4,Manhole Cover Broken/Making Noise (SB),19801
5,Manhole Cover Missing (Emergency) (SA3),19529
6,Sewer Odor (SA2),17894
7,Defective/Missing Curb Piece (SC4),9087
8,Manhole Overflow (Use Comments) (SA1),8409
9,Catch Basin Search (SC2),4156


# Removing records created after 2020

In [20]:
# row count and the earliest / latest created_date before filtering
record_count = len(flooding_df)
created_dates = flooding_df['created_date']

print('Number of total records: {:,}\n'.format(record_count))

print('min date:', created_dates.min())
print('max date:', created_dates.max())

Number of total records: 33,930

min date: 2010-01-02T08:26:00.000
max date: 2022-10-25T20:32:00.000


In [21]:
# previewing data: newest complaints among those created before 2021
# (created_date is compared against the string '2021')
created_before_2021 = flooding_df.created_date < '2021'
pre_2021_newest_first = (
    flooding_df
    .loc[created_before_2021]
    .sort_values(by='created_date', ascending=False)
)
pre_2021_newest_first.head()

Unnamed: 0,unique_key,created_date,agency,agency_name,complaint_type,descriptor,incident_zip,incident_address,street_name,cross_street_1,...,location,intersection_street_1,intersection_street_2,closed_date,resolution_description,resolution_action_updated_date,location_type,landmark,facility_type,due_date
6028,48542220,2020-12-31T15:41:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),11420,117-17 135 STREET,135 STREET,FOCH BLVD,...,"{'latitude': '40.67703755925495', 'longitude':...",,,2021-01-01T00:20:00.000,Please call 311 for further information. If yo...,2021-01-01T00:20:00.000,,,,
6029,48536430,2020-12-31T14:49:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),11357,20-24 150 STREET,150 STREET,20 AVE,...,"{'latitude': '40.78072630540092', 'longitude':...",,,2021-01-04T10:15:00.000,The Department of Environment Protection inspe...,2021-01-04T10:15:00.000,,,,
6030,48539361,2020-12-31T14:03:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),11228,7223 8 AVENUE,8 AVENUE,72 ST,...,"{'latitude': '40.62849640806448', 'longitude':...",,,2021-01-02T11:25:00.000,The Department of Environmental Protection has...,2021-01-02T11:25:00.000,,,,
6031,48543132,2020-12-31T13:48:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),10032,,,,...,"{'latitude': '40.841051689545516', 'longitude'...",RIVERSIDE DRIVE,WEST 165 STREET,2020-12-31T14:50:00.000,Please call 311 for further information. If yo...,2020-12-31T14:50:00.000,,,,
6032,48536441,2020-12-31T13:10:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),11234,3123 FILLMORE AVENUE,FILLMORE AVENUE,E 31 ST,...,"{'latitude': '40.609203447399906', 'longitude'...",,,2021-01-03T10:45:00.000,The Department of Environmental Protection ins...,2021-01-03T10:45:00.000,,,,


In [22]:
# keep only complaints whose created_date sorts before '2021'
created_before_2021 = flooding_df.created_date < '2021'
flooding_df = flooding_df.loc[created_before_2021]

# row count and date range after filtering
remaining_dates = flooding_df['created_date']
print('Number of total records: {:,}\n'.format(len(flooding_df)))

print('min date:', remaining_dates.min())
print('max date:', remaining_dates.max())

Number of total records: 27,902

min date: 2010-01-02T08:26:00.000
max date: 2020-12-31T15:41:00.000


In [23]:
# writing output file as a csv
flooding_df.to_csv('data/street-flooding-complaints.csv', index=False)

# listing items in data folder
%ls data/

README.md                           raw_streets_clipped.json
[34mdata-dictionaries[m[m/                  street-flooding-complaints.csv
raw_street_flooding_complaints.csv  streets-clipped.json
