# Downloading NYC 311 Street Flooding Complaints Data Using Sodapy 

Author: Mark Bauer

In [1]:
# importing libraries
import os
import numpy as np
import pandas as pd
from sodapy import Socrata
from datetime import datetime

In [2]:
# reproducibility: record the Python/IPython versions and the versions of the
# packages this notebook relies on, so a later run can be compared against this one
%reload_ext watermark
%watermark -v -p numpy,pandas,sodapy

Python implementation: CPython
Python version       : 3.11.0
IPython version      : 8.6.0

numpy : 1.23.4
pandas: 1.5.1
sodapy: 2.2.0



In [3]:
# Stamp the run: the 311 dataset changes over time, so the date of retrieval
# is printed alongside the results.
current_date = datetime.now()
print(f"The data was retrieved on {current_date:%Y-%m-%d}.")

The data was retrieved on 2025-04-06.


# Sodapy
The Python client for the Socrata API.

In [4]:
# where the data lives: the NYC Open Data domain and the id of the 311 dataset
socrata_domain, socrata_dataset_identifier = (
    'data.cityofnewyork.us',
    'erm2-nwe9',
)

For now, we'll use the Socrata API without a token, but the steps to use one are below.

In [5]:
# To use an app token, export it in the terminal before starting Jupyter
# (or add the line to your .bashrc):
#   $ export SODAPY_APPTOKEN=<token>
# It can then be read in Python with:
# socrata_token = os.environ.get("SODAPY_APPTOKEN")

# Preview and Explore the NYC 311 Dataset
Practice querying data using the sodapy client.

In [6]:
# Socrata object to fetch data; every query below goes through this client.
# The app token is read from the SODAPY_APPTOKEN environment variable. When the
# variable is not set, os.environ.get returns None and the client is created
# without a token, which is what this cell did before.
client = Socrata(
    domain=socrata_domain,
    app_token=os.environ.get("SODAPY_APPTOKEN"),
    timeout=1000  # request timeout; presumably set high for the large SELECT * pulls below
)

# inspect
print(client)



<sodapy.socrata.Socrata object at 0x10c5cbcd0>


We use the sodapy `get` method to fetch data. Pass the `dataset id` and the `query` to this method.

**Table xx:** Number of NYC 311 Complaints by Complaint Type.

In [7]:
# SoQL query: number of complaints per complaint_type, largest first, top 20.
# There is no FROM clause; the dataset is chosen by the id passed to client.get.
query = """
    SELECT 
        complaint_type, 
        count(complaint_type) AS count  
    GROUP BY 
        complaint_type   
    ORDER BY 
        count(complaint_type) DESC
    LIMIT
        20
"""

# returned as JSON from the API; sodapy converts it to a Python list of dictionaries
results = client.get(socrata_dataset_identifier, query=query)

# convert to pandas DataFrame (one row per dictionary)
results_df = pd.DataFrame(results)

print(f'shape of data: {results_df.shape}')
results_df

shape of data: (20, 2)


Unnamed: 0,complaint_type,count
0,Noise - Residential,3763852
1,Illegal Parking,2963250
2,HEAT/HOT WATER,2496471
3,Blocked Driveway,1726104
4,Noise - Street/Sidewalk,1345355
5,Street Condition,1326391
6,Street Light Condition,1166688
7,Request Large Bulky Item Collection,1073753
8,PLUMBING,1002879
9,UNSANITARY CONDITION,964080


**Table xx:** Number of NYC 311 Complaints by Descriptor.

In [8]:
# SoQL query: number of complaints per descriptor, largest first, top 20
query = """
    SELECT 
        descriptor, 
        count(descriptor) AS count 
    GROUP BY 
        descriptor    
    ORDER BY 
        count(descriptor) DESC
    LIMIT
        20
"""

# fetch the aggregated rows and load them into a DataFrame
results = client.get(socrata_dataset_identifier, query=query)
results_df = pd.DataFrame(results)

print(f'shape of data: {results_df.shape}')
results_df

shape of data: (20, 2)


Unnamed: 0,descriptor,count
0,Loud Music/Party,4116161
1,ENTIRE BUILDING,1630913
2,No Access,1279789
3,Request Large Bulky Item Collection,1073753
4,Banging/Pounding,1052889
5,HEAT,868960
6,APARTMENT ONLY,865558
7,Street Light Out,858510
8,Blocked Hydrant,827115
9,Pothole,800141


**Table xx:** Number of NYC 311 Complaints by Complaint Type and Descriptor.

In [9]:
# SoQL query: number of complaints per (complaint_type, descriptor) pair,
# largest first, top 20
query = """
    SELECT 
        complaint_type,
        descriptor, 
        count(*) AS count  
    GROUP BY 
        complaint_type, descriptor   
    ORDER BY 
        count(*) DESC
    LIMIT
        20
"""

# fetch the aggregated rows and load them into a DataFrame
results = client.get(socrata_dataset_identifier, query=query)
results_df = pd.DataFrame(results)

print(f'shape of data: {results_df.shape}')
results_df

shape of data: (20, 3)


Unnamed: 0,complaint_type,descriptor,count
0,Noise - Residential,Loud Music/Party,2483557
1,HEAT/HOT WATER,ENTIRE BUILDING,1630913
2,Blocked Driveway,No Access,1279783
3,Request Large Bulky Item Collection,Request Large Bulky Item Collection,1073753
4,Noise - Street/Sidewalk,Loud Music/Party,1050249
5,Noise - Residential,Banging/Pounding,985237
6,HEATING,HEAT,868960
7,HEAT/HOT WATER,APARTMENT ONLY,865558
8,Street Light Condition,Street Light Out,858510
9,Illegal Parking,Blocked Hydrant,825059


**Table xx:** Number of NYC 311 Complaints by Complaint Type Where Complaint Type Contains the Word *flood*.

In [10]:
# SoQL query: complaint types whose name contains "flood" (case-insensitive),
# most frequent first. The aggregate is aliased AS count so the result column
# has the same name as in the other summary queries in this notebook.
query = """
    SELECT
        complaint_type,
        count(complaint_type) AS count
    WHERE
        LOWER(complaint_type) LIKE '%flood%'
    GROUP BY
        complaint_type
    ORDER BY
        count(complaint_type) DESC
    LIMIT
        10
"""

results = client.get(socrata_dataset_identifier, query=query)
results_df = pd.DataFrame(results)

# a shape of (0, 0) means that no complaint_type matched the pattern
print(f'shape of data: {results_df.shape}')
results_df

shape of data: (0, 0)


**Table xx:** Number of NYC 311 Complaints by Descriptor Where Descriptor Contains the Word *flood*.

In [11]:
# SoQL query: descriptors containing "flood" (case-insensitive), most frequent
# first, top 20. The match is a plain substring test, so unrelated descriptors
# such as the "Flood Light ..." street-light ones are returned as well.
query = """
    SELECT 
        descriptor, 
        count(descriptor) AS count
    WHERE 
        LOWER(descriptor) LIKE '%flood%' 
    GROUP BY 
        descriptor  
    ORDER BY 
        count(descriptor) DESC
    LIMIT
        20 
"""

# fetch the aggregated rows and load them into a DataFrame
results = client.get(socrata_dataset_identifier, query=query)
results_df = pd.DataFrame(results)

print(f'shape of data: {results_df.shape}')
results_df

shape of data: (13, 2)


Unnamed: 0,descriptor,count
0,Catch Basin Clogged/Flooding (Use Comments) (SC),119775
1,Street Flooding (SJ),41687
2,Flood Light Lamp Out,6634
3,Highway Flooding (SH),3192
4,Flood Light Lamp Cycling,2621
5,Flooding on Street,673
6,Ready NY - Flooding,271
7,Flood Light Lamp Dayburning,242
8,Flood Light Lamp Missing,216
9,Flood Light Lamp Dim,198


# 311 Street Flooding Complaints

Select all rows where `descriptor` is `Street Flooding (SJ)` and `created_date` is between 2010 and 2020. We set the limit to 100,000 as a generous upper bound on the number of matching rows, so that all of them are returned.

In [12]:
# Pull every "Street Flooding (SJ)" complaint created from 2010 through 2019.
#
# The date filter is a half-open range [2010-01-01, 2020-01-01). BETWEEN is
# inclusive on both ends, so an upper bound of '2020' would also match a
# complaint stamped exactly 2020-01-01T00:00:00.
#
# A LIMIT must be supplied to get more than the API's default page of rows,
# so a number well above the expected row count is used.
row_limit = 100000

query = f"""
    SELECT
        *
    WHERE
        descriptor = 'Street Flooding (SJ)'
        AND created_date >= '2010-01-01T00:00:00'
        AND created_date < '2020-01-01T00:00:00'
    LIMIT
        {row_limit}
"""

results = client.get(socrata_dataset_identifier, query=query)
results_df = pd.DataFrame(results)

# if the limit is reached, the limit (not the filter) decided how many rows
# came back, and the download may be incomplete
assert len(results_df) < row_limit, (
    f'query returned {row_limit:,} rows; results may be truncated, raise row_limit'
)

print(f'shape of data: {results_df.shape}')
results_df

shape of data: (25747, 32)


Unnamed: 0,unique_key,created_date,closed_date,agency,agency_name,complaint_type,descriptor,cross_street_1,cross_street_2,intersection_street_1,...,city,x_coordinate_state_plane,y_coordinate_state_plane,latitude,longitude,location,incident_address,street_name,bbl,due_date
0,18265181,2010-07-14T08:38:00.000,2010-07-14T08:38:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),PELHAM PKWY,STILLWELL AVE,PELHAM PKWY,...,,,,,,,,,,
1,34783066,2016-11-15T09:27:00.000,2016-11-15T10:05:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),,,LAFAYETTE AVENUE,...,STATEN ISLAND,958594,170855,40.635596930697716,-74.09243785251621,"{'latitude': '40.635596930697716', 'longitude'...",,,,
2,21549616,2011-09-29T10:34:00.000,2011-09-30T10:40:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),,,THURSBY AVE,...,,,,,,,,,,
3,35839080,2017-03-31T20:24:00.000,2017-04-01T02:25:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),3 AVENUE,2 AVENUE,,...,NEW YORK,,,,,,EAST 106 STREET,EAST 106 STREET,,
4,29443390,2014-12-06T10:23:00.000,2014-12-06T11:30:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),NAGLE AVE,DYCKMAN ST,NAGLE AVE,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25742,45073978,2019-12-03T10:54:00.000,2019-12-30T22:01:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),SEAVER AVE,IONA ST,,...,STATEN ISLAND,960864,149333,40.576529751013474,-74.08418458891498,"{'latitude': '40.576529751013474', 'longitude'...",753 QUINCY AVENUE,QUINCY AVENUE,5038260015,
25743,45054586,2019-11-30T13:00:00.000,2019-11-30T22:25:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),INWOOD ST,146 ST,,...,Jamaica,1041634,183601,40.67043310054083,-73.79313890742239,"{'latitude': '40.67043310054083', 'longitude':...",145-33 130 AVENUE,130 AVENUE,4120710063,
25744,26909821,2013-12-18T11:49:00.000,2013-12-19T10:25:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),82 PL,63 AVE,,...,MIDDLE VILLAGE,1019086,202652,40.72284104810696,-73.87432253616811,"{'latitude': '40.72284104810696', 'longitude':...",62-82 DRY HARBOR ROAD,DRY HARBOR ROAD,4029690030,
25745,43351039,2019-07-22T22:06:00.000,2019-07-23T10:50:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),2 AVE,9 ST,2 AVENUE,...,BROOKLYN,986087,184324,40.67260335878416,-73.99337766904543,"{'latitude': '40.67260335878416', 'longitude':...",,,,


In [13]:
# sanity checks
print(f'Number of total records: {results_df.shape[0]:,}.\n')
      
print('Min date:', results_df['created_date'].min())
print('Max date:', results_df['created_date'].max())

Number of total records: 25,747.

Min date: 2010-01-02T08:26:00.000
Max date: 2019-12-31T22:42:00.000


In [14]:
# summary of dataframe: column names, non-null counts and dtypes
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25747 entries, 0 to 25746
Data columns (total 32 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   unique_key                      25747 non-null  object
 1   created_date                    25747 non-null  object
 2   closed_date                     25746 non-null  object
 3   agency                          25747 non-null  object
 4   agency_name                     25747 non-null  object
 5   complaint_type                  25747 non-null  object
 6   descriptor                      25747 non-null  object
 7   cross_street_1                  22472 non-null  object
 8   cross_street_2                  22464 non-null  object
 9   intersection_street_1           9616 non-null   object
 10  intersection_street_2           9616 non-null   object
 11  address_type                    25741 non-null  object
 12  facility_type                   23290 non-null

In [15]:
# number of missing values per column, columns with the most missing first
results_df.isna().sum().sort_values(ascending=False)

due_date                          25746
intersection_street_1             16131
intersection_street_2             16131
bbl                               11144
incident_address                   9559
street_name                        9559
cross_street_2                     3283
cross_street_1                     3275
facility_type                      2457
latitude                            930
location                            930
x_coordinate_state_plane            930
y_coordinate_state_plane            930
longitude                           930
incident_zip                        848
city                                846
address_type                          6
resolution_description                4
closed_date                           1
park_borough                          0
unique_key                            0
park_facility_name                    0
open_data_channel_type                0
borough                               0
created_date                          0


# Write Out Data

In [16]:
# writing output file as a csv
outpath = '../data/street-flooding-complaints.csv'

# create the data folder when it is missing so to_csv does not fail
os.makedirs(os.path.dirname(outpath), exist_ok=True)
results_df.to_csv(outpath, index=False)

# sanity check, list items in data folder (pure Python, so it does not depend on the shell)
sorted(os.listdir(os.path.dirname(outpath)))

README.md                               streets-clipped.gpkg
street-flooding-complaints-cleaned.csv  streets.gpkg
street-flooding-complaints.csv          water-main-break-raw.csv
street-flooding-raw.csv                 water-main-breaks.csv


Sanity Check of CSV.

In [17]:
# read the file written above back from disk to confirm that it loads
# and that its shape matches the downloaded data
path = '../data/street-flooding-complaints.csv'
df = pd.read_csv(path, low_memory=False)

print(f'shape of data: {df.shape}')
df.head()

shape of data: (25747, 32)


Unnamed: 0,unique_key,created_date,closed_date,agency,agency_name,complaint_type,descriptor,cross_street_1,cross_street_2,intersection_street_1,...,city,x_coordinate_state_plane,y_coordinate_state_plane,latitude,longitude,location,incident_address,street_name,bbl,due_date
0,18265181,2010-07-14T08:38:00.000,2010-07-14T08:38:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),PELHAM PKWY,STILLWELL AVE,PELHAM PKWY,...,,,,,,,,,,
1,34783066,2016-11-15T09:27:00.000,2016-11-15T10:05:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),,,LAFAYETTE AVENUE,...,STATEN ISLAND,958594.0,170855.0,40.635597,-74.092438,"{'latitude': '40.635596930697716', 'longitude'...",,,,
2,21549616,2011-09-29T10:34:00.000,2011-09-30T10:40:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),,,THURSBY AVE,...,,,,,,,,,,
3,35839080,2017-03-31T20:24:00.000,2017-04-01T02:25:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),3 AVENUE,2 AVENUE,,...,NEW YORK,,,,,,EAST 106 STREET,EAST 106 STREET,,
4,29443390,2014-12-06T10:23:00.000,2014-12-06T11:30:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),NAGLE AVE,DYCKMAN ST,NAGLE AVE,...,,,,,,,,,,


In [18]:
# summary of columns after the CSV round trip: names, non-null counts and
# the dtypes inferred by read_csv
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25747 entries, 0 to 25746
Data columns (total 32 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   unique_key                      25747 non-null  int64  
 1   created_date                    25747 non-null  object 
 2   closed_date                     25746 non-null  object 
 3   agency                          25747 non-null  object 
 4   agency_name                     25747 non-null  object 
 5   complaint_type                  25747 non-null  object 
 6   descriptor                      25747 non-null  object 
 7   cross_street_1                  22472 non-null  object 
 8   cross_street_2                  22464 non-null  object 
 9   intersection_street_1           9616 non-null   object 
 10  intersection_street_2           9616 non-null   object 
 11  address_type                    25741 non-null  object 
 12  facility_type                   

Two columns, `facility_type` and `due_date`, should probably be dropped because of their percentage of missing data. We will look into this further in the data inspection notebook.

# Work in Progress
## Water Main Breaks in 311 Complaints
Understand the extent to which Water Main Break complaints appear in the Street Flooding Complaints data, and possibly filter them out of this analysis. We may use this later. For now, we'll at least write out this data.

In [19]:
# SoQL query: descriptors containing "water main break" (case-insensitive),
# most frequent first, top 20
query = """
    SELECT 
        descriptor, 
        count(descriptor) AS count
    WHERE 
        LOWER(descriptor) LIKE '%water main break%' 
    GROUP BY 
        descriptor  
    ORDER BY 
        count(descriptor) DESC
    LIMIT
        20 
"""

# fetch the aggregated rows and load them into a DataFrame
results = client.get(socrata_dataset_identifier, query=query)
results_df = pd.DataFrame(results)

print(f'shape of data: {results_df.shape}')
results_df

shape of data: (1, 2)


Unnamed: 0,descriptor,count
0,Possible Water Main Break (Use Comments) (WA1),42043


In [20]:
# Pull every "Possible Water Main Break (Use Comments) (WA1)" complaint created
# from 2010 through 2019.
#
# The date filter is a half-open range [2010-01-01, 2020-01-01). BETWEEN is
# inclusive on both ends, so an upper bound of '2020' would also match a
# complaint stamped exactly 2020-01-01T00:00:00.
#
# The LIMIT is set well above the expected row count so that all rows are returned.
row_limit = 100000

query = f"""
    SELECT
        *
    WHERE
        descriptor = 'Possible Water Main Break (Use Comments) (WA1)'
        AND created_date >= '2010-01-01T00:00:00'
        AND created_date < '2020-01-01T00:00:00'
    LIMIT
        {row_limit}
"""

results = client.get(socrata_dataset_identifier, query=query)
results_df = pd.DataFrame(results)

# if the limit is reached, the limit (not the filter) decided how many rows
# came back, and the download may be incomplete
assert len(results_df) < row_limit, (
    f'query returned {row_limit:,} rows; results may be truncated, raise row_limit'
)

print(f'shape of data: {results_df.shape}')
results_df.head()

shape of data: (22631, 32)


Unnamed: 0,unique_key,created_date,closed_date,agency,agency_name,complaint_type,descriptor,intersection_street_1,intersection_street_2,address_type,...,street_name,incident_zip,city,bbl,x_coordinate_state_plane,y_coordinate_state_plane,latitude,longitude,location,due_date
0,21347159,2011-09-01T08:34:00.000,2011-09-09T08:00:00.000,DEP,Department of Environmental Protection,Water System,Possible Water Main Break (Use Comments) (WA1),ROCKAWAY BLVD,BROOKVILLE BLVD,INTERSECTION,...,,,,,,,,,,
1,32951975,2016-03-21T14:56:00.000,2016-03-21T14:56:00.000,DEP,Department of Environmental Protection,Water System,Possible Water Main Break (Use Comments) (WA1),PARKVILLE AVE,E 8 ST,INTERSECTION,...,,,,,,,,,,
2,36947802,2017-08-14T09:51:00.000,2017-08-14T14:30:00.000,DEP,Department of Environmental Protection,Water System,Possible Water Main Break (Use Comments) (WA1),,,INTERSECTION,...,EDISON AVE,,,,,,,,,
3,37999718,2017-12-22T04:01:00.000,2017-12-22T14:30:00.000,DEP,Department of Environmental Protection,Water System,Possible Water Main Break (Use Comments) (WA1),68 ST,WOODSIDE AVE,INTERSECTION,...,,,,,,,,,,
4,33478457,2016-05-31T14:24:00.000,2016-05-31T17:00:00.000,DEP,Department of Environmental Protection,Water System,Possible Water Main Break (Use Comments) (WA1),,,ADDRESS,...,EAST 161 STREET,10456.0,BRONX,2026270001.0,1009687.0,238808.0,40.82211177009745,-73.90809432309149,"{'latitude': '40.82211177009745', 'longitude':...",


In [21]:
# sanity checks: how many records came back and the date range they span
# (created_date holds ISO-formatted strings here, so min/max compare them as text)
created_dates = results_df['created_date']

print(f'Number of total records: {len(results_df):,}.\n')

print('Min date:', created_dates.min())
print('Max date:', created_dates.max())

Number of total records: 22,631.

Min date: 2010-01-01T06:28:00.000
Max date: 2019-12-31T20:54:00.000


In [22]:
# summary of dataframe: column names, non-null counts and dtypes
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22631 entries, 0 to 22630
Data columns (total 32 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   unique_key                      22631 non-null  object
 1   created_date                    22631 non-null  object
 2   closed_date                     22630 non-null  object
 3   agency                          22631 non-null  object
 4   agency_name                     22631 non-null  object
 5   complaint_type                  22631 non-null  object
 6   descriptor                      22631 non-null  object
 7   intersection_street_1           7445 non-null   object
 8   intersection_street_2           7445 non-null   object
 9   address_type                    22627 non-null  object
 10  facility_type                   21112 non-null  object
 11  status                          22631 non-null  object
 12  resolution_description          22630 non-null

In [23]:
# writing output file as a csv
outpath = '../data/water-main-breaks.csv'

# create the data folder when it is missing so to_csv does not fail
os.makedirs(os.path.dirname(outpath), exist_ok=True)
results_df.to_csv(outpath, index=False)

# sanity check, list items in data folder (pure Python, so it does not depend on the shell)
sorted(os.listdir(os.path.dirname(outpath)))

README.md                               streets-clipped.gpkg
street-flooding-complaints-cleaned.csv  streets.gpkg
street-flooding-complaints.csv          water-main-break-raw.csv
street-flooding-raw.csv                 water-main-breaks.csv


In [24]:
# close sodapy client now that all downloads are finished
client.close()