In [51]:
# importing libraries
import pandas as pd
import numpy as np
from sodapy import Socrata
import os

In [52]:
# Socrata portal host and the identifier of the dataset this notebook queries.
socrata_domain = 'data.cityofnewyork.us'
socrata_dataset_identifier = 'erm2-nwe9'

# The app token is optional. To supply one, export it in the shell before
# starting Jupyter (or add the line to your .bashrc):
#   $ export SODAPY_APPTOKEN=<token>
# When the variable is not set, socrata_token is None.
socrata_token = os.getenv("SODAPY_APPTOKEN")

In [53]:
# Client for the configured Socrata domain. socrata_token is read from the
# SODAPY_APPTOKEN environment variable and is None when that variable is unset.
client = Socrata(socrata_domain, socrata_token)

In [54]:
# Fetch the dataset's metadata, then list the display name of every column.
metadata = client.get_metadata(socrata_dataset_identifier)
[column['name'] for column in metadata['columns']]

['Unique Key',
 'Created Date',
 'Closed Date',
 'Agency',
 'Agency Name',
 'Complaint Type',
 'Descriptor',
 'Location Type',
 'Incident Zip',
 'Incident Address',
 'Street Name',
 'Cross Street 1',
 'Cross Street 2',
 'Intersection Street 1',
 'Intersection Street 2',
 'Address Type',
 'City',
 'Landmark',
 'Facility Type',
 'Status',
 'Due Date',
 'Resolution Description',
 'Resolution Action Updated Date',
 'Community Board',
 'BBL',
 'Borough',
 'X Coordinate (State Plane)',
 'Y Coordinate (State Plane)',
 'Open Data Channel Type',
 'Park Facility Name',
 'Park Borough',
 'Vehicle Type',
 'Taxi Company Borough',
 'Taxi Pick Up Location',
 'Bridge Highway Name',
 'Bridge Highway Direction',
 'Road Ramp',
 'Bridge Highway Segment',
 'Latitude',
 'Longitude',
 'Location',
 'Zip Codes',
 'Community Districts',
 'Borough Boundaries',
 'City Council Districts',
 'Police Precincts']

In [55]:
# Pull out the metadata entry that describes the 'Complaint Type' column
# (the first match; indexing fails if the column is absent).
complaint_type_matches = [
    column for column in metadata['columns'] if column['name'] == 'Complaint Type'
]
meta_amount = complaint_type_matches[0]
meta_amount

{'id': 354922035,
 'name': 'Complaint Type',
 'dataTypeName': 'text',
 'description': 'This is the first level of a hierarchy identifying the topic of the incident or condition. Complaint Type may have a corresponding Descriptor (below) or may stand alone.',
 'fieldName': 'complaint_type',
 'position': 6,
 'renderTypeName': 'text',
 'tableColumnId': 1567792,
 'width': 268,
 'cachedContents': {'largest': 'ZTESTINT',
  'non_null': '25644113',
  'null': '0',
  'top': [{'item': 'Noise - Residential', 'count': '2225885'},
   {'item': 'HEAT/HOT WATER', 'count': '1407396'},
   {'item': 'Illegal Parking', 'count': '1117015'},
   {'item': 'Blocked Driveway', 'count': '1041075'},
   {'item': 'Street Condition', 'count': '1018764'},
   {'item': 'Street Light Condition', 'count': '981297'},
   {'item': 'HEATING', 'count': '887869'},
   {'item': 'PLUMBING', 'count': '744723'},
   {'item': 'Water System', 'count': '698995'},
   {'item': 'Noise - Street/Sidewalk', 'count': '680970'},
   {'item': 'Noi

In [56]:
# API field names of every column; these are the identifiers used in the
# queries further down (e.g. complaint_type, descriptor).
[column['fieldName'] for column in metadata['columns']]

['unique_key',
 'created_date',
 'closed_date',
 'agency',
 'agency_name',
 'complaint_type',
 'descriptor',
 'location_type',
 'incident_zip',
 'incident_address',
 'street_name',
 'cross_street_1',
 'cross_street_2',
 'intersection_street_1',
 'intersection_street_2',
 'address_type',
 'city',
 'landmark',
 'facility_type',
 'status',
 'due_date',
 'resolution_description',
 'resolution_action_updated_date',
 'community_board',
 'bbl',
 'borough',
 'x_coordinate_state_plane',
 'y_coordinate_state_plane',
 'open_data_channel_type',
 'park_facility_name',
 'park_borough',
 'vehicle_type',
 'taxi_company_borough',
 'taxi_pick_up_location',
 'bridge_highway_name',
 'bridge_highway_direction',
 'road_ramp',
 'bridge_highway_segment',
 'latitude',
 'longitude',
 'location',
 ':@computed_region_efsh_h5xi',
 ':@computed_region_f5dn_yrer',
 ':@computed_region_yeji_bk3q',
 ':@computed_region_92fq_4b7q',
 ':@computed_region_sbqj_enih']

In [57]:
# Re-create the client with a long timeout for the aggregate query below.
# socrata_token comes from SODAPY_APPTOKEN and may be None; an unauthenticated
# client only works with public data sets.
client = Socrata(socrata_domain, socrata_token, timeout=1000)

# Count records per complaint type, most frequent first. The LIMIT is set far
# above the number of groups returned (499 when this cell was last run).
query = """
SELECT 
    complaint_type, count(complaint_type)
GROUP BY 
    complaint_type
ORDER BY 
    count(complaint_type) DESC
LIMIT 
    1000000
"""

# Returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
results = client.get(socrata_dataset_identifier, query=query)

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)

print(results_df.shape)
results_df.head(10)

(499, 2)


Unnamed: 0,complaint_type,count_complaint_type
0,Noise - Residential,2225885
1,HEAT/HOT WATER,1407396
2,Illegal Parking,1117015
3,Blocked Driveway,1041075
4,Street Condition,1018764
5,Street Light Condition,981297
6,HEATING,887869
7,PLUMBING,744723
8,Water System,698995
9,Noise - Street/Sidewalk,680970


In [58]:
# (rows, columns) of the complaint-type counts frame built in the previous cell.
results_df.shape

(499, 2)

In [59]:
# Re-create the client with a long timeout for the aggregate query below.
# socrata_token comes from SODAPY_APPTOKEN and may be None; an unauthenticated
# client only works with public data sets.
client = Socrata(socrata_domain, socrata_token, timeout=1000)

# Count records per descriptor for every descriptor containing "flood"
# (case-insensitive through LOWER). Note that the pattern also matches the
# street-light "Flood Light ..." descriptors, not only flooding complaints.
query = """
SELECT 
    descriptor, count(descriptor)
WHERE 
    LOWER(descriptor) LIKE '%flood%'
GROUP BY 
    descriptor
ORDER BY 
    count(descriptor) DESC
LIMIT 
    1000000
"""

# Returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
results = client.get(socrata_dataset_identifier, query=query)

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)

print(results_df.shape)
results_df

(13, 2)


Unnamed: 0,descriptor,count_descriptor
0,Catch Basin Clogged/Flooding (Use Comments) (SC),90647
1,Street Flooding (SJ),27906
2,Flood Light Lamp Out,5954
3,Flooding on Street,3175
4,Highway Flooding (SH),2839
5,Flood Light Lamp Cycling,2510
6,Ready NY - Flooding,271
7,Flood Light Lamp Dayburning,205
8,Flood Light Lamp Missing,190
9,Flood Light Lamp Dim,177


In [60]:
# Re-create the client with a long timeout; this query downloads full records
# rather than aggregates. socrata_token comes from SODAPY_APPTOKEN and may be
# None; an unauthenticated client only works with public data sets.
client = Socrata(socrata_domain, socrata_token, timeout=1000)

# Fetch every column of every record whose descriptor contains "flood"
# (case-insensitive through LOWER). The LIMIT is set above the number of
# matching rows (134083 when this cell was last run).
query = """
SELECT 
    *
WHERE 
    LOWER(descriptor) LIKE '%flood%'
LIMIT 
    1000000
"""

# Returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
results = client.get(socrata_dataset_identifier, query=query)

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)

print(results_df.shape)
results_df.head()

(134083, 38)


Unnamed: 0,unique_key,created_date,closed_date,agency,agency_name,complaint_type,descriptor,incident_zip,incident_address,street_name,...,intersection_street_1,intersection_street_2,bbl,location_type,due_date,landmark,bridge_highway_name,bridge_highway_direction,road_ramp,bridge_highway_segment
0,24860647,2013-01-25T08:42:00.000,2013-01-28T09:20:00.000,DEP,Department of Environmental Protection,Sewer,Catch Basin Clogged/Flooding (Use Comments) (SC),11694,BEACH 128 STREET,BEACH 128 STREET,...,,,,,,,,,,
1,24883645,2013-01-28T15:42:00.000,2013-01-30T09:00:00.000,DEP,Department of Environmental Protection,Sewer,Catch Basin Clogged/Flooding (Use Comments) (SC),11201,FLATBUSH AVENUE EXTENSION,FLATBUSH AVENUE EXTENSION,...,,,,,,,,,,
2,24892185,2013-01-29T09:45:00.000,2013-03-11T09:20:00.000,DOT,Department of Transportation,Street Light Condition,Flood Light Lamp Cycling,11211,,,...,GRAND STREET,VANDERVOORT AVENUE,,,,,,,,
3,24897032,2013-01-29T09:19:00.000,2013-01-29T14:38:00.000,DOT,Department of Transportation,Street Light Condition,Flood Light Lamp Cycling,11215,1108 8 AVE,8 AVE,...,8 AVENUE,12 STREET,,,,,,,,
4,24912103,2013-01-31T16:20:00.000,2013-02-01T11:27:00.000,DOT,Department of Transportation,Street Light Condition,Flood Light Lamp Cycling,11213,176 UTICA AVE,UTICA AVE,...,UTICA AVENUE,PARK PLACE,,,,,,,,


In [61]:
# Number of downloaded records per descriptor.
results_df.descriptor.value_counts()

Catch Basin Clogged/Flooding (Use Comments) (SC)    90647
Street Flooding (SJ)                                27906
Flood Light Lamp Out                                 5954
Flooding on Street                                   3175
Highway Flooding (SH)                                2839
Flood Light Lamp Cycling                             2510
Ready NY - Flooding                                   271
Flood Light Lamp Dayburning                           205
Flood Light Lamp Missing                              190
Flood Light Lamp Dim                                  177
RAIN GARDEN FLOODING (SRGFLD)                          85
Flooding on Highway                                    76
Flooded                                                48
Name: descriptor, dtype: int64

In [62]:
# Keep only the rows whose descriptor is exactly 'Street Flooding (SJ)'.
is_street_flooding = results_df['descriptor'] == 'Street Flooding (SJ)'
results_df = results_df.loc[is_street_flooding]

In [63]:
# Confirm that the filter left a single descriptor.
results_df.descriptor.value_counts()

Street Flooding (SJ)    27906
Name: descriptor, dtype: int64

In [64]:
# Check which complaint types the remaining rows belong to.
results_df.complaint_type.value_counts()

Sewer    27906
Name: complaint_type, dtype: int64

In [65]:
# (rows, columns) of the frame after keeping only 'Street Flooding (SJ)' rows.
results_df.shape

(27906, 38)

In [66]:
# Preview the first rows of the filtered frame.
results_df.head()

Unnamed: 0,unique_key,created_date,closed_date,agency,agency_name,complaint_type,descriptor,incident_zip,incident_address,street_name,...,intersection_street_1,intersection_street_2,bbl,location_type,due_date,landmark,bridge_highway_name,bridge_highway_direction,road_ramp,bridge_highway_segment
38,31862075,2015-10-28T17:44:00.000,2015-10-29T11:30:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),11694,BEACH 101 STREET,BEACH 101 STREET,...,,,,,,,,,,
48,25163797,2013-03-12T12:38:00.000,2013-03-12T14:20:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),11236,CHURCH LANE,CHURCH LANE,...,,,,,,,,,,
53,25185984,2013-03-15T12:13:00.000,2013-03-18T14:15:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),11236,CHURCH LANE,CHURCH LANE,...,,,,,,,,,,
54,25187738,2013-03-15T12:20:00.000,2013-03-15T14:22:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),11236,CHURCH LANE,CHURCH LANE,...,,,,,,,,,,
107,25338491,2013-04-11T12:16:00.000,2013-04-11T12:16:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),11236,CHURCH LANE,CHURCH LANE,...,,,,,,,,,,


# Removing records created after 2020

In [67]:
# Row count plus the earliest and latest creation timestamps before filtering.
created_dates = results_df.created_date
print('Number of total records:', len(results_df))
print('Min:', created_dates.min())
print('Max:', created_dates.max())

Number of total records: 27906
Min: 2010-01-02T08:26:00.000
Max: 2021-01-08T18:50:00.000


In [68]:
# Preview the most recent rows that pass the pre-2021 cutoff. created_date
# holds ISO-8601 strings, so the comparison with '2021' is lexicographic.
created_before_2021 = results_df.created_date < '2021'
(
    results_df.loc[created_before_2021]
    .sort_values(by='created_date', ascending=False)
    .head()
)

Unnamed: 0,unique_key,created_date,closed_date,agency,agency_name,complaint_type,descriptor,incident_zip,incident_address,street_name,...,intersection_street_1,intersection_street_2,bbl,location_type,due_date,landmark,bridge_highway_name,bridge_highway_direction,road_ramp,bridge_highway_segment
119612,48542220,2020-12-31T15:41:00.000,2021-01-01T00:20:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),11420,117-17 135 STREET,135 STREET,...,,,4116990057.0,,,,,,,
119567,48536430,2020-12-31T14:49:00.000,2021-01-04T10:15:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),11357,20-24 150 STREET,150 STREET,...,,,4046700029.0,,,,,,,
119584,48539361,2020-12-31T14:03:00.000,2021-01-02T11:25:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),11228,7223 8 AVENUE,8 AVENUE,...,,,3059120001.0,,,,,,,
119629,48543132,2020-12-31T13:48:00.000,2020-12-31T14:50:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),10032,,,...,RIVERSIDE DRIVE,WEST 165 STREET,,,,,,,,
119568,48536441,2020-12-31T13:10:00.000,2021-01-03T10:45:00.000,DEP,Department of Environmental Protection,Sewer,Street Flooding (SJ),11234,3123 FILLMORE AVENUE,FILLMORE AVENUE,...,,,3084750001.0,,,,,,,


In [69]:
# Drop every record created in 2021 or later (string comparison on ISO-8601
# timestamps), then re-check the row count and the date range.
results_df = results_df.loc[results_df.created_date < '2021']

created_dates = results_df.created_date
print(len(results_df))
print('Min:', created_dates.min())
print('Max:', created_dates.max())

27868
min: 2010-01-02T08:26:00.000
max: 2020-12-31T15:41:00.000


In [70]:
# writing output file as a csv
results_df.to_csv('../data-raw/raw-street-flooding-data.csv', index=False)

# listing items in data folder
%ls ../data-raw/

raw-street-flooding-data.csv  raw-streets-clipped.json


# Briefly reviewing which descriptors fall under complaint_type='Sewer'

In [71]:
# Re-create the client with a long timeout for the aggregate query below.
# socrata_token comes from SODAPY_APPTOKEN and may be None; an unauthenticated
# client only works with public data sets.
client = Socrata(socrata_domain, socrata_token, timeout=1000)

# Example authenticated client (needed for non-public datasets):
# client = Socrata("data.cityofnewyork.us",
#                  MyAppToken,
#                  username="user@example.com",
#                  password="AFakePassword")

# Count records per descriptor within complaint_type='Sewer', most frequent
# first.
# NOTE(review): unlike the other queries in this notebook there is no LIMIT
# clause here; the result is small (27 rows when last run), but confirm the
# API's default row cap stays above the number of Sewer descriptors.
query = """
SELECT 
    descriptor, count(descriptor)
WHERE 
    complaint_type='Sewer'
GROUP BY 
    descriptor
ORDER BY 
    count(descriptor) DESC
"""

# Returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
results = client.get(socrata_dataset_identifier, query=query)

# Convert to pandas DataFrame.
# NOTE: this rebinds results_df, replacing the filtered street-flooding frame
# that was written to CSV earlier in the notebook.
results_df = pd.DataFrame.from_records(results)

print(results_df.shape)
# Display the whole (small) frame.
results_df

(27, 2)


Unnamed: 0,descriptor,count_descriptor
0,Sewer Backup (Use Comments) (SA),149675
1,Catch Basin Clogged/Flooding (Use Comments) (SC),90647
2,Catch Basin Sunken/Damaged/Raised (SC1),28719
3,Street Flooding (SJ),27906
4,Manhole Cover Broken/Making Noise (SB),19777
5,Manhole Cover Missing (Emergency) (SA3),17566
6,Sewer Odor (SA2),15453
7,Defective/Missing Curb Piece (SC4),8517
8,Manhole Overflow (Use Comments) (SA1),6879
9,Catch Basin Search (SC2),4153
