# Text Flood
Author: Mark Bauer

Goal: Analyze occurrences of the character string *flood* in every dataset on NYC Open Data.

# Importing Libraries

In [1]:
# importing libraries
import pandas as pd
import numpy as np
from sodapy import Socrata
import requests
import time
import os
import glob

Documentation for installing watermark: https://github.com/rasbt/watermark.

In [2]:
# performed for reproducibility: record the environment this notebook ran in
# (-t time, -d date, -v Python and IPython versions, -p versions of the listed packages)
%reload_ext watermark
%watermark -t -d -v -p pandas,sodapy

Python implementation: CPython
Python version       : 3.11.0
IPython version      : 8.6.0

pandas: 1.5.1
sodapy: 2.2.0



# Socrata API
I used the Socrata API to retrieve metadata for datasets hosted on NYC Open Data. Documentation can be found here: https://dev.socrata.com/. Additionally, I used sodapy, the Python client for the Socrata API, to query the metadata.

We'll use this API to gather metadata for all the datasets on NYC Open Data.

### Note:  
`WARNING:root:Requests made without an app_token will be subject to strict throttling limits.`

Read more from the SODA documentation here: https://dev.socrata.com/docs/app-tokens.html

In [3]:
# source domain for NYC Open Data on Socrata
socrata_domain = 'data.cityofnewyork.us'

# Optional app token, read from the environment so that no credential is ever
# hard-coded in the notebook. When SOCRATA_APP_TOKEN is not set this is None,
# which is exactly the previous behavior (unauthenticated requests that are
# subject to strict throttling limits).
socrata_app_token = os.environ.get('SOCRATA_APP_TOKEN')

# initialize Socrata object to fetch data
client = Socrata(
    domain=socrata_domain,
    app_token=socrata_app_token,
    timeout=1000  # generous request timeout handed to sodapy
)

print(client)



<sodapy.socrata.Socrata object at 0x1067cb590>


In [4]:
# Discovery API: catalog endpoint scoped to the NYC Open Data domain
url = 'https://api.us.socrata.com/api/catalog/v1?search_context=data.cityofnewyork.us&limit=50000'

# fetch the JSON data from the web; the timeout bounds the wait so a stalled
# connection cannot hang the notebook indefinitely
response = requests.get(url, timeout=60)

# fail loudly on an HTTP error status instead of parsing an error payload
response.raise_for_status()

# parse the JSON response
data_dict = response.json()

# preview keys
data_dict.keys()



In [5]:
# one dataframe row per catalog entry returned by the Discovery API
catalog_results = data_dict['results']
metadata_df = pd.DataFrame.from_records(catalog_results)

# sanity check: dimensions first, then a preview of the leading rows
print(metadata_df.shape)
metadata_df.head(5)

(3245, 8)


Unnamed: 0,resource,classification,metadata,permalink,link,owner,creator,preview_image_url
0,"{'name': 'For Hire Vehicles (FHV) - Active', '...","{'categories': [], 'tags': [], 'domain_categor...",{'domain': 'data.cityofnewyork.us'},https://data.cityofnewyork.us/d/8wbx-tsch,https://data.cityofnewyork.us/Transportation/F...,"{'id': '5fuc-pqz2', 'user_type': 'interactive'...","{'id': '5fuc-pqz2', 'user_type': 'interactive'...",
1,"{'name': 'Civil Service List (Active)', 'id': ...","{'categories': [], 'tags': [], 'domain_categor...",{'domain': 'data.cityofnewyork.us'},https://data.cityofnewyork.us/d/vx8i-nprf,https://data.cityofnewyork.us/City-Government/...,"{'id': '5fuc-pqz2', 'user_type': 'interactive'...","{'id': '5fuc-pqz2', 'user_type': 'interactive'...",
2,"{'name': 'DOB Job Application Filings', 'id': ...","{'categories': [], 'tags': [], 'domain_categor...",{'domain': 'data.cityofnewyork.us'},https://data.cityofnewyork.us/d/ic3t-wcy2,https://data.cityofnewyork.us/Housing-Developm...,"{'id': '5fuc-pqz2', 'user_type': 'interactive'...","{'id': '5fuc-pqz2', 'user_type': 'interactive'...",
3,"{'name': 'TLC New Driver Application Status', ...","{'categories': [], 'tags': [], 'domain_categor...",{'domain': 'data.cityofnewyork.us'},https://data.cityofnewyork.us/d/dpec-ucu7,https://data.cityofnewyork.us/Transportation/T...,"{'id': '5fuc-pqz2', 'user_type': 'interactive'...","{'id': '5fuc-pqz2', 'user_type': 'interactive'...",
4,{'name': 'For Hire Vehicles (FHV) - Active Dri...,"{'categories': [], 'tags': [], 'domain_categor...",{'domain': 'data.cityofnewyork.us'},https://data.cityofnewyork.us/d/xjfq-wh2d,https://data.cityofnewyork.us/Transportation/F...,"{'id': '5fuc-pqz2', 'user_type': 'interactive'...","{'id': '5fuc-pqz2', 'user_type': 'interactive'...",


In [6]:
# Expand the nested 'resource' record of every catalog entry into its own
# columns. The frame is built from data_dict rather than from metadata_df
# itself so that this cell is idempotent: deriving it from
# metadata_df['resource'] replaces metadata_df with a frame that no longer has
# a 'resource' column, so running the cell a second time raises KeyError.
metadata_df = pd.DataFrame.from_records(
    [entry['resource'] for entry in data_dict['results']]
)

# sanity check
print(metadata_df.shape)
metadata_df.head()

(3245, 27)


Unnamed: 0,name,id,resource_name,parent_fxf,description,attribution,attribution_link,contact_email,type,updatedAt,...,columns_description,columns_format,download_count,provenance,lens_view_type,lens_display_type,locked,blob_mime_type,hide_from_data_json,publication_date
0,For Hire Vehicles (FHV) - Active,8wbx-tsch,,[],"<b>PLEASE NOTE:</b> This dataset, which includ...",Taxi and Limousine Commission (TLC),,,dataset,2024-12-31T19:59:30.000Z,...,"[DMV License Plate Number, Permit License Numb...","[{'displayStyle': 'plain', 'align': 'left'}, {...",536011,official,tabular,table,False,,False,2021-04-05T13:20:47.000Z
1,Civil Service List (Active),vx8i-nprf,,[],A Civil Service List consists of all candidate...,Department of Citywide Administrative Services...,,,dataset,2025-01-01T14:16:19.000Z,...,[Represents the certification order for a list...,"[{'displayStyle': 'plain', 'align': 'left'}, {...",68956,official,tabular,table,False,,False,2024-01-12T16:15:05.000Z
2,DOB Job Application Filings,ic3t-wcy2,,[],This dataset contains all job applications sub...,Department of Buildings (DOB),,,dataset,2024-12-31T21:13:45.000Z,...,[Job is Professionally Certified by Licensed P...,"[{'align': 'right'}, {'align': 'right'}, {'ali...",59793,official,tabular,table,False,,False,2020-06-22T18:23:35.000Z
3,TLC New Driver Application Status,dpec-ucu7,,[],THIS DATASET IS UPDATED SEVERAL TIMES PER DAY....,Taxi and Limousine Commission (TLC),,,dataset,2025-01-01T11:06:04.000Z,...,"[The date you submitted your application., A N...","[{'view': 'date', 'align': 'left'}, {'displayS...",39678,official,tabular,table,False,,False,2019-12-17T18:44:57.000Z
4,For Hire Vehicles (FHV) - Active Drivers,xjfq-wh2d,,[],"<b>PLEASE NOTE:</b> This dataset, which includ...",Taxi and Limousine Commission (TLC),,,dataset,2024-12-31T20:00:56.000Z,...,"[Last Time Updated, Driver Name\n\n, Expiratio...","[{'displayStyle': 'plain', 'align': 'left'}, {...",422194,official,tabular,table,False,,False,2024-01-11T19:58:17.000Z


In [7]:
# metadata fields kept for the analysis, in display order
cols = [
    # identity and ownership
    'name',
    'id',
    'description',
    'attribution',
    # timestamps
    'createdAt',
    'publication_date',
    'updatedAt',
    'metadata_updated_at',
    'data_updated_at',
    # usage
    'download_count',
    'page_views',
    # per-column metadata
    'columns_name',
    'columns_field_name',
    'columns_datatype',
    'columns_description',
    'columns_format',
]

# keep only the selected fields (raises KeyError if any of them is missing)
metadata_df = metadata_df[cols]

metadata_df.head(5)

Unnamed: 0,name,id,description,attribution,createdAt,publication_date,updatedAt,metadata_updated_at,data_updated_at,download_count,page_views,columns_name,columns_field_name,columns_datatype,columns_description,columns_format
0,For Hire Vehicles (FHV) - Active,8wbx-tsch,"<b>PLEASE NOTE:</b> This dataset, which includ...",Taxi and Limousine Commission (TLC),2015-07-16T17:33:32.000Z,2021-04-05T13:20:47.000Z,2024-12-31T19:59:30.000Z,2022-09-06T21:05:32.000Z,2024-12-31T19:59:30.000Z,536011,"{'page_views_last_week': 5937, 'page_views_las...","[DMV License Plate Number, Permit License Numb...","[dmv_license_plate_number, permit_license_numb...","[Text, Text, Calendar date, Calendar date, Tex...","[DMV License Plate Number, Permit License Numb...","[{'displayStyle': 'plain', 'align': 'left'}, {..."
1,Civil Service List (Active),vx8i-nprf,A Civil Service List consists of all candidate...,Department of Citywide Administrative Services...,2016-06-14T21:12:15.000Z,2024-01-12T16:15:05.000Z,2025-01-01T14:16:19.000Z,2024-05-14T14:26:52.000Z,2025-01-01T14:16:19.000Z,68956,"{'page_views_last_week': 5129, 'page_views_las...","[Group No, First Name, List Agency Desc, List ...","[group_no, first_name, list_agency_desc, list_...","[Text, Text, Text, Text, Text, Text, Number, T...",[Represents the certification order for a list...,"[{'displayStyle': 'plain', 'align': 'left'}, {..."
2,DOB Job Application Filings,ic3t-wcy2,This dataset contains all job applications sub...,Department of Buildings (DOB),2013-04-18T15:18:56.000Z,2020-06-22T18:23:35.000Z,2024-12-31T21:13:45.000Z,2020-06-23T02:08:44.000Z,2024-12-31T21:13:45.000Z,59793,"{'page_views_last_week': 283, 'page_views_last...","[Professional Cert, BUILDING_CLASS, Horizontal...","[professional_cert, building_class, horizontal...","[Text, Text, Text, Text, Text, Text, Text, Tex...",[Job is Professionally Certified by Licensed P...,"[{'align': 'right'}, {'align': 'right'}, {'ali..."
3,TLC New Driver Application Status,dpec-ucu7,THIS DATASET IS UPDATED SEVERAL TIMES PER DAY....,Taxi and Limousine Commission (TLC),2016-05-17T18:43:43.000Z,2019-12-17T18:44:57.000Z,2025-01-01T11:06:04.000Z,2022-05-09T22:28:03.000Z,2025-01-01T11:06:04.000Z,39678,"{'page_views_last_week': 97, 'page_views_last_...","[App Date, Defensive Driving, Driver Exam, App...","[app_date, defensive_driving, driver_exam, app...","[Calendar date, Text, Text, Number, Text, Cale...","[The date you submitted your application., A N...","[{'view': 'date', 'align': 'left'}, {'displayS..."
4,For Hire Vehicles (FHV) - Active Drivers,xjfq-wh2d,"<b>PLEASE NOTE:</b> This dataset, which includ...",Taxi and Limousine Commission (TLC),2015-07-16T17:24:02.000Z,2024-01-11T19:58:17.000Z,2024-12-31T20:00:56.000Z,2024-01-11T19:58:17.000Z,2024-12-31T20:00:56.000Z,422194,"{'page_views_last_week': 2187, 'page_views_las...","[Last Time Updated, Name, Expiration Date, Las...","[last_time_updated, name, expiration_date, las...","[Text, Text, Calendar date, Calendar date, Num...","[Last Time Updated, Driver Name\n\n, Expiratio...","[{'displayStyle': 'plain', 'align': 'left'}, {..."


In [8]:
# preview columns: prints each column's name, non-null count and dtype
metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3245 entries, 0 to 3244
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   name                 3245 non-null   object
 1   id                   3245 non-null   object
 2   description          3245 non-null   object
 3   attribution          3174 non-null   object
 4   createdAt            3245 non-null   object
 5   publication_date     3220 non-null   object
 6   updatedAt            3245 non-null   object
 7   metadata_updated_at  3245 non-null   object
 8   data_updated_at      3068 non-null   object
 9   download_count       3245 non-null   int64 
 10  page_views           3245 non-null   object
 11  columns_name         3245 non-null   object
 12  columns_field_name   3245 non-null   object
 13  columns_datatype     3245 non-null   object
 14  columns_description  3245 non-null   object
 15  columns_format       3245 non-null   object
dtypes: int

# Read in the results of our character-string match for *flood*

In [9]:
path = "../results"

# The merged output of this notebook (results-total.csv) is saved into this
# same directory further down, so it has to be excluded here. Otherwise a
# second run would read its own output back in and double-count every match.
merged_output_name = "results-total.csv"

# sorted() makes the concatenation order independent of the arbitrary order in
# which glob lists the directory
all_files = sorted(
    f for f in glob.glob(os.path.join(path, "*.csv"))
    if os.path.basename(f) != merged_output_name
)

# pd.concat on an empty sequence raises an opaque "No objects to concatenate";
# name the actual problem instead
if not all_files:
    raise FileNotFoundError(f"No per-dataset result CSVs found in {path!r}")

# stack the per-dataset match files into one frame with a fresh 0..n-1 index
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

print(df.shape)
df.head(10)

(19042, 5)


Unnamed: 0,value,count,timestamp,column,id
0,Flood,525,2024-12-29 19:14:56,last_name,5gq7-rgmv
1,Flooderman,1,2024-12-29 19:14:56,last_name,5gq7-rgmv
2,Floods,1,2024-12-29 19:14:56,last_name,5gq7-rgmv
3,"FLOOD,NICOLE,R",18,2024-12-29 19:20:49,name,c4dx-tk4d
4,Flood,719,2024-12-29 19:25:49,last_name,j62e-7maa
5,Floods,3,2024-12-29 19:25:49,last_name,j62e-7maa
6,Floodberg,1,2024-12-29 19:25:49,last_name,j62e-7maa
7,Floodirst,1,2024-12-29 19:25:49,last_name,j62e-7maa
8,Floodstrand,1,2024-12-29 19:25:49,last_name,j62e-7maa
9,Floody,1,2024-12-29 19:25:49,last_name,j62e-7maa


In [10]:
# Attach dataset-level metadata to every matched value.
# how='left' keeps every match row even when its dataset id is absent from the
# catalog metadata. validate='many_to_one' makes pandas raise MergeError if an
# id occurs more than once in metadata_df, which would otherwise silently
# duplicate match rows and inflate the counts aggregated below.
df = df.merge(
    metadata_df,
    on='id',
    how='left',
    validate='many_to_one'
)

# sanity check: the row count should be unchanged by the left join
print(df.shape)
df.head()

(19042, 20)


Unnamed: 0,value,count,timestamp,column,id,name,description,attribution,createdAt,publication_date,updatedAt,metadata_updated_at,data_updated_at,download_count,page_views,columns_name,columns_field_name,columns_datatype,columns_description,columns_format
0,Flood,525,2024-12-29 19:14:56,last_name,5gq7-rgmv,NYC Historical Vital Records: Index to Digitiz...,The dataset is an index to digitized historica...,Department of Records and Information Services...,2022-05-17T19:35:24.000Z,2023-07-19T15:32:38.000Z,2024-10-11T20:08:05.000Z,2024-10-11T20:06:00.000Z,2024-10-11T20:08:05.000Z,463,"{'page_views_last_week': 54, 'page_views_last_...","[Last Name, Certificate Month, First Name, Cer...","[last_name, month, first_name, year, soundex, ...","[Text, Text, Text, Text, Text, Text, Text, Text]","[Last name on the certificate, Month of Birth...","[{}, {}, {}, {}, {}, {}, {}, {}]"
1,Flooderman,1,2024-12-29 19:14:56,last_name,5gq7-rgmv,NYC Historical Vital Records: Index to Digitiz...,The dataset is an index to digitized historica...,Department of Records and Information Services...,2022-05-17T19:35:24.000Z,2023-07-19T15:32:38.000Z,2024-10-11T20:08:05.000Z,2024-10-11T20:06:00.000Z,2024-10-11T20:08:05.000Z,463,"{'page_views_last_week': 54, 'page_views_last_...","[Last Name, Certificate Month, First Name, Cer...","[last_name, month, first_name, year, soundex, ...","[Text, Text, Text, Text, Text, Text, Text, Text]","[Last name on the certificate, Month of Birth...","[{}, {}, {}, {}, {}, {}, {}, {}]"
2,Floods,1,2024-12-29 19:14:56,last_name,5gq7-rgmv,NYC Historical Vital Records: Index to Digitiz...,The dataset is an index to digitized historica...,Department of Records and Information Services...,2022-05-17T19:35:24.000Z,2023-07-19T15:32:38.000Z,2024-10-11T20:08:05.000Z,2024-10-11T20:06:00.000Z,2024-10-11T20:08:05.000Z,463,"{'page_views_last_week': 54, 'page_views_last_...","[Last Name, Certificate Month, First Name, Cer...","[last_name, month, first_name, year, soundex, ...","[Text, Text, Text, Text, Text, Text, Text, Text]","[Last name on the certificate, Month of Birth...","[{}, {}, {}, {}, {}, {}, {}, {}]"
3,"FLOOD,NICOLE,R",18,2024-12-29 19:20:49,name,c4dx-tk4d,Historical Medallion Drivers – Passenger Assis...,This list contains information on the status o...,Taxi and Limousine Commission (TLC),2018-01-25T17:39:53.000Z,2018-01-29T22:12:30.000Z,2022-05-09T22:28:13.000Z,2022-05-09T22:28:13.000Z,2018-01-29T21:58:57.000Z,575,"{'page_views_last_week': 3, 'page_views_last_m...","[License Number, Completed Both Training, Type...","[license_number, completed_both, type, last_ti...","[Number, Text, Text, Text, Text, Calendar date...","[, , , , , , ]","[{'precisionStyle': 'standard', 'noCommas': 't..."
4,Flood,719,2024-12-29 19:25:49,last_name,j62e-7maa,NYC Historical Vital Records: Index to Digitiz...,The dataset is an index to digitized historica...,Department of Records and Information Services...,2022-05-17T18:47:23.000Z,2023-07-19T15:29:27.000Z,2024-10-11T20:13:42.000Z,2024-10-11T20:08:22.000Z,2024-10-11T20:13:42.000Z,633,"{'page_views_last_week': 45, 'page_views_last_...","[County, First Name, Soundex, Last Name, Certi...","[county, first_name, soundex, last_name, year,...","[Text, Text, Text, Text, Text, Text, Text, Text]",[Borough in which the certificate was issued. ...,"[{}, {}, {}, {}, {}, {}, {}, {}]"


In [11]:
# save results of full file (the matches joined with their dataset metadata);
# index=False leaves the dataframe index out of the CSV.
# NOTE: this writes into ../results, the same directory the per-dataset result
# CSVs are globbed from when the results are read in.
df.to_csv('../results/results-total.csv', index=False)

In [12]:
# total matched-value count per agency (attribution), largest first, top 60
agency_totals = df.groupby('attribution')['count'].sum()
agency_totals.sort_values(ascending=False).iloc[:60].reset_index()

Unnamed: 0,attribution,count
0,311,357706
1,Department of Transportation (DOT),233939
2,Department of Housing Preservation & Developme...,37243
3,Department of Parks and Recreation (DPR),15582
4,Department of Buildings (DOB),9263
5,Department of City Planning (DCP),6006
6,NYC Emergency Management (NYCEM),4609
7,NYC Department of Transportation,3220
8,New York City Office of Management and Budget,2542
9,Department of Environmental Protection (DEP),2532


In [13]:
# agencies ranked after position 60, i.e. the tail of the per-agency totals
agency_ranking = (
    df.groupby('attribution')['count']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
agency_ranking.iloc[60:]

Unnamed: 0,attribution,count
60,Office of the Staten Island Borough President,1
61,Mayor's Office of Management & Budget (OMB),1
62,NYC Service,1
63,Brooklyn Borough President,1


In [14]:
# total matched-value count per dataset name, largest first, top 60
dataset_totals = df.groupby('name')['count'].sum()
dataset_totals.sort_values(ascending=False).iloc[:60].reset_index()

Unnamed: 0,name,count
0,311 Service Requests from 2010 to Present,179486
1,Pedestrian Mobility Plan Pedestrian Demand,121572
2,311 Web Content - Services,67528
3,Street Construction Permits - Stipulations (Hi...,52696
4,Housing Maintenance Code Complaints and Problems,37243
5,Bicycle Parking,34281
6,311 Service Requests for 2007,22238
7,311 Service Requests for 2006,19968
8,311 Service Requests for 2008,19168
9,311 Service Requests for 2005,18203


In [15]:
# total matched-value count per (agency, dataset) pair, largest first, top 60
agency_dataset_keys = ['attribution', 'name']
agency_dataset_totals = df.groupby(agency_dataset_keys)['count'].sum()
agency_dataset_totals.sort_values(ascending=False).iloc[:60].reset_index()

Unnamed: 0,attribution,name,count
0,311,311 Service Requests from 2010 to Present,179486
1,Department of Transportation (DOT),Pedestrian Mobility Plan Pedestrian Demand,121572
2,311,311 Web Content - Services,67528
3,Department of Transportation (DOT),Street Construction Permits - Stipulations (Hi...,52696
4,Department of Housing Preservation & Developme...,Housing Maintenance Code Complaints and Problems,37243
5,Department of Transportation (DOT),Bicycle Parking,34281
6,311,311 Service Requests for 2007,22238
7,311,311 Service Requests for 2006,19968
8,311,311 Service Requests for 2008,19168
9,311,311 Service Requests for 2005,18203


In [16]:
# total matched-value count per (agency, dataset, column), largest first, top 60
column_level_keys = ['attribution', 'name', 'column']
column_level_totals = df.groupby(column_level_keys)['count'].sum()
column_level_totals.sort_values(ascending=False).iloc[:60].reset_index()

Unnamed: 0,attribution,name,column,count
0,311,311 Service Requests from 2010 to Present,descriptor,174252
1,Department of Transportation (DOT),Pedestrian Mobility Plan Pedestrian Demand,FEMAFldT,121572
2,Department of Housing Preservation & Developme...,Housing Maintenance Code Complaints and Problems,problem_code,37237
3,311,311 Web Content - Services,web_keywords,35090
4,Department of Transportation (DOT),Bicycle Parking,FEMAFldT,34281
5,Department of Transportation (DOT),Street Construction Permits - Stipulations (Hi...,stipulationid,26348
6,Department of Transportation (DOT),Street Construction Permits - Stipulations (Hi...,stipulationfulltext,26348
7,311,311 Service Requests for 2007,descriptor,22238
8,311,311 Service Requests for 2006,descriptor,19968
9,311,311 Service Requests for 2008,descriptor,19168


In [17]:
# total count per (agency, dataset, column, matched value), largest first, top 60
value_level_keys = ['attribution', 'name', 'column', 'value']
value_level_totals = df.groupby(value_level_keys)['count'].sum()
value_level_totals.sort_values(ascending=False).iloc[:60].reset_index()

Unnamed: 0,attribution,name,column,value,count
0,311,311 Service Requests from 2010 to Present,descriptor,Catch Basin Clogged/Flooding (Use Comments) (SC),118865
1,Department of Transportation (DOT),Pedestrian Mobility Plan Pedestrian Demand,FEMAFldT,AREA OF MINIMAL FLOOD HAZARD,111949
2,311,311 Service Requests from 2010 to Present,descriptor,Street Flooding (SJ),41154
3,Department of Housing Preservation & Developme...,Housing Maintenance Code Complaints and Problems,problem_code,FIXTURE WET OR FLOODED,37237
4,Department of Transportation (DOT),Bicycle Parking,FEMAFldT,AREA OF MINIMAL FLOOD HAZARD,33396
5,Department of Transportation (DOT),Street Construction Permits - Stipulations (Hi...,stipulationid,FLOODA,26348
6,Department of Transportation (DOT),Street Construction Permits - Stipulations (Hi...,stipulationfulltext,NYC OEM EVACUATION FLOOD ZONE A,26348
7,311,311 Service Requests for 2007,descriptor,Catch Basin Clogged/Flooding (Use Comments) (SC),17628
8,311,311 Service Requests for 2006,descriptor,Catch Basin Clogged/Flooding (Use Comments) (SC),16713
9,311,311 Service Requests for 2005,descriptor,Catch Basin Clogged/Flooding (Use Comments) (SC),14980
