# Dataset Analysis
Author: Mark Bauer

Goal: Analyze matches of the character string *flood* across every dataset on NYC Open Data. The original process was performed in two steps. This workflow concatenates both results into one file.

# Importing Libraries

In [20]:
# importing libraries
import pandas as pd
import numpy as np
import requests

Documentation for installing watermark: https://github.com/rasbt/watermark.

In [21]:
# performed for reproducibility: print the date/time, Python/IPython versions and package versions
# NOTE(review): sodapy is reported here, but the import cell in this notebook loads
# pandas, numpy and requests -- confirm the package list matches what is actually used
%reload_ext watermark
%watermark -t -d -v -p pandas,sodapy

Python implementation: CPython
Python version       : 3.11.0
IPython version      : 8.6.0

pandas: 1.5.1
sodapy: 2.2.0



# Socrata API and Dataset Metadata
I used the Socrata API to retrieve metadata for datasets hosted on NYC Open Data. Documentation can be found here: https://dev.socrata.com/.

In [22]:
# Discovery API endpoint, scoped to the data.cityofnewyork.us domain
url = 'https://api.us.socrata.com/api/catalog/v1?search_context=data.cityofnewyork.us&limit=50000'

# fetch the JSON data from the web; the timeout keeps the cell from hanging
# indefinitely if the server stops responding
response = requests.get(url, timeout=60)

# fail loudly on an HTTP error instead of parsing an error payload as catalog data
response.raise_for_status()

# parse the JSON response
data_dict = response.json()

# preview keys
data_dict.keys()



In [23]:
# one dataframe row per entry stored under the 'results' key
metadata_df = pd.DataFrame.from_records(data=data_dict['results'])

# sanity check: dimensions, then the first rows
print(metadata_df.shape)
metadata_df.head(n=5)

(3203, 8)


Unnamed: 0,resource,classification,metadata,permalink,link,owner,creator,preview_image_url
0,"{'name': 'For Hire Vehicles (FHV) - Active', '...","{'categories': [], 'tags': [], 'domain_categor...",{'domain': 'data.cityofnewyork.us'},https://data.cityofnewyork.us/d/8wbx-tsch,https://data.cityofnewyork.us/Transportation/F...,"{'id': '5fuc-pqz2', 'user_type': 'interactive'...","{'id': '5fuc-pqz2', 'user_type': 'interactive'...",
1,"{'name': 'Civil Service List (Active)', 'id': ...","{'categories': [], 'tags': [], 'domain_categor...",{'domain': 'data.cityofnewyork.us'},https://data.cityofnewyork.us/d/vx8i-nprf,https://data.cityofnewyork.us/City-Government/...,"{'id': '5fuc-pqz2', 'user_type': 'interactive'...","{'id': '5fuc-pqz2', 'user_type': 'interactive'...",
2,"{'name': 'DOB Job Application Filings', 'id': ...","{'categories': [], 'tags': [], 'domain_categor...",{'domain': 'data.cityofnewyork.us'},https://data.cityofnewyork.us/d/ic3t-wcy2,https://data.cityofnewyork.us/Housing-Developm...,"{'id': '5fuc-pqz2', 'user_type': 'interactive'...","{'id': '5fuc-pqz2', 'user_type': 'interactive'...",
3,"{'name': 'TLC New Driver Application Status', ...","{'categories': [], 'tags': [], 'domain_categor...",{'domain': 'data.cityofnewyork.us'},https://data.cityofnewyork.us/d/dpec-ucu7,https://data.cityofnewyork.us/Transportation/T...,"{'id': '5fuc-pqz2', 'user_type': 'interactive'...","{'id': '5fuc-pqz2', 'user_type': 'interactive'...",
4,{'name': 'For Hire Vehicles (FHV) - Active Dri...,"{'categories': [], 'tags': [], 'domain_categor...",{'domain': 'data.cityofnewyork.us'},https://data.cityofnewyork.us/d/xjfq-wh2d,https://data.cityofnewyork.us/Transportation/F...,"{'id': '5fuc-pqz2', 'user_type': 'interactive'...","{'id': '5fuc-pqz2', 'user_type': 'interactive'...",


In [24]:
# expand each result's nested 'resource' record into its own set of columns.
# The records are taken from data_dict rather than from metadata_df itself, so
# this cell can be re-run safely: once metadata_df is overwritten below it no
# longer has a 'resource' column to read from.
resource_records = [result['resource'] for result in data_dict['results']]
metadata_df = pd.DataFrame.from_records(resource_records)

# sanity check
print(metadata_df.shape)
metadata_df.head()

(3203, 27)


Unnamed: 0,name,id,resource_name,parent_fxf,description,attribution,attribution_link,contact_email,type,updatedAt,...,columns_description,columns_format,download_count,provenance,lens_view_type,lens_display_type,locked,blob_mime_type,hide_from_data_json,publication_date
0,For Hire Vehicles (FHV) - Active,8wbx-tsch,,[],"<b>PLEASE NOTE:</b> This dataset, which includ...",Taxi and Limousine Commission (TLC),,,dataset,2025-01-29T20:05:48.000Z,...,"[Vehicle VIN Number, Reason Code A,B,C or G\n,...","[{'displayStyle': 'plain', 'align': 'left'}, {...",540361,official,tabular,table,False,,False,2021-04-05T13:20:47.000Z
1,Civil Service List (Active),vx8i-nprf,,[],A Civil Service List consists of all candidate...,Department of Citywide Administrative Services...,,,dataset,2025-01-29T14:11:31.000Z,...,[The date on which a list will be extended bey...,"[{'view': 'date', 'align': 'left'}, {'view': '...",69881,official,tabular,table,False,,False,2024-01-12T16:15:05.000Z
2,DOB Job Application Filings,ic3t-wcy2,,[],This dataset contains all job applications sub...,Department of Buildings (DOB),,,dataset,2025-01-29T22:06:27.000Z,...,"[Proposed Height, Existing Occupancy, Existing...","[{'precisionStyle': 'standard', 'noCommas': 't...",60336,official,tabular,table,False,,False,2020-06-22T18:23:35.000Z
3,TLC New Driver Application Status,dpec-ucu7,,[],THIS DATASET IS UPDATED SEVERAL TIMES PER DAY....,Taxi and Limousine Commission (TLC),,,dataset,2025-01-30T11:01:24.000Z,...,[This field is for miscellaneous items that ma...,"[{'displayStyle': 'plain', 'align': 'left'}, {...",39724,official,tabular,table,False,,False,2019-12-17T18:44:57.000Z
4,For Hire Vehicles (FHV) - Active Drivers,xjfq-wh2d,,[],"<b>PLEASE NOTE:</b> This dataset, which includ...",Taxi and Limousine Commission (TLC),,,dataset,2025-01-29T20:08:09.000Z,...,"[Driver Name\n\n, WAV if Wheelchair Accessible...","[{'displayStyle': 'plain', 'align': 'left'}, {...",425712,official,tabular,table,False,,False,2024-01-11T19:58:17.000Z


In [25]:
# columns to keep from the resource metadata
cols = [
    'name',
    'id',
    'description',
    'attribution',
    'createdAt',
    'publication_date',
    'updatedAt',
    'metadata_updated_at',
    'data_updated_at',
    'download_count',
    'page_views',
]

# narrow the dataframe to just those columns, in that order
metadata_df = metadata_df[cols]

# preview data
metadata_df.head(n=5)

Unnamed: 0,name,id,description,attribution,createdAt,publication_date,updatedAt,metadata_updated_at,data_updated_at,download_count,page_views
0,For Hire Vehicles (FHV) - Active,8wbx-tsch,"<b>PLEASE NOTE:</b> This dataset, which includ...",Taxi and Limousine Commission (TLC),2015-07-16T17:33:32.000Z,2021-04-05T13:20:47.000Z,2025-01-29T20:05:48.000Z,2022-09-06T21:05:32.000Z,2025-01-29T20:05:48.000Z,540361,"{'page_views_last_week': 13225, 'page_views_la..."
1,Civil Service List (Active),vx8i-nprf,A Civil Service List consists of all candidate...,Department of Citywide Administrative Services...,2016-06-14T21:12:15.000Z,2024-01-12T16:15:05.000Z,2025-01-29T14:11:31.000Z,2024-05-14T14:26:52.000Z,2025-01-29T14:11:31.000Z,69881,"{'page_views_last_week': 5925, 'page_views_las..."
2,DOB Job Application Filings,ic3t-wcy2,This dataset contains all job applications sub...,Department of Buildings (DOB),2013-04-18T15:18:56.000Z,2020-06-22T18:23:35.000Z,2025-01-29T22:06:27.000Z,2020-06-23T02:08:44.000Z,2025-01-29T22:06:27.000Z,60336,"{'page_views_last_week': 566, 'page_views_last..."
3,TLC New Driver Application Status,dpec-ucu7,THIS DATASET IS UPDATED SEVERAL TIMES PER DAY....,Taxi and Limousine Commission (TLC),2016-05-17T18:43:43.000Z,2019-12-17T18:44:57.000Z,2025-01-30T11:01:24.000Z,2022-05-09T22:28:03.000Z,2025-01-30T11:01:24.000Z,39724,"{'page_views_last_week': 160, 'page_views_last..."
4,For Hire Vehicles (FHV) - Active Drivers,xjfq-wh2d,"<b>PLEASE NOTE:</b> This dataset, which includ...",Taxi and Limousine Commission (TLC),2015-07-16T17:24:02.000Z,2024-01-11T19:58:17.000Z,2025-01-29T20:08:09.000Z,2024-01-11T19:58:17.000Z,2025-01-29T20:08:09.000Z,425712,"{'page_views_last_week': 3586, 'page_views_las..."


In [26]:
# preview columns: print each column's name, non-null count and dtype
metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3203 entries, 0 to 3202
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   name                 3203 non-null   object
 1   id                   3203 non-null   object
 2   description          3203 non-null   object
 3   attribution          3133 non-null   object
 4   createdAt            3203 non-null   object
 5   publication_date     3178 non-null   object
 6   updatedAt            3203 non-null   object
 7   metadata_updated_at  3203 non-null   object
 8   data_updated_at      3032 non-null   object
 9   download_count       3203 non-null   int64 
 10  page_views           3203 non-null   object
dtypes: int64(1), object(10)
memory usage: 275.4+ KB


# Read In Results
Read in the results of our character match for *flood*.

In [27]:
# list files in the results folder (path is relative to the notebook's working directory)
%ls ../results/

results.csv


In [28]:
# location of the results file
path = "../results/results.csv"

# load the CSV into a dataframe
df = pd.read_csv(filepath_or_buffer=path)

# sanity check: dimensions, then the first ten rows
print(df.shape)
df.head(n=10)

(19159, 5)


Unnamed: 0,value,count,timestamp,column,id
0,10. Flood Elimination,1,2025-01-26 21:02:28,Work Type,24nr-gahi
1,Flood Triangle,1,2025-01-26 21:36:18,name,2bsr-c6qq
2,INSTALLATION OF HIGH INTENSITY FLOOD LIGHTS IN...,1,2025-01-26 21:37:59,Budget Line Description,2cmn-uidm
3,Flood,20,2025-01-26 21:41:35,Officer Last Name,2fir-qns4
4,Floodsuero,1,2025-01-26 21:41:35,Officer Last Name,2fir-qns4
5,structural damage (has water but flooding surr...,1,2025-01-26 21:49:08,Comments,2jvr-j6ne
6,constantly running/side of fountain leaking & ...,1,2025-01-26 21:49:08,Comments,2jvr-j6ne
7,no water (park worker said it floods park house),1,2025-01-26 21:49:08,Comments,2jvr-j6ne
8,leaking/flooded,1,2025-01-26 21:49:08,Comments,2jvr-j6ne
9,leaking & causing flooding adj HBCT,1,2025-01-26 21:49:08,Comments,2jvr-j6ne


In [29]:
# list the column names of the results dataframe
df.columns

Index(['value', 'count', 'timestamp', 'column', 'id'], dtype='object')

In [30]:
# put the dataset id and column name first, followed by the matched value, count and timestamp
cols = ['id', 'column', 'value', 'count', 'timestamp']
df = df[cols]

# preview the reordered columns
df.head(n=5)

Unnamed: 0,id,column,value,count,timestamp
0,24nr-gahi,Work Type,10. Flood Elimination,1,2025-01-26 21:02:28
1,2bsr-c6qq,name,Flood Triangle,1,2025-01-26 21:36:18
2,2cmn-uidm,Budget Line Description,INSTALLATION OF HIGH INTENSITY FLOOD LIGHTS IN...,1,2025-01-26 21:37:59
3,2fir-qns4,Officer Last Name,Flood,20,2025-01-26 21:41:35
4,2fir-qns4,Officer Last Name,Floodsuero,1,2025-01-26 21:41:35


In [31]:
# merge metadata with data: a left join keeps every row of df and attaches the
# metadata columns for its dataset id.
# validate='many_to_one' raises a MergeError if an id appears more than once in
# metadata_df, instead of silently multiplying the matching rows of df.
df = df.merge(
    metadata_df,
    on='id',
    how='left',
    validate='many_to_one'
)

# sanity check: the row count should be unchanged by the merge
print(df.shape)
df.head()

(19159, 15)


Unnamed: 0,id,column,value,count,timestamp,name,description,attribution,createdAt,publication_date,updatedAt,metadata_updated_at,data_updated_at,download_count,page_views
0,24nr-gahi,Work Type,10. Flood Elimination,1,2025-01-26 21:02:28,Five Year Plan Summary by Capital Category,Five year plan summary cost by capital category.,NYC School Construction Authority (SCA),2016-06-02T14:46:59.000Z,2023-12-27T20:28:30.000Z,2025-01-10T16:06:06.000Z,2025-01-10T16:06:05.000Z,2025-01-10T16:06:06.000Z,1711,"{'page_views_last_week': 4, 'page_views_last_m..."
1,2bsr-c6qq,name,Flood Triangle,1,2025-01-26 21:36:18,Universal Solicitation for Broadband Asset Dat...,List of existing City and City Partner assets ...,Office of Technology and Innovation (OTI),2020-10-02T18:16:49.000Z,2021-03-02T22:05:30.000Z,2022-09-23T19:23:10.000Z,2022-09-23T19:23:10.000Z,2021-03-12T21:06:26.000Z,1215,"{'page_views_last_week': 5, 'page_views_last_m..."
2,2cmn-uidm,Budget Line Description,INSTALLATION OF HIGH INTENSITY FLOOD LIGHTS IN...,1,2025-01-26 21:37:59,Capital Commitment Plan,This dataset contains capital commitment plan ...,Mayor's Office of Management & Budget (OMB),2018-02-05T21:41:21.000Z,2019-02-14T23:27:13.000Z,2025-01-23T14:38:45.000Z,2025-01-23T14:38:44.000Z,2025-01-23T14:38:45.000Z,3672,"{'page_views_last_week': 29, 'page_views_last_..."
3,2fir-qns4,Officer Last Name,Flood,20,2025-01-26 21:41:35,Civilian Complaint Review Board: Police Officers,"A list of all NYPD officers, as reported to CC...",Civilian Complaint Review Board (CCRB),2022-06-06T16:15:01.000Z,2023-01-09T14:41:04.000Z,2025-01-29T15:06:14.000Z,2025-01-29T15:00:19.000Z,2025-01-29T15:06:14.000Z,5666,"{'page_views_last_week': 66, 'page_views_last_..."
4,2fir-qns4,Officer Last Name,Floodsuero,1,2025-01-26 21:41:35,Civilian Complaint Review Board: Police Officers,"A list of all NYPD officers, as reported to CC...",Civilian Complaint Review Board (CCRB),2022-06-06T16:15:01.000Z,2023-01-09T14:41:04.000Z,2025-01-29T15:06:14.000Z,2025-01-29T15:00:19.000Z,2025-01-29T15:06:14.000Z,5666,"{'page_views_last_week': 66, 'page_views_last_..."


# Analysis

In [32]:
# total of the 'count' column (flood matches) per agency, largest first, top 60
(
    df
    .groupby('attribution')['count']
    .agg('sum')
    .sort_values(ascending=False)
    .iloc[:60]
    .reset_index()
)

Unnamed: 0,attribution,count
0,311,584776
1,Department of Transportation (DOT),234518
2,Department of Housing Preservation & Developme...,37348
3,Department of Parks and Recreation (DPR),15665
4,Department of Buildings (DOB),9372
5,Department of City Planning (DCP),6027
6,NYC Emergency Management (NYCEM),4613
7,NYC Department of Transportation,3220
8,New York City Office of Management and Budget,2700
9,Department of Records and Information Services...,2536


In [33]:
# agencies ranked after the first 60 in the per-agency totals (the last four in this run)
(
    df
    .groupby('attribution')['count']
    .agg('sum')
    .sort_values(ascending=False)
    .reset_index()
    .iloc[60:]
)

Unnamed: 0,attribution,count
60,NYC Service,1
61,Mayor's Office of Management & Budget (OMB),1
62,Office of the Staten Island Borough President,1
63,Brooklyn Borough President,1


In [34]:
# total of the 'count' column (flood matches) per dataset name, largest first, top 60
(
    df
    .groupby('name')['count']
    .agg('sum')
    .sort_values(ascending=False)
    .iloc[:60]
    .reset_index()
)

Unnamed: 0,name,count
0,311 Call Center Inquiry,226770
1,311 Service Requests from 2010 to Present,179786
2,Pedestrian Mobility Plan Pedestrian Demand,121572
3,311 Web Content - Services,67528
4,Street Construction Permits - Stipulations (Hi...,52696
5,Housing Maintenance Code Complaints and Problems,37348
6,Bicycle Parking,34473
7,311 Service Requests for 2007,22238
8,311 Service Requests for 2006,19968
9,311 Service Requests for 2008,19168


In [35]:
# total of the 'count' column (flood matches) per agency and dataset, largest first, top 60
group_keys = ['attribution', 'name']
(
    df
    .groupby(group_keys)['count']
    .agg('sum')
    .sort_values(ascending=False)
    .iloc[:60]
    .reset_index()
)

Unnamed: 0,attribution,name,count
0,311,311 Call Center Inquiry,226770
1,311,311 Service Requests from 2010 to Present,179786
2,Department of Transportation (DOT),Pedestrian Mobility Plan Pedestrian Demand,121572
3,311,311 Web Content - Services,67528
4,Department of Transportation (DOT),Street Construction Permits - Stipulations (Hi...,52696
5,Department of Housing Preservation & Developme...,Housing Maintenance Code Complaints and Problems,37348
6,Department of Transportation (DOT),Bicycle Parking,34473
7,311,311 Service Requests for 2007,22238
8,311,311 Service Requests for 2006,19968
9,311,311 Service Requests for 2008,19168


In [36]:
# total of the 'count' column (flood matches) per agency, dataset and column name,
# largest first, top 60
(
    df
    .groupby(['attribution', 'name', 'column'])['count']
    .agg('sum')
    .sort_values(ascending=False)
    .iloc[:60]
    .reset_index()
)

Unnamed: 0,attribution,name,column,count
0,311,311 Service Requests from 2010 to Present,descriptor,174542
1,311,311 Call Center Inquiry,brief_description,167493
2,Department of Transportation (DOT),Pedestrian Mobility Plan Pedestrian Demand,FEMAFldT,121572
3,311,311 Call Center Inquiry,inquiry_name,59277
4,Department of Housing Preservation & Developme...,Housing Maintenance Code Complaints and Problems,problem_code,37342
5,311,311 Web Content - Services,web_keywords,35090
6,Department of Transportation (DOT),Bicycle Parking,FEMAFldT,34473
7,Department of Transportation (DOT),Street Construction Permits - Stipulations (Hi...,stipulationfulltext,26348
8,Department of Transportation (DOT),Street Construction Permits - Stipulations (Hi...,stipulationid,26348
9,311,311 Service Requests for 2007,descriptor,22238


In [37]:
# total of the 'count' column (flood matches) per agency, dataset, column name
# and column value, largest first, top 60
(
    df
    .groupby(['attribution', 'name', 'column', 'value'])['count']
    .agg('sum')
    .sort_values(ascending=False)
    .iloc[:60]
    .reset_index()
)

Unnamed: 0,attribution,name,column,value,count
0,311,311 Service Requests from 2010 to Present,descriptor,Catch Basin Clogged/Flooding (Use Comments) (SC),119031
1,Department of Transportation (DOT),Pedestrian Mobility Plan Pedestrian Demand,FEMAFldT,AREA OF MINIMAL FLOOD HAZARD,111949
2,311,311 Call Center Inquiry,brief_description,Report a catch basin or storm drain that is cl...,67458
3,311,311 Call Center Inquiry,brief_description,"Report water or sewage coming out a toilet, si...",46216
4,311,311 Service Requests from 2010 to Present,descriptor,Street Flooding (SJ),41272
5,Department of Housing Preservation & Developme...,Housing Maintenance Code Complaints and Problems,problem_code,FIXTURE WET OR FLOODED,37342
6,Department of Transportation (DOT),Bicycle Parking,FEMAFldT,AREA OF MINIMAL FLOOD HAZARD,33586
7,Department of Transportation (DOT),Street Construction Permits - Stipulations (Hi...,stipulationid,FLOODA,26348
8,Department of Transportation (DOT),Street Construction Permits - Stipulations (Hi...,stipulationfulltext,NYC OEM EVACUATION FLOOD ZONE A,26348
9,311,311 Call Center Inquiry,inquiry_name,Flooding Street,25267


In [38]:
# total of the 'count' column (flood matches) per agency, dataset, column name
# and column value, largest first, top 60
# rows attributed to 311 or to the Department of Transportation (DOT) are left out
(
    df[~df['attribution'].isin(['311', 'Department of Transportation (DOT)'])]
    .groupby(['attribution', 'name', 'column', 'value'])['count']
    .agg('sum')
    .sort_values(ascending=False)
    .iloc[:60]
    .reset_index()
)

Unnamed: 0,attribution,name,column,value,count
0,Department of Housing Preservation & Developme...,Housing Maintenance Code Complaints and Problems,problem_code,FIXTURE WET OR FLOODED,37342
1,Department of Parks and Recreation (DPR),Daily Tasks Park Cleaning Records,signname,X061 | Flood Triangle,10035
2,Department of City Planning (DCP),Building Elevation and Subgrade (BES),notes2,Fully Enclosed Ground floor for Apartment Lobb...,4469
3,NYC Department of Transportation,Bus Stop Shelter,FEMAFldT,AREA OF MINIMAL FLOOD HAZARD,3064
4,Department of Environmental Protection (DEP),Harbor Water Quality,Current Direction (Current Direction),Flood,2419
5,Business Integrity Commission,Trade Waste Broker Registrants,EMAIL,kflood@astorcompany.com,1542
6,Department of Records and Information Services...,NYC Historical Vital Records: Index to Digitiz...,last_name,Flood,1017
7,New York City Council (NYCC),NYC Council Constituent Services,DESCRIPTOR,Street Flooding,1011
8,Department of City Planning (DCP),Building Elevation and Subgrade (BES),notes2,"Commercial, Industrial or other Non-Residentia...",986
9,Department of Records and Information Services...,NYC Historical Vital Records: Index to Digitiz...,last_name,Flood,719
