# NYC Open *Big* Data Analysis
Author: Mark Bauer

In [1]:
# import libraries
import pandas as pd

In [2]:
# list files in directory
%ls

analysis.ipynb     export-data.ipynb  log.txt


In [3]:
# log file
file = 'log.txt'

# names of columns
names = [
    'datetime_log',
    'id',
    'error_log',
    'count_rows'
]

# read log file into dataframe
df = pd.read_csv(file, names=names)

# preview data
print(f"shape of data: {df.shape}")
df.head()

shape of data: (2554, 4)


Unnamed: 0,datetime_log,id,error_log,count_rows
0,2024-08-09 13:56:42,fkec-mjr6,,182.0
1,2024-08-09 13:56:47,mzxg-pwib,,27673.0
2,2024-08-09 13:56:53,6r9j-qrwz,,91.0
3,2024-08-09 13:57:00,99xv-he3n,,188.0
4,2024-08-09 13:57:06,ufxk-pq9j,,39.0


In [4]:
# column information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2554 entries, 0 to 2553
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   datetime_log  2554 non-null   object 
 1   id            2554 non-null   object 
 2   error_log     1 non-null      object 
 3   count_rows    2553 non-null   float64
dtypes: float64(1), object(3)
memory usage: 79.9+ KB


In [5]:
# summary statistics
df.describe().round(1)

Unnamed: 0,count_rows
count,2553.0
mean,2337140.8
std,18089982.8
min,0.0
25%,138.0
50%,1396.0
75%,12383.0
max,376404531.0


In [6]:
# is dataset id unique
df['id'].is_unique

True

In [7]:
# count nulls per column
df.isnull().sum()

datetime_log       0
id                 0
error_log       2553
count_rows         1
dtype: int64

Check row with null `count_rows`.

In [8]:
df.loc[df['count_rows'].isnull()]

Unnamed: 0,datetime_log,id,error_log,count_rows
174,2024-08-09 14:32:04,erdf-2akx,Request error for erdf-2akx: 408 Client Error:...,


In [9]:
# preview error log
df.loc[df['error_log'].notnull(), 'error_log'].values[0]

'Request error for erdf-2akx: 408 Client Error: Request Timeout for url: https://data.cityofnewyork.us/resource/erdf-2akx.json?$select=count(*)'

Skip this dataset for now. However, we should confirm the dataset is functional and confirm the number or rows.

In [10]:
# fill null count_rows
df['count_rows'] = df['count_rows'].fillna(0).astype(int)

df.head()

Unnamed: 0,datetime_log,id,error_log,count_rows
0,2024-08-09 13:56:42,fkec-mjr6,,182
1,2024-08-09 13:56:47,mzxg-pwib,,27673
2,2024-08-09 13:56:53,6r9j-qrwz,,91
3,2024-08-09 13:57:00,99xv-he3n,,188
4,2024-08-09 13:57:06,ufxk-pq9j,,39


# Metadata

In [11]:
# read dataset metadata to join information
path = 'https://data.cityofnewyork.us/api/views/metadata/v1'
metadata_df = pd.read_json(path)

print(metadata_df.shape)
metadata_df.head()

(3235, 21)


Unnamed: 0,id,name,attribution,attributionLink,category,createdAt,dataUpdatedAt,dataUri,description,domain,...,hideFromCatalog,hideFromDataJson,license,metadataUpdatedAt,provenance,updatedAt,webUri,approvals,customFields,tags
0,fkec-mjr6,"DOHMH Cryptosporidiosis by Race/Ethnicity, Age...",Department of Health and Mental Hygiene (DOHMH),,Health,2024-08-05T14:12:47+0000,2024-08-05T16:04:46+0000,https://data.cityofnewyork.us/resource/fkec-mjr6,"Cryptosporidiosis, number of cases and annual ...",data.cityofnewyork.us,...,False,False,,2024-08-05T16:33:29+0000,OFFICIAL,2024-08-05T16:34:05+0000,https://data.cityofnewyork.us/d/fkec-mjr6,"[{'reviewedAt': 1722875645, 'reviewedAutomatic...","{'Update': {'Automation': 'No', 'Date Made Pub...","[cryptosporidiosis, diagnosis year, race ethni..."
1,r6e8-2fwe,Location of Disposal Facilities and Sites Used...,NYC Department of Sanitation (DSNY),,City Government,2024-07-31T14:38:56+0000,2024-07-31T14:33:03+0000,https://data.cityofnewyork.us/resource/r6e8-2fwe,The location of the disposal facilities where ...,data.cityofnewyork.us,...,False,False,,2024-07-31T19:40:30+0000,OFFICIAL,2024-07-31T19:53:25+0000,https://data.cityofnewyork.us/d/r6e8-2fwe,"[{'reviewedAt': 1722455605, 'reviewedAutomatic...",{'Data Collection': {'Data Collection': 'Dispo...,
2,9e2b-mctv,New York City Bike Routes\t (Map),Department of Transportation (DOT),https://www.nyc.gov/html/dot/html/bicyclists/b...,,2024-07-24T16:08:52+0000,2024-07-24T16:06:04+0000,https://data.cityofnewyork.us/resource/9e2b-mctv,The New York City Department of Transportation...,data.cityofnewyork.us,...,False,False,,2024-08-06T21:34:51+0000,OFFICIAL,2024-08-06T21:34:51+0000,https://data.cityofnewyork.us/d/9e2b-mctv,"[{'reviewedAt': 1722300713, 'reviewedAutomatic...","{'Update': {'Automation': 'No', 'Update Freque...",
3,mzxg-pwib,New York City Bike Routes,Department of Transportation (DOT),https://www.nyc.gov/html/dot/html/bicyclists/b...,,2024-07-24T15:57:31+0000,2024-07-24T16:06:04+0000,https://data.cityofnewyork.us/resource/mzxg-pwib,The New York City Department of Transportation...,data.cityofnewyork.us,...,False,False,,2024-07-30T00:50:54+0000,OFFICIAL,2024-07-30T00:51:27+0000,https://data.cityofnewyork.us/d/mzxg-pwib,"[{'reviewedAt': 1722300687, 'reviewedAutomatic...","{'Update': {'Automation': 'No', 'Date Made Pub...","[nyc bike routes, bike routes]"
4,6r9j-qrwz,DSNY Disposal Facilities Used by Year,NYC Department of Sanitation (DSNY),,City Government,2024-07-12T18:37:24+0000,2024-07-31T14:21:50+0000,https://data.cityofnewyork.us/resource/6r9j-qrwz,A listing of the facilities used by year to ha...,data.cityofnewyork.us,...,False,False,,2024-07-31T19:45:38+0000,OFFICIAL,2024-07-31T19:51:22+0000,https://data.cityofnewyork.us/d/6r9j-qrwz,"[{'reviewedAt': 1722455482, 'reviewedAutomatic...",{'Data Collection': {'Data Collection': 'Dispo...,"[sanitation, waste, transfer station, waste to..."


In [12]:
# metadata column info
metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3235 entries, 0 to 3234
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 3235 non-null   object 
 1   name               3235 non-null   object 
 2   attribution        3092 non-null   object 
 3   attributionLink    455 non-null    object 
 4   category           3125 non-null   object 
 5   createdAt          3235 non-null   object 
 6   dataUpdatedAt      3062 non-null   object 
 7   dataUri            3235 non-null   object 
 8   description        3161 non-null   object 
 9   domain             3235 non-null   object 
 10  externalId         0 non-null      float64
 11  hideFromCatalog    3235 non-null   bool   
 12  hideFromDataJson   3235 non-null   bool   
 13  license            79 non-null     object 
 14  metadataUpdatedAt  3235 non-null   object 
 15  provenance         3235 non-null   object 
 16  updatedAt          3235 

# Metadata Views
Similar to the metadata api but contains much more information.

In [13]:
# metadata views
path = 'https://data.cityofnewyork.us/api/views/'
views_df = pd.read_json(path)

print(views_df.shape)
views_df.head()

(3235, 50)


Unnamed: 0,id,name,assetType,averageRating,category,createdAt,description,displayType,downloadCount,hideFromCatalog,...,blobFilename,blobFileSize,blobId,blobMimeType,ratings,childViews,indexUpdatedAt,iconUrl,previewImageId,disabledFeatureFlags
0,fkec-mjr6,"DOHMH Cryptosporidiosis by Race/Ethnicity, Age...",dataset,0,Health,1722867167,"Cryptosporidiosis, number of cases and annual ...",table,4,False,...,,,,,,,,,,
1,r6e8-2fwe,Location of Disposal Facilities and Sites Used...,map,0,City Government,1722436736,The location of the disposal facilities where ...,visualization_canvas_map,0,False,...,,,,,,,,,,
2,9e2b-mctv,New York City Bike Routes\t (Map),map,0,,1721837332,The New York City Department of Transportation...,visualization_canvas_map,0,False,...,,,,,,,,,,
3,mzxg-pwib,New York City Bike Routes,dataset,0,,1721836651,The New York City Department of Transportation...,table,89,False,...,,,,,,,,,,
4,6r9j-qrwz,DSNY Disposal Facilities Used by Year,dataset,0,City Government,1720809444,A listing of the facilities used by year to ha...,table,7,False,...,,,,,,,,,,


In [14]:
# preview column info
views_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3235 entries, 0 to 3234
Data columns (total 50 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        3235 non-null   object 
 1   name                      3235 non-null   object 
 2   assetType                 3235 non-null   object 
 3   averageRating             3235 non-null   int64  
 4   category                  3125 non-null   object 
 5   createdAt                 3235 non-null   int64  
 6   description               3161 non-null   object 
 7   displayType               3235 non-null   object 
 8   downloadCount             3235 non-null   int64  
 9   hideFromCatalog           3235 non-null   bool   
 10  hideFromDataJson          3235 non-null   bool   
 11  locked                    3235 non-null   bool   
 12  newBackend                3235 non-null   bool   
 13  numberOfComments          3235 non-null   int64  
 14  oid     

In [15]:
# retrieve only selected columns
cols = [
    'id',
    'viewCount', 'downloadCount',
    'assetType', 'displayType'
]

views_df = views_df.loc[:, cols]

views_df.head()

Unnamed: 0,id,viewCount,downloadCount,assetType,displayType
0,fkec-mjr6,76,4,dataset,table
1,r6e8-2fwe,53,0,map,visualization_canvas_map
2,9e2b-mctv,328,0,map,visualization_canvas_map
3,mzxg-pwib,436,89,dataset,table
4,6r9j-qrwz,93,7,dataset,table


In [16]:
# merge metadata with metadata views
metadata_merged_df = metadata_df.merge(
    views_df,
    on='id',
    how='right'
)

print(metadata_merged_df.shape)
metadata_merged_df.head()

(3235, 25)


Unnamed: 0,id,name,attribution,attributionLink,category,createdAt,dataUpdatedAt,dataUri,description,domain,...,provenance,updatedAt,webUri,approvals,customFields,tags,viewCount,downloadCount,assetType,displayType
0,fkec-mjr6,"DOHMH Cryptosporidiosis by Race/Ethnicity, Age...",Department of Health and Mental Hygiene (DOHMH),,Health,2024-08-05T14:12:47+0000,2024-08-05T16:04:46+0000,https://data.cityofnewyork.us/resource/fkec-mjr6,"Cryptosporidiosis, number of cases and annual ...",data.cityofnewyork.us,...,OFFICIAL,2024-08-05T16:34:05+0000,https://data.cityofnewyork.us/d/fkec-mjr6,"[{'reviewedAt': 1722875645, 'reviewedAutomatic...","{'Update': {'Automation': 'No', 'Date Made Pub...","[cryptosporidiosis, diagnosis year, race ethni...",76,4,dataset,table
1,r6e8-2fwe,Location of Disposal Facilities and Sites Used...,NYC Department of Sanitation (DSNY),,City Government,2024-07-31T14:38:56+0000,2024-07-31T14:33:03+0000,https://data.cityofnewyork.us/resource/r6e8-2fwe,The location of the disposal facilities where ...,data.cityofnewyork.us,...,OFFICIAL,2024-07-31T19:53:25+0000,https://data.cityofnewyork.us/d/r6e8-2fwe,"[{'reviewedAt': 1722455605, 'reviewedAutomatic...",{'Data Collection': {'Data Collection': 'Dispo...,,53,0,map,visualization_canvas_map
2,9e2b-mctv,New York City Bike Routes\t (Map),Department of Transportation (DOT),https://www.nyc.gov/html/dot/html/bicyclists/b...,,2024-07-24T16:08:52+0000,2024-07-24T16:06:04+0000,https://data.cityofnewyork.us/resource/9e2b-mctv,The New York City Department of Transportation...,data.cityofnewyork.us,...,OFFICIAL,2024-08-06T21:34:51+0000,https://data.cityofnewyork.us/d/9e2b-mctv,"[{'reviewedAt': 1722300713, 'reviewedAutomatic...","{'Update': {'Automation': 'No', 'Update Freque...",,328,0,map,visualization_canvas_map
3,mzxg-pwib,New York City Bike Routes,Department of Transportation (DOT),https://www.nyc.gov/html/dot/html/bicyclists/b...,,2024-07-24T15:57:31+0000,2024-07-24T16:06:04+0000,https://data.cityofnewyork.us/resource/mzxg-pwib,The New York City Department of Transportation...,data.cityofnewyork.us,...,OFFICIAL,2024-07-30T00:51:27+0000,https://data.cityofnewyork.us/d/mzxg-pwib,"[{'reviewedAt': 1722300687, 'reviewedAutomatic...","{'Update': {'Automation': 'No', 'Date Made Pub...","[nyc bike routes, bike routes]",436,89,dataset,table
4,6r9j-qrwz,DSNY Disposal Facilities Used by Year,NYC Department of Sanitation (DSNY),,City Government,2024-07-12T18:37:24+0000,2024-07-31T14:21:50+0000,https://data.cityofnewyork.us/resource/6r9j-qrwz,A listing of the facilities used by year to ha...,data.cityofnewyork.us,...,OFFICIAL,2024-07-31T19:51:22+0000,https://data.cityofnewyork.us/d/6r9j-qrwz,"[{'reviewedAt': 1722455482, 'reviewedAutomatic...",{'Data Collection': {'Data Collection': 'Dispo...,"[sanitation, waste, transfer station, waste to...",93,7,dataset,table


In [17]:
# preview column info
metadata_merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3235 entries, 0 to 3234
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 3235 non-null   object 
 1   name               3235 non-null   object 
 2   attribution        3092 non-null   object 
 3   attributionLink    455 non-null    object 
 4   category           3125 non-null   object 
 5   createdAt          3235 non-null   object 
 6   dataUpdatedAt      3062 non-null   object 
 7   dataUri            3235 non-null   object 
 8   description        3161 non-null   object 
 9   domain             3235 non-null   object 
 10  externalId         0 non-null      float64
 11  hideFromCatalog    3235 non-null   bool   
 12  hideFromDataJson   3235 non-null   bool   
 13  license            79 non-null     object 
 14  metadataUpdatedAt  3235 non-null   object 
 15  provenance         3235 non-null   object 
 16  updatedAt          3235 

In [18]:
# select specific columns
cols = [
    'id', 'name', 'attribution', 'description',
    'viewCount', 'downloadCount',
    'category', 'assetType', 'displayType', 'tags',
    'createdAt', 'updatedAt', 'dataUpdatedAt', 'metadataUpdatedAt',
    'domain', 'attributionLink', 'webUri', 'dataUri'
]

metadata_merged_df = metadata_merged_df.loc[:, cols]

metadata_merged_df.head()

Unnamed: 0,id,name,attribution,description,viewCount,downloadCount,category,assetType,displayType,tags,createdAt,updatedAt,dataUpdatedAt,metadataUpdatedAt,domain,attributionLink,webUri,dataUri
0,fkec-mjr6,"DOHMH Cryptosporidiosis by Race/Ethnicity, Age...",Department of Health and Mental Hygiene (DOHMH),"Cryptosporidiosis, number of cases and annual ...",76,4,Health,dataset,table,"[cryptosporidiosis, diagnosis year, race ethni...",2024-08-05T14:12:47+0000,2024-08-05T16:34:05+0000,2024-08-05T16:04:46+0000,2024-08-05T16:33:29+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/fkec-mjr6,https://data.cityofnewyork.us/resource/fkec-mjr6
1,r6e8-2fwe,Location of Disposal Facilities and Sites Used...,NYC Department of Sanitation (DSNY),The location of the disposal facilities where ...,53,0,City Government,map,visualization_canvas_map,,2024-07-31T14:38:56+0000,2024-07-31T19:53:25+0000,2024-07-31T14:33:03+0000,2024-07-31T19:40:30+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/r6e8-2fwe,https://data.cityofnewyork.us/resource/r6e8-2fwe
2,9e2b-mctv,New York City Bike Routes\t (Map),Department of Transportation (DOT),The New York City Department of Transportation...,328,0,,map,visualization_canvas_map,,2024-07-24T16:08:52+0000,2024-08-06T21:34:51+0000,2024-07-24T16:06:04+0000,2024-08-06T21:34:51+0000,data.cityofnewyork.us,https://www.nyc.gov/html/dot/html/bicyclists/b...,https://data.cityofnewyork.us/d/9e2b-mctv,https://data.cityofnewyork.us/resource/9e2b-mctv
3,mzxg-pwib,New York City Bike Routes,Department of Transportation (DOT),The New York City Department of Transportation...,436,89,,dataset,table,"[nyc bike routes, bike routes]",2024-07-24T15:57:31+0000,2024-07-30T00:51:27+0000,2024-07-24T16:06:04+0000,2024-07-30T00:50:54+0000,data.cityofnewyork.us,https://www.nyc.gov/html/dot/html/bicyclists/b...,https://data.cityofnewyork.us/d/mzxg-pwib,https://data.cityofnewyork.us/resource/mzxg-pwib
4,6r9j-qrwz,DSNY Disposal Facilities Used by Year,NYC Department of Sanitation (DSNY),A listing of the facilities used by year to ha...,93,7,City Government,dataset,table,"[sanitation, waste, transfer station, waste to...",2024-07-12T18:37:24+0000,2024-07-31T19:51:22+0000,2024-07-31T14:21:50+0000,2024-07-31T19:45:38+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/6r9j-qrwz,https://data.cityofnewyork.us/resource/6r9j-qrwz


In [19]:
# we only want datasets that are datasets (e.g. not map or dashboards) and are displayed as tables
metadata_merged_df = metadata_merged_df.loc[
    (metadata_merged_df['assetType'] == 'dataset')
    & (metadata_merged_df['displayType'] == 'table')
]

print(metadata_merged_df.shape)
metadata_merged_df.head()

(2552, 18)


Unnamed: 0,id,name,attribution,description,viewCount,downloadCount,category,assetType,displayType,tags,createdAt,updatedAt,dataUpdatedAt,metadataUpdatedAt,domain,attributionLink,webUri,dataUri
0,fkec-mjr6,"DOHMH Cryptosporidiosis by Race/Ethnicity, Age...",Department of Health and Mental Hygiene (DOHMH),"Cryptosporidiosis, number of cases and annual ...",76,4,Health,dataset,table,"[cryptosporidiosis, diagnosis year, race ethni...",2024-08-05T14:12:47+0000,2024-08-05T16:34:05+0000,2024-08-05T16:04:46+0000,2024-08-05T16:33:29+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/fkec-mjr6,https://data.cityofnewyork.us/resource/fkec-mjr6
3,mzxg-pwib,New York City Bike Routes,Department of Transportation (DOT),The New York City Department of Transportation...,436,89,,dataset,table,"[nyc bike routes, bike routes]",2024-07-24T15:57:31+0000,2024-07-30T00:51:27+0000,2024-07-24T16:06:04+0000,2024-07-30T00:50:54+0000,data.cityofnewyork.us,https://www.nyc.gov/html/dot/html/bicyclists/b...,https://data.cityofnewyork.us/d/mzxg-pwib,https://data.cityofnewyork.us/resource/mzxg-pwib
4,6r9j-qrwz,DSNY Disposal Facilities Used by Year,NYC Department of Sanitation (DSNY),A listing of the facilities used by year to ha...,93,7,City Government,dataset,table,"[sanitation, waste, transfer station, waste to...",2024-07-12T18:37:24+0000,2024-07-31T19:51:22+0000,2024-07-31T14:21:50+0000,2024-07-31T19:45:38+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/6r9j-qrwz,https://data.cityofnewyork.us/resource/6r9j-qrwz
5,99xv-he3n,DSNY Disposal Sites Used by Facilities by Year,NYC Department of Sanitation (DSNY),A listing of the disposal sites used by each f...,69,8,City Government,dataset,table,"[sanitation, waste, transfer station, waste to...",2024-07-12T18:18:59+0000,2024-07-31T19:51:26+0000,2024-07-31T14:18:13+0000,2024-07-31T19:44:47+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/99xv-he3n,https://data.cityofnewyork.us/resource/99xv-he3n
6,ufxk-pq9j,Location of Disposal Facilities and Sites Used...,NYC Department of Sanitation (DSNY),The location of the disposal facilities where ...,83,17,City Government,dataset,table,"[sanitation, waste, transfer station, waste to...",2024-07-12T17:54:05+0000,2024-07-31T19:51:24+0000,2024-07-31T14:33:03+0000,2024-07-31T19:45:15+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/ufxk-pq9j,https://data.cityofnewyork.us/resource/ufxk-pq9j


In [20]:
# merge dataset log file with metadata
merged_df = df.merge(
    metadata_merged_df,
    on='id',
    how='left'
)

print(merged_df.shape)
merged_df.head()

(2554, 21)


Unnamed: 0,datetime_log,id,error_log,count_rows,name,attribution,description,viewCount,downloadCount,category,...,displayType,tags,createdAt,updatedAt,dataUpdatedAt,metadataUpdatedAt,domain,attributionLink,webUri,dataUri
0,2024-08-09 13:56:42,fkec-mjr6,,182,"DOHMH Cryptosporidiosis by Race/Ethnicity, Age...",Department of Health and Mental Hygiene (DOHMH),"Cryptosporidiosis, number of cases and annual ...",76.0,4.0,Health,...,table,"[cryptosporidiosis, diagnosis year, race ethni...",2024-08-05T14:12:47+0000,2024-08-05T16:34:05+0000,2024-08-05T16:04:46+0000,2024-08-05T16:33:29+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/fkec-mjr6,https://data.cityofnewyork.us/resource/fkec-mjr6
1,2024-08-09 13:56:47,mzxg-pwib,,27673,New York City Bike Routes,Department of Transportation (DOT),The New York City Department of Transportation...,436.0,89.0,,...,table,"[nyc bike routes, bike routes]",2024-07-24T15:57:31+0000,2024-07-30T00:51:27+0000,2024-07-24T16:06:04+0000,2024-07-30T00:50:54+0000,data.cityofnewyork.us,https://www.nyc.gov/html/dot/html/bicyclists/b...,https://data.cityofnewyork.us/d/mzxg-pwib,https://data.cityofnewyork.us/resource/mzxg-pwib
2,2024-08-09 13:56:53,6r9j-qrwz,,91,DSNY Disposal Facilities Used by Year,NYC Department of Sanitation (DSNY),A listing of the facilities used by year to ha...,93.0,7.0,City Government,...,table,"[sanitation, waste, transfer station, waste to...",2024-07-12T18:37:24+0000,2024-07-31T19:51:22+0000,2024-07-31T14:21:50+0000,2024-07-31T19:45:38+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/6r9j-qrwz,https://data.cityofnewyork.us/resource/6r9j-qrwz
3,2024-08-09 13:57:00,99xv-he3n,,188,DSNY Disposal Sites Used by Facilities by Year,NYC Department of Sanitation (DSNY),A listing of the disposal sites used by each f...,69.0,8.0,City Government,...,table,"[sanitation, waste, transfer station, waste to...",2024-07-12T18:18:59+0000,2024-07-31T19:51:26+0000,2024-07-31T14:18:13+0000,2024-07-31T19:44:47+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/99xv-he3n,https://data.cityofnewyork.us/resource/99xv-he3n
4,2024-08-09 13:57:06,ufxk-pq9j,,39,Location of Disposal Facilities and Sites Used...,NYC Department of Sanitation (DSNY),The location of the disposal facilities where ...,83.0,17.0,City Government,...,table,"[sanitation, waste, transfer station, waste to...",2024-07-12T17:54:05+0000,2024-07-31T19:51:24+0000,2024-07-31T14:33:03+0000,2024-07-31T19:45:15+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/ufxk-pq9j,https://data.cityofnewyork.us/resource/ufxk-pq9j


In [21]:
# rearrange columns
cols = [
    'id', 'name', 'attribution', 'description',
    'count_rows', 'viewCount', 'downloadCount',
     'category', 'assetType', 'displayType', 'tags',
    'createdAt', 'updatedAt', 'dataUpdatedAt', 'metadataUpdatedAt',
    'domain', 'attributionLink', 'webUri', 'dataUri'
]

merged_df = merged_df.loc[:, cols]

merged_df.head()

Unnamed: 0,id,name,attribution,description,count_rows,viewCount,downloadCount,category,assetType,displayType,tags,createdAt,updatedAt,dataUpdatedAt,metadataUpdatedAt,domain,attributionLink,webUri,dataUri
0,fkec-mjr6,"DOHMH Cryptosporidiosis by Race/Ethnicity, Age...",Department of Health and Mental Hygiene (DOHMH),"Cryptosporidiosis, number of cases and annual ...",182,76.0,4.0,Health,dataset,table,"[cryptosporidiosis, diagnosis year, race ethni...",2024-08-05T14:12:47+0000,2024-08-05T16:34:05+0000,2024-08-05T16:04:46+0000,2024-08-05T16:33:29+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/fkec-mjr6,https://data.cityofnewyork.us/resource/fkec-mjr6
1,mzxg-pwib,New York City Bike Routes,Department of Transportation (DOT),The New York City Department of Transportation...,27673,436.0,89.0,,dataset,table,"[nyc bike routes, bike routes]",2024-07-24T15:57:31+0000,2024-07-30T00:51:27+0000,2024-07-24T16:06:04+0000,2024-07-30T00:50:54+0000,data.cityofnewyork.us,https://www.nyc.gov/html/dot/html/bicyclists/b...,https://data.cityofnewyork.us/d/mzxg-pwib,https://data.cityofnewyork.us/resource/mzxg-pwib
2,6r9j-qrwz,DSNY Disposal Facilities Used by Year,NYC Department of Sanitation (DSNY),A listing of the facilities used by year to ha...,91,93.0,7.0,City Government,dataset,table,"[sanitation, waste, transfer station, waste to...",2024-07-12T18:37:24+0000,2024-07-31T19:51:22+0000,2024-07-31T14:21:50+0000,2024-07-31T19:45:38+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/6r9j-qrwz,https://data.cityofnewyork.us/resource/6r9j-qrwz
3,99xv-he3n,DSNY Disposal Sites Used by Facilities by Year,NYC Department of Sanitation (DSNY),A listing of the disposal sites used by each f...,188,69.0,8.0,City Government,dataset,table,"[sanitation, waste, transfer station, waste to...",2024-07-12T18:18:59+0000,2024-07-31T19:51:26+0000,2024-07-31T14:18:13+0000,2024-07-31T19:44:47+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/99xv-he3n,https://data.cityofnewyork.us/resource/99xv-he3n
4,ufxk-pq9j,Location of Disposal Facilities and Sites Used...,NYC Department of Sanitation (DSNY),The location of the disposal facilities where ...,39,83.0,17.0,City Government,dataset,table,"[sanitation, waste, transfer station, waste to...",2024-07-12T17:54:05+0000,2024-07-31T19:51:24+0000,2024-07-31T14:33:03+0000,2024-07-31T19:45:15+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/ufxk-pq9j,https://data.cityofnewyork.us/resource/ufxk-pq9j


In [22]:
# preview column info
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2554 entries, 0 to 2553
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 2554 non-null   object 
 1   name               2551 non-null   object 
 2   attribution        2425 non-null   object 
 3   description        2477 non-null   object 
 4   count_rows         2554 non-null   int64  
 5   viewCount          2551 non-null   float64
 6   downloadCount      2551 non-null   float64
 7   category           2464 non-null   object 
 8   assetType          2551 non-null   object 
 9   displayType        2551 non-null   object 
 10  tags               1903 non-null   object 
 11  createdAt          2551 non-null   object 
 12  updatedAt          2551 non-null   object 
 13  dataUpdatedAt      2531 non-null   object 
 14  metadataUpdatedAt  2551 non-null   object 
 15  domain             2551 non-null   object 
 16  attributionLink    358 n

In [23]:
# summary statistics
merged_df.describe().round(1)

Unnamed: 0,count_rows,viewCount,downloadCount
count,2554.0,2551.0,2551.0
mean,2336225.7,10857.6,4501.6
std,18086498.7,107075.8,39425.3
min,0.0,48.0,4.0
25%,135.8,359.5,388.0
50%,1395.0,822.0,784.0
75%,12382.8,2534.0,2078.5
max,376404531.0,2804997.0,1595518.0


In [24]:
# null counts per column
merged_df.isnull().sum()

id                      0
name                    3
attribution           129
description            77
count_rows              0
viewCount               3
downloadCount           3
category               90
assetType               3
displayType             3
tags                  651
createdAt               3
updatedAt               3
dataUpdatedAt          23
metadataUpdatedAt       3
domain                  3
attributionLink      2196
webUri                  3
dataUri                 3
dtype: int64

Examine why three datasets have `name` as null.

In [25]:
merged_df.loc[merged_df['name'].isnull()]

Unnamed: 0,id,name,attribution,description,count_rows,viewCount,downloadCount,category,assetType,displayType,tags,createdAt,updatedAt,dataUpdatedAt,metadataUpdatedAt,domain,attributionLink,webUri,dataUri
358,in83-58q5,,,,334044,,,,,,,,,,,,,,
359,evu4-6zyr,,,,335616,,,,,,,,,,,,,,
360,njuk-taxk,,,,309528,,,,,,,,,,,,,,


In [26]:
merged_df.loc[merged_df['name'].isnull()].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3 entries, 358 to 360
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 3 non-null      object 
 1   name               0 non-null      object 
 2   attribution        0 non-null      object 
 3   description        0 non-null      object 
 4   count_rows         3 non-null      int64  
 5   viewCount          0 non-null      float64
 6   downloadCount      0 non-null      float64
 7   category           0 non-null      object 
 8   assetType          0 non-null      object 
 9   displayType        0 non-null      object 
 10  tags               0 non-null      object 
 11  createdAt          0 non-null      object 
 12  updatedAt          0 non-null      object 
 13  dataUpdatedAt      0 non-null      object 
 14  metadataUpdatedAt  0 non-null      object 
 15  domain             0 non-null      object 
 16  attributionLink    0 non-n

These might be unauthorized or private datasets hosted on NYC Open Data. Let's drop them.

In [27]:
merged_df = (
    merged_df
    .loc[merged_df['name'].notnull()]
    .reset_index(drop=True)
)

print(merged_df.shape)
merged_df.head()

(2551, 19)


Unnamed: 0,id,name,attribution,description,count_rows,viewCount,downloadCount,category,assetType,displayType,tags,createdAt,updatedAt,dataUpdatedAt,metadataUpdatedAt,domain,attributionLink,webUri,dataUri
0,fkec-mjr6,"DOHMH Cryptosporidiosis by Race/Ethnicity, Age...",Department of Health and Mental Hygiene (DOHMH),"Cryptosporidiosis, number of cases and annual ...",182,76.0,4.0,Health,dataset,table,"[cryptosporidiosis, diagnosis year, race ethni...",2024-08-05T14:12:47+0000,2024-08-05T16:34:05+0000,2024-08-05T16:04:46+0000,2024-08-05T16:33:29+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/fkec-mjr6,https://data.cityofnewyork.us/resource/fkec-mjr6
1,mzxg-pwib,New York City Bike Routes,Department of Transportation (DOT),The New York City Department of Transportation...,27673,436.0,89.0,,dataset,table,"[nyc bike routes, bike routes]",2024-07-24T15:57:31+0000,2024-07-30T00:51:27+0000,2024-07-24T16:06:04+0000,2024-07-30T00:50:54+0000,data.cityofnewyork.us,https://www.nyc.gov/html/dot/html/bicyclists/b...,https://data.cityofnewyork.us/d/mzxg-pwib,https://data.cityofnewyork.us/resource/mzxg-pwib
2,6r9j-qrwz,DSNY Disposal Facilities Used by Year,NYC Department of Sanitation (DSNY),A listing of the facilities used by year to ha...,91,93.0,7.0,City Government,dataset,table,"[sanitation, waste, transfer station, waste to...",2024-07-12T18:37:24+0000,2024-07-31T19:51:22+0000,2024-07-31T14:21:50+0000,2024-07-31T19:45:38+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/6r9j-qrwz,https://data.cityofnewyork.us/resource/6r9j-qrwz
3,99xv-he3n,DSNY Disposal Sites Used by Facilities by Year,NYC Department of Sanitation (DSNY),A listing of the disposal sites used by each f...,188,69.0,8.0,City Government,dataset,table,"[sanitation, waste, transfer station, waste to...",2024-07-12T18:18:59+0000,2024-07-31T19:51:26+0000,2024-07-31T14:18:13+0000,2024-07-31T19:44:47+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/99xv-he3n,https://data.cityofnewyork.us/resource/99xv-he3n
4,ufxk-pq9j,Location of Disposal Facilities and Sites Used...,NYC Department of Sanitation (DSNY),The location of the disposal facilities where ...,39,83.0,17.0,City Government,dataset,table,"[sanitation, waste, transfer station, waste to...",2024-07-12T17:54:05+0000,2024-07-31T19:51:24+0000,2024-07-31T14:33:03+0000,2024-07-31T19:45:15+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/ufxk-pq9j,https://data.cityofnewyork.us/resource/ufxk-pq9j


In [28]:
# sanity check
merged_df.isnull().sum()

id                      0
name                    0
attribution           126
description            74
count_rows              0
viewCount               0
downloadCount           0
category               87
assetType               0
displayType             0
tags                  648
createdAt               0
updatedAt               0
dataUpdatedAt          20
metadataUpdatedAt       0
domain                  0
attributionLink      2193
webUri                  0
dataUri                 0
dtype: int64

In [29]:
# cast desired columns to int
merged_df = merged_df.astype({
    'viewCount':int,
    'downloadCount':int
})

merged_df.head()

Unnamed: 0,id,name,attribution,description,count_rows,viewCount,downloadCount,category,assetType,displayType,tags,createdAt,updatedAt,dataUpdatedAt,metadataUpdatedAt,domain,attributionLink,webUri,dataUri
0,fkec-mjr6,"DOHMH Cryptosporidiosis by Race/Ethnicity, Age...",Department of Health and Mental Hygiene (DOHMH),"Cryptosporidiosis, number of cases and annual ...",182,76,4,Health,dataset,table,"[cryptosporidiosis, diagnosis year, race ethni...",2024-08-05T14:12:47+0000,2024-08-05T16:34:05+0000,2024-08-05T16:04:46+0000,2024-08-05T16:33:29+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/fkec-mjr6,https://data.cityofnewyork.us/resource/fkec-mjr6
1,mzxg-pwib,New York City Bike Routes,Department of Transportation (DOT),The New York City Department of Transportation...,27673,436,89,,dataset,table,"[nyc bike routes, bike routes]",2024-07-24T15:57:31+0000,2024-07-30T00:51:27+0000,2024-07-24T16:06:04+0000,2024-07-30T00:50:54+0000,data.cityofnewyork.us,https://www.nyc.gov/html/dot/html/bicyclists/b...,https://data.cityofnewyork.us/d/mzxg-pwib,https://data.cityofnewyork.us/resource/mzxg-pwib
2,6r9j-qrwz,DSNY Disposal Facilities Used by Year,NYC Department of Sanitation (DSNY),A listing of the facilities used by year to ha...,91,93,7,City Government,dataset,table,"[sanitation, waste, transfer station, waste to...",2024-07-12T18:37:24+0000,2024-07-31T19:51:22+0000,2024-07-31T14:21:50+0000,2024-07-31T19:45:38+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/6r9j-qrwz,https://data.cityofnewyork.us/resource/6r9j-qrwz
3,99xv-he3n,DSNY Disposal Sites Used by Facilities by Year,NYC Department of Sanitation (DSNY),A listing of the disposal sites used by each f...,188,69,8,City Government,dataset,table,"[sanitation, waste, transfer station, waste to...",2024-07-12T18:18:59+0000,2024-07-31T19:51:26+0000,2024-07-31T14:18:13+0000,2024-07-31T19:44:47+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/99xv-he3n,https://data.cityofnewyork.us/resource/99xv-he3n
4,ufxk-pq9j,Location of Disposal Facilities and Sites Used...,NYC Department of Sanitation (DSNY),The location of the disposal facilities where ...,39,83,17,City Government,dataset,table,"[sanitation, waste, transfer station, waste to...",2024-07-12T17:54:05+0000,2024-07-31T19:51:24+0000,2024-07-31T14:33:03+0000,2024-07-31T19:45:15+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/ufxk-pq9j,https://data.cityofnewyork.us/resource/ufxk-pq9j


# Largest Datasets on NYC Open Data

## Largest dataset

In [30]:
(merged_df
 .sort_values(by='count_rows', ascending=False)
 .head(1)
)

Unnamed: 0,id,name,attribution,description,count_rows,viewCount,downloadCount,category,assetType,displayType,tags,createdAt,updatedAt,dataUpdatedAt,metadataUpdatedAt,domain,attributionLink,webUri,dataUri
1928,rmhc-afj9,DSNY - PlowNYC Data,Department of Sanitation (DSNY),A table containing physical ID (a unique stree...,376404531,1852,504,City Government,dataset,table,"[dsny, plownyc, snow removal]",2016-10-20T21:06:59+0000,2024-05-30T14:38:12+0000,2024-02-17T22:22:44+0000,2024-05-30T14:38:12+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/rmhc-afj9,https://data.cityofnewyork.us/resource/rmhc-afj9


## Top 10 largest datasets

In [31]:
(merged_df
 .sort_values(by='count_rows', ascending=False)
 .head(10)
)

Unnamed: 0,id,name,attribution,description,count_rows,viewCount,downloadCount,category,assetType,displayType,tags,createdAt,updatedAt,dataUpdatedAt,metadataUpdatedAt,domain,attributionLink,webUri,dataUri
1928,rmhc-afj9,DSNY - PlowNYC Data,Department of Sanitation (DSNY),A table containing physical ID (a unique stree...,376404531,1852,504,City Government,dataset,table,"[dsny, plownyc, snow removal]",2016-10-20T21:06:59+0000,2024-05-30T14:38:12+0000,2024-02-17T22:22:44+0000,2024-05-30T14:38:12+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/rmhc-afj9,https://data.cityofnewyork.us/resource/rmhc-afj9
1423,am94-epxh,2018 For Hire Vehicles Trip Data,Taxi and Limousine Commission (TLC),These records are generated from the For-Hire ...,260766056,3135,1005,Transportation,dataset,table,"[taxi, trip data, fhv, trip, base, high volume...",2018-09-17T20:40:48+0000,2023-12-15T15:05:40+0000,2019-04-17T18:58:46+0000,2023-12-15T15:05:40+0000,data.cityofnewyork.us,https://www.nyc.gov/site/tlc/about/tlc-trip-re...,https://data.cityofnewyork.us/d/am94-epxh,https://data.cityofnewyork.us/resource/am94-epxh
986,4p5c-cbgn,2019 High Volume FHV Trip Records,Taxi and Limousine Commission (TLC),,234629119,3671,1410,Transportation,dataset,table,"[taxi, trip data, fhv, trip, base, high volume...",2019-09-20T14:15:36+0000,2023-12-15T16:10:36+0000,2020-03-06T19:30:26+0000,2023-12-15T16:10:36+0000,data.cityofnewyork.us,https://www.nyc.gov/site/tlc/about/tlc-trip-re...,https://data.cityofnewyork.us/d/4p5c-cbgn,https://data.cityofnewyork.us/resource/4p5c-cbgn
8,u253-aew4,2023 High Volume FHV Trip Data,Taxi and Limousine Commission (TLC),These records are generated from the trip reco...,232490020,210,20,Transportation,dataset,table,"[taxi, trip data, fhv, trip, base, high volume...",2024-07-02T13:34:52+0000,2024-07-16T18:41:11+0000,2024-07-15T17:29:03+0000,2024-07-16T18:40:12+0000,data.cityofnewyork.us,https://www.nyc.gov/site/tlc/about/tlc-trip-re...,https://data.cityofnewyork.us/d/u253-aew4,https://data.cityofnewyork.us/resource/u253-aew4
88,g6pj-fsah,2022 High Volume FHV Trip Records,Taxi and Limousine Commission (TLC),These records are generated from the trip reco...,212416083,1155,479,Transportation,dataset,table,"[taxi, trip data, fhv, trip, base, high volume...",2023-11-01T15:11:25+0000,2023-12-15T15:59:25+0000,2023-11-08T14:43:26+0000,2023-12-15T15:53:18+0000,data.cityofnewyork.us,https://www.nyc.gov/site/tlc/about/tlc-trip-re...,https://data.cityofnewyork.us/d/g6pj-fsah,https://data.cityofnewyork.us/resource/g6pj-fsah
1612,avz8-mqzz,2017 For Hire Vehicle Trip Data,Taxi and Limousine Commission (TLC),These records are generated from the For-Hire ...,200283593,3863,1013,Transportation,dataset,table,"[taxi, trip data, fhv, trip, base, high volume...",2018-03-08T21:15:31+0000,2023-12-15T15:01:45+0000,2018-10-22T16:11:25+0000,2023-12-15T15:01:45+0000,data.cityofnewyork.us,https://www.nyc.gov/site/tlc/about/tlc-trip-re...,https://data.cityofnewyork.us/d/avz8-mqzz,https://data.cityofnewyork.us/resource/avz8-mqzz
95,5ufr-wvc5,2021 High Volume FHV Trip Records,Department of Transportation (DOT),These records are generated from the trip reco...,174596652,385,197,Transportation,dataset,table,"[taxi, trip data, fhv, trip, base, high volume...",2023-10-25T15:18:34+0000,2023-12-15T15:59:23+0000,2023-10-31T11:46:29+0000,2023-12-15T15:58:59+0000,data.cityofnewyork.us,https://www.nyc.gov/site/tlc/about/tlc-trip-re...,https://data.cityofnewyork.us/d/5ufr-wvc5,https://data.cityofnewyork.us/resource/5ufr-wvc5
2087,t7ny-aygi,2013 Yellow Taxi Trip Data,Taxi and Limousine Commission (TLC),These records are generated from the trip reco...,173179759,1209,349,Transportation,dataset,table,"[yellow taxi, trip data, fhv, trip, base, high...",2015-10-16T21:58:37+0000,2023-12-14T20:44:38+0000,2015-11-12T18:38:36+0000,2023-12-14T20:44:38+0000,data.cityofnewyork.us,https://www.nyc.gov/site/tlc/about/tlc-trip-re...,https://data.cityofnewyork.us/d/t7ny-aygi,https://data.cityofnewyork.us/resource/t7ny-aygi
2081,f9tw-8p66,2009 Yellow Taxi Trip Data,Taxi and Limousine Commission (TLC),These records are generated from the trip reco...,170896055,1771,917,Transportation,dataset,table,"[yellow taxi, trip data, fhv, trip, base, high...",2015-10-19T20:41:58+0000,2023-12-14T20:41:58+0000,2015-12-08T00:31:16+0000,2023-12-14T20:41:58+0000,data.cityofnewyork.us,https://www.nyc.gov/site/tlc/about/tlc-trip-re...,https://data.cityofnewyork.us/d/f9tw-8p66,https://data.cityofnewyork.us/resource/f9tw-8p66
2082,74wj-s5ij,2010 Yellow Taxi Trip Data,Taxi and Limousine Commission (TLC),These records are generated from the trip reco...,168994353,556,427,Transportation,dataset,table,"[yellow taxi, trip data, fhv, trip, base, high...",2015-10-19T20:30:20+0000,2023-12-14T20:42:30+0000,2015-12-07T01:41:47+0000,2023-12-14T20:42:30+0000,data.cityofnewyork.us,https://www.nyc.gov/site/tlc/about/tlc-trip-re...,https://data.cityofnewyork.us/d/74wj-s5ij,https://data.cityofnewyork.us/resource/74wj-s5ij


## Top 20 largest datasets when filtering out taxi car data

In [32]:
# exclude these strings
taxi_car_data = [
    'High Volume FHV Trip',
    'For Hire Vehicle',
    'Taxi Trip',
    'Medallion'
]

(merged_df
 .loc[~merged_df['name'].str.contains('|'.join(taxi_car_data))]
 .sort_values(by='count_rows', ascending=False)
 .head(20)
)

Unnamed: 0,id,name,attribution,description,count_rows,viewCount,downloadCount,category,assetType,displayType,tags,createdAt,updatedAt,dataUpdatedAt,metadataUpdatedAt,domain,attributionLink,webUri,dataUri
1928,rmhc-afj9,DSNY - PlowNYC Data,Department of Sanitation (DSNY),A table containing physical ID (a unique stree...,376404531,1852,504,City Government,dataset,table,"[dsny, plownyc, snow removal]",2016-10-20T21:06:59+0000,2024-05-30T14:38:12+0000,2024-02-17T22:22:44+0000,2024-05-30T14:38:12+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/rmhc-afj9,https://data.cityofnewyork.us/resource/rmhc-afj9
2050,nc67-uf89,Open Parking and Camera Violations,Department of Finance (DOF),The Open Parking and Camera Violations dataset...,117062807,240400,37200,City Government,dataset,table,"[dof, parking, summons, violation]",2016-01-04T16:08:38+0000,2024-03-15T17:35:08+0000,2024-08-17T09:18:23+0000,2024-03-15T17:35:08+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/nc67-uf89,https://data.cityofnewyork.us/resource/nc67-uf89
733,4fwc-j3vn,Weigh in Motion,,A Weigh In Motion (WIM) system is designed to ...,97698569,1876,1298,Transportation,dataset,table,"[wim, weigh in motion, trucks, overweight, axl...",2020-10-21T14:56:39+0000,2024-06-26T19:00:12+0000,2024-06-27T09:35:08+0000,2024-06-26T19:00:12+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/4fwc-j3vn,https://data.cityofnewyork.us/resource/4fwc-j3vn
1997,wewp-mm3p,311 Call Center Inquiry,311,<b>Please note: Due to pandemic call handling ...,97046151,11070,23277,City Government,dataset,table,,2016-05-12T22:58:27+0000,2024-01-24T21:32:54+0000,2024-08-17T01:06:17+0000,2024-01-24T21:32:54+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/wewp-mm3p,https://data.cityofnewyork.us/resource/wewp-mm3p
1874,i4gi-tjb9,DOT Traffic Speeds NBE,,,85740225,18612,6113,Transportation,dataset,table,,2017-04-17T18:31:04+0000,2020-02-08T00:40:14+0000,2024-08-17T20:56:55+0000,2020-02-08T00:40:14+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/i4gi-tjb9,https://data.cityofnewyork.us/resource/i4gi-tjb9
844,2nwg-uqyg,Emergency Department Visits and Admissions for...,Department of Health and Mental Hygiene (DOHMH),"Total emergency department visits, and visits ...",82021561,12410,2293,Health,dataset,table,"[emergency, emergency department, influenza-li...",2020-04-28T18:50:14+0000,2023-01-19T17:00:22+0000,2022-12-05T19:34:43+0000,2023-01-19T17:00:22+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/2nwg-uqyg,https://data.cityofnewyork.us/resource/2nwg-uqyg
1282,pbk5-6r7z,Street Construction Permits - Stipulations (Hi...,Department of Transportation (DOT),DOT issues over 150 different types of sidewal...,79933031,1510,891,Transportation,dataset,table,"[permit, street work, construction, cut, mosai...",2018-12-28T18:26:04+0000,2022-05-09T22:26:50+0000,2020-04-13T19:19:40+0000,2022-05-09T22:26:50+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/pbk5-6r7z,https://data.cityofnewyork.us/resource/pbk5-6r7z
1020,scjx-j6np,DOF: Property Charges Balance,Department of Finance,Property related charge information by period.,75863671,6799,1147,City Government,dataset,table,"[charge balance, open balance, charge summary]",2019-07-16T18:49:01+0000,2024-06-06T18:59:51+0000,2024-06-06T19:22:43+0000,2024-06-06T18:59:51+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/scjx-j6np,https://data.cityofnewyork.us/resource/scjx-j6np
1356,quxm-hmyr,LinkNYC Kiosk Status (Historical),Office of Technology and Innovation (OTI),This dataset provides the most current listing...,65675385,3312,1883,City Government,dataset,table,"[linknyc kiosk status, wifi, wi-fi, broadband,...",2018-11-20T20:36:32+0000,2023-06-08T13:48:56+0000,2023-04-18T01:33:20+0000,2023-06-08T13:48:56+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/quxm-hmyr,https://data.cityofnewyork.us/resource/quxm-hmyr
857,a5nd-6mit,DOF Property Assessment Change,Department of Finance (DOF),Changes to Property Assessment. The data is co...,41906018,2915,626,City Government,dataset,table,"[property assessment change, market value chan...",2020-03-10T17:23:48+0000,2024-07-09T16:58:17+0000,2024-07-09T16:58:20+0000,2024-07-09T16:58:17+0000,data.cityofnewyork.us,,https://data.cityofnewyork.us/d/a5nd-6mit,https://data.cityofnewyork.us/resource/a5nd-6mit


## Top 20 largest datasets with taxi car data

In [33]:
# exclude these strings
taxi_car_data = [
    'High Volume FHV Trip',
    'For Hire Vehicle',
    'Taxi Trip',
    'Medallion'
]

(merged_df
 .loc[merged_df['name'].str.contains('|'.join(taxi_car_data))]
 .sort_values(by='count_rows', ascending=False)
 .head(20)
)

Unnamed: 0,id,name,attribution,description,count_rows,viewCount,downloadCount,category,assetType,displayType,tags,createdAt,updatedAt,dataUpdatedAt,metadataUpdatedAt,domain,attributionLink,webUri,dataUri
1423,am94-epxh,2018 For Hire Vehicles Trip Data,Taxi and Limousine Commission (TLC),These records are generated from the For-Hire ...,260766056,3135,1005,Transportation,dataset,table,"[taxi, trip data, fhv, trip, base, high volume...",2018-09-17T20:40:48+0000,2023-12-15T15:05:40+0000,2019-04-17T18:58:46+0000,2023-12-15T15:05:40+0000,data.cityofnewyork.us,https://www.nyc.gov/site/tlc/about/tlc-trip-re...,https://data.cityofnewyork.us/d/am94-epxh,https://data.cityofnewyork.us/resource/am94-epxh
986,4p5c-cbgn,2019 High Volume FHV Trip Records,Taxi and Limousine Commission (TLC),,234629119,3671,1410,Transportation,dataset,table,"[taxi, trip data, fhv, trip, base, high volume...",2019-09-20T14:15:36+0000,2023-12-15T16:10:36+0000,2020-03-06T19:30:26+0000,2023-12-15T16:10:36+0000,data.cityofnewyork.us,https://www.nyc.gov/site/tlc/about/tlc-trip-re...,https://data.cityofnewyork.us/d/4p5c-cbgn,https://data.cityofnewyork.us/resource/4p5c-cbgn
8,u253-aew4,2023 High Volume FHV Trip Data,Taxi and Limousine Commission (TLC),These records are generated from the trip reco...,232490020,210,20,Transportation,dataset,table,"[taxi, trip data, fhv, trip, base, high volume...",2024-07-02T13:34:52+0000,2024-07-16T18:41:11+0000,2024-07-15T17:29:03+0000,2024-07-16T18:40:12+0000,data.cityofnewyork.us,https://www.nyc.gov/site/tlc/about/tlc-trip-re...,https://data.cityofnewyork.us/d/u253-aew4,https://data.cityofnewyork.us/resource/u253-aew4
88,g6pj-fsah,2022 High Volume FHV Trip Records,Taxi and Limousine Commission (TLC),These records are generated from the trip reco...,212416083,1155,479,Transportation,dataset,table,"[taxi, trip data, fhv, trip, base, high volume...",2023-11-01T15:11:25+0000,2023-12-15T15:59:25+0000,2023-11-08T14:43:26+0000,2023-12-15T15:53:18+0000,data.cityofnewyork.us,https://www.nyc.gov/site/tlc/about/tlc-trip-re...,https://data.cityofnewyork.us/d/g6pj-fsah,https://data.cityofnewyork.us/resource/g6pj-fsah
1612,avz8-mqzz,2017 For Hire Vehicle Trip Data,Taxi and Limousine Commission (TLC),These records are generated from the For-Hire ...,200283593,3863,1013,Transportation,dataset,table,"[taxi, trip data, fhv, trip, base, high volume...",2018-03-08T21:15:31+0000,2023-12-15T15:01:45+0000,2018-10-22T16:11:25+0000,2023-12-15T15:01:45+0000,data.cityofnewyork.us,https://www.nyc.gov/site/tlc/about/tlc-trip-re...,https://data.cityofnewyork.us/d/avz8-mqzz,https://data.cityofnewyork.us/resource/avz8-mqzz
95,5ufr-wvc5,2021 High Volume FHV Trip Records,Department of Transportation (DOT),These records are generated from the trip reco...,174596652,385,197,Transportation,dataset,table,"[taxi, trip data, fhv, trip, base, high volume...",2023-10-25T15:18:34+0000,2023-12-15T15:59:23+0000,2023-10-31T11:46:29+0000,2023-12-15T15:58:59+0000,data.cityofnewyork.us,https://www.nyc.gov/site/tlc/about/tlc-trip-re...,https://data.cityofnewyork.us/d/5ufr-wvc5,https://data.cityofnewyork.us/resource/5ufr-wvc5
2087,t7ny-aygi,2013 Yellow Taxi Trip Data,Taxi and Limousine Commission (TLC),These records are generated from the trip reco...,173179759,1209,349,Transportation,dataset,table,"[yellow taxi, trip data, fhv, trip, base, high...",2015-10-16T21:58:37+0000,2023-12-14T20:44:38+0000,2015-11-12T18:38:36+0000,2023-12-14T20:44:38+0000,data.cityofnewyork.us,https://www.nyc.gov/site/tlc/about/tlc-trip-re...,https://data.cityofnewyork.us/d/t7ny-aygi,https://data.cityofnewyork.us/resource/t7ny-aygi
2081,f9tw-8p66,2009 Yellow Taxi Trip Data,Taxi and Limousine Commission (TLC),These records are generated from the trip reco...,170896055,1771,917,Transportation,dataset,table,"[yellow taxi, trip data, fhv, trip, base, high...",2015-10-19T20:41:58+0000,2023-12-14T20:41:58+0000,2015-12-08T00:31:16+0000,2023-12-14T20:41:58+0000,data.cityofnewyork.us,https://www.nyc.gov/site/tlc/about/tlc-trip-re...,https://data.cityofnewyork.us/d/f9tw-8p66,https://data.cityofnewyork.us/resource/f9tw-8p66
2082,74wj-s5ij,2010 Yellow Taxi Trip Data,Taxi and Limousine Commission (TLC),These records are generated from the trip reco...,168994353,556,427,Transportation,dataset,table,"[yellow taxi, trip data, fhv, trip, base, high...",2015-10-19T20:30:20+0000,2023-12-14T20:42:30+0000,2015-12-07T01:41:47+0000,2023-12-14T20:42:30+0000,data.cityofnewyork.us,https://www.nyc.gov/site/tlc/about/tlc-trip-re...,https://data.cityofnewyork.us/d/74wj-s5ij,https://data.cityofnewyork.us/resource/74wj-s5ij
2086,kerk-3eby,2012 Yellow Taxi Trip Data,Taxi and Limousine Commission (TLC),These records are generated from the trip reco...,167331308,614,159,Transportation,dataset,table,"[yellow taxi, trip data, fhv, trip, base, high...",2015-10-19T18:21:18+0000,2023-12-14T20:44:28+0000,2015-12-08T17:23:23+0000,2023-12-14T20:44:28+0000,data.cityofnewyork.us,https://www.nyc.gov/site/tlc/about/tlc-trip-re...,https://data.cityofnewyork.us/d/kerk-3eby,https://data.cityofnewyork.us/resource/kerk-3eby
