# Export Data
## NYC Open *Big* Data
Author: Mark Bauer

In [1]:
import time
from datetime import datetime
from io import StringIO

import pandas as pd
import requests

In [2]:
# Views catalog endpoint of the NYC Open Data portal.
path = 'https://data.cityofnewyork.us/api/views/'

# Read the JSON catalog straight from the URL into a DataFrame.
views_df = pd.read_json(path_or_buf=path)

# Report the dimensions, then preview the first rows.
print(views_df.shape)
views_df.head(n=5)

(3237, 50)


Unnamed: 0,id,name,assetType,averageRating,category,createdAt,description,displayType,downloadCount,hideFromCatalog,...,blobFilename,blobFileSize,blobId,blobMimeType,ratings,childViews,indexUpdatedAt,iconUrl,previewImageId,disabledFeatureFlags
0,fkec-mjr6,"DOHMH Cryptosporidiosis by Race/Ethnicity, Age...",dataset,0,Health,1722867167,"Cryptosporidiosis, number of cases and annual ...",table,1,False,...,,,,,,,,,,
1,r6e8-2fwe,Location of Disposal Facilities and Sites Used...,map,0,City Government,1722436736,The location of the disposal facilities where ...,visualization_canvas_map,0,False,...,,,,,,,,,,
2,9e2b-mctv,New York City Bike Routes\t (Map),map,0,,1721837332,The New York City Department of Transportation...,visualization_canvas_map,0,False,...,,,,,,,,,,
3,mzxg-pwib,New York City Bike Routes,dataset,0,,1721836651,The New York City Department of Transportation...,table,34,False,...,,,,,,,,,,
4,6r9j-qrwz,DSNY Disposal Facilities Used by Year,dataset,0,City Government,1720809444,A listing of the facilities used by year to ha...,table,6,False,...,,,,,,,,,,


In [3]:
# Summarize every column: non-null count and dtype.
views_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3237 entries, 0 to 3236
Data columns (total 50 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        3237 non-null   object 
 1   name                      3237 non-null   object 
 2   assetType                 3237 non-null   object 
 3   averageRating             3237 non-null   int64  
 4   category                  3127 non-null   object 
 5   createdAt                 3237 non-null   int64  
 6   description               3163 non-null   object 
 7   displayType               3237 non-null   object 
 8   downloadCount             3237 non-null   int64  
 9   hideFromCatalog           3237 non-null   bool   
 10  hideFromDataJson          3237 non-null   bool   
 11  locked                    3237 non-null   bool   
 12  newBackend                3237 non-null   bool   
 13  numberOfComments          3237 non-null   int64  
 14  oid     

In [4]:
# Tally how many views fall under each display type.
views_df.loc[:, 'displayType'].value_counts()

table                         2578
visualization_canvas_map       213
blob                           170
href                           147
map                            124
story                            3
visualization_canvas_table       1
visualization_canvas_chart       1
Name: displayType, dtype: int64

In [5]:
# Tally how many views fall under each asset type.
views_df.loc[:, 'assetType'].value_counts()

dataset          2554
map               337
file              170
href              147
filter             24
story               3
visualization       1
chart               1
Name: assetType, dtype: int64

In [6]:
# Keep only rows that are both displayed as a table and typed as a dataset.
is_table = views_df['displayType'].eq('table')
is_dataset = views_df['assetType'].eq('dataset')
condition = is_table & is_dataset

# Apply the mask and renumber the index from zero.
views_df = views_df[condition].reset_index(drop=True)

# Report the new dimensions, then preview the first rows.
print(views_df.shape)
views_df.head(n=5)

(2554, 50)


Unnamed: 0,id,name,assetType,averageRating,category,createdAt,description,displayType,downloadCount,hideFromCatalog,...,blobFilename,blobFileSize,blobId,blobMimeType,ratings,childViews,indexUpdatedAt,iconUrl,previewImageId,disabledFeatureFlags
0,fkec-mjr6,"DOHMH Cryptosporidiosis by Race/Ethnicity, Age...",dataset,0,Health,1722867167,"Cryptosporidiosis, number of cases and annual ...",table,1,False,...,,,,,,,,,,
1,mzxg-pwib,New York City Bike Routes,dataset,0,,1721836651,The New York City Department of Transportation...,table,34,False,...,,,,,,,,,,
2,6r9j-qrwz,DSNY Disposal Facilities Used by Year,dataset,0,City Government,1720809444,A listing of the facilities used by year to ha...,table,6,False,...,,,,,,,,,,
3,99xv-he3n,DSNY Disposal Sites Used by Facilities by Year,dataset,0,City Government,1720808339,A listing of the disposal sites used by each f...,table,7,False,...,,,,,,,,,,
4,ufxk-pq9j,Location of Disposal Facilities and Sites Used...,dataset,0,City Government,1720806845,The location of the disposal facilities where ...,table,16,False,...,,,,,,,,,,


In [7]:
# Sanity check: count ids for each (assetType, displayType) pair that remains.
group_keys = ['assetType', 'displayType']
views_df.groupby(group_keys)['id'].count()

assetType  displayType
dataset    table          2554
Name: id, dtype: int64

In [8]:
# Initialize variables
API_BASE = 'https://data.cityofnewyork.us/resource/'
DATASETS = views_df['id'].to_list()
EXPORT_LOG = 'log.txt'
REQUEST_INTERVAL = 5  # seconds between requests
REQUEST_TIMEOUT = 30  # seconds to wait on a single request before giving up


# Function to log messages
def log_message(message: str, dataset_id=None) -> None:
    """Append one timestamped line to EXPORT_LOG.

    The line has the form ``<timestamp>,<dataset id>,<message>``.

    Args:
        message: Text written after the dataset id. Callers embed their own
            commas to position values in columns, so the text is written
            verbatim. NOTE: an error message that itself contains a comma
            will shift the columns of that line.
        dataset_id: Id to record on the line. When omitted, the module-level
            ``dataset`` loop variable is used, so existing one-argument calls
            keep working; prefer passing the id explicitly.
    """
    if dataset_id is None:
        # Legacy fallback: read the global set by the loop below.
        dataset_id = dataset
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # Explicit encoding so non-ASCII error text cannot fail on platforms
    # whose default encoding is not UTF-8.
    with open(EXPORT_LOG, 'a', encoding='utf-8') as file:
        file.write(f"{timestamp},{dataset_id},{message}\n")


# Query the row count of every dataset and log one line per dataset:
# successes leave the third column empty and put the count in the fourth,
# failures put the error text in the third column.
for dataset in DATASETS:
    try:
        # Ask the API for the row count only, not the rows themselves.
        # The timeout keeps one stalled connection from hanging the whole run;
        # a timeout surfaces as a requests.RequestException and is logged.
        response = requests.get(
            f'{API_BASE}{dataset}.json?$select=count(*)',
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()  # Ensure we catch HTTP errors
        # Wrap the payload in a buffer: passing a literal JSON string to
        # read_json is deprecated in newer pandas releases.
        df = pd.read_json(StringIO(response.text))
        count = df.values[0][0]
        log_message(f",{count}", dataset)

    except requests.RequestException as req_err:
        # Log request-related errors (connection, timeout, HTTP status)
        log_message(f"Request error for {dataset}: {req_err},", dataset)

    except ValueError as val_err:
        # Log JSON decoding errors
        log_message(f"Value error for {dataset}: {val_err},", dataset)

    except Exception as e:
        # Log any other unexpected errors (e.g. an empty result set)
        log_message(f"Unexpected error for {dataset}: {e},", dataset)

    # Sleep to avoid hitting rate limits
    time.sleep(REQUEST_INTERVAL)