## Data Download and File Setup
This notebook builds the file structure for the project and downloads all the landing data into the relevant folders.
### Load constants and libraries
First of all we load any constant values for use within the notebook, and load any libraries required. 

In [8]:
# import all constants used in the notebooks
from constants import *

# libraries required
import os, zipfile
import itertools
from urllib.request import Request,urlopen,urlretrieve
from datetime import date, datetime, timedelta
import requests
from sodapy import Socrata
import pandas as pd


### Build File Structure
Only the notebooks and report folders are included in the GitHub repository; the remaining file structure is built here.

In [2]:
# ensure all the required top-level folders are present or created
# (exist_ok=True makes the cell safe to re-run on an existing structure)
for folder in ALL_FOLDERS:
    os.makedirs(folder, exist_ok=True)

# create the data sub-folders
# NOTE(review): this previously read `ALL DATA_SUB`, which is a syntax error;
# ALL_DATA_SUB is assumed to be the intended constant - confirm against constants.py
for target_dir in ALL_DATA_SUB:
    os.makedirs(f'../data/{target_dir}', exist_ok=True)

# create the landing data sub-folders, one per TLC source
output_relative_dir = '../data/landing/'
for target_dir in ALL_SOURCES:
    os.makedirs(f'{output_relative_dir}{target_dir}', exist_ok=True)

### Retrieve the Data
In this section the following datasets are retrieved: <br>
*  taxi data from the TLC website
*  NYC shape file data
*  NYC weather data via the Iowa State University data repository (https://mesonet.agron.iastate.edu/)
*  events data from Data.NY.gov (https://data.ny.gov/Transportation/511-NY-Events-Beginning-2010/ah74-pg4w)<br>

I use data from May 2020-May 2023. <br>
The weather dataset is used to retrieve hourly wind speed, precipitation, and temperature readings from JFK airport. The events dataset is a comprehensive record of events that may impact traffic in and around NYC; it also covers incidents such as crashes, construction, and road works. To keep it manageable, I filter it at the landing stage so that only sports and cultural events are retained.

#### TLC Data

In [3]:
def retrieve_data(source: str, year: str, month: str) -> None:
    '''
    Downloads TLC data to the landing stage for a given source (yellow, green, or fhvhv), year and month
    Arguments:
        source = TLC data source (yellow, green, or fhvhv)
        year = year to download
        month = month to download; it is interpolated into the URL and the file name
                exactly as given (the calling cell zero-pads it, e.g. '05')
    Output: None (the parquet file is saved as <LANDING_DATA>/<source>/<year>-<month>.parquet)
    '''
    file_name = f'{year}-{month}.parquet'
    # generate url
    get_url = f'{TLC_URL}{source}_tripdata_{file_name}'
    # generate output location and filename; os.path.join avoids the doubled '/'
    # that plain string concatenation produced between the source folder and the file
    get_path = os.path.join(LANDING_DATA, source, file_name)
    # download
    urlretrieve(get_url, get_path)

In [4]:

# generate the cross product of all years and months in scope
months = range(1, 13)
all_months = itertools.product(years, months)

# iterate through this list and retrieve the data within the start year / month and end year / month constraints
for year, month in all_months:
    # skip months that fall before the first or after the last month in scope
    before_start = year == start_year and month < start_month
    after_end = year == end_year and month > end_month
    if before_start or after_end:
        continue

    # zero-padded month as used in the TLC file names (e.g. 5 -> '05')
    month_label = str(month).zfill(2)
    print(f"Begin year {year} month {month_label}")

    # retrieve data from all sources in scope
    for source in ALL_SOURCES:
        retrieve_data(source, year, month_label)

    # report completion so the log shows a finished month, not only a started one
    print(f"Completed year {year} month {month_label}")

Begin year 2020 month 05
Completed year 2020 month 05
Begin year 2020 month 06
Completed year 2020 month 06
Begin year 2020 month 07
Completed year 2020 month 07
Begin year 2020 month 08
Completed year 2020 month 08
Begin year 2020 month 09
Completed year 2020 month 09
Begin year 2020 month 10
Completed year 2020 month 10
Begin year 2020 month 11
Completed year 2020 month 11
Begin year 2020 month 12
Completed year 2020 month 12
Begin year 2021 month 01
Completed year 2021 month 01
Begin year 2021 month 02
Completed year 2021 month 02
Begin year 2021 month 03
Completed year 2021 month 03
Begin year 2021 month 04
Completed year 2021 month 04
Begin year 2021 month 05
Completed year 2021 month 05
Begin year 2021 month 06
Completed year 2021 month 06
Begin year 2021 month 07
Completed year 2021 month 07
Begin year 2021 month 08
Completed year 2021 month 08
Begin year 2021 month 09
Completed year 2021 month 09
Begin year 2021 month 10
Completed year 2021 month 10
Begin year 2021 month 11
Com

#### Shapefile data

In [None]:
# retrieves the taxi zone lookup table and the shapefile archive from the TLC website, unzipping the latter
urlretrieve(TAXI_ZONE_CSV, f'{TAXI_ZONE_DATA}taxi+_zone_lookup.csv')

# no target filename is passed, so urlretrieve stores the archive in a temporary file
local_filename, headers = urlretrieve(url = TAXI_ZONE_SHP)

# the context manager closes the archive even if extraction fails part-way
with zipfile.ZipFile(file = local_filename, mode = 'r') as zip_ref:
    zip_ref.extractall(path = TAXI_ZONE_DATA)

#### Weather Data

In [6]:
def fetch_weather_data(station_id, start=date(2021, 5, 1), end=date(2023, 5, 31)):
    '''
    Downloads weather data from the Iowa State University repository and saves it as
    <station_id>.csv in the landing folder. Fields are hardcoded to
    temperature (in celsius), precipitation, and wind speed
    Arguments:
        station_id = weather station from which to download data
        start = first day to request (defaults to 1 May 2021, the value that was previously hardcoded)
        end = last day to request (defaults to 31 May 2023, the value that was previously hardcoded)
    Output: None
    Raises:
        requests.HTTPError if the server answers with an error status
    '''
    # NOTE(review): the notebook text states a May 2020 - May 2023 range, and the TLC and
    # events downloads start in May 2020, but the default here starts in 2021 - confirm
    # which is intended and pass start=date(2020, 5, 1) if the earlier year is needed
    file_save = f"{LANDING_DATA}{station_id}.csv"

    uri = (
        "http://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?"
        f"station={station_id}&data=tmpc&data=sped&data=p01m&"
        f"year1={start.year}&month1={start.month}&day1={start.day}&"
        f"year2={end.year}&month2={end.month}&day2={end.day}&"
        "tz=Etc%2FUTC&format=onlycomma&latlon=no&elev=no&missing=M&trace=T&"
        "direct=yes&report_type=3"
    )
    res = requests.get(uri, timeout=300)
    # fail loudly instead of saving an HTTP error page as if it were CSV data
    res.raise_for_status()
    with open(file_save, "w", encoding="utf-8") as fh:
        fh.write(res.text)


In [7]:
# download the weather observations recorded at the JFK station
fetch_weather_data(station_id='JFK')

+ Downloading for JFK


#### Events Data

In [15]:
# maximum number of rows requested from the events dataset in a single query
EVENTS_LIMIT = 10000

# None = no app token, i.e. the requests are unauthenticated
client = Socrata("data.ny.gov", None)

# these are the types of events we want to retrieve, there are many, many more!
retrieve_events = "'baseball game', 'basketball game', 'boxing', 'boxing match', 'carnival', 'concert', 'concert, festival', \
    'concert, fireworks display', 'concert, performing arts', 'concert, race event', 'concert, show', 'concert, special event', \
    'exhibition', 'exhibition, fair', 'fair', 'festival', 'festival, concert', 'festival, gridlock alert day', 'festival, marathon',\
    'festival, parade', 'fireworks', 'fireworks display', 'fireworks display, concert', 'football game', 'football game, fireworks',\
    'hockey game', 'march', 'outdoor market', 'parade', 'parade, festival', 'performing arts', 'play', 'play/show', 'special event',  \
    'special event, concert','sports event', 'street fair', 'tennis tournament', 'trade expo', 'wrestling'"

# retrieve the data, pre-filtered for the desired date range and event types, and also features of interest
# NOTE(review): the filter has a start date only - confirm whether an upper bound is needed
try:
    results = client.get("ah74-pg4w",
        select="event_type, create_time, close_time, latitude, longitude",
        where="create_time>='2020-05-01' AND event_type IN (" + retrieve_events + ")",
        limit=EVENTS_LIMIT)
finally:
    # release the underlying HTTP session even if the request fails
    client.close()

# a full page of results suggests that rows beyond the limit were silently dropped
if len(results) >= EVENTS_LIMIT:
    print(f"WARNING: query returned {len(results)} rows, which reaches the limit of {EVENTS_LIMIT}; "
          "the events data may be truncated")

results_df = pd.DataFrame.from_records(results)
results_df.to_csv(f'{LANDING_DATA}events_data.csv')

