# Load and validate the raw data
Data source: [TLC Trip Record Data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page)

In [None]:
from pathlib import Path
import requests

def download_one_file_of_raw_data(year: int, month: int) -> Path:
    """Download one monthly yellow-taxi trip file from the TLC website.

    The file is saved as ``../data/raw/rides_{year}-{month:02d}.parquet``
    (relative to the current working directory). The target directory is
    created first if it does not exist yet.

    Args:
        year: Year of the file to fetch, e.g. ``2022``.
        month: Month of the file to fetch; zero-padded to two digits in the
            URL and in the local file name.

    Returns:
        Path of the parquet file written to disk.

    Raises:
        ValueError: If the server responds with a status code other than 200.
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    url = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{month:02d}.parquet'
    # (connect, read) timeouts: without them an unresponsive server would
    # block the notebook forever.
    response = requests.get(url, timeout=(10, 60))

    if response.status_code != 200:
        raise ValueError(f'Could not download file for {year}-{month:02d}.')

    path = Path(f'../data/raw/rides_{year}-{month:02d}.parquet')
    # write_bytes does not create missing directories, so make sure the
    # destination folder exists on a fresh checkout.
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_bytes(response.content)
    return path

In [None]:
# Fetch the January 2022 file; as the last expression of the cell, the
# returned path is displayed as the cell output.
download_one_file_of_raw_data(year=2022, month=1)

In [None]:
import pandas as pd

# Load the downloaded January 2022 file into a DataFrame and preview it.
raw_file = '../data/raw/rides_2022-01.parquet'
rides = pd.read_parquet(raw_file)

rides.head(n=20)

In [None]:
# Keep only the pickup timestamp and the pickup location ID.
columns_to_keep = ['tpep_pickup_datetime', 'PULocationID']
rides = rides[columns_to_keep]

In [None]:
# Give both columns shorter snake_case names (the frame is modified in place).
new_column_names = {
    'tpep_pickup_datetime': 'pickup_datetime',
    'PULocationID': 'pickup_location_id',
}
rides.rename(columns=new_column_names, inplace=True)

rides.head(n=20)

In [None]:
# Summary statistics of the pickup timestamps, used to spot values that fall
# outside the expected date range.
pickup_times = rides['pickup_datetime']
pickup_times.describe()

In [None]:
# Remove rides with a pickup time outside of January 2022.
# The interval is half-open, [2022-01-01, 2022-02-01): Series.between() is
# inclusive on both ends by default, which would keep a ride picked up at
# exactly 2022-02-01 00:00:00 in the January data.
picked_up_in_january = (
    (rides['pickup_datetime'] >= '2022-01-01')
    & (rides['pickup_datetime'] < '2022-02-01')
)
rides = rides[picked_up_in_january]

rides['pickup_datetime'].describe()

In [None]:
# Persist the validated rides as a parquet file.
validated_file = '../data/transformed/rides_2022-01.parquet'
rides.to_parquet(validated_file)