# About the data

Download Data from this Page: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page<br>
Data Dictionary: https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf

# Imports

In [1]:
import os
import urllib

import multiprocessing as mp

import pandas as pd
import polars as pl

In [2]:
# GIL: global interpter lock --> 16 cores --> 1 core --> Python Interpret
# Multiprocessing.           --> 16 cores --> 16 Interpreters X16 velocidad 

# GLOBAL_VARS

In [3]:
CWD = os.getcwd()
PATH_INPUT_FOLDER = os.path.join(CWD, "input2")

In [4]:
FILES_TO_DOWNLOAD = [
    
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet",
    
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-04.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-05.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-06.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-07.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-08.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-09.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-10.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-11.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-12.parquet",
    
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-03.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-04.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-05.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-06.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-07.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-08.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-09.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-10.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-11.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-12.parquet",
    
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-02.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-03.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-04.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-05.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-06.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-07.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-08.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-09.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-10.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-11.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-12.parquet",
    
]

# Data Downloading

In [5]:
def make_input_folder(folder_path):
    
    if not os.path.isdir(folder_path):
        os.makedirs(folder_path)

def extract_basename(url):
    
    return os.path.basename(url)

def download_file(t):
    
    folder_path, url = t
    basename = os.path.join(folder_path, extract_basename(url = url))
    
    urllib.request.urlretrieve(url = url, filename = basename)

In [18]:
iterable = list(zip([PATH_INPUT_FOLDER for i in range(len(FILES_TO_DOWNLOAD))], FILES_TO_DOWNLOAD))
cpu_cores = mp.cpu_count() - 2

make_input_folder(folder_path = PATH_INPUT_FOLDER)

In [23]:
pool = mp.get_context(method="fork").Pool(processes = cpu_cores)
pool.map(func = download_file, iterable = iterable)
pool.close()

# Fixing Data Types

In [24]:
def get_file_types(input_folder):

    file_schemas = []

    for file_name in os.listdir(input_folder):

        file_path = os.path.join(input_folder, file_name)

        try:
            pldf_ = pl.read_parquet(file_path)
            schema_ = pldf_.schema

            columns = list(schema_.keys())
            types = list(schema_.values())
            file_names = [file_name for i in range(len(columns))]
            
            for c, t, f in zip(columns, types, file_names):
                file_schemas.append((c, t, f))

        except:
            print(file_path)

    return file_schemas

In [25]:
df = pd.DataFrame(
    data = get_file_types(input_folder=PATH_INPUT_FOLDER),
    columns = ["Col", "Type", "File"]
)

In [26]:
df.head()

Unnamed: 0,Col,Type,File
0,VendorID,Int32,yellow_tripdata_2023-06.parquet
1,tpep_pickup_datetime,"Datetime(time_unit='ns', time_zone=None)",yellow_tripdata_2023-06.parquet
2,tpep_dropoff_datetime,"Datetime(time_unit='ns', time_zone=None)",yellow_tripdata_2023-06.parquet
3,passenger_count,Int64,yellow_tripdata_2023-06.parquet
4,trip_distance,Float64,yellow_tripdata_2023-06.parquet


In [27]:
(
    df["Col"].value_counts()
)

Col
VendorID                 38
tpep_pickup_datetime     38
congestion_surcharge     38
total_amount             38
improvement_surcharge    38
tolls_amount             38
tip_amount               38
mta_tax                  38
extra                    38
fare_amount              38
payment_type             38
DOLocationID             38
PULocationID             38
store_and_fwd_flag       38
RatecodeID               38
trip_distance            38
passenger_count          38
tpep_dropoff_datetime    38
airport_fee              25
Airport_fee              13
Name: count, dtype: int64

In [28]:
df["Col"].nunique()

20

In [29]:
(
    df["File"].value_counts()
)

File
yellow_tripdata_2023-06.parquet    19
yellow_tripdata_2021-08.parquet    19
yellow_tripdata_2023-01.parquet    19
yellow_tripdata_2022-06.parquet    19
yellow_tripdata_2023-10.parquet    19
yellow_tripdata_2024-02.parquet    19
yellow_tripdata_2021-03.parquet    19
yellow_tripdata_2023-09.parquet    19
yellow_tripdata_2022-04.parquet    19
yellow_tripdata_2023-12.parquet    19
yellow_tripdata_2022-10.parquet    19
yellow_tripdata_2023-02.parquet    19
yellow_tripdata_2021-01.parquet    19
yellow_tripdata_2021-11.parquet    19
yellow_tripdata_2021-10.parquet    19
yellow_tripdata_2024-01.parquet    19
yellow_tripdata_2022-05.parquet    19
yellow_tripdata_2021-09.parquet    19
yellow_tripdata_2023-11.parquet    19
yellow_tripdata_2022-07.parquet    19
yellow_tripdata_2023-08.parquet    19
yellow_tripdata_2021-12.parquet    19
yellow_tripdata_2021-05.parquet    19
yellow_tripdata_2022-09.parquet    19
yellow_tripdata_2021-04.parquet    19
yellow_tripdata_2022-08.parquet    19
yellow_

In [30]:
(
    df
    .groupby(["Col"])
    .agg(
        nr_unique_types = ("Type", lambda series: len(set(series))),
        unique_types = ("Type", lambda series: set(series))
    )
    .pipe(
        lambda df: df[df["nr_unique_types"] != 1]
    )
)

Unnamed: 0_level_0,nr_unique_types,unique_types
Col,Unnamed: 1_level_1,Unnamed: 2_level_1
DOLocationID,2,"{Int32, Int64}"
PULocationID,2,"{Int32, Int64}"
RatecodeID,2,"{Int64, Float64}"
VendorID,2,"{Int32, Int64}"
passenger_count,2,"{Int64, Float64}"


In [31]:
def get_default_rename_dict():
    
    rename_dict = {
        
        "VendorID":"VendorID",
        "tpep_pickup_datetime":"TpepPickupDatetime",
        "congestion_surcharge":"CongestionSurcharge",
        "total_amount":"TotalAmount",
        "improvement_surcharge":"ImprovementSurcharge",
        "tolls_amount":"TollsAmount",
        "tip_amount":"TipAmount",
        "mta_tax":"MtaTax",
        "extra":"Extra",
        "fare_amount":"FareAmount",
        "payment_type":"PaymentType",
        "DOLocationID":"DOLocationID",
        "PULocationID":"PULocationID",
        "store_and_fwd_flag":"StoreAndFwdFlag",
        "RatecodeID":"RatecodeID",
        "trip_distance":"TripDistance",
        "passenger_count":"PassengerCount",
        "tpep_dropoff_datetime":"TpepDropoffDatetime",
        "Airport_fee":"AirportFee"
    }
    
    return rename_dict

In [32]:
def standarize_files(input_folder):
    
    for file_name in os.listdir(input_folder):
        
        if file_name != ".ipynb_checkpoints":

            file_path = os.path.join(input_folder, file_name)

            rename_dict = get_default_rename_dict()
            pldf = pl.read_parquet(file_path)
            columns = pldf.columns

            if "airport_fee" in columns:
                _ = rename_dict.pop("Airport_fee")
                rename_dict["airport_fee"] = "AirportFee"

            pldf = (
                pldf
                .rename(rename_dict)
                .with_columns(
                    pl.col("VendorID").cast(pl.Int64),
                    pl.col("DOLocationID").cast(pl.Int64),
                    pl.col("PULocationID").cast(pl.Int64),
                    pl.col("RatecodeID").cast(pl.Float64),
                    pl.col("PassengerCount").cast(pl.Float64)
                )
            )

            pldf.write_parquet(file = os.path.join(input_folder, file_name))

In [33]:
standarize_files(input_folder=PATH_INPUT_FOLDER)

# Final Test

In [34]:
plldf = pl.scan_parquet(source = os.path.join(PATH_INPUT_FOLDER, "yellow_*.parquet"))

In [35]:
(
    plldf.describe()
)

statistic,VendorID,TpepPickupDatetime,TpepDropoffDatetime,PassengerCount,TripDistance,RatecodeID,StoreAndFwdFlag,PULocationID,DOLocationID,PaymentType,FareAmount,Extra,MtaTax,TipAmount,TollsAmount,ImprovementSurcharge,TotalAmount,CongestionSurcharge,AirportFee
str,f64,str,str,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",114842782.0,"""114842782""","""114842782""",110360656.0,114842782.0,110360656.0,"""110360656""",114842782.0,114842782.0,114842782.0,114842782.0,114842782.0,114842782.0,114842782.0,114842782.0,114842782.0,114842782.0,110360656.0,106197933.0
"""null_count""",0.0,"""0""","""0""",4482126.0,0.0,4482126.0,"""4482126""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4482126.0,8644849.0
"""mean""",1.72713,"""2022-09-02 05:52:17.837102""","""2022-09-02 06:21:37.871418""",1.394359,5.480142,1.441886,,165.035442,163.087605,1.185801,14.670636,1.224257,0.488394,4.475876,0.513991,0.56659,23.667243,2.276808,0.108185
"""std""",0.4816,,,0.954163,526.928116,6.025593,,65.093578,70.219594,0.538807,13121.156354,1.791555,0.097004,13120.627542,2.043843,0.358734,117.789259,0.764936,0.388731
"""min""",1.0,"""2001-01-01 00:03:14""","""1970-01-20 10:16:32""",0.0,0.0,1.0,"""N""",1.0,1.0,0.0,-133390000.0,-39.17,-0.55,-411.0,-99.99,-1.0,-2567.8,-2.5,-1.75
"""25%""",1.0,"""2021-12-09 08:58:29""","""2021-12-09 09:17:37""",1.0,1.08,1.0,,132.0,113.0,1.0,7.5,0.0,0.5,0.02,0.0,0.3,12.98,2.5,0.0
"""50%""",2.0,"""2022-09-04 18:41:45""","""2022-09-04 18:58:51""",1.0,1.81,1.0,,162.0,162.0,1.0,11.4,0.5,0.5,2.26,0.0,0.3,17.6,2.5,0.0
"""75%""",2.0,"""2023-05-25 17:30:13""","""2023-05-25 17:51:57""",1.0,3.4,1.0,,234.0,234.0,1.0,18.4,2.5,0.5,3.7,0.0,1.0,25.68,2.5,0.0
"""max""",6.0,"""2098-09-11 02:23:31""","""2098-09-11 02:52:04""",112.0,389678.46,99.0,"""Y""",265.0,265.0,5.0,818283.44,10002.5,53.16,133390000.0,956.55,1.0,818286.74,3.0,1.75
