# About the data

Download Data from this Page: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page<br>
Data Dictionary: https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf

# Imports

In [1]:
import os

import pandas as pd
import polars as pl

# GLOBA_VARS

In [6]:
CWD = os.getcwd()
PATH_INPUT_FOLDER = os.path.join(CWD, "input")

trip2401 = os.path.join(PATH_INPUT_FOLDER, "yellow_tripdata_2024-01.parquet")

In [8]:
sorted(os.listdir(PATH_INPUT_FOLDER))

['yellow_tripdata_2022-01.parquet',
 'yellow_tripdata_2022-02.parquet',
 'yellow_tripdata_2022-03.parquet',
 'yellow_tripdata_2022-04.parquet',
 'yellow_tripdata_2022-05.parquet',
 'yellow_tripdata_2022-06.parquet',
 'yellow_tripdata_2022-07.parquet',
 'yellow_tripdata_2022-08.parquet',
 'yellow_tripdata_2022-09.parquet',
 'yellow_tripdata_2022-10.parquet',
 'yellow_tripdata_2022-11.parquet',
 'yellow_tripdata_2022-12.parquet',
 'yellow_tripdata_2023-01.parquet',
 'yellow_tripdata_2023-02.parquet',
 'yellow_tripdata_2023-03.parquet',
 'yellow_tripdata_2023-04.parquet',
 'yellow_tripdata_2023-05.parquet',
 'yellow_tripdata_2023-06.parquet',
 'yellow_tripdata_2023-07.parquet',
 'yellow_tripdata_2023-08.parquet',
 'yellow_tripdata_2023-09.parquet',
 'yellow_tripdata_2023-10.parquet',
 'yellow_tripdata_2023-11.parquet',
 'yellow_tripdata_2023-12.parquet',
 'yellow_tripdata_2024-01.parquet',
 'yellow_tripdata_2024-02.parquet']

# Comparativa Polars vs Pandas

In [9]:
# %%timeit

# pddf = pd.read_parquet(path = trip2401)

In [10]:
pddf = pd.read_parquet(path = trip2401)

In [11]:
pddf.shape

(2964624, 19)

In [12]:
pddf.head()

Unnamed: 0,VendorID,TpepPickupDatetime,TpepDropoffDatetime,PassengerCount,TripDistance,RatecodeID,StoreAndFwdFlag,PULocationID,DOLocationID,PaymentType,FareAmount,Extra,MtaTax,TipAmount,TollsAmount,ImprovementSurcharge,TotalAmount,CongestionSurcharge,AirportFee
0,2,2024-01-01 00:57:55,2024-01-01 01:17:43,1.0,1.72,1.0,N,186,79,2,17.7,1.0,0.5,0.0,0.0,1.0,22.7,2.5,0.0
1,1,2024-01-01 00:03:00,2024-01-01 00:09:36,1.0,1.8,1.0,N,140,236,1,10.0,3.5,0.5,3.75,0.0,1.0,18.75,2.5,0.0
2,1,2024-01-01 00:17:06,2024-01-01 00:35:01,1.0,4.7,1.0,N,236,79,1,23.3,3.5,0.5,3.0,0.0,1.0,31.3,2.5,0.0
3,1,2024-01-01 00:36:38,2024-01-01 00:44:56,1.0,1.4,1.0,N,79,211,1,10.0,3.5,0.5,2.0,0.0,1.0,17.0,2.5,0.0
4,1,2024-01-01 00:46:51,2024-01-01 00:52:57,1.0,0.8,1.0,N,211,148,1,7.9,3.5,0.5,3.2,0.0,1.0,16.1,2.5,0.0


In [13]:
# %%timeit

# pldf = pl.read_parquet(source = trip2401)

In [14]:
pldf = pl.read_parquet(source = trip2401)

In [15]:
pldf.shape

(2964624, 19)

In [16]:
pldf.head()

VendorID,TpepPickupDatetime,TpepDropoffDatetime,PassengerCount,TripDistance,RatecodeID,StoreAndFwdFlag,PULocationID,DOLocationID,PaymentType,FareAmount,Extra,MtaTax,TipAmount,TollsAmount,ImprovementSurcharge,TotalAmount,CongestionSurcharge,AirportFee
i32,datetime[ns],datetime[ns],f64,f64,f64,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2,2024-01-01 00:57:55,2024-01-01 01:17:43,1.0,1.72,1.0,"""N""",186,79,2,17.7,1.0,0.5,0.0,0.0,1.0,22.7,2.5,0.0
1,2024-01-01 00:03:00,2024-01-01 00:09:36,1.0,1.8,1.0,"""N""",140,236,1,10.0,3.5,0.5,3.75,0.0,1.0,18.75,2.5,0.0
1,2024-01-01 00:17:06,2024-01-01 00:35:01,1.0,4.7,1.0,"""N""",236,79,1,23.3,3.5,0.5,3.0,0.0,1.0,31.3,2.5,0.0
1,2024-01-01 00:36:38,2024-01-01 00:44:56,1.0,1.4,1.0,"""N""",79,211,1,10.0,3.5,0.5,2.0,0.0,1.0,17.0,2.5,0.0
1,2024-01-01 00:46:51,2024-01-01 00:52:57,1.0,0.8,1.0,"""N""",211,148,1,7.9,3.5,0.5,3.2,0.0,1.0,16.1,2.5,0.0


In [17]:
plldf = pl.scan_parquet(source = trip2401)

In [18]:
plldf

# Fixing Data Types

In [19]:
def get_file_types(input_folder):

    file_schemas = []

    for file_name in os.listdir(input_folder):

        file_path = os.path.join(input_folder, file_name)

        try:
            pldf_ = pl.read_parquet(file_path)
            schema_ = pldf_.schema

            columns = list(schema_.keys())
            types = list(schema_.values())
            file_names = [file_name for i in range(len(columns))]
            
            for c, t, f in zip(columns, types, file_names):
                file_schemas.append((c, t, f))

        except:
            print(file_path)

    return file_schemas

In [20]:
df = pd.DataFrame(
    data = get_file_types(input_folder=PATH_INPUT_FOLDER),
    columns = ["Col", "Type", "File"]
)

In [21]:
df.head()

Unnamed: 0,Col,Type,File
0,VendorID,Int32,yellow_tripdata_2023-06.parquet
1,TpepPickupDatetime,"Datetime(time_unit='ns', time_zone=None)",yellow_tripdata_2023-06.parquet
2,TpepDropoffDatetime,"Datetime(time_unit='ns', time_zone=None)",yellow_tripdata_2023-06.parquet
3,PassengerCount,Float64,yellow_tripdata_2023-06.parquet
4,TripDistance,Float64,yellow_tripdata_2023-06.parquet


In [22]:
(
    df["Col"].value_counts()
)

Col
VendorID                26
FareAmount              26
CongestionSurcharge     26
TotalAmount             26
ImprovementSurcharge    26
TollsAmount             26
TipAmount               26
MtaTax                  26
Extra                   26
PaymentType             26
TpepPickupDatetime      26
DOLocationID            26
PULocationID            26
StoreAndFwdFlag         26
RatecodeID              26
TripDistance            26
PassengerCount          26
TpepDropoffDatetime     26
AirportFee              26
Name: count, dtype: int64

In [23]:
df["Col"].nunique()

19

In [24]:
(
    df["File"].value_counts()
)

File
yellow_tripdata_2023-06.parquet    19
yellow_tripdata_2022-10.parquet    19
yellow_tripdata_2022-05.parquet    19
yellow_tripdata_2024-01.parquet    19
yellow_tripdata_2023-02.parquet    19
yellow_tripdata_2023-12.parquet    19
yellow_tripdata_2022-04.parquet    19
yellow_tripdata_2023-09.parquet    19
yellow_tripdata_2024-02.parquet    19
yellow_tripdata_2023-10.parquet    19
yellow_tripdata_2022-06.parquet    19
yellow_tripdata_2023-01.parquet    19
yellow_tripdata_2023-11.parquet    19
yellow_tripdata_2022-07.parquet    19
yellow_tripdata_2023-08.parquet    19
yellow_tripdata_2022-12.parquet    19
yellow_tripdata_2022-02.parquet    19
yellow_tripdata_2023-04.parquet    19
yellow_tripdata_2022-03.parquet    19
yellow_tripdata_2023-05.parquet    19
yellow_tripdata_2022-11.parquet    19
yellow_tripdata_2022-01.parquet    19
yellow_tripdata_2023-07.parquet    19
yellow_tripdata_2022-08.parquet    19
yellow_tripdata_2022-09.parquet    19
yellow_tripdata_2023-03.parquet    19
Name: c

In [25]:
(
    df
    .groupby(["Col"])
    .agg(
        nr_unique_types = ("Type", lambda series: len(set(series))),
        unique_types = ("Type", lambda series: set(series))
    )
    .pipe(
        lambda df: df[df["nr_unique_types"] != 1]
    )
)

Unnamed: 0_level_0,nr_unique_types,unique_types
Col,Unnamed: 1_level_1,Unnamed: 2_level_1
VendorID,2,"{Int32, Int64}"


In [56]:
# lf = (
#     df[
#         (df["Col"] == "VendorID") & (df["Type"] == df["Type"].iloc[0])
#     ]["File"].values.tolist()
# )

In [58]:
# for f in lf:
#     fp = os.path.join(PATH_INPUT_FOLDER, f)
#     (
#         pl.read_parquet(fp)
#         .with_columns(pl.col("VendorID").cast(pl.Int64))
#         .write_parquet(fp)
#     )

In [59]:
def get_default_rename_dict():
    
    rename_dict = {
        
        "VendorID":"VendorID",
        "tpep_pickup_datetime":"TpepPickupDatetime",
        "congestion_surcharge":"CongestionSurcharge",
        "total_amount":"TotalAmount",
        "improvement_surcharge":"ImprovementSurcharge",
        "tolls_amount":"TollsAmount",
        "tip_amount":"TipAmount",
        "mta_tax":"MtaTax",
        "extra":"Extra",
        "fare_amount":"FareAmount",
        "payment_type":"PaymentType",
        "DOLocationID":"DOLocationID",
        "PULocationID":"PULocationID",
        "store_and_fwd_flag":"StoreAndFwdFlag",
        "RatecodeID":"RatecodeID",
        "trip_distance":"TripDistance",
        "passenger_count":"PassengerCount",
        "tpep_dropoff_datetime":"TpepDropoffDatetime",
        "Airport_fee":"AirportFee"
    }
    
    return rename_dict

In [60]:
def get_default_rename_dict_2022():
    
    rename_dict = {
        "VendorID":"VendorID",
        "tpep_pickup_datetime":"TpepPickupDatetime",
        "congestion_surcharge":"CongestionSurcharge",
        "total_amount":"TotalAmount",
        "improvement_surcharge":"ImprovementSurcharge", # cambia
        "tolls_amount":"TollsAmount", # peaje
        "tip_amount":"TipAmount",
        "mta_tax":"MtaTax",
        "extra":"Extra",
        "fare_amount":"FareAmount",
        "payment_type":"PaymentType",
        "DOLocationID":"DOLocationID",
        "PULocationID":"PULocationID",
        "store_and_fwd_flag":"StoreAndFwdFlag",
        "RatecodeID":"RatecodeID",
        "trip_distance":"TripDistance",
        "passenger_count":"PassengerCount",
        "tpep_dropoff_datetime":"TpepDropoffDatetime",
        "airport_fee":"AirportFee",
    }
    
    return rename_dict

In [61]:
def standarize_files(input_folder, year):
    
    for file_name in os.listdir(input_folder):

        file_path = os.path.join(input_folder, file_name)
        
        rename_dict = get_default_rename_dict() if year != 2022 else get_default_rename_dict_2022()
        pldf = pl.read_parquet(file_path)
        columns = pldf.columns
        
        if "airport_fee" in columns and year != 2022:
            _ = rename_dict.pop("Airport_fee")
            rename_dict["airport_fee"] = "AirportFee"
            
        pldf = (
            pldf
            .rename(rename_dict)
            .with_columns(
                pl.col("DOLocationID").cast(pl.Int64),
                pl.col("PULocationID").cast(pl.Int64),
                pl.col("RatecodeID").cast(pl.Float64),
                pl.col("PassengerCount").cast(pl.Float64)
            )
        )
        
        pldf.write_parquet(file = os.path.join(input_folder, file_name))

In [62]:
# standarize_files(input_folder=PATH_INPUT_FOLDER)

In [63]:
# standarize_files(input_folder=PATH_INPUT_FOLDER, year=2022)

In [64]:
# (
#     pl.read_parquet(os.path.join(PATH_INPUT_FOLDER, "yellow_tripdata_2023-01.parquet"))
#     .rename({"Airport_fee":"AirportFee"})
#     .write_parquet(os.path.join(PATH_INPUT_FOLDER, "yellow_tripdata_2023-01.parquet"))
# )

In [65]:
# (
#     pl.read_parquet(source=os.path.join(PATH_INPUT_FOLDER, "yellow_tripdata_2023-01.parquet"))
#     .with_columns(pl.col("VendorID").cast(pl.Int32))
#     .write_parquet(os.path.join(PATH_INPUT_FOLDER, "yellow_tripdata_2023-01.parquet"))
# )

In [66]:
pldf = pl.read_parquet(source = f"{PATH_INPUT_FOLDER}/yellow_tripdata_*.parquet")

In [67]:
pldf.head()

VendorID,TpepPickupDatetime,TpepDropoffDatetime,PassengerCount,TripDistance,RatecodeID,StoreAndFwdFlag,PULocationID,DOLocationID,PaymentType,FareAmount,Extra,MtaTax,TipAmount,TollsAmount,ImprovementSurcharge,TotalAmount,CongestionSurcharge,AirportFee
i64,datetime[ns],datetime[ns],f64,f64,f64,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,"""N""",142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,"""N""",236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,"""N""",166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,"""N""",114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0
2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,"""N""",68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0


In [68]:
pldf.shape

(83938474, 19)

In [None]:
# EAGER Mode -> pandas

In [None]:
pldf.head()

In [None]:
os.listdir(PATH_INPUT_FOLDER)

In [None]:
PATH_INPUT_FOLDER

In [None]:
fs = [os.path.join(os.path.join(CWD, "input"), f) for f in os.listdir(os.path.join(CWD, "input"))] +\
[os.path.join(os.path.join(CWD, "input2"), f) for f in os.listdir(os.path.join(CWD, "input2"))]

In [None]:
plldf = pl.scan_parquet(
    source = fs
)

In [None]:
plldf

In [None]:
plldf.schema

In [None]:
(
    plldf
    
)

In [None]:
(
    plldf
    .describe()
)

In [None]:
# %%timeit

# (
#     plldf
#     .describe()
# )

### 12.5 s ± 628 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [None]:
# pddf = pd.concat(
#     [
#         pd.read_parquet(os.path.join(PATH_INPUT_FOLDER, f)) for f in os.listdir(PATH_INPUT_FOLDER)
#     ]
# )

In [None]:
# pddf.shape

In [None]:
# %%timeit

# pddf.describe()

### 25.2 s ± 813 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [None]:
# pddf.describe()

In [None]:
# pddf.head()