# Imports

In [1]:
import warnings
warnings.filterwarnings("ignore")

import os

import datetime

import pandas as pd
import polars as pl

# GLOBAL_VARS

In [None]:
# Locations are resolved relative to the directory the kernel is running in.
CWD = os.getcwd()
PATH_INPUT_FOLDER = os.path.join(CWD, "input_urllib")
# Parquet file read by the loading cell.
trip2401 = os.path.join(CWD, "input_urllib", "yellow_tripdata_2024-01.parquet")

In [None]:
# Read the same parquet file twice: once as a polars frame, once as a pandas frame.
pldf = pl.read_parquet(trip2401)
pddf = pd.read_parquet(trip2401)

In [None]:
# Display the (rows, columns) shape of the polars frame.
pldf.shape

In [None]:
# Preview the first rows of the polars frame.
pldf.head()

In [None]:
# Mean trip distance per vendor, computed with pandas and drawn as a bar chart.
(
    pddf
    .groupby(by = ["VendorID"])
    .agg(
        # Aggregations are named by string: passing the built-in `max`/`min`
        # callables is deprecated in recent pandas and only stays silent here
        # because warnings are globally ignored.
        max_passenger_count = ("PassengerCount", "max"),
        min_passenger_count = ("PassengerCount", "min"),
        # Label fixed from "mean_tip_distance": the value is the mean of TripDistance.
        mean_trip_distance = ("TripDistance", "mean")
    )
    .reset_index()
    .sort_values("VendorID")
    # Only the mean distance is plotted; the passenger-count columns are dropped here.
    [["VendorID", "mean_trip_distance"]]
    .plot(kind = "bar", x = "VendorID")
)

In [None]:
# Mean trip distance per vendor, computed with polars and drawn as a bar chart.
(
    pldf
    # The grouping key is passed positionally: `group_by` takes its keys as
    # positional arguments, and a `by=` keyword is interpreted as a *named*
    # key expression instead of the column to group on.
    .group_by("VendorID")
    .agg(
        max_passenger_count = pl.col("PassengerCount").max(),
        min_passenger_count = pl.col("PassengerCount").min(),
        # Label fixed from "mean_tip_distance": the value is the mean of TripDistance.
        mean_trip_distance = pl.col("TripDistance").mean()
    )
    .sort("VendorID")
    # Only the mean distance is plotted; the passenger-count columns are dropped here.
    .select(pl.col(["VendorID", "mean_trip_distance"]))
    .plot(kind = "bar", x = "VendorID")
)

In [None]:
# Mean passenger count per pickup day for January 2024, drawn as a line chart.
(
    pldf
    # `.dt.date()` gives the calendar day directly, replacing the previous
    # Year/Month/Day -> pl.date(...) round trip and the unused string cast.
    .with_columns(Date = pl.col("TpepPickupDatetime").dt.date())
    # Filter on the grouping key before aggregating so days outside January
    # are never grouped; the result is the same as filtering afterwards.
    .filter(
        (pl.col("Date") >= datetime.date(2024, 1, 1)) & (pl.col("Date") < datetime.date(2024, 2, 1))
    )
    .group_by("Date")
    .agg(
        pl.col("PassengerCount").mean()
    )
    # group_by does not guarantee row order, so sort before plotting a line.
    .sort("Date")
    .plot(kind = "line", x = "Date")
)

In [None]:
# Mean passenger count per Monday-aligned 7-day window for January 2024.
(
    pldf
    # Restrict to January *before* windowing. Filtering on the window start
    # afterwards kept the window that begins on 2024-01-29, which also averages
    # in any pickups dated 1-4 February.
    .filter(
        (pl.col("TpepPickupDatetime") >= datetime.datetime(2024, 1, 1)) &
        (pl.col("TpepPickupDatetime") < datetime.datetime(2024, 2, 1))
    )
    # group_by_dynamic requires the index column to be sorted.
    .sort("TpepPickupDatetime")
    .group_by_dynamic(index_column = "TpepPickupDatetime", every = "7d", start_by = "monday")
    .agg(
        pl.col("PassengerCount").mean()
    )
    .plot(kind = "line", x = "TpepPickupDatetime")
)

In [None]:
# Each January 2024 trip's fare as a percentage of the summed fare of its day-of-month.
(
    pldf
    .filter(
        pl.col("TpepPickupDatetime") >= datetime.datetime(2024, 1, 1),
        pl.col("TpepPickupDatetime") < datetime.datetime(2024, 2, 1),
    )
    .with_columns(pl.col("TpepPickupDatetime").dt.day().alias("Day"))
    # Window sum: every row receives the fare total of its own Day.
    .with_columns(pl.col("FareAmount").sum().over("Day").alias("TotalFareAmount"))
    .with_columns(
        ((pl.col("FareAmount") / pl.col("TotalFareAmount")) * 100).round(5).alias("PctPerDay")
    )
    .select("TpepPickupDatetime", "Day", "FareAmount", "TotalFareAmount", "PctPerDay")
    .plot(kind = "line", x = "TpepPickupDatetime")
)

In [None]:
# Mean fare per day and payment type for January 2024, via dynamic 1-day windows.
(
    pldf
    .filter(
        (pl.col("TpepPickupDatetime") >= datetime.datetime(2024, 1, 1)) &
        (pl.col("TpepPickupDatetime") < datetime.datetime(2024, 2, 1))
    )
    # group_by_dynamic requires the index column to be sorted.
    .sort("TpepPickupDatetime")
    .select(pl.col(["TpepPickupDatetime", "PaymentType", "FareAmount"]))
    # `group_by=` is the current name of the per-group key parameter (formerly
    # `by=`); it matches the `rolling(..., group_by=...)` call used later in
    # this notebook.
    .group_by_dynamic(index_column = "TpepPickupDatetime", every = "1d", group_by = "PaymentType")
    .agg(pl.col("FareAmount").mean())
    .plot(kind = "line", y = "FareAmount", x = "TpepPickupDatetime", by = "PaymentType")
)

In [None]:
# Mean fare per day and payment type for January 2024, grouping on the
# pickup timestamp truncated to the day.
(
    pldf
    .filter(
        pl.col("TpepPickupDatetime") >= datetime.datetime(2024, 1, 1),
        pl.col("TpepPickupDatetime") < datetime.datetime(2024, 2, 1),
    )
    .sort("TpepPickupDatetime")
    .select("TpepPickupDatetime", "PaymentType", "FareAmount")
    .group_by(pl.col("TpepPickupDatetime").dt.truncate("1d"), "PaymentType")
    .agg(pl.col("FareAmount").mean())
    # Re-sort after grouping so the line is drawn in time order.
    .sort("TpepPickupDatetime")
    .plot(kind = "line", y = "FareAmount", x = "TpepPickupDatetime", by = "PaymentType")
)

In [None]:
# January 2024 pickups reduced to three columns, keeping every row in which
# at least one of those columns is non-null.
(
    pldf
    .filter(
        pl.col("TpepPickupDatetime") >= datetime.datetime(2024, 1, 1),
        pl.col("TpepPickupDatetime") < datetime.datetime(2024, 2, 1),
    )
    .sort("TpepPickupDatetime")
    .select("TpepPickupDatetime", "PaymentType", "FareAmount")
    # Drop only the rows where all selected columns are null.
    .filter(
        ~pl.all_horizontal(
            pl.all().is_null()
        )
    )
)

In [None]:
# List the attribute names of pl.DataFrame that do not begin with an underscore.
[
    member
    for member in dir(pl.DataFrame)
    if not member.startswith("_")
]

In [None]:
# Total fare per day for January 2024 plus a 7-day rolling mean of those totals.
r = (
    pldf
    .filter(
        (pl.col("TpepPickupDatetime") >= datetime.datetime(2024, 1, 1)) &
        (pl.col("TpepPickupDatetime") < datetime.datetime(2024, 2, 1))
    )
    .select(pl.col(["TpepPickupDatetime", "PaymentType", "FareAmount"]))
    .group_by(pl.col("TpepPickupDatetime").dt.truncate("1d"))
    .agg(
        TotalFareAmount = pl.col("FareAmount").sum()
    )
    # group_by does not preserve row order, while the rolling window below
    # needs its index column sorted. Sorting here (after the aggregation)
    # replaces the earlier sort, which the group_by discarded.
    .sort("TpepPickupDatetime")
    .with_columns(
        MovingAverageTotalAmount = (
            pl.col("TotalFareAmount")
            .mean()
            .rolling(index_column = "TpepPickupDatetime", period = "7d")
        )
    )
)

In [None]:
# Switch polars' float display format to "full" for the outputs that follow.
# This changes global display configuration, so it affects every later cell.
pl.Config.set_fmt_float("full")

In [None]:
# Arithmetic mean of two hard-coded values.
# NOTE(review): the literals look like totals copied by hand from an earlier
# output (presumably a spot check of the rolling mean) — confirm their source.
(1765112.95 + 1583412.42)/2

In [None]:
# Manual check on the first seven rows of `r`: the sum of TotalFareAmount
# divided by 7, shown next to the existing columns.
# NOTE(review): presumably meant to be compared with the 7-day moving average
# on the seventh row — confirm the intent.
(
    r
    # `r` comes out of a group_by, which does not guarantee row order; sort so
    # that head(7) really is the first seven timestamps.
    .sort("TpepPickupDatetime")
    .head(7)
    # Named column so the original TotalFareAmount values are not overwritten.
    .with_columns(ManualMA7D = pl.col("TotalFareAmount").sum()/7)
)

In [None]:
# Total fare per (day, payment type) for January 2024 pickups.
r = (
    pldf
    .filter(
        pl.col("TpepPickupDatetime") >= datetime.datetime(2024, 1, 1),
        pl.col("TpepPickupDatetime") < datetime.datetime(2024, 2, 1),
    )
    .sort("TpepPickupDatetime")
    .select("TpepPickupDatetime", "PaymentType", "FareAmount")
    .group_by(pl.col("TpepPickupDatetime").dt.truncate("1d"), "PaymentType")
    .agg(pl.col("FareAmount").sum().alias("TotalFareAmount"))
)

# 7-day rolling mean of those totals within each payment type, joined back
# onto the totals so both values sit side by side.
rp = (
    r
    .sort("TpepPickupDatetime", "PaymentType")
    .rolling(index_column = "TpepPickupDatetime", period = "7d", group_by = "PaymentType")
    .agg(pl.col("TotalFareAmount").mean().alias("MA7DTotalFareAmount"))
    .join(r, on = ["PaymentType", "TpepPickupDatetime"])
    .sort("PaymentType", "TpepPickupDatetime")
)

rp

In [None]:
# Reshape `rp` to wide form: one row per TpepPickupDatetime value, with the two
# value columns spread across the distinct PaymentType values.
# NOTE(review): this rebinds `rp` to its own pivoted result, so running the cell
# a second time operates on the already-pivoted frame — it presumably fails
# because "PaymentType" is no longer a column. Re-run the cell that builds `rp` first.
rp = rp.pivot(
    index = "TpepPickupDatetime",
    columns = "PaymentType",
    values = ["TotalFareAmount", "MA7DTotalFareAmount"]
)

In [None]:
# Display the current contents of `rp`.
rp

In [None]:
# Build one plot per pivoted fare-total column and combine them with `*`,
# adding one series at a time: payment type 0 (black), 1 (red), 2 (green).
p1 = rp.plot(x = "TpepPickupDatetime", y = "TotalFareAmount_PaymentType_0", c = "black")

p2 = p1 * rp.plot(x = "TpepPickupDatetime", y = "TotalFareAmount_PaymentType_1", c = "red")

p3 = p2 * rp.plot(x = "TpepPickupDatetime", y = "TotalFareAmount_PaymentType_2", c = "green")

# The combined plot is the cell's output.
p3