In [None]:
import warnings
from functools import wraps
from pathlib import Path
from time import perf_counter
from time import time

import pandas as pd
import polars as pl

In [None]:
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
# Size of data.csv in MiB (1 MiB = 1024 ** 2 = 1_048_576 bytes), rounded to
# two decimals; the last expression is what the cell displays.
size_bytes = Path("data.csv").stat().st_size
round(size_bytes / 1024**2, 2)

In [None]:
def elapse_time(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapped function is called with the original arguments and its
    return value is passed through unchanged. After the call returns, a line
    of the form ``Function <name> took <seconds> seconds to execute`` is
    printed, with the duration rounded to two decimals. Nothing is printed
    if ``func`` raises.

    Args:
        func: Any callable to be timed.

    Returns:
        A wrapper with the same call signature as ``func``.
    """
    @wraps(func)  # keep __name__/__doc__ so stacked decorators and help() still work
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments the way time() can.
        start_time = perf_counter()
        result = func(*args, **kwargs)
        execution_time = round(perf_counter() - start_time, 2)
        print(f"Function {func.__name__} took {execution_time} seconds to execute")
        return result
    return wrapper

In [None]:
@elapse_time
def read_csv(path: str, lib: str) -> pd.DataFrame | pl.DataFrame:
    """Read a CSV file with the requested DataFrame library.

    Args:
        path: Location of the CSV file.
        lib: Either ``"polars"`` or ``"pandas"``.

    Returns:
        The frame produced by ``pl.read_csv`` or ``pd.read_csv``.

    Raises:
        TypeError: If ``lib`` is neither ``"polars"`` nor ``"pandas"``.
    """
    if lib == "polars":
        return pl.read_csv(path)
    if lib == "pandas":
        return pd.read_csv(path)
    raise TypeError("lib must be one of 'polars' or 'pandas'.")

In [None]:
# Load with Polars; the elapse_time decorator on read_csv prints the duration.
pldf = read_csv("data.csv", "polars")

In [None]:
# Load the same file with pandas so the two printed timings can be compared.
pddf = read_csv("data.csv", "pandas")

In [None]:
# Schema

In [None]:
# Column names and dtypes of the Polars frame.
pldf.schema

In [None]:
# pandas counterpart: dtype of each column.
pddf.dtypes

In [None]:
# Describe

In [None]:
# Summary statistics of the Polars frame.
pldf.describe()

In [None]:
# Summary statistics of the pandas frame, for comparison with the cell above.
pddf.describe()

In [None]:
# Select

In [None]:
# Selecting with a list of column names (pandas-style indexing) on a Polars frame.
pldf[["passenger_count", "fare_amount"]]

In [None]:
# Advanced selection

In [None]:
# Same two columns, this time selected through pl.col(...) expressions
# passed to select().
pldf.select(
    pl.col("passenger_count"),
    pl.col("fare_amount")
)

In [None]:
import polars.selectors as cs

In [None]:
# Selector: pick columns by name prefix instead of listing them.
pldf.select(cs.starts_with("passenger"))

In [None]:
# Selectors can be combined: "|" takes columns matched by either the
# temporal or the string selector.
pldf.select(cs.temporal() | cs.string())

In [None]:
# Add new column

In [None]:
# pandas-style item assignment is not supported by Polars DataFrames and
# raises TypeError. Catch and print it so the cell demonstrates the error
# without aborting "Restart & Run All"; the supported way is with_columns.
try:
    pldf["new_column"] = "important info"
except TypeError as exc:
    print(f"{type(exc).__name__}: {exc}")

In [None]:
# Add a constant column with with_columns: the keyword name becomes the
# column name and pl.lit wraps the Python value as an expression. The result
# is assigned back to pldf.
pldf = pldf.with_columns(
    new_column=pl.lit("important info")
)

In [None]:
# Preview the first three rows to confirm new_column was added.
pldf.head(3)

In [None]:
# Alternative spelling: name the new column with .alias(...) on the
# expression instead of using a keyword argument.
pldf = pldf.with_columns(
    pl.lit(1.3578490).alias("another_new_column")
)

In [None]:
# Preview the first three rows to confirm another_new_column was added.
pldf.head(3)

In [None]:
# Derive an Int64 copy of the float column under a new name; the original
# column is kept. (The fractional part is presumably dropped by the cast —
# check the preview in the next cell.)
pldf = pldf.with_columns(
    pl.col("another_new_column").cast(pl.Int64).alias("another_new_column_INT64")
)

In [None]:
# Preview the first three rows, including the cast column.
pldf.head(3)

In [None]:
# When - Then - Otherwise

In [None]:
# Frame as it looks before the category column is added.
pldf.head(3)

In [None]:
# Bucket trips by distance: rows with 0 < trip_distance < 1 are labelled
# "0-1"; every row for which that condition is not true (including a
# distance of exactly 0) falls through to otherwise() and is labelled "1+".
pldf = pldf.with_columns(
    pl.when(
        (pl.col("trip_distance")>0) & (pl.col("trip_distance")<1)
    )
    .then(pl.lit("0-1"))
    .otherwise(pl.lit("1+"))
    .alias("trip_distance_category")
)

In [None]:
# Show the source column next to the derived category for the first three rows.
pldf[["trip_distance", "trip_distance_category", "passenger_count"]].head(3)

In [None]:
# In pandas

In [None]:
# Mirror the Polars when/then/otherwise above: rows with
# 0 < trip_distance < 1 get "0-1" and every other row gets "1+".
# Assigning only the `>= 1` rows would leave distances <= 0 (and missing
# values) as NaN, so the two libraries would disagree on those rows.
is_short_trip = (pddf["trip_distance"] > 0) & (pddf["trip_distance"] < 1)
pddf["trip_distance_category"] = is_short_trip.map({True: "0-1", False: "1+"})

In [None]:
# pandas result for the same three columns, to compare with the Polars preview.
pddf[["trip_distance", "trip_distance_category", "passenger_count"]].head(3)

In [None]:
# Sum

In [None]:
# Sum of each of the two named columns in a single select().
pldf.select(pl.sum("trip_distance", "passenger_count"))

In [None]:
# Group By

In [None]:
# Total passenger_count per trip_distance_category.
pldf.group_by("trip_distance_category").agg(pl.col("passenger_count").sum())

In [None]:
# Mean trip_distance per trip_distance_category.
pldf.group_by("trip_distance_category").agg(pl.col("trip_distance").mean())

In [None]:
# Unique values

In [None]:
# Distinct values of passenger_count.
pldf["passenger_count"].unique()

In [None]:
# Number of distinct values of passenger_count.
pldf["passenger_count"].n_unique()

In [None]:
# User-defined Functions

In [None]:
def transform_column(value: str) -> str:
    """Return ``value`` with the literal suffix ``_transformed`` appended."""
    transformed = f"{value}_transformed"
    return transformed

# Apply the Python function to the category column via map_elements;
# return_dtype declares the result as a string column.
# NOTE(review): a Python UDF is presumably slower than a native Polars
# expression — acceptable here because the cell demonstrates UDFs.
out = pldf.select(pl.col("trip_distance_category").map_elements(transform_column, return_dtype=pl.Utf8))

In [None]:
# First three transformed values.
out.head(3)

In [None]:
# Lazy and Eager execution

In [None]:
@elapse_time
def transform_polars_eager(path):
    """Run the demo pipeline eagerly with Polars and display the result.

    Reads the CSV with ``pl.read_csv``, adds the demo columns one
    ``with_columns`` call at a time, duplicates the rows once, and displays
    the first rows of the maximum ``trip_distance`` per
    ``trip_distance_category``.

    Args:
        path: Location of the CSV file; it must contain a ``trip_distance``
            column.

    Returns:
        None. The aggregated frame is shown with ``display``.
    """
    def transform_column(value: str) -> str:
        return f"{value}_transformed"

    df = pl.read_csv(path)
    df = df.with_columns(
        new_column=pl.lit("important info")
    )
    df = df.with_columns(
        pl.lit(1.3578490).alias("another_new_column")
    )
    df = df.with_columns(
        pl.col("another_new_column").cast(pl.Int64).alias("another_new_column_INT64")
    )
    df = df.with_columns(
        pl.when(
            (pl.col("trip_distance")>0) & (pl.col("trip_distance")<1)
        )
        .then(pl.lit("0-1"))
        .otherwise(pl.lit("1+"))
        .alias("trip_distance_category")
    )
    df = df.with_columns(
        pl.mean("trip_distance").alias("mean_trip_distance")
    )
    df = df.with_columns(
        pl.col("trip_distance_category").map_elements(transform_column, return_dtype=pl.Utf8).alias("trip_distance_category_transformed")
    )
    # Duplicate the rows exactly once, as transform_polars_lazy does. An extra
    # concat here would make the eager run process twice as many rows and
    # bias the eager/lazy timing comparison. The per-category maximum is
    # unaffected by how often rows are duplicated.
    df2 = pl.concat([df, df])
    df3 = df2.group_by("trip_distance_category").agg(pl.col("trip_distance").max())
    display(df3.head())

In [None]:
@elapse_time
def transform_polars_lazy(path):
    """Run the demo pipeline lazily with Polars and display the result.

    Builds the query on top of ``pl.scan_csv``; nothing is materialised
    until ``collect()`` is called on the final preview.

    Args:
        path: Location of the CSV file; it must contain a ``trip_distance``
            column.

    Returns:
        None. The un-collected preview and then the collected preview are
        shown with ``display``.
    """
    def transform_column(value: str) -> str:
        return f"{value}_transformed"

    is_short_trip = (pl.col("trip_distance")>0) & (pl.col("trip_distance")<1)

    # One with_columns call per step, in the same order as before; later
    # steps depend on columns created by earlier ones.
    lazy_frame = (
        pl.scan_csv(path)
        .with_columns(new_column=pl.lit("important info"))
        .with_columns(pl.lit(1.3578490).alias("another_new_column"))
        .with_columns(
            pl.col("another_new_column").cast(pl.Int64).alias("another_new_column_INT64")
        )
        .with_columns(
            pl.when(is_short_trip)
            .then(pl.lit("0-1"))
            .otherwise(pl.lit("1+"))
            .alias("trip_distance_category")
        )
        .with_columns(pl.mean("trip_distance").alias("mean_trip_distance"))
        .with_columns(
            pl.col("trip_distance_category")
            .map_elements(transform_column, return_dtype=pl.Utf8)
            .alias("trip_distance_category_transformed")
        )
    )

    doubled = pl.concat([lazy_frame, lazy_frame])
    max_by_category = doubled.group_by("trip_distance_category").agg(
        pl.col("trip_distance").max()
    )
    # First display: the lazy object itself (no collect). Second display: the
    # materialised rows.
    display(max_by_category.head())
    display(max_by_category.head().collect())

In [None]:
# Run the eager pipeline; its duration is printed by the elapse_time decorator.
transform_polars_eager("data.csv")

In [None]:
# Run the lazy pipeline on the same file to compare the printed duration.
transform_polars_lazy("data.csv")

In [None]:
@elapse_time
def transform_pandas(df: pd.DataFrame) -> pd.DataFrame:
    """pandas counterpart of the Polars demo pipeline.

    Adds the same derived columns as ``transform_polars_eager`` and
    ``transform_polars_lazy``, duplicates the rows once, and returns the
    maximum ``trip_distance`` per ``trip_distance_category``.

    Args:
        df: Frame with a numeric ``trip_distance`` column. It is not
            modified; ``assign`` works on a copy.

    Returns:
        One row per category with the columns ``trip_distance_category`` and
        ``trip_distance`` (the per-category maximum).
    """
    def transform_column(value: str) -> str:
        return f"{value}_transformed"

    # Same rule as the Polars when/then/otherwise: strictly between 0 and 1
    # is "0-1", everything else is "1+".
    is_short_trip = (df["trip_distance"] > 0) & (df["trip_distance"] < 1)
    category = is_short_trip.map({True: "0-1", False: "1+"})

    enriched = df.assign(
        new_column="important info",
        another_new_column=1.3578490,
        # A callable lets this column read the one created just above it.
        another_new_column_INT64=lambda frame: frame["another_new_column"].astype("int64"),
        trip_distance_category=category,
        mean_trip_distance=df["trip_distance"].mean(),
        trip_distance_category_transformed=category.map(transform_column),
    )
    doubled = pd.concat([enriched, enriched], ignore_index=True)
    return doubled.groupby("trip_distance_category", as_index=False)["trip_distance"].max()

In [None]:
# Visualization

In [None]:
# Bar chart of passenger_count against trip_distance, first 500 rows only.
pldf[:500].plot.bar(x="trip_distance", y="passenger_count")

In [None]:
# Scatter plot of tip_amount against trip_distance, first 500 rows only.
pldf[:500].plot.point(x="trip_distance", y="tip_amount")