In [14]:
%%time

import polars as pl

df_lazy = pl.scan_parquet('../data/*/yellow_taxi/*')

result = df_lazy.select("trip_distance","store_and_fwd_flag","total_amount").collect(streaming=True)

print(result)

shape: (1_125_446_833, 3)
┌───────────────┬────────────────────┬──────────────┐
│ trip_distance ┆ store_and_fwd_flag ┆ total_amount │
│ ---           ┆ ---                ┆ ---          │
│ f64           ┆ str                ┆ f64          │
╞═══════════════╪════════════════════╪══════════════╡
│ 0.9           ┆ N                  ┆ 5.9          │
│ 2.3           ┆ N                  ┆ 9.5          │
│ 2.2           ┆ N                  ┆ 10.3         │
│ 0.9           ┆ N                  ┆ 6.3          │
│ 0.7           ┆ N                  ┆ 5.5          │
│ …             ┆ …                  ┆ …            │
│ 2.43          ┆ null               ┆ 26.03        │
│ 0.0           ┆ null               ┆ 15.33        │
│ 1.8           ┆ null               ┆ 19.67        │
│ 3.39          ┆ null               ┆ 29.2         │
│ 0.0           ┆ null               ┆ 37.97        │
└───────────────┴────────────────────┴──────────────┘
CPU times: user 16.6 s, sys: 4.38 s, total: 21 s
Wall ti

In [1]:
%%time

import polars as pl

df_lazy = pl.scan_parquet('../data/*/yellow_taxi/*')

result = df_lazy.select([
    pl.col('total_amount'),
    pl.when(pl.col('total_amount') <= 10)
        .then(pl.lit('Good Deal'))
        .when(pl.col('total_amount').is_between(11,25))
        .then(pl.lit('Best Deal'))
        .otherwise(pl.lit('Excellent Deal'))
        .alias('amount_group'),
    pl.col('trip_distance'),
    pl.col('store_and_fwd_flag')
]).collect(streaming=True)

print(result)

shape: (1_125_446_833, 4)
┌──────────────┬────────────────┬───────────────┬────────────────────┐
│ total_amount ┆ amount_group   ┆ trip_distance ┆ store_and_fwd_flag │
│ ---          ┆ ---            ┆ ---           ┆ ---                │
│ f64          ┆ str            ┆ f64           ┆ str                │
╞══════════════╪════════════════╪═══════════════╪════════════════════╡
│ 5.9          ┆ Good Deal      ┆ 0.9           ┆ N                  │
│ 9.5          ┆ Good Deal      ┆ 2.3           ┆ N                  │
│ 10.3         ┆ Excellent Deal ┆ 2.2           ┆ N                  │
│ 6.3          ┆ Good Deal      ┆ 0.9           ┆ N                  │
│ 5.5          ┆ Good Deal      ┆ 0.7           ┆ N                  │
│ …            ┆ …              ┆ …             ┆ …                  │
│ 26.03        ┆ Excellent Deal ┆ 2.43          ┆ null               │
│ 15.33        ┆ Best Deal      ┆ 0.0           ┆ null               │
│ 19.67        ┆ Best Deal      ┆ 1.8           ┆ n

In [2]:
%%time

import polars as pl

df_lazy = pl.scan_parquet('../data/*/yellow_taxi/*')

result = df_lazy.filter(pl.col("trip_distance") > 5).group_by("store_and_fwd_flag").agg(
    [
        pl.col("total_amount").mean().alias("avg_total_amount"),
    ]
).with_columns(
    pl.when(pl.col("avg_total_amount") > 40)
    .then(pl.lit("Excellent"))
    .when(pl.col("avg_total_amount") > 35)
    .then(pl.lit("Good"))
    .otherwise(pl.lit("Not Good"))
    .alias("deal_category")
).collect(streaming=True)

print(result)

shape: (3, 3)
┌────────────────────┬──────────────────┬───────────────┐
│ store_and_fwd_flag ┆ avg_total_amount ┆ deal_category │
│ ---                ┆ ---              ┆ ---           │
│ str                ┆ f64              ┆ str           │
╞════════════════════╪══════════════════╪═══════════════╡
│ null               ┆ 36.333789        ┆ Good          │
│ Y                  ┆ 41.058912        ┆ Excellent     │
│ N                  ┆ 41.378407        ┆ Excellent     │
└────────────────────┴──────────────────┴───────────────┘
CPU times: user 20.5 s, sys: 4.65 s, total: 25.2 s
Wall time: 15 s
