In [1]:
%%time

import polars as pl

# The three columns projected out of the scanned parquet files.
columns_to_read = [
    "trip_distance",
    "store_and_fwd_flag",
    "fare_amount",
]

# Build the lazy query in two steps: scan the glob, then narrow it to the
# columns listed above.
df_lazy = pl.scan_parquet("../data/*/yellow_taxi/*")
df_lazy = df_lazy.select(columns_to_read)

# Run the query with streaming enabled and print the materialised frame.
result = df_lazy.collect(streaming=True)
print(result)

shape: (1_471_335_254, 3)
┌───────────────┬────────────────────┬─────────────┐
│ trip_distance ┆ store_and_fwd_flag ┆ fare_amount │
│ ---           ┆ ---                ┆ ---         │
│ f64           ┆ str                ┆ f64         │
╞═══════════════╪════════════════════╪═════════════╡
│ 0.75          ┆ null               ┆ 4.5         │
│ 5.9           ┆ null               ┆ 15.3        │
│ 4.0           ┆ null               ┆ 11.7        │
│ 4.7           ┆ null               ┆ 13.3        │
│ 0.6           ┆ 0                  ┆ 5.3         │
│ …             ┆ …                  ┆ …           │
│ 2.43          ┆ null               ┆ 17.69       │
│ 0.0           ┆ null               ┆ 11.33       │
│ 1.8           ┆ null               ┆ 12.1        │
│ 3.39          ┆ null               ┆ 20.33       │
│ 0.0           ┆ null               ┆ 36.47       │
└───────────────┴────────────────────┴─────────────┘
CPU times: user 20.2 s, sys: 5.39 s, total: 25.6 s
Wall time: 34.4 s


In [2]:
%%time 

import polars as pl

# Scan every matching file and keep only rows whose store_and_fwd_flag
# equals "N".
df_lazy = pl.scan_parquet("../data/*/yellow_taxi/*").filter(
    pl.col("store_and_fwd_flag") == "N"
)

# Project the two columns to display, then run the query with streaming
# enabled.
result = (
    df_lazy
    .select("trip_distance", "store_and_fwd_flag")
    .collect(streaming=True)
)
print(result)

shape: (1_015_899_890, 2)
┌───────────────┬────────────────────┐
│ trip_distance ┆ store_and_fwd_flag │
│ ---           ┆ ---                │
│ f64           ┆ str                │
╞═══════════════╪════════════════════╡
│ 0.9           ┆ N                  │
│ 1.8           ┆ N                  │
│ 1.8           ┆ N                  │
│ 1.6           ┆ N                  │
│ 5.9           ┆ N                  │
│ …             ┆ …                  │
│ 1.5           ┆ N                  │
│ 8.9           ┆ N                  │
│ 2.3           ┆ N                  │
│ 3.44          ┆ N                  │
│ 6.14          ┆ N                  │
└───────────────┴────────────────────┘
CPU times: user 13.2 s, sys: 5.63 s, total: 18.9 s
Wall time: 28.8 s


In [4]:
%%time

import polars as pl

# Scan every matching file and keep only rows with trip_distance of at
# least 20.
df_lazy = pl.scan_parquet("../data/*/yellow_taxi/*").filter(
    pl.col("trip_distance") >= 20
)

# Project the two columns to display, then run the query with streaming
# enabled.
result = (
    df_lazy
    .select("trip_distance", "store_and_fwd_flag")
    .collect(streaming=True)
)
print(result)

shape: (7_521_674, 2)
┌───────────────┬────────────────────┐
│ trip_distance ┆ store_and_fwd_flag │
│ ---           ┆ ---                │
│ f64           ┆ str                │
╞═══════════════╪════════════════════╡
│ 22.1          ┆ null               │
│ 20.34         ┆ null               │
│ 21.4          ┆ null               │
│ 22.12         ┆ null               │
│ 25.79         ┆ null               │
│ …             ┆ …                  │
│ 20.45         ┆ null               │
│ 28.4          ┆ null               │
│ 20.01         ┆ null               │
│ 20.7          ┆ null               │
│ 31.11         ┆ null               │
└───────────────┴────────────────────┘
CPU times: user 11.8 s, sys: 3.2 s, total: 15 s
Wall time: 13.9 s


In [6]:
%%time

import polars as pl

df_lazy = pl.scan_parquet("../data/*/yellow_taxi/*")

# A row is kept only when BOTH conditions hold: trip_distance >= 20 and
# fare_amount >= 50. Each comparison is parenthesised before being combined
# with `&`.
filtered_and = df_lazy.filter(
    (pl.col("trip_distance") >= 20)
    & (pl.col("fare_amount") >= 50)
)

# fare_amount is used for filtering only; it is not part of the projection.
result = (
    filtered_and
    .select("trip_distance", "store_and_fwd_flag")
    .collect(streaming=True)
)
print(result)

shape: (6_174_770, 2)
┌───────────────┬────────────────────┐
│ trip_distance ┆ store_and_fwd_flag │
│ ---           ┆ ---                │
│ f64           ┆ str                │
╞═══════════════╪════════════════════╡
│ 25.79         ┆ null               │
│ 26.8          ┆ null               │
│ 26.4          ┆ 0                  │
│ 22.9          ┆ null               │
│ 27.9          ┆ 0                  │
│ …             ┆ …                  │
│ 20.45         ┆ null               │
│ 28.4          ┆ null               │
│ 20.01         ┆ null               │
│ 20.7          ┆ null               │
│ 31.11         ┆ null               │
└───────────────┴────────────────────┘
CPU times: user 20.8 s, sys: 3.71 s, total: 24.5 s
Wall time: 14.3 s


In [8]:
%%time

import polars as pl

df_lazy = pl.scan_parquet("../data/*/yellow_taxi/*")

# Keep only rows where store_and_fwd_flag has a value.
# NOTE(review): the variable is called `filtered_and` although a single
# predicate is applied here; the name is left unchanged.
filtered_and = df_lazy.filter(
    pl.col("store_and_fwd_flag").is_not_null()
)

# Project the two columns to display, then run the query with streaming
# enabled.
result = (
    filtered_and
    .select("trip_distance", "store_and_fwd_flag")
    .collect(streaming=True)
)
print(result)

shape: (1_085_438_881, 2)
┌───────────────┬────────────────────┐
│ trip_distance ┆ store_and_fwd_flag │
│ ---           ┆ ---                │
│ f64           ┆ str                │
╞═══════════════╪════════════════════╡
│ 0.6           ┆ 0                  │
│ 2.4           ┆ 0                  │
│ 0.9           ┆ 0                  │
│ 1.6           ┆ 0                  │
│ 2.8           ┆ 0                  │
│ …             ┆ …                  │
│ 1.5           ┆ N                  │
│ 8.9           ┆ N                  │
│ 2.3           ┆ N                  │
│ 3.44          ┆ N                  │
│ 6.14          ┆ N                  │
└───────────────┴────────────────────┘
CPU times: user 12.2 s, sys: 4.03 s, total: 16.2 s
Wall time: 15.1 s


In [9]:
%%time

import polars as pl

df_lazy = pl.scan_parquet("../data/*/yellow_taxi/*")

# Keep only rows where store_and_fwd_flag is null.
# NOTE(review): the variable is called `filtered_and` although a single
# predicate is applied here; the name is left unchanged.
filtered_and = df_lazy.filter(
    pl.col("store_and_fwd_flag").is_null()
)

# Project the two columns to display, then run the query with streaming
# enabled.
result = (
    filtered_and
    .select("trip_distance", "store_and_fwd_flag")
    .collect(streaming=True)
)
print(result)

shape: (385_896_373, 2)
┌───────────────┬────────────────────┐
│ trip_distance ┆ store_and_fwd_flag │
│ ---           ┆ ---                │
│ f64           ┆ str                │
╞═══════════════╪════════════════════╡
│ 0.75          ┆ null               │
│ 5.9           ┆ null               │
│ 4.0           ┆ null               │
│ 4.7           ┆ null               │
│ 3.3           ┆ null               │
│ …             ┆ …                  │
│ 2.43          ┆ null               │
│ 0.0           ┆ null               │
│ 1.8           ┆ null               │
│ 3.39          ┆ null               │
│ 0.0           ┆ null               │
└───────────────┴────────────────────┘
CPU times: user 8.46 s, sys: 3.31 s, total: 11.8 s
Wall time: 11.9 s


In [10]:
%%time

import polars as pl

df_lazy = pl.scan_parquet("../data/*/yellow_taxi/*")

# Keep rows whose trip_distance lies between 10 and 20. Whether the end
# points themselves are included is decided by is_between's default, which
# is not overridden here.
# NOTE(review): the variable is called `filtered_and` although a single
# predicate is applied here; the name is left unchanged.
filtered_and = df_lazy.filter(
    pl.col("trip_distance").is_between(10, 20)
)

# Project the two columns to display, then run the query with streaming
# enabled.
result = (
    filtered_and
    .select("trip_distance", "store_and_fwd_flag")
    .collect(streaming=True)
)
print(result)

shape: (70_244_463, 2)
┌───────────────┬────────────────────┐
│ trip_distance ┆ store_and_fwd_flag │
│ ---           ┆ ---                │
│ f64           ┆ str                │
╞═══════════════╪════════════════════╡
│ 17.3          ┆ null               │
│ 17.2          ┆ null               │
│ 13.97         ┆ null               │
│ 16.7          ┆ 0                  │
│ 14.0          ┆ 0                  │
│ …             ┆ …                  │
│ 11.28         ┆ null               │
│ 13.18         ┆ null               │
│ 10.64         ┆ null               │
│ 10.21         ┆ null               │
│ 17.35         ┆ null               │
└───────────────┴────────────────────┘
CPU times: user 12.7 s, sys: 3.33 s, total: 16 s
Wall time: 14.8 s
