In [None]:
! pip install polars[all]

In [1]:
import polars as pl

In [2]:
from datetime import datetime

# Small demo frame: five rows with one integer, one datetime and one float column.
df = pl.DataFrame(
    {
        "integer": list(range(1, 6)),
        # Midnight timestamps for 2022-01-01 through 2022-01-05.
        "date": [datetime(2022, 1, day) for day in range(1, 6)],
        "float": [4.0, 5.0, 6.0, 7.0, 8.0],
    }
)

# Plain-text rendering of the frame (shape, column dtypes and rows).
print(df)

shape: (5, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date                ┆ float │
│ ---     ┆ ---                 ┆ ---   │
│ i64     ┆ datetime[μs]        ┆ f64   │
╞═════════╪═════════════════════╪═══════╡
│ 1       ┆ 2022-01-01 00:00:00 ┆ 4.0   │
│ 2       ┆ 2022-01-02 00:00:00 ┆ 5.0   │
│ 3       ┆ 2022-01-03 00:00:00 ┆ 6.0   │
│ 4       ┆ 2022-01-04 00:00:00 ┆ 7.0   │
│ 5       ┆ 2022-01-05 00:00:00 ┆ 8.0   │
└─────────┴─────────────────────┴───────┘


In [3]:
# Frame with repeated join keys: 1 and 3 each appear twice, 2 appears once.
df1 = pl.DataFrame({"integer": [1, 1, 2, 3, 3], "float": [4.0, 5.0, 6.0, 7.0, 8.0]})

# Frame with unique join keys; its "float" column is built from Python ints.
df2 = pl.DataFrame({"integer": [1, 3], "float": [14, 16]})

In [4]:
# Left join with df2 on the left: every df2 row is kept and is repeated once per
# matching df1 row. The non-key column that exists in both frames comes back from
# df1 under the name "float_right".
df2.join(
    df1,
    on="integer",
    how="left",
)

integer,float,float_right
i64,i64,f64
1,14,4.0
1,14,5.0
3,16,7.0
3,16,8.0


In [5]:
# Lazy query over every parquet file matching the glob: number of rows per
# dispatching base, groups kept in first-seen order. Evaluation is deferred
# until .collect() is called on the result.
# NOTE(review): recent polars releases deprecate the zero-argument pl.count()
# in favour of pl.len() — confirm against the installed version before changing,
# since the output column name ("count") depends on it.
lz_df = (
    pl.scan_parquet("data/nyc_taxi/fhvhv_tripdata_*.parquet")
    .select("dispatching_base_num")
    .group_by("dispatching_base_num", maintain_order=True)
    .agg(pl.count())
)

In [6]:
# explain() returns the query plan as a string. As the bare last expression of
# the cell it is shown as a repr, so line breaks appear as literal "\n";
# wrapping the call in print(...) would render the plan as an indented tree.
lz_df.explain()

'AGGREGATE\n\t[count()] BY [col("dispatching_base_num")] FROM\n  FAST_PROJECT: [dispatching_base_num]\n    UNION\n      PLAN 0:\n        FAST_PROJECT: [dispatching_base_num]\n\n            Parquet SCAN data/nyc_taxi/fhvhv_tripdata_2021-02.parquet\n            PROJECT 1/24 COLUMNS\n      PLAN 1:\n        FAST_PROJECT: [dispatching_base_num]\n\n            Parquet SCAN data/nyc_taxi/fhvhv_tripdata_2021-03.parquet\n            PROJECT 1/24 COLUMNS\n      PLAN 2:\n        FAST_PROJECT: [dispatching_base_num]\n\n            Parquet SCAN data/nyc_taxi/fhvhv_tripdata_2021-04.parquet\n            PROJECT 1/24 COLUMNS\n      PLAN 3:\n        FAST_PROJECT: [dispatching_base_num]\n\n            Parquet SCAN data/nyc_taxi/fhvhv_tripdata_2021-05.parquet\n            PROJECT 1/24 COLUMNS\n      PLAN 4:\n        FAST_PROJECT: [dispatching_base_num]\n\n            Parquet SCAN data/nyc_taxi/fhvhv_tripdata_2021-06.parquet\n            PROJECT 1/24 COLUMNS\n      PLAN 5:\n        FAST_PROJECT: [dispatch

In [7]:
%%time
# The %%time cell magic has to remain the first line of the cell.
# collect() executes the lazy query and returns the materialised result; in the
# recorded run CPU time is far above wall time, i.e. the work ran on several cores.
lz_df.collect()

CPU times: user 51.8 s, sys: 10.6 s, total: 1min 2s
Wall time: 6.66 s


dispatching_base_num,count
str,u32
"""B02764""",11192632
"""B02510""",31052693
"""B02872""",9337432
"""B02888""",1887161
"""B02800""",767254
"""B02883""",2564640
"""B02867""",1963688
"""B02864""",3011766
"""B02835""",2068662
"""B02869""",4416916


In [4]:
# Lazy query over the same parquet glob: keep the licence number and driver pay
# columns, then keep only the rows whose licence number is "HV0003".
# Note that this rebinds lz_df, replacing the query it previously referred to.
lz_df = (
    pl.scan_parquet("data/nyc_taxi/fhvhv_tripdata_*.parquet")
    .select("hvfhs_license_num", "driver_pay")
    .filter(pl.col("hvfhs_license_num") == "HV0003")
)

In [10]:
# explain() returns the query plan as a string; the recorded plan shows the
# filter as a SELECTION inside each per-file parquet scan. Shown as a repr here,
# so line breaks appear as literal "\n" — print(...) would render them.
lz_df.explain()

'FAST_PROJECT: [hvfhs_license_num, driver_pay]\n  UNION\n    PLAN 0:\n      FAST_PROJECT: [hvfhs_license_num, driver_pay]\n\n          Parquet SCAN data/nyc_taxi/fhvhv_tripdata_2021-02.parquet\n          PROJECT 2/24 COLUMNS\n          SELECTION: [(col("hvfhs_license_num")) == (Utf8(HV0003))]\n    PLAN 1:\n      FAST_PROJECT: [hvfhs_license_num, driver_pay]\n\n          Parquet SCAN data/nyc_taxi/fhvhv_tripdata_2021-03.parquet\n          PROJECT 2/24 COLUMNS\n          SELECTION: [(col("hvfhs_license_num")) == (Utf8(HV0003))]\n    PLAN 2:\n      FAST_PROJECT: [hvfhs_license_num, driver_pay]\n\n          Parquet SCAN data/nyc_taxi/fhvhv_tripdata_2021-04.parquet\n          PROJECT 2/24 COLUMNS\n          SELECTION: [(col("hvfhs_license_num")) == (Utf8(HV0003))]\n    PLAN 3:\n      FAST_PROJECT: [hvfhs_license_num, driver_pay]\n\n          Parquet SCAN data/nyc_taxi/fhvhv_tripdata_2021-05.parquet\n          PROJECT 2/24 COLUMNS\n          SELECTION: [(col("hvfhs_license_num")) == (Utf8(HV

In [5]:
# Restrict the lazy query to its first rows (head() with its default row count,
# five in the recorded output) and only then execute it.
lz_df.head().collect()

hvfhs_license_num,driver_pay
str,f64
"""HV0003""",9.79
"""HV0003""",24.01
"""HV0003""",8.53
"""HV0003""",16.05
"""HV0003""",25.42


In [7]:
# Column names and dtypes of the lazy query's result.
# NOTE(review): newer polars releases recommend LazyFrame.collect_schema() over
# the .schema property — confirm against the installed version.
lz_df.schema

{'hvfhs_license_num': Utf8, 'driver_pay': Float64}

In [3]:
%%time
# The %%time cell magic has to remain the first line of the cell.
# Summary statistics for each column of df, the small in-memory demo frame with
# the "integer", "date" and "float" columns (not the taxi data behind lz_df).
df.describe()

CPU times: user 11.5 ms, sys: 2.33 ms, total: 13.8 ms
Wall time: 23.5 ms


describe,integer,date,float
str,f64,str,f64
"""count""",5.0,"""5""",5.0
"""null_count""",0.0,"""0""",0.0
"""mean""",3.0,,6.0
"""std""",1.581139,,1.581139
"""min""",1.0,"""2022-01-01 00:…",4.0
"""25%""",2.0,,5.0
"""50%""",3.0,,6.0
"""75%""",4.0,,7.0
"""max""",5.0,"""2022-01-05 00:…",8.0
