This Jupyter Notebook is based on the Polars User Guide, with two differences:
- it includes fixes
- it is downloadable

Polars is like Pandas, but faster.

Official Documentation: https://docs.pola.rs/

In [83]:
# This installs libraries into the environment.
!pip install polars


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [84]:
from datetime import datetime
from pathlib import Path

import numpy as np
import polars as pl

### Polars Basics

In [85]:
# Sample frame with an integer column, three consecutive days, and a float column.
df = pl.DataFrame(
    {
        "a": [1, 2, 3],
        "b": [datetime(2022, 1, day) for day in (1, 2, 3)],
        "c": [4.0, 5.0, 6.0],
    }
)

# Column names, then the shape, then the rendered table, one per line.
print(df.columns, df.shape, df, sep="\n")

['a', 'b', 'c']
(3, 3)
shape: (3, 3)
┌─────┬─────────────────────┬─────┐
│ a   ┆ b                   ┆ c   │
│ --- ┆ ---                 ┆ --- │
│ i64 ┆ datetime[μs]        ┆ f64 │
╞═════╪═════════════════════╪═════╡
│ 1   ┆ 2022-01-01 00:00:00 ┆ 4.0 │
│ 2   ┆ 2022-01-02 00:00:00 ┆ 5.0 │
│ 3   ┆ 2022-01-03 00:00:00 ┆ 6.0 │
└─────┴─────────────────────┴─────┘


### File IO

In [86]:
# Make sure the output directory exists before writing: a freshly downloaded
# copy of this notebook does not come with `docs/data/`, and the write below
# fails if the directory is missing.
Path("docs/data").mkdir(parents=True, exist_ok=True)

# Round-trip the frame through CSV. CSV stores no type information, so the
# datetime column "b" comes back as plain strings in this read.
df.write_csv("docs/data/cookbook_output.csv")
df_csv = pl.read_csv("docs/data/cookbook_output.csv")
print(df_csv)

shape: (3, 3)
┌─────┬────────────────────────────┬─────┐
│ a   ┆ b                          ┆ c   │
│ --- ┆ ---                        ┆ --- │
│ i64 ┆ str                        ┆ f64 │
╞═════╪════════════════════════════╪═════╡
│ 1   ┆ 2022-01-01T00:00:00.000000 ┆ 4.0 │
│ 2   ┆ 2022-01-02T00:00:00.000000 ┆ 5.0 │
│ 3   ┆ 2022-01-03T00:00:00.000000 ┆ 6.0 │
└─────┴────────────────────────────┴─────┘


In [87]:
# Read the same CSV again, this time letting Polars try to parse date-like
# columns, so "b" is restored as a datetime column.
df_csv = pl.read_csv(
    "docs/data/cookbook_output.csv",
    try_parse_dates=True,
)
print(df_csv)

shape: (3, 3)
┌─────┬─────────────────────┬─────┐
│ a   ┆ b                   ┆ c   │
│ --- ┆ ---                 ┆ --- │
│ i64 ┆ datetime[μs]        ┆ f64 │
╞═════╪═════════════════════╪═════╡
│ 1   ┆ 2022-01-01 00:00:00 ┆ 4.0 │
│ 2   ┆ 2022-01-02 00:00:00 ┆ 5.0 │
│ 3   ┆ 2022-01-03 00:00:00 ┆ 6.0 │
└─────┴─────────────────────┴─────┘


In [88]:
# Round-trip the frame through a JSON file and show what comes back.
df.write_json(
    "docs/data/cookbook_output.json",
)
df_json = pl.read_json(
    "docs/data/cookbook_output.json",
)
print(df_json)

shape: (3, 3)
┌─────┬─────────────────────┬─────┐
│ a   ┆ b                   ┆ c   │
│ --- ┆ ---                 ┆ --- │
│ i64 ┆ datetime[μs]        ┆ f64 │
╞═════╪═════════════════════╪═════╡
│ 1   ┆ 2022-01-01 00:00:00 ┆ 4.0 │
│ 2   ┆ 2022-01-02 00:00:00 ┆ 5.0 │
│ 3   ┆ 2022-01-03 00:00:00 ┆ 6.0 │
└─────┴─────────────────────┴─────┘


In [89]:
# Round-trip the frame through a Parquet file and show what comes back.
df.write_parquet(
    "docs/data/cookbook_output.parquet",
)
df_parquet = pl.read_parquet(
    "docs/data/cookbook_output.parquet",
)
print(df_parquet)

shape: (3, 3)
┌─────┬─────────────────────┬─────┐
│ a   ┆ b                   ┆ c   │
│ --- ┆ ---                 ┆ --- │
│ i64 ┆ datetime[μs]        ┆ f64 │
╞═════╪═════════════════════╪═════╡
│ 1   ┆ 2022-01-01 00:00:00 ┆ 4.0 │
│ 2   ┆ 2022-01-02 00:00:00 ┆ 5.0 │
│ 3   ┆ 2022-01-03 00:00:00 ┆ 6.0 │
└─────┴─────────────────────┴─────┘


### Queries
- select
- limit
- exclude
- filter (is_between)


In [90]:
# Select every column. `pl.all()` is the dedicated "all columns" expression;
# using it directly avoids binding a notebook-level variable called `all`,
# which would shadow Python's built-in `all()` in every later cell.
df.select(pl.all())

a,b,c
i64,datetime[μs],f64
1,2022-01-01 00:00:00,4.0
2,2022-01-02 00:00:00,5.0
3,2022-01-03 00:00:00,6.0


In [91]:
# Select columns "a" and "b" with a single multi-name column expression.
df.select(pl.col(["a", "b"]))

a,b
i64,datetime[μs]
1,2022-01-01 00:00:00
2,2022-01-02 00:00:00
3,2022-01-03 00:00:00


In [92]:
# Pick "a" and "b" one expression at a time, then keep only the first row.
(
    df
    .select(pl.col("a"), pl.col("b"))
    .limit(1)
)

a,b
i64,datetime[μs]
1,2022-01-01 00:00:00


In [93]:
# Select every column except "a".
df.select(pl.all().exclude("a"))

b,c
datetime[μs],f64
2022-01-01 00:00:00,4.0
2022-01-02 00:00:00,5.0
2022-01-03 00:00:00,6.0


In [94]:
# Keep rows whose timestamp lies inside a date window (both bounds included).
# The window is applied to "b", the datetime column of this frame; "c" holds
# floats and cannot be meaningfully compared against datetime bounds.
# All sample rows are from January 2022, so this December 2022 window matches
# nothing and the result is an empty frame with the original schema.
df.filter(
    pl.col("b").is_between(datetime(2022, 12, 2), datetime(2022, 12, 8))
)

a,b,c
i64,datetime[μs],f64


In [95]:
# Combine two conditions with `&`: "a" is at most 3 AND "c" is not NaN.
df.filter(
    (pl.col("a") <= 3)
    & pl.col("c").is_not_nan()
)

a,b,c
i64,datetime[μs],f64
1,2022-01-01 00:00:00,4.0
2,2022-01-02 00:00:00,5.0
3,2022-01-03 00:00:00,6.0


### Feature Engineering

In [96]:
# Add two derived columns:
#   "d"    - the sum of "c", repeated on every row
#   "c+42" - "c" shifted by 42, row by row
df.with_columns(
    pl.col("c").sum().alias("d"),
    (pl.col("c") + 42).alias("c+42"),
)

a,b,c,d,c+42
i64,datetime[μs],f64,f64,f64
1,2022-01-01 00:00:00,4.0,15.0,46.0
2,2022-01-02 00:00:00,5.0,15.0,47.0
3,2022-01-03 00:00:00,6.0,15.0,48.0


In [97]:
# Eight rows: a running integer "x" and a single-letter group label "y".
df2 = pl.DataFrame({"x": range(8), "y": list("AAABBCXX")})
df2

x,y
i64,str
0,"""A"""
1,"""A"""
2,"""A"""
3,"""B"""
4,"""B"""
5,"""C"""
6,"""X"""
7,"""X"""


In [98]:
# Count the rows per distinct "y" value, keeping groups in first-seen order.
(
    df2
    .group_by("y", maintain_order=True)
    .count()
)

y,count
str,u32
"""A""",3
"""B""",2
"""C""",1
"""X""",2


In [99]:
# Per-group row count and sum of "x", with groups kept in first-seen order.
# - The result gets its own name instead of overwriting `df2`, so this cell
#   can be re-run without operating on an already-aggregated frame.
# - "x" is named explicitly: the "*" wildcard only worked because "x" was the
#   single non-key column; with more columns every match would be aliased to
#   the same output name.
df2_agg = df2.group_by("y", maintain_order=True).agg(
    pl.col("x").count().alias("count"),
    pl.col("x").sum().alias("sum"),
)
df2_agg

y,count,sum
str,u32,i64
"""A""",3,3
"""B""",2,7
"""C""",1,5
"""X""",2,13


In [100]:
# Four-row frame: integers, floats, dates (with one repeated day), floats.
df = pl.DataFrame(
    {
        "a": [1, 2, 3, 4],
        "b": [4.0, 5.0, 6.0, 7.0],
        "c": [datetime(2022, 1, day) for day in (1, 2, 2, 3)],
        "d": [1.1, 1.2, 1.3, 1.4],
    }
)

# Add the product column "a * b", then keep everything except "c" and "d".
df_x = (
    df
    .with_columns((pl.col("a") * pl.col("b")).alias("a * b"))
    .select(pl.all().exclude(["c", "d"]))
)

print(df_x)

shape: (4, 3)
┌─────┬─────┬───────┐
│ a   ┆ b   ┆ a * b │
│ --- ┆ --- ┆ ---   │
│ i64 ┆ f64 ┆ f64   │
╞═════╪═════╪═══════╡
│ 1   ┆ 4.0 ┆ 4.0   │
│ 2   ┆ 5.0 ┆ 10.0  │
│ 3   ┆ 6.0 ┆ 18.0  │
│ 4   ┆ 7.0 ┆ 28.0  │
└─────┴─────┴───────┘


In [101]:
# Add the product column "a * b", then keep everything except "d".
df_y = (
    df
    .with_columns((pl.col("a") * pl.col("b")).alias("a * b"))
    .select(pl.all().exclude("d"))
)

print(df_y)

shape: (4, 4)
┌─────┬─────┬─────────────────────┬───────┐
│ a   ┆ b   ┆ c                   ┆ a * b │
│ --- ┆ --- ┆ ---                 ┆ ---   │
│ i64 ┆ f64 ┆ datetime[μs]        ┆ f64   │
╞═════╪═════╪═════════════════════╪═══════╡
│ 1   ┆ 4.0 ┆ 2022-01-01 00:00:00 ┆ 4.0   │
│ 2   ┆ 5.0 ┆ 2022-01-02 00:00:00 ┆ 10.0  │
│ 3   ┆ 6.0 ┆ 2022-01-02 00:00:00 ┆ 18.0  │
│ 4   ┆ 7.0 ┆ 2022-01-03 00:00:00 ┆ 28.0  │
└─────┴─────┴─────────────────────┴───────┘


### Combining DataFrames

In [102]:
# Left-hand frame. Column "b" holds random floats drawn from a seeded
# generator, so the values are identical on every run of the notebook.
# Column "d" deliberately mixes regular numbers, NaN and a missing value.
df = pl.DataFrame(
    {
        "a": range(8),
        "b": np.random.default_rng(42).random(8),
        "d": [1, 2.0, float("nan"), float("nan"), 0, -5, -42, None],
    }
)

# Right-hand frame: a key "x" and a group label "y".
df2 = pl.DataFrame(
    {
        "x": range(8),
        "y": ["A", "A", "A", "B", "B", "C", "X", "X"],
    }
)

# Inner join matching `df.a` against `df2.x`; the join type is spelled out
# rather than left to the default.
joined = df.join(df2, left_on="a", right_on="x", how="inner")
print(joined)

shape: (8, 4)
┌─────┬──────────┬───────┬─────┐
│ a   ┆ b        ┆ d     ┆ y   │
│ --- ┆ ---      ┆ ---   ┆ --- │
│ i64 ┆ f64      ┆ f64   ┆ str │
╞═════╪══════════╪═══════╪═════╡
│ 0   ┆ 0.562951 ┆ 1.0   ┆ A   │
│ 1   ┆ 0.964259 ┆ 2.0   ┆ A   │
│ 2   ┆ 0.021492 ┆ NaN   ┆ A   │
│ 3   ┆ 0.455607 ┆ NaN   ┆ B   │
│ 4   ┆ 0.505398 ┆ 0.0   ┆ B   │
│ 5   ┆ 0.626036 ┆ -5.0  ┆ C   │
│ 6   ┆ 0.578255 ┆ -42.0 ┆ X   │
│ 7   ┆ 0.62413  ┆ null  ┆ X   │
└─────┴──────────┴───────┴─────┘


### Concat

Vertical concatenation will make the DataFrame longer. Horizontal concatenation will make the DataFrame wider.

In [103]:
# Horizontal stack: append the columns of `df2` to the right of `df`,
# returning a new frame rather than modifying `df`.
stacked = df.hstack(df2, in_place=False)
print(stacked)

shape: (8, 5)
┌─────┬──────────┬───────┬─────┬─────┐
│ a   ┆ b        ┆ d     ┆ x   ┆ y   │
│ --- ┆ ---      ┆ ---   ┆ --- ┆ --- │
│ i64 ┆ f64      ┆ f64   ┆ i64 ┆ str │
╞═════╪══════════╪═══════╪═════╪═════╡
│ 0   ┆ 0.562951 ┆ 1.0   ┆ 0   ┆ A   │
│ 1   ┆ 0.964259 ┆ 2.0   ┆ 1   ┆ A   │
│ 2   ┆ 0.021492 ┆ NaN   ┆ 2   ┆ A   │
│ 3   ┆ 0.455607 ┆ NaN   ┆ 3   ┆ B   │
│ 4   ┆ 0.505398 ┆ 0.0   ┆ 4   ┆ B   │
│ 5   ┆ 0.626036 ┆ -5.0  ┆ 5   ┆ C   │
│ 6   ┆ 0.578255 ┆ -42.0 ┆ 6   ┆ X   │
│ 7   ┆ 0.62413  ┆ null  ┆ 7   ┆ X   │
└─────┴──────────┴───────┴─────┴─────┘
