In [42]:
import polars as pl
from datetime import datetime

df = pl.DataFrame(
    {
        "integer": [1, 2, 3],
        "date": [
            datetime(2022, 1, 1),
            datetime(2022, 1, 2),
            datetime(2022, 1, 3),
        ],
        "float": [4.0, 5.0, 6.0],
    }
)

print(df)

shape: (3, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date                ┆ float │
│ ---     ┆ ---                 ┆ ---   │
│ i64     ┆ datetime[μs]        ┆ f64   │
╞═════════╪═════════════════════╪═══════╡
│ 1       ┆ 2022-01-01 00:00:00 ┆ 4.0   │
│ 2       ┆ 2022-01-02 00:00:00 ┆ 5.0   │
│ 3       ┆ 2022-01-03 00:00:00 ┆ 6.0   │
└─────────┴─────────────────────┴───────┘


In [3]:
import numpy as np

In [4]:
df = pl.DataFrame(
    {
        "a": range(8),
        "b": np.random.rand(8),
        "d": [1, 2.0, float("nan"), float("nan"), 0, -5, -42, None],
    }
)

In [5]:
df

a,b,d
i64,f64,f64
0,0.242939,1.0
1,0.772906,2.0
2,0.191597,
3,0.486959,
4,0.796929,0.0
5,0.234152,-5.0
6,0.637966,-42.0
7,0.997262,


In [6]:
df.select(pl.col("*"))

a,b,d
i64,f64,f64
0,0.242939,1.0
1,0.772906,2.0
2,0.191597,
3,0.486959,
4,0.796929,0.0
5,0.234152,-5.0
6,0.637966,-42.0
7,0.997262,


In [7]:
df.select(pl.col("a", "b"))

a,b
i64,f64
0,0.242939
1,0.772906
2,0.191597
3,0.486959
4,0.796929
5,0.234152
6,0.637966
7,0.997262


In [8]:
df.select(pl.col("a"), pl.col("b")).limit(3)

a,b
i64,f64
0,0.242939
1,0.772906
2,0.191597


In [9]:
df.select(pl.exclude("a"))

b,d
f64,f64
0.242939,1.0
0.772906,2.0
0.191597,
0.486959,
0.796929,0.0
0.234152,-5.0
0.637966,-42.0
0.997262,


In [11]:
df.filter(
    pl.col("b").is_between(0.1,0.6),
)

a,b,d
i64,f64,f64
0,0.242939,1.0
2,0.191597,
3,0.486959,
5,0.234152,-5.0


In [12]:
df.filter((pl.col("a") <= 5) & (pl.col("d").is_not_nan()))

a,b,d
i64,f64,f64
0,0.242939,1.0
1,0.772906,2.0
4,0.796929,0.0
5,0.234152,-5.0


In [13]:
df.with_columns(pl.col("b").sum().alias("e"), (pl.col("b") + 42).alias("b+42"))

a,b,d,e,b+42
i64,f64,f64,f64,f64
0,0.242939,1.0,4.360708,42.242939
1,0.772906,2.0,4.360708,42.772906
2,0.191597,,4.360708,42.191597
3,0.486959,,4.360708,42.486959
4,0.796929,0.0,4.360708,42.796929
5,0.234152,-5.0,4.360708,42.234152
6,0.637966,-42.0,4.360708,42.637966
7,0.997262,,4.360708,42.997262


In [14]:
df2 = pl.DataFrame(
    {
        "x": range(8),
        "y": ["A", "A", "A", "B", "B", "C", "X", "X"],
    }
)

In [15]:
df2.group_by("y", maintain_order=True).count()

y,count
str,u32
"""A""",3
"""B""",2
"""C""",1
"""X""",2


In [16]:
df2.group_by("y", maintain_order=True).agg(
    pl.col("*").count().alias("count"),
    pl.col("*").sum().alias("sum"),
)

y,count,sum
str,u32,i64
"""A""",3,3
"""B""",2,7
"""C""",1,5
"""X""",2,13


In [17]:
df_x = df.with_columns((pl.col("a") * pl.col("b")).alias("a * b")).select(
    pl.all().exclude(["c", "d"])
)

print(df_x)

shape: (8, 3)
┌─────┬──────────┬──────────┐
│ a   ┆ b        ┆ a * b    │
│ --- ┆ ---      ┆ ---      │
│ i64 ┆ f64      ┆ f64      │
╞═════╪══════════╪══════════╡
│ 0   ┆ 0.242939 ┆ 0.0      │
│ 1   ┆ 0.772906 ┆ 0.772906 │
│ 2   ┆ 0.191597 ┆ 0.383193 │
│ 3   ┆ 0.486959 ┆ 1.460878 │
│ 4   ┆ 0.796929 ┆ 3.187714 │
│ 5   ┆ 0.234152 ┆ 1.170759 │
│ 6   ┆ 0.637966 ┆ 3.827793 │
│ 7   ┆ 0.997262 ┆ 6.980831 │
└─────┴──────────┴──────────┘


In [18]:
df_y = df.with_columns((pl.col("a") * pl.col("b")).alias("a * b")).select(
    pl.all().exclude("d")
)

print(df_y)

shape: (8, 3)
┌─────┬──────────┬──────────┐
│ a   ┆ b        ┆ a * b    │
│ --- ┆ ---      ┆ ---      │
│ i64 ┆ f64      ┆ f64      │
╞═════╪══════════╪══════════╡
│ 0   ┆ 0.242939 ┆ 0.0      │
│ 1   ┆ 0.772906 ┆ 0.772906 │
│ 2   ┆ 0.191597 ┆ 0.383193 │
│ 3   ┆ 0.486959 ┆ 1.460878 │
│ 4   ┆ 0.796929 ┆ 3.187714 │
│ 5   ┆ 0.234152 ┆ 1.170759 │
│ 6   ┆ 0.637966 ┆ 3.827793 │
│ 7   ┆ 0.997262 ┆ 6.980831 │
└─────┴──────────┴──────────┘


In [19]:
df = pl.DataFrame(
    {
        "a": range(8),
        "b": np.random.rand(8),
        "d": [1, 2.0, float("nan"), float("nan"), 0, -5, -42, None],
    }
)

df2 = pl.DataFrame(
    {
        "x": range(8),
        "y": ["A", "A", "A", "B", "B", "C", "X", "X"],
    }
)
joined = df.join(df2, left_on="a", right_on="x")
print(joined)

shape: (8, 4)
┌─────┬──────────┬───────┬─────┐
│ a   ┆ b        ┆ d     ┆ y   │
│ --- ┆ ---      ┆ ---   ┆ --- │
│ i64 ┆ f64      ┆ f64   ┆ str │
╞═════╪══════════╪═══════╪═════╡
│ 0   ┆ 0.669424 ┆ 1.0   ┆ A   │
│ 1   ┆ 0.756802 ┆ 2.0   ┆ A   │
│ 2   ┆ 0.409069 ┆ NaN   ┆ A   │
│ 3   ┆ 0.90993  ┆ NaN   ┆ B   │
│ 4   ┆ 0.185282 ┆ 0.0   ┆ B   │
│ 5   ┆ 0.617747 ┆ -5.0  ┆ C   │
│ 6   ┆ 0.912349 ┆ -42.0 ┆ X   │
│ 7   ┆ 0.618671 ┆ null  ┆ X   │
└─────┴──────────┴───────┴─────┘


In [20]:
stacked = df.hstack(df2)
print(stacked)

shape: (8, 5)
┌─────┬──────────┬───────┬─────┬─────┐
│ a   ┆ b        ┆ d     ┆ x   ┆ y   │
│ --- ┆ ---      ┆ ---   ┆ --- ┆ --- │
│ i64 ┆ f64      ┆ f64   ┆ i64 ┆ str │
╞═════╪══════════╪═══════╪═════╪═════╡
│ 0   ┆ 0.669424 ┆ 1.0   ┆ 0   ┆ A   │
│ 1   ┆ 0.756802 ┆ 2.0   ┆ 1   ┆ A   │
│ 2   ┆ 0.409069 ┆ NaN   ┆ 2   ┆ A   │
│ 3   ┆ 0.90993  ┆ NaN   ┆ 3   ┆ B   │
│ 4   ┆ 0.185282 ┆ 0.0   ┆ 4   ┆ B   │
│ 5   ┆ 0.617747 ┆ -5.0  ┆ 5   ┆ C   │
│ 6   ┆ 0.912349 ┆ -42.0 ┆ 6   ┆ X   │
│ 7   ┆ 0.618671 ┆ null  ┆ 7   ┆ X   │
└─────┴──────────┴───────┴─────┴─────┘


In [21]:
df = pl.DataFrame(
    {
        "nrs": [1, 2, 3, None, 5],
        "names": ["foo", "ham", "spam", "egg", None],
        "random": np.random.rand(5),
        "groups": ["A", "A", "B", "C", "B"],
    }
)
print(df)

shape: (5, 4)
┌──────┬───────┬──────────┬────────┐
│ nrs  ┆ names ┆ random   ┆ groups │
│ ---  ┆ ---   ┆ ---      ┆ ---    │
│ i64  ┆ str   ┆ f64      ┆ str    │
╞══════╪═══════╪══════════╪════════╡
│ 1    ┆ foo   ┆ 0.5043   ┆ A      │
│ 2    ┆ ham   ┆ 0.437099 ┆ A      │
│ 3    ┆ spam  ┆ 0.217617 ┆ B      │
│ null ┆ egg   ┆ 0.366501 ┆ C      │
│ 5    ┆ null  ┆ 0.118551 ┆ B      │
└──────┴───────┴──────────┴────────┘


In [22]:
df_numerical = df.select(
    (pl.col("nrs") + 5).alias("nrs + 5"),
    (pl.col("nrs") - 5).alias("nrs - 5"),
    (pl.col("nrs") * pl.col("random")).alias("nrs * random"),
    (pl.col("nrs") / pl.col("random")).alias("nrs / random"),
)
print(df_numerical)

shape: (5, 4)
┌─────────┬─────────┬──────────────┬──────────────┐
│ nrs + 5 ┆ nrs - 5 ┆ nrs * random ┆ nrs / random │
│ ---     ┆ ---     ┆ ---          ┆ ---          │
│ i64     ┆ i64     ┆ f64          ┆ f64          │
╞═════════╪═════════╪══════════════╪══════════════╡
│ 6       ┆ -4      ┆ 0.5043       ┆ 1.982945     │
│ 7       ┆ -3      ┆ 0.874199     ┆ 4.575619     │
│ 8       ┆ -2      ┆ 0.65285      ┆ 13.7857      │
│ null    ┆ null    ┆ null         ┆ null         │
│ 10      ┆ 0       ┆ 0.592753     ┆ 42.176088    │
└─────────┴─────────┴──────────────┴──────────────┘


In [23]:
df_logical = df.select(
    (pl.col("nrs") > 1).alias("nrs > 1"),
    (pl.col("random") <= 0.5).alias("random <= .5"),
    (pl.col("nrs") != 1).alias("nrs != 1"),
    (pl.col("nrs") == 1).alias("nrs == 1"),
    ((pl.col("random") <= 0.5) & (pl.col("nrs") > 1)).alias("and_expr"),  # and
    ((pl.col("random") <= 0.5) | (pl.col("nrs") > 1)).alias("or_expr"),  # or
)
print(df_logical)

shape: (5, 6)
┌─────────┬──────────────┬──────────┬──────────┬──────────┬─────────┐
│ nrs > 1 ┆ random <= .5 ┆ nrs != 1 ┆ nrs == 1 ┆ and_expr ┆ or_expr │
│ ---     ┆ ---          ┆ ---      ┆ ---      ┆ ---      ┆ ---     │
│ bool    ┆ bool         ┆ bool     ┆ bool     ┆ bool     ┆ bool    │
╞═════════╪══════════════╪══════════╪══════════╪══════════╪═════════╡
│ false   ┆ false        ┆ false    ┆ true     ┆ false    ┆ false   │
│ true    ┆ true         ┆ true     ┆ false    ┆ true     ┆ true    │
│ true    ┆ true         ┆ true     ┆ false    ┆ true     ┆ true    │
│ null    ┆ true         ┆ null     ┆ null     ┆ null     ┆ true    │
│ true    ┆ true         ┆ true     ┆ false    ┆ true     ┆ true    │
└─────────┴──────────────┴──────────┴──────────┴──────────┴─────────┘


In [24]:
from datetime import date, datetime

df = pl.DataFrame(
    {
        "id": [9, 4, 2],
        "place": ["Mars", "Earth", "Saturn"],
        "date": pl.date_range(date(2022, 1, 1), date(2022, 1, 3), "1d", eager=True),
        "sales": [33.4, 2142134.1, 44.7],
        "has_people": [False, True, False],
        "logged_at": pl.datetime_range(
            datetime(2022, 12, 1), datetime(2022, 12, 1, 0, 0, 2), "1s", eager=True
        ),
    }
).with_row_count("rn")
print(df)

shape: (3, 7)
┌─────┬─────┬────────┬────────────┬───────────┬────────────┬─────────────────────┐
│ rn  ┆ id  ┆ place  ┆ date       ┆ sales     ┆ has_people ┆ logged_at           │
│ --- ┆ --- ┆ ---    ┆ ---        ┆ ---       ┆ ---        ┆ ---                 │
│ u32 ┆ i64 ┆ str    ┆ date       ┆ f64       ┆ bool       ┆ datetime[μs]        │
╞═════╪═════╪════════╪════════════╪═══════════╪════════════╪═════════════════════╡
│ 0   ┆ 9   ┆ Mars   ┆ 2022-01-01 ┆ 33.4      ┆ false      ┆ 2022-12-01 00:00:00 │
│ 1   ┆ 4   ┆ Earth  ┆ 2022-01-02 ┆ 2142134.1 ┆ true       ┆ 2022-12-01 00:00:01 │
│ 2   ┆ 2   ┆ Saturn ┆ 2022-01-03 ┆ 44.7      ┆ false      ┆ 2022-12-01 00:00:02 │
└─────┴─────┴────────┴────────────┴───────────┴────────────┴─────────────────────┘


In [25]:
out = df.select(pl.col("*"))

# Is equivalent to
out = df.select(pl.all())
print(out)

shape: (3, 7)
┌─────┬─────┬────────┬────────────┬───────────┬────────────┬─────────────────────┐
│ rn  ┆ id  ┆ place  ┆ date       ┆ sales     ┆ has_people ┆ logged_at           │
│ --- ┆ --- ┆ ---    ┆ ---        ┆ ---       ┆ ---        ┆ ---                 │
│ u32 ┆ i64 ┆ str    ┆ date       ┆ f64       ┆ bool       ┆ datetime[μs]        │
╞═════╪═════╪════════╪════════════╪═══════════╪════════════╪═════════════════════╡
│ 0   ┆ 9   ┆ Mars   ┆ 2022-01-01 ┆ 33.4      ┆ false      ┆ 2022-12-01 00:00:00 │
│ 1   ┆ 4   ┆ Earth  ┆ 2022-01-02 ┆ 2142134.1 ┆ true       ┆ 2022-12-01 00:00:01 │
│ 2   ┆ 2   ┆ Saturn ┆ 2022-01-03 ┆ 44.7      ┆ false      ┆ 2022-12-01 00:00:02 │
└─────┴─────┴────────┴────────────┴───────────┴────────────┴─────────────────────┘


In [26]:
out = df.select(pl.col("*").exclude("logged_at", "rn"))
print(out)

shape: (3, 5)
┌─────┬────────┬────────────┬───────────┬────────────┐
│ id  ┆ place  ┆ date       ┆ sales     ┆ has_people │
│ --- ┆ ---    ┆ ---        ┆ ---       ┆ ---        │
│ i64 ┆ str    ┆ date       ┆ f64       ┆ bool       │
╞═════╪════════╪════════════╪═══════════╪════════════╡
│ 9   ┆ Mars   ┆ 2022-01-01 ┆ 33.4      ┆ false      │
│ 4   ┆ Earth  ┆ 2022-01-02 ┆ 2142134.1 ┆ true       │
│ 2   ┆ Saturn ┆ 2022-01-03 ┆ 44.7      ┆ false      │
└─────┴────────┴────────────┴───────────┴────────────┘


In [27]:
out = df.select(pl.col("date", "logged_at").dt.to_string("%Y-%h-%d"))
print(out)

shape: (3, 2)
┌─────────────┬─────────────┐
│ date        ┆ logged_at   │
│ ---         ┆ ---         │
│ str         ┆ str         │
╞═════════════╪═════════════╡
│ 2022-Jan-01 ┆ 2022-Dec-01 │
│ 2022-Jan-02 ┆ 2022-Dec-01 │
│ 2022-Jan-03 ┆ 2022-Dec-01 │
└─────────────┴─────────────┘


In [28]:
out = df.select(pl.col("^.*(as|sa).*$"))
print(out)

shape: (3, 2)
┌───────────┬────────────┐
│ sales     ┆ has_people │
│ ---       ┆ ---        │
│ f64       ┆ bool       │
╞═══════════╪════════════╡
│ 33.4      ┆ false      │
│ 2142134.1 ┆ true       │
│ 44.7      ┆ false      │
└───────────┴────────────┘


In [29]:
import polars.selectors as cs

out = df.select(cs.integer(), cs.string())
print(out)

shape: (3, 3)
┌─────┬─────┬────────┐
│ rn  ┆ id  ┆ place  │
│ --- ┆ --- ┆ ---    │
│ u32 ┆ i64 ┆ str    │
╞═════╪═════╪════════╡
│ 0   ┆ 9   ┆ Mars   │
│ 1   ┆ 4   ┆ Earth  │
│ 2   ┆ 2   ┆ Saturn │
└─────┴─────┴────────┘


In [30]:
out = df.select(cs.numeric() - cs.first())
print(out)

shape: (3, 2)
┌─────┬───────────┐
│ id  ┆ sales     │
│ --- ┆ ---       │
│ i64 ┆ f64       │
╞═════╪═══════════╡
│ 9   ┆ 33.4      │
│ 4   ┆ 2142134.1 │
│ 2   ┆ 44.7      │
└─────┴───────────┘


In [31]:
out = df.select(cs.by_name("rn") | ~cs.numeric())
print(out)

shape: (3, 5)
┌─────┬────────┬────────────┬────────────┬─────────────────────┐
│ rn  ┆ place  ┆ date       ┆ has_people ┆ logged_at           │
│ --- ┆ ---    ┆ ---        ┆ ---        ┆ ---                 │
│ u32 ┆ str    ┆ date       ┆ bool       ┆ datetime[μs]        │
╞═════╪════════╪════════════╪════════════╪═════════════════════╡
│ 0   ┆ Mars   ┆ 2022-01-01 ┆ false      ┆ 2022-12-01 00:00:00 │
│ 1   ┆ Earth  ┆ 2022-01-02 ┆ true       ┆ 2022-12-01 00:00:01 │
│ 2   ┆ Saturn ┆ 2022-01-03 ┆ false      ┆ 2022-12-01 00:00:02 │
└─────┴────────┴────────────┴────────────┴─────────────────────┘


In [32]:
out = df.select(cs.contains("rn"), cs.matches(".*_.*"))
print(out)

shape: (3, 3)
┌─────┬────────────┬─────────────────────┐
│ rn  ┆ has_people ┆ logged_at           │
│ --- ┆ ---        ┆ ---                 │
│ u32 ┆ bool       ┆ datetime[μs]        │
╞═════╪════════════╪═════════════════════╡
│ 0   ┆ false      ┆ 2022-12-01 00:00:00 │
│ 1   ┆ true       ┆ 2022-12-01 00:00:01 │
│ 2   ┆ false      ┆ 2022-12-01 00:00:02 │
└─────┴────────────┴─────────────────────┘


In [33]:
out = df.select(cs.temporal().as_expr().dt.to_string("%Y-%h-%d"))
print(out)

shape: (3, 2)
┌─────────────┬─────────────┐
│ date        ┆ logged_at   │
│ ---         ┆ ---         │
│ str         ┆ str         │
╞═════════════╪═════════════╡
│ 2022-Jan-01 ┆ 2022-Dec-01 │
│ 2022-Jan-02 ┆ 2022-Dec-01 │
│ 2022-Jan-03 ┆ 2022-Dec-01 │
└─────────────┴─────────────┘


In [34]:
from polars.selectors import is_selector

out = cs.temporal()
print(is_selector(out))

True


In [35]:
from polars.selectors import expand_selector

out = cs.temporal().as_expr().dt.to_string("%Y-%h-%d")
print(expand_selector(df, out))

('date', 'logged_at')


In [36]:
df = pl.DataFrame(
    {
        "nrs": [1, 2, 3, None, 5],
        "names": ["foo", "ham", "spam", "egg", "spam"],
        "random": np.random.rand(5),
        "groups": ["A", "A", "B", "C", "B"],
    }
)
print(df)

shape: (5, 4)
┌──────┬───────┬──────────┬────────┐
│ nrs  ┆ names ┆ random   ┆ groups │
│ ---  ┆ ---   ┆ ---      ┆ ---    │
│ i64  ┆ str   ┆ f64      ┆ str    │
╞══════╪═══════╪══════════╪════════╡
│ 1    ┆ foo   ┆ 0.870271 ┆ A      │
│ 2    ┆ ham   ┆ 0.506531 ┆ A      │
│ 3    ┆ spam  ┆ 0.496187 ┆ B      │
│ null ┆ egg   ┆ 0.235141 ┆ C      │
│ 5    ┆ spam  ┆ 0.890038 ┆ B      │
└──────┴───────┴──────────┴────────┘


In [37]:
df_samename = df.select(pl.col("nrs") + 5)
print(df_samename)

shape: (5, 1)
┌──────┐
│ nrs  │
│ ---  │
│ i64  │
╞══════╡
│ 6    │
│ 7    │
│ 8    │
│ null │
│ 10   │
└──────┘


In [38]:
try:
    df_samename2 = df.select(pl.col("nrs") + 5, pl.col("nrs") - 5)
    print(df_samename2)
except Exception as e:
    print(e)

column with name 'nrs' has more than one occurrences


In [39]:
df_alias = df.select(
    (pl.col("nrs") + 5).alias("nrs + 5"),
    (pl.col("nrs") - 5).alias("nrs - 5"),
)
print(df_alias)

shape: (5, 2)
┌─────────┬─────────┐
│ nrs + 5 ┆ nrs - 5 │
│ ---     ┆ ---     │
│ i64     ┆ i64     │
╞═════════╪═════════╡
│ 6       ┆ -4      │
│ 7       ┆ -3      │
│ 8       ┆ -2      │
│ null    ┆ null    │
│ 10      ┆ 0       │
└─────────┴─────────┘


In [40]:
df_alias = df.select(
    pl.col("names").n_unique().alias("unique"),
    pl.approx_n_unique("names").alias("unique_approx"),
)
print(df_alias)

shape: (1, 2)
┌────────┬───────────────┐
│ unique ┆ unique_approx │
│ ---    ┆ ---           │
│ u32    ┆ u32           │
╞════════╪═══════════════╡
│ 4      ┆ 4             │
└────────┴───────────────┘


In [41]:
df_conditional = df.select(
    pl.col("nrs"),
    pl.when(pl.col("nrs") > 2)
    .then(pl.lit(True))
    .otherwise(pl.lit(False))
    .alias("conditional"),
)
print(df_conditional)

shape: (5, 2)
┌──────┬─────────────┐
│ nrs  ┆ conditional │
│ ---  ┆ ---         │
│ i64  ┆ bool        │
╞══════╪═════════════╡
│ 1    ┆ false       │
│ 2    ┆ false       │
│ 3    ┆ true        │
│ null ┆ false       │
│ 5    ┆ true        │
└──────┴─────────────┘
