In [None]:
#%pip install polars
"""
Author: Matt Martin
Date: 2023-11-12
Desc: Polars Demo
"""

In [13]:
import polars as pl
from datetime import datetime

In [23]:
#create some data
dataset = {
    'user_name': ['Bob','Ted','Amy','Nancy','Greg','Brittany'],
    'hire_date': ['2023-01-01','2022-02-02','2018-05-12','2019-07-08','2022-04-05','2020-08-01'],
    'title': ['Senior Manager','Manager','Director','Analyst','Analyst','Manager']
}

df = pl.DataFrame(dataset)

#update date column from string to date
df = df.with_columns(pl.col('hire_date').cast(pl.Date).alias('hire_date'))

print(df)

shape: (6, 3)
┌───────────┬────────────┬────────────────┐
│ user_name ┆ hire_date  ┆ title          │
│ ---       ┆ ---        ┆ ---            │
│ str       ┆ date       ┆ str            │
╞═══════════╪════════════╪════════════════╡
│ Bob       ┆ 2023-01-01 ┆ Senior Manager │
│ Ted       ┆ 2022-02-02 ┆ Manager        │
│ Amy       ┆ 2018-05-12 ┆ Director       │
│ Nancy     ┆ 2019-07-08 ┆ Analyst        │
│ Greg      ┆ 2022-04-05 ┆ Analyst        │
│ Brittany  ┆ 2020-08-01 ┆ Manager        │
└───────────┴────────────┴────────────────┘


In [24]:
#filtering
df2 = df.filter(pl.col('hire_date') > datetime(2022,1,1))
print(df2)

shape: (3, 3)
┌───────────┬────────────┬────────────────┐
│ user_name ┆ hire_date  ┆ title          │
│ ---       ┆ ---        ┆ ---            │
│ str       ┆ date       ┆ str            │
╞═══════════╪════════════╪════════════════╡
│ Bob       ┆ 2023-01-01 ┆ Senior Manager │
│ Ted       ┆ 2022-02-02 ┆ Manager        │
│ Greg      ┆ 2022-04-05 ┆ Analyst        │
└───────────┴────────────┴────────────────┘


In [25]:
#grouping
df3 = df.group_by('title').agg(pl.col('user_name').count().alias('row_cnt'))
print(df3.sort('row_cnt',descending=True))

shape: (4, 2)
┌────────────────┬─────────┐
│ title          ┆ row_cnt │
│ ---            ┆ ---     │
│ str            ┆ u32     │
╞════════════════╪═════════╡
│ Manager        ┆ 2       │
│ Analyst        ┆ 2       │
│ Director       ┆ 1       │
│ Senior Manager ┆ 1       │
└────────────────┴─────────┘


In [26]:
#writing to file
df3.write_parquet('employee_counts.parquet')

In [27]:
#joins
df1 = pl.DataFrame({'user_id': ['a','b','c','d'], 'curr_salary': [20, 22, 24,18]})
df2 = pl.DataFrame({'user_id': ['c','d','e','f'], 'new_salary': [21,19,26,32]})

df3 = df1.join(df2, on='user_id',how='left')
print(df3)

shape: (4, 3)
┌─────────┬─────────────┬────────────┐
│ user_id ┆ curr_salary ┆ new_salary │
│ ---     ┆ ---         ┆ ---        │
│ str     ┆ i64         ┆ i64        │
╞═════════╪═════════════╪════════════╡
│ a       ┆ 20          ┆ null       │
│ b       ┆ 22          ┆ null       │
│ c       ┆ 24          ┆ 21         │
│ d       ┆ 18          ┆ 19         │
└─────────┴─────────────┴────────────┘
