# Polars
---
Practical Intorduction

https://towardsdatascience.com/practical-introduction-to-polars-8d9cdca350f1

Further reading:
How to use LazyFrames and Pipe method.

In [0]:
from dataclasses import dataclass
from datetime import datetime, timedelta
from random import choice, gauss, randrange, seed
from typing import Any, Dict
import sys

import polars as pl
import pandas as pd
import numpy as np

#### Generating dummy data

In [0]:
seed(42)
base_time = datetime(2024, 8, 31, 0, 0, 0, 0)

user_actions_data = [
    {
        "OnlineStore": choice(["Shop1", "Shop2", "Shop3"]),
        "product": choice(["0001", "0002", "0003"]),
        "quantity": choice([1.0, 2.0, 3.0]),
        "Action type": ("purchase" if gauss(0,1) > 0.6 else "view"),
        "Action_time": base_time - timedelta(minutes=randrange(1_000_000)),
    }
    for x in range(1_000_000)
]

corrupted_data = [
    {
        "OnlineStore": choice(["Shop1", "Shop2", "Shop3"]),
        "product": choice(["0001", None]),
        "quantity": choice([1.0, None]),
        "Action type": ("purchase" if gauss(0,1) > 0.6 else "view"),
        "Action_time": base_time - timedelta(minutes=randrange(1_000)),
    }
    for x in range(1_000)
]

product_catalog_data = {"product_id": ["0001", "0002", "0003"], "price": [100, 25, 80]}


#### Loading data into a dataframe

In [0]:
# Pandas
user_actions_pd_df = pd.DataFrame(user_actions_data)
corrupted_pd_df = pd.DataFrame(corrupted_data)
product_catalog_pd_df = pd.DataFrame(product_catalog_data)

# Polars
user_actions_pl_df = pl.DataFrame(user_actions_data)
corrupted_pl_df = pl.DataFrame(corrupted_data)
product_catalog_pl_df = pl.DataFrame(product_catalog_data)

In [0]:
product_catalog_pd_df

Unnamed: 0,product_id,price
0,1,100
1,2,25
2,3,80


In [0]:
product_catalog_pl_df

product_id,price
str,i64
"""0001""",100
"""0002""",25
"""0003""",80


#### Concatenate Dataframes

In [0]:
# Pandas
user_actions_pd_df = pd.concat([user_actions_pd_df, corrupted_pd_df])

# Polars
user_actions_pl_df = pl.concat([user_actions_pl_df, corrupted_pl_df])

#### Summary Statistics
---
Results may differ. Polars describe() output is not stable and is not recommended.

In [0]:
# Pandas
user_actions_pd_df.describe(include='all')

Unnamed: 0,OnlineStore,product,quantity,Action type,Action_time
count,1001000,1000492.0,1000510.0,1001000,1001000
unique,3,3.0,,2,632335
top,Shop3,1.0,,view,2024-08-30 22:02:00
freq,333931,333963.0,,726623,9
first,,,,,2022-10-06 13:23:00
last,,,,,2024-08-30 23:58:00
mean,,,1.998925,,
std,,,0.8164457,,
min,,,1.0,,
25%,,,1.0,,


In [0]:
# Polars
user_actions_pl_df.describe()

statistic,OnlineStore,product,quantity,Action type,Action_time
str,str,str,f64,str,str
"""count""","""1001000""","""1000492""",1000510.0,"""1001000""","""1001000"""
"""null_count""","""0""","""508""",490.0,"""0""","""0"""
"""mean""",,,1.998925,,"""2023-09-19 03:24:30.981698"""
"""std""",,,0.816446,,
"""min""","""Shop1""","""1""",1.0,"""purchase""","""2022-10-06 13:23:00"""
"""25%""",,,1.0,,"""2023-03-29 03:09:00"""
"""50%""",,,2.0,,"""2023-09-19 06:49:00"""
"""75%""",,,3.0,,"""2024-03-11 03:01:00"""
"""max""","""Shop3""","""3""",3.0,"""view""","""2024-08-30 23:58:00"""


#### value_counts Method

In [0]:
# Pandas
user_actions_pd_df.quantity.value_counts(dropna=False)

In [0]:
# Polars
user_actions_pl_df.select(pl.col('quantity').value_counts())

quantity
struct[2]
"{1.0,334000}"
"{2.0,333586}"
"{null,490}"
"{3.0,332924}"


#### Retrieve first and last 5 values

In [0]:
# Pandas
user_actions_pd_df.head()
user_actions_pd_df.tail()

Unnamed: 0,OnlineStore,product,quantity,Action type,Action_time
995,Shop3,1.0,,purchase,2024-08-30 20:38:00
996,Shop2,,1.0,view,2024-08-30 18:21:00
997,Shop1,,,view,2024-08-30 18:18:00
998,Shop1,1.0,1.0,view,2024-08-30 12:50:00
999,Shop2,1.0,1.0,view,2024-08-30 10:50:00


In [0]:
# Polars
user_actions_pl_df.head()
user_actions_pl_df.tail()

OnlineStore,product,quantity,Action type,Action_time
str,str,f64,str,datetime[μs]
"""Shop3""","""0001""",,"""purchase""",2024-08-30 20:38:00
"""Shop2""",,1.0,"""view""",2024-08-30 18:21:00
"""Shop1""",,,"""view""",2024-08-30 18:18:00
"""Shop1""","""0001""",1.0,"""view""",2024-08-30 12:50:00
"""Shop2""","""0001""",1.0,"""view""",2024-08-30 10:50:00


In [0]:
# Polars
## Glimpse provides a dense preview of the DataFrame. Available in Polars only.
user_actions_pl_df.glimpse()

#### Rename Columns

In [0]:
# Pandas
user_actions_pd_df = user_actions_pd_df.rename(
    columns={
        "OnlineStore": "online_store",
        "product": "product_id",
        "Action type": "action_type",
        "Action_time": "action_dt",
    }
)

In [0]:
# Polars
user_actions_pl_df = user_actions_pl_df.rename(
    {
        "OnlineStore": "online_store",
        "product": "product_id",
        "Action type": "action_type",
        "Action_time": "action_dt",
    }
)

#### Changing Column Data Types

In [0]:
# Pandas
user_actions_pd_df = user_actions_pd_df.astype({"quantity": "Int64"})

In [0]:
# Polars
user_actions_pl_df = user_actions_pl_df.cast({"quantity": pl.Int32})

#### Checking Dataframe Size

In [0]:
# getsizeof returns the size in bytes. Divide by 1_048_576 to convert it into MB.
# getsizeof wont work for polars because it returns the size of the wrapping object, not the actual dataframe. For polars you should use the estimated_size('mb') method.

print('Pandas:', sys.getsizeof(user_actions_pd_df) / 1_048_576)
print('Polars:', sys.getsizeof(user_actions_pl_df) / 1_048_576)

In [0]:
# Pandas
user_actions_pd_df.info(memory_usage='deep')

In [0]:
# Polars
user_actions_pl_df.estimated_size("mb")

#### Filling Missing Values

In [0]:
# Pandas
user_actions_pd_df["quantity"] = user_actions_pd_df["quantity"].fillna(0) # You can also use inplace=True
user_actions_pd_df['quantity'].value_counts()

In [0]:
# Polars
user_actions_pl_df = user_actions_pl_df.with_columns(pl.col("quantity").fill_null(0)) # Similar syntax to PySpark
user_actions_pl_df.select(pl.col('quantity'))

quantity
i32
1
3
3
3
3
…
0
1
0
1


#### Removing Missing Values

In [0]:
# Pandas
user_actions_pd_df = user_actions_pd_df.dropna(subset=["product_id"])
user_actions_pd_df['quantity'].sum()

In [0]:
# Polars
user_actions_pl_df = user_actions_pl_df.drop_nulls(subset=["product_id"]) # Notice that Polars returns a dataframe, unlike pandas which returns a series or dataframe depending on the method.
user_actions_pl_df.select(pl.col('quantity')).sum()

quantity
i32
1999679


#### Remove Duplicates

In [0]:
# Pandas
user_actions_pd_df = user_actions_pd_df.drop_duplicates(subset=["online_store", "action_type", "action_dt"], keep="last")
user_actions_pd_df.shape

In [0]:
# Polars
user_actions_pl_df = user_actions_pl_df.unique(subset=["online_store", "action_type", "action_dt"], keep="last")
user_actions_pl_df.shape

#### Selecting Columns

In [0]:
# Pandas
user_actions_pd_df.quantity # Returns a Series
user_actions_pd_df['quantity'] # Returns a Series
user_actions_pd_df[['quantity']] # Returns a DataFrame
user_actions_pd_df[['quantity', 'product_id']]

Unnamed: 0,quantity,product_id
0,1,0001
1,3,0001
2,3,0001
3,3,0003
4,3,0001
...,...,...
991,1,0001
994,1,0001
995,0,0001
998,1,0001


In [0]:
# Polars
# user_actions_pl_df.quantity >> Throws an error, dataframes dont store columns as attributes, as in pandas.
user_actions_pl_df.select('quantity') # Returns DataFrame
user_actions_pl_df.select(pl.col('quantity')) # Returns a dtaframs, same as above
user_actions_pl_df.select('quantity', 'product_id') 

quantity,product_id
i32,str
3,"""0002"""
2,"""0003"""
2,"""0002"""
3,"""0003"""
3,"""0002"""
…,…
1,"""0002"""
3,"""0001"""
1,"""0003"""
2,"""0001"""


#### Filter Rows

In [0]:
user_actions_pd_df.loc[(user_actions_pd_df.online_store == 'Shop1') & (user_actions_pd_df.action_type == 'view')]

Unnamed: 0,online_store,product_id,quantity,action_type,action_dt
3,Shop1,0003,3,view,2024-08-11 16:16:00
6,Shop1,0001,3,view,2024-03-27 05:08:00
8,Shop1,0002,2,view,2023-03-18 15:01:00
12,Shop1,0001,3,view,2024-07-03 21:33:00
25,Shop1,0003,2,view,2024-06-25 19:15:00
...,...,...,...,...,...
956,Shop1,0001,0,view,2024-08-30 09:21:00
961,Shop1,0001,0,view,2024-08-30 12:29:00
969,Shop1,0001,0,view,2024-08-30 23:25:00
994,Shop1,0001,1,view,2024-08-30 12:36:00


In [0]:
user_actions_pl_df.filter((pl.col('online_store')=='Shop1') & (pl.col('action_type')=='view'))

online_store,product_id,quantity,action_type,action_dt
str,str,i32,str,datetime[μs]
"""Shop1""","""0002""",3,"""view""",2023-12-11 03:48:00
"""Shop1""","""0001""",2,"""view""",2024-04-29 20:15:00
"""Shop1""","""0002""",1,"""view""",2024-03-10 08:14:00
"""Shop1""","""0001""",2,"""view""",2022-12-26 06:16:00
"""Shop1""","""0002""",2,"""view""",2023-08-18 07:29:00
…,…,…,…,…
"""Shop1""","""0002""",3,"""view""",2024-08-27 19:09:00
"""Shop1""","""0003""",2,"""view""",2022-12-04 13:36:00
"""Shop1""","""0001""",3,"""view""",2023-03-20 20:34:00
"""Shop1""","""0003""",1,"""view""",2024-07-22 06:57:00


#### Group By

In [0]:
# Pandas
user_actions_pd_df = user_actions_pd_df.groupby(["online_store", "product_id", "action_type"]).agg({"quantity": "sum"}).reset_index()
user_actions_pd_df

Unnamed: 0,online_store,product_id,action_type,quantity
0,Shop1,1,purchase,57772
1,Shop1,1,view,143477
2,Shop1,2,purchase,58015
3,Shop1,2,view,142174
4,Shop1,3,purchase,58242
5,Shop1,3,view,143211
6,Shop2,1,purchase,58256
7,Shop2,1,view,143560
8,Shop2,2,purchase,58648
9,Shop2,2,view,143740


In [0]:
# Polars
user_actions_pl_df = user_actions_pl_df.group_by(["online_store", "product_id", "action_type"]).agg(pl.col('quantity').sum())
user_actions_pl_df

online_store,product_id,action_type,quantity
str,str,str,i32
"""Shop2""","""0002""","""view""",143740
"""Shop2""","""0001""","""view""",143560
"""Shop3""","""0001""","""purchase""",57891
"""Shop3""","""0003""","""purchase""",59107
"""Shop2""","""0002""","""purchase""",58648
…,…,…,…
"""Shop3""","""0002""","""view""",142657
"""Shop2""","""0003""","""view""",142230
"""Shop1""","""0002""","""purchase""",58015
"""Shop1""","""0001""","""purchase""",57772


#### Join/Merge

In [0]:
# Pandas
user_actions_pd_df = user_actions_pd_df.merge(product_catalog_pd_df, on='product_id')
user_actions_pd_df

Unnamed: 0,online_store,product_id,action_type,quantity,price
0,Shop1,1,purchase,57772,100
1,Shop1,1,view,143477,100
2,Shop2,1,purchase,58256,100
3,Shop2,1,view,143560,100
4,Shop3,1,purchase,57891,100
5,Shop3,1,view,143685,100
6,Shop1,2,purchase,58015,25
7,Shop1,2,view,142174,25
8,Shop2,2,purchase,58648,25
9,Shop2,2,view,143740,25


In [0]:
# Polars
user_actions_pl_df = user_actions_pl_df.join(product_catalog_pl_df, on='product_id')
user_actions_pl_df

online_store,product_id,action_type,quantity,price
str,str,str,i32,i64
"""Shop2""","""0002""","""view""",143740,25
"""Shop2""","""0001""","""view""",143560,100
"""Shop3""","""0001""","""purchase""",57891,100
"""Shop3""","""0003""","""purchase""",59107,80
"""Shop2""","""0002""","""purchase""",58648,25
…,…,…,…,…
"""Shop3""","""0002""","""view""",142657,25
"""Shop2""","""0003""","""view""",142230,80
"""Shop1""","""0002""","""purchase""",58015,25
"""Shop1""","""0001""","""purchase""",57772,100


#### Calculate a New Column

In [0]:
# Pandas
user_actions_pd_df["total"] = (user_actions_pd_df["price"] * user_actions_pd_df["quantity"])
user_actions_pd_df = user_actions_pd_df[["online_store", "action_type", "total"]]
user_actions_pd_df

Unnamed: 0,online_store,action_type,total
0,Shop1,purchase,5777200
1,Shop1,view,14347700
2,Shop2,purchase,5825600
3,Shop2,view,14356000
4,Shop3,purchase,5789100
5,Shop3,view,14368500
6,Shop1,purchase,1450375
7,Shop1,view,3554350
8,Shop2,purchase,1466200
9,Shop2,view,3593500


In [0]:
# Polars
user_actions_pl_df = user_actions_pl_df.with_columns((pl.col("price") * pl.col("quantity")).alias("total"))
user_actions_pl_df = user_actions_pl_df.select("online_store", "action_type", "total")
user_actions_pl_df

# You can also use the select method to create new columns
user_actions_pl_df = user_actions_pl_df.select(
    "online_store",
    "action_type",
    (pl.col("price") * pl.col("quantity")).alias("total"))

#### Pivot

In [0]:
# Pandas
result_pd = user_actions_pd_df.pivot_table(
    columns="online_store",
    index="action_type",
    values="total",
    aggfunc="sum",
)

In [0]:
# Polars
result_pl = user_actions_pl_df.pivot(
    columns="online_store",
    index="action_type",
    values="total",
    aggregate_function="sum",
)