# Group-By in Polars

In [1]:
import polars as pl
import pandas as pd
import numpy as np
import pyarrow

import matplotlib.pyplot as plt
import seaborn as sns


# Download Dataset or run in a kaggle notebook
# https://www.kaggle.com/datasets/mkechinov/ecommerce-behavior-data-from-multi-category-store
df = pl.read_csv("2019-Nov.csv")

In [2]:
df.head()

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
str,str,i64,i64,str,str,f64,i64,str
"""2019-11-01 00:...","""view""",1003461,2053013555631882655,"""electronics.sm...","""xiaomi""",489.07,520088904,"""4d3b30da-a5e4-..."
"""2019-11-01 00:...","""view""",5000088,2053013566100866035,"""appliances.sew...","""janome""",293.65,530496790,"""8e5f4f83-366c-..."
"""2019-11-01 00:...","""view""",17302664,2053013553853497655,,"""creed""",28.31,561587266,"""755422e7-9040-..."
"""2019-11-01 00:...","""view""",3601530,2053013563810775923,"""appliances.kit...","""lg""",712.87,518085591,"""3bfb58cd-7892-..."
"""2019-11-01 00:...","""view""",1004775,2053013555631882655,"""electronics.sm...","""xiaomi""",183.27,558856683,"""313628f1-68b8-..."


## 1. Groupby - pl.count (one group)

In [3]:
df.groupby("brand").agg([
    pl.count()
]).head()

brand,count
str,u32
"""shivaki""",58516
"""oris""",6299
"""derspur""",1401
"""rode""",447
"""worth""",66


In [4]:
# sorting in polars
df.groupby("brand").agg([
    pl.count()
]).sort("count", reverse=True).head()

brand,count
str,u32
,9218235
"""samsung""",7889245
"""apple""",6259379
"""xiaomi""",4638062
"""huawei""",1410126


In [5]:
# aggregate in polars and convert to pandas for sorting the aggregation
df_count = df.groupby("brand").agg([
    pl.count()
]).to_pandas().sort_values("count", ascending=False)

print(df_count.shape)

df_count.head()

(4202, 2)


Unnamed: 0,brand,count
4109,,9218235
3390,samsung,7889245
368,apple,6259379
3518,xiaomi,4638062
1837,huawei,1410126


In [6]:
# Compute count and percent of instances
(
    df.groupby("brand").agg([
        pl.count()
    ])
    .sort('count', reverse=True)
    .with_columns([
        (pl.col("count") / pl.col("count").sum()).alias("n_pct")
    ])
    .head(10)
)

brand,count,n_pct
str,u32,f64
,9218235,0.136562
"""samsung""",7889245,0.116874
"""apple""",6259379,0.092729
"""xiaomi""",4638062,0.06871
"""huawei""",1410126,0.02089
"""lucente""",1185075,0.017556
"""lg""",1096990,0.016251
"""bosch""",975059,0.014445
"""oppo""",811698,0.012025
"""sony""",798457,0.011829


In [7]:
df.sample(5)

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
str,str,i64,i64,str,str,f64,i64,str
"""2019-11-28 07:...","""view""",1005158,2053013555631882655,"""electronics.sm...","""xiaomi""",282.12,544074209,"""1ca11c10-92ac-..."
"""2019-11-18 05:...","""view""",7201910,2053013553165631753,,"""lego""",37.3,541866488,"""2ac22c2c-8d11-..."
"""2019-11-01 03:...","""view""",5700384,2053013553970938175,"""auto.accessori...","""pioneer""",140.31,526141854,"""147b516f-66d1-..."
"""2019-11-15 01:...","""view""",1005115,2053013555631882655,"""electronics.sm...","""apple""",914.0,518773118,"""564875fa-e866-..."
"""2019-11-15 15:...","""view""",1004209,2053013555631882655,"""electronics.sm...","""samsung""",88.67,529519097,"""a19edc9b-9917-..."


### Value Counts works as expected

In [9]:
# this works!
df['event_type'].value_counts()

# More verbose version with groupby
df.groupby("event_type").agg(pl.count())

event_type,count
str,u32
"""purchase""",916939
"""view""",63556110
"""cart""",3028930


## 2. Groupby - pl.count (two groups)

In [10]:
df_brand_event = df.groupby(["brand", "event_type"]).agg([
    pl.count()
])

df_brand_event.head()

brand,event_type,count
str,str,u32
"""besty""","""view""",3283
"""xgimi""","""view""",1890
"""neptun""","""view""",306
"""evrodetal""","""purchase""",21
"""forward""","""cart""",142


In [11]:
# Convert from long to wide format
df_brand_event_wide = df_brand_event.pivot(values="count", 
                                           index=["brand"], 
                                           columns=['event_type'])
df_brand_event_wide.head()

brand,view,purchase,cart
str,u32,u32,u32
"""besty""",3283,24.0,105.0
"""xgimi""",1890,15.0,66.0
"""neptun""",306,,
"""evrodetal""",7804,21.0,138.0
"""forward""",3931,26.0,142.0


In [12]:
# Pivot and sorting
df_brand_event_wide = ( 
    df_brand_event
    .pivot(values="count", index=["brand"], columns=['event_type'])
    .sort("purchase", reverse=True)
)

df_brand_event_wide.head()

brand,view,purchase,cart
str,u32,u32,u32
"""samsung""",7091998,200027,597220
"""apple""",5603650,166064,489665
,8886720,73273,258242
"""xiaomi""",4309751,68292,260019
"""huawei""",1312663,23703,73760


In [13]:
# Keep original columns and compute % values
agg_performance = df_brand_event_wide.with_columns([
    (pl.col("cart") / pl.col("view")).alias("cart_by_views"),
    (pl.col("purchase") / pl.col("cart")).alias("buy_by_cart"),
    (pl.col("purchase") / pl.col("view")).alias("buy_by_views"),
])
agg_performance.head()

brand,view,purchase,cart,cart_by_views,buy_by_cart,buy_by_views
str,u32,u32,u32,f64,f64,f64
"""samsung""",7091998,200027,597220,0.08421,0.33493,0.028205
"""apple""",5603650,166064,489665,0.087383,0.339138,0.029635
,8886720,73273,258242,0.029059,0.283738,0.008245
"""xiaomi""",4309751,68292,260019,0.060333,0.262642,0.015846
"""huawei""",1312663,23703,73760,0.056191,0.321353,0.018057


In [14]:
agg_performance.filter(pl.col("buy_by_cart") < 1)\
.select(['cart_by_views', 'buy_by_cart', 'buy_by_views'])\
.to_pandas().quantile([0, 0.25, 0.5, 0.75, 0.99, 1])

# .boxplot()

Unnamed: 0,cart_by_views,buy_by_cart,buy_by_views
0.0,0.001115,0.02681,0.000353
0.25,0.01464,0.210685,0.003704
0.5,0.024419,0.279579,0.006616
0.75,0.03722,0.343597,0.010874
0.99,0.121112,0.75,0.043861
1.0,0.4,0.9,0.2


In [None]:
df.head()

## 3. Group By - UserId + 2 Groups

In [15]:
# groupby 
# %%timeit
# 8.72 s ± 246 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

df_user_brand_event = df.groupby(["user_id", "brand", "event_type"]).agg([
    pl.count()
])

df_user_brand_event.head()

user_id,brand,event_type,count
i64,str,str,u32
552111808,"""apple""","""view""",2
538399224,"""nokia""","""view""",1
554511060,"""bosch""","""view""",6
515375111,"""payot""","""view""",1
563928304,"""apple""","""cart""",2


In [16]:
df_user_brand_event.shape

(17798395, 4)

In [17]:
# pivot and sort
df_user_brand_event = (df_user_brand_event
    .pivot(values="count", index=["user_id", "brand"], columns=['event_type'])
    .sort("purchase", reverse=True)
)
df_user_brand_event.shape

(15905970, 5)

In [18]:
df_user_brand_event.head()

user_id,brand,view,cart,purchase
i64,str,u32,u32,u32
564068124,"""samsung""",634,431,453
518514099,"""apple""",314,206,190
549030056,"""samsung""",313,300,180
543128872,"""apple""",239,161,154
549109608,"""samsung""",307,184,154


In [19]:
df_user_brand_event.sample(10)

user_id,brand,view,cart,purchase
i64,str,u32,u32,u32
515799072,"""vitek""",2,1.0,
516363897,"""xiaomi""",55,,
578708569,"""giorgioarmani""",1,,
514489669,"""oneplus""",2,,
555238866,"""willmark""",1,,
517923794,"""jbl""",3,,
549669920,"""haier""",1,,
518042667,"""casio""",2,,
567725178,"""rondell""",5,,
559073920,"""element""",8,,


In [20]:
df_user_brand_event = df_user_brand_event.with_columns([
    pl.col('purchase').fill_null(strategy="zero"),
    pl.col('view').fill_null(strategy="zero"),
    pl.col('cart').fill_null(strategy="zero"),
])

In [21]:
df_user_brand_event.sample(10)

user_id,brand,view,cart,purchase
i64,str,u32,u32,u32
541978247,"""lanvin""",1,0,0
516056866,"""samsung""",3,0,0
551017434,"""rowenta""",1,0,0
579916513,"""apple""",2,0,0
561227529,"""bosch""",6,0,0
549667503,"""arg""",1,0,0
521552552,"""philips""",1,0,0
513035989,"""samsung""",2,0,0
512869754,"""maribel""",14,1,1
570056543,"""greyder""",1,0,0


In [22]:
# Looks like these user-ids are distributors (nobody need 400 samsung devices)
df_user_brand_event.head(10)

user_id,brand,view,cart,purchase
i64,str,u32,u32,u32
564068124,"""samsung""",634,431,453
518514099,"""apple""",314,206,190
549030056,"""samsung""",313,300,180
543128872,"""apple""",239,161,154
549109608,"""samsung""",307,184,154
567928887,"""samsung""",174,217,118
521230795,"""apple""",315,195,118
513230794,"""lg""",334,102,116
543312954,"""samsung""",302,129,115
538473314,"""samsung""",639,178,114


In [23]:
df_user_brand_event = df_user_brand_event.with_columns(
    (pl.col("purchase") / pl.col("view")).alias("pct_buy_views")
)

In [24]:
df_user_brand_event.head()

user_id,brand,view,cart,purchase,pct_buy_views
i64,str,u32,u32,u32,f64
564068124,"""samsung""",634,431,453,0.714511
518514099,"""apple""",314,206,190,0.605096
549030056,"""samsung""",313,300,180,0.57508
543128872,"""apple""",239,161,154,0.644351
549109608,"""samsung""",307,184,154,0.501629


In [25]:
df_user_brand_event = df_user_brand_event.with_columns(
    pl.when(pl.col("pct_buy_views").is_infinite())
      .then(None)
      .otherwise(pl.col("pct_buy_views"))
)

In [26]:
df_user_brand_event.sample(5)

user_id,brand,view,cart,purchase,pct_buy_views,literal
i64,str,u32,u32,u32,f64,f64
523994890,"""chicco""",22,0,0,0.0,0.0
544424277,"""samsung""",1,0,0,0.0,0.0
566059853,"""huawei""",1,0,0,0.0,0.0
524404734,,8,0,0,0.0,0.0
556587564,,2,0,0,0.0,0.0
