In [1]:
import pandas as pd
import duckdb

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Setup script for the demo data: drop any previous `orders` table, recreate it,
# insert six sample orders, and finish with a SELECT over the whole table so the
# rows are available to whoever executes the script.
table_creation_query = """
drop table if exists orders;

create table orders (order_id int, customer_id int, product_category varchar, amount float);

insert into orders values
  (122154, 1, 'tea', 4.5),
  (122453, 2, 'chocolate', 5.0),
  (122476, 1, 'coffee', 4.0),
  (122783, 3, 'tea', 6.0),
  (122378, 1, 'chocolate', 5.0),
  (122157, 2, 'coffee', 5.5)
  ;
  
SELECT * FROM orders"""

In [4]:
# Execute the setup script in DuckDB, then materialise the rows returned by its
# closing SELECT as a pandas DataFrame for the examples that follow.
orders_relation = duckdb.sql(table_creation_query)
df = orders_relation.df()

# 1. SQL

In [5]:
# Per-category summary in SQL: number of orders, total amount and average amount.
# `FROM df` points at the pandas DataFrame `df` created earlier in the notebook;
# no table named `df` is created by the setup script.
sql_query = """ 
    SELECT 
        product_category,
        COUNT(order_id) AS order_count,
        SUM(amount) AS total_amount,
        AVG(amount) AS avg_amount
    FROM df
    GROUP BY product_category
    """

# Left as the cell's last expression so the query result is rendered as output.
duckdb.sql(sql_query)

┌──────────────────┬─────────────┬──────────────┬────────────┐
│ product_category │ order_count │ total_amount │ avg_amount │
│     varchar      │    int64    │    double    │   double   │
├──────────────────┼─────────────┼──────────────┼────────────┤
│ tea              │           2 │         10.5 │       5.25 │
│ chocolate        │           2 │         10.0 │        5.0 │
│ coffee           │           2 │          9.5 │       4.75 │
└──────────────────┴─────────────┴──────────────┴────────────┘

# 2. Pandas

In [6]:
# Dict-style aggregation: mapping a column to a list of functions produces
# two-level column labels (source column, function), as the output below shows.
(
    df
    .groupby(["product_category"])
    .agg({"order_id": "count", "amount": ["sum", "mean"]})
)

Unnamed: 0_level_0,order_id,amount,amount
Unnamed: 0_level_1,count,sum,mean
product_category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
chocolate,2,10.0,5.0
coffee,2,9.5,4.75
tea,2,10.5,5.25


In [7]:
# Named aggregation: every keyword becomes an output column computed from a
# (source column, function) pair. The result comes back with flat, descriptive
# column names that mirror the SQL aliases, so there is no need to rename the
# inner level of a MultiIndex and then drop the outer level afterwards.
df_groupby = df.groupby(["product_category"]).agg(
    order_count=("order_id", "count"),
    total_amount=("amount", "sum"),
    avg_amount=("amount", "mean"),
)
df_groupby

Unnamed: 0_level_0,order_count,total_amount,avg_amount
product_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
chocolate,2,10.0,5.0
coffee,2,9.5,4.75
tea,2,10.5,5.25


# Summary
<img src="assets/groupby.png" width=600 />