In [1]:
import pandas as pd
import duckdb

# NOTE(review): NUMEXPR_INSTALLED is not referenced in any visible cell and is imported from
# a module under pandas.core (not part of pandas' public API) — confirm it is needed.
# It must sit at column 0: an indented top-level import raises IndentationError.
from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# SQL setup script for the demo data. This cell only defines the string; nothing is
# executed until the string is passed to duckdb.sql() in a later cell.
# Statements, in order:
#   1. drop the `sales` table if it already exists
#   2. create `sales` with a `date` (date) column and a `sales` (int) column
#   3. insert four daily rows, 2024-01-12 through 2024-01-15
#   4. select every row back, so the script ends with a query
sales_table_creation_query = """
drop table if exists sales;

create table sales (date date, sales int);

insert into sales values
  ('2024-01-12', 500),
  ('2024-01-13', 605),
  ('2024-01-14', 340),
  ('2024-01-15', 509);
SELECT * FROM sales
"""

In [3]:
# Run the setup script and turn the result of its final SELECT into a pandas DataFrame.
sales_relation = duckdb.sql(sales_table_creation_query)
df = sales_relation.df()

# Second, independent frame: the moving-average example adds its column to `df`,
# the running-total example adds its column to `df_copy`.
df_copy = df.copy()

# 1. Moving average

## 1a. SQL

In [4]:
# Trailing moving average of `sales` over a three-row window.
# The frame "ROWS BETWEEN 2 PRECEDING AND CURRENT ROW" covers the current row plus up to
# two earlier rows in date order, so the first two rows average over fewer than three values.
# The ORDER BY inside OVER only orders rows for the window computation; the trailing
# top-level ORDER BY is what fixes the order of the displayed rows.
sql_query = """
    SELECT
        date,
        sales,
        AVG(sales) OVER (ORDER BY date ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS moving_avg
    FROM sales
    ORDER BY date
    """

# Last expression of the cell, so the query result is shown as the cell output.
duckdb.sql(sql_query)

┌────────────┬───────┬───────────────────┐
│    date    │ sales │    moving_avg     │
│    date    │ int32 │      double       │
├────────────┼───────┼───────────────────┤
│ 2024-01-12 │   500 │             500.0 │
│ 2024-01-13 │   605 │             552.5 │
│ 2024-01-14 │   340 │ 481.6666666666667 │
│ 2024-01-15 │   509 │ 484.6666666666667 │
└────────────┴───────┴───────────────────┘

## 1b. Pandas

In [5]:
# pandas counterpart of the SQL moving average: a window of 3 rows, with min_periods=1 so
# the first rows (which have fewer than 3 values available) still get an average.
# rolling() walks rows in their physical order, so the window is computed on a date-sorted
# view to match the SQL `ORDER BY date`. The assignment aligns on the index, so `df` keeps
# its own row order (this relies on the index labels being unique).
df["moving_avg"] = (
    df.sort_values("date", kind="stable")["sales"]
    .rolling(window=3, min_periods=1)
    .mean()
)
df

Unnamed: 0,date,sales,moving_avg
0,2024-01-12,500,500.0
1,2024-01-13,605,552.5
2,2024-01-14,340,481.666667
3,2024-01-15,509,484.666667


# 2. Running total

## 2a. SQL

In [6]:
# Running total of `sales` in date order.
# "ROWS UNBOUNDED PRECEDING" makes the frame run from the first row up to the current row.
# The ORDER BY inside OVER only orders rows for the window computation; the trailing
# top-level ORDER BY is what fixes the order of the displayed rows.
sql_query = """
    SELECT
        date,
        sales,
        SUM(sales) OVER (ORDER BY date ROWS UNBOUNDED PRECEDING) AS running_total_sales
    FROM sales
    ORDER BY date
    """

# Last expression of the cell, so the query result is shown as the cell output.
duckdb.sql(sql_query)

┌────────────┬───────┬─────────────────────┐
│    date    │ sales │ running_total_sales │
│    date    │ int32 │       int128        │
├────────────┼───────┼─────────────────────┤
│ 2024-01-12 │   500 │                 500 │
│ 2024-01-13 │   605 │                1105 │
│ 2024-01-14 │   340 │                1445 │
│ 2024-01-15 │   509 │                1954 │
└────────────┴───────┴─────────────────────┘

## 2b. Pandas

In [7]:
# pandas counterpart of the SQL running total.
# cumsum() accumulates rows in their physical order, so it is computed on a date-sorted
# view to match the SQL `ORDER BY date`. The assignment aligns on the index, so `df_copy`
# keeps its own row order (this relies on the index labels being unique).
df_copy["running_total_sales"] = (
    df_copy.sort_values("date", kind="stable")["sales"].cumsum()
)
df_copy

Unnamed: 0,date,sales,running_total_sales
0,2024-01-12,500,500
1,2024-01-13,605,1105
2,2024-01-14,340,1445
3,2024-01-15,509,1954


# Summary

<img src="assets/window_functions_1.png" alt="Window functions summary" width="1000" />
