In [1]:
# Links used for the tasks below
# GitHub repository:
# https://github.com/anbento0490/code_tutorials

# https://towardsdatascience.com/8-popular-sql-window-functions-replicated-in-python-e17e6b34d5d7

In [2]:
import pandas as pd
# Lift pandas' display limits so cell outputs are not truncated.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
# Use None (not -1) for "no limit": -1 was deprecated in pandas 1.0 and is
# rejected by newer releases.
pd.set_option("display.max_colwidth", None)

  pd.set_option("display.max_colwidth", -1)


In [3]:
# Load the orders file; order_date is parsed as datetimes and used as the index.
orders = pd.read_csv(
    r"customer_orders.csv",
    index_col=['order_date'],
    parse_dates=['order_date'],
)

In [4]:
# Preview the first rows of the loaded frame (order_date is the index at this point).
orders.head()

Unnamed: 0_level_0,order_id,customer_id,item_id,item_price,quantity,amount_paid_gbp
order_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-02-13,ID001,customer_1,item_3,10.0,2,20.0
2019-02-15,ID002,customer_2,item_1,23.5,2,47.0
2019-02-16,ID003,customer_2,item_2,7.5,3,22.5
2019-02-16,ID004,customer_1,item_1,23.5,4,94.0
2019-02-19,ID005,customer_4,item_1,23.5,6,141.0


In [5]:
# Month-level period for every order date (the frame is indexed by order_date here).
pd.DatetimeIndex(orders.index).to_period(freq="M")

PeriodIndex(['2019-02', '2019-02', '2019-02', '2019-02', '2019-02', '2019-02',
             '2019-02', '2019-02', '2019-02', '2019-02', '2019-02', '2019-02',
             '2019-02', '2019-02', '2019-02', '2019-02', '2019-03', '2019-03',
             '2019-03', '2019-03', '2019-03', '2019-03', '2019-03', '2019-03',
             '2019-03', '2019-03', '2019-03', '2019-03', '2019-03', '2019-03',
             '2020-01', '2020-01', '2020-01', '2020-02', '2020-02', '2020-02',
             '2020-02', '2020-03', '2020-03', '2020-03', '2020-03', '2020-03',
             '2020-03', '2020-03', '2020-03', '2020-03', '2020-03', '2020-03',
             '2020-04', '2020-04'],
            dtype='period[M]', name='order_date')

In [6]:
# Add order_month as the first column, derived from the order_date index.
# NOTE: insert() raises if the column already exists, so this cell cannot be
# re-run without reloading `orders` first.
orders.insert(
    loc=0,
    column='order_month',
    value=pd.DatetimeIndex(orders.index).to_period("M"),
)

# Turn the order_date index back into a regular column (mutates `orders` in place).
orders.reset_index(inplace=True)

orders.head()

Unnamed: 0,order_date,order_month,order_id,customer_id,item_id,item_price,quantity,amount_paid_gbp
0,2019-02-13,2019-02,ID001,customer_1,item_3,10.0,2,20.0
1,2019-02-15,2019-02,ID002,customer_2,item_1,23.5,2,47.0
2,2019-02-16,2019-02,ID003,customer_2,item_2,7.5,3,22.5
3,2019-02-16,2019-02,ID004,customer_1,item_1,23.5,4,94.0
4,2019-02-19,2019-02,ID005,customer_4,item_1,23.5,6,141.0


In [7]:
# Number of orders placed by each customer.
(
    orders
    .groupby('customer_id')['order_id']
    .count()
)

customer_id
customer_1    18
customer_2    13
customer_3    7 
customer_4    12
Name: order_id, dtype: int64

In [8]:
# Total amount paid (GBP) by each customer.
(
    orders
    .groupby('customer_id')['amount_paid_gbp']
    .sum()
)

customer_id
customer_1    1291.75
customer_2    761.00 
customer_3    576.75 
customer_4    1205.50
Name: amount_paid_gbp, dtype: float64

### Window functions in Python

In [9]:
# Goal: rank each customer's orders by order date, starting from the oldest one. In SQL we can write:

# row_number() over(partition by customer_id order by order_date)

In [10]:
# pandas equivalent of
#   row_number() over(partition by customer_id order by order_date)
# cumcount() numbers rows in their *current* order within each group, so the
# frame is sorted by order_date first instead of relying on the file's row order.
# mergesort is stable, so same-day orders keep their original relative order.
# assign() aligns the result back onto `orders` by index label (requires the
# unique index that `orders` has here).
(
    orders.assign(
        rnk_cu_cnt = orders.sort_values('order_date', kind='mergesort')
        .groupby('customer_id')
        .cumcount()+1
    ).sort_values(['customer_id', 'rnk_cu_cnt'])
).head(5)

Unnamed: 0,order_date,order_month,order_id,customer_id,item_id,item_price,quantity,amount_paid_gbp,rnk_cu_cnt
0,2019-02-13,2019-02,ID001,customer_1,item_3,10.0,2,20.0,1
3,2019-02-16,2019-02,ID004,customer_1,item_1,23.5,4,94.0,2
7,2019-02-21,2019-02,ID008,customer_1,item_2,7.5,3,22.5,3
8,2019-02-21,2019-02,ID009,customer_1,item_3,10.0,2,20.0,4
10,2019-02-23,2019-02,ID011,customer_1,item_5,35.0,2,70.0,5


In [11]:
# assign() in the cell above returned a new frame that was only displayed,
# so `orders` itself still has no rnk_cu_cnt column.
orders.head()

Unnamed: 0,order_date,order_month,order_id,customer_id,item_id,item_price,quantity,amount_paid_gbp
0,2019-02-13,2019-02,ID001,customer_1,item_3,10.0,2,20.0
1,2019-02-15,2019-02,ID002,customer_2,item_1,23.5,2,47.0
2,2019-02-16,2019-02,ID003,customer_2,item_2,7.5,3,22.5
3,2019-02-16,2019-02,ID004,customer_1,item_1,23.5,4,94.0
4,2019-02-19,2019-02,ID005,customer_4,item_1,23.5,6,141.0


In [12]:
# 1. ROW_NUMBER() --> .rank(method='first')
# The same numbering can be produced with .rank(): method='first' breaks ties
# by order of appearance, so every row within a customer gets a distinct value.
# rank() returns floats, hence 1.0, 2.0, ... in the row_num column.
orders = (
    orders
    .assign(row_num=orders.groupby('customer_id')['order_date'].rank(method='first'))
    .sort_values(by=['customer_id', 'order_date'])
    .reset_index(drop=True)  # renumber rows 0..n-1 after the sort
)

In [13]:
# Display the frame after adding row_num and re-sorting by customer_id / order_date.
orders

Unnamed: 0,order_date,order_month,order_id,customer_id,item_id,item_price,quantity,amount_paid_gbp,row_num
0,2019-02-13,2019-02,ID001,customer_1,item_3,10.0,2,20.0,1.0
1,2019-02-16,2019-02,ID004,customer_1,item_1,23.5,4,94.0,2.0
2,2019-02-21,2019-02,ID008,customer_1,item_2,7.5,3,22.5,3.0
3,2019-02-21,2019-02,ID009,customer_1,item_3,10.0,2,20.0,4.0
4,2019-02-23,2019-02,ID011,customer_1,item_5,35.0,2,70.0,5.0
5,2019-02-26,2019-02,ID014,customer_1,item_1,23.5,4,94.0,6.0
6,2019-03-13,2019-03,ID019,customer_1,item_1,23.5,7,164.5,7.0
7,2019-03-14,2019-03,ID020,customer_1,item_2,7.5,6,45.0,8.0
8,2019-03-15,2019-03,ID021,customer_1,item_2,7.5,12,90.0,9.0
9,2019-03-22,2019-03,ID025,customer_1,item_1,23.5,10,235.0,10.0


In [14]:
# To number rows in descending order instead (most recent order first), i.e.
#   row_number() over(partition by customer_id order by order_date desc)
# pass ascending=False to .rank(). The result is only displayed, not stored.
(
    orders
    .assign(
        by_rank_func=(
            orders
            .groupby('customer_id')['order_date']
            .rank(method='first', ascending=False)
        )
    )
    .sort_values(by=['customer_id', 'order_date'])
)

Unnamed: 0,order_date,order_month,order_id,customer_id,item_id,item_price,quantity,amount_paid_gbp,row_num,by_rank_func
0,2019-02-13,2019-02,ID001,customer_1,item_3,10.0,2,20.0,1.0,18.0
1,2019-02-16,2019-02,ID004,customer_1,item_1,23.5,4,94.0,2.0,17.0
2,2019-02-21,2019-02,ID008,customer_1,item_2,7.5,3,22.5,3.0,15.0
3,2019-02-21,2019-02,ID009,customer_1,item_3,10.0,2,20.0,4.0,16.0
4,2019-02-23,2019-02,ID011,customer_1,item_5,35.0,2,70.0,5.0,14.0
5,2019-02-26,2019-02,ID014,customer_1,item_1,23.5,4,94.0,6.0,13.0
6,2019-03-13,2019-03,ID019,customer_1,item_1,23.5,7,164.5,7.0,12.0
7,2019-03-14,2019-03,ID020,customer_1,item_2,7.5,6,45.0,8.0,11.0
8,2019-03-15,2019-03,ID021,customer_1,item_2,7.5,12,90.0,9.0,10.0
9,2019-03-22,2019-03,ID025,customer_1,item_1,23.5,10,235.0,10.0,9.0


In [15]:
# `orders` is unchanged by the cell above: by_rank_func was added only to the
# displayed copy returned by assign().
orders.head()

Unnamed: 0,order_date,order_month,order_id,customer_id,item_id,item_price,quantity,amount_paid_gbp,row_num
0,2019-02-13,2019-02,ID001,customer_1,item_3,10.0,2,20.0,1.0
1,2019-02-16,2019-02,ID004,customer_1,item_1,23.5,4,94.0,2.0
2,2019-02-21,2019-02,ID008,customer_1,item_2,7.5,3,22.5,3.0
3,2019-02-21,2019-02,ID009,customer_1,item_3,10.0,2,20.0,4.0
4,2019-02-23,2019-02,ID011,customer_1,item_5,35.0,2,70.0,5.0


In [16]:
# Partitioning by more than one column (customer ID and order month).
# SQL syntax:
#   row_number() over(partition by customer_id, order_month order by order_date)

# Python syntax: pass the list of partition columns to groupby().
# NOTE(review): the column name 'doule_part' looks like a typo for 'double_part';
# kept as is so the displayed output still matches.
(
    orders.assign(
        doule_part=(
            orders
            .groupby(['customer_id', 'order_month'])['order_date']
            .rank(method='first')
        )
    )
)

Unnamed: 0,order_date,order_month,order_id,customer_id,item_id,item_price,quantity,amount_paid_gbp,row_num,doule_part
0,2019-02-13,2019-02,ID001,customer_1,item_3,10.0,2,20.0,1.0,1.0
1,2019-02-16,2019-02,ID004,customer_1,item_1,23.5,4,94.0,2.0,2.0
2,2019-02-21,2019-02,ID008,customer_1,item_2,7.5,3,22.5,3.0,3.0
3,2019-02-21,2019-02,ID009,customer_1,item_3,10.0,2,20.0,4.0,4.0
4,2019-02-23,2019-02,ID011,customer_1,item_5,35.0,2,70.0,5.0,5.0
5,2019-02-26,2019-02,ID014,customer_1,item_1,23.5,4,94.0,6.0,6.0
6,2019-03-13,2019-03,ID019,customer_1,item_1,23.5,7,164.5,7.0,1.0
7,2019-03-14,2019-03,ID020,customer_1,item_2,7.5,6,45.0,8.0,2.0
8,2019-03-15,2019-03,ID021,customer_1,item_2,7.5,12,90.0,9.0,3.0
9,2019-03-22,2019-03,ID025,customer_1,item_1,23.5,10,235.0,10.0,4.0


In [26]:
# 2. RANK() → .rank(method='min')

# The SQL RANK() function assigns a rank to each row within a partition of a
# result set. Unlike ROW_NUMBER(), the ranks are not necessarily sequential:
# rows in a partition that share the same value receive the same rank, and the
# following rank is skipped.

# SQL syntax:
#   rank() over(partition by customer_id order by order_date)

# Python syntax: method='min' gives tied rows the lowest rank of their group.
orders = orders.assign(
    rnk=orders.groupby(['customer_id'])['order_date'].rank(method='min')
)

In [27]:
# Preview `orders` with the newly stored rnk column.
orders.head()

Unnamed: 0,order_date,order_month,order_id,customer_id,item_id,item_price,quantity,amount_paid_gbp,row_num,rnk
0,2019-02-13,2019-02,ID001,customer_1,item_3,10.0,2,20.0,1.0,1.0
1,2019-02-16,2019-02,ID004,customer_1,item_1,23.5,4,94.0,2.0,2.0
2,2019-02-21,2019-02,ID008,customer_1,item_2,7.5,3,22.5,3.0,3.0
3,2019-02-21,2019-02,ID009,customer_1,item_3,10.0,2,20.0,4.0,3.0
4,2019-02-23,2019-02,ID011,customer_1,item_5,35.0,2,70.0,5.0,5.0


In [28]:
# (
#     orders.assign(
#         temp = orders.sort_values(['order_date'])
#         .groupby('customer_id')
#         .cumcount()+1
#     ).sort_values(['customer_id', 'order_month', 'temp'])
# )

In [29]:
# Wider preview: the first 10 rows, including the row_num and rnk columns.
orders.head(10)

Unnamed: 0,order_date,order_month,order_id,customer_id,item_id,item_price,quantity,amount_paid_gbp,row_num,rnk
0,2019-02-13,2019-02,ID001,customer_1,item_3,10.0,2,20.0,1.0,1.0
1,2019-02-16,2019-02,ID004,customer_1,item_1,23.5,4,94.0,2.0,2.0
2,2019-02-21,2019-02,ID008,customer_1,item_2,7.5,3,22.5,3.0,3.0
3,2019-02-21,2019-02,ID009,customer_1,item_3,10.0,2,20.0,4.0,3.0
4,2019-02-23,2019-02,ID011,customer_1,item_5,35.0,2,70.0,5.0,5.0
5,2019-02-26,2019-02,ID014,customer_1,item_1,23.5,4,94.0,6.0,6.0
6,2019-03-13,2019-03,ID019,customer_1,item_1,23.5,7,164.5,7.0,7.0
7,2019-03-14,2019-03,ID020,customer_1,item_2,7.5,6,45.0,8.0,8.0
8,2019-03-15,2019-03,ID021,customer_1,item_2,7.5,12,90.0,9.0,9.0
9,2019-03-22,2019-03,ID025,customer_1,item_1,23.5,10,235.0,10.0,10.0


In [30]:
# 3. DENSE_RANK() → .rank(method='dense')
# To avoid gaps in the ranking values, use the SQL dense_rank() function instead.
# Unlike rank(), dense_rank() returns consecutive rank values.
# In our case, the SQL syntax would be:
#   dense_rank() over(partition by customer_id order by order_date)

(
    orders.assign(
        dense=(
            orders
            .groupby(['customer_id'])['order_date']
            .rank(method='dense')
        )
    )
)

Unnamed: 0,order_date,order_month,order_id,customer_id,item_id,item_price,quantity,amount_paid_gbp,row_num,rnk,dense
0,2019-02-13,2019-02,ID001,customer_1,item_3,10.0,2,20.0,1.0,1.0,1.0
1,2019-02-16,2019-02,ID004,customer_1,item_1,23.5,4,94.0,2.0,2.0,2.0
2,2019-02-21,2019-02,ID008,customer_1,item_2,7.5,3,22.5,3.0,3.0,3.0
3,2019-02-21,2019-02,ID009,customer_1,item_3,10.0,2,20.0,4.0,3.0,3.0
4,2019-02-23,2019-02,ID011,customer_1,item_5,35.0,2,70.0,5.0,5.0,4.0
5,2019-02-26,2019-02,ID014,customer_1,item_1,23.5,4,94.0,6.0,6.0,5.0
6,2019-03-13,2019-03,ID019,customer_1,item_1,23.5,7,164.5,7.0,7.0,6.0
7,2019-03-14,2019-03,ID020,customer_1,item_2,7.5,6,45.0,8.0,8.0,7.0
8,2019-03-15,2019-03,ID021,customer_1,item_2,7.5,12,90.0,9.0,9.0,8.0
9,2019-03-22,2019-03,ID025,customer_1,item_1,23.5,10,235.0,10.0,10.0,9.0


In [35]:
# 4. SUM(…) OVER(PARTITION BY … ORDER BY … ROWS UNBOUNDED PRECEDING) → cumsum()
# We now wish to compute the cumulative sum of the amount paid by each customer,
# in each month, sorted by order date.
# This calculation is also known as a running total and it's probably one of the
# most used metrics in business analytics.
# One way to achieve this in SQL is:
#   sum(amount_paid) over(partition by customer_id, order_month order by order_date rows unbounded preceding)

# cumsum() accumulates in the frame's current row order, so the ORDER BY is made
# explicit with a stable sort (same-day orders keep their relative order) rather
# than depending on how an earlier cell happened to leave `orders` sorted.
# assign() aligns the result back onto `orders` by index label.
(
    orders.assign(
        cumu_sum = orders.sort_values('order_date', kind='mergesort')
        .groupby(['customer_id', 'order_month'])['amount_paid_gbp']
        .cumsum()
    )
)

Unnamed: 0,order_date,order_month,order_id,customer_id,item_id,item_price,quantity,amount_paid_gbp,row_num,rnk,cumu_sum
0,2019-02-13,2019-02,ID001,customer_1,item_3,10.0,2,20.0,1.0,1.0,20.0
1,2019-02-16,2019-02,ID004,customer_1,item_1,23.5,4,94.0,2.0,2.0,114.0
2,2019-02-21,2019-02,ID008,customer_1,item_2,7.5,3,22.5,3.0,3.0,136.5
3,2019-02-21,2019-02,ID009,customer_1,item_3,10.0,2,20.0,4.0,3.0,156.5
4,2019-02-23,2019-02,ID011,customer_1,item_5,35.0,2,70.0,5.0,5.0,226.5
5,2019-02-26,2019-02,ID014,customer_1,item_1,23.5,4,94.0,6.0,6.0,320.5
6,2019-03-13,2019-03,ID019,customer_1,item_1,23.5,7,164.5,7.0,7.0,164.5
7,2019-03-14,2019-03,ID020,customer_1,item_2,7.5,6,45.0,8.0,8.0,209.5
8,2019-03-15,2019-03,ID021,customer_1,item_2,7.5,12,90.0,9.0,9.0,299.5
9,2019-03-22,2019-03,ID025,customer_1,item_1,23.5,10,235.0,10.0,10.0,534.5


In [41]:
# 5. AVG(…) OVER(PARTITION BY …) → .transform('mean')
# In a similar fashion, we may also wish to compute the average amount spent by
# each customer per month.
# This time the SQL syntax is quite intuitive:
#   avg(amount_paid) over(partition by customer_id, order_month)

# transform() returns one value per input row (the group mean repeated), so the
# result can be attached as a column.
(
    orders.assign(
        avg_amt=(
            orders
            .groupby(['customer_id', 'order_month'])['amount_paid_gbp']
            .transform('mean')
        )
    )
)

Unnamed: 0,order_date,order_month,order_id,customer_id,item_id,item_price,quantity,amount_paid_gbp,row_num,rnk,avg_amt
0,2019-02-13,2019-02,ID001,customer_1,item_3,10.0,2,20.0,1.0,1.0,53.416667
1,2019-02-16,2019-02,ID004,customer_1,item_1,23.5,4,94.0,2.0,2.0,53.416667
2,2019-02-21,2019-02,ID008,customer_1,item_2,7.5,3,22.5,3.0,3.0,53.416667
3,2019-02-21,2019-02,ID009,customer_1,item_3,10.0,2,20.0,4.0,3.0,53.416667
4,2019-02-23,2019-02,ID011,customer_1,item_5,35.0,2,70.0,5.0,5.0,53.416667
5,2019-02-26,2019-02,ID014,customer_1,item_1,23.5,4,94.0,6.0,6.0,53.416667
6,2019-03-13,2019-03,ID019,customer_1,item_1,23.5,7,164.5,7.0,7.0,92.15625
7,2019-03-14,2019-03,ID020,customer_1,item_2,7.5,6,45.0,8.0,8.0,92.15625
8,2019-03-15,2019-03,ID021,customer_1,item_2,7.5,12,90.0,9.0,9.0,92.15625
9,2019-03-22,2019-03,ID025,customer_1,item_1,23.5,10,235.0,10.0,10.0,92.15625


In [39]:
# Cross-check: one mean per (customer_id, order_month) group, to compare with
# the per-row avg_amt values shown in the transform('mean') cell.
(
    orders
    .groupby(['customer_id', 'order_month'])['amount_paid_gbp']
    .mean()
)

customer_id  order_month
customer_1   2019-02        53.416667 
             2019-03        92.156250 
             2020-01        23.500000 
             2020-02        94.000000 
             2020-04        22.500000 
customer_2   2019-02        33.833333 
             2019-03        60.125000 
             2020-01        70.500000 
             2020-03        123.500000
customer_3   2019-02        132.500000
             2019-03        11.750000 
             2020-03        55.833333 
customer_4   2019-02        141.000000
             2019-03        10.000000 
             2020-01        40.000000 
             2020-02        107.250000
             2020-03        130.833333
             2020-04        15.000000 
Name: amount_paid_gbp, dtype: float64