# Env setup

In [46]:
import datetime as dt
from datetime import date
import random

from mlxtend import frequent_patterns
import pandas as pd


# Requirements

1. Implicit or explicit ratings.
    - Each rating needs a creation date so that time decay can be applied to it.

In [47]:
# Synthetic ratings used by the non-personalised recommenders below.
#
# A dedicated, seeded generator makes the sample reproducible: a fresh
# "Restart & Run All" draws exactly the same users, items, ratings and
# day offsets, without touching the global `random` state.
RATINGS_SEED = 42
ratings_rng = random.Random(RATINGS_SEED)

# Reference date for rating ages; `created_date` values are offsets from it,
# so the calendar dates shift with the run date while the ages stay fixed.
today = date.today()
ratings = [{
    "user_id": ratings_rng.randint(1, 100),
    "item_id": ratings_rng.randint(2, 200),
    "rating": ratings_rng.random() * 10,
    "created_date": today - dt.timedelta(days=ratings_rng.randint(0, 200))
} for _ in range(1000)]

# Keep only the first rating for each (user, item) pair; the original row
# index of the surviving rows is preserved.
ratings_df = pd.DataFrame(ratings).drop_duplicates(subset=["user_id", "item_id"])

# Static non-personalised recommenders

## Top 10

### All-time

In [48]:
# Average every item's ratings over the whole history, rank the averages from
# highest to lowest and show the ten best items.
(
    ratings_df
    .groupby("item_id")["rating"]
    .mean()
    .sort_values(ascending=False)
    .reset_index()
    .head(10)
)

Unnamed: 0,item_id,rating
0,29,9.369051
1,184,8.828954
2,81,8.262685
3,143,8.174707
4,148,8.15369
5,158,8.068435
6,50,7.783134
7,119,7.570137
8,43,7.530229
9,151,7.480565


### Time-weighted

Applying a decay function adds a recency effect to the top-10 ranking. Decay functions can take several forms (linear, exponential, etc.); the cell below divides each rating by one plus its age in days.

In [49]:
# Age of each rating, in whole days, measured from `today`.
rating_age = today - ratings_df["created_date"]
ratings_df["days_since"] = rating_age.apply(lambda gap: gap.days)

# Decay: a rating created today keeps its full value (divisor 1); older
# ratings are divided by one plus their age in days.
decay_divisor = 1 + ratings_df["days_since"]
ratings_df["weighted_rating"] = ratings_df["rating"].div(decay_divisor)

# Rank items by their mean decayed rating and show the ten best.
(
    ratings_df
    .groupby("item_id")["weighted_rating"]
    .mean()
    .sort_values(ascending=False)
    .reset_index()
    .head(10)
)

Unnamed: 0,item_id,weighted_rating
0,198,0.74985
1,100,0.638717
2,56,0.624317
3,21,0.6159
4,81,0.54209
5,9,0.514877
6,58,0.511728
7,154,0.45401
8,195,0.4523
9,115,0.434293


## Frequently bought together (FBT) recommendation 

Created using the [online retail dataset](https://archive.ics.uci.edu/dataset/352/online+retail) from the UCI ML repository. The loading cell below reads it from `../data/online_retail.xlsx`, relative to this notebook.


In [50]:
# Snake_case names for the dataset's original column headers.
retail_column_names = {
    "InvoiceNo": "invoice_id",
    "StockCode": "stock_id",
    "Description": "description",
    "Quantity": "quantity",
    "InvoiceDate": "invoiced_at",
    "UnitPrice": "unit_price",
    "CustomerID": "customer_id",
    "Country": "country",
}

# Load the workbook, rename the columns, drop the ones the cells below do
# not use, and store stock codes uniformly as strings.
retail_df = (
    pd.read_excel("../data/online_retail.xlsx")
    .rename(columns=retail_column_names)
    .drop(columns=["country", "quantity", "unit_price"])
    .astype({"stock_id": str})
)
retail_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   invoice_id   541909 non-null  object        
 1   stock_id     541909 non-null  object        
 2   description  540455 non-null  object        
 3   invoiced_at  541909 non-null  datetime64[ns]
 4   customer_id  406829 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 20.7+ MB


### Methods

- [Apriori algorithm](https://analyticslog.com/blog/2020/8/13/apriori-algorithm-items-frequently-bought-together-a-basic-explanation-of-how-it-works) (last accessed 2023-10-01)
- [FP-growth algorithm](https://hands-on.cloud/implementation-of-fp-growth-algorithm-using-python) (last accessed 2023-10-01)

FP-growth improves upon Apriori by being faster and computationally more efficient.

Over 7 runs, FP-growth took 2.06 s ± 18.8 ms (mean ± std. dev.) to find all n-item itemsets across 25,900 invoices and 4,070 stock items.

In [51]:
# Invoice x stock-code basket matrix: one row per invoice, one column per
# stock code, True when the invoice has at least one line for that code.
#
# `crosstab` returns integer counts; `.astype(bool)` casts them in one
# vectorised step (non-zero -> True). This gives the same frame as mapping
# Python's `bool` over every cell, without a per-cell Python call, and does
# not require `DataFrame.map` (pandas >= 2.1).
pvt_df = pd.crosstab(index=retail_df["invoice_id"], columns=retail_df["stock_id"]).astype(bool)
pvt_df.shape

(25900, 4070)

In [52]:
#%%timeit

# Mine the itemsets that appear in at least 1% of the rows of the basket
# matrix, labelling them with the column names rather than column positions.
fg_result = frequent_patterns.fpgrowth(
    pvt_df,
    min_support=0.01,
    use_colnames=True,
)

# Build association rules from those itemsets, filtered on lift with a
# threshold of 1.
ar_result = frequent_patterns.association_rules(
    fg_result,
    metric="lift",
    min_threshold=1,
)

# Show the five rules with the highest confidence.
ar_result.sort_values(by="confidence", ascending=False).head(5)

2.09 s ± 30.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
