# Env setup

In [11]:
import datetime as dt
from datetime import date
import itertools
import math
import random
import typing

import pandas as pd
import tqdm


# Requirements

1. Implicit or explicit ratings.
    - Each rating needs a creation date so that a time decay can be applied to it.

In [27]:
# Seed the generator so the synthetic ratings (and every table derived from
# them) are the same after Restart Kernel -> Run All.
SEED = 42
random.seed(SEED)

today = date.today()

# 1000 synthetic ratings: user 1-100, item 2-200, rating in [0, 20), created
# at most 200 days ago.
ratings = [{
    "user_id": random.randint(1, 100),
    "item_id": random.randint(2, 200),
    "rating": random.random() * 20,
    "created_date": today - dt.timedelta(days=random.randint(0, 200))
} for _ in range(1000)]

ratings_df = pd.DataFrame(ratings)

# Age of each rating in whole days. Computed per element on the `date`
# objects so it does not depend on how pandas infers the dtype of
# `today - <object column>`.
ratings_df["days_since"] = ratings_df["created_date"].map(lambda created: (today - created).days)

# Keep only the first rating for each (user, item) pair; the original row
# index of the surviving rows is preserved.
ratings_df = ratings_df.drop_duplicates(subset=["user_id", "item_id"])

# Static non-personalised recommenders

## Top 10

### All-time

In [28]:
# Rank items by their mean rating over the whole history and show the ten best.
(
    ratings_df
    .groupby("item_id")["rating"]
    .mean()
    .sort_values(ascending=False)
    .reset_index()
    .head(10)
)

Unnamed: 0,item_id,rating
0,65,19.173553
1,192,18.601053
2,79,17.444371
3,20,17.058592
4,7,16.785143
5,38,16.161971
6,115,15.957855
7,25,15.108986
8,2,15.029119
9,15,14.775617


### Time-weighted

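Applying a decay function to the ratings adds a recency effect when building a top 10: recent ratings count more than old ones. Decay functions can take several forms (linear, exponential, etc.); the example below divides each rating by `1 + days_since`.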

In [29]:
# Decay each rating by its age: a rating created today keeps its full value,
# one created `d` days ago is divided by `1 + d`.
ratings_df["weighted_rating"] = ratings_df["rating"].div(ratings_df["days_since"].add(1))

# Rank items by their mean decayed rating and show the ten best.
(
    ratings_df
    .groupby("item_id")["weighted_rating"]
    .mean()
    .sort_values(ascending=False)
    .reset_index()
    .head(10)
)

Unnamed: 0,item_id,weighted_rating
0,176,3.879943
1,7,2.098143
2,10,1.755871
3,188,1.55853
4,50,1.356819
5,146,1.24126
6,155,1.23473
7,86,1.231082
8,179,1.192951
9,92,0.949165


## Frequently bought together (FBT) recommendation 

Created using the [online retail dataset](https://archive.ics.uci.edu/dataset/352/online+retail) from the UCI ML repository. The notebook reads the file from `../data/online_retail.xlsx`, i.e. `data/online_retail.xlsx` one level above the notebook's working directory.


In [2]:
# Load the UCI online retail workbook (one row per invoice line) and print a
# summary of its columns, dtypes and non-null counts.
retail_df = pd.read_excel(io="../data/online_retail.xlsx")
retail_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [3]:
# Items bought together (ibt): one row per (customer, invoice timestamp,
# invoice number) holding the list of stock codes on that invoice.
# Stock codes are cast to str first so every code has a single type.
retail_df["StockCode"] = retail_df["StockCode"].astype(str)

# NOTE: groupby drops rows whose group key is missing by default, so invoice
# lines without a CustomerID are not represented in ibt_df.
ibt_df = (
    retail_df
    .groupby(["CustomerID", "InvoiceDate", "InvoiceNo"])["StockCode"]
    .apply(list)
    .reset_index()
    .rename(columns={
        "CustomerID": "customer_id",
        "InvoiceDate": "invoiced_at",
        "InvoiceNo": "invoice_id",
        "StockCode": "stock_ids",
    })
)

In [4]:
# Association rule functions

def get_confidence(itemset_1: typing.Union[str, list], item_2: str, df: pd.DataFrame) -> float:
    '''Returns the confidence of the rule "itemset 1 -> item 2".

    Confidence is the number of transactions that contain both
    item/itemset 1 and item 2, divided by the number of transactions
    that contain item/itemset 1. It can be seen as how confident we are
    that item 2 is purchased when item/itemset 1 is purchased.

    Arguments:
        itemset_1: ID of item 1, or a sequence of IDs forming itemset 1.
        item_2: ID of item 2.
        df: Dataframe with columns `invoice_id` and `stock_ids` from
            ibt_df.

    Returns:
        Confidence value in [0, 1]. Returns 0.0 when item/itemset 1
        does not appear in any transaction.

    Raises:
        ValueError: If `itemset_1` is an empty sequence.
    '''

    def invoices_containing(stock_id: str) -> set:
        # IDs of the invoices whose stock list contains `stock_id`.
        return {
            invoice_id
            for invoice_id, stock_ids in zip(df["invoice_id"], df["stock_ids"])
            if stock_id in stock_ids
        }

    # Normalise a single ID and a sequence of IDs to one code path.
    required_items = [itemset_1] if isinstance(itemset_1, str) else list(itemset_1)
    if not required_items:
        raise ValueError("itemset_1 must contain at least one item ID")

    # An invoice supports the itemset only if it contains every item in it.
    itemset_1_appearances = set.intersection(*map(invoices_containing, required_items))
    if not itemset_1_appearances:
        # The antecedent never occurs: avoid dividing by zero.
        return 0.0

    item_2_appearances = invoices_containing(item_2)
    return len(itemset_1_appearances & item_2_appearances) / len(itemset_1_appearances)

def get_support(itemset_1: typing.Union[str, list], item_2: typing.Union[str, None], df: pd.DataFrame) -> float:
    '''Returns the support of an itemset.

    Support is the number of transactions that contain every item of
    item/itemset 1 (and item 2, when given), divided by the total
    number of transactions in `df`. It can be seen as how frequently
    the items are bought together relative to all transactions.

    Arguments:
        itemset_1: ID of item 1, or a sequence of IDs forming itemset 1.
        item_2: ID of item 2, or None to measure item/itemset 1 alone.
        df: Dataframe with columns `invoice_id` and `stock_ids` from
            ibt_df.

    Returns:
        Support value. Returns 0.0 when `df` has no rows.

    Raises:
        ValueError: If `itemset_1` is an empty sequence.
    '''

    def invoices_containing(stock_id: str) -> set:
        # IDs of the invoices whose stock list contains `stock_id`.
        return {
            invoice_id
            for invoice_id, stock_ids in zip(df["invoice_id"], df["stock_ids"])
            if stock_id in stock_ids
        }

    # Normalise a single ID and a sequence of IDs to one code path.
    required_items = [itemset_1] if isinstance(itemset_1, str) else list(itemset_1)
    if not required_items:
        raise ValueError("itemset_1 must contain at least one item ID")
    if isinstance(item_2, str):
        required_items.append(item_2)

    # Use the frame that was passed in, not the notebook-level `ibt_df`, so
    # the function is correct for any transaction frame.
    # NOTE: the denominator counts rows while the numerator counts distinct
    # invoice IDs; these agree when every row has a distinct `invoice_id`.
    n_transactions = df.shape[0]
    if n_transactions == 0:
        return 0.0

    # An invoice supports the itemset only if it contains every item in it.
    appearances = set.intersection(*map(invoices_containing, required_items))
    return len(appearances) / n_transactions

In [5]:
# Sanity check: item_2 is itself a member of itemset_1, so every invoice that
# contains the itemset also contains item_2 and the confidence is 1.0.
itemset_1 = (
    "85123A",
    "22423",
)
item_2 = "22423"

demo_confidence = get_confidence(itemset_1, item_2, ibt_df)
print(demo_confidence)

1.0


In [12]:
# Minimum share of transactions an itemset must appear in to count as frequent.
min_support = 0.01
items = list(retail_df["StockCode"].unique())

# Pass 1: keep the single items whose support exceeds the threshold.
itemsets_1 = []
for stock_id in tqdm.tqdm(items, desc="get itemset 1", total=len(items)):
    support = get_support(stock_id, None, ibt_df)
    if support > min_support:
        itemsets_1.append(stock_id)

# Pass 2: only pairs of frequent single items can themselves be frequent, so
# candidate pairs are drawn from itemsets_1 alone.
stock_combinations = itertools.combinations(itemsets_1, 2)
n_iterations = math.comb(len(itemsets_1), 2)
itemsets_2 = []

for id_1, id_2 in tqdm.tqdm(stock_combinations, desc="get itemset 2", total=n_iterations):
    # No id_1 == id_2 guard is needed: itemsets_1 holds unique IDs and
    # combinations() never pairs an element with itself.
    support = get_support(id_1, id_2, ibt_df)
    if support > min_support:
        itemsets_2.append([id_1, id_2])

100%|██████████| 4070/4070 [00:32<00:00, 125.66it/s]
100%|██████████| 135981/135981 [36:35<00:00, 61.93it/s]


In [14]:
# Number of frequent 2-item itemsets found, followed by the first five.
(len(itemsets_2), itemsets_2[:5])

(216,
 [['85123A', '84879'],
  ['85123A', '82482'],
  ['85123A', '82494L'],
  ['85123A', '21733'],
  ['85123A', '20725']])

This approach is too slow: with only 4,070 items it took about 37 minutes just to find the frequent 2-item itemsets.

For association rule mining from a database, the [FP-growth algorithm](https://hands-on.cloud/implementation-of-fp-growth-algorithm-using-python) can be used instead. It is an improved version of the [Apriori algorithm](https://analyticslog.com/blog/2020/8/13/apriori-algorithm-items-frequently-bought-together-a-basic-explanation-of-how-it-works) that is faster and has a lower computational cost.