# Association Rule Based Recommender System

This project will focus on making product recommendations to users at the basket stage.

## Dataset

*InvoiceNo*: Invoice number. Nominal. A 6-digit integral number uniquely assigned to each transaction. If this code starts with the letter 'c', it indicates a cancellation.<br>
*StockCode*: Product (item) code. Nominal. A 5-digit integral number uniquely assigned to each distinct product.<br>
*Description*: Product (item) name. Nominal.<br>
*Quantity*: The quantities of each product (item) per transaction. Numeric.<br>
*InvoiceDate*: Invoice date and time. Numeric. The day and time when a transaction was generated.<br>
*UnitPrice*: Unit price. Numeric. Product price per unit in sterling (£).<br>
*CustomerID*: Customer number. Nominal. A 5-digit integral number uniquely assigned to each customer.<br>
*Country*: Country name. Nominal. The name of the country where a customer resides.<br>

In [1]:
# libraries
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori,association_rules

# Display settings: show every column, use a 500-character display width, 12x12 default figures
pd.options.display.max_columns = None
pd.options.display.width = 500

figure_defaults = {"figure.figsize": (12, 12)}
sns.set(rc=figure_defaults)

  if LooseVersion(mpl.__version__) >= "3.0":
  other = LooseVersion(other)


## Data Preprocessing

In [2]:
# Load the "Year 2010-2011" sheet and keep the loaded frame `df` separate from the working copy `data`
df = pd.read_excel("datas/online_retail_II.xlsx",sheet_name="Year 2010-2011")
data = df.copy()
data.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [3]:
# Summary statistics, transposed so that each described column becomes a row
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Quantity,541910.0,9.552234,218.080957,-80995.0,1.0,3.0,10.0,80995.0
Price,541910.0,4.611138,96.759765,-11062.06,1.25,2.08,4.13,38970.0
Customer ID,406830.0,15287.68416,1713.603074,12346.0,13953.0,15152.0,16791.0,18287.0


In [4]:
# Number of missing values per column
data.isnull().sum()

Invoice             0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
Price               0
Customer ID    135080
Country             0
dtype: int64

In [5]:
# (number of rows, number of columns) of the working frame
data.shape

(541910, 8)

In [6]:
def outlier_threshold(dataframe,column_name:str, q1:float=0.01, q3:float=0.99):
    
    '''
    This function extracts upper and lower threshold limits in the specific column of the dataframe
    
    The limits are fences placed 1.5 times the inter-quantile range outside the two quantiles:
    ``low = Q_low - 1.5 * range`` and ``up = Q_high + 1.5 * range``.
    The wide default quantiles (1% and 99%) make the limits only catch extreme values.
    
    :param dataframe: the dataframe
    :param column_name: the name of the column
    :param q1: the lower quantile, between 0 and 1 (default 0.01)
    :param q3: the upper quantile, between 0 and 1 (default 0.99)
    :type dataframe: pandas DataFrame
    :type column_name: string
    :type q1: float
    :type q3: float
    
    :returns: the lower limit and upper limit
    '''
    
    quartile1 = dataframe[column_name].quantile(q1)
    quartile3 = dataframe[column_name].quantile(q3)
    
    interquartile_range = quartile3 - quartile1 # a wide range
    
    # The 1.5 factor scales the range, not the quantile itself: scaling the
    # quantile shifts the limits with the data's offset from zero and can put
    # the lower limit above every observed value.
    up_limit = quartile3 + 1.5 * interquartile_range
    low_limit = quartile1 - 1.5 * interquartile_range
    
    return low_limit,up_limit

In [7]:
def replace_with_threshold(dataframe, column_name:str):
    
    '''
    This function caps the values of one column in place: values below the lower limit
    are set to the lower limit and values above the upper limit are set to the upper limit.
    The limits are the ones returned by outlier_threshold for the same column.
    
    :param dataframe: the dataframe (modified in place)
    :param column_name: the name of the column
    :type dataframe: pandas DataFrame
    :type column_name: string
    
    :returns: None
    '''
    
    lower_bound, upper_bound = outlier_threshold(dataframe, column_name)
    
    # raise everything that falls under the lower bound
    too_small = dataframe[column_name] < lower_bound
    dataframe.loc[too_small, column_name] = lower_bound
    
    # lower everything that exceeds the upper bound
    too_large = dataframe[column_name] > upper_bound
    dataframe.loc[too_large, column_name] = upper_bound
    
    

In [8]:
def data_preperation(dataframe):
    
    '''
    This function drops rows with missing values, removes invoices containing "C",
    keeps only rows with positive Quantity and Price, and caps the outliers of the
    Quantity and Price columns
    
    The dataframe passed in is not modified; a new dataframe is returned.
    
    :param dataframe: the dataframe
    :type dataframe: pandas DataFrame
    
    :returns: filtered dataframe
    '''
    
    # non-mutating dropna: the caller's frame keeps its rows
    dataframe = dataframe.dropna()
    dataframe = dataframe[~dataframe["Invoice"].astype(str).str.contains("C",na=False)] # data which does not contain C in the Invoice
    dataframe = dataframe[dataframe["Quantity"] > 0]
    dataframe = dataframe[dataframe["Price"] > 0]
    
    # explicit copy so the in-place capping below writes to an independent frame
    # instead of a filtered slice (avoids SettingWithCopyWarning)
    dataframe = dataframe.copy()
    
    replace_with_threshold(dataframe,"Quantity")
    replace_with_threshold(dataframe, "Price")
    
    return dataframe

In [9]:
# Clean the working frame, then confirm that no missing values remain
data = data_preperation(data)
data.isnull().sum()

Invoice        0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
Price          0
Customer ID    0
Country        0
dtype: int64


## Preparing Association Rules Learning Data Structures

Association rules of customers in *France* will be derived.

In [10]:
# Keep only the rows whose Country is France
df_fr = data[data["Country"] == 'France']
df_fr.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
26,536370,22728,ALARM CLOCK BAKELIKE PINK,24.0,2010-12-01 08:45:00,3.75,12583.0,France
27,536370,22727,ALARM CLOCK BAKELIKE RED,24.0,2010-12-01 08:45:00,3.75,12583.0,France
28,536370,22726,ALARM CLOCK BAKELIKE GREEN,12.0,2010-12-01 08:45:00,3.75,12583.0,France
29,536370,21724,PANDA AND BUNNIES STICKER SHEET,12.0,2010-12-01 08:45:00,0.85,12583.0,France
30,536370,21883,STARS GIFT TAPE,24.0,2010-12-01 08:45:00,0.65,12583.0,France


In [11]:
# Total quantity of each product (by description) within each invoice; show the first 20 rows
df_fr.groupby(["Invoice","Description"]).agg({"Quantity":"sum"}).head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Quantity
Invoice,Description,Unnamed: 2_level_1
536370,SET 2 TEA TOWELS I LOVE LONDON,24.0
536370,ALARM CLOCK BAKELIKE GREEN,12.0
536370,ALARM CLOCK BAKELIKE PINK,24.0
536370,ALARM CLOCK BAKELIKE RED,24.0
536370,CHARLOTTE BAG DOLLY GIRL DESIGN,20.0
536370,CIRCUS PARADE LUNCH BOX,24.0
536370,INFLATABLE POLITICAL GLOBE,48.0
536370,LUNCH BOX I LOVE LONDON,24.0
536370,MINI JIGSAW CIRCUS PARADE,24.0
536370,MINI JIGSAW SPACEBOY,24.0


In [12]:
def create_invoice_product(dataframe, id=False):
    
    '''
    This function builds a one-hot invoice x product matrix from the given dataframe:
    one row per Invoice, one column per product, 1 if the summed Quantity of that
    product in that invoice is greater than 0, otherwise 0
    
    :param dataframe: the dataframe with Invoice, Quantity and StockCode/Description columns
    :param id: if True the columns are StockCode values, otherwise Description values
    :type dataframe: pandas DataFrame
    :type id: bool
    
    :returns: the invoice x product matrix of 0/1 integers
    '''
    
    product_column = "StockCode" if id else "Description"
    
    # use the dataframe that was passed in, not a notebook-level global
    quantities = dataframe.groupby(["Invoice", product_column])["Quantity"].sum().unstack().fillna(0)
    
    # vectorised equivalent of applymap(lambda x: 1 if x > 0 else 0)
    return (quantities > 0).astype("int64")

In [15]:
def check_id(dataframe, stock_code:int):
    
    '''
    This function prints the Description of the first row whose StockCode equals the given code
    
    :param dataframe: the dataframe with StockCode and Description columns
    :param stock_code: the stock code to look up
    :type dataframe: pandas DataFrame
    :type stock_code: int
    
    :returns: None
    '''
    
    matching_rows = dataframe.loc[dataframe["StockCode"] == stock_code, ["Description"]]
    first_match = matching_rows.values[0]
    print(first_match.tolist())

In [16]:
# Print the product descriptions of two stock codes from the French data
check_id(df_fr,10002)
check_id(df_fr,10125)

['INFLATABLE POLITICAL GLOBE ']
['MINI FUNKY DESIGN TAPES']


## Association Rules Analysis

In [17]:
# Build the one-hot invoice x product matrix for France (StockCode columns);
# it must exist before apriori can be called on it
fr_invoice_product = create_invoice_product(df_fr, id=True)

# Gets frequent itemsets from a one-hot DataFrame

frequent_itemsets = apriori(fr_invoice_product,
                        min_support=0.01,
                        use_colnames=True)

frequent_itemsets.sort_values("support",ascending=False)



Unnamed: 0,support,itemsets
538,0.773779,(POST)
387,0.187661,(23084)
107,0.179949,(21731)
243,0.172237,(22554)
245,0.169666,(22556)
...,...,...
18793,0.010283,"(22729, 21086, 22326, 22551)"
18787,0.010283,"(23256, 21086, 22492, 22326)"
18786,0.010283,"(22728, 21086, 22492, 22326)"
18785,0.010283,"(21086, 22492, 22326, 22727)"


In [18]:
# Generates a DataFrame of association rules with metrics such as 'support', 'confidence', and 'lift'
# metric="support" with min_threshold=0.01 is the filter that decides which candidate rules are kept

rules = association_rules(frequent_itemsets, metric = "support", min_threshold=0.01)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(10002),(21791),0.020566,0.028278,0.010283,0.500000,17.681818,0.009701,1.943445
1,(21791),(10002),0.028278,0.020566,0.010283,0.363636,17.681818,0.009701,1.539111
2,(10002),(21915),0.020566,0.069409,0.010283,0.500000,7.203704,0.008855,1.861183
3,(21915),(10002),0.069409,0.020566,0.010283,0.148148,7.203704,0.008855,1.149771
4,(10002),(22551),0.020566,0.136247,0.010283,0.500000,3.669811,0.007481,1.727506
...,...,...,...,...,...,...,...,...,...
1372699,(23254),"(22659, 23206, 22726, 22727, 22728, 20750, 223...",0.071979,0.010283,0.010283,0.142857,13.892857,0.009543,1.154670
1372700,(22326),"(22659, 23206, 22726, 22727, 22728, 20750, 223...",0.159383,0.010283,0.010283,0.064516,6.274194,0.008644,1.057974
1372701,(21558),"(22659, 23206, 22726, 22727, 22728, 20750, 223...",0.051414,0.010283,0.010283,0.200000,19.450000,0.009754,1.237147
1372702,(23291),"(22659, 23206, 22726, 22727, 22728, 20750, 223...",0.041131,0.010283,0.010283,0.250000,24.312500,0.009860,1.319623


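If the first row (index 0) is examined:<br>

Product 10002 appears in about 2% of the invoices and product 21791 in about 3% (their individual supports). Both appear together in about 1% of the invoices (**support** of the rule). 50% of those who buy 10002 buy 21791 as well (**confidence**). The **lift** is *17.68*, meaning that 21791 is bought about 17.7 times more often together with 10002 than would be expected if the two products were independent. The value *1.94* is the rule's **conviction**.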

In [19]:
# Keep only the rules with support > 0.05, confidence > 0.1 and lift > 5
rules[(rules["support"] > 0.05) & (rules["confidence"] > 0.1) & (rules["lift"] > 5)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1606,(21080),(21086),0.133676,0.138817,0.102828,0.769231,5.541311,0.084271,3.731791
1607,(21086),(21080),0.138817,0.133676,0.102828,0.740741,5.541311,0.084271,3.341535
1608,(21080),(21094),0.133676,0.128535,0.102828,0.769231,5.984615,0.085646,3.776350
1609,(21094),(21080),0.128535,0.133676,0.102828,0.800000,5.984615,0.085646,4.331620
1776,(21086),(21094),0.138817,0.128535,0.123393,0.888889,6.915556,0.105550,7.843188
...,...,...,...,...,...,...,...,...,...
213940,"(POST, 22727)","(22728, 22726)",0.089974,0.074550,0.059126,0.657143,8.814778,0.052418,2.699229
213941,"(22726, 22727)","(22728, POST)",0.079692,0.092545,0.059126,0.741935,8.017025,0.051751,3.516388
213942,(22728),"(POST, 22726, 22727)",0.102828,0.074550,0.059126,0.575000,7.712931,0.051460,2.177529
213944,(22726),"(22728, POST, 22727)",0.097686,0.069409,0.059126,0.605263,8.720273,0.052346,2.357498


In [22]:
def create_rules(dataframe, id=True,country="France"):
    
    '''
    This function derives association rules for one country: it filters the rows of that
    country, builds the invoice x product matrix, extracts the frequent itemsets
    (min_support=0.01) and generates the rules (metric "support", min_threshold=0.01)
    
    :param dataframe: the dataframe
    :param id: passed on to create_invoice_product
    :param country: the value of the Country column to keep
    :type dataframe: pandas DataFrame
    :type id: bool
    :type country: string
    
    :returns: the association rules
    '''
    
    country_rows = dataframe[dataframe["Country"] == country]
    invoice_product = create_invoice_product(country_rows,id)
    itemsets = apriori(invoice_product,min_support=0.01,use_colnames=True)
    
    return association_rules(itemsets, metric="support",min_threshold=0.01)

In [20]:
# Print the product descriptions of stock codes 21080 and 21086
check_id(data,21080)
check_id(data,21086)

['SET/20 RED RETROSPOT PAPER NAPKINS ']
['SET/6 RED SPOTTY PAPER CUPS']


## Product Recommendation Practice

In [23]:
def arl_recommender(rules_df, product_id,rec_count=1):
    
    '''
    This function recommends products for a given product using the association rules
    
    The rules are visited from the highest lift to the lowest. For every rule whose
    antecedents contain the product, the first item of its consequents is recommended.
    Each product is recommended at most once and the search stops as soon as
    rec_count recommendations are collected.
    
    :param rules_df: the association rules with antecedents, consequents and lift columns
    :param product_id: the product to make recommendations for
    :param rec_count: the maximum number of recommendations
    :type rules_df: pandas DataFrame
    :type rec_count: int
    
    :returns: the list of recommended products (at most rec_count items, possibly empty)
    '''
    
    if rec_count <= 0:
        return []
    
    sorted_rules = rules_df.sort_values("lift",ascending=False)
    recommendation_list = []
    
    # iterate both columns in lockstep instead of a positional iloc lookup per row
    for antecedents, consequents in zip(sorted_rules["antecedents"], sorted_rules["consequents"]):
        if product_id not in antecedents:
            continue
        
        candidate = list(consequents)[0]
        
        # many rules share the same consequent; do not recommend it twice
        if candidate in recommendation_list:
            continue
        
        recommendation_list.append(candidate)
        if len(recommendation_list) >= rec_count:
            break
                
    return recommendation_list

In [24]:
# One recommendation for product 22492
arl_recommender(rules, 22492, 1)

[22326]

In [25]:
# Three recommendations for product 22492
arl_recommender(rules, 22492, 3)

[22326, 22556, 22551]

In [27]:
# Print the product descriptions of the three recommended stock codes
check_id(data,22326)
check_id(data,22556)
check_id(data,22551)

['ROUND SNACK BOXES SET OF4 WOODLAND ']
['PLASTERS IN TIN CIRCUS PARADE ']
['PLASTERS IN TIN SPACEBOY']
