<a href="https://www.kaggle.com/omerparlak/association-rule-learning-recommender?scriptVersionId=88245274" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Business Problem

### Recommending products to users at the cart stage.

####  This study was conducted for Germany customers data from 2010-2011.


In [1]:
import numpy as np 
import pandas as pd 
import random
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.0.9-py2.py3-none-any.whl (242 kB)
     |████████████████████████████████| 242 kB 587 kB/s            
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.9
Note: you may need to restart the kernel to use updated packages.


In [3]:
df_ = pd.read_excel("../input/online-retail-ll/online_retail_II.xlsx",sheet_name="Year 2010-2011")

In [4]:
df = df_.copy()

In [5]:
def check_df(dataframe, head=5):
    print("##################### Shape #####################", end="\n\n")
    print(dataframe.shape, end="\n\n")
    print("##################### Types #####################", end="\n\n")
    print(dataframe.dtypes, end="\n\n")
    print("##################### Head #####################", end="\n\n")
    print(dataframe.head(head), end="\n\n")
    print("##################### Tail #####################", end="\n\n")
    print(dataframe.tail(head), end="\n\n")
    print("##################### NA #####################", end="\n\n")
    print(dataframe.isnull().sum(), end="\n\n")
    print("##################### Quantiles #####################", end="\n\n")
    print(dataframe.quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T, end="\n\n")


def outlier_thresholds(dataframe, variable):
    quartile1 = dataframe[variable].quantile(0.01)
    quartile3 = dataframe[variable].quantile(0.99)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit


def replace_with_thresholds(dataframe, variable):
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
    dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit
    

def data_prep(data):
    data = data.rename(columns={"Customer ID": "CustomerID"})
    data = data.dropna(axis=0)
    data = data[~data["Invoice"].str.contains("C", na=False)]
    data = data[~data["Description"].str.contains("POSTAGE", na=False)]
    data = data[(data['Quantity'] > 0)]
    data = data[(data['Price'] > 0)]
    replace_with_thresholds(data, "Quantity")
    replace_with_thresholds(data, "Price")
    return data


def create_invoice_product_df(dataframe, id=False):
    if id:
        return dataframe.groupby(['Invoice', "StockCode"])['Quantity'].sum().unstack().fillna(0). \
            applymap(lambda x: 1 if x > 0 else 0)
    else:
        return dataframe.groupby(['Invoice', 'Description'])['Quantity'].sum().unstack().fillna(0). \
            applymap(lambda x: 1 if x > 0 else 0)
    
    
def check_id(dataframe, stockcode):
    product_name = dataframe[dataframe["StockCode"] == stockcode][["Description"]].values[0].tolist()
    print(product_name)
    
    
def choose_stock(data):
    stock_list = data["StockCode"].values.tolist()
    random.seed(20)
    stock_1 = random.choice(stock_list)
    stock_2 = random.choice(stock_list)
    stock_3 = random.choice(stock_list)
    print(f"Stock_1 : {stock_1}")
    check_id(data, stock_1)
    print(f"Stock_2 : {stock_2}")
    check_id(data, stock_2)
    print(f"Stock_3 : {stock_3}")
    check_id(data, stock_3)
    
    return stock_1,stock_2, stock_3

def create_rules(data, id=True, country="Germany"):
    data = data[data['Country'] == country]
    data = create_invoice_product_df(data, id)
    # Possibilities of all possible product combinations
    frequent_itemsets = apriori(data, min_support=0.01, use_colnames=True)
    # Extracting association rules
    rules = association_rules(frequent_itemsets, metric="support", min_threshold=0.01)
    return rules


def arl_recommender(rules_df, product_id, rec_count=1):
    sorted_rules = rules_df.sort_values("lift", ascending=False)
    recommendation_list = []
    for i, product in sorted_rules["antecedents"].items():
        for j in list(product):
            if j == product_id:
                recommendation_list.append(list(sorted_rules.iloc[i]["consequents"]))
    recommendation_list = list({item for item_list in recommendation_list for item in item_list})
    return recommendation_list[:rec_count]

# EDA and Data Preprocessing

In [6]:
check_df(df)

##################### Shape #####################

(541910, 8)

##################### Types #####################

Invoice                object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
Price                 float64
Customer ID           float64
Country                object
dtype: object

##################### Head #####################

  Invoice StockCode                          Description  Quantity  \
0  536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1  536365     71053                  WHITE METAL LANTERN         6   
2  536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3  536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4  536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

          InvoiceDate  Price  Customer ID         Country  
0 2010-12-01 08:26:00   2.55      17850.0  United Kingdom  
1 2010-12-01 08:26:00   3.39      17850

In [7]:
df = data_prep(df)

# Invoice-Product Matrix for Germany 

In [8]:
df_grm = df[df["Country"] == "Germany"]
grm_inv_pro_df = create_invoice_product_df(df_grm, id=True)
grm_inv_pro_df.head()

StockCode,10002,10125,10135,11001,15034,15036,15039,16008,16011,16014,...,90160D,90161B,90161C,90161D,90201A,90201B,90201C,90201D,90202D,M
Invoice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536527,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536840,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536861,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536967,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536983,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Reaching to Description of the Given Stock Code

In [9]:
stock_1,stock_2, stock_3 = choose_stock(df_grm)

Stock_1 : 20719
['WOODLAND CHARLOTTE BAG']
Stock_2 : 22979
['PANTRY WASHING UP BRUSH']
Stock_3 : 21210
['SET OF 72 RETROSPOT PAPER  DOILIES']


# Establishment of Association Rules

In [10]:
grm_inv_pro_df = create_rules(df, id=True, country="Germany")
grm_inv_pro_df.sort_values("support", ascending=False).head(10)
# antecedent support: X probability alone
# consequent support: Y probability alone
# support: Probability to buy both X and Y at the same time
# confidence: Probability of purchasing Y when X is purchased.
# lift: When X is purchased, the probability of purchasing Y increases .. times.
# conviction: Expected frequency of X without Y

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
2105,(22326),(22328),0.249443,0.160356,0.13363,0.535714,3.340774,0.09363,1.808463
2104,(22328),(22326),0.160356,0.249443,0.13363,0.833333,3.340774,0.09363,4.503341
2156,(22554),(22326),0.140312,0.249443,0.075724,0.539683,2.163549,0.040724,1.63052
2157,(22326),(22554),0.249443,0.140312,0.075724,0.303571,2.163549,0.040724,1.234424
2174,(22629),(22326),0.104677,0.249443,0.071269,0.680851,2.729483,0.045159,2.351745
2175,(22326),(22629),0.249443,0.104677,0.071269,0.285714,2.729483,0.045159,1.253452
2891,(22556),(22554),0.11804,0.140312,0.069042,0.584906,4.168613,0.05248,2.071067
2890,(22554),(22556),0.140312,0.11804,0.069042,0.492063,4.168613,0.05248,1.736359
300,(22326),(20719),0.249443,0.129176,0.064588,0.258929,2.004464,0.032366,1.175088
301,(20719),(22326),0.129176,0.249443,0.064588,0.5,2.004464,0.032366,1.501114


# Making a Product Recommendation for the Users in the Cart

In [11]:
recommend_1 = arl_recommender(grm_inv_pro_df, stock_1, 2)
recommend_1

[22029, '85049E']

In [12]:
recommend_2 = arl_recommender(grm_inv_pro_df, stock_2, 2)
recommend_2

[22629, 21239]

In [13]:
recommend_3 = arl_recommender(grm_inv_pro_df, stock_3, 2)
recommend_3

[21088, 21668]

# Names of the Recommended Products

In [14]:
check_id(df_grm, recommend_1[0])
check_id(df_grm, recommend_1[1])

['SPACEBOY BIRTHDAY CARD']
['SCANDINAVIAN REDS RIBBONS']


In [15]:
check_id(df_grm, recommend_2[0])
check_id(df_grm, recommend_2[1])

['SPACEBOY LUNCH BOX ']
['PINK  POLKADOT CUP']


In [16]:
check_id(df_grm, recommend_3[0])
check_id(df_grm, recommend_3[1])

['SET/6 FRUIT SALAD PAPER CUPS']
['RED STRIPE CERAMIC DRAWER KNOB']
