# Frequently Bought Together Product Recommendation

### 1. Importing Libraries

In [1]:
import pandas as pd

### 2. Loading dataset

In [2]:
df = pd.read_excel('Online Retail.xlsx')

In [3]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


### 3. Data Cleaning

In [4]:
# Prefix every stock code with '_' (after passing it through str()) so the
# column only ever holds strings
df['StockCode'] = df['StockCode'].map(lambda code: '_' + str(code))

In [5]:
# Invoice / stock-code pairs: the input for building the recommendation system
orders = df.loc[:, ['InvoiceNo', 'StockCode']]
orders.head()

Unnamed: 0,InvoiceNo,StockCode
0,536365,_85123A
1,536365,_71053
2,536365,_84406B
3,536365,_84029G
4,536365,_84029E


In [6]:
# Stock-code / description pairs, copied so later cleaning leaves df untouched;
# used for retrieving product descriptions
products = df.loc[:, ['StockCode', 'Description']].copy()
products.head()

Unnamed: 0,StockCode,Description
0,_85123A,WHITE HANGING HEART T-LIGHT HOLDER
1,_71053,WHITE METAL LANTERN
2,_84406B,CREAM CUPID HEARTS COAT HANGER
3,_84029G,KNITTED UNION FLAG HOT WATER BOTTLE
4,_84029E,RED WOOLLY HOTTIE WHITE HEART.


In [7]:
# Remove rows that repeat an earlier (StockCode, Description) pair
products = products.drop_duplicates()
products.head()

Unnamed: 0,StockCode,Description
0,_85123A,WHITE HANGING HEART T-LIGHT HOLDER
1,_71053,WHITE METAL LANTERN
2,_84406B,CREAM CUPID HEARTS COAT HANGER
3,_84029G,KNITTED UNION FLAG HOT WATER BOTTLE
4,_84029E,RED WOOLLY HOTTIE WHITE HEART.


In [8]:
# Keep only the rows whose Description is unchanged by upper-casing,
# i.e. drop descriptions that are not uppercase
products = products.loc[
    products['Description'].eq(products['Description'].str.upper())
]

In [9]:
# For every StockCode keep the row that appears first, i.e. its first Description
products = products.drop_duplicates(subset=['StockCode'])
products

Unnamed: 0,StockCode,Description
0,_85123A,WHITE HANGING HEART T-LIGHT HOLDER
1,_71053,WHITE METAL LANTERN
2,_84406B,CREAM CUPID HEARTS COAT HANGER
3,_84029G,KNITTED UNION FLAG HOT WATER BOTTLE
4,_84029E,RED WOOLLY HOTTIE WHITE HEART.
...,...,...
509369,_85179a,GREEN BITTY LIGHT CHAIN
512588,_23617,SET 10 CARDS SWIRLY XMAS TREE 17104
527065,_90214U,"LETTER ""U"" BLING KEY RING"
537224,_47591b,SCOTTIES CHILDRENS APRON


In [10]:
# Index by StockCode and keep just the Description column, which yields a
# Series for even easier lookups by stock code
products = products.set_index('StockCode')['Description']

In [11]:
# Test it out
products['_21755']

'LOVE BUILDING BLOCK WORD'

In [12]:
# Number of unique products
len(products)

3905

### 4.  Restructuring data

In [13]:
def string_list(x):
    """Return a list containing ``str()`` of every element of ``x``, in order."""
    return list(map(str, x))

# Collapse the rows of each invoice into a single list of its stock codes
orders = (
    orders
    .groupby('InvoiceNo')['StockCode']
    .agg(list)
    .reset_index()
)
orders.head()

Unnamed: 0,InvoiceNo,StockCode
0,536365,"[_85123A, _71053, _84406B, _84029G, _84029E, _..."
1,536366,"[_22633, _22632]"
2,536367,"[_84879, _22745, _22748, _22749, _22310, _8496..."
3,536368,"[_22960, _22913, _22912, _22914]"
4,536369,[_21756]


In [14]:
from mlxtend.preprocessing import TransactionEncoder

In [15]:
# Fit the encoder on the per-invoice stock-code lists and encode them in one call
te = TransactionEncoder()
orders_1hot = te.fit_transform(orders['StockCode'])


In [16]:
# Wrap the encoded array in a DataFrame whose columns are the encoder's item labels
orders_1hot = pd.DataFrame(data=orders_1hot, columns=te.columns_)
orders_1hot.head()

Unnamed: 0,_10002,_10080,_10120,_10123C,_10123G,_10124A,_10124G,_10125,_10133,_10134,...,_M,_PADS,_POST,_S,_gift_0001_10,_gift_0001_20,_gift_0001_30,_gift_0001_40,_gift_0001_50,_m
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### 5. Model Build

#### 5.1.  Apriori

In [17]:
from mlxtend.frequent_patterns import apriori

In [18]:
%%timeit -n1 -r1

apriori(orders_1hot, min_support=0.01, max_len=2, use_colnames=True)

2min 28s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [19]:
# Apriori itemsets: at most two items each, minimum support of 0.01,
# reported with the column names rather than column positions
is_ap = apriori(
    orders_1hot,
    min_support=0.01,
    use_colnames=True,
    max_len=2,
)

In [20]:
is_ap.head()

Unnamed: 0,support,itemsets
0,0.020193,(_15036)
1,0.012587,(_15056BL)
2,0.017876,(_15056N)
3,0.011236,(_16237)
4,0.01251,(_20675)


#### 5.2.  FP Growth

In [21]:
from mlxtend.frequent_patterns import fpgrowth

In [22]:
%%timeit -n1 -r1

fpgrowth(orders_1hot, min_support=0.01, max_len=2, use_colnames=True)

7.69 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [23]:
# FP-Growth itemsets with the same settings used for Apriori:
# at most two items each, minimum support of 0.01, column names as items
is_fp = fpgrowth(
    orders_1hot,
    min_support=0.01,
    use_colnames=True,
    max_len=2,
)
is_fp

Unnamed: 0,support,itemsets
0,0.086718,(_85123A)
1,0.017915,(_84029G)
2,0.016911,(_84029E)
3,0.014865,(_22752)
4,0.013205,(_71053)
...,...,...
997,0.010077,"(_23344, _23203)"
998,0.010039,"(_23344, _22086)"
999,0.011853,"(_23293, _23295)"
1000,0.010077,"(_23293, _23296)"


### 6.  Both algorithms generate the same itemsets

In [24]:
def itemset_to_ordered_string(itemset):
    """Join the members of ``itemset`` into one comma-separated string, in sorted order."""
    ordered_items = sorted(itemset)
    return ','.join(ordered_items)

In [25]:
# Canonical string per Apriori itemset, sorted and re-indexed from 0 so the
# result can be compared position by position
ap_itemset_strings = (
    is_ap['itemsets']
    .map(itemset_to_ordered_string)
    .sort_values()
    .reset_index(drop=True)
)
ap_itemset_strings.head()

0      _15036
1    _15056BL
2     _15056N
3      _16237
4      _20675
Name: itemsets, dtype: object

In [26]:
# Canonical string per FP-Growth itemset, sorted and re-indexed from 0 so the
# result can be compared position by position
fp_itemset_strings = (
    is_fp['itemsets']
    .map(itemset_to_ordered_string)
    .sort_values()
    .reset_index(drop=True)
)
fp_itemset_strings.head()

0      _15036
1    _15056BL
2     _15056N
3      _16237
4      _20675
Name: itemsets, dtype: object

In [27]:
# test to see if the itemset lists are equal
fp_itemset_strings.equals(ap_itemset_strings)

True

### 7. Calculating Association Rules

In [28]:
from mlxtend.frequent_patterns import association_rules

In [29]:
is_fp

Unnamed: 0,support,itemsets
0,0.086718,(_85123A)
1,0.017915,(_84029G)
2,0.016911,(_84029E)
3,0.014865,(_22752)
4,0.013205,(_71053)
...,...,...
997,0.010077,"(_23344, _23203)"
998,0.010039,"(_23344, _22086)"
999,0.011853,"(_23293, _23295)"
1000,0.010077,"(_23293, _23296)"


In [30]:
rules = association_rules(is_fp, metric="lift", min_threshold=10)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(_21755),(_21754),0.024363,0.030386,0.011467,0.470681,15.490025,0.010727,1.831815
1,(_21754),(_21755),0.030386,0.024363,0.011467,0.377382,15.490025,0.010727,1.566993
2,(_22745),(_22748),0.016448,0.016988,0.012124,0.737089,43.387751,0.011844,3.738955
3,(_22748),(_22745),0.016988,0.016448,0.012124,0.713636,43.387751,0.011844,3.434626
4,(_22727),(_22726),0.041737,0.038726,0.024942,0.597595,15.431412,0.023326,2.388821


In [31]:
rules.shape

(280, 9)

### 8.  Making Predictions

In [32]:
def predict(antecedent, rules, max_results=6):
    """Return the stock codes recommended for ``antecedent``.

    Parameters
    ----------
    antecedent : str or iterable of str
        The product(s) to look up. A set such as ``{'_20712'}`` works as
        before; a bare stock-code string or a list/tuple of codes is also
        accepted and is converted to a frozenset before matching.
    rules : pandas.DataFrame
        Association rules with ``'antecedents'`` and ``'consequents'``
        columns whose values are sets of stock codes.
    max_results : int, default 6
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        One item taken from the consequent of each matching rule, in the
        row order of ``rules`` (no re-ranking is applied), truncated to
        ``max_results`` entries and re-indexed from 0. Empty when no rule
        has exactly this antecedent.
    """
    # A bare string is a single item, not an iterable of characters
    if isinstance(antecedent, str):
        antecedent = [antecedent]

    # Normalising to a frozenset lets lists and tuples match too; passed
    # raw, a list would be compared element-wise against the column
    wanted = frozenset(antecedent)

    # Rules whose antecedent is exactly the requested item set
    matches = rules[rules['antecedents'] == wanted]

    # Take the first item of each consequent set (for single-item
    # consequents this is the recommended stock code itself)
    recommended = matches['consequents'].map(lambda items: next(iter(items)))

    # Positional slice, then a clean 0..n-1 index for the caller
    return recommended.iloc[:max_results].reset_index(drop=True)

In [33]:
preds = predict({'_20712'}, rules)
preds

0    _22379
1    _20713
2    _21930
3    _21931
4    _22386
5    _21928
Name: consequents, dtype: object

In [34]:
print(products['_20712'])

JUMBO BAG WOODLAND ANIMALS


In [35]:
for stockid in preds:
    # The cleaning cells dropped some stock codes from `products`, so fall
    # back to the code itself instead of raising KeyError
    print(products.get(stockid, stockid))

RECYCLING BAG RETROSPOT 
JUMBO BAG OWLS
JUMBO STORAGE BAG SKULLS
JUMBO STORAGE BAG SUKI
JUMBO BAG PINK POLKADOT
JUMBO BAG SCANDINAVIAN PAISLEY


Try another product.

In [36]:
print(products['_22112'])

CHOCOLATE HOT WATER BOTTLE


In [37]:
# get the predictions
preds = predict({'_22112'}, rules)

# Display the descriptions of the predictions
for stockid in preds:
    # The cleaning cells dropped some stock codes from `products`, so fall
    # back to the code itself instead of raising KeyError
    print(products.get(stockid, stockid))

HOT WATER BOTTLE TEA AND SYMPATHY
SCOTTIE DOG HOT WATER BOTTLE
HOT WATER BOTTLE I AM SO POORLY
HOT WATER BOTTLE KEEP CALM
