In [9]:
#importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [5]:
# Load the cleaned market-basket CSV; the first column of the file becomes the row index.
marketbasketdf = pd.read_csv(
    'data/clean_basketdf.csv',
    index_col=0,
    decimal='.',
)

In [6]:
# Preview the first rows to confirm the columns loaded as expected.
marketbasketdf.head()

Unnamed: 0,BasketID,BasketDate,Sale,CustomerID,CustomerCountry,ProdID,ProdDescr,Qta,Amount
0,536365,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,15.3
1,536365,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,71053,WHITE METAL LANTERN,6,20.34
2,536365,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,84406B,CREAM CUPID HEARTS COAT HANGER,8,22.0
3,536365,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,20.34
4,536365,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,20.34


In [7]:
# Collect the product descriptions of every basket into one list per BasketID.
descr_by_basket = marketbasketdf.groupby(['BasketID'])['ProdDescr']
transactions = descr_by_basket.apply(list)

# One list of product descriptions per basket, without the BasketID index.
baskets = transactions.values

basket_count = len(baskets)
print("NUM OF BASKETS: {}".format(basket_count))

NUM OF BASKETS: 18532


In [13]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [17]:
# Reshape to a wide basket/product table: one row per BasketID, one column
# per ProdDescr, each cell holding the summed Qta. Combinations that do not
# occur in the data are filled with 0 instead of NaN.
basket = marketbasketdf.pivot_table(
    index='BasketID',
    columns='ProdDescr',
    values='Qta',
    aggfunc='sum',
    fill_value=0,
)

In [18]:
# Preview the pivoted basket/product quantity table.
basket.head()

ProdDescr,4 PURPLE FLOCK DINNER CANDLES,50'S CHRISTMAS GIFT BAG LARGE,DOLLY GIRL BEAKER,I LOVE LONDON MINI BACKPACK,I LOVE LONDON MINI RUCKSACK,NINE DRAWER OFFICE TIDY,OVAL WALL MIRROR DIAMANTE,RED SPOT GIFT BAG LARGE,SET 2 TEA TOWELS I LOVE LONDON,SPACEBOY BABY GIFT SET,...,ZINC STAR T-LIGHT HOLDER,ZINC SWEETHEART SOAP DISH,ZINC SWEETHEART WIRE LETTER RACK,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS LARGE,ZINC T-LIGHT HOLDER STARS SMALL,ZINC TOP 2 DOOR WOODEN SHELF,ZINC WILLIE WINKIE CANDLE STICK,ZINC WIRE KITCHEN ORGANISER,ZINC WIRE SWEETHEART LETTER TRAY
BasketID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536365,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536366,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536367,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536368,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536369,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Summed quantities for one product before binarisation, kept as a reference
# to compare against the 0/1 values produced after the conversion.
basket['10 COLOUR SPACEBOY PEN'].head(10)

BasketID
536365    0
536366    0
536367    0
536368    0
536369    0
536370    0
536371    0
536372    0
536373    0
536374    0
Name: 10 COLOUR SPACEBOY PEN, dtype: int64

In [20]:
# The mining step only needs to know whether an item is present in a basket,
# not how many units were bought, so quantities are collapsed to 0/1 flags.

def convert_into_binary(x):
    """Return 1 when the quantity ``x`` is greater than zero, otherwise 0."""
    return 1 if x > 0 else 0

In [21]:
# Binarise the quantities: 1 where the summed quantity is positive, else 0.
# A vectorised comparison yields the same 0/1 table as applying
# convert_into_binary to every cell, but avoids one Python call per cell and
# the DataFrame.applymap API that recent pandas releases deprecate.
basket_sets = (basket > 0).astype('int64')

In [22]:
# Sanity check: the column should now hold only 0/1 presence flags.
basket_sets['10 COLOUR SPACEBOY PEN'].head(10)

BasketID
536365    0
536366    0
536367    0
536368    0
536369    0
536370    0
536371    0
536372    0
536373    0
536374    0
Name: 10 COLOUR SPACEBOY PEN, dtype: int64

In [23]:
# Exclude POSTAGE from the item matrix: it is treated as a non-product entry
# (originally noted as appearing on most transactions).
# The membership guard and errors='ignore' keep this cell re-runnable; the
# previous in-place drop raised KeyError when the cell was executed twice.
if 'POSTAGE' in basket_sets.columns:
    print(basket_sets['POSTAGE'].head())

basket_sets = basket_sets.drop(columns=['POSTAGE'], errors='ignore')

BasketID
536365    0
536366    0
536367    0
536368    0
536369    0
Name: POSTAGE, dtype: int64


In [28]:
# Mine frequent itemsets with apriori. min_support is a fraction of all
# transactions, so 0.02 keeps itemsets that occur in at least 2% of baskets.
# NOTE(review): this comment previously said 7% while the code passes 0.02,
# and the frequent_itemsets output displayed after this cell only lists
# supports >= 0.074 -- confirm the intended threshold and re-run the cells.
frequent_itemsets = apriori(basket_sets, min_support=0.02, use_colnames=True)



In [25]:
# Display the frequent itemsets returned by apriori together with their support.
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.074196,(ASSORTED COLOUR BIRD ORNAMENT)
1,0.086337,(JUMBO BAG RED RETROSPOT)
2,0.074412,(PARTY BUNTING)
3,0.091895,(REGENCY CAKESTAND 3 TIER)
4,0.106357,(WHITE HANGING HEART T-LIGHT HOLDER)


In [26]:
# Derive association rules from the frequent itemsets.
# Rules are selected on the lift metric with a minimum threshold of 1.

rules_mlxtend = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules_mlxtend.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction


In [27]:
# Optional renaming, left disabled:
# rules_mlxtend.rename(columns={'antecedents':'lhs','consequents':'rhs'})

# Depending on the business use case, keep only the strong rules by
# filtering on both lift and confidence.
has_high_lift = rules_mlxtend['lift'] >= 4
has_high_confidence = rules_mlxtend['confidence'] >= 0.8
rules_mlxtend[has_high_lift & has_high_confidence].head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
