In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori, association_rules


In [2]:
df = pd.read_csv('ecom-data.csv', encoding= 'unicode_escape', parse_dates =['InvoiceDate'])

In [3]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [4]:
df.shape

(541909, 8)

In [5]:
df = df[df['Quantity']>0]

In [6]:
df = df.dropna(subset=['Description'])

In [7]:
df[df['InvoiceNo'].isna()]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country


In [8]:
df.shape

(530693, 8)

In [9]:
df.describe(include = 'O')

Unnamed: 0,InvoiceNo,StockCode,Description,Country
count,530693,530693,530693,530693
unique,20136,3925,4077,38
top,573585,85123A,WHITE HANGING HEART T-LIGHT HOLDER,United Kingdom
freq,1114,2270,2327,485694


In [10]:
# Transaction by Country (Top 10)
df['Country'].value_counts().head(10)

United Kingdom    485694
Germany             9042
France              8408
EIRE                7894
Spain               2485
Netherlands         2363
Belgium             2031
Switzerland         1967
Portugal            1501
Australia           1185
Name: Country, dtype: int64

In [11]:
# Stripping extra spaces in the description
df['Description'] = df['Description'].str.strip()

In [12]:
# Drop rows with NaNs in the Invoice number and convert them into strings for categorical treatment
df = df[~df['InvoiceNo'].isna()]
df['InvoiceNo'] = df['InvoiceNo'].astype('str')

In [13]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [14]:
# Transactions From germany
basket_de = df[df['Country'] =="Germany"]
basket_de = basket_de.groupby(['InvoiceNo', 'Description'])['Quantity'].sum()
basket_de = basket_de.unstack().reset_index().fillna(0)
basket_de = basket_de.set_index('InvoiceNo')
basket_de.head()
     



Description,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 IVORY ROSE PEG PLACE SETTINGS,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE POSY,12 PENCILS TALL TUBE RED RETROSPOT,12 PENCILS TALL TUBE SKULLS,...,YULETIDE IMAGES GIFT WRAP SET,ZINC HEART T-LIGHT HOLDER,ZINC STAR T-LIGHT HOLDER,ZINC BOX SIGN HOME,ZINC FOLKART SLEIGH BELLS,ZINC HEART LATTICE T-LIGHT HOLDER,ZINC METAL HEART DECORATION,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS SMALL,ZINC WILLIE WINKIE CANDLE STICK
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536840,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536861,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536967,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536983,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Transactions done in France
basket_fr = df[df['Country'] =="France"]
basket_fr = basket_fr.groupby(['InvoiceNo', 'Description'])['Quantity'].sum()
basket_fr = basket_fr.unstack().reset_index().fillna(0)
basket_fr = basket_fr.set_index('InvoiceNo')
basket_fr.head()   

Description,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 EGG HOUSE PAINTED WOOD,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE POSY,12 PENCILS TALL TUBE RED RETROSPOT,12 PENCILS TALL TUBE WOODLAND,...,WRAP VINTAGE PETALS DESIGN,YELLOW COAT RACK PARIS FASHION,YELLOW GIANT GARDEN THERMOMETER,YELLOW SHARK HELICOPTER,ZINC STAR T-LIGHT HOLDER,ZINC FOLKART SLEIGH BELLS,ZINC HERB GARDEN CONTAINER,ZINC METAL HEART DECORATION,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS SMALL
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536370,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536852,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536974,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## France

In [16]:
# One hot encoding function to binarize the data for Libraries

basket_fr = (basket_fr>0).astype(int)
basket_de = (basket_de>0).astype(int)
     


In [17]:
# Calculate Association Rules
frq_items_fr = apriori(basket_fr, min_support = 0.05, use_colnames = True)

# Collecting the inferred rules in a dataframe
rules_fr = association_rules(frq_items_fr, metric ="lift", min_threshold = 1)
rules_fr = rules_fr.sort_values(['confidence', 'lift'], ascending =[False, False])
rules_fr.sample(12)



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1,(4 TRADITIONAL SPINNING TOPS),(POSTAGE),0.071429,0.765306,0.056122,0.785714,1.026667,0.001458,1.095238
269,(POSTAGE),"(PLASTERS IN TIN WOODLAND ANIMALS, PLASTERS IN...",0.765306,0.104592,0.084184,0.11,1.051707,0.004139,1.006077
123,(RABBIT NIGHT LIGHT),(POSTAGE),0.188776,0.765306,0.165816,0.878378,1.147748,0.021345,1.929705
42,(JUMBO BAG SPACEBOY DESIGN),(POSTAGE),0.063776,0.765306,0.056122,0.88,1.149867,0.007315,1.955782
316,(POSTAGE),"(ALARM CLOCK BAKELIKE RED, ALARM CLOCK BAKELIK...",0.765306,0.063776,0.056122,0.073333,1.149867,0.007315,1.010314
115,(POSTAGE),(PLASTERS IN TIN SPACEBOY),0.765306,0.137755,0.114796,0.15,1.088889,0.009371,1.014406
26,(CIRCUS PARADE CHILDRENS EGG CUP),(POSTAGE),0.056122,0.765306,0.05102,0.909091,1.187879,0.00807,2.581633
164,(RED TOADSTOOL LED NIGHT LIGHT),(RABBIT NIGHT LIGHT),0.181122,0.188776,0.053571,0.295775,1.566806,0.01938,1.151939
225,(LUNCH BAG RED RETROSPOT),"(LUNCH BAG WOODLAND, POSTAGE)",0.153061,0.102041,0.05102,0.333333,3.266667,0.035402,1.346939
127,(POSTAGE),(RED RETROSPOT MINI CASES),0.765306,0.137755,0.114796,0.15,1.088889,0.009371,1.014406


In [18]:
# Calculate Association Rules
frq_items_de = apriori(basket_de, min_support = 0.05, use_colnames = True)

# Collecting the inferred rules in a dataframe
rules_de = association_rules(frq_items_de, metric ="lift", min_threshold = 1)
rules_de = rules_de.sort_values(['confidence', 'lift'], ascending =[False, False])
rules_de.sample(12)



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
33,(POSTAGE),(PLASTERS IN TIN SPACEBOY),0.818381,0.107221,0.100656,0.122995,1.147113,0.012909,1.017986
112,(ROUND SNACK BOXES SET OF4 WOODLAND),"(SPACEBOY LUNCH BOX, POSTAGE)",0.245077,0.091904,0.061269,0.25,2.720238,0.038746,1.210795
92,"(ROUND SNACK BOXES SET OF4 WOODLAND, POSTAGE)",(PLASTERS IN TIN WOODLAND ANIMALS),0.225383,0.137856,0.065646,0.291262,2.112806,0.034575,1.21645
66,(RED RETROSPOT CHARLOTTE BAG),(WOODLAND CHARLOTTE BAG),0.070022,0.126915,0.059081,0.84375,6.648168,0.050194,5.587746
83,(POSTAGE),"(PLASTERS IN TIN CIRCUS PARADE, ROUND SNACK BO...",0.818381,0.056893,0.050328,0.061497,1.080934,0.003768,1.004906
94,(ROUND SNACK BOXES SET OF4 WOODLAND),"(PLASTERS IN TIN WOODLAND ANIMALS, POSTAGE)",0.245077,0.118162,0.065646,0.267857,2.266865,0.036687,1.204462
89,(POSTAGE),"(PLASTERS IN TIN WOODLAND ANIMALS, PLASTERS IN...",0.818381,0.061269,0.054705,0.066845,1.091005,0.004563,1.005975
100,(POSTAGE),"(WOODLAND CHARLOTTE BAG, RED RETROSPOT CHARLOT...",0.818381,0.059081,0.054705,0.066845,1.131412,0.006354,1.00832
115,"(ROUND SNACK BOXES SET OF4 WOODLAND, WOODLAND ...",(POSTAGE),0.063457,0.818381,0.059081,0.931034,1.137654,0.007149,2.633479
116,"(POSTAGE, WOODLAND CHARLOTTE BAG)",(ROUND SNACK BOXES SET OF4 WOODLAND),0.115974,0.245077,0.059081,0.509434,2.078673,0.030659,1.538882
