In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Mlxtend library
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder

# pyECLAT library
from pyECLAT import ECLAT

import ast

In [3]:
# Invoice-level baskets; the purchased goods are stored as stringified Python lists.
baskets = pd.read_csv("customer_basket.csv")
# Cluster assignments, joined to the baskets on "customer_id" further down.
clusters = pd.read_csv("Clusters.csv")

In [4]:
# Inspect column names, dtypes and non-null counts of the raw baskets table.
baskets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89952 entries, 0 to 89951
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   invoice_id     89952 non-null  int64 
 1   list_of_goods  89952 non-null  object
 2   customer_id    89952 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 2.1+ MB


In [5]:
# Summary statistics for every column, non-numeric ones included (include="all").
baskets.describe(include="all")

Unnamed: 0,invoice_id,list_of_goods,customer_id
count,89952.0,89952,89952.0
unique,,88767,
top,,"['babies food', 'cooking oil']",
freq,,25,
mean,6126259.0,,21855.193915
std,3527265.0,,12610.661213
min,20066.0,,1.0
25%,3085110.0,,10814.0
50%,6133909.0,,21904.0
75%,9185876.0,,32771.0


In [6]:
# Number of distinct customer ids present in the baskets table.
baskets["customer_id"].nunique()

28516

In [7]:
# Attach each invoice's cluster columns via the shared "customer_id" key.
# Inner join (the pandas default): invoices without a matching cluster row are dropped.
basket = pd.merge(left=baskets, right=clusters, on="customer_id", how="inner")

In [8]:
# Split the merged baskets into one DataFrame per k-means cluster label (0 through 9).
(
    basket_cluster_0,
    basket_cluster_1,
    basket_cluster_2,
    basket_cluster_3,
    basket_cluster_4,
    basket_cluster_5,
    basket_cluster_6,
    basket_cluster_7,
    basket_cluster_8,
    basket_cluster_9,
) = (basket[basket["kmeans_cluster"] == cluster_id] for cluster_id in range(10))

# Cluster 0 (basket analysis)

In [9]:
# Parse each stringified basket (e.g. "['babies food', 'cooking oil']") into a real list.
# The column is selected by name instead of by position (previously `.iloc[i, 1]`), so the
# cell no longer depends on the column order produced by the merge, and the Series is
# iterated directly rather than doing one `.iloc` lookup per row.
baskets_0 = [
    ast.literal_eval(goods) for goods in basket_cluster_0["list_of_goods"]
]

In [10]:
# One-hot encode the cluster-0 baskets: one row per basket, one column per distinct item.
te = TransactionEncoder()
te_fit = te.fit_transform(baskets_0)
basket_items_0 = pd.DataFrame(data=te_fit, columns=te.columns_)

In [526]:
# Frequent itemsets for cluster 0: keep itemsets whose support is at least 0.02.
frequent_itemsets_0 = apriori(basket_items_0, min_support=0.02, use_colnames=True)

# Association rules built from those itemsets, filtered to confidence >= 0.175.
rules_0 = association_rules(
    frequent_itemsets_0,
    metric="confidence",
    min_threshold=0.175,
)

In [527]:
# Rank the cluster-0 rules by lift (highest first) and display up to 150 of them.
rules_0.sort_values("lift", ascending=False).head(n=150)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
369,"(dessert wine, beer)",(white wine),0.033245,0.170373,0.028981,0.87175,5.116706,0.023317,6.468846,0.832229
366,(white wine),"(cider, beer)",0.170373,0.043213,0.037163,0.218126,5.047738,0.029801,1.223711,0.966569
363,"(cider, beer)",(white wine),0.043213,0.170373,0.037163,0.86,5.047738,0.029801,5.925905,0.838108
748,"(white wine, champagne)",(dessert wine),0.048802,0.098755,0.024314,0.498229,5.045078,0.019495,1.796127,0.842923
749,(dessert wine),"(white wine, champagne)",0.098755,0.048802,0.024314,0.246208,5.045078,0.019495,1.261884,0.889644
747,"(dessert wine, champagne)",(white wine),0.02829,0.170373,0.024314,0.85947,5.04463,0.019495,5.903575,0.825112
370,"(white wine, beer)",(dessert wine),0.058251,0.098755,0.028981,0.497527,5.037971,0.023229,1.793619,0.851084
371,(dessert wine),"(white wine, beer)",0.098755,0.058251,0.028981,0.293466,5.037971,0.023229,1.332913,0.889334
373,"(bramble, cider)",(white wine),0.031113,0.170373,0.026561,0.853704,5.010782,0.021261,5.670866,0.826134
759,"(french wine, white wine)",(cider),0.035377,0.128774,0.022643,0.640065,4.970457,0.018088,2.42051,0.828107


# Cluster 1 (basket analysis)

In [33]:
# Parse each stringified basket (e.g. "['babies food', 'cooking oil']") into a real list.
# The column is selected by name instead of by position (previously `.iloc[i, 1]`), so the
# cell no longer depends on the column order produced by the merge, and the Series is
# iterated directly rather than doing one `.iloc` lookup per row.
baskets_1 = [
    ast.literal_eval(goods) for goods in basket_cluster_1["list_of_goods"]
]

In [34]:
# One-hot encode the cluster-1 baskets: one row per basket, one column per distinct item.
te = TransactionEncoder()
te_fit = te.fit_transform(baskets_1)
basket_items_1 = pd.DataFrame(data=te_fit, columns=te.columns_)

In [151]:
# Frequent itemsets for cluster 1: keep itemsets whose support is at least 0.08.
frequent_itemsets_1 = apriori(basket_items_1, min_support=0.08, use_colnames=True)

# Association rules built from those itemsets, filtered to confidence >= 0.20.
rules_1 = association_rules(
    frequent_itemsets_1,
    metric="confidence",
    min_threshold=0.20,
)

In [515]:
# Rank the cluster-1 rules by lift (highest first) and display up to 150 of them.
rules_1.sort_values("lift", ascending=False).head(n=150)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
141,"(tomatoes, melons)","(carrots, asparagus)",0.179079,0.347484,0.083684,0.467302,1.344817,0.021457,1.224928,0.312338
137,"(carrots, asparagus)","(tomatoes, melons)",0.347484,0.179079,0.083684,0.240829,1.344817,0.021457,1.081338,0.392947
139,"(carrots, melons)","(asparagus, tomatoes)",0.130711,0.480756,0.083684,0.640224,1.331702,0.020844,1.443241,0.286534
127,"(mashed potato, tomatoes)","(carrots, asparagus)",0.25276,0.347484,0.116133,0.459459,1.322246,0.028303,1.207155,0.326149
128,"(carrots, asparagus)","(mashed potato, tomatoes)",0.347484,0.25276,0.116133,0.334211,1.322246,0.028303,1.122338,0.373495
129,"(carrots, tomatoes)","(mashed potato, asparagus)",0.404697,0.217322,0.116133,0.286963,1.320449,0.028183,1.097668,0.407661
126,"(mashed potato, asparagus)","(carrots, tomatoes)",0.217322,0.404697,0.116133,0.534381,1.320449,0.028183,1.278521,0.310066
130,"(asparagus, tomatoes)","(mashed potato, carrots)",0.480756,0.183715,0.116133,0.241563,1.314883,0.027811,1.076273,0.461201
125,"(mashed potato, carrots)","(asparagus, tomatoes)",0.183715,0.480756,0.116133,0.632138,1.314883,0.027811,1.411518,0.293373
140,"(asparagus, melons)","(carrots, tomatoes)",0.157548,0.404697,0.083684,0.531165,1.312503,0.019925,1.269751,0.282624


# Cluster 2 (basket analysis)

In [153]:
# Parse each stringified basket (e.g. "['babies food', 'cooking oil']") into a real list.
# The column is selected by name instead of by position (previously `.iloc[i, 1]`), so the
# cell no longer depends on the column order produced by the merge, and the Series is
# iterated directly rather than doing one `.iloc` lookup per row.
baskets_2 = [
    ast.literal_eval(goods) for goods in basket_cluster_2["list_of_goods"]
]

In [154]:
# One-hot encode the cluster-2 baskets: one row per basket, one column per distinct item.
te = TransactionEncoder()
te_fit = te.fit_transform(baskets_2)
basket_items_2 = pd.DataFrame(data=te_fit, columns=te.columns_)

In [579]:
# Frequent itemsets for cluster 2: keep itemsets whose support is at least 0.035.
frequent_itemsets_2 = apriori(basket_items_2, min_support=0.035, use_colnames=True)

# Association rules built from those itemsets, filtered to confidence >= 0.15.
rules_2 = association_rules(
    frequent_itemsets_2,
    metric="confidence",
    min_threshold=0.15,
)

In [580]:
# Rank the cluster-2 rules by lift (highest first) and display up to 150 of them.
rules_2.sort_values("lift", ascending=False).head(n=150)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
671,(turkey),"(champagne, cottage cheese)",0.121418,0.190869,0.036911,0.304,1.592712,0.013736,1.162544,0.423569
670,"(champagne, cottage cheese)",(turkey),0.190869,0.121418,0.036911,0.193384,1.592712,0.013736,1.08922,0.459926
669,"(turkey, champagne)",(cottage cheese),0.101991,0.248179,0.036911,0.361905,1.458242,0.011599,1.178227,0.349933
435,"(spaghetti, airpods)",(iphone 8),0.12239,0.233123,0.041282,0.337302,1.446883,0.01275,1.157204,0.351932
436,(iphone 8),"(spaghetti, airpods)",0.233123,0.12239,0.041282,0.177083,1.446883,0.01275,1.066463,0.402749
1110,"(samsung galaxy 10, iphone 8)","(spaghetti, champagne)",0.124818,0.242351,0.043711,0.350195,1.444991,0.013461,1.165963,0.351874
1113,"(spaghetti, champagne)","(samsung galaxy 10, iphone 8)",0.242351,0.124818,0.043711,0.180361,1.444991,0.013461,1.067765,0.40646
1100,"(fromage blanc, samsung galaxy 10)","(spaghetti, champagne)",0.108305,0.242351,0.037397,0.345291,1.42476,0.011149,1.157232,0.334338
1103,"(spaghetti, champagne)","(fromage blanc, samsung galaxy 10)",0.242351,0.108305,0.037397,0.154309,1.42476,0.011149,1.054398,0.39349
260,(protein bar),(laptop),0.103934,0.256435,0.037882,0.364486,1.421357,0.01123,1.170021,0.330832


# Cluster 3 (basket analysis)

There are no transactions stored in this dataset for customers in cluster 3.

# Cluster 4 (basket analysis)

In [75]:
# Parse each stringified basket (e.g. "['babies food', 'cooking oil']") into a real list.
# The column is selected by name instead of by position (previously `.iloc[i, 1]`), so the
# cell no longer depends on the column order produced by the merge, and the Series is
# iterated directly rather than doing one `.iloc` lookup per row.
baskets_4 = [
    ast.literal_eval(goods) for goods in basket_cluster_4["list_of_goods"]
]

In [76]:
# One-hot encode the cluster-4 baskets: one row per basket, one column per distinct item.
te = TransactionEncoder()
te_fit = te.fit_transform(baskets_4)
basket_items_4 = pd.DataFrame(data=te_fit, columns=te.columns_)

In [494]:
# Frequent itemsets for cluster 4: keep itemsets whose support is at least 0.015.
frequent_itemsets_4 = apriori(basket_items_4, min_support=0.015, use_colnames=True)

# Association rules built from those itemsets, filtered to confidence >= 0.55.
rules_4 = association_rules(
    frequent_itemsets_4,
    metric="confidence",
    min_threshold=0.55,
)

In [495]:
# Rank the cluster-4 rules by lift (highest first) and display up to 150 of them.
rules_4.sort_values("lift", ascending=False).head(n=150)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
157,"(mashed potato, carrots)",(tomatoes),0.022795,0.110733,0.019122,0.838863,7.575543,0.016598,5.518686,0.888244
158,"(mashed potato, tomatoes)",(carrots),0.030303,0.084427,0.019122,0.631016,7.474088,0.016563,2.481335,0.893273
37,"(carrots, asparagus)",(tomatoes),0.0417,0.110733,0.034192,0.819948,7.404732,0.029575,4.938951,0.90259
42,"(mashed potato, asparagus)",(tomatoes),0.02771,0.110733,0.022147,0.79922,7.217544,0.019078,4.429068,0.886
44,"(asparagus, melons)",(tomatoes),0.019932,0.110733,0.015881,0.796748,7.195217,0.013674,4.375194,0.87853
36,"(mashed potato, asparagus)",(carrots),0.02771,0.084427,0.016367,0.590643,6.995892,0.014027,2.236614,0.881485
39,"(asparagus, tomatoes)",(carrots),0.059202,0.084427,0.034192,0.577555,6.840864,0.029194,2.167317,0.907548
43,"(mashed potato, tomatoes)",(asparagus),0.030303,0.122617,0.022147,0.730838,5.960352,0.018431,3.259683,0.858232
35,"(mashed potato, carrots)",(asparagus),0.022795,0.122617,0.016367,0.718009,5.855731,0.013572,3.111393,0.84857
38,"(carrots, tomatoes)",(asparagus),0.048074,0.122617,0.034192,0.711236,5.80049,0.028297,3.03841,0.869396


# Cluster 5 (basket analysis)

In [91]:
# Parse each stringified basket (e.g. "['babies food', 'cooking oil']") into a real list.
# The column is selected by name instead of by position (previously `.iloc[i, 1]`), so the
# cell no longer depends on the column order produced by the merge, and the Series is
# iterated directly rather than doing one `.iloc` lookup per row.
baskets_5 = [
    ast.literal_eval(goods) for goods in basket_cluster_5["list_of_goods"]
]

In [92]:
# One-hot encode the cluster-5 baskets: one row per basket, one column per distinct item.
te = TransactionEncoder()
te_fit = te.fit_transform(baskets_5)
basket_items_5 = pd.DataFrame(data=te_fit, columns=te.columns_)

In [562]:
# Frequent itemsets for cluster 5: keep itemsets whose support is at least 0.03.
frequent_itemsets_5 = apriori(basket_items_5, min_support=0.03, use_colnames=True)

# Association rules built from those itemsets, filtered to confidence >= 0.2.
rules_5 = association_rules(
    frequent_itemsets_5,
    metric="confidence",
    min_threshold=0.2,
)

In [563]:
# Rank the cluster-5 rules by lift (highest first) and display up to 150 of them.
rules_5.sort_values("lift", ascending=False).head(n=150)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
905,"(samsung galaxy 10, cottage cheese)","(airpods, champagne)",0.13584,0.27204,0.049478,0.364238,1.338913,0.012524,1.14502,0.292915
901,"(samsung galaxy 10, champagne, cottage cheese)",(airpods),0.110112,0.349046,0.049478,0.449346,1.287354,0.011044,1.182147,0.250833
916,"(grated cheese, samsung galaxy 10, champagne)",(airpods),0.077906,0.349046,0.034545,0.443418,1.27037,0.007352,1.169556,0.230809
934,"(samsung galaxy 10, iphone 8)","(airpods, champagne)",0.117308,0.27204,0.040482,0.345092,1.268533,0.00857,1.111545,0.239821
913,"(fromage blanc, samsung galaxy 10)","(airpods, champagne)",0.107773,0.27204,0.037064,0.343907,1.264175,0.007745,1.109537,0.234212
370,"(samsung galaxy 10, cottage cheese)",(airpods),0.13584,0.349046,0.059914,0.44106,1.263613,0.012499,1.164621,0.241412
958,"(bluetooth headphones, asparagus)","(samsung galaxy 10, champagne)",0.073048,0.389888,0.035984,0.492611,1.263466,0.007504,1.202453,0.224959
1027,"(samsung galaxy 10, iphone 8)","(champagne, cottage cheese)",0.117308,0.202951,0.030047,0.256135,1.262055,0.006239,1.071497,0.235237
908,"(samsung galaxy 10, airpods)","(champagne, cottage cheese)",0.193595,0.202951,0.049478,0.255576,1.259302,0.010188,1.070693,0.255342
906,"(champagne, cottage cheese)","(samsung galaxy 10, airpods)",0.202951,0.193595,0.049478,0.243794,1.259302,0.010188,1.066383,0.258339


# Cluster 6 (basket analysis)

In [97]:
# Parse each stringified basket (e.g. "['babies food', 'cooking oil']") into a real list.
# The column is selected by name instead of by position (previously `.iloc[i, 1]`), so the
# cell no longer depends on the column order produced by the merge, and the Series is
# iterated directly rather than doing one `.iloc` lookup per row.
baskets_6 = [
    ast.literal_eval(goods) for goods in basket_cluster_6["list_of_goods"]
]

In [98]:
# One-hot encode the cluster-6 baskets: one row per basket, one column per distinct item.
te = TransactionEncoder()
te_fit = te.fit_transform(baskets_6)
basket_items_6 = pd.DataFrame(data=te_fit, columns=te.columns_)

In [572]:
# Frequent itemsets for cluster 6: keep itemsets whose support is at least 0.01.
frequent_itemsets_6 = apriori(basket_items_6, min_support=0.01, use_colnames=True)

# Association rules built from those itemsets, filtered to confidence >= 0.15.
rules_6 = association_rules(
    frequent_itemsets_6,
    metric="confidence",
    min_threshold=0.15,
)

In [573]:
# Rank the cluster-6 rules by lift (highest first) and display up to 150 of them.
rules_6.sort_values("lift", ascending=False).head(n=150)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
5053,"(bluetooth headphones, turkey)","(spaghetti, airpods)",0.0549,0.107176,0.010868,0.197952,1.846977,0.004984,1.11318,0.485213
5055,"(turkey, airpods)","(bluetooth headphones, spaghetti)",0.051152,0.123103,0.010868,0.212454,1.725827,0.004571,1.113455,0.44324
7501,"(flax seed, samsung galaxy 10)","(phone car charger, champagne)",0.046468,0.13397,0.01068,0.229839,1.715593,0.004455,1.124478,0.437438
7079,"(samsung galaxy 10, protein bar)","(bluetooth headphones, spaghetti)",0.059771,0.123103,0.012179,0.203762,1.655215,0.004821,1.1013,0.421014
5054,"(turkey, spaghetti)","(bluetooth headphones, airpods)",0.044969,0.147648,0.010868,0.241667,1.63677,0.004228,1.12398,0.407359
7502,"(flax seed, champagne)","(phone car charger, samsung galaxy 10)",0.067266,0.097808,0.01068,0.158774,1.623331,0.004101,1.072474,0.411675
5050,"(bluetooth headphones, spaghetti, airpods)",(turkey),0.051152,0.13116,0.010868,0.212454,1.619812,0.004158,1.103225,0.403273
2712,"(shallot, champagne)",(chutney),0.068016,0.102117,0.011242,0.165289,1.618622,0.004297,1.075681,0.410082
3194,(vacuum cleaner),"(grated cheese, champagne)",0.048529,0.149147,0.011617,0.239382,1.605004,0.004379,1.118634,0.396175
5517,"(shallot, airpods)","(laptop, champagne)",0.03204,0.204047,0.010493,0.327485,1.604949,0.003955,1.183547,0.389404


# Cluster 7 (basket analysis)

There are no transactions stored in this dataset for customers in cluster 7.

# Cluster 8 (basket analysis)

In [103]:
# Parse each stringified basket (e.g. "['babies food', 'cooking oil']") into a real list.
# The column is selected by name instead of by position (previously `.iloc[i, 1]`), so the
# cell no longer depends on the column order produced by the merge, and the Series is
# iterated directly rather than doing one `.iloc` lookup per row.
baskets_8 = [
    ast.literal_eval(goods) for goods in basket_cluster_8["list_of_goods"]
]

In [104]:
# One-hot encode the cluster-8 baskets: one row per basket, one column per distinct item.
te = TransactionEncoder()
te_fit = te.fit_transform(baskets_8)
basket_items_8 = pd.DataFrame(data=te_fit, columns=te.columns_)

In [536]:
# Frequent itemsets for cluster 8: keep itemsets whose support is at least 0.015.
frequent_itemsets_8 = apriori(basket_items_8, min_support=0.015, use_colnames=True)

# Association rules built from those itemsets, filtered to confidence >= 0.55.
rules_8 = association_rules(
    frequent_itemsets_8,
    metric="confidence",
    min_threshold=0.55,
)

In [537]:
# Rank the cluster-8 rules by lift (highest first) and display up to 150 of them.
rules_8.sort_values("lift", ascending=False).head(n=150)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
1018,"(muffins, oil, french fries, cooking oil)",(cake),0.02655,0.44186,0.015795,0.594891,1.346331,0.004063,1.37775,0.264257
1028,"(cake, gums, napkins)","(oil, cooking oil)",0.036725,0.461822,0.022481,0.612137,1.325484,0.00552,1.387548,0.25492
1019,"(cake, muffins, french fries)","(oil, cooking oil)",0.025969,0.461822,0.015795,0.608209,1.316978,0.003802,1.373636,0.247103
712,"(muffins, french fries, cooking oil)",(cake),0.030814,0.44186,0.017829,0.578616,1.3095,0.004214,1.32454,0.243864
959,"(olive oil, muffins)","(oil, cooking oil)",0.037403,0.461822,0.022578,0.603627,1.307056,0.005304,1.357757,0.24405
880,"(gums, cookies)","(oil, cooking oil)",0.026647,0.461822,0.015988,0.6,1.299203,0.003682,1.345446,0.236602
754,"(cake, pasta)","(oil, cooking oil)",0.037209,0.461822,0.022287,0.598958,1.296947,0.005103,1.341951,0.237807
1038,"(soup, muffins, oil, cooking oil)",(cake),0.030329,0.44186,0.017345,0.571885,1.294266,0.003944,1.303714,0.234473
804,"(soup, muffins, oil)",(cake),0.045833,0.44186,0.026163,0.570825,1.291866,0.005911,1.300493,0.236778
743,"(soup, muffins, cooking oil)",(cake),0.033624,0.44186,0.019186,0.570605,1.29137,0.004329,1.299828,0.233479


# Cluster 9 (basket analysis)

In [255]:
# Parse each stringified basket (e.g. "['babies food', 'cooking oil']") into a real list.
# The column is selected by name instead of by position (previously `.iloc[i, 1]`), so the
# cell no longer depends on the column order produced by the merge, and the Series is
# iterated directly rather than doing one `.iloc` lookup per row.
baskets_9 = [
    ast.literal_eval(goods) for goods in basket_cluster_9["list_of_goods"]
]

In [256]:
# One-hot encode the cluster-9 baskets: one row per basket, one column per distinct item.
te = TransactionEncoder()
te_fit = te.fit_transform(baskets_9)
basket_items_9 = pd.DataFrame(data=te_fit, columns=te.columns_)

In [601]:
# Frequent itemsets for cluster 9: keep itemsets whose support is at least 0.075.
frequent_itemsets_9 = apriori(basket_items_9, min_support=0.075, use_colnames=True)

# Association rules built from those itemsets, filtered to confidence >= 0.60.
rules_9 = association_rules(
    frequent_itemsets_9,
    metric="confidence",
    min_threshold=0.60,
)

In [602]:
# Rank the cluster-9 rules by lift (highest first) and display up to 150 of them.
rules_9.sort_values("lift", ascending=False).head(n=150)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
50,"(cake, muffins)",(cooking oil),0.115235,0.544332,0.075274,0.653221,1.200042,0.012548,1.314001,0.188407
45,"(babies food, napkins)",(cooking oil),0.178507,0.544332,0.115582,0.647493,1.189519,0.018415,1.292651,0.193945
49,"(cake, gums)",(cooking oil),0.132788,0.544332,0.085958,0.647335,1.189229,0.013678,1.292072,0.183484
38,"(babies food, gums)",(cooking oil),0.233037,0.544332,0.150132,0.644239,1.183541,0.023282,1.280828,0.202198
36,"(babies food, fresh bread)",(cooking oil),0.140905,0.544332,0.090676,0.643525,1.18223,0.013977,1.278262,0.179422
51,"(candy bars, cake, babies food)",(cooking oil),0.13473,0.544332,0.086166,0.639547,1.174921,0.012828,1.264154,0.172061
42,"(babies food, muffins)",(cooking oil),0.202581,0.544332,0.129527,0.639384,1.174621,0.019256,1.263581,0.186428
40,"(babies food, ketchup)",(cooking oil),0.12134,0.544332,0.0768,0.632933,1.162771,0.010751,1.241376,0.159317
52,"(candy bars, cake, cooking oil)",(babies food),0.100735,0.736714,0.086166,0.855372,1.161063,0.011953,1.820433,0.15426
35,"(french fries, cooking oil)",(babies food),0.094283,0.736714,0.080547,0.854305,1.159615,0.011087,1.807097,0.151973
