In [2]:
# import libraries
import mlxtend
import pandas as pd
import plotly.express as px
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [3]:
toy_dataset = [['Skirt', 'Sneakers', 'Scarf', 'Pants', 'Hat'],

        ['Sunglasses', 'Skirt', 'Sneakers', 'Pants', 'Hat'],

        ['Dress', 'Sandals', 'Scarf', 'Pants', 'Heels'],

        ['Dress', 'Necklace', 'Earrings', 'Scarf', 'Hat', 'Heels', 'Hat'],

      ['Earrings', 'Skirt', 'Skirt', 'Scarf', 'Shirt', 'Pants']]

In [4]:
# transform dataset into a one-hot-encoded dataframe
te = TransactionEncoder()
te_ary = te.fit(toy_dataset).transform(toy_dataset) #apply one-hot-encoding
toy_df = pd.DataFrame(te_ary, columns=te.columns_) # create a new dataframe using numpy
toy_df

Unnamed: 0,Dress,Earrings,Hat,Heels,Necklace,Pants,Sandals,Scarf,Shirt,Skirt,Sneakers,Sunglasses
0,False,False,True,False,False,True,False,True,False,True,True,False
1,False,False,True,False,False,True,False,False,False,True,True,True
2,True,False,False,True,False,True,True,True,False,False,False,False
3,True,True,True,True,True,False,False,True,False,False,False,False
4,False,True,False,False,False,True,False,True,True,True,False,False


`support`

In [38]:
apriori(toy_df, min_support=0.6)

Unnamed: 0,support,itemsets
0,0.6,(2)
1,0.8,(5)
2,0.8,(7)
3,0.6,(9)
4,0.6,"(5, 7)"
5,0.6,"(9, 5)"


In [None]:
# select items with a minimum of 60% support
# apply use_colnames to display column names on the output
frequent_items = apriori(toy_df, min_support=0.6, use_colnames=True)

`confidence`

In [6]:
# items that are associated to go together
association_rules(frequent_items, metric="confidence", min_threshold=0.7)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Scarf),(Pants),0.8,0.8,0.6,0.75,0.9375,1.0,-0.04,0.8,-0.25,0.6,-0.25,0.75
1,(Pants),(Scarf),0.8,0.8,0.6,0.75,0.9375,1.0,-0.04,0.8,-0.25,0.6,-0.25,0.75
2,(Skirt),(Pants),0.6,0.8,0.6,1.0,1.25,1.0,0.12,inf,0.5,0.75,1.0,0.875
3,(Pants),(Skirt),0.8,0.6,0.6,0.75,1.25,1.0,0.12,1.6,1.0,0.75,0.375,0.875


`lift`

In [7]:
# probability of buying two associated items, X and Y, together instead of just one, X
association_rules(frequent_items, metric="lift", min_threshold=1.25)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Skirt),(Pants),0.6,0.8,0.6,1.0,1.25,1.0,0.12,inf,0.5,0.75,1.0,0.875


### Apriori on Checkpoint Dataset

In [28]:
df = pd.read_csv('Market_Basket_Optimisation.csv', header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [9]:
# general info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   shrimp             7500 non-null   object 
 1   almonds            5746 non-null   object 
 2   avocado            4388 non-null   object 
 3   vegetables mix     3344 non-null   object 
 4   green grapes       2528 non-null   object 
 5   whole weat flour   1863 non-null   object 
 6   yams               1368 non-null   object 
 7   cottage cheese     980 non-null    object 
 8   energy drink       653 non-null    object 
 9   tomato juice       394 non-null    object 
 10  low fat yogurt     255 non-null    object 
 11  green tea          153 non-null    object 
 12  honey              86 non-null     object 
 13  salad              46 non-null     object 
 14  mineral water      24 non-null     object 
 15  salmon             7 non-null      object 
 16  antioxydant juice  3 non

In [10]:
# count the frequency of each item
items = df.values.flatten() # flatten the data into creating a single list of all items

# count the frequency of each item
item_counts = pd.Series(items).value_counts()

In [11]:
# bar plot of item frequency
fig = px.bar(item_counts, x=item_counts.index, y=item_counts.values, labels={'x': 'Items', 'y': 'Frequency'},
             title="Item Frequency in Dataset", text=item_counts.values)

fig.update_layout(xaxis_tickangle=-45)
fig.show()


*scattermapbox* is deprecated! Use *scattermap* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/



In [30]:
# Assuming each row is a string of items separated by tab or comma:
# For tab-separated values, use '\t'
transactions = df[0].apply(lambda x: [item.strip() for item in x.split('\t') if item.strip() != '']).tolist()

In [31]:
# transform dataset into one-hot- encoded dataframe
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions) # apply one-hot-encoding to dataset
market_df = pd.DataFrame(te_ary, columns=te.columns_) #create a new dataframe
market_df.head()

Unnamed: 0,almonds,antioxydant juice,asparagus,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,body spray,...,tomatoes,toothpaste,turkey,vegetables mix,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [40]:
market_df.shape

(7501, 115)

In [39]:
# checking for existing frequent appearing items in the dataset
print(market_df.sum().sort_values(ascending=False).head(10))


mineral water        577
burgers              576
turkey               458
chocolate            391
frozen vegetables    373
spaghetti            354
shrimp               325
grated cheese        293
eggs                 279
cookies              270
dtype: int64


`support`

In [45]:
apriori(market_df, min_support=0.03)

Unnamed: 0,support,itemsets
0,0.07679,(14)
1,0.052126,(24)
2,0.035995,(29)
3,0.037195,(36)
4,0.032529,(42)
5,0.049727,(48)
6,0.039061,(50)
7,0.030929,(58)
8,0.076923,(71)
9,0.043328,(95)


In [46]:
#select itemsets with a minimum of 60% support and apply use_colnames to add column names to the output
frequent_itemsets = apriori(market_df, min_support=0.03, use_colnames=True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.07679,(burgers)
1,0.052126,(chocolate)
2,0.035995,(cookies)
3,0.037195,(eggs)
4,0.032529,(french fries)
5,0.049727,(frozen vegetables)
6,0.039061,(grated cheese)
7,0.030929,(herb & pepper)
8,0.076923,(mineral water)
9,0.043328,(shrimp)


`confidence`

In [51]:
association_rules(frequent_itemsets, metric="confidence", min_threshold=0.00) # associate items

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski


In [54]:
# checking the number of each frequent item set to see the number of pairs
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
print(frequent_itemsets.head(10))

    support             itemsets  length
0  0.076790            (burgers)       1
1  0.052126          (chocolate)       1
2  0.035995            (cookies)       1
3  0.037195               (eggs)       1
4  0.032529       (french fries)       1
5  0.049727  (frozen vegetables)       1
6  0.039061      (grated cheese)       1
7  0.030929      (herb & pepper)       1
8  0.076923      (mineral water)       1
9  0.043328             (shrimp)       1


`lift`

In [55]:
association_rules(frequent_itemsets, metric="lift", min_threshold=0.00)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski


The length of the frequent items indicate that the dataset only produces single item frequent itemsets, this prevents association rules mining from yielding as it needs atleast two items to generate antecendents and consequents