In [1]:
import pandas as pd
import re
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Load the dataset
data = pd.read_csv("french_bakery_dataset.csv")

# Data Cleaning: Remove rows with missing values
data = data.dropna()

def preprocess_items(item):
    # Convert item names to lowercase
    item = item.lower()
    
    # Remove special characters (e.g., punctuations)
    special_characters = r"[!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~]"
    item = re.sub(special_characters, "", item)
    
    # Remove leading/trailing whitespaces
    item = item.strip()
    
    return item

data['article'] = data['article'].apply(preprocess_items)

print(data)

        Unnamed: 0       date   time  ticket_number               article   
0                0   1/2/2021   8:38         150040              baguette  \
1                1   1/2/2021   8:38         150040      pain au chocolat   
2                4   1/2/2021   9:14         150041      pain au chocolat   
3                5   1/2/2021   9:14         150041                  pain   
4                8   1/2/2021   9:25         150042  traditional baguette   
...            ...        ...    ...            ...                   ...   
234000      511387  9/30/2022  18:52         288911                 coupe   
234001      511388  9/30/2022  18:52         288911            boule 200g   
234002      511389  9/30/2022  18:52         288911                 coupe   
234003      511392  9/30/2022  18:55         288912  traditional baguette   
234004      511395  9/30/2022  18:56         288913  traditional baguette   

        Quantity unit_price  
0              1     0,90 €  
1              

In [2]:
# Convert the data to a transaction format where each row represents a set of items (articles) for a single transaction
transactions = data.groupby(['ticket_number'])['article'].apply(list)

print(transactions)

ticket_number
150040                         [baguette, pain au chocolat]
150041                             [pain au chocolat, pain]
150042                               [traditional baguette]
150043                                [baguette, croissant]
150044                                            [banette]
                                ...                        
288908                                    [cereal baguette]
288910                               [traditional baguette]
288911    [campagne, traditional baguette, coupe, boule ...
288912                               [traditional baguette]
288913                               [traditional baguette]
Name: article, Length: 136451, dtype: object


In [3]:
singleItemTransactions = 0
multiItemTransactions = 0

# counter of single-item transactions
for transaction in transactions:
    if (len(transaction) == 1):
        singleItemTransactions += 1
    else:
        multiItemTransactions += 1

print('Single Item Transactions: ' + str(singleItemTransactions))
print('Multiple Item Transactions: ' + str(multiItemTransactions))

Single Item Transactions: 77797
Multiple Item Transactions: 58654


In [4]:
# Create a dictionary to map each unique item to a unique integer identifier
item_to_int = {}
for transaction in transactions:
    for item in transaction:
        if item not in item_to_int:
            item_to_int[item] = len(item_to_int)

print(item_to_int)

{'baguette': 0, 'pain au chocolat': 1, 'pain': 2, 'traditional baguette': 3, 'croissant': 4, 'banette': 5, 'banettine': 6, 'special bread': 7, 'coupe': 8, 'sand jb emmental': 9, 'kouign amann': 10, 'boule 200g': 11, 'boule 400g': 12, 'gal frangipane 6p': 13, 'campagne': 14, 'moisson': 15, 'cafe ou eau': 16, 'brioche': 17, 'cereal baguette': 18, 'seigle': 19, 'complet': 20, 'divers patisserie': 21, 'gal frangipane 4p': 22, 'cookie': 23, 'ficelle': 24, 'pain aux raisins': 25, 'gal pomme 6p': 26, 'gal pomme 4p': 27, 'financier x5': 28, 'vik bread': 29, 'divers viennoiserie': 30, 'gache': 31, 'sandwich complet': 32, 'pain banette': 33, 'grand far breton': 34, 'quim bread': 35, 'special bread kg': 36, 'gd kouign amann': 37, 'boule polka': 38, 'demi baguette': 39, 'chausson aux pommes': 40, 'baguette graine': 41, 'divers confiserie': 42, 'sucette': 43, 'divers boulangerie': 44, 'boisson 33cl': 45, 'pates': 46, 'formule sandwich': 47, 'divers sandwichs': 48, 'croissant amandes': 49, 'pain cho

In [5]:
# Convert the transactions to a one-hot encoded format
one_hot_encoded = pd.DataFrame(0, index=transactions.index, columns=item_to_int.values())
for i, transaction in enumerate(transactions):
    for item in transaction:
        one_hot_encoded.at[transactions.index[i], item_to_int[item]] = 1

print(one_hot_encoded)

               0    1    2    3    4    5    6    7    8    9    ...  139   
ticket_number                                                    ...        
150040           1    1    0    0    0    0    0    0    0    0  ...    0  \
150041           0    1    1    0    0    0    0    0    0    0  ...    0   
150042           0    0    0    1    0    0    0    0    0    0  ...    0   
150043           1    0    0    0    1    0    0    0    0    0  ...    0   
150044           0    0    0    0    0    1    0    0    0    0  ...    0   
...            ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
288908           0    0    0    0    0    0    0    0    0    0  ...    0   
288910           0    0    0    1    0    0    0    0    0    0  ...    0   
288911           0    0    0    1    0    0    0    0    1    0  ...    0   
288912           0    0    0    1    0    0    0    0    0    0  ...    0   
288913           0    0    0    1    0    0    0    0    0    0  ...    0   

In [6]:
# Apply the Apriori algorithm to find frequent itemsets
frequent_itemsets = apriori(one_hot_encoded, min_support=0.03, use_colnames=True)

print(frequent_itemsets)



     support itemsets
0   0.111930      (0)
1   0.077163      (1)
2   0.494940      (3)
3   0.083884      (4)
4   0.110714      (5)
5   0.037977      (7)
6   0.142351      (8)
7   0.036277     (18)
8   0.030634     (47)
9   0.030817   (1, 3)
10  0.039531   (1, 4)
11  0.036108   (3, 4)
12  0.044822   (8, 3)


In [7]:
# Generate association rules from the frequent itemsets
association_rules_df = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.3)

print(association_rules_df)

  antecedents consequents  antecedent support  consequent support   support   
0         (1)         (3)            0.077163            0.494940  0.030817  \
1         (1)         (4)            0.077163            0.083884  0.039531   
2         (4)         (1)            0.083884            0.077163  0.039531   
3         (4)         (3)            0.083884            0.494940  0.036108   
4         (8)         (3)            0.142351            0.494940  0.044822   

   confidence      lift  leverage  conviction  zhangs_metric  
0    0.399373  0.806913 -0.007374    0.840889      -0.205908  
1    0.512299  6.107265  0.033058    1.878440       0.906185  
2    0.471256  6.107265  0.033058    1.745339       0.912832  
3    0.430456  0.869714 -0.005409    0.886780      -0.140539  
4    0.314868  0.636175 -0.025633    0.737173      -0.400054  


In [8]:
# Invert the item_to_int dictionary to create a new dictionary for mapping integer identifiers to item names
int_to_item = {v: k for k, v in item_to_int.items()}

print(int_to_item)

{0: 'baguette', 1: 'pain au chocolat', 2: 'pain', 3: 'traditional baguette', 4: 'croissant', 5: 'banette', 6: 'banettine', 7: 'special bread', 8: 'coupe', 9: 'sand jb emmental', 10: 'kouign amann', 11: 'boule 200g', 12: 'boule 400g', 13: 'gal frangipane 6p', 14: 'campagne', 15: 'moisson', 16: 'cafe ou eau', 17: 'brioche', 18: 'cereal baguette', 19: 'seigle', 20: 'complet', 21: 'divers patisserie', 22: 'gal frangipane 4p', 23: 'cookie', 24: 'ficelle', 25: 'pain aux raisins', 26: 'gal pomme 6p', 27: 'gal pomme 4p', 28: 'financier x5', 29: 'vik bread', 30: 'divers viennoiserie', 31: 'gache', 32: 'sandwich complet', 33: 'pain banette', 34: 'grand far breton', 35: 'quim bread', 36: 'special bread kg', 37: 'gd kouign amann', 38: 'boule polka', 39: 'demi baguette', 40: 'chausson aux pommes', 41: 'baguette graine', 42: 'divers confiserie', 43: 'sucette', 44: 'divers boulangerie', 45: 'boisson 33cl', 46: 'pates', 47: 'formule sandwich', 48: 'divers sandwichs', 49: 'croissant amandes', 50: 'pain

In [9]:
# Convert the integer identifiers in association_rules_df back to item names
association_rules_df['antecedents'] = association_rules_df['antecedents'].apply(lambda x: frozenset([int_to_item[i] for i in x]))
association_rules_df['consequents'] = association_rules_df['consequents'].apply(lambda x: frozenset([int_to_item[i] for i in x]))
association_rules_df = association_rules_df.sort_values(by='support', ascending=False)

In [10]:
# Set pandas options to display complete contents of columns
pd.set_option('display.max_colwidth', None)
pd.set_option('display.expand_frame_repr', False)

# Print the updated association rules with item names
print(association_rules_df)

          antecedents             consequents  antecedent support  consequent support   support  confidence      lift  leverage  conviction  zhangs_metric
4             (coupe)  (traditional baguette)            0.142351            0.494940  0.044822    0.314868  0.636175 -0.025633    0.737173      -0.400054
1  (pain au chocolat)             (croissant)            0.077163            0.083884  0.039531    0.512299  6.107265  0.033058    1.878440       0.906185
2         (croissant)      (pain au chocolat)            0.083884            0.077163  0.039531    0.471256  6.107265  0.033058    1.745339       0.912832
3         (croissant)  (traditional baguette)            0.083884            0.494940  0.036108    0.430456  0.869714 -0.005409    0.886780      -0.140539
0  (pain au chocolat)  (traditional baguette)            0.077163            0.494940  0.030817    0.399373  0.806913 -0.007374    0.840889      -0.205908
