In [1]:
import pandas as pd
import numpy as np

from mlxtend.frequent_patterns import apriori as apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import fpgrowth

## Load the Dataset

In [2]:
# Load the raw transaction table from a CSV located in the working directory.
# Presumably one row per transaction and one boolean column per product
# (the `info()` summary below reports bool columns) — confirm against the data source.
basket = pd.read_csv("basket.csv")
# Display the column labels, i.e. the product names.
basket.columns

Index(['9 de oro', 'agua', 'agua tonica', 'alfajor block',
       'alfajor de maicena', 'alfajor santafesino', 'alka', 'aquarius',
       'baggio', 'bandeja de miga', 'barrita de cereales', 'belden',
       'bizcochos', 'block', 'brahma', 'café', 'campeon', 'caramelos',
       'cepita', 'cereales', 'chester', 'chupetin', 'coca', 'cofler',
       'conitos', 'corona', 'doble cola', 'doctor lemon', 'don satur',
       'doritos', 'dos corazones', 'encendedor', 'facturas', 'fanta',
       'fantoche', 'fernet', 'fresh', 'gancia', 'gatorade', 'gin', 'gomitas',
       'guaymayen', 'heineken', 'iguana', 'jorgito', 'kit kat', 'lata coca',
       'lays', 'levite', 'lincon', 'liverpool', 'lucky', 'malboro',
       'matecocido', 'mentoplus', 'mini oreo', 'mogul', 'monster', 'opera',
       'oreo', 'pan', 'paseo', 'pañuelos', 'pebete', 'petaca', 'phillip',
       'pier', 'pipas', 'pitusas', 'polvorita', 'postre', 'powerade', 'pritty',
       'pronto', 'quilmes', 'rhodesia', 'saladix', 'sanguche miga

## Brief Dataset Analysis

In [3]:
# Print the row count plus per-column dtype and non-null counts (quick check for missing values).
basket.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1032 entries, 0 to 1031
Data columns (total 93 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   9 de oro             1032 non-null   bool 
 1   agua                 1032 non-null   bool 
 2   agua tonica          1032 non-null   bool 
 3   alfajor block        1032 non-null   bool 
 4   alfajor de maicena   1032 non-null   bool 
 5   alfajor santafesino  1032 non-null   bool 
 6   alka                 1032 non-null   bool 
 7   aquarius             1032 non-null   bool 
 8   baggio               1032 non-null   bool 
 9   bandeja de miga      1032 non-null   bool 
 10  barrita de cereales  1032 non-null   bool 
 11  belden               1032 non-null   bool 
 12  bizcochos            1032 non-null   bool 
 13  block                1032 non-null   bool 
 14  brahma               1032 non-null   bool 
 15  café                 1032 non-null   bool 
 16  campeon              103

## Association Rule Mining


"I will be using the association rule mining algorithms `apriori` and `fpgrowth` from the `mlxtend` library. These are two of the most widely used algorithms for mining frequent itemsets, from which the association rules are then derived.

Important metrics to remember:
- **Support**: The fraction of transactions in the dataset that contain the itemset/rule. It is calculated as:

  $SP(A \cup B) = \frac{|\{t \in U : A \cup B \subseteq t\}|}{|U|}$

  Where $A$ is the antecedent and $B$ is the consequent of the rule, and $U$ is the set of all transactions in the dataset.

- **Confidence**: The fraction of transactions containing the antecedent that also contain the consequent. It is calculated as:

  $CON(A \Rightarrow B) = \frac{SP(A \cup B)}{SP(A)}$

- **Lift**: It compares the observed frequency of a rule with the frequency expected by chance (i.e., if the antecedent and the consequent were independent). A lift above 1 indicates a positive association. It is calculated as:

  $LF(A \Rightarrow B) = \frac{SP(A \cup B)}{SP(A) \times SP(B)}$"


### Calculating the Most Frequent Itemsets


In [4]:
# Minimum support threshold (0.1% of the transactions), shared by both miners.
mins = 0.001

# Mine the frequent itemsets with both algorithms so their results can be compared.
freq_items_apri = apriori(
    basket,
    min_support=mins,
    use_colnames=True,
)
freq_items_fp = fpgrowth(
    basket,
    min_support=mins,
    use_colnames=True,
)

In [5]:
# (rows, columns) of the Apriori result; each row is one frequent itemset.
freq_items_apri.shape

(142, 2)

In [6]:
# (rows, columns) of the FP-Growth result; should match the Apriori shape above.
freq_items_fp.shape

(142, 2)

Both `apriori` and `fpgrowth` found the same 142 frequent itemsets with a minimum support of 0.1%.


### Association Rule

#### Apriori

In [7]:
# Derive association rules from the Apriori itemsets, keeping those with confidence >= 0.5.
rules_apri = association_rules(
    freq_items_apri,
    metric="confidence",
    min_threshold=0.5,
)
# Number of items on the left-hand side of each rule.
rules_apri["antecedent_len"] = rules_apri["antecedents"].apply(len)
# Display the rules, highest lift first.
rules_apri.sort_values("lift", ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric,antecedent_len
3,(matecocido),(facturas),0.005814,0.017442,0.002907,0.5,28.666667,0.002806,1.965116,0.97076,1
0,(pañuelos),(belden),0.002907,0.055233,0.001938,0.666667,12.070175,0.001777,2.834302,0.919825,1
5,"(phillip, sanguche milanesa)",(coca),0.001938,0.09593,0.001938,1.0,10.424242,0.001752,inf,0.905825,2
6,"(sanguche milanesa, coca)",(phillip),0.002907,0.074612,0.001938,0.666667,8.935065,0.001721,2.776163,0.890671,2
1,(conitos),(caramelos),0.002907,0.07655,0.001938,0.666667,8.708861,0.001715,2.770349,0.887755,1
2,(lays),(coca),0.007752,0.09593,0.003876,0.5,5.212121,0.003132,1.80814,0.814453,1
4,"(phillip, guaymayen)",(coca),0.003876,0.09593,0.001938,0.5,5.212121,0.001566,1.80814,0.811284,2


#### Fpgrowth

In [8]:
# Derive association rules from the FP-Growth itemsets, keeping those with confidence >= 0.7.
# NOTE(review): the Apriori cell above uses min_threshold=0.5, so the two rule tables
# are not directly comparable — confirm whether 0.7 is intentional.
rules_fp = association_rules(
    freq_items_fp,
    metric="confidence",
    min_threshold=0.7,
)
# Number of items on the left-hand side of each rule.
rules_fp["antecedent_len"] = rules_fp["antecedents"].apply(len)
# Display the rules, highest lift first.
rules_fp.sort_values("lift", ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric,antecedent_len
0,"(phillip, sanguche milanesa)",(coca),0.001938,0.09593,0.001938,1.0,10.424242,0.001752,inf,0.905825,2


### Conclusion

Both algorithms found rules that are not very useful, due to the significant noise in the data caused by items that were sold infrequently. Additionally, the items "caramelos" and "alka" are not usually bought; they are given as change in a purchase. Therefore, I have decided to perform association rule mining again after removing the noise from my data.


## Association Rule Mining with cleaned dataset

In [9]:
# Load the pre-cleaned transaction table from a CSV in the working directory.
# NOTE(review): the cleaning step itself is not part of this notebook as far as
# is visible here — record how `basket_clean.csv` was produced.
basket_clean = pd.read_csv("basket_clean.csv")
# (rows, columns) of the cleaned table.
basket_clean.shape

(794, 22)

As you can see, after removing the noise from my transactions, the number of transactions dropped from 1032 to 794 (and the number of item columns from 93 to 22).


### Calculating the Most Frequent Itemsets

In [10]:
# Minimum support threshold (0.1% of the transactions), same value as for the raw data.
# NOTE(review): with 794 rows, 1/794 ≈ 0.00126 already exceeds this threshold, so an
# itemset seen in a single transaction counts as "frequent" here, whereas on the
# 1032-row raw data at least two transactions were needed — confirm this is intended.
mins = 0.001

# Mine the frequent itemsets with both algorithms, capping itemsets at 3 items.
# These assignments rebind the names that previously held the raw-data itemsets.
freq_items_apri = apriori(
    basket_clean,
    min_support=mins,
    use_colnames=True,
    max_len=3,
)
freq_items_fp = fpgrowth(
    basket_clean,
    min_support=mins,
    use_colnames=True,
    max_len=3,
)

In [11]:
# (rows, columns) of the Apriori result on the cleaned data; each row is one frequent itemset.
freq_items_apri.shape

(121, 2)

In [12]:
# (rows, columns) of the FP-Growth result on the cleaned data; should match the Apriori shape above.
freq_items_fp.shape

(121, 2)

Both `apriori` and `fpgrowth` found the same 121 frequent itemsets with a minimum support of 0.1%.

### Association Rule

#### Apriori

In [13]:
# Derive association rules from the cleaned-data Apriori itemsets (confidence >= 0.5).
# This rebinds `rules_apri`, replacing the rules computed from the raw data.
rules_apri = association_rules(
    freq_items_apri,
    metric="confidence",
    min_threshold=0.5,
)
# Number of items on the left-hand side of each rule.
rules_apri["antecedent_len"] = rules_apri["antecedents"].apply(len)
# Display the rules, highest lift first.
rules_apri.sort_values("lift", ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric,antecedent_len
17,"(pier, levite)",(guaymayen),0.001259,0.027708,0.001259,1.0,36.090909,0.001225,inf,0.973518,2
22,"(quilmes, levite)",(sanguche milanesa),0.001259,0.032746,0.001259,1.0,30.538462,0.001218,inf,0.968474,2
20,"(sanguche milanesa, quilmes)",(levite),0.002519,0.027708,0.001259,0.5,18.045455,0.00119,1.944584,0.94697,2
18,"(phillip, sanguche milanesa)",(guaymayen),0.002519,0.027708,0.001259,0.5,18.045455,0.00119,1.944584,0.94697,2
15,"(guaymayen, pier)",(levite),0.002519,0.027708,0.001259,0.5,18.045455,0.00119,1.944584,0.94697,2
21,"(sanguche milanesa, levite)",(quilmes),0.001259,0.060453,0.001259,1.0,16.541667,0.001183,inf,0.940731,2
3,"(encendedor, lucky)",(belden),0.001259,0.071788,0.001259,1.0,13.929825,0.001169,inf,0.929382,2
5,"(encendedor, malboro)",(belden),0.001259,0.071788,0.001259,1.0,13.929825,0.001169,inf,0.929382,2
6,"(lucky, malboro)",(belden),0.001259,0.071788,0.001259,1.0,13.929825,0.001169,inf,0.929382,2
4,"(encendedor, belden)",(malboro),0.002519,0.039043,0.001259,0.5,12.806452,0.001161,1.921914,0.924242,2


#### Fpgrowth

In [14]:
# Derive association rules from the cleaned-data FP-Growth itemsets (confidence >= 0.5).
# This rebinds `rules_fp`, replacing the rules computed from the raw data.
rules_fp = association_rules(
    freq_items_fp,
    metric="confidence",
    min_threshold=0.5,
)
# Number of items on the left-hand side of each rule.
rules_fp["antecedent_len"] = rules_fp["antecedents"].apply(len)
# Display the rules, highest lift first.
rules_fp.sort_values("lift", ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric,antecedent_len
19,"(pier, levite)",(guaymayen),0.001259,0.027708,0.001259,1.0,36.090909,0.001225,inf,0.973518,2
16,"(quilmes, levite)",(sanguche milanesa),0.001259,0.032746,0.001259,1.0,30.538462,0.001218,inf,0.968474,2
14,"(sanguche milanesa, quilmes)",(levite),0.002519,0.027708,0.001259,0.5,18.045455,0.00119,1.944584,0.94697,2
3,"(phillip, sanguche milanesa)",(guaymayen),0.002519,0.027708,0.001259,0.5,18.045455,0.00119,1.944584,0.94697,2
17,"(guaymayen, pier)",(levite),0.002519,0.027708,0.001259,0.5,18.045455,0.00119,1.944584,0.94697,2
15,"(sanguche milanesa, levite)",(quilmes),0.001259,0.060453,0.001259,1.0,16.541667,0.001183,inf,0.940731,2
10,"(encendedor, malboro)",(belden),0.001259,0.071788,0.001259,1.0,13.929825,0.001169,inf,0.929382,2
12,"(encendedor, lucky)",(belden),0.001259,0.071788,0.001259,1.0,13.929825,0.001169,inf,0.929382,2
13,"(lucky, malboro)",(belden),0.001259,0.071788,0.001259,1.0,13.929825,0.001169,inf,0.929382,2
9,"(encendedor, belden)",(malboro),0.002519,0.039043,0.001259,0.5,12.806452,0.001161,1.921914,0.924242,2


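In this case, fewer frequent itemsets were obtained (121 instead of 142), but in my opinion, the resulting rules have a lower level of noise compared to the ones obtained previously. Note, however, that the rules with a support of 0.001259 correspond to a single transaction out of 794, so they should be interpreted with caution.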


## Interesting Rules
