# IIC-2433 Minería de Datos UC

- Versiones de librerías, python 3.8.10

- sklearn 1.6.1
- mlxtend 0.23.4


### Vamos a usar la librería mlxted

!pip3 install mlxtend

In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules

### Vamos a descargar un dataset desde Kaggle

!pip3 install kagglehub

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("shazadudwadia/supermarket")

print("Path to dataset files:", path)

Path to dataset files: /home/marcelo/.cache/kagglehub/datasets/shazadudwadia/supermarket/versions/1


In [3]:
foo = path + "/GroceryStoreDataSet.csv"

print(foo)

/home/marcelo/.cache/kagglehub/datasets/shazadudwadia/supermarket/versions/1/GroceryStoreDataSet.csv


In [4]:
df = pd.read_csv(foo, names = ['products'], sep = ',')
data = list(df["products"].apply(lambda x:x.split(",") ))
data

[['MILK', 'BREAD', 'BISCUIT'],
 ['BREAD', 'MILK', 'BISCUIT', 'CORNFLAKES'],
 ['BREAD', 'TEA', 'BOURNVITA'],
 ['JAM', 'MAGGI', 'BREAD', 'MILK'],
 ['MAGGI', 'TEA', 'BISCUIT'],
 ['BREAD', 'TEA', 'BOURNVITA'],
 ['MAGGI', 'TEA', 'CORNFLAKES'],
 ['MAGGI', 'BREAD', 'TEA', 'BISCUIT'],
 ['JAM', 'MAGGI', 'BREAD', 'TEA'],
 ['BREAD', 'MILK'],
 ['COFFEE', 'COCK', 'BISCUIT', 'CORNFLAKES'],
 ['COFFEE', 'COCK', 'BISCUIT', 'CORNFLAKES'],
 ['COFFEE', 'SUGER', 'BOURNVITA'],
 ['BREAD', 'COFFEE', 'COCK'],
 ['BREAD', 'SUGER', 'BISCUIT'],
 ['COFFEE', 'SUGER', 'CORNFLAKES'],
 ['BREAD', 'SUGER', 'BOURNVITA'],
 ['BREAD', 'COFFEE', 'SUGER'],
 ['BREAD', 'COFFEE', 'SUGER'],
 ['TEA', 'MILK', 'COFFEE', 'CORNFLAKES']]

### Vamos a usar one hot encoding para codificar las transacciones en vectores

In [5]:
from mlxtend.preprocessing import TransactionEncoder

a = TransactionEncoder()
a_data = a.fit(data).transform(data)
df = pd.DataFrame(a_data,columns=a.columns_)
df = df.replace(False,0)
df = df.replace(True,1)
df

  df = df.replace(True,1)


Unnamed: 0,BISCUIT,BOURNVITA,BREAD,COCK,COFFEE,CORNFLAKES,JAM,MAGGI,MILK,SUGER,TEA
0,1,0,1,0,0,0,0,0,1,0,0
1,1,0,1,0,0,1,0,0,1,0,0
2,0,1,1,0,0,0,0,0,0,0,1
3,0,0,1,0,0,0,1,1,1,0,0
4,1,0,0,0,0,0,0,1,0,0,1
5,0,1,1,0,0,0,0,0,0,0,1
6,0,0,0,0,0,1,0,1,0,0,1
7,1,0,1,0,0,0,0,1,0,0,1
8,0,0,1,0,0,0,1,1,0,0,1
9,0,0,1,0,0,0,0,0,1,0,0


### Ahora vamos a aplicar el algoritmo apriori con MinSupp = 0.2 para encontrar itemsets frecuentes

In [6]:
df = apriori(df, min_support = 0.2, use_colnames = True, verbose = 1)
df

Processing 72 combinations | Sampling itemset size 2Processing 42 combinations | Sampling itemset size 3




Unnamed: 0,support,itemsets
0,0.35,(BISCUIT)
1,0.2,(BOURNVITA)
2,0.65,(BREAD)
3,0.4,(COFFEE)
4,0.3,(CORNFLAKES)
5,0.25,(MAGGI)
6,0.25,(MILK)
7,0.3,(SUGER)
8,0.35,(TEA)
9,0.2,"(BREAD, BISCUIT)"


### Y fijamos MinConf para buscar las reglas a partir de estos itemsets frecuentes

In [7]:
df_ar = association_rules(df, metric = "confidence", min_threshold = 0.6)
df_ar[['antecedents','consequents','support','confidence']]

Unnamed: 0,antecedents,consequents,support,confidence
0,(MILK),(BREAD),0.2,0.8
1,(SUGER),(BREAD),0.2,0.666667
2,(CORNFLAKES),(COFFEE),0.2,0.666667
3,(SUGER),(COFFEE),0.2,0.666667
4,(MAGGI),(TEA),0.2,0.8
