In [None]:
import pandas as pd

In [None]:
df = pd.read_excel('http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx')

In [None]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


El conjunto de datos no se puede utilizar tal como esta. Por ejemplo, las descripciones contienen espacios que ha de ser eliminados. Las filas que no tienen número de facturas no se pueden utilizar ya que no se identifica la transacción.

In [None]:
df['Description'] = df['Description'].str.strip()
df.dropna(axis=0, subset=['InvoiceNo'], inplace=True)
df['InvoiceNo'] = df['InvoiceNo'].astype('str')

Para acelerar los cálculos se van a utilizar únicamente los datos correspondientes a España. La implementación del método apriori requiere que los datos se encuentren en un dataframe. Cada una de las filas representa una factura y cada una de las columnas un producto. Esto se puede conseguir utilizando el siguiente código:

In [None]:
basket = (df[df['Country']=="Spain"]
          .groupby(['InvoiceNo', 'Description'])['Quantity']
          .sum().unstack().reset_index().fillna(0)
          .set_index('InvoiceNo'))

In [None]:
def encode_units (x): 
    if x <= 0: 
        return 0 
    if x >= 1: 
        return 1 
basket_sets = basket.applymap (encode_units) 
basket_sets.drop ('POSTAGE', inplace = True, axis = 1) 
basket_sets

Description,10 COLOUR SPACEBOY PEN,12 EGG HOUSE PAINTED WOOD,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,20 DOLLY PEGS RETROSPOT,200 RED + WHITE BENDY STRAWS,3 DRAWER ANTIQUE WHITE WOOD CABINET,3 HOOK HANGER MAGIC GARDEN,3 HOOK PHOTO SHELF ANTIQUE WHITE,3 PIECE SPACEBOY COOKIE CUTTER SET,...,WRAP RED APPLES,WRAP RED VINTAGE DOILY,WRAP SUKI AND FRIENDS,"WRAP, BILLBOARD FONTS DESIGN",YELLOW COAT RACK PARIS FASHION,YELLOW FLOWERS FELT HANDBAG KIT,ZINC FOLKART SLEIGH BELLS,ZINC METAL HEART DECORATION,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS SMALL
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536944,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
538095,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
538525,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
539351,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
540469,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C569422,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C571499,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C574894,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C575531,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Puede observase que se ha eliminado la columna ` POSTAGE` ya que prácticamente es un ítem que aparece en todas las facturas.

## Ahora se pude utilizar la función `apriori` para obtener los productos con un mínimo de soporte.

In [None]:
pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.17.3-py2.py3-none-any.whl (1.3 MB)
Installing collected packages: mlxtend
Successfully installed mlxtend-0.17.3



In [None]:
from mlxtend.frequent_patterns import apriori

In [None]:
from mlxtend.frequent_patterns import association_rules

In [None]:
common_itemsets = apriori (basket_sets, min_support = 0.02, use_colnames = True) 
rules = association_rules (common_itemsets, metric = "lift", min_threshold = 1) 
rules.head ()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(SET OF 3 HEART COOKIE CUTTERS),(3 TRADITIONAl BISCUIT CUTTERS SET),0.066667,0.028571,0.028571,0.428571,15.0,0.026667,1.7
1,(3 TRADITIONAl BISCUIT CUTTERS SET),(SET OF 3 HEART COOKIE CUTTERS),0.028571,0.066667,0.028571,1.0,15.0,0.026667,inf
2,(36 FOIL HEART CAKE CASES),(JAM MAKING SET WITH JARS),0.028571,0.133333,0.028571,1.0,7.5,0.024762,inf
3,(JAM MAKING SET WITH JARS),(36 FOIL HEART CAKE CASES),0.133333,0.028571,0.028571,0.214286,7.5,0.024762,1.236364
4,(36 FOIL HEART CAKE CASES),(RETROSPOT TEA SET CERAMIC 11 PC),0.028571,0.047619,0.028571,1.0,21.0,0.027211,inf


In [None]:
rules[ (rules['support'] >= 0.05) &
      (rules['confidence'] >= 0.5) ]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
28,(6 RIBBONS RUSTIC CHARM),(ASSORTED COLOUR BIRD ORNAMENT),0.142857,0.114286,0.085714,0.6,5.25,0.069388,2.214286
29,(ASSORTED COLOUR BIRD ORNAMENT),(6 RIBBONS RUSTIC CHARM),0.114286,0.142857,0.085714,0.75,5.25,0.069388,3.428571
78,(PACK OF 72 RETROSPOT CAKE CASES),(6 RIBBONS RUSTIC CHARM),0.104762,0.142857,0.057143,0.545455,3.818182,0.042177,1.885714
89,(PLASTERS IN TIN SKULLS),(6 RIBBONS RUSTIC CHARM),0.095238,0.142857,0.057143,0.6,4.2,0.043537,2.142857
99,(RED RETROSPOT TAPE),(6 RIBBONS RUSTIC CHARM),0.066667,0.142857,0.057143,0.857143,6.0,0.047619,6.0
182,(PACK OF 72 RETROSPOT CAKE CASES),(ASSORTED COLOUR BIRD ORNAMENT),0.104762,0.114286,0.057143,0.545455,4.772727,0.04517,1.948571
183,(ASSORTED COLOUR BIRD ORNAMENT),(PACK OF 72 RETROSPOT CAKE CASES),0.114286,0.104762,0.057143,0.5,4.772727,0.04517,1.790476
190,(ASSORTED COLOUR BIRD ORNAMENT),(PLASTERS IN TIN SKULLS),0.114286,0.095238,0.057143,0.5,5.25,0.046259,1.809524
191,(PLASTERS IN TIN SKULLS),(ASSORTED COLOUR BIRD ORNAMENT),0.095238,0.114286,0.057143,0.6,5.25,0.046259,2.214286
524,(DOLLY GIRL LUNCH BOX),(SPACEBOY LUNCH BOX),0.085714,0.114286,0.057143,0.666667,5.833333,0.047347,2.657143


## Buen Trabajo