In [3]:
%%capture
! pip install mlxtend

In [5]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# Fetch the Online Retail workbook hosted by the UCI Machine Learning Repository
# and read it into a DataFrame.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx"
online_retail_data = pd.read_excel(io=url)

In [6]:
# Data preprocessing: trim product descriptions, require an invoice number,
# normalise invoice numbers to strings, and discard cancelled invoices.
stripped_descriptions = online_retail_data['Description'].str.strip()
online_retail_data['Description'] = stripped_descriptions

online_retail_data = online_retail_data.dropna(subset=['InvoiceNo'], axis=0)

invoice_numbers = online_retail_data['InvoiceNo'].astype('str')
online_retail_data['InvoiceNo'] = invoice_numbers

# Cancelled invoices: rows whose invoice number contains the letter "C" are removed.
is_cancelled = invoice_numbers.str.contains('C')
online_retail_data = online_retail_data[~is_cancelled]

In [7]:
# Preview 20 random rows; the fixed seed makes the same rows appear on every run.
online_retail_data.sample(20, random_state=42)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
5392,536862,21876,POTTERING MUG,1,2010-12-03 11:13:00,3.36,,United Kingdom
145996,548952,22999,TRAVEL CARD WALLET RETRO PETALS,8,2011-04-05 10:46:00,0.42,15572.0,United Kingdom
457572,575739,23482,PEARLISED IVORY HEART LARGE,1,2011-11-11 09:05:00,3.29,,United Kingdom
131314,547552,20726,LUNCH BAG WOODLAND,2,2011-03-23 15:55:00,4.96,,United Kingdom
254333,559307,23301,GARDENERS KNEELING PAD KEEP CALM,1,2011-07-07 13:02:00,1.65,15356.0,United Kingdom
72321,542239,22430,ENAMEL WATERING CAN CREAM,1,2011-01-26 14:35:00,4.95,17786.0,United Kingdom
294755,562717,23316,RED REFECTORY CLOCK,1,2011-08-08 15:42:00,19.96,,United Kingdom
345533,567168,23300,GARDENERS KNEELING PAD CUP OF TEA,2,2011-09-18 12:24:00,1.65,14878.0,United Kingdom
363716,568574,23328,SET 6 SCHOOL MILK BOTTLES IN CRATE,4,2011-09-28 09:15:00,3.75,13862.0,United Kingdom
303313,563538,22662,LUNCH BAG DOLLY GIRL DESIGN,1,2011-08-17 11:48:00,1.65,17841.0,United Kingdom


In [32]:
# Convert transaction data into a basket format: one row per invoice, one column
# per product description, each cell holding the summed quantity (0 when the
# invoice/description pair does not occur).
uk_orders = online_retail_data[online_retail_data['Country'] == "United Kingdom"]
quantity_per_item = uk_orders.groupby(['InvoiceNo', 'Description'])['Quantity'].sum()
basket = (
    quantity_per_item
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)

# Convert the quantities into 0/1 (0: not in the basket, 1: in the basket)
def encode_units(x):
    """Map a quantity to a 0/1 basket flag.

    Args:
        x: Numeric quantity of one item on one invoice.

    Returns:
        1 when ``x`` is at least 1, otherwise 0.
    """
    # A single two-way branch so every input yields an int, including values
    # strictly between 0 and 1 and NaN (any comparison with NaN is false).
    return 1 if x >= 1 else 0

# One-hot encode the basket: True when at least one unit of the item is on the
# invoice, False otherwise. A vectorised comparison is used instead of an
# element-wise applymap (deprecated in pandas 2.1 and slow on a frame this size),
# and it produces the boolean frame that mlxtend's apriori is designed for.
basket_sets = basket >= 1

# Use the Apriori algorithm to find frequent itemsets
# (itemsets present in at least 3% of the invoices, reported by item name).
frequent_itemsets = apriori(basket_sets, min_support=0.03, use_colnames=True)

# Generate association rules, keeping those whose lift is at least 1
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

In [34]:
# Size of the basket matrix as (number of rows, number of columns).
basket.shape

(18667, 4175)

In [33]:
# Preview the first rows of the basket matrix.
basket.head()

Description,*Boombox Ipod Classic,*USB Office Mirror Ball,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 DAISY PEGS IN WOOD BOX,12 EGG HOUSE PAINTED WOOD,12 HANGING EGGS HAND PAINTED,12 IVORY ROSE PEG PLACE SETTINGS,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,...,wrongly coded 20713,wrongly coded 23343,wrongly coded-23343,wrongly marked,wrongly marked 23343,wrongly marked carton 22804,wrongly marked. 23343 in box,wrongly sold (22719) barcode,wrongly sold as sets,wrongly sold sets
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536367,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# Display the generated association rules together with their metrics.
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED),0.046928,0.049821,0.03016,0.642694,12.900183,0.027822,2.659288
1,(ALARM CLOCK BAKELIKE RED),(ALARM CLOCK BAKELIKE GREEN),0.049821,0.046928,0.03016,0.605376,12.900183,0.027822,2.415142
2,(GREEN REGENCY TEACUP AND SAUCER),(PINK REGENCY TEACUP AND SAUCER),0.050035,0.03766,0.03091,0.617773,16.403939,0.029026,2.517719
3,(PINK REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),0.03766,0.050035,0.03091,0.820768,16.403939,0.029026,5.300203
4,(GREEN REGENCY TEACUP AND SAUCER),(ROSES REGENCY TEACUP AND SAUCER),0.050035,0.051267,0.037553,0.750535,14.639752,0.034988,3.803076
5,(ROSES REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),0.051267,0.050035,0.037553,0.732497,14.639752,0.034988,3.551237
6,(JUMBO BAG RED RETROSPOT),(JUMBO BAG BAROQUE BLACK WHITE),0.10382,0.048749,0.030535,0.294118,6.03329,0.025474,1.347605
7,(JUMBO BAG BAROQUE BLACK WHITE),(JUMBO BAG RED RETROSPOT),0.048749,0.10382,0.030535,0.626374,6.03329,0.025474,2.398601
8,(JUMBO BAG PINK POLKADOT),(JUMBO BAG RED RETROSPOT),0.062088,0.10382,0.042053,0.677308,6.523895,0.035607,2.777201
9,(JUMBO BAG RED RETROSPOT),(JUMBO BAG PINK POLKADOT),0.10382,0.062088,0.042053,0.405057,6.523895,0.035607,1.576473


In [12]:
# Keep only the rules that clear both thresholds: lift of at least 3.0 and
# confidence of at least 0.5.
has_min_lift = rules['lift'] >= 3.0
has_min_confidence = rules['confidence'] >= 0.5
filtered_rules = rules[has_min_lift & has_min_confidence]

In [14]:
# Show the association rules. A bare last expression uses the notebook's rich
# DataFrame rendering instead of the plain-text dump that print() produces.
filtered_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

                            antecedents                        consequents  \
0          (ALARM CLOCK BAKELIKE GREEN)         (ALARM CLOCK BAKELIKE RED)   
1            (ALARM CLOCK BAKELIKE RED)       (ALARM CLOCK BAKELIKE GREEN)   
2     (GREEN REGENCY TEACUP AND SAUCER)   (PINK REGENCY TEACUP AND SAUCER)   
3      (PINK REGENCY TEACUP AND SAUCER)  (GREEN REGENCY TEACUP AND SAUCER)   
4     (GREEN REGENCY TEACUP AND SAUCER)  (ROSES REGENCY TEACUP AND SAUCER)   
5     (ROSES REGENCY TEACUP AND SAUCER)  (GREEN REGENCY TEACUP AND SAUCER)   
7      (JUMBO  BAG BAROQUE BLACK WHITE)          (JUMBO BAG RED RETROSPOT)   
8             (JUMBO BAG PINK POLKADOT)          (JUMBO BAG RED RETROSPOT)   
10  (JUMBO SHOPPER VINTAGE RED PAISLEY)          (JUMBO BAG RED RETROSPOT)   
12             (JUMBO STORAGE BAG SUKI)          (JUMBO BAG RED RETROSPOT)   

     support  confidence       lift  
0   0.030160    0.642694  12.900183  
1   0.030160    0.605376  12.900183  
2   0.030910    0.617773  1