In [42]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [43]:
# Load the retail transaction export into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; adjust it before re-running elsewhere.
data = pd.read_csv(r'e:\documents\online_retail.csv', sep=',')

In [44]:
# Preview the first five rows to check that the columns were parsed as expected.
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/10 08:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/10 08:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/10 08:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/10 08:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/10 08:26,3.39,17850.0,United Kingdom


In [45]:
# (rows, columns) of the frame as loaded.
data.shape

(541909, 8)

In [46]:
# Column dtypes, non-null counts and memory usage; highlights which columns have gaps.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
InvoiceNo      541909 non-null object
StockCode      541909 non-null object
Description    540455 non-null object
Quantity       541909 non-null int64
InvoiceDate    541909 non-null object
UnitPrice      541909 non-null float64
CustomerID     406829 non-null float64
Country        541909 non-null object
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


# Preprocessing

In [47]:
# Count the missing values in each column.
# The DataFrame's own .sum() always reduces per column; np.sum(DataFrame) forwards
# axis=None, which recent pandas deprecates in favour of a whole-frame scalar.
data.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [48]:
# 'Description' is a key variable and it contains missing values.
# Missing descriptions are handled listwise: the whole transaction row is dropped.
# Invoice numbers are cast to text so rows whose number contains 'C' can be filtered out
# (presumably cancellation/credit invoices -- TODO confirm against the data dictionary).
data = (
    data
    .assign(
        Description=lambda frame: frame['Description'].str.strip(),
        InvoiceNo=lambda frame: frame['InvoiceNo'].astype('str'),
    )
    .dropna(axis=0, subset=['Description'])
    .loc[lambda frame: ~frame['InvoiceNo'].str.contains('C')]
)

In [49]:
# Re-count the missing values per column after cleaning.
# The DataFrame's own .sum() always reduces per column; np.sum(DataFrame) forwards
# axis=None, which recent pandas deprecates in favour of a whole-frame scalar.
data.isnull().sum()

InvoiceNo           0
StockCode           0
Description         0
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     133243
Country             0
dtype: int64

In [50]:
# Total number of missing cells in the whole frame (per-column counts summed again).
data.isnull().sum().sum()

133243

# Association Rules

In [51]:
# Reshape the data for association-rule mining: one row per invoice, one column per
# item description, each cell holding the summed quantity (0 where the item is absent).
# unstack() already leaves 'InvoiceNo' as the index, so no reset_index()/set_index()
# round trip is needed.
data_basket = (
    data
    .groupby(['InvoiceNo', 'Description'])['Quantity']
    .sum()
    .unstack()
    .fillna(0)
)
data_basket.head()

Description,*Boombox Ipod Classic,*USB Office Mirror Ball,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 DAISY PEGS IN WOOD BOX,12 EGG HOUSE PAINTED WOOD,12 HANGING EGGS HAND PAINTED,12 IVORY ROSE PEG PLACE SETTINGS,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,...,wrongly coded 20713,wrongly coded 23343,wrongly coded-23343,wrongly marked,wrongly marked 23343,wrongly marked carton 22804,wrongly marked. 23343 in box,wrongly sold (22719) barcode,wrongly sold as sets,wrongly sold sets
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536367,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
# If an item was not bought in a transaction (quantity of 0 or less) the cell becomes 0.
# If one or more units of the item were bought the cell becomes 1.
def encode_units(x):
    """Binarize a quantity into a purchased / not-purchased flag.

    Args:
        x: Numeric quantity of one item within one invoice.

    Returns:
        1 when ``x`` is at least 1, otherwise 0.
    """
    # A single comparison guarantees an int for every numeric input: fractional
    # values in (0, 1) and NaN compare False and therefore map to 0 instead of
    # falling through and yielding None.
    return 1 if x >= 1 else 0
# Encode every cell of the basket matrix, column by column.
# Series.map is used because DataFrame.applymap is deprecated in recent pandas
# releases (since 2.1); this form runs unchanged on both old and new versions.
basketset = data_basket.apply(lambda column: column.map(encode_units))
basketset.head()

Description,*Boombox Ipod Classic,*USB Office Mirror Ball,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 DAISY PEGS IN WOOD BOX,12 EGG HOUSE PAINTED WOOD,12 HANGING EGGS HAND PAINTED,12 IVORY ROSE PEG PLACE SETTINGS,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,...,wrongly coded 20713,wrongly coded 23343,wrongly coded-23343,wrongly marked,wrongly marked 23343,wrongly marked carton 22804,wrongly marked. 23343 in box,wrongly sold (22719) barcode,wrongly sold as sets,wrongly sold sets
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536365,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536366,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536367,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536368,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536369,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
# Compute the support of each itemset, i.e. the share of all transactions that contain it,
# keeping only the itemsets whose support is at least 0.02.
frequent_itemset = apriori(
    basketset,
    min_support=0.02,
    use_colnames=True,
)
frequent_itemset.tail()

Unnamed: 0,support,itemsets
348,0.024551,"(RED RETROSPOT CHARLOTTE BAG, WOODLAND CHARLOT..."
349,0.025473,"(REGENCY CAKESTAND 3 TIER, ROSES REGENCY TEACU..."
350,0.026152,"(WOODEN PICTURE FRAME WHITE FINISH, WOODEN FRA..."
351,0.026298,"(GREEN REGENCY TEACUP AND SAUCER, PINK REGENCY..."
352,0.020039,"(JUMBO STORAGE BAG SUKI, JUMBO BAG PINK POLKAD..."


In [54]:
# Derive confidence and lift for the itemsets whose support is at least 0.02.
# Confidence: probability that the 'consequents' are bought given that the 'antecedents' were bought.
# Lift: confidence divided by the support of the 'consequents'; a value above 1 means the
# items are bought together more often than they would be if they were independent.
rules = association_rules(
    frequent_itemset,
    metric="lift",
    min_threshold=1,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(60 TEATIME FAIRY CAKE CASES),(PACK OF 72 RETROSPOT CAKE CASES),0.040175,0.064047,0.021980,0.547101,8.542243,0.019407,2.066585
1,(PACK OF 72 RETROSPOT CAKE CASES),(60 TEATIME FAIRY CAKE CASES),0.064047,0.040175,0.021980,0.343182,8.542243,0.019407,1.461326
2,(ALARM CLOCK BAKELIKE PINK),(ALARM CLOCK BAKELIKE GREEN),0.037991,0.047550,0.020281,0.533844,11.227070,0.018475,2.043202
3,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE PINK),0.047550,0.037991,0.020281,0.426531,11.227070,0.018475,1.677524
4,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED),0.047550,0.050995,0.031053,0.653061,12.806462,0.028628,2.735368
5,(ALARM CLOCK BAKELIKE RED),(ALARM CLOCK BAKELIKE GREEN),0.050995,0.047550,0.031053,0.608944,12.806462,0.028628,2.435585
6,(ALARM CLOCK BAKELIKE PINK),(ALARM CLOCK BAKELIKE RED),0.037991,0.050995,0.022804,0.600255,11.770946,0.020867,2.374029
7,(ALARM CLOCK BAKELIKE RED),(ALARM CLOCK BAKELIKE PINK),0.050995,0.037991,0.022804,0.447193,11.770946,0.020867,1.740226
8,(RED RETROSPOT CHARLOTTE BAG),(CHARLOTTE BAG PINK POLKADOT),0.050170,0.036050,0.025328,0.504836,14.003582,0.023519,1.946726
9,(CHARLOTTE BAG PINK POLKADOT),(RED RETROSPOT CHARLOTTE BAG),0.036050,0.050170,0.025328,0.702557,14.003582,0.023519,3.193320


In [55]:
# Keep only the rules with a lift of at least 2 and a confidence of at least 0.5.
# The remaining rules show how strongly buying one item tends to go with buying another.
dataa = rules.loc[rules['lift'].ge(2) & rules['confidence'].ge(0.5)]
dataa

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(60 TEATIME FAIRY CAKE CASES),(PACK OF 72 RETROSPOT CAKE CASES),0.040175,0.064047,0.02198,0.547101,8.542243,0.019407,2.066585
2,(ALARM CLOCK BAKELIKE PINK),(ALARM CLOCK BAKELIKE GREEN),0.037991,0.04755,0.020281,0.533844,11.22707,0.018475,2.043202
4,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED),0.04755,0.050995,0.031053,0.653061,12.806462,0.028628,2.735368
5,(ALARM CLOCK BAKELIKE RED),(ALARM CLOCK BAKELIKE GREEN),0.050995,0.04755,0.031053,0.608944,12.806462,0.028628,2.435585
6,(ALARM CLOCK BAKELIKE PINK),(ALARM CLOCK BAKELIKE RED),0.037991,0.050995,0.022804,0.600255,11.770946,0.020867,2.374029
8,(RED RETROSPOT CHARLOTTE BAG),(CHARLOTTE BAG PINK POLKADOT),0.05017,0.03605,0.025328,0.504836,14.003582,0.023519,1.946726
9,(CHARLOTTE BAG PINK POLKADOT),(RED RETROSPOT CHARLOTTE BAG),0.03605,0.05017,0.025328,0.702557,14.003582,0.023519,3.19332
11,(CHARLOTTE BAG SUKI DESIGN),(RED RETROSPOT CHARLOTTE BAG),0.042795,0.05017,0.0246,0.57483,11.457684,0.022453,2.234001
12,(STRAWBERRY CHARLOTTE BAG),(CHARLOTTE BAG SUKI DESIGN),0.035032,0.042795,0.020136,0.574792,13.43137,0.018637,2.251147
14,(WOODLAND CHARLOTTE BAG),(CHARLOTTE BAG SUKI DESIGN),0.040514,0.042795,0.022125,0.546108,12.76109,0.020391,2.108882
