In [2]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder
from numpy import ndarray

# 載入資料

In [3]:
# Read the retail workbook and keep only the orders placed from France;
# the smaller data set makes associations easier to find.
df = pd.read_excel('Online_Retail001.xlsx').loc[
    lambda retail: retail["Country"] == "France"
]

df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536370,22726,ALARM CLOCK BAKELIKE GREEN,12,2010-12-01 08:45:00,3.75,12583.0,France
131,536974,22748,POPPY'S PLAYHOUSE KITCHEN,6,2010-12-03 13:59:00,2.10,12682.0,France
150,537065,21982,PACK OF 12 SUKI TISSUES,24,2010-12-05 11:57:00,0.29,12567.0,France
151,537065,22892,SET OF SALT AND PEPPER TOADSTOOLS,12,2010-12-05 11:57:00,1.25,12567.0,France
269,537463,22312,OFFICE MUG WARMER POLKADOT,24,2010-12-07 10:08:00,2.95,12681.0,France
...,...,...,...,...,...,...,...,...
9781,580736,21706,FOLDING UMBRELLA RED/WHITE POLKADOT,4,2011-12-06 08:55:00,4.95,12716.0,France
9782,580736,85114C,RED ENCHANTED FOREST PLACEMAT,6,2011-12-06 08:55:00,1.65,12716.0,France
9787,580753,23163,REGENCY SUGAR TONGS,8,2011-12-06 10:00:00,2.49,12682.0,France
9877,581171,21889,WOODEN BOX OF DOMINOES,12,2011-12-07 15:02:00,1.25,12615.0,France


# 以訂單為單位，進行各筆產品資料彙總

挑出某筆訂單的產品編號。

In [4]:
# Stock codes (as strings) of every row that belongs to invoice 536370.
is_invoice_536370 = df['InvoiceNo'] == 536370
df.loc[is_invoice_536370, 'StockCode'].values.astype(str).tolist()

['22726']

以下為把每一筆訂單的產品編號（`StockCode`）彙整成清單，存至 `records`。

In [5]:
def _stock_codes_as_strings(order_lines):
    """Return the StockCode values of one invoice's rows as a list of strings."""
    return order_lines['StockCode'].values.astype(str).tolist()


# Build one basket per invoice: a Series indexed by InvoiceNo whose values are
# the lists of stock codes on that invoice.
# NOTE(review): invoice numbers starting with "C" (e.g. C570515) look like
# cancellations - confirm whether they should be treated as baskets.
orders_by_invoice = df.set_index("InvoiceNo").groupby(["InvoiceNo"])
records = orders_by_invoice.apply(_stock_codes_as_strings)

records.to_frame()

Unnamed: 0_level_0,0
InvoiceNo,Unnamed: 1_level_1
536370,[22726]
536974,[22748]
537065,"[21982, 22892]"
537463,"[22312, 22894, 22181]"
538008,[22894]
...,...
580753,[23163]
581171,[21889]
581587,[23254]
C570515,[90201C]


# [問題] 使用 TransactionEncoder 將 records 轉換成布林值的 one_hot_encoder。

In [6]:
# One-hot encode the baskets: one row per invoice in `records` and one boolean
# column per distinct stock code. `ndarray` comes from the notebook's top
# import cell, so every dependency is declared in one place.
te = TransactionEncoder()
te_ary = te.fit(records).transform(records)

# Guard: the DataFrame below is built on the assumption that the encoder
# returned a dense NumPy array.
assert isinstance(te_ary, ndarray)
df_trans = pd.DataFrame(te_ary, columns=te.columns_)
df_trans

Unnamed: 0,15058C,20674,20724,20726,20727,20750,20914,20984,21084,21086,...,84380,84535B,84596B,84828,84997D,85014B,85114C,90030B,90201C,POST
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
113,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
114,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
115,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


# [問題] 將 df_trans 資料利用 apriori 找出 min_support=0.01 的項目集

In [7]:
# Mine the itemsets whose support reaches the 0.01 minimum; use_colnames=True
# reports each itemset by stock code (the column names of df_trans).
frequent_itemsets = apriori(
    df_trans,
    min_support=0.01,
    use_colnames=True,
)

frequent_itemsets

Unnamed: 0,support,itemsets
0,0.025641,(20750)
1,0.017094,(22352)
2,0.017094,(22383)
3,0.034188,(22437)
4,0.025641,(22555)
5,0.017094,(22605)
6,0.017094,(22699)
7,0.017094,(22726)
8,0.017094,(22892)
9,0.017094,(22894)


新增欄位 `length`，記錄 `frequent_itemsets` 中每個 `itemsets` 所包含的產品數量。

In [8]:
# Record how many products each itemset contains in a new `length` column.
itemset_sizes = frequent_itemsets["itemsets"].apply(len)
frequent_itemsets["length"] = itemset_sizes
frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.025641,(20750),1
1,0.017094,(22352),1
2,0.017094,(22383),1
3,0.034188,(22437),1
4,0.025641,(22555),1
5,0.017094,(22605),1
6,0.017094,(22699),1
7,0.017094,(22726),1
8,0.017094,(22892),1
9,0.017094,(22894),1


找出 frequent itemsets 中 `frequent_itemsets["length"] >= 2` 且 `frequent_itemsets["support"] >= 0.01` 的項目集。

In [9]:
# Itemsets that contain at least two products and have support of 0.01 or more.
has_multiple_items = frequent_itemsets["length"] >= 2
meets_min_support = frequent_itemsets["support"] >= 0.01
frequent_itemsets[has_multiple_items & meets_min_support]

Unnamed: 0,support,itemsets,length
18,0.017094,"(22383, 23209)",2


# 關聯規則

- 使用了 `association_rules` 函數來從頻繁項目集（frequent_itemsets）中產生關聯規則（association rules）。
- 根據指定的門檻值，`confidence` 大於 `0.3`，使用 `association_rules` 函數來生成關聯規則。

In [10]:
# Derive association rules from the frequent itemsets, using confidence as the
# evaluation metric with a threshold of 0.3.
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.3,
)

rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(22383),(23209),0.017094,0.017094,0.017094,1.0,58.5,0.016802,inf,1.0
1,(23209),(22383),0.017094,0.017094,0.017094,1.0,58.5,0.016802,inf,1.0


根據指定的門檻值 `lift > 1.000000001`，使用 `association_rules` 函數來從頻繁項目集（frequent_itemsets）中產生關聯規則（association rules）

In [11]:
# Derive association rules again, this time using lift as the evaluation
# metric with a threshold of 1.000000001. This rebinds `rules`.
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1.000000001,
)

rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(22383),(23209),0.017094,0.017094,0.017094,1.0,58.5,0.016802,inf,1.0
1,(23209),(22383),0.017094,0.017094,0.017094,1.0,58.5,0.016802,inf,1.0


## 共同篩選

共同篩選同時符合 `confidence >= 0.3` 與 `lift >= 1.000000001` 的關聯規則。

先生成所有規則，僅用一個基本篩選標準（如 confidence）

In [12]:
# Base rule set: generated with a single criterion (confidence, threshold 0.3).
rule = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.3,
)

再篩選符合特定 lift 和 confidence 的規則

In [13]:
# Keep only the rules that satisfy both the confidence and the lift threshold.
# .copy() makes `rules` an independent frame, so adding columns to it cannot
# trigger pandas' SettingWithCopyWarning or be confused with writes to `rule`.
rules = rule.loc[(rule['confidence'] >= 0.3) & (rule['lift'] >= 1.000000001)].copy()

In [14]:
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(22383),(23209),0.017094,0.017094,0.017094,1.0,58.5,0.016802,inf,1.0
1,(23209),(22383),0.017094,0.017094,0.017094,1.0,58.5,0.016802,inf,1.0


# 進階：找出符合標準的關聯規則（association rules）

算出前置產品長度。

In [15]:
# Add the number of products in each rule's antecedent as `antecedents_len`.
# assign() returns a new frame instead of writing into `rules` in place, which
# avoids SettingWithCopyWarning when `rules` is a filtered slice of another
# frame and keeps this cell safe to re-run.
rules = rules.assign(antecedents_len=rules["antecedents"].map(len))

rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric,antecedents_len
0,(22383),(23209),0.017094,0.017094,0.017094,1.0,58.5,0.016802,inf,1.0,1
1,(23209),(22383),0.017094,0.017094,0.017094,1.0,58.5,0.016802,inf,1.0,1


根據條件篩選，找出更精準的關聯規則（association rules）。

條件：

- 最少 1 個 antecedents
- confidence >= 0.3，lift score > 1.000000001

In [16]:
# Keep the rules that have at least one antecedent, confidence of 0.3 or more,
# and lift strictly above 1.000000001.
has_antecedent = rules["antecedents_len"] >= 1
confident_enough = rules["confidence"] >= 0.3
lift_above_one = rules["lift"] > 1.000000001
filtered_criterial = rules[has_antecedent & confident_enough & lift_above_one]

filtered_criterial

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric,antecedents_len
0,(22383),(23209),0.017094,0.017094,0.017094,1.0,58.5,0.016802,inf,1.0,1
1,(23209),(22383),0.017094,0.017094,0.017094,1.0,58.5,0.016802,inf,1.0,1


# 問題：rules 前置項為 23209 的產品，其應與什麼產品搭配？

In [24]:
# Consequents of the rules whose antecedent is exactly the product 23209.
antecedent_is_23209 = filtered_criterial["antecedents"] == {'23209'}
filtered_criterial.loc[antecedent_is_23209, 'consequents']

1    (22383)
Name: consequents, dtype: object