In [8]:
import pandas as pd
import numpy as np
from scipy.stats.contingency import association

In [9]:
# Sample market-basket data: five transactions, each an inner list of the
# item names bought together in one basket.
transactions = [
    ["牛奶", "麵包"],
    ["麵包", "尿布", "葡萄酒", "甜菜"],
    ["牛奶", "尿布", "葡萄酒", "橙汁"],
    ["麵包", "牛奶", "尿布", "葡萄酒"],
    ["麵包", "牛奶", "尿布", "橙汁"],
]

## Example 4-1 ~ 4-4：資料集編碼

使用 `TransactionEncoder`，我們可以將資料集轉換為適合典型機器學習 API 的陣列（array）格式。透過 `fit` 方法，`TransactionEncoder` 會學習資料集中所有不重複的標籤；再透過 `transform` 方法，將輸入資料集（Python 的巢狀列表，即 list of lists）轉換為獨熱編碼（one-hot encoded）的 NumPy 布林陣列（boolean array）。

In [10]:
from mlxtend.preprocessing import TransactionEncoder

# Step 1: let the encoder learn the unique item labels in the dataset.
te = TransactionEncoder()
te.fit(transactions)

# Step 2: encode every transaction as one row of a boolean array,
# with one column per learned item label.
te_ary = te.transform(transactions)
te_ary

array([[False, False,  True, False, False,  True],
       [ True, False, False,  True,  True,  True],
       [ True,  True,  True, False,  True, False],
       [ True, False,  True, False,  True,  True],
       [ True,  True,  True, False, False,  True]])

In [11]:
# Item labels learned by the encoder during fit; the next cell uses them
# as the column names for te_ary.
te.columns_

['尿布', '橙汁', '牛奶', '甜菜', '葡萄酒', '麵包']

In [12]:
# Sanity check: the encoder output must be a NumPy array before it is wrapped.
assert isinstance(te_ary, np.ndarray)  # type check

# Label each boolean column with the item name the encoder learned for it.
df_trans = pd.DataFrame(data=te_ary, columns=te.columns_)
df_trans

Unnamed: 0,尿布,橙汁,牛奶,甜菜,葡萄酒,麵包
0,False,False,True,False,False,True
1,True,False,False,True,True,True
2,True,True,True,False,True,False
3,True,False,True,False,True,True
4,True,True,True,False,False,True


使用 `inverse_transform` 將編碼後的陣列還原為各筆交易的項目清單；還原後每筆交易內的項目會依編碼欄位的順序重新排列，因此與原始輸入的順序不一定相同。

In [13]:
# Take the first five encoded rows (here, the whole dataset) ...
org_trans = te_ary[:5]
# ... and map each boolean row back to the list of item labels it represents.
te.inverse_transform(org_trans)

[['牛奶', '麵包'],
 ['尿布', '甜菜', '葡萄酒', '麵包'],
 ['尿布', '橙汁', '牛奶', '葡萄酒'],
 ['尿布', '牛奶', '葡萄酒', '麵包'],
 ['尿布', '橙汁', '牛奶', '麵包']]

## Example 4-5 ~ 4-7：Apriori 演算法

反覆產生候選項目集，找出所有高頻項目集，進而推導規則。

In [18]:
from mlxtend.frequent_patterns import apriori, association_rules

In [15]:
# Mine the itemsets whose support reaches the 0.6 threshold, reporting the
# items by their column names.
frequent_itemsets = apriori(
    df_trans,
    min_support=0.6,
    use_colnames=True,
)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.8,(尿布)
1,0.8,(牛奶)
2,0.6,(葡萄酒)
3,0.8,(麵包)
4,0.6,"(牛奶, 尿布)"
5,0.6,"(葡萄酒, 尿布)"
6,0.6,"(麵包, 尿布)"
7,0.6,"(牛奶, 麵包)"


新增 `item_length` 欄位，顯示每個 itemset 所包含的項目數量。

In [17]:
# Record how many items each frequent itemset contains.
# `len` is passed directly: wrapping it as `lambda x: len(x)` adds an extra
# call per row without changing the result.
frequent_itemsets['item_length'] = frequent_itemsets['itemsets'].apply(len)
frequent_itemsets

Unnamed: 0,support,itemsets,item_length
0,0.8,(尿布),1
1,0.8,(牛奶),1
2,0.6,(葡萄酒),1
3,0.8,(麵包),1
4,0.6,"(牛奶, 尿布)",2
5,0.6,"(葡萄酒, 尿布)",2
6,0.6,"(麵包, 尿布)",2
7,0.6,"(牛奶, 麵包)",2


由高頻項目集產生關聯規則。

$$
F(\text{antecedents}) \rightarrow \text{consequents}
$$

欄位說明詳見 [Craft 筆記](https://docs.craft.do/editor/d/380f3fc7-8f72-4fab-5806-087d291a9850/4AB86DEC-43D3-4A31-9C08-24E4CB0C350C/b/622BF40B-4E94-4784-978B-5A04D62B7264?s=YkG1fgtbGMULTBG7imw1sThKv3ip2aEP7VwFN4yQqTqZ#EA5E20C6-ECBF-4880-B6DF-5E1351EC2CD4)。

In [23]:
# Derive association rules from the frequent itemsets, using confidence as
# the filtering metric with a threshold of 0.7.
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.7,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(牛奶),(尿布),0.8,0.8,0.6,0.75,0.9375,-0.04,0.8,-0.25
1,(尿布),(牛奶),0.8,0.8,0.6,0.75,0.9375,-0.04,0.8,-0.25
2,(葡萄酒),(尿布),0.6,0.8,0.6,1.0,1.25,0.12,inf,0.5
3,(尿布),(葡萄酒),0.8,0.6,0.6,0.75,1.25,0.12,1.6,1.0
4,(麵包),(尿布),0.8,0.8,0.6,0.75,0.9375,-0.04,0.8,-0.25
5,(尿布),(麵包),0.8,0.8,0.6,0.75,0.9375,-0.04,0.8,-0.25
6,(牛奶),(麵包),0.8,0.8,0.6,0.75,0.9375,-0.04,0.8,-0.25
7,(麵包),(牛奶),0.8,0.8,0.6,0.75,0.9375,-0.04,0.8,-0.25


## Example 4-8：找出增益 (lift) 大於 1 的規則

In [24]:
# Show only the rules whose lift is strictly greater than 1.
rules.loc[rules["lift"] > 1]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
2,(葡萄酒),(尿布),0.6,0.8,0.6,1.0,1.25,0.12,inf,0.5
3,(尿布),(葡萄酒),0.8,0.6,0.6,0.75,1.25,0.12,1.6,1.0
