In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder

In [2]:
# Toy market-basket dataset: eight transactions, each paired with the list of
# items bought together. Item labels are kept exactly as spelled in the data,
# because every downstream table is keyed on these strings.
market_data = {
    'Transaction ID': list(range(1, 9)),
    'Items': [
        ['T-Shirt', 'Pants', 'Jeans', 'Jersy', 'Socks', 'Basketball', 'Bottle', 'Shorts'],
        ['T-Shirt', 'Jeans'],
        ['Jersy', 'Basketball', 'Socks', 'Bottle'],
        ['Jeans', 'Pants', 'Bottle'],
        ['Shorts', 'Basketball'],
        ['Shorts', 'Jersy'],
        ['T-Shirt'],
        ['Basketball', 'Jersy'],
    ],
}

# One row per transaction; the 'Items' column holds Python lists.
df = pd.DataFrame(data=market_data, columns=['Transaction ID', 'Items'])
df

Unnamed: 0,Transaction ID,Items
0,1,"[T-Shirt, Pants, Jeans, Jersy, Socks, Basketba..."
1,2,"[T-Shirt, Jeans]"
2,3,"[Jersy, Basketball, Socks, Bottle]"
3,4,"[Jeans, Pants, Bottle]"
4,5,"[Shorts, Basketball]"
5,6,"[Shorts, Jersy]"
6,7,[T-Shirt]
7,8,"[Basketball, Jersy]"


# 使用 TransactionEncoder 將商品轉換為 One-Hot Encoding

- `TransactionEncoder` 是一個類別（class）：透過它的 `fit` 方法，會從交易資料（list 或 pandas Series）中學習所有不重複的商品項目（unique items），並存放在 `columns_` 屬性中。
- 接著透過 `transform` 方法，將輸入的交易列表轉換為 NumPy 布林陣列（boolean array）：每一列代表一筆交易，每一欄代表一項商品。

In [3]:
# Step 1: fit learns the unique item labels found in the transactions.
# Step 2: transform one-hot encodes every transaction into a boolean array.
te = TransactionEncoder()
items = df['Items']
te_ary = te.fit(items).transform(items)

te_ary

array([[ True,  True,  True,  True,  True,  True,  True,  True],
       [False, False,  True, False, False, False, False,  True],
       [ True,  True, False,  True, False, False,  True, False],
       [False,  True,  True, False,  True, False, False, False],
       [ True, False, False, False, False,  True, False, False],
       [False, False, False,  True, False,  True, False, False],
       [False, False, False, False, False, False, False,  True],
       [ True, False, False,  True, False, False, False, False]])

## 轉換成 Apriori 函數要求資料格式

- **Apriori 函數**要求資料使用 Pandas DataFrame 格式，因此在這裡需要進行轉換：將 array 轉換為 DataFrame。

In [4]:
# Wrap the one-hot boolean array in a DataFrame, labelling each column with
# the item name the encoder stored in `columns_`.
data_items_dumm = pd.DataFrame(data=te_ary, columns=te.columns_)
data_items_dumm

Unnamed: 0,Basketball,Bottle,Jeans,Jersy,Pants,Shorts,Socks,T-Shirt
0,True,True,True,True,True,True,True,True
1,False,False,True,False,False,False,False,True
2,True,True,False,True,False,False,True,False
3,False,True,True,False,True,False,False,False
4,True,False,False,False,False,True,False,False
5,False,False,False,True,False,True,False,False
6,False,False,False,False,False,False,False,True
7,True,False,False,True,False,False,False,False


# 計算支持度達 0.2 的項目集

In [5]:
# No minimum support is passed here on purpose: apriori then falls back to
# its default of min_support=0.5.
frequent_itemsets = apriori(df=data_items_dumm, use_colnames=True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.5,(Basketball)
1,0.5,(Jersy)


In [6]:
# Keep every itemset whose support is at least 0.2, reported by item name
# (use_colnames=True) rather than by column index.
frequent_itemsets = apriori(
    df=data_items_dumm,
    use_colnames=True,
    min_support=0.2,
)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.5,(Basketball)
1,0.375,(Bottle)
2,0.375,(Jeans)
3,0.5,(Jersy)
4,0.25,(Pants)
5,0.375,(Shorts)
6,0.25,(Socks)
7,0.375,(T-Shirt)
8,0.25,"(Basketball, Bottle)"
9,0.375,"(Jersy, Basketball)"


# 計算關聯規則

這裡不指定任何門檻值，直接使用 `association_rules` 函數從頻繁項目集（`frequent_itemsets`）中產生關聯規則（association rules）。
預設情況下，`association_rules` 使用 `metric='confidence'`、`min_threshold=0.8`，也就是只保留 `confidence` 大於等於 `0.8` 的規則；`lift` 等其他指標仍會被計算，但不會作為篩選條件。

In [7]:
# Derive rules from the frequent itemsets using the library defaults
# (no metric or threshold is passed explicitly).
association_rules_df = association_rules(df=frequent_itemsets)
association_rules_df


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Socks),(Basketball),0.25,0.5,0.25,1.0,2.0,0.125,inf,0.666667
1,(Pants),(Bottle),0.25,0.375,0.25,1.0,2.666667,0.15625,inf,0.833333
2,(Socks),(Bottle),0.25,0.375,0.25,1.0,2.666667,0.15625,inf,0.833333
3,(Pants),(Jeans),0.25,0.375,0.25,1.0,2.666667,0.15625,inf,0.833333
4,(Socks),(Jersy),0.25,0.5,0.25,1.0,2.0,0.125,inf,0.666667
5,"(Jersy, Bottle)",(Basketball),0.25,0.5,0.25,1.0,2.0,0.125,inf,0.666667
6,"(Basketball, Bottle)",(Jersy),0.25,0.5,0.25,1.0,2.0,0.125,inf,0.666667
7,"(Basketball, Socks)",(Bottle),0.25,0.375,0.25,1.0,2.666667,0.15625,inf,0.833333
8,"(Basketball, Bottle)",(Socks),0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0
9,"(Socks, Bottle)",(Basketball),0.25,0.5,0.25,1.0,2.0,0.125,inf,0.666667


指定 `confidence` 最小為 `0.6`

In [8]:
# Keep only the rules whose confidence is at least 0.6.
association_rules_df = association_rules(
    df=frequent_itemsets,
    min_threshold=0.6,
    metric='confidence',
)
association_rules_df

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Bottle),(Basketball),0.375,0.5,0.25,0.666667,1.333333,0.0625,1.5,0.4
1,(Jersy),(Basketball),0.5,0.5,0.375,0.75,1.5,0.125,2.0,0.666667
2,(Basketball),(Jersy),0.5,0.5,0.375,0.75,1.5,0.125,2.0,0.666667
3,(Shorts),(Basketball),0.375,0.5,0.25,0.666667,1.333333,0.0625,1.5,0.4
4,(Socks),(Basketball),0.25,0.5,0.25,1.0,2.0,0.125,inf,0.666667
5,(Jeans),(Bottle),0.375,0.375,0.25,0.666667,1.777778,0.109375,1.875,0.7
6,(Bottle),(Jeans),0.375,0.375,0.25,0.666667,1.777778,0.109375,1.875,0.7
7,(Bottle),(Jersy),0.375,0.5,0.25,0.666667,1.333333,0.0625,1.5,0.4
8,(Pants),(Bottle),0.25,0.375,0.25,1.0,2.666667,0.15625,inf,0.833333
9,(Bottle),(Pants),0.375,0.25,0.25,0.666667,2.666667,0.15625,2.25,1.0


指定 `lift` 最小為 `2`

In [9]:
# Filter on lift instead of confidence: keep only rules with lift of at least 2.
association_rules_df = association_rules(
    df=frequent_itemsets,
    min_threshold=2,
    metric='lift',
)
association_rules_df

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Basketball),(Socks),0.5,0.25,0.25,0.5,2.0,0.125,1.5,1.0
1,(Socks),(Basketball),0.25,0.5,0.25,1.0,2.0,0.125,inf,0.666667
2,(Pants),(Bottle),0.25,0.375,0.25,1.0,2.666667,0.15625,inf,0.833333
3,(Bottle),(Pants),0.375,0.25,0.25,0.666667,2.666667,0.15625,2.25,1.0
4,(Socks),(Bottle),0.25,0.375,0.25,1.0,2.666667,0.15625,inf,0.833333
5,(Bottle),(Socks),0.375,0.25,0.25,0.666667,2.666667,0.15625,2.25,1.0
6,(Pants),(Jeans),0.25,0.375,0.25,1.0,2.666667,0.15625,inf,0.833333
7,(Jeans),(Pants),0.375,0.25,0.25,0.666667,2.666667,0.15625,2.25,1.0
8,(Jersy),(Socks),0.5,0.25,0.25,0.5,2.0,0.125,1.5,1.0
9,(Socks),(Jersy),0.25,0.5,0.25,1.0,2.0,0.125,inf,0.666667


# 