In [33]:
import polars as pl
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder

In [34]:
# Toy market-basket data: eight transactions, each with the list of items bought.
market_data = {
    'Transaction ID': list(range(1, 9)),
    'Items': [
        ['T-Shirt', 'Pants', 'Jeans', 'Jersy', 'Socks', 'Basketball', 'Bottle', 'Shorts'],
        ['T-Shirt', 'Jeans'],
        ['Jersy', 'Basketball', 'Socks', 'Bottle'],
        ['Jeans', 'Pants', 'Bottle'],
        ['Shorts', 'Basketball'],
        ['Shorts', 'Jersy'],
        ['T-Shirt'],
        ['Basketball', 'Jersy'],
    ],
}

# One row per transaction; 'Items' holds that transaction's item list.
df = pl.DataFrame(market_data)
df

Transaction ID,Items
i64,list[str]
1,"[""T-Shirt"", ""Pants"", … ""Shorts""]"
2,"[""T-Shirt"", ""Jeans""]"
3,"[""Jersy"", ""Basketball"", … ""Bottle""]"
4,"[""Jeans"", ""Pants"", ""Bottle""]"
5,"[""Shorts"", ""Basketball""]"
6,"[""Shorts"", ""Jersy""]"
7,"[""T-Shirt""]"
8,"[""Basketball"", ""Jersy""]"


# 使用 TransactionEncoder 將商品轉換為 One-Hot Encoding

- 透過該函數的 `fit` 方法，`TransactionEncoder` 將學習列表（list）或序列（series）中的唯一項目（unique items）。
- 並透過該 `transform` 方法，將輸入的列表轉換為 NumPy 布林陣列（boolean array）。

In [35]:
# Learn the set of unique items from the transactions, then encode every
# transaction as a row of booleans (one column per learned item).
te = TransactionEncoder()
te_ary = te.fit(df['Items']).transform(df['Items'])

te_ary

array([[ True,  True,  True,  True,  True,  True,  True,  True],
       [False, False,  True, False, False, False, False,  True],
       [ True,  True, False,  True, False, False,  True, False],
       [False,  True,  True, False,  True, False, False, False],
       [ True, False, False, False, False,  True, False, False],
       [False, False, False,  True, False,  True, False, False],
       [False, False, False, False, False, False, False,  True],
       [ True, False, False,  True, False, False, False, False]])

## 轉換成 Apriori 函數要求資料格式

- **Apriori 函數**要求資料使用 Pandas DataFrame 格式。這裡先將 array 轉換為 Polars DataFrame（欄位名稱取自 `te.columns_`），稍後再由包裝函數 `apriori_pl` 以 `to_pandas()` 轉成 Pandas DataFrame 後傳入 Apriori。

In [36]:
# Wrap the boolean matrix in a Polars DataFrame with one named column per item.
# te_ary is laid out as transactions x items, and in this example it happens to
# be square (8 x 8), so the orientation cannot be told apart from the schema
# length alone. State it explicitly so rows are always read as transactions.
data_items_dumm = pl.DataFrame(te_ary, schema=te.columns_, orient="row")
data_items_dumm

Basketball,Bottle,Jeans,Jersy,Pants,Shorts,Socks,T-Shirt
bool,bool,bool,bool,bool,bool,bool,bool
True,True,True,True,True,True,True,True
False,False,True,False,False,False,False,True
True,True,False,True,False,False,True,False
False,True,True,False,True,False,False,False
True,False,False,False,False,True,False,False
False,False,False,True,False,True,False,False
False,False,False,False,False,False,False,True
True,False,False,True,False,False,False,False


# 計算支持度達 0.2 的項目集

In [37]:
def apriori_pl(data: pl.DataFrame, min_support=0.5, use_colnames=False, *args, **kwargs) -> pl.DataFrame:
    """Run mlxtend's ``apriori`` on a Polars DataFrame and return a Polars DataFrame.

    Args:
        data: One-hot encoded transactions; converted with ``to_pandas()``
            before being handed to ``apriori``.
        min_support: Minimum support threshold, forwarded to ``apriori``.
        use_colnames: Forwarded to ``apriori``; when true the itemsets carry
            column names instead of column indices.
        *args: Extra positional arguments forwarded to ``apriori`` after
            ``min_support`` and ``use_colnames``.
        **kwargs: Extra keyword arguments forwarded to ``apriori``.

    Returns:
        A Polars DataFrame built from the ``apriori`` result, with every value
        of the ``itemsets`` column converted to a list.
    """
    # min_support / use_colnames are forwarded positionally so that *args lands
    # in the slots that follow them. Passing them by keyword while *args is
    # unpacked right after the data frame made the first extra positional
    # argument collide with min_support and raise TypeError.
    frequent_itemsets = apriori(data.to_pandas(), min_support, use_colnames, *args, **kwargs)

    # mlxtend reports each itemset as a frozenset; turn it into a plain list
    # before building the Polars frame.
    frequent_itemsets['itemsets'] = frequent_itemsets['itemsets'].apply(list)
    return pl.DataFrame(frequent_itemsets)

In [38]:
# No minimum support is passed here, so the wrapper's default min_support=0.5 applies.
frequent_itemsets = apriori_pl(data=data_items_dumm, use_colnames=True)
frequent_itemsets

support,itemsets
f64,list[str]
0.5,"[""Basketball""]"
0.5,"[""Jersy""]"


In [39]:
# Lower the support threshold to 0.2 so less frequent itemsets are kept as well.
frequent_itemsets = apriori_pl(
    data=data_items_dumm,
    use_colnames=True,
    min_support=0.2,
)
frequent_itemsets

support,itemsets
f64,list[str]
0.5,"[""Basketball""]"
0.375,"[""Bottle""]"
0.375,"[""Jeans""]"
0.5,"[""Jersy""]"
0.25,"[""Pants""]"
…,…
0.25,"[""Basketball"", ""Socks"", ""Bottle""]"
0.25,"[""Jersy"", ""Basketball"", ""Socks""]"
0.25,"[""Pants"", ""Jeans"", ""Bottle""]"
0.25,"[""Jersy"", ""Socks"", ""Bottle""]"


# 計算關聯規則

在未指定門檻值的情況下，使用 `association_rules` 函數從頻繁項目集（`frequent_itemsets`）中產生關聯規則（association rules）。
預設情況下，`association_rules` 只會依 `metric="confidence"`、`min_threshold=0.8` 的條件來篩選規則，而 `lift` 並沒有被設定特定的篩選標準。

In [40]:
def association_rules_pl(data: pl.DataFrame, metric="confidence", min_threshold=0.8, *args, **kwargs):
    """Generate association rules from a Polars frame of frequent itemsets.

    The input is converted with ``to_pandas()``, passed to mlxtend's
    ``association_rules`` together with ``metric`` and ``min_threshold``, and
    the result is returned as a Polars DataFrame whose ``antecedents`` and
    ``consequents`` values are lists.

    NOTE(review): extra positional arguments are unpacked immediately after
    the data frame, ahead of the ``metric`` / ``min_threshold`` keywords —
    confirm this matches the installed mlxtend signature before relying on
    ``*args``.
    """
    rules_pd = association_rules(
        data.to_pandas(),
        *args,
        metric=metric,
        min_threshold=min_threshold,
        **kwargs,
    )

    # Both sides of every rule are converted to plain lists before the
    # Polars frame is built.
    for side in ("antecedents", "consequents"):
        rules_pd[side] = rules_pd[side].apply(list)

    return pl.DataFrame(rules_pd)

In [41]:
# No metric or threshold is given, so the wrapper's defaults apply
# (metric="confidence", min_threshold=0.8).
association_rules_df = association_rules_pl(data=frequent_itemsets)
association_rules_df


antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
list[str],list[str],f64,f64,f64,f64,f64,f64,f64,f64
"[""Socks""]","[""Basketball""]",0.25,0.5,0.25,1.0,2.0,0.125,inf,0.666667
"[""Pants""]","[""Bottle""]",0.25,0.375,0.25,1.0,2.666667,0.15625,inf,0.833333
"[""Socks""]","[""Bottle""]",0.25,0.375,0.25,1.0,2.666667,0.15625,inf,0.833333
"[""Pants""]","[""Jeans""]",0.25,0.375,0.25,1.0,2.666667,0.15625,inf,0.833333
"[""Socks""]","[""Jersy""]",0.25,0.5,0.25,1.0,2.0,0.125,inf,0.666667
…,…,…,…,…,…,…,…,…,…
"[""Jersy"", ""Bottle""]","[""Basketball"", ""Socks""]",0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0
"[""Basketball"", ""Socks""]","[""Jersy"", ""Bottle""]",0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0
"[""Basketball"", ""Bottle""]","[""Jersy"", ""Socks""]",0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0
"[""Socks"", ""Bottle""]","[""Jersy"", ""Basketball""]",0.25,0.375,0.25,1.0,2.666667,0.15625,inf,0.833333


指定 `confidence` 最小為 `0.6`

In [42]:
# Relax the confidence threshold from the default 0.8 down to 0.6.
association_rules_df = association_rules_pl(
    data=frequent_itemsets,
    min_threshold=0.6,
    metric='confidence',
)
association_rules_df

antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
list[str],list[str],f64,f64,f64,f64,f64,f64,f64,f64
"[""Bottle""]","[""Basketball""]",0.375,0.5,0.25,0.666667,1.333333,0.0625,1.5,0.4
"[""Jersy""]","[""Basketball""]",0.5,0.5,0.375,0.75,1.5,0.125,2.0,0.666667
"[""Basketball""]","[""Jersy""]",0.5,0.5,0.375,0.75,1.5,0.125,2.0,0.666667
"[""Shorts""]","[""Basketball""]",0.375,0.5,0.25,0.666667,1.333333,0.0625,1.5,0.4
"[""Socks""]","[""Basketball""]",0.25,0.5,0.25,1.0,2.0,0.125,inf,0.666667
…,…,…,…,…,…,…,…,…,…
"[""Basketball"", ""Socks""]","[""Jersy"", ""Bottle""]",0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0
"[""Basketball"", ""Bottle""]","[""Jersy"", ""Socks""]",0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0
"[""Socks"", ""Bottle""]","[""Jersy"", ""Basketball""]",0.25,0.375,0.25,1.0,2.666667,0.15625,inf,0.833333
"[""Socks""]","[""Jersy"", ""Basketball"", ""Bottle""]",0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0


指定 `lift` 最小為 `2`

In [43]:
# Filter on lift instead of confidence, using a threshold of 2.
association_rules_df = association_rules_pl(
    data=frequent_itemsets,
    min_threshold=2,
    metric='lift',
)
association_rules_df

antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
list[str],list[str],f64,f64,f64,f64,f64,f64,f64,f64
"[""Basketball""]","[""Socks""]",0.5,0.25,0.25,0.5,2.0,0.125,1.5,1.0
"[""Socks""]","[""Basketball""]",0.25,0.5,0.25,1.0,2.0,0.125,inf,0.666667
"[""Pants""]","[""Bottle""]",0.25,0.375,0.25,1.0,2.666667,0.15625,inf,0.833333
"[""Bottle""]","[""Pants""]",0.375,0.25,0.25,0.666667,2.666667,0.15625,2.25,1.0
"[""Socks""]","[""Bottle""]",0.25,0.375,0.25,1.0,2.666667,0.15625,inf,0.833333
…,…,…,…,…,…,…,…,…,…
"[""Socks"", ""Bottle""]","[""Jersy"", ""Basketball""]",0.25,0.375,0.25,1.0,2.666667,0.15625,inf,0.833333
"[""Jersy""]","[""Basketball"", ""Socks"", ""Bottle""]",0.5,0.25,0.25,0.5,2.0,0.125,1.5,1.0
"[""Basketball""]","[""Jersy"", ""Socks"", ""Bottle""]",0.5,0.25,0.25,0.5,2.0,0.125,1.5,1.0
"[""Socks""]","[""Jersy"", ""Basketball"", ""Bottle""]",0.25,0.25,0.25,1.0,4.0,0.1875,inf,1.0


# 