# CSV 實例計算關聯規則

讀取資料。

In [1]:
import pandas as pd

# Load the raw retail transactions from the CSV file in the working directory
# and preview the resulting frame as the cell output.
store_data = pd.read_csv(filepath_or_buffer='retail_dataset.csv')
store_data

Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,
...,...,...,...,...,...,...,...
310,Bread,Eggs,Cheese,,,,
311,Meat,Milk,Pencil,,,,
312,Bread,Cheese,Eggs,Meat,Pencil,Diaper,Wine
313,Meat,Cheese,,,,,


建立清理後的資料集。具體做法是把每一列（一筆交易）轉成 Python 的列表，並去除其中的 NaN 值。

In [2]:
# Turn every row of the raw frame into a plain list of its items, keeping
# only the cells for which pd.notna() is true (i.e. dropping the NaN cells).
transactions = [list(filter(pd.notna, row)) for row in store_data.values]

# Preview the first three cleaned transactions.
transactions[:3]

[['Bread', 'Wine', 'Eggs', 'Meat', 'Cheese', 'Pencil', 'Diaper'],
 ['Bread', 'Cheese', 'Meat', 'Diaper', 'Wine', 'Milk', 'Pencil'],
 ['Cheese', 'Meat', 'Eggs', 'Milk', 'Wine']]

將資料集進行編碼。

In [3]:
from mlxtend.preprocessing import TransactionEncoder

# Fit the encoder on the transaction lists, then transform the same lists
# into the encoded array (one column per entry of te.columns_).
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)

# Wrap the encoded array in a DataFrame labelled with the encoder's columns.
df_trans = pd.DataFrame(data=te_ary, columns=te.columns_)

df_trans

Unnamed: 0,Bagel,Bread,Cheese,Diaper,Eggs,Meat,Milk,Pencil,Wine
0,False,True,True,True,True,True,False,True,True
1,False,True,True,True,False,True,True,True,True
2,False,False,True,False,True,True,True,False,True
3,False,False,True,False,True,True,True,False,True
4,False,False,False,False,False,True,False,True,True
...,...,...,...,...,...,...,...,...,...
310,False,True,True,False,True,False,False,False,False
311,False,False,False,False,False,True,True,True,False
312,False,True,True,True,True,True,False,True,True
313,False,False,True,False,False,True,False,False,False


這次我們使用 FP-Max 演算法找出高頻項目集（frequent itemsets）。

> FP-Max is a variant of FP-Growth, which focuses on obtaining maximal itemsets. An itemset X is said to be maximal if X is frequent and there exists no frequent super-pattern containing X. In other words, a frequent pattern X cannot be a sub-pattern of a larger frequent pattern to qualify as a maximal itemset.

In [4]:
from mlxtend.frequent_patterns import fpmax

# Mine the itemsets with FP-Max using a minimum support of 0.2;
# use_colnames=True reports items by column name, as seen in the output.
frequent_itemsets = fpmax(
    df_trans,
    min_support=0.2,
    use_colnames=True,
)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.2,"(Pencil, Bread)"
1,0.2,"(Pencil, Cheese)"
2,0.2,"(Pencil, Wine)"
3,0.2,"(Cheese, Diaper)"
4,0.231746,"(Diaper, Bread)"
5,0.234921,"(Diaper, Wine)"
6,0.225397,"(Bagel, Milk)"
7,0.279365,"(Bagel, Bread)"
8,0.24127,"(Eggs, Wine)"
9,0.244444,"(Milk, Eggs)"


執行關聯規則。由於 FP-Max 只回傳最大項目集，缺少其子集合的支持度，無法計算 `confidence`、`lift` 等指標，因此這裡設定 `support_only=True`：只計算規則的支持度（其餘指標欄位皆為 NaN），並以最小支持度 0.25 篩選規則。

In [5]:
from mlxtend.frequent_patterns import association_rules

# FP-Max yields only maximal itemsets, so the supports of their subsets
# (required for confidence, lift, ...) are not present in `frequent_itemsets`.
# `support_only=True` therefore computes just the rule support and leaves the
# remaining metric columns as NaN, as the output below shows.
# NOTE(review): in this mode mlxtend filters on support whatever `metric`
# says, so the metric is spelled as "support" to match what is actually
# applied -- confirm against the installed mlxtend version.
rules = association_rules(
    frequent_itemsets,
    metric="support",
    min_threshold=0.25,
    support_only=True,
)

rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Bagel),(Bread),,,0.279365,,,,,
1,(Bread),(Bagel),,,0.279365,,,,,
2,(Meat),(Wine),,,0.250794,,,,,
3,(Wine),(Meat),,,0.250794,,,,,
4,(Cheese),(Wine),,,0.269841,,,,,
5,(Wine),(Cheese),,,0.269841,,,,,
6,(Milk),(Bread),,,0.279365,,,,,
7,(Bread),(Milk),,,0.279365,,,,,
