### 1. Mengimpor Library yang Diperlukan

In [30]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

### 2. Mengimpor Data

In [31]:
# Mengimpor dataset dari file CSV
data = pd.read_csv(r'C:\Users\User\Documents\Modul Data Mining\Asosiasi_Apriori\Groceries_dataset.csv')

In [32]:
data.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [33]:
df = data

### 3. Membersihkan dan Menyiapkan Data

In [34]:
# Menghapus spasi ekstra pada kolom 'itemDescription'
data['itemDescription'] = data['itemDescription'].str.strip()

# Menghapus baris yang tidak memiliki 'Member_number' atau 'itemDescription'
data.dropna(axis=0, subset=['Member_number', 'itemDescription'], inplace=True)

# Mengubah 'Member_number' menjadi tipe data string
data['Member_number'] = data['Member_number'].astype('str')

# Menyaring transaksi hanya untuk tahun tertentu jika diperlukan (misalnya 2015)
data['Date'] = pd.to_datetime(data['Date'], format='%d-%m-%Y')
data_2015 = data[data['Date'].dt.year == 2015]

# Menampilkan data yang telah disaring
print(data_2015.head())

  Member_number       Date   itemDescription
0          1808 2015-07-21    tropical fruit
1          2552 2015-01-05        whole milk
2          2300 2015-09-19         pip fruit
3          1187 2015-12-12  other vegetables
4          3037 2015-02-01        whole milk


### 4. Menerapkan Algoritma Apriori

In [35]:
import numpy as np

# Membuat pivot tabel untuk transaksi dalam bentuk one-hot encoding
basket = (data_2015.groupby(['Member_number', 'itemDescription'])['itemDescription']
        .count().unstack().reset_index().fillna(0).set_index('Member_number'))

# Mengonversi data Quantity menjadi one-hot encoding (1 jika dibeli, 0 jika tidak)
basket = np.where(basket > 0, 1, 0)
basket = pd.DataFrame(basket, index=data_2015['Member_number'].unique(), columns=data_2015['itemDescription'].unique())

# Menampilkan beberapa kolom pertama untuk memeriksa hasil
print(basket.iloc[:, :10].head())  # Hanya menampilkan 10 kolom pertama

      tropical fruit  whole milk  pip fruit  other vegetables  rolls/buns  \
1808               0           0          0                 0           0   
2552               0           0          0                 0           0   
2300               0           0          0                 0           0   
1187               0           0          0                 0           0   
3037               0           0          0                 0           0   

      pot plants  citrus fruit  beef  frankfurter  chicken  
1808           0             0     0            0        0  
2552           0             0     0            1        0  
2300           0             0     0            0        0  
1187           0             0     0            0        0  
3037           0             0     0            0        0  


### 5. Menjalankan Algoritma Apriori

In [36]:
# Mengonversi data menjadi tipe boolean
basket_bool = basket.astype(bool)

# Menjalankan algoritma Apriori untuk menemukan itemset yang sering muncul
frequent_itemsets = apriori(basket_bool, min_support=0.01, use_colnames=True)
print(frequent_itemsets.head())

    support        itemsets
0  0.047978    (whole milk)
1  0.016596  (citrus fruit)
2  0.097465   (frankfurter)
3  0.058238       (chicken)
4  0.041340        (butter)


### 6. Menyusun Aturan Asosiasi

In [37]:
from mlxtend.frequent_patterns import association_rules

# Menyusun aturan asosiasi dengan menambahkan num_itemsets
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1, num_itemsets=1)

# Menampilkan hasilnya
print(rules.head())


       antecedents      consequents  antecedent support  consequent support  \
0     (whole milk)       (cookware)            0.047978            0.262824   
1       (cookware)     (whole milk)            0.262824            0.047978   
2     (whole milk)  (sweet spreads)            0.047978            0.226916   
3  (sweet spreads)     (whole milk)            0.226916            0.047978   
4     (whole milk)     (mayonnaise)            0.047978            0.197043   

    support  confidence      lift  representativity  leverage  conviction  \
0  0.014484    0.301887  1.148626               1.0  0.001874    1.055954   
1  0.014484    0.055109  1.148626               1.0  0.001874    1.007547   
2  0.012372    0.257862  1.136374               1.0  0.001485    1.041698   
3  0.012372    0.054521  1.136374               1.0  0.001485    1.006920   
4  0.011467    0.238994  1.212902               1.0  0.002013    1.055126   

   zhangs_metric   jaccard  certainty  kulczynski  
0       0.

### 7. Menyaring Aturan Berdasarkan Confidence dan Lift

In [38]:
# Menyaring aturan dengan lift > 1 dan confidence > 0.7
rules_lift = rules[rules['lift'] > 1]
rules_confidence = rules[rules['confidence'] > 0.7]

In [39]:
# Menampilkan tabel pertama untuk aturan dengan lift > 1
display(rules_lift)

# Menampilkan tabel kedua untuk aturan dengan confidence > 0.7
display(rules_confidence)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(whole milk),(cookware),0.047978,0.262824,0.014484,0.301887,1.148626,1.0,0.001874,1.055954,0.135915,0.048880,0.052989,0.178498
1,(cookware),(whole milk),0.262824,0.047978,0.014484,0.055109,1.148626,1.0,0.001874,1.007547,0.175527,0.048880,0.007490,0.178498
2,(whole milk),(sweet spreads),0.047978,0.226916,0.012372,0.257862,1.136374,1.0,0.001485,1.041698,0.126056,0.047126,0.040029,0.156191
3,(sweet spreads),(whole milk),0.226916,0.047978,0.012372,0.054521,1.136374,1.0,0.001485,1.006920,0.155233,0.047126,0.006873,0.156191
4,(whole milk),(mayonnaise),0.047978,0.197043,0.011467,0.238994,1.212902,1.0,0.002013,1.055126,0.184377,0.049096,0.052245,0.148593
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1661,"(make up remover, mayonnaise)","(ready soups, sweet spreads)",0.047978,0.094750,0.010260,0.213836,2.256860,1.0,0.005714,1.151479,0.584972,0.077449,0.131551,0.161058
1662,(ready soups),"(sweet spreads, make up remover, mayonnaise)",0.352746,0.014484,0.010260,0.029085,2.008055,1.0,0.005150,1.015038,0.775593,0.028740,0.014815,0.368709
1663,(sweet spreads),"(ready soups, make up remover, mayonnaise)",0.226916,0.025347,0.010260,0.045213,1.783751,1.0,0.004508,1.020806,0.568352,0.042394,0.020382,0.224987
1664,(make up remover),"(ready soups, sweet spreads, mayonnaise)",0.185878,0.026252,0.010260,0.055195,2.102478,1.0,0.005380,1.030633,0.644094,0.050822,0.029723,0.223000


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
1655,"(sweet spreads, make up remover, mayonnaise)",(ready soups),0.014484,0.352746,0.01026,0.708333,2.008055,1.0,0.00515,2.219157,0.509384,0.02874,0.549378,0.368709


### 8. Visualisasi Hasil 

In [40]:
import plotly.express as px


# Mengonversi frozenset menjadi string untuk menghindari masalah serialisasi
rules_lift['antecedents'] = rules_lift['antecedents'].apply(lambda x: ', '.join(list(x)) if isinstance(x, frozenset) else x)
rules_lift['consequents'] = rules_lift['consequents'].apply(lambda x: ', '.join(list(x)) if isinstance(x, frozenset) else x)

# Sampling data hanya jika jumlah aturan > 1000, jika tidak gunakan data asli
sampled_data = rules_lift.sample(1000, random_state=42) if len(rules_lift) > 1000 else rules_lift
# Scatter plot untuk visualisasi tambahan
fig_scatter = px.scatter(
    sampled_data,
    x='lift',
    y='confidence',
    size='support',
    color='lift',
    hover_name='antecedents',
    hover_data={'support': True, 'confidence': True, 'lift': True, 'consequents': True},
    title='Scatter Plot dari Aturan Asosiasi',
    labels={'lift': 'Lift', 'confidence': 'Confidence', 'support': 'Support'},
    template='plotly_dark'
)

# Menampilkan grafik
fig_scatter.show()

Heatmap untuk melihat korelasi antar metrik

In [43]:
# Heatmap untuk melihat korelasi antar metrik
fig_heatmap = px.imshow(
    sampled_data[['lift', 'confidence', 'support']].corr(),
    title='Heatmap dari Metrik Aturan Asosiasi',
    labels={'x': 'Metrik', 'y': 'Metrik'},
    template='plotly_dark'
)

# Menampilkan grafik
fig_heatmap.show()