In [3]:
# Import libraries
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules


In [6]:
# Load the cleaned dataset.
# stock_code mixes purely numeric codes (e.g. 71053) with alphanumeric ones
# (e.g. 85123A). Left to type inference, read_csv's chunked parser can hand
# back the same code as an int in one chunk and as a str in another, which
# would split that item's counts in value_counts()/groupby() further down.
# Pinning the column to str keeps every code comparable.
df = pd.read_csv("cleaned_retail_dataset.csv", dtype={"stock_code": str})

In [7]:
# Preview the first five rows to sanity-check the loaded columns
df.head(n=5)

Unnamed: 0,invoice_no,stock_code,description,quantity,invoice_date,unit_price,customer_id,country,total_price,Month
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.3,2010-12
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010-12
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,22.0,2010-12
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010-12
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010-12


In [22]:
# Keep only the 50 stock codes that occur most often (to avoid memory error
# when the invoice x item matrix is built later on)
top_items = df['stock_code'].value_counts().index[:50]
df_top = df.loc[df['stock_code'].isin(top_items)]

In [29]:
# Prepare basket (Invoice × StockCode): one row per invoice, one column per
# stock code, each cell holding the total quantity of that item on that invoice.
# fill_value=0 marks invoice/item pairs that never occur as 0 directly, so no
# NaN is introduced and no separate fillna pass is needed.
basket = (
    df_top.groupby(['invoice_no', 'stock_code'])['quantity']
    .sum()
    .unstack(fill_value=0)
)

# One-hot encode for Apriori: True when the invoice has a positive net quantity
# of the item, False otherwise. A vectorised comparison yields the boolean
# frame in one step instead of calling a Python lambda on every cell.
basket = basket > 0

In [30]:
# Mine frequent itemsets with Apriori: keep itemsets whose support is at
# least 0.01, labelled by the basket's column names rather than positions
frequent_items = apriori(
    basket,
    min_support=0.01,
    use_colnames=True,
)
# Report (number of itemsets, number of columns)
print("Frequent Itemsets Found:", frequent_items.shape)

Frequent Itemsets Found: (465, 2)


In [26]:
# Derive association rules from the frequent itemsets, keeping those with
# confidence >= 0.1, and order them from highest to lowest lift
rules = (
    association_rules(frequent_items, metric="confidence", min_threshold=0.1)
    .sort_values(by="lift", ascending=False)
)
rules.head()


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
1618,"(23203, 20725)","(23209, 85099B)",0.019396,0.01917,0.010224,0.527132,27.497674,1.0,0.009852,2.074214,0.982693,0.360743,0.51789,0.530233
1619,"(23209, 85099B)","(23203, 20725)",0.01917,0.019396,0.010224,0.533333,27.497674,1.0,0.009852,2.101295,0.982467,0.360743,0.524103,0.530233
1823,"(22384, 22383)","(20728, 20727, 20725)",0.030071,0.019696,0.010374,0.345,17.515992,1.0,0.009782,1.496647,0.972142,0.263359,0.33184,0.435859
1820,"(20728, 20727, 20725)","(22384, 22383)",0.019696,0.030071,0.010374,0.526718,17.515992,1.0,0.009782,2.049367,0.961854,0.263359,0.512044,0.435859
1828,"(20728, 20725)","(22384, 22383, 20727)",0.034506,0.018268,0.010374,0.300654,16.458,1.0,0.009744,1.403785,0.972807,0.244681,0.28764,0.434277


In [28]:
# Display the 10 highest-lift rules, limited to the key columns
rules.head(10)[['antecedents', 'consequents', 'support', 'confidence', 'lift']]


Unnamed: 0,antecedents,consequents,support,confidence,lift
1618,"(23203, 20725)","(23209, 85099B)",0.010224,0.527132,27.497674
1619,"(23209, 85099B)","(23203, 20725)",0.010224,0.533333,27.497674
1823,"(22384, 22383)","(20728, 20727, 20725)",0.010374,0.345,17.515992
1820,"(20728, 20727, 20725)","(22384, 22383)",0.010374,0.526718,17.515992
1828,"(20728, 20725)","(22384, 22383, 20727)",0.010374,0.300654,16.458
1815,"(22384, 22383, 20727)","(20728, 20725)",0.010374,0.567901,16.458
1816,"(22384, 22383, 20725)","(20728, 20727)",0.010374,0.492857,15.572413
1827,"(20728, 20727)","(22384, 22383, 20725)",0.010374,0.327791,15.572413
1826,"(20728, 22384)","(22383, 20727, 20725)",0.010374,0.323185,15.353597
1817,"(22383, 20727, 20725)","(20728, 22384)",0.010374,0.492857,15.353597
