In [None]:

# Step 1: Import Required Libraries

import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules

# Expert Note:
# Pandas – data handling
# Mlxtend – provides Apriori & association rule functions


In [None]:

# Step 2: Load the Dataset

# Replace path with your dataset CSV (Online Retail)
df = pd.read_excel("Online Retail.xlsx")

# Quick check
df.head()



Unnamed: 0,"shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil"
0,"burgers,meatballs,eggs"
1,chutney
2,"turkey,avocado"
3,"mineral water,milk,energy bar,whole wheat rice..."
4,low fat yogurt


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 1 columns):
 #   Column                                                                                                                                                                                                                           Non-Null Count  Dtype 
---  ------                                                                                                                                                                                                                           --------------  ----- 
 0   shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil  7500 non-null   object
dtypes: object(1)
memory usage: 58.7+ KB


In [4]:
print(df.columns)


Index(['shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil'], dtype='object')


In [5]:
df.columns = ["items"]

In [None]:

# Step 3: Split Items into List

# Each row → list of items
df['items'] = df['items'].apply(lambda x: x.split(','))




In [7]:
df.head(10)

Unnamed: 0,items
0,"[burgers, meatballs, eggs]"
1,[chutney]
2,"[turkey, avocado]"
3,"[mineral water, milk, energy bar, whole wheat ..."
4,[low fat yogurt]
5,"[whole wheat pasta, french fries]"
6,"[soup, light cream, shallot]"
7,"[frozen vegetables, spaghetti, green tea]"
8,[french fries]
9,"[eggs, pet food]"


In [None]:

# Step 4: Convert to Basket Format

from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_data = te.fit(df['items']).transform(df['items'])

basket = pd.DataFrame(te_data, columns=te.columns_)
print(basket.head())

# Now 'basket' is a 0/1 matrix (Transaction vs Items).


    asparagus  almonds  antioxydant juice  asparagus  avocado  babies food  \
0       False    False              False      False    False        False   
1       False    False              False      False    False        False   
2       False    False              False      False     True        False   
3       False    False              False      False    False        False   
4       False    False              False      False    False        False   

   bacon  barbecue sauce  black tea  blueberries  ...  turkey  vegetables mix  \
0  False           False      False        False  ...   False           False   
1  False           False      False        False  ...   False           False   
2  False           False      False        False  ...    True           False   
3  False           False      False        False  ...   False           False   
4  False           False      False        False  ...   False           False   

   water spray  white wine  whole weat flour

In [None]:

# Step 5: Apply Apriori

frequent_itemsets = apriori(basket, min_support=0.02, use_colnames=True)

print(frequent_itemsets.head())


    support    itemsets
0  0.020267   (almonds)
1  0.033200   (avocado)
2  0.033733  (brownies)
3  0.087200   (burgers)
4  0.030133    (butter)


In [None]:

# Step 6: Generate Rules

rules = association_rules(frequent_itemsets,
                          metric="lift",
                          min_threshold=1.0)

# Apply thresholds
rules = rules[(rules['support'] > 0.02) &
              (rules['confidence'] > 0.3) &
              (rules['lift'] > 1.2)]

print(rules[['antecedents','consequents','support','confidence','lift']].head(10))


            antecedents      consequents   support  confidence      lift
1             (burgers)           (eggs)  0.028800    0.330275  1.837585
8                (cake)  (mineral water)  0.027467    0.338816  1.422002
10            (chicken)  (mineral water)  0.022800    0.380000  1.594852
25          (chocolate)  (mineral water)  0.052667    0.321400  1.348907
28        (cooking oil)  (mineral water)  0.020133    0.394256  1.654683
53    (frozen smoothie)  (mineral water)  0.020133    0.318565  1.337012
57  (frozen vegetables)  (mineral water)  0.035733    0.374825  1.573133
65        (ground beef)  (mineral water)  0.040933    0.416554  1.748266
67        (ground beef)      (spaghetti)  0.039200    0.398915  2.290857
69     (low fat yogurt)  (mineral water)  0.023867    0.312391  1.311098


In [None]:

# Step 7: Interpret Top Rules

rules_sorted = rules.sort_values(by="lift", ascending=False)

print("\nTop 5 Rules:")
for i, row in rules_sorted.head(5).iterrows():
    print(f"Rule: {list(row['antecedents'])} → {list(row['consequents'])}")
    print(f"Support: {row['support']:.3f}, Confidence: {row['confidence']:.3f}, Lift: {row['lift']:.3f}\n")



Top 5 Rules:
Rule: ['ground beef'] → ['spaghetti']
Support: 0.039, Confidence: 0.399, Lift: 2.291

Rule: ['olive oil'] → ['spaghetti']
Support: 0.023, Confidence: 0.349, Lift: 2.004

Rule: ['soup'] → ['mineral water']
Support: 0.023, Confidence: 0.456, Lift: 1.916

Rule: ['burgers'] → ['eggs']
Support: 0.029, Confidence: 0.330, Lift: 1.838

Rule: ['tomatoes'] → ['spaghetti']
Support: 0.021, Confidence: 0.306, Lift: 1.758



### 1. What is lift and why is it important in Association rules?

**Lift**  
- Lift measures how much more likely items A and B occur together compared to if they were independent.  
- Formula:  
  \[
  Lift(A \rightarrow B) = \frac{Support(A \cup B)}{Support(A) \times Support(B)}
  \]
- **Interpretation:**  
  - Lift > 1 → A and B are positively correlated (strong rule).  
  - Lift = 1 → A and B are independent.  
  - Lift < 1 → A and B are negatively correlated.  
- **Importance:** It helps identify rules that are not just frequent but also **statistically significant**.  

---

### 2. What is support and confidence? How do you calculate them?

**Support**  
- Support tells us how frequently an itemset appears in the dataset.  
- Formula:  
  \[
  Support(A) = \frac{\text{Transactions containing A}}{\text{Total Transactions}}
  \]
- Example: If 200 out of 1000 transactions contain "milk", support(milk) = 200/1000 = 0.2 (20%).  

**Confidence**  
- Confidence measures how often items in B appear in transactions that contain A.  
- Formula:  
  \[
  Confidence(A \rightarrow B) = \frac{Support(A \cup B)}{Support(A)}
  \]
- Example: If 100 out of 200 milk transactions also contain bread, confidence(milk → bread) = 100/200 = 0.5 (50%).  

---

### 3. What are some limitations or challenges of Association rules mining?

1. **Too many rules** – Generates a very large number of rules, many of which are not useful.  
2. **Computationally expensive** – Mining can be slow for very large datasets.  
3. **Support–Confidence limitations** – High support & confidence do not always mean interesting or useful rules.  
4. **Redundancy** – Many rules may convey the same information.  
5. **Domain dependency** – Interestingness of rules often depends on domain knowledge.  
6. **Scalability** – Performance drops with increasing number of items and transactions.  
