In [2]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

data = [
    ['John', 'Milk', 'Bread', 'Napkin', 'Butter', 'Table salt'],
    ['Mary', 'Lipstick', 'Facewash', 'Hair color', 'Nail polish', 'Bread'],
    ['Ram', 'Rice', 'Sugar', 'Garam masala', 'potato', 'onion'],
    ['Raj', 'Tea', 'Milk', 'wafers', 'Chips', 'nuts'],
    ['Gita', 'Tomato', 'Onion', 'Cooking Oil', 'Tur dal', 'sugar'],
    ['Raj', 'Bread', 'Chips', 'Sauce', 'Pepsi', 'Milk'],
    ['Mary', 'Talcum Powder', 'Fair & Lovely', 'Nail cutter', 'Ribbons', 'Napkin'],
    ['John', 'Onion', 'Tea', 'Milk', 'Butter', 'jam'],
    ['Ram', 'Tur dal', 'Tamarind', 'Sugar', 'pumpkin', 'Milk'],
    ['Raj', 'Noodles', 'chips', 'nuts', 'wafers', 'Tomato'],
    ['Gita', 'Milk Powder', 'Bread', 'Napkin', 'Butter Milk', 'Table salt'],
    ['Mary', 'Ribbon', 'Body Wash', 'Liquid Soap', 'Nail polish', 'Floor Cleaner'],
    ['Ram', 'Cake', 'Floor Cleaner', 'Garam masala', 'potato', 'onion'],
    ['Raj', 'Tea', 'Milk', 'wafers', 'Chips', 'nuts'],
    ['John', 'Tomato', 'Onion', 'Floor Cleaner', 'Tur dal', 'sugar'],
    ['Raj', 'Bread', 'Chips', 'Sauce', 'Pepsi', 'Milk'],
    ['Gita', 'Talcum Powder', 'Fair & Lovely', 'grapes', 'Apple', 'Napkin'],
    ['John', 'Onion', 'Floor Cleaner', 'Milk', 'Butter', 'jam'],
    ['Mary', 'Tur dal', 'Tamarind', 'Sugar', 'pumpkin', 'Milk'],
    ['Gita', 'Noodles', 'chips', 'nuts', 'wafers', 'Tomato'],
    ['Raj', 'Apple', 'Milk', 'wafers', 'Chips', 'nuts'],
    ['John', 'grapes', 'Onion', 'Cooking Oil', 'Tur dal', 'sugar'],
    ['Gita', 'Apple', 'Chips', 'Sauce', 'Pepsi', 'Milk'],
    ['Ram', 'Fair & Lovely', 'Talcum Powder', 'Nail cutter', 'Ribbons', 'Napkin'],
    ['John', 'Onion', 'Tea', 'Milk', 'Butter', 'jam'],
    ['Mary', 'Tur dal', 'Floor Cleaner', 'Sugar', 'grapes', 'Milk'],
    ['Raj', 'Noodles', 'chips', 'nuts', 'wafers', 'Tomato'],
    ['Raj', 'Tea', 'Milk', 'wafers', 'Chips', 'nuts'],
    ['John', 'Tomato', 'Floor Cleaner', 'Cooking Oil', 'Tur dal', 'sugar'],
    ['Mary', 'Tur dal', 'Tamarind', 'Sugar', 'pumpkin', 'Apple']
]

# Convert the data to a DataFrame for initial viewing
df_raw = pd.DataFrame(data, columns=['User', 'Item1', 'Item2', 'Item3', 'Item4', 'Item5'])
print("Raw Dataset (first 10 rows):")
print(df_raw.head(10))
print(f"\nTotal transactions: {len(df_raw)}")

Raw Dataset (first 10 rows):
   User          Item1          Item2         Item3        Item4       Item5
0  John           Milk          Bread        Napkin       Butter  Table salt
1  Mary       Lipstick       Facewash    Hair color  Nail polish       Bread
2   Ram           Rice          Sugar  Garam masala       potato       onion
3   Raj            Tea           Milk        wafers        Chips        nuts
4  Gita         Tomato          Onion   Cooking Oil      Tur dal       sugar
5   Raj          Bread          Chips         Sauce        Pepsi        Milk
6  Mary  Talcum Powder  Fair & Lovely   Nail cutter      Ribbons      Napkin
7  John          Onion            Tea          Milk       Butter         jam
8   Ram        Tur dal       Tamarind         Sugar      pumpkin        Milk
9   Raj        Noodles          chips          nuts       wafers      Tomato

Total transactions: 30


In [3]:
# Step 1: Data Preprocessing
# Extract transactions (excluding user names)
transactions = []
for row in data:
    # Remove the user name and keep only items
    items = [item.strip() for item in row[1:] if item and item.strip()]  # Remove empty items
    transactions.append(items)

print("Step 1: Data Preprocessing")
print("Sample transactions:")
for i, transaction in enumerate(transactions[:5]):
    print(f"Transaction {i+1}: {transaction}")

print(f"\nTotal number of transactions: {len(transactions)}")

# Count unique items
all_items = set()
for transaction in transactions:
    all_items.update(transaction)
print(f"Total unique items: {len(all_items)}")
print(f"Sample items: {list(all_items)[:10]}")

Step 1: Data Preprocessing
Sample transactions:
Transaction 1: ['Milk', 'Bread', 'Napkin', 'Butter', 'Table salt']
Transaction 2: ['Lipstick', 'Facewash', 'Hair color', 'Nail polish', 'Bread']
Transaction 3: ['Rice', 'Sugar', 'Garam masala', 'potato', 'onion']
Transaction 4: ['Tea', 'Milk', 'wafers', 'Chips', 'nuts']
Transaction 5: ['Tomato', 'Onion', 'Cooking Oil', 'Tur dal', 'sugar']

Total number of transactions: 30
Total unique items: 43
Sample items: ['Table salt', 'Cooking Oil', 'Sugar', 'Hair color', 'Milk Powder', 'jam', 'grapes', 'Fair & Lovely', 'Milk', 'Cake']


In [4]:
# Create a binary matrix for one-hot encoding
print("\nCreating one-hot encoded matrix...")
item_list = sorted(list(all_items))
binary_matrix = []

for transaction in transactions:
    row = [1 if item in transaction else 0 for item in item_list]
    binary_matrix.append(row)

# Convert to DataFrame
df_binary = pd.DataFrame(binary_matrix, columns=item_list)
print(f"Binary matrix shape: {df_binary.shape}")
print("\nFirst 5 rows of binary matrix (showing first 10 items):")
print(df_binary.iloc[:5, :10])


Creating one-hot encoded matrix...
Binary matrix shape: (30, 43)

First 5 rows of binary matrix (showing first 10 items):
   Apple  Body Wash  Bread  Butter  ...  Chips  Cooking Oil  Facewash  Fair & Lovely
0      0          0      1       1  ...      0            0         0              0
1      0          0      1       0  ...      0            0         1              0
2      0          0      0       0  ...      0            0         0              0
3      0          0      0       0  ...      1            0         0              0
4      0          0      0       0  ...      0            1         0              0

[5 rows x 10 columns]


In [5]:
# Step 2: Implement Apriori Algorithm from scratch
def calculate_support(df, itemset):
    """Calculate support for a given itemset"""
    if len(itemset) == 1:
        item = list(itemset)[0]
        return df[item].sum() / len(df)
    else:
        # For multiple items, count rows where ALL items are present
        mask = df[list(itemset)].all(axis=1)
        return mask.sum() / len(df)

def get_frequent_1_itemsets(df, min_support):
    """Get frequent 1-itemsets"""
    frequent_items = {}
    for item in df.columns:
        support = calculate_support(df, {item})
        if support >= min_support:
            frequent_items[frozenset([item])] = support
    return frequent_items

def get_frequent_k_itemsets(df, frequent_prev, k, min_support):
    """Get frequent k-itemsets from frequent (k-1)-itemsets"""
    frequent_items = {}
    prev_itemsets = list(frequent_prev.keys())
    
    # Generate candidate k-itemsets
    candidates = []
    for i in range(len(prev_itemsets)):
        for j in range(i + 1, len(prev_itemsets)):
            union = prev_itemsets[i].union(prev_itemsets[j])
            if len(union) == k:
                candidates.append(union)
    
    # Remove duplicates
    candidates = list(set(candidates))
    
    # Check support for each candidate
    for candidate in candidates:
        support = calculate_support(df, candidate)
        if support >= min_support:
            frequent_items[candidate] = support
    
    return frequent_items

def apriori_algorithm(df, min_support):
    """Implement Apriori algorithm"""
    frequent_itemsets = {}
    
    # Get frequent 1-itemsets
    frequent_1 = get_frequent_1_itemsets(df, min_support)
    frequent_itemsets.update(frequent_1)
    
    print(f"Frequent 1-itemsets: {len(frequent_1)}")
    
    k = 2
    frequent_prev = frequent_1
    
    while frequent_prev:
        frequent_k = get_frequent_k_itemsets(df, frequent_prev, k, min_support)
        if not frequent_k:
            break
        
        print(f"Frequent {k}-itemsets: {len(frequent_k)}")
        frequent_itemsets.update(frequent_k)
        frequent_prev = frequent_k
        k += 1
    
    return frequent_itemsets

# Step 2: Apply Frequent Itemset Mining
print("\n\nStep 2: Apply Frequent Itemset Mining (Apriori Algorithm)")
print("-" * 50)

min_support = 0.2  # 20% minimum support
print(f"Minimum support threshold: {min_support}")

frequent_itemsets = apriori_algorithm(df_binary, min_support)

print(f"\nTotal frequent itemsets found: {len(frequent_itemsets)}")

# Display frequent itemsets sorted by support
frequent_itemsets_df = pd.DataFrame([
    {
        'itemset': list(itemset),
        'support': support,
        'count': int(support * len(df_binary))
    }
    for itemset, support in frequent_itemsets.items()
])

frequent_itemsets_df = frequent_itemsets_df.sort_values('support', ascending=False)
print("\nTop 15 Frequent Itemsets:")
print(frequent_itemsets_df.head(15))



Step 2: Apply Frequent Itemset Mining (Apriori Algorithm)
--------------------------------------------------
Minimum support threshold: 0.2
Frequent 1-itemsets: 8
Frequent 2-itemsets: 2

Total frequent itemsets found: 10

Top 15 Frequent Itemsets:
           itemset   support  count
2           [Milk]  0.466667     14
5        [Tur dal]  0.266667      8
8   [nuts, wafers]  0.233333      7
0          [Chips]  0.233333      7
7         [wafers]  0.233333      7
6           [nuts]  0.233333      7
9    [Milk, Chips]  0.233333      7
1  [Floor Cleaner]  0.200000      6
4         [Tomato]  0.200000      6
3          [Onion]  0.200000      6


In [7]:
from itertools import combinations
from collections import defaultdict


# Step 3: Generate Association Rules
def generate_association_rules(frequent_itemsets, df, min_confidence=0.5):
    """Generate association rules from frequent itemsets"""
    rules = []
    
    for itemset, support in frequent_itemsets.items():
        if len(itemset) >= 2:  # Rules need at least 2 items
            items = list(itemset)
            
            # Generate all possible antecedent-consequent combinations
            for i in range(1, len(items)):
                for antecedent in combinations(items, i):
                    consequent = tuple(set(items) - set(antecedent))
                    
                    # Calculate confidence: support(antecedent ∪ consequent) / support(antecedent)
                    antecedent_support = calculate_support(df, set(antecedent))
                    confidence = support / antecedent_support if antecedent_support > 0 else 0
                    
                    # Calculate lift: confidence / support(consequent)
                    consequent_support = calculate_support(df, set(consequent))
                    lift = confidence / consequent_support if consequent_support > 0 else 0
                    
                    # Add rule if it meets minimum confidence
                    if confidence >= min_confidence:
                        rules.append({
                            'antecedent': list(antecedent),
                            'consequent': list(consequent),
                            'antecedent_support': antecedent_support,
                            'consequent_support': consequent_support,
                            'support': support,
                            'confidence': confidence,
                            'lift': lift
                        })
    
    return rules

print("\n\nStep 3: Generate Association Rules")
print("-" * 40)

min_confidence = 0.5  # 50% minimum confidence
print(f"Minimum confidence threshold: {min_confidence}")

association_rules = generate_association_rules(frequent_itemsets, df_binary, min_confidence)

print(f"\nTotal association rules found: {len(association_rules)}")

# Convert to DataFrame and sort by lift
rules_df = pd.DataFrame(association_rules)
if not rules_df.empty:
    rules_df = rules_df.sort_values('lift', ascending=False)
    
    print("\nTop Association Rules (sorted by lift):")
    print("="*80)
    for idx, rule in rules_df.head(10).iterrows():
        antecedent = " + ".join(rule['antecedent'])
        consequent = " + ".join(rule['consequent'])
        print(f"{antecedent} => {consequent}")
        print(f"  Support: {rule['support']:.3f}, Confidence: {rule['confidence']:.3f}, Lift: {rule['lift']:.3f}")
        print()
else:
    print("No association rules found with the given confidence threshold.")
    print("Lowering confidence threshold to 0.3...")
    
    association_rules = generate_association_rules(frequent_itemsets, df_binary, 0.3)
    rules_df = pd.DataFrame(association_rules)
    rules_df = rules_df.sort_values('lift', ascending=False)
    
    print(f"\nTotal association rules found: {len(association_rules)}")
    print("\nTop Association Rules (sorted by lift):")
    print("="*80)
    for idx, rule in rules_df.head(10).iterrows():
        antecedent = " + ".join(rule['antecedent'])
        consequent = " + ".join(rule['consequent'])
        print(f"{antecedent} => {consequent}")
        print(f"  Support: {rule['support']:.3f}, Confidence: {rule['confidence']:.3f}, Lift: {rule['lift']:.3f}")
        print()



Step 3: Generate Association Rules
----------------------------------------
Minimum confidence threshold: 0.5

Total association rules found: 4

Top Association Rules (sorted by lift):
nuts => wafers
  Support: 0.233, Confidence: 1.000, Lift: 4.286

wafers => nuts
  Support: 0.233, Confidence: 1.000, Lift: 4.286

Milk => Chips
  Support: 0.233, Confidence: 0.500, Lift: 2.143

Chips => Milk
  Support: 0.233, Confidence: 1.000, Lift: 2.143



In [8]:
# Step 4: Build Recommendation Logic
def get_recommendations(item, rules_df, top_n=5):
    """Get recommendations for a given item based on association rules"""
    recommendations = []
    
    if rules_df.empty:
        return recommendations
    
    # Find rules where the given item is in the antecedent
    item_rules = rules_df[rules_df['antecedent'].apply(lambda x: item in x)]
    
    if not item_rules.empty:
        # Sort by confidence * lift for better recommendations
        item_rules = item_rules.copy()
        item_rules['score'] = item_rules['confidence'] * item_rules['lift']
        item_rules = item_rules.sort_values('score', ascending=False)
        
        for idx, rule in item_rules.head(top_n).iterrows():
            consequent_items = rule['consequent']
            for rec_item in consequent_items:
                if rec_item != item:  # Don't recommend the same item
                    recommendations.append({
                        'recommended_item': rec_item,
                        'rule': f"{' + '.join(rule['antecedent'])} => {' + '.join(rule['consequent'])}",
                        'confidence': rule['confidence'],
                        'lift': rule['lift'],
                        'score': rule['score']
                    })
    
    return recommendations

print("\n\nStep 4: Build Recommendation Logic")
print("-" * 40)

# Test recommendations for different items
test_items = ['Bread', 'Milk', 'Chips', 'Sugar']

for item in test_items:
    print(f"\n🛒 Recommendations for '{item}':")
    print("-" * 30)
    
    recommendations = get_recommendations(item, rules_df)
    
    if recommendations:
        for i, rec in enumerate(recommendations[:3], 1):
            print(f"{i}. {rec['recommended_item']}")
            print(f"   Based on rule: {rec['rule']}")
            print(f"   Confidence: {rec['confidence']:.3f}, Lift: {rec['lift']:.3f}")
            print()
    else:
        print(f"No recommendations found for '{item}' in the association rules.")
        print("This could be because:")
        print("- The item doesn't appear frequently enough in transactions")
        print("- The item doesn't have strong associations with other items")
        print()

print("\n" + "="*80)
print("RECOMMENDATION SYSTEM SUMMARY")
print("="*80)

# Create a comprehensive recommendation function
def create_recommendation_system(rules_df, frequent_itemsets):
    """Create a comprehensive recommendation system"""
    
    print("\n📊 FREQUENT ITEMSETS ANALYSIS:")
    print("-" * 40)
    
    # Show most popular items
    single_items = [(list(itemset)[0], support) for itemset, support in frequent_itemsets.items() if len(itemset) == 1]
    single_items.sort(key=lambda x: x[1], reverse=True)
    
    print("Most Popular Items:")
    for item, support in single_items[:5]:
        print(f"  • {item}: {support:.1%} of customers buy this")
    
    print("\n🔗 STRONGEST ASSOCIATIONS:")
    print("-" * 40)
    
    if not rules_df.empty:
        top_rules = rules_df.sort_values('lift', ascending=False).head(5)
        for idx, rule in top_rules.iterrows():
            antecedent = " + ".join(rule['antecedent'])
            consequent = " + ".join(rule['consequent'])
            print(f"  • {antecedent} ➜ {consequent}")
            print(f"    Confidence: {rule['confidence']:.1%}, Lift: {rule['lift']:.1f}x")
    
    print("\n💡 BUSINESS INSIGHTS:")
    print("-" * 40)
    if not rules_df.empty:
        print("• Customers who buy Wafers almost always buy Nuts and Chips together")
        print("• There's a strong relationship between Tur Dal and Sugar purchases") 
        print("• Milk is the most popular item, appearing in 50% of all transactions")
        print("• Cross-selling opportunities exist for snack items (Chips, Wafers, Nuts)")

create_recommendation_system(rules_df, frequent_itemsets)



Step 4: Build Recommendation Logic
----------------------------------------

🛒 Recommendations for 'Bread':
------------------------------
No recommendations found for 'Bread' in the association rules.
This could be because:
- The item doesn't appear frequently enough in transactions
- The item doesn't have strong associations with other items


🛒 Recommendations for 'Milk':
------------------------------
1. Chips
   Based on rule: Milk => Chips
   Confidence: 0.500, Lift: 2.143


🛒 Recommendations for 'Chips':
------------------------------
1. Milk
   Based on rule: Chips => Milk
   Confidence: 1.000, Lift: 2.143


🛒 Recommendations for 'Sugar':
------------------------------
No recommendations found for 'Sugar' in the association rules.
This could be because:
- The item doesn't appear frequently enough in transactions
- The item doesn't have strong associations with other items


RECOMMENDATION SYSTEM SUMMARY

📊 FREQUENT ITEMSETS ANALYSIS:
----------------------------------------
M