# 03 - Association Rule Mining

This notebook performs market basket analysis to find products frequently bought together.

## Objectives
- Prepare transaction data for association rule mining
- Apply Apriori algorithm
- Apply FP-Growth algorithm (optional)
- Generate association rules
- Calculate support, confidence, and lift metrics
- Visualize product associations
- Interpret rules for business insights

## Phase 2 Requirements
- ✅ Run Apriori/FPGrowth algorithms
- ✅ Generate association rules
- ✅ Basic interpretation of top rules
- ⚠️ Full analysis in Phase 3


In [None]:
# Load required libraries (standard library first, then third-party).
# Every import lives here so the notebook's dependencies are visible in one place.
import os
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth
from mlxtend.preprocessing import TransactionEncoder

# NOTE(review): this hides every warning, including deprecation notices from the
# libraries above; consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("=" * 80)
print("ASSOCIATION RULE MINING - MARKET BASKET ANALYSIS")
print("=" * 80)

# Load the raw dataset directly so this notebook runs on a fresh kernel without
# depending on variables from any other notebook.
# The path is resolved from the current working directory and assumes the
# notebook is started from a folder two levels below the project root.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
data_path = os.path.join(project_root, 'data', 'raw', 'Online Retail.csv')

# Load and basic clean
df = pd.read_csv(data_path, encoding='latin-1')
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], errors='coerce')  # unparseable dates become NaT
df = df[~df['InvoiceNo'].astype(str).str.startswith('C')]  # Remove canceled orders
df = df[(df['Quantity'] > 0) & (df['UnitPrice'] > 0)]
df = df[df['Description'].notna()]

# Rows are not invoices: several rows can share one InvoiceNo, which is why the
# invoice count is reported separately on the next line.
print(f"\nDataset loaded: {df.shape[0]:,} rows (invoice line items)")
print(f"Unique invoices: {df['InvoiceNo'].nunique():,}")
print(f"Unique products: {df['Description'].nunique():,}")

NameError: name 'df' is not defined

## Step 1: Prepare Transaction Data

Convert the transaction data into a format suitable for association rule mining.


In [None]:
# Prepare transaction data: keep only the invoice id and the product description
df_association = df[['InvoiceNo', 'Description']].copy()
df_association.dropna(subset=['Description'], inplace=True)
df_association['Description'] = df_association['Description'].str.strip()
df_association = df_association[df_association['Description'] != '']

# The same description may appear on more than one line of an invoice.
# Basket analysis only needs to know whether an item is present, so keep the
# first occurrence of each (invoice, item) pair; otherwise the basket-size
# statistics below would count one product several times.
df_association = df_association.drop_duplicates(subset=['InvoiceNo', 'Description'])

# Group items by invoice to create baskets
basket = df_association.groupby('InvoiceNo')['Description'].apply(list).reset_index()
basket.columns = ['InvoiceNo', 'Items']

# Compute the basket sizes once and reuse them for both summary figures
basket_sizes = basket['Items'].apply(len)

print(f"Total baskets (invoices): {len(basket):,}")
print(f"Average items per basket: {basket_sizes.mean():.2f}")
print(f"Median items per basket: {basket_sizes.median():.2f}")

# Display sample baskets
print("\nSample Baskets:")
display(basket.head(10))

# Convert to list of lists format for TransactionEncoder
transactions = basket['Items'].tolist()
print(f"\nTotal transactions prepared: {len(transactions):,}")


## Step 2: Encode Transactions

Use TransactionEncoder to convert the transaction lists into the binary matrix format required by Apriori/FPGrowth.


In [None]:
# Turn the basket lists into a binary matrix:
# one row per transaction, one column per distinct item.
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

print(f"Encoded matrix shape: {df_encoded.shape}")
print(f"Total unique items: {len(te.columns_)}")
print(f"\nSample of encoded matrix (first 5 rows, first 10 columns):")
display(df_encoded.iloc[:5, :10])

# Sparsity is the share of matrix cells that are not set
filled_cells = df_encoded.sum().sum()
sparsity = 1 - filled_cells / df_encoded.size
print(f"\nMatrix sparsity: {sparsity:.2%} (typical for retail data)")


## Step 3: Frequent Itemset Mining - Apriori Algorithm

Apply the Apriori algorithm to find frequent itemsets that meet a minimum support threshold.


In [None]:
# Mine frequent itemsets with Apriori.
# min_support is the minimum fraction of transactions an itemset must appear in
# (0.01 = 1% of transactions).
min_support = 0.01
print(f"Applying Apriori algorithm with min_support = {min_support} ({min_support*100}%)")

frequent_itemsets = apriori(
    df_encoded,
    min_support=min_support,
    use_colnames=True,
    verbose=1,
)

print(f"\nFrequent itemsets found: {len(frequent_itemsets):,}")
print(f"\nTop 10 frequent itemsets by support:")
top_itemsets = frequent_itemsets.nlargest(10, 'support')
display(top_itemsets)

# Record how many items each itemset holds, then tabulate the sizes
frequent_itemsets['itemset_length'] = frequent_itemsets['itemsets'].map(len)
length_counts = frequent_itemsets['itemset_length'].value_counts().sort_index()
print(f"\nItemset size distribution:")
print(length_counts)


## Step 4: Generate Association Rules

Generate association rules from frequent itemsets and calculate metrics (confidence, lift, conviction).


In [None]:
# Generate association rules from the frequent itemsets.
# Only rules whose confidence reaches min_confidence are kept.
min_confidence = 0.3
print(f"Generating association rules with min_confidence = {min_confidence} ({min_confidence*100}%)")

rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)

print(f"\nAssociation rules generated: {len(rules):,}")
print(f"\nRules columns: {rules.columns.tolist()}")

# Display top rules by different metrics
print("\n" + "=" * 80)
print("TOP 10 RULES BY LIFT (Strongest Associations)")
print("=" * 80)
top_lift = rules.nlargest(10, 'lift')[['antecedents', 'consequents', 'support', 'confidence', 'lift', 'conviction']]

# nlargest keeps the original row labels of `rules`, so number the rules by
# their position in the ranking instead of by that label.
for rank, (_, row) in enumerate(top_lift.iterrows(), start=1):
    print(f"\nRule {rank}:")
    # antecedents/consequents are unordered sets; sort them so the printed text
    # is the same on every run.
    print(f"  If customer buys: {', '.join(sorted(row['antecedents']))}")
    print(f"  Then likely to buy: {', '.join(sorted(row['consequents']))}")
    print(f"  Support: {row['support']:.3f} ({row['support']*100:.1f}%)")
    print(f"  Confidence: {row['confidence']:.3f} ({row['confidence']*100:.1f}%)")
    print(f"  Lift: {row['lift']:.3f} (rule is {row['lift']:.1f}x more likely than random)")
    print(f"  Conviction: {row['conviction']:.3f}")

display(top_lift)


## Step 5: Rule Analysis and Interpretation

Analyze and interpret association rules for business insights.


In [None]:
# Summary statistics for the generated rules
banner = "=" * 80
print(banner)
print("ASSOCIATION RULES SUMMARY STATISTICS")
print(banner)

print(f"\nTotal Rules: {len(rules):,}")

# The same four figures are reported for each metric; lift comes last so that
# its extra threshold counts follow directly underneath it.
for stat_label, stat_column in (("Support", "support"),
                                ("Confidence", "confidence"),
                                ("Lift", "lift")):
    metric_values = rules[stat_column]
    print(f"\n{stat_label} Statistics:")
    print(f"  Mean: {metric_values.mean():.4f}")
    print(f"  Median: {metric_values.median():.4f}")
    print(f"  Min: {metric_values.min():.4f}")
    print(f"  Max: {metric_values.max():.4f}")

positive_count = int((rules['lift'] > 1.0).sum())
strong_count = int((rules['lift'] > 2.0).sum())
print(f"  Rules with lift > 1.0 (positive association): {positive_count:,}")
print(f"  Rules with lift > 2.0 (strong association): {strong_count:,}")

# Keep only rules that clear all three quality thresholds
lift_ok = rules['lift'] > 1.5
confidence_ok = rules['confidence'] > 0.5
support_ok = rules['support'] > 0.02
high_quality_rules = rules[lift_ok & confidence_ok & support_ok]
print(f"\nHigh-Quality Rules (lift>1.5, confidence>0.5, support>0.02): {len(high_quality_rules):,}")

summary_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']

print("\n" + banner)
print("TOP 10 RULES BY CONFIDENCE")
print(banner)
top_confidence = rules.nlargest(10, 'confidence')[summary_columns]
display(top_confidence)

print("\n" + banner)
print("TOP 10 RULES BY SUPPORT")
print(banner)
top_support = rules.nlargest(10, 'support')[summary_columns]
display(top_support)


In [None]:
# Overview figure: one scatter plot plus three metric histograms
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Association Rules Analysis', fontsize=16, y=0.995)

# Panel 1: support against confidence, with lift as the colour
scatter_ax = axes[0, 0]
points = scatter_ax.scatter(
    rules['support'],
    rules['confidence'],
    alpha=0.5,
    s=50,
    c=rules['lift'],
    cmap='viridis',
    edgecolors='black',
    linewidth=0.5,
)
scatter_ax.set_xlabel('Support')
scatter_ax.set_ylabel('Confidence')
scatter_ax.set_title('Support vs Confidence (colored by Lift)')
scatter_ax.grid(True, alpha=0.3)
plt.colorbar(points, ax=scatter_ax, label='Lift')

# Panels 2-4: distribution of each metric (axis, column, x label, title, colour)
histogram_panels = [
    (axes[0, 1], 'lift', 'Lift', 'Lift Distribution', 'steelblue'),
    (axes[1, 0], 'confidence', 'Confidence', 'Confidence Distribution', 'coral'),
    (axes[1, 1], 'support', 'Support', 'Support Distribution', 'lightgreen'),
]
for panel, column, x_label, panel_title, bar_color in histogram_panels:
    is_lift_panel = column == 'lift'
    panel.hist(rules[column], bins=50, edgecolor='black', alpha=0.7, color=bar_color)
    if is_lift_panel:
        # Only the lift panel carries the reference line at 1.0
        panel.axvline(x=1.0, color='red', linestyle='--', label='Lift = 1.0 (no association)')
    panel.set_xlabel(x_label)
    panel.set_ylabel('Number of Rules')
    panel.set_title(panel_title)
    if is_lift_panel:
        panel.legend()
    panel.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

# Bar chart of the highest-lift rules among the high-quality subset
if not high_quality_rules.empty:
    fig, ax = plt.subplots(figsize=(12, 8))
    top_20 = high_quality_rules.nlargest(20, 'lift')

    # Each side of the rule is cut to 30 characters to keep tick labels readable
    rule_labels = [
        f"{', '.join(lhs)[:30]} → {', '.join(rhs)[:30]}"
        for lhs, rhs in zip(top_20['antecedents'], top_20['consequents'])
    ]

    y_pos = np.arange(len(top_20))
    ax.barh(y_pos, top_20['lift'], alpha=0.7, color='purple', edgecolor='black')
    ax.set_yticks(y_pos)
    ax.set_yticklabels(rule_labels, fontsize=8)
    ax.set_xlabel('Lift', fontsize=12)
    ax.set_title('Top 20 Association Rules by Lift', fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3, axis='x')
    ax.axvline(x=1.0, color='red', linestyle='--', label='Lift = 1.0')
    ax.legend()

    plt.tight_layout()
    plt.show()


In [None]:
# Business interpretation: print a plain-language summary of what the rules
# mean, then export the high-quality rules for later analysis.
print("=" * 80)
print("BUSINESS INSIGHTS FROM ASSOCIATION RULES")
print("=" * 80)

print("\n1. PRODUCT AFFINITY INSIGHTS:")
print("   Association rules reveal which products are frequently purchased together.")
print("   This information is critical for:")
print("   - Product placement strategies (co-locate related items)")
print("   - Cross-selling opportunities (recommend associated products)")
print("   - Inventory management (stock related items together)")

print("\n2. STOCK ALLOCATION IMPLICATIONS:")
if len(high_quality_rules) > 0:
    print(f"   - {len(high_quality_rules):,} strong associations identified")
    print("   - When one product in a rule sells, stock the associated product")
    print("   - High-lift rules indicate products that should be stocked together")
else:
    # Without this branch the section header would be printed with nothing under it
    print("   - No rules met the high-quality thresholds; no stocking recommendations yet")

print("\n3. MARKET BASKET OPTIMIZATION:")
print("   - Rules with high support indicate common purchase combinations")
print("   - Rules with high confidence indicate reliable cross-selling opportunities")
print("   - Rules with high lift indicate genuine product affinity (not just popularity)")

print("\n4. KEY METRICS INTERPRETATION:")
print("   - Support: How often the rule occurs in transactions")
print("   - Confidence: Probability of buying consequent given antecedent")
print("   - Lift: How much more likely the rule is than random chance")
print("   - Lift > 1.0: Positive association (products complement each other)")
print("   - Lift < 1.0: Negative association (products rarely bought together)")

print("\n5. LIMITATIONS FOR PHASE 2:")
print("   - Analysis based on historical patterns; may not predict future behavior")
print("   - Does not account for external factors (promotions, seasonality)")
print("   - Rules may be spurious if products are simply popular independently")
print("   - Full business validation requires domain expertise and testing")

# Export top rules for further analysis
if len(high_quality_rules) > 0:
    output_dir = os.path.join(project_root, 'data', 'processed')
    # to_csv does not create missing folders, so make sure the target exists first
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, 'top_association_rules.csv')
    high_quality_rules.to_csv(output_path, index=False)
    print(f"\n✓ Top association rules exported to: {output_path}")

print("\n" + "=" * 80)
print("ASSOCIATION RULE MINING COMPLETE")
print("=" * 80)
print("\nNext Steps (Phase 3):")
print("  - Validate rules with business stakeholders")
print("  - Test rules in real-world scenarios")
print("  - Integrate with recommendation systems")
print("  - Monitor rule performance over time")
