In [2]:
# MARKET BASKET ANALYSIS - PHASES 1 & 2 (Local Download Version)
# This version downloads the dataset locally and tests each step

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import requests
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set up visualization style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [14]:
import pandas as pd
import os

# Method 1: Try loading CSV with common encodings
def load_csv_with_encoding(file_path):
    encodings = ['latin-1', 'ISO-8859-1', 'cp1252', 'utf-16']
    
    for encoding in encodings:
        try:
            print(f"Trying encoding: {encoding}")
            df = pd.read_csv(file_path, encoding=encoding)
            print(f"‚úÖ Successfully loaded CSV with {encoding} encoding")
            # return df
        except UnicodeDecodeError as e:
            print(f"‚ùå Failed with {encoding}: {e}")
        except Exception as e:
            print(f"‚ùå Error with {encoding}: {e}")
    return df
    return None

# Try to load CSV
print("=== ATTEMPTING TO LOAD CSV ===")
csv_file = 'OnlineRetail.csv'
if os.path.exists(csv_file):
    df_csv = load_csv_with_encoding(csv_file)
else:
    print(f"‚ùå CSV file not found: {csv_file}")
    df_csv = None



=== ATTEMPTING TO LOAD CSV ===
Trying encoding: latin-1
‚úÖ Successfully loaded CSV with latin-1 encoding
Trying encoding: ISO-8859-1
‚úÖ Successfully loaded CSV with ISO-8859-1 encoding
Trying encoding: cp1252
‚úÖ Successfully loaded CSV with cp1252 encoding
Trying encoding: utf-16
‚ùå Error with utf-16: UTF-16 stream does not start with BOM


In [18]:
def find_dataset():
    """Look for the dataset in various formats and locations"""
    print("=== SEARCHING FOR DATASET ===")
    
    possible_files = [
        'OnlineRetail.csv',
        'OnlineRetail.xlsx',  # Original from UCI/Kaggle
        'Online Retail.xlsx',  # Original from UCI/Kaggle
        'online_retail.csv',  # Possible CSV version
        
        'onlineretail.csv',
        '/kaggle/input/onlineretail/OnlineRetail.xlsx',  # Kaggle path
    ]
    
    for file_path in possible_files:
        if os.path.exists(file_path):
            print(f"‚úì Found dataset: {file_path}")
            return file_path
    
    print("‚ùå No dataset file found locally")
    return None



def download_dataset():
    """Download the dataset from UCI repository"""
    print("=== DOWNLOADING DATASET ===")
    
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx"
    local_filename = "Online Retail.xlsx"
    # local_filename = "OnlineRetail.csv"
    
    # Check if file already exists
    if os.path.exists(local_filename):
        print(f"‚úì Dataset already exists locally: {local_filename}")
        return local_filename
    
    print(f"Downloading dataset from: {url}")
    print("This may take a few minutes...")
    
    try:
        # Download the file
        response = requests.get(url, stream=True)
        response.raise_for_status()
        
        # Get file size
        total_size = int(response.headers.get('content-length', 0))
        
        with open(local_filename, 'wb') as f:
            downloaded = 0
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
                    downloaded += len(chunk)
                    if total_size > 0:
                        progress = (downloaded / total_size) * 100
                        print(f"Progress: {progress:.1f}%", end='\r')
        
        print(f"\n‚úì Dataset downloaded successfully: {local_filename}")
        print(f"‚úì File size: {os.path.getsize(local_filename) / (1024*1024):.2f} MB")
        
        return local_filename
        
    except Exception as e:
        print(f"‚ùå Error downloading dataset: {e}")
        return None

def load_dataset(file_path):
    """Load the dataset based on file format"""
    print(f"\n=== LOADING DATASET: {file_path} ===")
    
    try:
        if file_path.endswith('.xlsx'):
            print("Loading Excel file...")
            df = pd.read_excel(file_path)
            print("‚úì Successfully loaded Excel file")
        elif file_path.endswith('.csv'):
            print("Loading CSV file...")
            df = pd.read_csv(file_path, encoding='latin-1')
            print("‚úì Successfully loaded CSV file")
        else:
            print("‚ùå Unsupported file format")
            return None
        
        print(f"‚úì Dataset shape: {df.shape}")
        print(f"‚úì Columns: {list(df.columns)}")
        return df
        
    except Exception as e:
        print(f"‚ùå Error loading file: {e}")
        return None

def test_data_loading(file_path):
    """Test loading the dataset"""
    print("\n=== TESTING DATA LOADING ===")
    
    try:
        # Test reading the file
        df = pd.read_excel(file_path)
        print(f"‚úì Successfully loaded dataset")
        print(f"‚úì Shape: {df.shape}")
        print(f"‚úì Columns: {list(df.columns)}")
        return df
    except Exception as e:
        print(f"‚ùå Error loading dataset: {e}")
        return None

def phase1_exploration(df):
    """Perform Phase 1: Data Exploration"""
    print("\n" + "="*60)
    print("PHASE 1: PROJECT SETUP & DATA UNDERSTANDING")
    print("="*60)
    
    # 1.1 Basic Information
    print("\n1.1 BASIC DATASET INFORMATION")
    print("-" * 40)
    print(f"Dataset Shape: {df.shape}")
    print(f"Number of records: {df.shape[0]:,}")
    print(f"Number of columns: {df.shape[1]}")
    
    # 1.2 Display sample data
    print("\n1.2 SAMPLE DATA")
    print("-" * 40)
    print("First 5 rows:")
    print(df.head())
    
    # 1.3 Data types and info
    print("\n1.3 DATA TYPES AND INFO")
    print("-" * 40)
    print(df.info())
    
    # 1.4 Missing values analysis
    print("\n1.4 MISSING VALUES ANALYSIS")
    print("-" * 40)
    missing_data = df.isnull().sum()
    missing_percent = (df.isnull().sum() / len(df)) * 100
    
    missing_info = pd.DataFrame({
        'Missing Count': missing_data,
        'Missing Percentage': missing_percent
    })
    print(missing_info)
    
    # 1.5 Data Quality Issues
    print("\n1.5 DATA QUALITY ISSUES")
    print("-" * 40)
    negative_quantity = (df['Quantity'] <= 0).sum()
    negative_price = (df['UnitPrice'] <= 0).sum()
    cancelled_invoices = df['InvoiceNo'].astype(str).str.startswith('C').sum()
    
    print(f"Records with Quantity <= 0: {negative_quantity:,} ({negative_quantity/len(df)*100:.2f}%)")
    print(f"Records with UnitPrice <= 0: {negative_price:,} ({negative_price/len(df)*100:.2f}%)")
    print(f"Cancelled invoices (starting with 'C'): {cancelled_invoices:,} ({cancelled_invoices/len(df)*100:.2f}%)")
    
    # 1.6 Key Business Metrics
    print("\n1.6 KEY BUSINESS METRICS")
    print("-" * 40)
    
    # Convert InvoiceDate to datetime
    df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
    
    total_transactions = df['InvoiceNo'].nunique()
    total_products = df['StockCode'].nunique()
    total_customers = df['CustomerID'].nunique()
    date_range = f"{df['InvoiceDate'].min().strftime('%Y-%m-%d')} to {df['InvoiceDate'].max().strftime('%Y-%m-%d')}"
    
    print(f"Total transactions: {total_transactions:,}")
    print(f"Unique products: {total_products:,}")
    print(f"Unique customers: {total_customers:,}")
    print(f"Time period: {date_range}")
    
    # Countries distribution
    country_count = df['Country'].nunique()
    top_countries = df['Country'].value_counts().head(3)
    print(f"Number of countries: {country_count}")
    print("Top 3 countries:")
    print(top_countries)
    
    return {
        'original_shape': df.shape,
        'missing_info': missing_info,
        'negative_quantity': negative_quantity,
        'negative_price': negative_price,
        'cancelled_invoices': cancelled_invoices,
        'total_transactions': total_transactions,
        'total_products': total_products,
        'total_customers': total_customers
    }

def create_visualizations(df):
    """Create initial visualizations"""
    print("\n1.7 CREATING VISUALIZATIONS")
    print("-" * 40)
    
    # Create a simple visualization folder
    if not os.path.exists('visualizations'):
        os.makedirs('visualizations')
    
    try:
        # Plot 1: Quantity distribution
        plt.figure(figsize=(10, 6))
        plt.hist(df[df['Quantity'] < 100]['Quantity'], bins=50, edgecolor='black', alpha=0.7)
        plt.title('Distribution of Quantity (Quantities < 100)')
        plt.xlabel('Quantity')
        plt.ylabel('Frequency')
        plt.savefig('visualizations/quantity_distribution.png', dpi=300, bbox_inches='tight')
        print("‚úì Created quantity distribution plot")
        
        # Plot 2: UnitPrice distribution
        plt.figure(figsize=(10, 6))
        plt.hist(df[df['UnitPrice'] < 50]['UnitPrice'], bins=50, edgecolor='black', alpha=0.7)
        plt.title('Distribution of UnitPrice (Prices < $50)')
        plt.xlabel('UnitPrice ($)')
        plt.ylabel('Frequency')
        plt.savefig('visualizations/price_distribution.png', dpi=300, bbox_inches='tight')
        print("‚úì Created price distribution plot")
        
        # Plot 3: Top products
        plt.figure(figsize=(12, 8))
        top_products = df.groupby('Description')['Quantity'].sum().sort_values(ascending=False).head(10)
        plt.barh(range(len(top_products)), top_products.values)
        plt.yticks(range(len(top_products)), [desc[:40] + '...' if len(desc) > 40 else desc for desc in top_products.index])
        plt.title('Top 10 Products by Quantity Sold')
        plt.xlabel('Total Quantity Sold')
        plt.tight_layout()
        plt.savefig('visualizations/top_products.png', dpi=300, bbox_inches='tight')
        print("‚úì Created top products plot")
        
        # Plot 4: Monthly transactions
        plt.figure(figsize=(12, 6))
        monthly_tx = df.set_index('InvoiceDate').resample('M')['InvoiceNo'].nunique()
        plt.plot(monthly_tx.index, monthly_tx.values, marker='o')
        plt.title('Monthly Transactions Over Time')
        plt.xlabel('Month')
        plt.ylabel('Number of Transactions')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig('visualizations/monthly_transactions.png', dpi=300, bbox_inches='tight')
        print("‚úì Created monthly transactions plot")
        
        plt.close('all')
        
    except Exception as e:
        print(f"‚ö†Ô∏è Could not create all visualizations: {e}")

def phase2_cleaning(df, phase1_results):
    """Perform Phase 2: Data Cleaning"""
    print("\n" + "="*60)
    print("PHASE 2: DATA PREPROCESSING & CLEANING")
    print("="*60)
    
    original_size = len(df)
    print(f"Starting with {original_size:,} records")
    
    # 2.1 Apply cleaning steps
    print("\n2.1 APPLYING CLEANING STEPS")
    print("-" * 40)
    
    # Step 1: Remove cancellations
    df_clean = df[~df['InvoiceNo'].astype(str).str.startswith('C')]
    removed_cancelled = original_size - len(df_clean)
    print(f"‚úì Removed cancellations: {removed_cancelled:,} records")
    
    # Step 2: Remove invalid quantities
    df_clean = df_clean[df_clean['Quantity'] > 0]
    removed_quantity = original_size - len(df_clean) - removed_cancelled
    print(f"‚úì Removed invalid quantities: {removed_quantity:,} records")
    
    # Step 3: Remove invalid prices
    df_clean = df_clean[df_clean['UnitPrice'] > 0]
    removed_price = original_size - len(df_clean) - removed_cancelled - removed_quantity
    print(f"‚úì Removed invalid prices: {removed_price:,} records")
    
    # Step 4: Remove missing CustomerID
    df_clean = df_clean[df_clean['CustomerID'].notnull()]
    removed_customerid = phase1_results['missing_info'].loc['CustomerID', 'Missing Count']
    print(f"‚úì Removed missing CustomerID: {removed_customerid:,} records")
    
    # Step 5: Remove missing Description
    df_clean = df_clean[df_clean['Description'].notnull()]
    removed_description = phase1_results['missing_info'].loc['Description', 'Missing Count']
    print(f"‚úì Removed missing Description: {removed_description:,} records")
    
    # 2.2 Data Standardization
    print("\n2.2 DATA STANDARDIZATION")
    print("-" * 40)
    
    df_clean['Description'] = df_clean['Description'].str.upper().str.strip()
    print("‚úì Standardized product descriptions")
    
    # Remove duplicates
    duplicates = df_clean.duplicated().sum()
    if duplicates > 0:
        df_clean = df_clean.drop_duplicates()
        print(f"‚úì Removed duplicates: {duplicates:,} records")
    
    # 2.3 Post-cleaning analysis
    print("\n2.3 POST-CLEANING ANALYSIS")
    print("-" * 40)
    
    cleaned_size = len(df_clean)
    total_removed = original_size - cleaned_size
    retention_rate = (cleaned_size / original_size) * 100
    
    print(f"Original dataset: {original_size:,} records")
    print(f"Cleaned dataset: {cleaned_size:,} records")
    print(f"Records removed: {total_removed:,} records")
    print(f"Retention rate: {retention_rate:.2f}%")
    
    # Key metrics after cleaning
    print(f"\nKey metrics after cleaning:")
    print(f"Transactions: {df_clean['InvoiceNo'].nunique():,}")
    print(f"Products: {df_clean['StockCode'].nunique():,}")
    print(f"Customers: {df_clean['CustomerID'].nunique():,}")
    
    # 2.4 Prepare transaction data
    print("\n2.4 PREPARING TRANSACTION DATA")
    print("-" * 40)
    
    # Create transaction baskets
    basket_data = df_clean.groupby('InvoiceNo')['StockCode'].apply(list).reset_index()
    print(f"‚úì Created {len(basket_data):,} transaction baskets")
    
    # Show sample baskets
    print("\nSample transaction baskets:")
    for i in range(min(3, len(basket_data))):
        basket = basket_data.iloc[i]
        print(f"  Invoice {basket['InvoiceNo']}: {len(basket['StockCode'])} items")
    
    # 2.5 Save cleaned data
    print("\n2.5 SAVING CLEANED DATA")
    print("-" * 40)
    
    df_clean.to_csv('online_retail_cleaned.csv', index=False)
    basket_data.to_csv('transaction_baskets.csv', index=False)
    
    print("‚úì Saved cleaned data: 'online_retail_cleaned.csv'")
    print("‚úì Saved transaction baskets: 'transaction_baskets.csv'")
    
    return df_clean, basket_data



In [4]:
def create_summary_report(phase1_results, df_clean):
    """Create a summary report"""
    print("\n" + "="*60)
    print("SUMMARY REPORT")
    print("="*60)
    
    print("üìä DATA QUALITY IMPROVEMENT")
    print("-" * 40)
    print(f"Original records: {phase1_results['original_shape'][0]:,}")
    print(f"Cleaned records: {len(df_clean):,}")
    print(f"Data quality improvement: {((phase1_results['original_shape'][0] - len(df_clean)) / phase1_results['original_shape'][0] * 100):.1f}%")
    
    print("\n‚úÖ CLEANING ACTIONS COMPLETED:")
    print("  ‚Ä¢ Removed cancelled invoices")
    print("  ‚Ä¢ Removed invalid quantities and prices") 
    print("  ‚Ä¢ Handled missing CustomerID and Description")
    print("  ‚Ä¢ Standardized product descriptions")
    print("  ‚Ä¢ Prepared transaction baskets for FP-Growth")
    
    print(f"\nüéØ DATASET READY FOR ANALYSIS:")
    print(f"  ‚Ä¢ {df_clean['InvoiceNo'].nunique():,} transactions")
    print(f"  ‚Ä¢ {df_clean['StockCode'].nunique():,} products") 
    print(f"  ‚Ä¢ {df_clean['CustomerID'].nunique():,} customers")
    print(f"  ‚Ä¢ Time period: {df_clean['InvoiceDate'].min().strftime('%Y-%m-%d')} to {df_clean['InvoiceDate'].max().strftime('%Y-%m-%d')}")
    
    print(f"\nüìà NEXT STEPS:")
    print(f"  ‚Ä¢ Proceed to Phase 3: FP-Growth Algorithm")
    print(f"  ‚Ä¢ Generate association rules")
    print(f"  ‚Ä¢ Implement temporal validation")



In [22]:
# MAIN EXECUTION
def main():
    print("=== MARKET BASKET ANALYSIS - LOCAL TEST ===")
    print("This script will download the dataset and test Phases 1 & 2")
    print()
    
    # Step 1: Download dataset
    file_path = find_dataset()
    if not file_path:
        file_path = download_dataset()
        if not file_path:
            print("‚ùå Could not obtain dataset. Exiting.")
            return
    
    # Step 2: Test loading
    # df = test_data_loading(file_path)
    df = load_dataset(file_path)
    if df is None:
        print("‚ùå Failed to load dataset. Exiting.")
        return
    
    # Step 3: Phase 1 - Exploration
    phase1_results = phase1_exploration(df)
    
    # Step 4: Create visualizations
    create_visualizations(df)
    
    # Step 5: Phase 2 - Cleaning
    df_clean, basket_data = phase2_cleaning(df, phase1_results)
    
    # Step 6: Summary report
    create_summary_report(phase1_results, df_clean)
    
    print("\n" + "="*60)
    print("üéâ PHASES 1 & 2 COMPLETED SUCCESSFULLY!")
    print("="*60)
    print("\nGenerated files:")
    print("  ‚Ä¢ OnlineRetail.csv (original dataset)")
    print("  ‚Ä¢ online_retail_cleaned.csv (cleaned data)")
    print("  ‚Ä¢ transaction_baskets.csv (transaction data for FP-Growth)")
    print("  ‚Ä¢ visualizations/ (EDA plots)")
    
    print(f"\nYou can now proceed to Phase 3: FP-Growth Algorithm")

# Run the main function
if __name__ == "__main__":
    main()

=== MARKET BASKET ANALYSIS - LOCAL TEST ===
This script will download the dataset and test Phases 1 & 2

=== SEARCHING FOR DATASET ===
‚úì Found dataset: OnlineRetail.csv

=== LOADING DATASET: OnlineRetail.csv ===
Loading CSV file...
‚úì Successfully loaded CSV file
‚úì Dataset shape: (541909, 8)
‚úì Columns: ['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country']

PHASE 1: PROJECT SETUP & DATA UNDERSTANDING

1.1 BASIC DATASET INFORMATION
----------------------------------------
Dataset Shape: (541909, 8)
Number of records: 541,909
Number of columns: 8

1.2 SAMPLE DATA
----------------------------------------
First 5 rows:
  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED