# Dataset Generation and Validation
## Module 1: Environment Setup - Dataset Preparation

---

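**Objective:** Generate and validate all required datasets for the AI E-commerce Workshop. Nothing is downloaded: every dataset is synthesized locally by the cells below.

**Datasets to be generated:**
- `sales_historical_data.csv` - Sales transactions with temporal patterns (12,000 records by default, roughly 1 MB; validation requires at least 10,000)
- `product_catalog.csv` - Product metadata and categories (1,200 records by default, well under 1 MB; validation requires at least 1,000)
- `customer_behavior.csv` - User interaction and behavioral data (8,000 records by default, roughly 1 MB; validation requires at least 5,000)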

---

## 📋 Step 1: Import Required Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import urllib.request
import hashlib
import zipfile
from datetime import datetime, timedelta
import random
from pathlib import Path
import warnings
# Silence every warning category for the rest of the session so cell output stays readable
warnings.filterwarnings('ignore')

print("✅ Libraries imported successfully")
# datetime supports strftime directives directly in the format spec
print(f"📅 Execution time: {datetime.now():%Y-%m-%d %H:%M:%S}")

## 📁 Step 2: Create Directory Structure

In [None]:
# Root folder for the workshop datasets, plus raw/processed subfolders for organization
datasets_dir = Path("datasets")
raw_data_dir = datasets_dir / "raw"
processed_data_dir = datasets_dir / "processed"

# The root comes first in the tuple because mkdir() is called without parents=True
for folder in (datasets_dir, raw_data_dir, processed_data_dir):
    folder.mkdir(exist_ok=True)

print("✅ Directory structure created:")
for folder in (datasets_dir, raw_data_dir, processed_data_dir):
    print(f"   📂 {folder.absolute()}")

## 🏭 Step 3: Generate Synthetic Sales Historical Data

In [None]:
def generate_sales_data(num_records=12000, reference_date=None):
    """
    Generate synthetic sales historical data with realistic patterns

    The global ``random`` and ``numpy.random`` generators are re-seeded with 42
    on every call, so all randomly drawn fields are reproducible. The dates are
    additionally anchored to ``reference_date``; pass a fixed value to get a
    frame that is identical from one day to the next.

    Args:
        num_records: Number of transaction rows to generate.
        reference_date: ``datetime`` that anchors the two-year window. Sales
            fall between ``reference_date - 730 days`` and
            ``reference_date - 1 day`` (inclusive). Defaults to
            ``datetime.now()`` taken once per call.

    Returns:
        pandas.DataFrame with one row per transaction and the columns
        transaction_id, date, customer_id, product_id, category, quantity,
        unit_price, total_amount, channel, region, day_of_week, month,
        quarter and year. With ``num_records=0`` the frame is empty.
    """
    print(f"🔄 Generating {num_records:,} sales records...")

    # Set random seed for reproducibility
    np.random.seed(42)
    random.seed(42)

    # Read the clock once so both ends of the window derive from the same instant
    if reference_date is None:
        reference_date = datetime.now()

    # Date range: 2 years of historical data, ending the day before the reference
    start_date = reference_date - timedelta(days=730)
    end_date = reference_date - timedelta(days=1)
    span_days = (end_date - start_date).days

    # Product categories and their characteristics
    categories = {
        'Electronics': {'base_price': 299, 'variance': 500, 'seasonality': 1.2},
        'Clothing': {'base_price': 49, 'variance': 100, 'seasonality': 1.5},
        'Books': {'base_price': 19, 'variance': 25, 'seasonality': 0.8},
        'Home & Garden': {'base_price': 79, 'variance': 150, 'seasonality': 1.1},
        'Sports': {'base_price': 89, 'variance': 200, 'seasonality': 1.3},
        'Beauty': {'base_price': 29, 'variance': 60, 'seasonality': 1.0}
    }

    # Loop-invariant choice tables; weights line up with the order of the names
    category_names = list(categories.keys())
    category_weights = [0.25, 0.20, 0.15, 0.15, 0.15, 0.10]
    channel_names = ['Online', 'Store', 'Mobile']
    channel_weights = [0.6, 0.25, 0.15]
    region_names = ['North', 'South', 'East', 'West', 'Central']
    region_weights = [0.25, 0.20, 0.20, 0.20, 0.15]

    sales_data = []

    for i in range(num_records):
        # Random date within range (randint is inclusive on both ends)
        sale_date = start_date + timedelta(days=random.randint(0, span_days))

        # Category selection with weights
        category = np.random.choice(category_names, p=category_weights)
        profile = categories[category]
        base_price = profile['base_price']

        # Seasonal adjustment: the category's own factor in November/December,
        # a flat 1.1 in June/July, no adjustment otherwise
        seasonal_multiplier = 1.0
        if sale_date.month in (11, 12):
            seasonal_multiplier = profile['seasonality']
        elif sale_date.month in (6, 7):
            seasonal_multiplier = 1.1

        # Price is normally distributed around the base and floored at 30% of it
        price = max(base_price + np.random.normal(0, profile['variance'] / 3),
                    base_price * 0.3)
        quantity = max(1, int(np.random.exponential(1.5)))
        # The multiplier is applied to the total only, so in seasonal months
        # total_amount differs from quantity * unit_price
        total_amount = price * quantity * seasonal_multiplier

        # Customer demographics
        customer_id = f"CUST_{random.randint(1000, 9999)}"
        # NOTE(review): the suffix is drawn from 100-999, while
        # generate_product_catalog numbers products sequentially from 001 per
        # category, so many of these IDs presumably have no catalog row -
        # confirm whether joins against the catalog are expected.
        product_id = f"PROD_{category[:3].upper()}_{random.randint(100, 999)}"

        # Sales channel
        channel = np.random.choice(channel_names, p=channel_weights)

        # Geographic region
        region = np.random.choice(region_names, p=region_weights)

        sales_data.append({
            'transaction_id': f"TXN_{i+1:06d}",
            'date': sale_date.strftime('%Y-%m-%d'),
            'customer_id': customer_id,
            'product_id': product_id,
            'category': category,
            'quantity': quantity,
            'unit_price': round(price, 2),
            'total_amount': round(total_amount, 2),
            'channel': channel,
            'region': region,
            'day_of_week': sale_date.strftime('%A'),
            'month': sale_date.month,
            'quarter': (sale_date.month - 1) // 3 + 1,
            'year': sale_date.year
        })

        if (i + 1) % 2000 == 0:
            print(f"   Generated {i+1:,} records...")

    df = pd.DataFrame(sales_data)
    print(f"✅ Sales data generated: {len(df):,} records")
    return df

# Build the sales frame with default settings and persist it in the datasets root
sales_file = datasets_dir / "sales_historical_data.csv"
sales_df = generate_sales_data()
sales_df.to_csv(sales_file, index=False)
print(f"💾 Saved: {sales_file}")
# Size on disk, converted from bytes to MiB
print(f"📊 File size: {sales_file.stat().st_size / (1024 * 1024):.2f} MB")

## 🛍️ Step 4: Generate Product Catalog Data

In [None]:
def generate_product_catalog(num_products=1200):
    """
    Generate synthetic product catalog with metadata

    The function does not seed the random generators itself; it draws from
    the global ``random`` and ``numpy.random`` state as the caller left it.

    Args:
        num_products: Number of catalog rows to generate.

    Returns:
        pandas.DataFrame with one row per product and the columns product_id,
        product_name, category, brand, price, stock_level, rating,
        review_count, features, created_date, is_active, supplier_id,
        weight_kg and dimensions. Product IDs are numbered sequentially per
        category, starting at 001.
    """
    print(f"🔄 Generating {num_products:,} product records...")

    # Product name templates by category
    product_templates = {
        'Electronics': [
            'Smart Phone Pro', 'Wireless Headphones', 'Laptop Computer',
            'Tablet Device', 'Smart Watch', 'Gaming Console', 'Digital Camera',
            'Bluetooth Speaker', 'Smart TV', 'Fitness Tracker'
        ],
        'Clothing': [
            'Cotton T-Shirt', 'Denim Jeans', 'Summer Dress', 'Winter Jacket',
            'Running Shoes', 'Casual Sneakers', 'Business Shirt', 'Yoga Pants',
            'Wool Sweater', 'Baseball Cap'
        ],
        'Books': [
            'Mystery Novel', 'Science Fiction', 'Biography', 'Cookbook',
            'History Book', 'Self-Help Guide', 'Technical Manual', 'Art Book',
            'Children Story', 'Poetry Collection'
        ],
        'Home & Garden': [
            'Garden Tools Set', 'Kitchen Appliance', 'Furniture Item', 'Decorative Lamp',
            'Storage Container', 'Cleaning Supplies', 'Bedding Set', 'Wall Art',
            'Plant Pot', 'Outdoor Furniture'
        ],
        'Sports': [
            'Exercise Equipment', 'Team Jersey', 'Sports Shoes', 'Fitness Gear',
            'Outdoor Equipment', 'Training Accessories', 'Sports Ball', 'Protective Gear',
            'Water Bottle', 'Gym Bag'
        ],
        'Beauty': [
            'Skincare Cream', 'Makeup Kit', 'Hair Care Product', 'Fragrance',
            'Beauty Tool', 'Nail Care Set', 'Face Mask', 'Body Lotion',
            'Lip Balm', 'Sunscreen'
        ]
    }

    brands = {
        'Electronics': ['TechCorp', 'InnovateTech', 'DigitalPro', 'SmartDevices', 'FutureTech'],
        'Clothing': ['FashionForward', 'StyleCo', 'TrendyWear', 'ComfortClothing', 'UrbanStyle'],
        'Books': ['BookPress', 'LiteraryHouse', 'KnowledgeBooks', 'WisdomPublishing', 'ReadMore'],
        'Home & Garden': ['HomeComfort', 'GardenPro', 'LivingSpace', 'CozyHome', 'GreenThumb'],
        'Sports': ['SportsPro', 'ActiveLife', 'FitGear', 'ChampionSports', 'HealthyActive'],
        'Beauty': ['BeautyFirst', 'GlamourCo', 'NaturalBeauty', 'SkinCare+', 'PureBeauty']
    }

    # (min, max) price per category; built once rather than on every iteration
    price_ranges = {
        'Electronics': (50, 1500),
        'Clothing': (15, 200),
        'Books': (10, 50),
        'Home & Garden': (20, 300),
        'Sports': (25, 400),
        'Beauty': (10, 150)
    }

    # Feature vocabulary per category; built once rather than on every iteration
    features = {
        'Electronics': ['Wireless', 'Waterproof', 'Fast Charging', 'HD Display', 'Voice Control'],
        'Clothing': ['Machine Washable', 'Wrinkle Free', 'Breathable', 'Stretch Fabric', 'UV Protection'],
        'Books': ['Hardcover', 'Illustrated', 'Large Print', 'Award Winning', 'Bestseller'],
        'Home & Garden': ['Eco-Friendly', 'Durable', 'Easy Assembly', 'Weather Resistant', 'Space Saving'],
        'Sports': ['Professional Grade', 'Lightweight', 'Adjustable', 'Non-Slip', 'Quick Dry'],
        'Beauty': ['Natural Ingredients', 'Dermatologist Tested', 'Paraben Free', 'Long Lasting', 'Hypoallergenic']
    }

    # Weights line up with the key order of product_templates
    category_names = list(product_templates.keys())
    category_weights = [0.25, 0.20, 0.15, 0.15, 0.15, 0.10]

    # One timestamp for the whole run so every created_date is measured from the same instant
    generated_at = datetime.now()

    products_data = []
    product_counter = {category: 0 for category in category_names}

    for i in range(num_products):
        # Select category with distribution
        category = np.random.choice(category_names, p=category_weights)

        # Per-category running number; becomes the product_id suffix
        product_counter[category] += 1

        # Generate product details
        base_name = random.choice(product_templates[category])
        brand = random.choice(brands[category])

        # Price based on category
        min_price, max_price = price_ranges[category]
        price = round(np.random.uniform(min_price, max_price), 2)

        # Stock levels
        stock_level = random.randint(0, 500)

        # Ratings and reviews
        rating = round(np.random.normal(4.2, 0.8), 1)
        rating = max(1.0, min(5.0, rating))  # Clamp between 1-5
        review_count = max(0, int(np.random.exponential(50)))

        # Product features: one to three distinct entries from the category vocabulary
        product_features = random.sample(features[category], k=random.randint(1, 3))

        products_data.append({
            'product_id': f"PROD_{category[:3].upper()}_{product_counter[category]:03d}",
            'product_name': f"{brand} {base_name}",
            'category': category,
            'brand': brand,
            'price': price,
            'stock_level': stock_level,
            'rating': rating,
            'review_count': review_count,
            'features': ', '.join(product_features),
            'created_date': (generated_at - timedelta(days=random.randint(30, 1095))).strftime('%Y-%m-%d'),
            'is_active': random.choice([True, True, True, False]),  # 75% active
            'supplier_id': f"SUP_{random.randint(100, 999)}",
            'weight_kg': round(np.random.uniform(0.1, 50.0), 2),
            'dimensions': f"{random.randint(5, 50)}x{random.randint(5, 50)}x{random.randint(2, 30)}"
        })

        if (i + 1) % 200 == 0:
            print(f"   Generated {i+1:,} products...")

    df = pd.DataFrame(products_data)
    print(f"✅ Product catalog generated: {len(df):,} products")
    print(f"📊 Category distribution:")
    for category, count in product_counter.items():
        print(f"   {category}: {count} products")

    return df

# Build the catalog with default settings and persist it in the datasets root
products_file = datasets_dir / "product_catalog.csv"
products_df = generate_product_catalog()
products_df.to_csv(products_file, index=False)
print(f"💾 Saved: {products_file}")
# Size on disk, converted from bytes to MiB
print(f"📊 File size: {products_file.stat().st_size / (1024 * 1024):.2f} MB")

## 👥 Step 5: Generate Customer Behavior Data

In [None]:
def generate_customer_behavior(num_interactions=8000, product_ids=None):
    """
    Generate synthetic customer behavior and interaction data

    The function does not seed the random generators itself; it draws from
    the global ``random`` and ``numpy.random`` state as the caller left it.

    Args:
        num_interactions: Number of interaction rows to generate.
        product_ids: Optional collection of real product IDs (for example the
            ``product_id`` column of the generated catalog). When given,
            product-related interactions sample from it. When omitted, IDs
            are fabricated as ``PROD_<CAT>_<001-200>``, which is not
            guaranteed to match any catalog row.

    Returns:
        pandas.DataFrame with one row per interaction. ``product_id`` is only
        populated for product_view, add_to_cart, remove_from_cart and
        add_to_wishlist rows. With ``num_interactions=0`` the frame is empty.

    Raises:
        ValueError: If ``product_ids`` is supplied but empty.
    """
    print(f"🔄 Generating {num_interactions:,} customer behavior records...")

    catalog_ids = None
    if product_ids is not None:
        catalog_ids = list(product_ids)
        if not catalog_ids:
            raise ValueError("product_ids must not be empty")

    # Customer segments
    customer_segments = {
        'Premium': {'conversion_rate': 0.15, 'avg_session_time': 12, 'pages_per_session': 8},
        'Regular': {'conversion_rate': 0.08, 'avg_session_time': 8, 'pages_per_session': 5},
        'Occasional': {'conversion_rate': 0.04, 'avg_session_time': 5, 'pages_per_session': 3},
        'New': {'conversion_rate': 0.02, 'avg_session_time': 3, 'pages_per_session': 2}
    }
    segment_names = list(customer_segments.keys())

    devices = ['Desktop', 'Mobile', 'Tablet']
    browsers = ['Chrome', 'Firefox', 'Safari', 'Edge']
    traffic_sources = ['Organic Search', 'Paid Search', 'Social Media', 'Direct', 'Email', 'Referral']
    regions = ['North', 'South', 'East', 'West', 'Central']
    age_groups = ['18-24', '25-34', '35-44', '45-54', '55+']

    # Interaction type probabilities (sum to 1); built once, not per iteration
    funnel_weights = {
        'page_view': 0.25,
        'product_view': 0.20,
        'search': 0.15,
        'add_to_cart': 0.10,
        'filter_apply': 0.08,
        'add_to_wishlist': 0.06,
        'sort_apply': 0.05,
        'review_read': 0.04,
        'checkout_start': 0.03,
        'remove_from_cart': 0.02,
        'purchase_complete': 0.015,
        'review_write': 0.005
    }
    funnel_types = list(funnel_weights.keys())
    funnel_probs = list(funnel_weights.values())

    # Only these interaction types carry a product_id
    product_interactions = ('product_view', 'add_to_cart', 'remove_from_cart', 'add_to_wishlist')
    category_codes = ['ELE', 'CLO', 'BOO', 'HOM', 'SPO', 'BEA']

    # One timestamp for the whole run so every offset is measured from the same instant
    generated_at = datetime.now()

    behavior_data = []

    for i in range(num_interactions):
        # Customer segment selection
        segment = np.random.choice(segment_names, p=[0.15, 0.35, 0.35, 0.15])

        customer_id = f"CUST_{random.randint(1000, 9999)}"
        session_id = f"SESS_{random.randint(100000, 999999)}"

        # Interaction timestamp: somewhere in the 1-90 days before generated_at
        interaction_date = generated_at - timedelta(
            days=random.randint(1, 90),
            hours=random.randint(0, 23),
            minutes=random.randint(0, 59)
        )

        # Interaction type based on funnel probability
        interaction_type = np.random.choice(funnel_types, p=funnel_probs)

        # Session metrics based on segment, floored at 1
        segment_info = customer_segments[segment]
        session_duration = max(1, int(np.random.normal(
            segment_info['avg_session_time'],
            segment_info['avg_session_time'] * 0.3
        )))

        pages_viewed = max(1, int(np.random.normal(
            segment_info['pages_per_session'],
            segment_info['pages_per_session'] * 0.2
        )))

        # Product interaction
        product_id = None
        if interaction_type in product_interactions:
            if catalog_ids is not None:
                # Sample a real catalog ID supplied by the caller
                product_id = random.choice(catalog_ids)
            else:
                # Legacy fallback: fabricated ID, may not exist in the catalog
                category = random.choice(category_codes)
                product_id = f"PROD_{category}_{random.randint(1, 200):03d}"

        # Device and browser
        device = np.random.choice(devices, p=[0.45, 0.40, 0.15])
        browser = np.random.choice(browsers, p=[0.60, 0.15, 0.15, 0.10])

        # Geographic and demographic info (regions are uniformly weighted)
        region = np.random.choice(regions)
        age_group = np.random.choice(age_groups, p=[0.15, 0.30, 0.25, 0.20, 0.10])

        # Traffic source
        traffic_source = np.random.choice(traffic_sources,
                                          p=[0.35, 0.20, 0.15, 0.15, 0.10, 0.05])

        behavior_data.append({
            'interaction_id': f"INT_{i+1:06d}",
            'timestamp': interaction_date.strftime('%Y-%m-%d %H:%M:%S'),
            'customer_id': customer_id,
            'session_id': session_id,
            'interaction_type': interaction_type,
            'product_id': product_id,
            'customer_segment': segment,
            'device_type': device,
            'browser': browser,
            'traffic_source': traffic_source,
            'region': region,
            'age_group': age_group,
            'session_duration_minutes': session_duration,
            'pages_viewed': pages_viewed,
            'time_on_page_seconds': random.randint(5, 300),
            'is_mobile': device == 'Mobile',
            'is_new_customer': segment == 'New',
            'hour_of_day': interaction_date.hour,
            'day_of_week': interaction_date.strftime('%A'),
            'month': interaction_date.month
        })

        if (i + 1) % 1000 == 0:
            print(f"   Generated {i+1:,} interactions...")

    df = pd.DataFrame(behavior_data)
    print(f"✅ Customer behavior data generated: {len(df):,} interactions")

    # Show segment distribution; an empty frame has no customer_segment column
    if not df.empty:
        segment_dist = df['customer_segment'].value_counts()
        print(f"📊 Customer segment distribution:")
        for segment, count in segment_dist.items():
            print(f"   {segment}: {count:,} interactions ({count/len(df)*100:.1f}%)")

    return df

# Build the behavior frame with default settings and persist it in the datasets root
behavior_file = datasets_dir / "customer_behavior.csv"
behavior_df = generate_customer_behavior()
behavior_df.to_csv(behavior_file, index=False)
print(f"💾 Saved: {behavior_file}")
# Size on disk, converted from bytes to MiB
print(f"📊 File size: {behavior_file.stat().st_size / (1024 * 1024):.2f} MB")

## ✅ Step 6: Dataset Validation and Quality Check

In [None]:
def validate_datasets(data_dir=None):
    """
    Comprehensive validation of all generated datasets

    Each expected CSV is loaded and run through six checks: minimum record
    count, required columns, numeric dtypes, missing-value ratio, duplicate
    ratio and a basic cardinality check on the first three text columns.
    A file passes when at least 80% of the checks succeed, i.e. it may fail
    one check out of six and still be reported as PASSED.

    Args:
        data_dir: Directory containing the CSV files. Defaults to the
            notebook-level ``datasets_dir``.

    Returns:
        dict mapping each expected filename to True (passed) or False
        (missing, unreadable, or failed validation).
    """
    print("🔍 Performing comprehensive dataset validation...\n")

    # Fall back to the notebook-level directory when none is supplied
    base_dir = datasets_dir if data_dir is None else Path(data_dir)

    validation_results = {}

    # Dataset file definitions
    datasets_info = {
        'sales_historical_data.csv': {
            'min_records': 10000,
            'required_columns': ['transaction_id', 'date', 'customer_id', 'product_id',
                               'category', 'total_amount', 'channel', 'region'],
            'numeric_columns': ['quantity', 'unit_price', 'total_amount', 'month', 'quarter', 'year']
        },
        'product_catalog.csv': {
            'min_records': 1000,
            'required_columns': ['product_id', 'product_name', 'category', 'brand',
                               'price', 'stock_level', 'rating'],
            'numeric_columns': ['price', 'stock_level', 'rating', 'review_count', 'weight_kg']
        },
        'customer_behavior.csv': {
            'min_records': 5000,
            'required_columns': ['interaction_id', 'timestamp', 'customer_id', 'session_id',
                               'interaction_type', 'customer_segment', 'device_type'],
            'numeric_columns': ['session_duration_minutes', 'pages_viewed', 'time_on_page_seconds',
                              'hour_of_day', 'month']
        }
    }

    for filename, requirements in datasets_info.items():
        filepath = base_dir / filename
        print(f"📋 Validating {filename}...")

        # Check file existence
        if not filepath.exists():
            print(f"❌ File not found: {filepath}")
            validation_results[filename] = False
            continue

        # Best-effort by design: any failure while loading or checking a file
        # marks that file as failed instead of aborting the remaining files
        try:
            # Load dataset
            df = pd.read_csv(filepath)
            file_size_mb = filepath.stat().st_size / 1024 / 1024
            row_count = len(df)

            print(f"   📊 Records: {row_count:,}")
            print(f"   📊 Columns: {len(df.columns)}")
            print(f"   📊 File size: {file_size_mb:.2f} MB")

            # Validation checks
            checks_passed = 0
            total_checks = 6

            # 1. Record count check
            if row_count >= requirements['min_records']:
                print(f"   ✅ Record count: {row_count:,} >= {requirements['min_records']:,}")
                checks_passed += 1
            else:
                print(f"   ❌ Record count: {row_count:,} < {requirements['min_records']:,}")

            # 2. Required columns check
            missing_columns = set(requirements['required_columns']) - set(df.columns)
            if not missing_columns:
                print(f"   ✅ All required columns present")
                checks_passed += 1
            else:
                print(f"   ❌ Missing columns: {missing_columns}")

            # 3. Data types check (columns absent from the file are skipped,
            #    not counted as failures)
            numeric_cols_valid = True
            for col in requirements['numeric_columns']:
                if col in df.columns and not pd.api.types.is_numeric_dtype(df[col]):
                    print(f"   ⚠️  Column '{col}' should be numeric")
                    numeric_cols_valid = False

            if numeric_cols_valid:
                print(f"   ✅ Numeric columns have correct data types")
                checks_passed += 1

            # 4. Missing values check
            missing_values = df.isnull().sum().sum()
            total_cells = row_count * len(df.columns)
            # A header-only file has zero cells; report 0% instead of dividing by zero
            missing_percentage = (missing_values / total_cells) * 100 if total_cells else 0.0

            if missing_percentage < 5:  # Less than 5% missing values
                print(f"   ✅ Missing values: {missing_percentage:.2f}% (acceptable)")
                checks_passed += 1
            else:
                print(f"   ⚠️  Missing values: {missing_percentage:.2f}% (high)")

            # 5. Duplicate records check (whole-row duplicates)
            duplicates = df.duplicated().sum()
            # Same zero-row guard as above
            duplicate_percentage = (duplicates / row_count) * 100 if row_count else 0.0

            if duplicate_percentage < 1:  # Less than 1% duplicates
                print(f"   ✅ Duplicate records: {duplicate_percentage:.2f}% (acceptable)")
                checks_passed += 1
            else:
                print(f"   ⚠️  Duplicate records: {duplicate_percentage:.2f}% (high)")

            # 6. Data distribution check
            has_good_distribution = True

            # Check for categorical columns with reasonable distribution
            categorical_cols = df.select_dtypes(include=['object']).columns
            for col in categorical_cols[:3]:  # Check first 3 categorical columns
                unique_values = df[col].nunique()
                if unique_values == 1:
                    print(f"   ⚠️  Column '{col}' has only one unique value")
                    has_good_distribution = False

            if has_good_distribution:
                print(f"   ✅ Data distribution looks healthy")
                checks_passed += 1

            # Overall validation result
            success_rate = (checks_passed / total_checks) * 100
            if success_rate >= 80:
                print(f"   🎉 Validation PASSED: {success_rate:.0f}% ({checks_passed}/{total_checks} checks)")
                validation_results[filename] = True
            else:
                print(f"   ❌ Validation FAILED: {success_rate:.0f}% ({checks_passed}/{total_checks} checks)")
                validation_results[filename] = False

        except Exception as e:
            print(f"   ❌ Error reading file: {str(e)}")
            validation_results[filename] = False

        print()  # Empty line for readability

    return validation_results

# Perform validation; the resulting {filename: passed} dict is read by the
# summary report and the preparation checklist in the later cells
validation_results = validate_datasets()

## 📊 Step 7: Generate Dataset Summary Report

In [None]:
def generate_summary_report(results=None, data_dir=None):
    """
    Generate a comprehensive summary report of all datasets

    Prints per-file record/column counts, size and validation status for the
    three expected CSV files, followed by overall totals. Nothing is returned.

    Args:
        results: Mapping of filename to validation outcome. Defaults to the
            notebook-level ``validation_results``.
        data_dir: Directory containing the CSV files. Defaults to the
            notebook-level ``datasets_dir``.
    """
    # Fall back to notebook-level state when the caller supplies nothing
    if results is None:
        results = validation_results
    base_dir = datasets_dir if data_dir is None else Path(data_dir)

    print("📈 DATASET SUMMARY REPORT")
    print("=" * 60)

    filenames = ['sales_historical_data.csv', 'product_catalog.csv', 'customer_behavior.csv']

    total_size = 0
    total_records = 0
    files_found = 0

    for filename in filenames:
        filepath = base_dir / filename

        # Files that were never written are skipped and simply not counted
        if not filepath.exists():
            continue

        files_found += 1
        df = pd.read_csv(filepath)
        file_size = filepath.stat().st_size

        print(f"\n📁 {filename}")
        print(f"   Records: {len(df):,}")
        print(f"   Columns: {len(df.columns)}")
        print(f"   Size: {file_size / 1024 / 1024:.2f} MB")
        print(f"   Validation: {'✅ PASSED' if results.get(filename, False) else '❌ FAILED'}")

        # Show sample data structure
        print(f"   Sample columns: {', '.join(df.columns[:5])}{'...' if len(df.columns) > 5 else ''}")

        total_size += file_size
        total_records += len(df)

    print(f"\n📊 OVERALL SUMMARY")
    print(f"   Total records: {total_records:,}")
    print(f"   Total size: {total_size / 1024 / 1024:.2f} MB")
    # Count files actually present on disk; validation outcome is reported below
    print(f"   Files created: {files_found} / {len(filenames)}")

    # Overall success status
    success_count = sum(bool(passed) for passed in results.values())
    total_files = len(results)

    if success_count == total_files:
        print(f"\n🎉 ALL DATASETS SUCCESSFULLY CREATED AND VALIDATED!")
        print(f"✅ Ready to proceed to Module 2: Predictive Model Development")
    else:
        print(f"\n⚠️  {total_files - success_count} datasets failed validation")
        print(f"❗ Please review the errors above and re-run failed sections")

    print("=" * 60)

# Generate final report (prints only; reads the notebook-level
# validation_results and datasets_dir set by earlier cells)
generate_summary_report()

## 🎯 Step 8: Next Steps and Workshop Preparation

In [None]:
# Final preparation check: print a checklist whose dataset rows mirror the
# validation outcome, then either the next steps or a remediation hint
print("🚀 WORKSHOP PREPARATION CHECKLIST")
print("=" * 50)

# Dataset rows are derived from validation_results; a missing key counts as failed
dataset_items = [
    ('sales_historical_data.csv', "Sales historical data ready"),
    ('product_catalog.csv', "Product catalog prepared"),
    ('customer_behavior.csv', "Customer behavior data generated"),
]

# Environment rows and the final row are always ticked; they are not verified here
checklist_items = (
    [
        ("✅", "OpenShift AI environment verified"),
        ("✅", "Data Science Project created"),
        ("✅", "Jupyter workbench configured"),
    ]
    + [
        ("✅" if validation_results.get(name, False) else "❌", label)
        for name, label in dataset_items
    ]
    + [("✅", "Dataset validation completed")]
)

for status, item in checklist_items:
    print(f"   {status} {item}")

all_passed = all(validation_results.values())

if all_passed:
    # Joined into one print call; the emitted text is line-for-line the same
    print("\n".join([
        "\n🎉 ENVIRONMENT SETUP COMPLETE!",
        "\n📚 You're ready to proceed to:",
        "   📂 Module 2: Predictive Model Development",
        "   📄 File: 02-predictive-model.md",
        "\n💡 What's next:",
        "   • Explore sales data patterns",
        "   • Engineer features for ML",
        "   • Train Random Forest model",
        "   • Export to ONNX format",
        "   • Deploy with OpenVINO",
    ]))
else:
    print("\n".join([
        "\n⚠️  Please resolve validation issues before proceeding",
        "   Re-run the failed sections above",
        "   Check the troubleshooting guide if needed",
    ]))

print("\n" + "=" * 50)
print("📧 Need help? Contact: cestay@redhat.com")
print("🐙 Workshop repo: https://github.com/pkstaz/ai-ecommerce-workshop")

---

## 📝 Summary

This notebook has successfully:

✅ **Created comprehensive datasets** for the AI e-commerce workshop  
✅ **Generated realistic sales historical data** with seasonal patterns and business logic  
✅ **Built a detailed product catalog** with categories, brands, and features  
✅ **Simulated customer behavior data** with user segments and interaction patterns  
✅ **Validated all datasets** for quality and completeness  
✅ **Prepared the environment** for AI model development  

**Dataset Overview:**
- **Sales Data:** 12,000+ transactions with temporal patterns
- **Product Catalog:** 1,200+ products across 6 categories
- **Customer Behavior:** 8,000+ interactions with behavioral insights

**Ready for Module 2:** Predictive Model Development

---