# Feature Engineering for Sales Forecasting - Part 1
## Module 2: Predictive Model - Basic Feature Engineering

---

**Objective:** Create fundamental features from raw sales data

**This notebook covers:**
- Data integration and enrichment
- Temporal feature engineering
- Product and category features
- Customer behavioral features

**Next notebook:** `02_feature_engineering_part2.ipynb` - Advanced features and preprocessing

---

## 📋 Step 1: Import Libraries and Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import joblib
from pathlib import Path

# Notebook-wide settings: silence library warnings, then widen pandas output
warnings.filterwarnings('ignore')
for option_name, option_value in (('display.max_columns', None), ('display.width', 1000)):
    pd.set_option(option_name, option_value)

# Plot defaults: stock matplotlib style with the seaborn "husl" palette
plt.style.use('default')
sns.set_palette("husl")

# Report that setup finished, stamped with the current local time
print("✅ Libraries imported successfully")
started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"📅 Feature engineering started: {started_at}")

## 📂 Step 2: Load and Combine Datasets

In [None]:
# Resolve the input folder and make sure the folder for processed output exists
data_dir = Path("../../datasets")
processed_dir = data_dir / "processed"
processed_dir.mkdir(exist_ok=True)

print("📊 Loading datasets...")

# Historical sales transactions
sales_df = pd.read_csv(data_dir.joinpath("sales_historical_data.csv"))
sales_record_count = len(sales_df)
print(f"✅ Sales data loaded: {sales_record_count:,} records")

# Product catalog
products_df = pd.read_csv(data_dir.joinpath("product_catalog.csv"))
product_record_count = len(products_df)
print(f"✅ Product catalog loaded: {product_record_count:,} records")

# Customer behavior log
behavior_df = pd.read_csv(data_dir.joinpath("customer_behavior.csv"))
behavior_record_count = len(behavior_df)
print(f"✅ Customer behavior loaded: {behavior_record_count:,} records")

# Quick profile of the sales table: date span, cardinalities, revenue
sales_dates = sales_df['date']
total_revenue = sales_df['total_amount'].sum()
print(f"\n📈 Sales Data Overview:")
print(f"   Date range: {sales_dates.min()} to {sales_dates.max()}")
for overview_label, overview_column in (
    ('Categories', 'category'),
    ('Products', 'product_id'),
    ('Customers', 'customer_id'),
):
    print(f"   {overview_label}: {sales_df[overview_column].nunique()} unique")
print(f"   Total revenue: ${total_revenue:,.2f}")

## 🔗 Step 3: Data Integration and Enrichment

In [None]:
def integrate_datasets(sales_df, products_df, behavior_df):
    """
    Integrate multiple datasets to create enriched features.

    Starts from the sales records, left-joins selected product catalog
    attributes on ``product_id``, then left-joins per-customer aggregates of
    the behavior data on ``customer_id``. None of the three input frames is
    modified.

    Parameters
    ----------
    sales_df : pandas.DataFrame
        Sales records. Must contain ``date``, ``product_id`` and
        ``customer_id``.
    products_df : pandas.DataFrame
        Product catalog. Must contain ``product_id``, ``brand``, ``price``,
        ``rating``, ``review_count``, ``weight_kg`` and ``is_active``.
    behavior_df : pandas.DataFrame
        Customer behavior log. Must contain ``customer_id``, ``timestamp``,
        ``session_duration_minutes``, ``pages_viewed``, ``interaction_type``,
        ``device_type`` and ``customer_segment``.

    Returns
    -------
    pandas.DataFrame
        One row per sales record (assuming ``product_id`` is unique in the
        catalog), with ``date`` parsed to datetime, the catalog attributes,
        and the columns ``avg_session_duration``, ``total_session_duration``,
        ``avg_pages_viewed``, ``total_pages_viewed``, ``total_interactions``,
        ``preferred_device`` and ``customer_segment``.
    """
    print("🔗 Integrating datasets...")

    def most_common(values):
        # First modal value of the group; 'Unknown' when there is no mode
        # (e.g. every value in the group is missing).
        modes = values.mode()
        return modes.iloc[0] if len(modes) > 0 else 'Unknown'

    # Start with sales data as base
    enriched_df = sales_df.copy()

    # Convert date to datetime
    enriched_df['date'] = pd.to_datetime(enriched_df['date'])

    # Merge with product information
    print("   📦 Adding product features...")
    product_features = products_df[[
        'product_id', 'brand', 'price', 'rating', 'review_count',
        'weight_kg', 'is_active'
    ]].copy()

    # Catalog columns that clash with a sales column get the '_catalog' suffix
    enriched_df = enriched_df.merge(
        product_features,
        on='product_id',
        how='left',
        suffixes=('', '_catalog')
    )

    # Calculate customer interaction metrics from behavior data
    print("   👥 Adding customer behavior features...")

    # Work on a private copy of the needed columns so the caller's frame is
    # never altered. The timestamp is still parsed so that malformed values
    # fail here, as they did before.
    behavior = behavior_df[[
        'customer_id', 'timestamp', 'session_duration_minutes',
        'pages_viewed', 'interaction_type', 'device_type', 'customer_segment'
    ]].copy()
    behavior['timestamp'] = pd.to_datetime(behavior['timestamp'])

    # Customer activity aggregations
    customer_metrics = behavior.groupby('customer_id').agg({
        'session_duration_minutes': ['mean', 'sum'],
        'pages_viewed': ['mean', 'sum'],
        'interaction_type': 'count',
        'device_type': most_common,
        'customer_segment': most_common
    }).round(2)

    # Flatten the two-level column index; order follows the agg spec above
    customer_metrics.columns = [
        'avg_session_duration', 'total_session_duration',
        'avg_pages_viewed', 'total_pages_viewed',
        'total_interactions', 'preferred_device', 'customer_segment'
    ]

    customer_metrics = customer_metrics.reset_index()

    # Merge customer metrics
    enriched_df = enriched_df.merge(
        customer_metrics,
        on='customer_id',
        how='left'
    )

    # Fill missing values for customers not in behavior data: averages take
    # the median of the merged rows, totals become 0, labels become 'Unknown'
    behavior_fill_values = {
        'avg_session_duration': enriched_df['avg_session_duration'].median(),
        'total_session_duration': 0,
        'avg_pages_viewed': enriched_df['avg_pages_viewed'].median(),
        'total_pages_viewed': 0,
        'total_interactions': 0,
        'preferred_device': 'Unknown',
        'customer_segment': 'Unknown'
    }

    enriched_df = enriched_df.fillna(behavior_fill_values)

    print(f"✅ Datasets integrated: {len(enriched_df):,} records with {len(enriched_df.columns)} features")

    return enriched_df

# Build the enriched dataset from the three raw sources
enriched_data = integrate_datasets(sales_df, products_df, behavior_df)

# Preview the first rows and report how many columns integration added
added_feature_count = len(enriched_data.columns) - len(sales_df.columns)
print("\n📋 Sample of enriched dataset:")
print(enriched_data.head())
print(f"\n📊 New features added: {added_feature_count}")

## 📅 Step 4: Temporal Feature Engineering

In [None]:
def create_temporal_features(df, epoch='2020-01-01'):
    """
    Create temporal features for capturing seasonality and trends.

    Adds calendar parts, sine/cosine cyclical encodings, business-calendar
    flags and a linear day counter derived from the ``date`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime64 ``date`` column. The input is not modified.
    epoch : str or datetime-like, default '2020-01-01'
        Reference date for ``days_since_epoch``; anything accepted by
        ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the temporal feature columns added.
    """
    print("📅 Creating temporal features...")

    df = df.copy()
    dates = df['date'].dt

    # Basic temporal features
    df['year'] = dates.year
    df['month'] = dates.month
    df['day'] = dates.day
    df['day_of_year'] = dates.dayofyear
    df['day_of_week'] = dates.dayofweek  # 0=Monday, 6=Sunday

    # isocalendar() returns the nullable UInt32 extension dtype, which would
    # also turn the derived sin/cos columns into nullable Float64. Cast to a
    # plain NumPy dtype like the other calendar parts: int64 normally,
    # float64 (NaN) when some dates are missing.
    iso_week = dates.isocalendar().week
    week_dtype = 'float64' if iso_week.isna().any() else 'int64'
    df['week_of_year'] = iso_week.astype(week_dtype)

    # Cyclical encoding for temporal features
    print("   🔄 Creating cyclical temporal features...")

    # Month cyclical (12 months)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

    # Day of week cyclical (7 days)
    df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

    # Day of year cyclical (period fixed at 365; leap-day 366 wraps slightly)
    df['day_of_year_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
    df['day_of_year_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 365)

    # Week of year cyclical (period fixed at 52; ISO week 53 maps onto week 1)
    df['week_of_year_sin'] = np.sin(2 * np.pi * df['week_of_year'] / 52)
    df['week_of_year_cos'] = np.cos(2 * np.pi * df['week_of_year'] / 52)

    # Business calendar features
    print("   📆 Adding business calendar features...")

    # Weekend indicator (Saturday=5, Sunday=6)
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

    # Month-end indicator (last 3 days of month)
    df['is_month_end'] = (dates.days_in_month - df['day'] <= 2).astype(int)

    # Quarter features
    df['quarter'] = dates.quarter
    df['is_q4'] = (df['quarter'] == 4).astype(int)  # Holiday season

    # Holiday seasons (approximate)
    df['is_holiday_season'] = (
        ((df['month'] == 11) & (df['day'] >= 20)) |  # Late November
        (df['month'] == 12) |  # December
        ((df['month'] == 1) & (df['day'] <= 7))     # Early January
    ).astype(int)

    # Summer season
    df['is_summer'] = (df['month'].isin([6, 7, 8])).astype(int)

    # Back-to-school season (Aug 15 through Sep 15)
    df['is_back_to_school'] = (
        ((df['month'] == 8) & (df['day'] >= 15)) |
        ((df['month'] == 9) & (df['day'] <= 15))
    ).astype(int)

    # Days since epoch (for trend analysis)
    df['days_since_epoch'] = (df['date'] - pd.to_datetime(epoch)).dt.days

    # The reported count is a name-based heuristic: it counts every column
    # whose name contains one of these fragments, including pre-existing ones.
    print(f"✅ Temporal features created: {len([col for col in df.columns if any(x in col.lower() for x in ['sin', 'cos', 'is_', 'day', 'week', 'month', 'year', 'quarter'])])} features")

    return df

# Apply temporal feature engineering
enriched_data = create_temporal_features(enriched_data)

# Visualize some temporal patterns
print("\n📊 Temporal Pattern Analysis:")

# 2x2 grid: monthly totals, weekday averages, quarterly totals, holiday effect
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Monthly sales pattern
monthly_sales = enriched_data.groupby('month')['total_amount'].sum()
axes[0, 0].bar(monthly_sales.index, monthly_sales.values)
axes[0, 0].set_title('Monthly Sales Distribution')
axes[0, 0].set_xlabel('Month')
axes[0, 0].set_ylabel('Total Sales ($)')

# Day of week pattern. Reindex onto all seven weekdays so bar positions always
# line up with dow_labels even when a weekday has no sales (it becomes NaN).
dow_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
dow_sales = (
    enriched_data.groupby('day_of_week')['total_amount'].mean()
    .reindex(range(len(dow_labels)))
)
axes[0, 1].bar(range(7), dow_sales.values)
axes[0, 1].set_title('Average Sales by Day of Week')
axes[0, 1].set_xlabel('Day of Week')
axes[0, 1].set_ylabel('Average Sales ($)')
axes[0, 1].set_xticks(range(7))
axes[0, 1].set_xticklabels(dow_labels)

# Quarterly pattern
quarterly_sales = enriched_data.groupby('quarter')['total_amount'].sum()
axes[1, 0].bar(quarterly_sales.index, quarterly_sales.values)
axes[1, 0].set_title('Quarterly Sales Distribution')
axes[1, 0].set_xlabel('Quarter')
axes[1, 0].set_ylabel('Total Sales ($)')

# Holiday vs non-holiday sales. Reindex onto both flag values (0 = regular,
# 1 = holiday) so the two fixed labels always match two values, even when the
# data covers only one of the seasons.
holiday_comparison = (
    enriched_data.groupby('is_holiday_season')['total_amount'].mean()
    .reindex([0, 1])
)
axes[1, 1].bar(['Regular Season', 'Holiday Season'], holiday_comparison.values)
axes[1, 1].set_title('Holiday Season Impact on Sales')
axes[1, 1].set_ylabel('Average Sales ($)')

plt.tight_layout()
plt.show()

# Headline numbers; the lift prints as nan when one of the seasons is absent
holiday_lift = (holiday_comparison.loc[1] / holiday_comparison.loc[0] - 1) * 100
print(f"\n📈 Key Temporal Insights:")
print(f"   Highest sales month: {monthly_sales.idxmax()} (${monthly_sales.max():,.0f})")
print(f"   Best day of week: {dow_labels[dow_sales.idxmax()]} (${dow_sales.max():.0f} avg)")
print(f"   Holiday season lift: {holiday_lift:.1f}%")

## 🏷️ Step 5: Product and Category Features

In [None]:
def create_product_features(df):
    """
    Create product-specific features including pricing, popularity, and performance metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Transaction-level data. Must contain ``product_id``, ``customer_id``,
        ``category``, ``brand``, ``unit_price``, ``price`` (catalog price),
        ``quantity``, ``total_amount``, ``rating`` and ``review_count``.
        The input is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with price, category, product, brand and quality
        feature columns added. Columns ending in ``_x``, ``_y``, ``_prod``,
        ``_brand`` or ``_market`` are removed, so the function can be re-run
        on its own output; statistics that already exist are kept as they are.

    Notes
    -----
    ``price_quartile`` is an ordered categorical (Low < Medium < High <
    Premium) computed within each category. When a category has too few
    distinct prices for four bins, the bins that remain are labelled from
    'Low' upwards; a category with a single distinct price gets NaN.
    """
    print("🏷️ Creating product and category features...")

    df = df.copy()

    # Remove leftover merge-suffix columns if any exist
    columns_to_clean = [col for col in df.columns if col.endswith(('_x', '_y'))]
    if columns_to_clean:
        print(f"   🧹 Cleaning {len(columns_to_clean)} duplicate columns...")
        df = df.drop(columns=columns_to_clean)

    # Price-related features
    print("   💰 Creating price features...")

    # Price difference between unit price and catalog price
    df['price_difference'] = df['unit_price'] - df['price']
    df['price_ratio'] = df['unit_price'] / (df['price'] + 0.01)  # Avoid division by zero

    # Discount indicator: sold more than one cent below the catalog price
    df['is_discounted'] = (df['price_difference'] < -0.01).astype(int)
    df['discount_percentage'] = np.where(
        df['is_discounted'],
        ((df['price'] - df['unit_price']) / df['price'] * 100).clip(0, 100),
        0
    )

    # Price categories by quartiles within each category.
    # Passing the label list straight to qcut together with duplicates='drop'
    # raises ValueError as soon as an edge is dropped (label count no longer
    # matches bin count). Ask for integer bin positions instead and label
    # whichever bins remain.
    quartile_labels = ['Low', 'Medium', 'High', 'Premium']
    quartile_codes = df.groupby('category')['unit_price'].transform(
        lambda prices: pd.qcut(prices, q=4, labels=False, duplicates='drop')
    )
    df['price_quartile'] = pd.Categorical(
        quartile_codes.map(lambda code: quartile_labels[int(code)], na_action='ignore'),
        categories=quartile_labels,
        ordered=True
    )

    # Category-level aggregations - only the ones that do not exist yet
    print("   📦 Creating category features...")

    category_cols_needed = ['category_avg_price', 'category_price_std', 'category_median_price',
                           'category_avg_quantity', 'category_quantity_std', 'category_avg_amount',
                           'category_total_amount', 'category_transaction_count']

    missing_category_cols = [col for col in category_cols_needed if col not in df.columns]

    if missing_category_cols:
        print(f"   Creating missing category columns: {len(missing_category_cols)}")

        category_stats = df.groupby('category').agg({
            'unit_price': ['mean', 'std', 'median'],
            'quantity': ['mean', 'std'],
            'total_amount': ['mean', 'sum', 'count']
        }).round(2)

        # Flatten column names; order follows the agg spec above
        category_stats.columns = category_cols_needed
        category_stats = category_stats.reset_index()

        # Merge only the missing statistics so existing columns are neither
        # duplicated nor left behind under a suffixed name
        df = df.merge(
            category_stats[['category'] + missing_category_cols],
            on='category',
            how='left'
        )
    else:
        print("   ✅ Category features already exist")

    # Product performance features - built first, the relative features need them
    print("   📊 Creating product performance features...")

    # Product-level aggregations (computed over every row of df)
    product_stats = df.groupby('product_id').agg({
        'quantity': ['sum', 'mean', 'count'],
        'total_amount': ['sum', 'mean'],
        'customer_id': 'nunique'
    }).round(2)

    product_stats.columns = [
        'product_total_qty', 'product_avg_qty', 'product_transaction_count',
        'product_total_revenue', 'product_avg_revenue',
        'product_unique_customers'
    ]

    product_stats = product_stats.reset_index()

    # Popularity score: weighted mix of transaction count and distinct buyers
    product_stats['product_popularity_score'] = (
        product_stats['product_transaction_count'] * 0.4 +
        product_stats['product_unique_customers'] * 0.6
    ).round(2)

    # Merge product statistics. Columns that already exist keep their name;
    # the recomputed copy gets the '_prod' suffix and is dropped at the end.
    original_cols = set(df.columns)
    df = df.merge(product_stats, on='product_id', how='left', suffixes=('', '_prod'))
    new_cols = set(df.columns) - original_cols
    print(f"   ✅ Added product columns: {len(new_cols)}")

    # Relative performance features - their inputs are guaranteed to exist now
    print("   📈 Creating relative performance features...")

    # Price relative to category average
    df['price_vs_category_avg'] = df['unit_price'] / (df['category_avg_price'] + 0.01)

    # Product popularity relative to the row-weighted category mean
    category_pop_mean = df.groupby('category')['product_popularity_score'].transform('mean')
    df['product_performance_vs_category'] = df['product_popularity_score'] / (category_pop_mean + 0.01)

    # Brand features
    print("   🏷️ Creating brand features...")

    # Brand-level aggregations
    brand_stats = df.groupby('brand').agg({
        'total_amount': ['sum', 'mean', 'count'],
        'rating': 'mean',
        'review_count': 'mean'
    }).round(2)

    brand_stats.columns = [
        'brand_total_revenue', 'brand_avg_revenue', 'brand_transaction_count',
        'brand_avg_rating', 'brand_avg_reviews'
    ]

    brand_stats = brand_stats.reset_index()

    # Merge brand statistics (same suffix-and-drop convention as above)
    df = df.merge(brand_stats, on='brand', how='left', suffixes=('', '_brand'))

    # Calculate brand market share within category. Deliberately best-effort:
    # any failure is reported and the share defaults to 0.0.
    try:
        category_totals = df.groupby('category')['total_amount'].sum().reset_index()
        category_totals.columns = ['category', 'category_total_revenue_calc']

        brand_category_revenue = df.groupby(['brand', 'category'])['total_amount'].sum().reset_index()
        brand_category_revenue = brand_category_revenue.merge(category_totals, on='category')
        brand_category_revenue['brand_market_share'] = (
            brand_category_revenue['total_amount'] / brand_category_revenue['category_total_revenue_calc'] * 100
        ).round(2)

        # Merge brand market share
        df = df.merge(
            brand_category_revenue[['brand', 'category', 'brand_market_share']],
            on=['brand', 'category'],
            how='left',
            suffixes=('', '_market')
        )
    except Exception as e:
        print(f"   ⚠️ Warning: Could not calculate brand market share: {e}")
        df['brand_market_share'] = 0.0

    # Product quality indicators
    print("   ⭐ Creating quality indicators...")

    # Rating-based features
    df['is_high_rated'] = (df['rating'] >= 4.0).astype(int)
    df['is_well_reviewed'] = (df['review_count'] >= 50).astype(int)

    # Review density (reviews per rating point)
    df['review_density'] = df['review_count'] / (df['rating'] + 0.1)

    # Final cleanup of suffixed duplicates produced by the merges above
    final_columns_to_clean = [
        col for col in df.columns
        if col.endswith(('_x', '_y', '_prod', '_brand', '_market'))
    ]
    if final_columns_to_clean:
        print(f"   🧹 Final cleanup: removing {len(final_columns_to_clean)} duplicate columns")
        df = df.drop(columns=final_columns_to_clean)

    # Name-based heuristic count; it also matches columns that existed before
    product_feature_count = len([col for col in df.columns if any(x in col.lower() for x in ['price', 'brand', 'category', 'product', 'rating', 'review'])])
    print(f"✅ Product features created: {product_feature_count} features")

    return df

# Apply product feature engineering
enriched_data = create_product_features(enriched_data)

# Analyze product feature distributions
print("\n📊 Product Feature Analysis:")

# 2x2 grid: price spread, discount sizes, rating vs sales, top brands
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Price distribution by category
sns.boxplot(data=enriched_data, x='category', y='unit_price', ax=axes[0, 0])
axes[0, 0].set_title('Price Distribution by Category')
axes[0, 0].tick_params(axis='x', rotation=45)

# Discount percentage distribution (discounted transactions only)
discount_data = enriched_data[enriched_data['is_discounted'] == 1]['discount_percentage']
axes[0, 1].hist(discount_data, bins=20, alpha=0.7)
axes[0, 1].set_title('Discount Percentage Distribution')
axes[0, 1].set_xlabel('Discount %')
axes[0, 1].set_ylabel('Frequency')

# Rating vs Sales correlation
rating_sales = enriched_data.groupby('rating')['total_amount'].mean()
axes[1, 0].scatter(rating_sales.index, rating_sales.values, alpha=0.6)
axes[1, 0].set_title('Rating vs Average Sales')
axes[1, 0].set_xlabel('Product Rating')
axes[1, 0].set_ylabel('Average Sales ($)')

# Brand performance: brand_total_revenue is constant per brand, so first() reads it
top_brands = enriched_data.groupby('brand')['brand_total_revenue'].first().nlargest(10)
axes[1, 1].barh(range(len(top_brands)), top_brands.values)
axes[1, 1].set_title('Top 10 Brands by Revenue')
axes[1, 1].set_xlabel('Total Revenue ($)')
axes[1, 1].set_yticks(range(len(top_brands)))
axes[1, 1].set_yticklabels(top_brands.index)

plt.tight_layout()
plt.show()

# Key insights
print(f"\n📈 Key Product Insights:")
discount_rate = (enriched_data['is_discounted'].sum() / len(enriched_data)) * 100
avg_discount = discount_data.mean()
# Reindex onto both flag values so a dataset with only high-rated (or only
# low-rated) products yields a nan lift instead of a KeyError.
high_rated_performance = (
    enriched_data.groupby('is_high_rated')['total_amount'].mean()
    .reindex([0, 1])
)
high_rated_lift = (high_rated_performance.loc[1] / high_rated_performance.loc[0] - 1) * 100

print(f"   Discount rate: {discount_rate:.1f}% of transactions")
print(f"   Average discount: {avg_discount:.1f}%")
print(f"   High-rated products sales lift: {high_rated_lift:.1f}%")

## 👥 Step 6: Customer and Behavioral Features

In [None]:
def create_customer_features(df):
    """
    Create customer-specific features including segmentation, behavior, and purchase patterns.

    Parameters
    ----------
    df : pandas.DataFrame
        Transaction-level data. Must contain ``customer_id``, ``date``
        (datetime), ``total_amount``, ``quantity``, ``product_id``,
        ``category``, ``brand``, ``channel``, ``region``,
        ``avg_session_duration``, ``avg_pages_viewed``, ``total_interactions``
        and ``customer_segment``. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with customer history, RFM, lifecycle, channel,
        regional and engagement columns added. Columns produced by an earlier
        run are replaced, so the function can be re-run on its own output.

    Notes
    -----
    RFM quintiles are computed over transaction rows, not over distinct
    customers. When tied values leave fewer than five distinct bins, the bins
    that remain are scored with the leading entries of the label list; when
    no bin can be formed at all, the middle score is used.
    """
    print("👥 Creating customer and behavioral features...")

    df = df.copy()

    def merge_replacing(frame, stats, key):
        # Drop columns a previous run already added so the merge never emits
        # '_x' / '_y' duplicates that would break the lookups further down.
        stale = [col for col in stats.columns if col != key and col in frame.columns]
        return frame.drop(columns=stale).merge(stats, on=key, how='left')

    def quantile_score(values, labels):
        # Passing labels to qcut with duplicates='drop' raises ValueError once
        # an edge is dropped, so bin by position and map positions to labels.
        codes = pd.qcut(values, q=len(labels), labels=False, duplicates='drop')
        scores = codes.map(lambda code: labels[int(code)], na_action='ignore')
        scores = pd.to_numeric(scores, errors='coerce')
        # Fill NaN values with the median, or the middle label if all are NaN
        fallback = scores.median()
        if pd.isna(fallback):
            fallback = labels[len(labels) // 2]
        return scores.fillna(fallback)

    def most_common(values):
        # First modal value of the group; 'Unknown' when there is no mode
        modes = values.mode()
        return modes.iloc[0] if len(modes) > 0 else 'Unknown'

    # Customer transaction history features
    print("   📊 Creating customer transaction features...")

    # Customer-level aggregations
    customer_stats = df.groupby('customer_id').agg({
        'total_amount': ['sum', 'mean', 'count'],
        'quantity': ['sum', 'mean'],
        'product_id': 'nunique',
        'category': 'nunique',
        'brand': 'nunique',
        'date': ['min', 'max']
    }).round(2)

    # Flatten column names; order follows the agg spec above
    customer_stats.columns = [
        'customer_total_spent', 'customer_avg_order_value', 'customer_transaction_count',
        'customer_total_quantity', 'customer_avg_quantity',
        'customer_unique_products', 'customer_unique_categories', 'customer_unique_brands',
        'customer_first_purchase', 'customer_last_purchase'
    ]

    customer_stats = customer_stats.reset_index()

    first_purchase = pd.to_datetime(customer_stats['customer_first_purchase'])
    last_purchase = pd.to_datetime(customer_stats['customer_last_purchase'])

    # Active span in days, inclusive of both ends (so never zero)
    customer_stats['customer_lifetime_days'] = (last_purchase - first_purchase).dt.days + 1

    # Days since last purchase (recency), measured from the latest date in df
    reference_date = df['date'].max()
    customer_stats['customer_recency_days'] = (reference_date - last_purchase).dt.days

    # Customer value per day
    customer_stats['customer_value_per_day'] = (
        customer_stats['customer_total_spent'] / customer_stats['customer_lifetime_days']
    ).round(2)

    # Customer diversity score (based on categories and brands purchased)
    customer_stats['customer_diversity_score'] = (
        customer_stats['customer_unique_categories'] * 0.6 +
        customer_stats['customer_unique_brands'] * 0.4
    ).round(2)

    # Merge customer statistics
    df = merge_replacing(df, customer_stats, 'customer_id')

    # Customer segmentation based on RFM analysis
    print("   🎯 Creating customer segmentation features...")

    # RFM quintiles; recency is reversed so recent buyers score highest
    df['recency_quintile'] = quantile_score(df['customer_recency_days'], [5, 4, 3, 2, 1])
    df['frequency_quintile'] = quantile_score(df['customer_transaction_count'], [1, 2, 3, 4, 5])
    df['monetary_quintile'] = quantile_score(df['customer_total_spent'], [1, 2, 3, 4, 5])

    # RFM Score: three digits, R then F then M
    df['rfm_score'] = (
        df['recency_quintile'].astype(int) * 100 +
        df['frequency_quintile'].astype(int) * 10 +
        df['monetary_quintile'].astype(int)
    )

    # Customer lifecycle stage. np.select takes the first matching condition,
    # which reproduces an if/elif chain in this order; rows matching nothing
    # (including missing recency) are 'Dormant'.
    transaction_count = df['customer_transaction_count']
    recency_days = df['customer_recency_days']
    df['customer_lifecycle_stage'] = np.select(
        [
            transaction_count == 1,
            (recency_days <= 30) & (transaction_count >= 5),
            recency_days <= 30,
            recency_days <= 90,
            recency_days <= 180,
        ],
        ['New', 'Loyal', 'Active', 'Regular', 'At_Risk'],
        default='Dormant'
    )

    # Channel and regional preferences
    print("   🛒 Creating channel and regional features...")

    # Customer's preferred channel
    customer_channel_prefs = df.groupby('customer_id')['channel'].agg(most_common).reset_index()
    customer_channel_prefs.columns = ['customer_id', 'customer_preferred_channel']

    df = merge_replacing(df, customer_channel_prefs, 'customer_id')

    # Channel consistency (how often customer uses their preferred channel)
    df['uses_preferred_channel'] = (df['channel'] == df['customer_preferred_channel']).astype(int)

    customer_channel_consistency = df.groupby('customer_id')['uses_preferred_channel'].mean().reset_index()
    customer_channel_consistency.columns = ['customer_id', 'customer_channel_consistency']

    df = merge_replacing(df, customer_channel_consistency, 'customer_id')

    # Regional features
    region_stats = df.groupby('region').agg({
        'total_amount': ['mean', 'std'],
        'customer_id': 'nunique'
    }).round(2)

    region_stats.columns = ['region_avg_order_value', 'region_order_std', 'region_unique_customers']
    region_stats = region_stats.reset_index()

    df = merge_replacing(df, region_stats, 'region')

    # Customer vs regional performance
    df['customer_vs_region_performance'] = df['customer_avg_order_value'] / df['region_avg_order_value']

    # Behavioral features from customer behavior data
    print("   🎭 Enhancing with behavioral features...")

    # Digital engagement score; interactions are normalised per transaction
    df['digital_engagement_score'] = (
        df['avg_session_duration'] * 0.3 +
        df['avg_pages_viewed'] * 0.4 +
        (df['total_interactions'] / df['customer_transaction_count']) * 0.3
    ).round(2)

    # Customer segment encoding; segment names not listed here map to NaN
    segment_values = {
        'Premium': 4,
        'Regular': 3,
        'Occasional': 2,
        'New': 1,
        'Unknown': 0
    }

    df['customer_segment_encoded'] = df['customer_segment'].map(segment_values)

    # Name-based heuristic count; it also matches columns that existed before
    print(f"✅ Customer features created: {len([col for col in df.columns if any(x in col.lower() for x in ['customer', 'rfm', 'lifecycle', 'channel', 'region', 'segment'])])} features")

    return df

# Apply customer feature engineering
enriched_data = create_customer_features(enriched_data)

# Analyze customer features
print("\n📊 Customer Feature Analysis:")

# enriched_data has one row per transaction, so per-customer attributes are
# repeated once per purchase. Keep one row per customer for the customer-level
# charts and figures below; otherwise frequent buyers are counted many times.
customer_level = enriched_data.drop_duplicates(subset='customer_id')

# 2x2 grid: lifecycle mix, RFM scores, order value by segment, channel mix
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Customer lifecycle distribution
lifecycle_counts = customer_level['customer_lifecycle_stage'].value_counts()
axes[0, 0].pie(lifecycle_counts.values, labels=lifecycle_counts.index, autopct='%1.1f%%')
axes[0, 0].set_title('Customer Lifecycle Distribution')

# RFM Score distribution
axes[0, 1].hist(customer_level['rfm_score'], bins=20, alpha=0.7)
axes[0, 1].set_title('RFM Score Distribution')
axes[0, 1].set_xlabel('RFM Score')
axes[0, 1].set_ylabel('Frequency')

# Customer segment performance
segment_performance = customer_level.groupby('customer_segment')['customer_avg_order_value'].mean().sort_values(ascending=False)
axes[1, 0].bar(segment_performance.index, segment_performance.values)
axes[1, 0].set_title('Average Order Value by Customer Segment')
axes[1, 0].set_ylabel('Average Order Value ($)')
axes[1, 0].tick_params(axis='x', rotation=45)

# Channel preferences
channel_counts = customer_level['customer_preferred_channel'].value_counts()
axes[1, 1].bar(channel_counts.index, channel_counts.values)
axes[1, 1].set_title('Customer Channel Preferences')
axes[1, 1].set_ylabel('Number of Customers')

plt.tight_layout()
plt.show()

# Key insights
print(f"\n📈 Key Customer Insights:")
avg_clv = customer_level['customer_total_spent'].mean()
loyal_customers = (customer_level['customer_lifecycle_stage'] == 'Loyal').sum()
digital_engagement = customer_level['digital_engagement_score'].mean()

print(f"   Average Customer Lifetime Value: ${avg_clv:.2f}")
print(f"   Loyal customers: {loyal_customers:,} ({loyal_customers/len(customer_level)*100:.1f}%)")
print(f"   Average digital engagement score: {digital_engagement:.2f}")

## 💾 Step 7: Save Intermediate Results

In [None]:
# Persist the Part 1 feature table so Part 2 can pick it up
print("💾 Saving intermediate results...")

output_path = processed_dir / "enriched_data_part1.csv"
enriched_data.to_csv(output_path, index=False)

total_feature_count = len(enriched_data.columns)
print(f"✅ Intermediate data saved: {output_path}")
print(f"📊 Current dataset shape: {enriched_data.shape}")
print(f"📈 Features created so far: {total_feature_count}")

# Closing banner for Part 1
banner = "=" * 60
print("\n" + banner)
print("🎯 PART 1 FEATURE ENGINEERING COMPLETE")
print(banner)


def _count_columns_containing(fragments):
    # Number of enriched_data columns whose lower-cased name contains any fragment
    return sum(
        1
        for column in enriched_data.columns
        if any(fragment in column.lower() for fragment in fragments)
    )


# Per-family counts are name-based, so a column may fall into several families
print(f"\n📊 Features Created in Part 1:")
temporal_features = _count_columns_containing(
    ['sin', 'cos', 'is_', 'day', 'week', 'month', 'year', 'quarter']
)
product_features = _count_columns_containing(
    ['price', 'brand', 'category', 'product', 'rating', 'review']
)
customer_features = _count_columns_containing(
    ['customer', 'rfm', 'lifecycle', 'channel', 'region', 'segment']
)

print(f"   🕒 Temporal features: {temporal_features}")
print(f"   🏷️ Product features: {product_features}")
print(f"   👥 Customer features: {customer_features}")
print(f"   📋 Total features: {total_feature_count}")

# Pointers to the follow-up notebooks
print(f"\n🚀 Next Steps:")
print(f"   📂 Continue to: 02_feature_engineering_part2.ipynb")
print(f"   🎯 Part 2 will add: Lag features, interactions, and preprocessing")
print(f"   📄 Then proceed to: 03_train_model.ipynb")

print("\n" + banner)

---

## 📝 Part 1 Summary

This notebook has successfully created the fundamental features for sales forecasting:

✅ **Data Integration** - Combined sales, product, and customer behavior data  
✅ **Temporal Features** - Seasonal patterns, cyclical encoding, business calendars  
✅ **Product Features** - Pricing analysis, brand performance, quality metrics  
✅ **Customer Features** - RFM segmentation, behavioral scoring, lifecycle stages  

**Ready for Part 2:** Advanced feature engineering including lag features, interaction terms, and final preprocessing.

---