# Feature Engineering Part 2 - Simplified
## Advanced Features and Preprocessing

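**Prerequisites:** Complete Part 1 first — this notebook starts by loading the `enriched_data_part1.csv` file from the processed datasets folder.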

**This notebook covers:**
- Lag features and moving averages
- Interaction features
- Data preprocessing
- Final validation and export

## 📋 Setup and Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pickle
from pathlib import Path

# Notebook-wide display setup: stock matplotlib style, and every library
# warning is hidden so cell output stays readable.
plt.style.use('default')
warnings.filterwarnings('ignore')

# Confirm the environment is ready and stamp when this run began.
print("✅ Libraries imported")
print(f"📅 Started: {datetime.now():%Y-%m-%d %H:%M:%S}")

In [None]:
# Locate the processed-data folder and read back the Part 1 output
data_dir = Path("../../datasets")
processed_dir = data_dir.joinpath("processed")

# read_csv is not asked to parse dates here, so convert the column explicitly
enriched_data = pd.read_csv(processed_dir.joinpath("enriched_data_part1.csv")).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

print(f"✅ Data loaded: {enriched_data.shape}")
print(f"📊 Starting with {enriched_data.shape[1]} features")

## 📊 Lag Features and Moving Averages

In [None]:
def create_lag_features_simple(df):
    """Add per-product daily lag, moving-average and growth features.

    Transactions are summed to one row per (date, product_id). Lags and
    rolling means are computed on that daily table and the result is joined
    back onto every transaction row.

    Args:
        df: Transaction-level DataFrame with at least the columns ``date``,
            ``product_id``, ``quantity`` and ``total_amount``. The caller's
            frame is not modified; a sorted copy is used.

    Returns:
        A new DataFrame sorted by (date, product_id) with a fresh index,
        holding the input columns plus ``daily_quantity``, ``daily_revenue``,
        ``qty/rev_lag_{1,7,30}d``, ``qty/rev_ma_{7,30}d`` and
        ``qty_growth_7d``. Missing lag/average values are forward-filled per
        product and then set to 0; missing growth values are set to 0.
    """
    print("📊 Creating lag features...")

    df = df.sort_values(['date', 'product_id']).copy()

    # One row per product per day. groupby sorts its keys, so within each
    # product the rows are in ascending date order, which shift/rolling rely on.
    daily_sales = df.groupby(['date', 'product_id']).agg({
        'quantity': 'sum',
        'total_amount': 'sum'
    }).reset_index()
    daily_sales.columns = ['date', 'product_id', 'daily_quantity', 'daily_revenue']

    # The daily table deliberately carries no other columns from df (such as
    # `category`): any shared non-key column would come back from the merge
    # below as `<name>_x` / `<name>_y` and replace the original column.

    sources = {'qty': 'daily_quantity', 'rev': 'daily_revenue'}
    lag_cols = []  # exactly the columns that need gap filling afterwards

    # NOTE(review): shift() moves by rows of the daily table, i.e. by days on
    # which the product actually sold, not by calendar days. Confirm this is
    # the intended meaning of "7d" / "30d" for sparsely sold products.
    for lag in [1, 7, 30]:
        for prefix, source in sources.items():
            name = f'{prefix}_lag_{lag}d'
            daily_sales[name] = daily_sales.groupby('product_id')[source].shift(lag)
            lag_cols.append(name)

    # NOTE(review): the rolling window includes the current day's own total,
    # which is built from `quantity`. Confirm that is acceptable if `quantity`
    # is later used as the prediction target.
    for window in [7, 30]:
        for prefix, source in sources.items():
            name = f'{prefix}_ma_{window}d'
            daily_sales[name] = daily_sales.groupby('product_id')[source].transform(
                lambda x: x.rolling(window, min_periods=1).mean()
            )
            lag_cols.append(name)

    # Percentage change of the 7-row average against the 7-row lag; the 0.01
    # offset avoids division by zero and the clip bounds extreme ratios.
    daily_sales['qty_growth_7d'] = (
        (daily_sales['qty_ma_7d'] - daily_sales['qty_lag_7d']) /
        (daily_sales['qty_lag_7d'] + 0.01) * 100
    ).clip(-100, 500)
    lag_cols.append('qty_growth_7d')

    # Drop stale copies of the generated columns so re-running this step on
    # its own output does not produce suffixed duplicates.
    generated = [col for col in daily_sales.columns if col not in ('date', 'product_id')]
    df = df.drop(columns=[col for col in generated if col in df.columns])

    df = df.merge(daily_sales, on=['date', 'product_id'], how='left')

    # Fill the gaps left by shift() at the start of each product's history
    for col in lag_cols:
        if 'growth' in col:
            df[col] = df[col].fillna(0)
        else:
            df[col] = df.groupby('product_id')[col].ffill().fillna(0)

    print(f"✅ Created {len(lag_cols)} lag features")
    return df

# Run the lag / moving-average step and report the resulting frame size
enriched_data = enriched_data.pipe(create_lag_features_simple)
print(f"📊 Dataset shape after lag features: {enriched_data.shape}")

## 🔗 Interaction Features

In [None]:
def create_interaction_features_simple(df):
    """Add multiplicative interaction features and return a new DataFrame.

    Args:
        df: DataFrame containing ``unit_price``, ``is_weekend``,
            ``is_holiday_season``, ``discount_percentage``,
            ``customer_segment_encoded``, ``rating``, ``qty_growth_7d`` and
            ``qty_ma_7d``. ``lifecycle_encoded`` is optional: when absent a
            multiplier of 1 is used, so ``lifecycle_x_rating`` equals
            ``rating``. The caller's frame is left untouched.

    Returns:
        A copy of ``df`` with eight ``*_x_*`` columns appended (or overwritten
        if columns of the same name already exist).

    Raises:
        KeyError: If a required column is missing.
    """
    print("🔗 Creating interaction features...")

    price = df['unit_price']
    holiday = df['is_holiday_season']
    segment = df['customer_segment_encoded']
    rating = df['rating']

    interactions = {
        # Price and time interactions
        'price_x_weekend': price * df['is_weekend'],
        'price_x_holiday': price * holiday,
        'discount_x_holiday': df['discount_percentage'] * holiday,
        # Customer and product interactions
        'segment_x_price': segment * price,
        'lifecycle_x_rating': df.get('lifecycle_encoded', 1) * rating,
        # Lag and seasonal interactions
        'holiday_x_growth': holiday * df['qty_growth_7d'],
        'ma7_x_holiday': df['qty_ma_7d'] * holiday,
        # Three-way interaction
        'price_x_rating_x_segment': price * rating * segment,
    }

    # assign() builds a new frame, so the input is not mutated (matching the
    # lag step, which also works on a copy), and it keeps the insertion order.
    result = df.assign(**interactions)

    # Count what this function produced rather than every column whose name
    # happens to contain "_x_".
    print(f"✅ Created {len(interactions)} interaction features")
    return result

# Run the interaction step and report the resulting frame size
enriched_data = enriched_data.pipe(create_interaction_features_simple)
print(f"📊 Dataset shape after interactions: {enriched_data.shape}")

## 🧹 Data Preprocessing

In [None]:
def _is_categorical_column(series):
    """Return True when *series* holds text-like or categorical values."""
    dtype = series.dtype
    return (
        isinstance(dtype, pd.CategoricalDtype)
        or pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
    )


def preprocess_for_ml(df):
    """Build the model feature matrix and target from the enriched data.

    Steps: drop identifier/date columns, split off the ``quantity`` target,
    impute missing values, label-encode categorical columns with more than 10
    distinct values and one-hot encode the rest (first level dropped).

    Args:
        df: Enriched DataFrame containing a ``quantity`` column. The caller's
            frame is not modified; all work happens on a copy.

    Returns:
        Tuple ``(X, y, feature_summary)`` where ``X`` is the feature
        DataFrame, ``y`` is the ``quantity`` Series and ``feature_summary`` is
        a dict with the keys ``feature_names``, ``numerical_features``,
        ``categorical_features``, ``label_encoders``, ``target_column`` and
        ``total_features``.

    Raises:
        KeyError: If ``quantity`` is not a column of ``df``.
    """
    print("🧹 Preprocessing for ML...")

    # Imputation and encoding below write columns; keep the input untouched.
    df = df.copy()

    # Remove non-feature columns
    exclude_cols = ['transaction_id', 'date', 'customer_id', 'product_id',
                   'customer_first_purchase', 'customer_last_purchase']

    # Separate target
    # NOTE(review): every remaining column becomes a feature, including any
    # that were derived from `quantity` upstream. Confirm that is intended.
    target = 'quantity'
    feature_cols = [col for col in df.columns if col not in exclude_cols + [target]]

    # Identify categorical vs numerical. Besides plain object columns this
    # also recognises pandas string and category dtypes, which would otherwise
    # be sent through the numeric mean/median imputation.
    categorical_cols = [col for col in feature_cols if _is_categorical_column(df[col])]
    numerical_cols = [col for col in feature_cols if col not in categorical_cols]

    print(f"   📊 {len(numerical_cols)} numerical, {len(categorical_cols)} categorical features")

    # Handle missing values: median for price/amount columns, mean otherwise
    for col in numerical_cols:
        if df[col].isnull().any():
            if 'price' in col.lower() or 'amount' in col.lower():
                df[col] = df[col].fillna(df[col].median())
            else:
                df[col] = df[col].fillna(df[col].mean())

    for col in categorical_cols:
        # A category dtype rejects values outside its categories, so convert
        # to plain objects before inserting the 'Unknown' placeholder.
        if isinstance(df[col].dtype, pd.CategoricalDtype):
            df[col] = df[col].astype(object)
        if df[col].isnull().any():
            df[col] = df[col].fillna('Unknown')

    # Encode categorical variables (cardinality is measured after filling, so
    # 'Unknown' counts as a level)
    label_encoders = {}
    cardinality = {col: df[col].nunique() for col in categorical_cols}

    # High cardinality: label encoding
    high_cardinality = [col for col in categorical_cols if cardinality[col] > 10]
    for col in high_cardinality:
        encoded_name = f'{col}_encoded'
        le = LabelEncoder()
        df[encoded_name] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le
        # A column of this name may already exist in the input; listing it
        # twice would put duplicate columns into X.
        if encoded_name not in numerical_cols:
            numerical_cols.append(encoded_name)

    # Low cardinality: one-hot encoding
    low_cardinality = [col for col in categorical_cols if cardinality[col] <= 10]
    if low_cardinality:
        df_encoded = pd.get_dummies(
            df[low_cardinality],
            columns=low_cardinality,
            prefix=low_cardinality,
            drop_first=True,
        )
    else:
        # Nothing to encode: keep an empty, index-aligned frame for the concat
        df_encoded = pd.DataFrame(index=df.index)

    # Create final feature matrix
    X = pd.concat([df[numerical_cols], df_encoded], axis=1)
    y = df[target]

    feature_summary = {
        'feature_names': list(X.columns),
        'numerical_features': numerical_cols,
        'categorical_features': list(df_encoded.columns),
        'label_encoders': label_encoders,
        'target_column': target,
        'total_features': len(X.columns)
    }

    print(f"✅ Preprocessing complete: {X.shape}")
    return X, y, feature_summary

# Turn the enriched frame into model inputs (features, target, metadata)
X, y, feature_summary = enriched_data.pipe(preprocess_for_ml)

## 🎯 Feature Validation

In [None]:
def validate_features(X, y):
    """Run four quick health checks on the feature matrix and score them.

    Prints each measurement and returns them in a dict with the keys
    ``missing_percentage`` (share of missing cells, in percent),
    ``low_variance_features`` (columns with variance below 0.01),
    ``strong_correlations`` (columns whose absolute correlation with ``y``
    exceeds 0.1), ``avg_correlation`` (mean absolute correlation over columns
    where it is defined) and ``overall_score`` (percent of checks passed).
    """
    print("🎯 Validating features...")

    n_rows, n_cols = X.shape

    # Completeness: missing cells relative to all cells
    missing_cells = X.isnull().sum().sum()
    missing_pct = (missing_cells / (n_rows * n_cols)) * 100
    print(f"   Missing values: {missing_pct:.3f}%")

    # Near-constant columns
    low_variance = len([name for name in X.columns if X[name].var() < 0.01])
    print(f"   Low variance features: {low_variance}")

    # Absolute correlation with the target; undefined (NaN) results are skipped
    correlations = []
    for name in X.columns:
        strength = abs(X[name].corr(y))
        if np.isnan(strength):
            continue
        correlations.append(strength)
    strong_corr = len([value for value in correlations if value > 0.1])
    avg_corr = np.mean(correlations)

    print(f"   Strong correlations (>0.1): {strong_corr}")
    print(f"   Average correlation: {avg_corr:.4f}")

    # Each passed check contributes equally to the score
    checks = [
        missing_pct < 1.0,            # almost no missing data
        low_variance < n_cols * 0.1,  # under 10% near-constant columns
        strong_corr > 10,             # more than ten informative columns
        n_cols < n_rows * 0.1,        # over ten rows per feature
    ]
    score = (sum(checks) / len(checks)) * 100
    print(f"\n🏆 Quality Score: {score:.0f}%")

    return {
        'missing_percentage': missing_pct,
        'low_variance_features': low_variance,
        'strong_correlations': strong_corr,
        'avg_correlation': avg_corr,
        'overall_score': score
    }

validation_results = X.pipe(validate_features, y)

## 💾 Save Results

In [None]:
# Persist everything the next notebook needs
print("💾 Saving results...")

# Model inputs, written without the index column
X.to_csv(processed_dir.joinpath("feature_matrix.csv"), index=False)
y.to_csv(processed_dir.joinpath("target_variable.csv"), index=False)

# Metadata dictionaries are pickled as-is
(processed_dir / "feature_summary.pkl").write_bytes(pickle.dumps(feature_summary))
(processed_dir / "validation_results.pkl").write_bytes(pickle.dumps(validation_results))

# Per-feature mean, spread and correlation with the target, rounded to 4 dp
feature_stats = pd.DataFrame(
    {
        'feature_name': X.columns,
        'mean': X.mean(),
        'std': X.std(),
        'correlation_with_target': [X[name].corr(y) for name in X.columns],
    }
)
feature_stats = feature_stats.round(4)
feature_stats.to_csv(processed_dir.joinpath("feature_statistics.csv"), index=False)

print(
    f"✅ Files saved to: {processed_dir}",
    f"   📁 feature_matrix.csv: {X.shape}",
    f"   📁 target_variable.csv: {len(y)} records",
    "   📁 feature_summary.pkl: metadata",
    "   📁 feature_statistics.csv: correlations",
    sep="\n",
)

## 📊 Final Summary

In [None]:
# Closing report: banner, feature counts per family, dataset facts, two charts
print(f"\n{'=' * 60}")
print("🎉 FEATURE ENGINEERING COMPLETE!")
print("=" * 60)

# Families are assigned by substring match on the column name, so one column
# can be counted in several families.
temporal_features = sum(
    any(token in f.lower() for token in ['day', 'week', 'month', 'year', 'quarter', 'holiday', 'weekend', 'sin', 'cos'])
    for f in X.columns
)
product_features = sum(
    any(token in f.lower() for token in ['price', 'brand', 'category', 'rating', 'review', 'product'])
    for f in X.columns
)
customer_features = sum(
    any(token in f.lower() for token in ['customer', 'segment', 'lifecycle', 'rfm'])
    for f in X.columns
)
lag_features = sum('lag_' in f or '_ma_' in f for f in X.columns)
interaction_features = sum('_x_' in f for f in X.columns)

print(
    "\n📊 Feature Summary:",
    f"   🕒 Temporal: {temporal_features}",
    f"   🏷️ Product: {product_features}",
    f"   👥 Customer: {customer_features}",
    f"   📊 Lag: {lag_features}",
    f"   🔗 Interaction: {interaction_features}",
    f"   📋 Total: {X.shape[1]} features",
    sep="\n",
)

print(
    "\n📈 Dataset Ready:",
    f"   Records: {X.shape[0]:,}",
    f"   Features: {X.shape[1]}",
    f"   Quality Score: {validation_results['overall_score']:.0f}%",
    f"   Target: {feature_summary['target_column']}",
    sep="\n",
)

print(
    "\n🚀 Next Steps:",
    "   📂 Continue to: 03_train_model.ipynb",
    "   🎯 Train Random Forest for sales forecasting",
    sep="\n",
)

# Side-by-side charts: target histogram (left) and feature-family pie (right)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Left: target histogram with its mean marked
axes[0].hist(y, bins=30, alpha=0.7)
axes[0].axvline(y.mean(), color='red', linestyle='--', label=f'Mean: {y.mean():.2f}')
axes[0].set_title('Target Distribution')
axes[0].set_xlabel('Quantity')
axes[0].legend()

# Right: share of each feature family
feature_types = dict(
    Temporal=temporal_features,
    Product=product_features,
    Customer=customer_features,
    Lag=lag_features,
    Interaction=interaction_features,
)
axes[1].pie(feature_types.values(), labels=feature_types.keys(), autopct='%1.1f%%')
axes[1].set_title('Feature Type Distribution')

plt.tight_layout()
plt.show()

print(f"\n{'=' * 60}")