# 🤖 Model Training and Evaluation
## House Price Prediction - Machine Learning Pipeline

Welcome to the final notebook in our house price prediction series! In this notebook, we'll:

1. **Load Engineered Features** - Import the processed data produced by our feature engineering notebook
2. **Train Multiple Models** - Test various ML algorithms
3. **Evaluate Performance** - Compare models using multiple metrics  
4. **Hyperparameter Tuning** - Optimize the best performing models
5. **Make Predictions** - Generate predictions on test data
6. **Model Interpretation** - Understand feature importance

Let's build some powerful predictive models! 🚀

## 1. Setup and Imports

Let's start by importing the libraries we need and setting up our environment.

In [None]:
# Essential imports with fallback handling
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
from time import time

# Silence library warnings so the notebook output stays readable
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib's default style with seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

print("📦 Basic packages loaded successfully!")

# scikit-learn is optional here: `sklearn_available` records whether the
# import worked so that later cells can branch on it.
try:
    from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
    from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.svm import SVR
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    from sklearn.preprocessing import StandardScaler
except ImportError as import_error:
    print(f"⚠️ Scikit-learn import error: {import_error}")
    print("🔄 Will use basic implementations where possible")
    sklearn_available = False
else:
    print("✅ Scikit-learn imported successfully!")
    sklearn_available = True

# SciPy is optional as well; `scipy_available` records the outcome.
try:
    import scipy.stats as stats
except ImportError:
    print("⚠️ SciPy not available - using basic statistics")
    scipy_available = False
else:
    print("✅ SciPy available for statistical functions")
    scipy_available = True

# Seed NumPy's global random generator for reproducibility
np.random.seed(42)

# Summary of what is available, emitted as one multi-line message
print(
    "\n🎯 Environment setup complete!\n"
    f"   Scikit-learn: {'✅' if sklearn_available else '❌'}\n"
    f"   SciPy: {'✅' if scipy_available else '❌'}\n"
    f"   NumPy version: {np.__version__}\n"
    f"   Pandas version: {pd.__version__}"
)

# Default figure size and font size for every plot in the notebook
plt.rcParams.update({'figure.figsize': (12, 8), 'font.size': 10})

## 2. Load Engineered Data

Let's load the features we created in the previous notebook.

In [None]:
# Load Engineered Data
#
# Tries the engineered feature file first and falls back to the raw training
# CSV. Binds three names for later cells:
#   df          - the loaded DataFrame, or None when nothing could be loaded
#   data_source - "engineered", "original", or None on failure
#   has_target  - True only when a 'SalePrice' column is present
print("=== Loading Engineered Data ===")

# Define paths to our processed data
data_dir = "../outputs/processed_data"
original_data_dir = "../data"

# Bind the result names up front so that they exist (and are not stale from a
# previous run of this cell) even when every load attempt fails.
df = None
data_source = None
has_target = False

# Try to load engineered features first
try:
    engineered_file = os.path.join(data_dir, "engineered_features.csv")
    if not os.path.exists(engineered_file):
        raise FileNotFoundError("Engineered features not found")

    df = pd.read_csv(engineered_file)
    print(f"✅ Loaded engineered features from: {engineered_file}")
    print(f"   Shape: {df.shape}")
    data_source = "engineered"

# Deliberately broad: any failure to read the engineered file (missing,
# unreadable, unparsable) should trigger the fallback rather than abort.
except Exception as e:
    print(f"⚠️ Could not load engineered features: {e}")
    print("🔄 Trying to load original training data...")

    # Fallback to original data
    try:
        train_file = os.path.join(original_data_dir, "train.csv")
        if not os.path.exists(train_file):
            raise FileNotFoundError("No data files found")

        df = pd.read_csv(train_file)
        print(f"✅ Loaded original training data from: {train_file}")
        print(f"   Shape: {df.shape}")
        data_source = "original"
    except Exception as e2:
        print(f"❌ Could not load any data: {e2}")
        print("Please make sure you have either:")
        print("1. Run the feature engineering notebook first, OR")
        print("2. Downloaded the Kaggle data to ../data/")
        df = None
        data_source = None

if data_source:
    # Basic data info
    print(f"\n📊 Dataset Information:")
    print(f"   Rows: {df.shape[0]:,}")
    print(f"   Columns: {df.shape[1]:,}")
    print(f"   Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

    # Check for target variable
    has_target = 'SalePrice' in df.columns
    if has_target:
        print(f"   Target variable: SalePrice")
        print(f"   Target range: ${df['SalePrice'].min():,.0f} - ${df['SalePrice'].max():,.0f}")
        print(f"   Target mean: ${df['SalePrice'].mean():,.0f}")
    else:
        print("   ⚠️ No target variable found")

    # Check for missing values (per-column counts are computed once and reused)
    null_counts = df.isnull().sum()
    missing_count = null_counts.sum()
    print(f"   Missing values: {missing_count}")

    if missing_count > 0:
        print("   Features with missing values:")
        missing_features = null_counts[null_counts > 0]
        # Only the first 10 affected columns are listed to keep output short
        for feature, count in missing_features.head(10).items():
            print(f"     {feature}: {count}")

    # Display first few rows
    print(f"\n📋 First 5 rows:")
    print(df.head())

    # Data types
    print(f"\n🔢 Data types:")
    print(df.dtypes.value_counts())

else:
    print("❌ Cannot proceed without data. Please check your data files.")

## 3. Data Preparation

Now let's prepare our data for machine learning by creating features (X) and target (y) variables, and splitting into train/test sets.

In [None]:
# Data Preparation
#
# Builds X / y from `df`, cleans and encodes the features, and splits them
# into X_train / X_test / y_train / y_test. `data_ready` tells later cells
# whether those names are usable.
print("=== Data Preparation ===")


def _fill_missing_values(features, columns):
    """Fill NaNs in ``columns`` of ``features`` in place.

    Numeric (non-boolean) columns receive their median. Every other column
    receives its most frequent value, or 'Unknown' when it has no non-null
    value at all.
    """
    for col in columns:
        column = features[col]
        is_numeric = (pd.api.types.is_numeric_dtype(column)
                      and not pd.api.types.is_bool_dtype(column))
        if is_numeric:
            fill_value = column.median()
        else:
            modes = column.mode()
            fill_value = modes.iloc[0] if len(modes) > 0 else 'Unknown'
        # Assign the filled column back. Calling fillna(inplace=True) on the
        # selection `features[col]` only updates a temporary object when
        # pandas Copy-on-Write is active, leaving the frame unchanged.
        features[col] = column.fillna(fill_value)


def _replace_infinite_values(features):
    """Replace +/-inf in numeric columns in place; return the affected columns.

    Each infinite entry becomes the median of that column's finite values
    (0.0 when the column has no finite value to take a median of).
    """
    affected = []
    for col in features.select_dtypes(include=[np.number]).columns:
        column = features[col]
        is_inf = np.isinf(column)
        if not is_inf.any():
            continue
        # Exclude the infinities themselves, otherwise the median can be inf.
        finite_median = column[~is_inf].median()
        if pd.isna(finite_median):
            finite_median = 0.0
        features[col] = column.mask(is_inf, finite_median)
        affected.append(col)
    return affected


def _encode_categoricals(features, columns, max_categories=50):
    """Label-encode ``columns`` in place, dropping the ones that are too wide.

    A column with at most ``max_categories`` distinct values is replaced by
    integer codes assigned in order of first appearance; a column with more
    is removed from ``features``.
    """
    for col in columns:
        n_levels = features[col].nunique(dropna=False)
        if n_levels <= max_categories:
            codes, _ = pd.factorize(features[col])
            features[col] = codes
        else:
            print(f"     Dropping {col} (too many categories: {n_levels})")
            features.drop(columns=col, inplace=True)


def _manual_split(features, target, train_fraction=0.8, seed=42):
    """Shuffle-split without scikit-learn; returns X_train, X_test, y_train, y_test.

    Uses its own seeded generator so that re-running the cell reproduces the
    same split, matching the fixed ``random_state`` of the scikit-learn path.
    """
    n_train = int(train_fraction * len(features))
    order = np.random.RandomState(seed).permutation(len(features))
    train_idx, test_idx = order[:n_train], order[n_train:]
    return (features.iloc[train_idx], features.iloc[test_idx],
            target.iloc[train_idx], target.iloc[test_idx])


data_ready = False

if not (data_source and has_target):
    print("❌ Cannot prepare data without target variable")
elif 'SalePrice' not in df.columns:
    # Only reachable if `df` was altered after the loading cell ran.
    print("❌ No target variable found")
    X, y = None, None
else:
    print("Preparing features and target variable...")

    # Separate features and target (drop returns a new frame, df is untouched)
    X = df.drop('SalePrice', axis=1)
    y = df['SalePrice']
    print(f"✅ Features (X): {X.shape}")
    print(f"✅ Target (y): {y.shape}")

    print("\n🧹 Cleaning data...")

    # Handle missing values in features
    null_counts = X.isnull().sum()
    missing_cols = list(null_counts[null_counts > 0].index)
    if missing_cols:
        print(f"   Found {len(missing_cols)} columns with missing values")
        _fill_missing_values(X, missing_cols)
        print("   ✅ Filled missing values")
    else:
        print("   ✅ No missing values found")

    # Handle infinite values
    inf_cols = _replace_infinite_values(X)
    if inf_cols:
        print(f"   ✅ Handled infinite values in {len(inf_cols)} columns")

    # Convert every remaining non-numeric column to integer codes
    categorical_cols = [
        col for col in X.columns
        if not pd.api.types.is_numeric_dtype(X[col])
    ]
    if categorical_cols:
        print(f"   🏷️ Found {len(categorical_cols)} categorical columns")
        _encode_categoricals(X, categorical_cols)
        print("   ✅ Encoded categorical variables")

    # Final check
    print("\n📊 Final dataset info:")
    print(f"   Features shape: {X.shape}")
    print(f"   Target shape: {y.shape}")
    print(f"   Feature types: {X.dtypes.value_counts().to_dict()}")

    # Split data into train and test sets
    if sklearn_available:
        print("\n✂️ Splitting data...")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        print(f"   Training set: {X_train.shape[0]} samples")
        print(f"   Test set: {X_test.shape[0]} samples")

        # Basic statistics
        print("\n📈 Target variable statistics:")
        print(f"   Training mean: ${y_train.mean():,.0f}")
        print(f"   Training std: ${y_train.std():,.0f}")
        print(f"   Test mean: ${y_test.mean():,.0f}")
        print(f"   Test std: ${y_test.std():,.0f}")
    else:
        print("⚠️ Scikit-learn not available - manual split needed")
        X_train, X_test, y_train, y_test = _manual_split(X, y)
        print("   ✅ Manual split completed")
        print(f"   Training set: {len(X_train)} samples")
        print(f"   Test set: {len(X_test)} samples")

    data_ready = True

if data_ready:
    print("\n🎉 Data preparation complete!")
    print("   Ready for model training!")
else:
    print("\n❌ Data preparation failed")
    print("   Please check your data and try again")