# 8. Model Training

Train and optimize multiple machine learning models using the selected features. Compare different algorithms and hyperparameters to find the best-performing model.

In [None]:
# Model Training Setup
# Restrict the train/test matrices to the manually selected features that are
# present in X_train, and report what was kept and what was skipped.
print("🤖 Model Training Setup")
print("="*50)

available_features = [f for f in manual_selected_features if f in X_train.columns]

# Surface features that were requested but do not exist in X_train, instead of
# dropping them silently.
missing_features = [f for f in manual_selected_features if f not in X_train.columns]
if missing_features:
    print(f"⚠️ {len(missing_features)} selected feature(s) not found in X_train and skipped: {missing_features}")

# Fail here with a clear message rather than later inside the scaler/model fit
# (an empty selection yields a zero-column training matrix).
if not available_features:
    raise ValueError(
        "None of the manually selected features are present in X_train; "
        "cannot prepare training data."
    )

print(f"Using top correlation features: {available_features[:5]}...")  # Show first 5
print(f"Selected {len(available_features)} features from top importance analysis:")
for i, feature in enumerate(available_features, 1):
    print(f"  {i:2d}. {feature}")

# Prepare training data with selected features (copies, so later edits to the
# selected matrices do not write through to X_train / X_test)
X_train_selected = X_train[available_features].copy()
X_test_selected = X_test[available_features].copy()

# Share of the original X_train columns that were left out of the selection.
total_feature_count = len(X_train.columns)
feature_reduction_pct = (total_feature_count - len(available_features)) / total_feature_count * 100

print(f"\n📊 Training Data Preparation:")
print(f"  • Selected features: {len(available_features)}")
print(f"  • Training samples: {len(X_train_selected):,}")
print(f"  • Test samples: {len(X_test_selected):,}")
print(f"  • Feature reduction: {feature_reduction_pct:.1f}%")

print(f"\n✅ Model training setup complete")


In [None]:
# Train Multiple Models with Optimized Parameters
# Fits every configured regressor on the selected features, scores it on the
# train and test splits, and collects one result row per model in
# `model_results` / `model_results_df` (sorted by Test R², best first).
print("🏋️ Training Multiple Models with Enhanced Configuration")
print("="*50)

import time

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, HuberRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Advanced models for better noise handling
import xgboost as xgb
import catboost as cb

# Standalone scaled matrices. The scaled models below no longer need them
# (each carries its own scaler inside a Pipeline); they are still produced so
# any other cell that reads `scaler` / `X_*_scaled` keeps working.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# Define models to train with optimized parameters
models = {
    'Linear Regression': LinearRegression(),
    'K-Neighbors': KNeighborsRegressor(n_neighbors=5),
    # 'Support Vector': SVR(kernel='rbf', C=1.0, epsilon=0.01),

    # Optimized XGBoost (reduced overfitting)
    'XGBoost (Tuned)': xgb.XGBRegressor(
        n_estimators=50,
        learning_rate=0.05,
        max_depth=4,
        reg_alpha=0.1,
        reg_lambda=0.1,
        random_state=42,
        verbosity=0
    ),

    # Optimized CatBoost
    'CatBoost (Tuned)': cb.CatBoostRegressor(
        iterations=75,
        learning_rate=0.08,
        depth=4,
        l2_leaf_reg=3,
        random_seed=42,
        verbose=False
    ),

    'Huber Regressor': HuberRegressor(epsilon=1.35, alpha=0.001),
    'Random Forest': RandomForestRegressor(n_estimators=50, max_depth=8, min_samples_split=10, random_state=42),
}

# Scaled variants bundle the scaler with the estimator. The stored model object
# therefore accepts the same raw feature matrix as every other model, so it can
# be used for prediction later without the caller having to remember to scale.
scaled_models = {
    "K-Neighbors (Scaled)": Pipeline([
        ("scaler", StandardScaler()),
        ("model", KNeighborsRegressor(n_neighbors=5)),
    ]),
    # "Support Vector (Scaled)": Pipeline([("scaler", StandardScaler()), ("model", SVR(kernel="rbf", C=1.0, epsilon=0.01))]),
}


def _evaluate_model(name, model):
    """Fit `model` on the selected training features and return its result row.

    Reads X_train_selected, X_test_selected, y_train and y_test from the
    notebook namespace. "Training Time" covers only the `fit` call, so
    prediction and metric computation do not inflate it.

    Returns a dict with the keys "Model", "Train R²", "Test R²", "Train RMSE",
    "Test RMSE", "Training Time" and "Model Object" (the fitted estimator).
    """
    # perf_counter is a monotonic, high-resolution clock intended for intervals.
    fit_start = time.perf_counter()
    model.fit(X_train_selected, y_train)
    training_time = time.perf_counter() - fit_start

    train_pred = model.predict(X_train_selected)
    test_pred = model.predict(X_test_selected)

    return {
        "Model": name,
        "Train R²": r2_score(y_train, train_pred),
        "Test R²": r2_score(y_test, test_pred),
        "Train RMSE": np.sqrt(mean_squared_error(y_train, train_pred)),
        "Test RMSE": np.sqrt(mean_squared_error(y_test, test_pred)),
        "Training Time": training_time,
        "Model Object": model,
    }


def _print_result_row(row):
    """Print one result row using the same column widths as the table header."""
    print(f"{row['Model']:<22} {row['Train R²']:<10.4f} {row['Test R²']:<10.4f} {row['Train RMSE']:<12.4f} {row['Test RMSE']:<12.4f} {row['Training Time']:<10.2f}")


# Train models and store results
model_results = []

print("Training and evaluating models...")
# Inner quotes are single so the f-strings also parse on Python < 3.12.
print(f"{'Model':<22} {'Train R²':<10} {'Test R²':<10} {'Train RMSE':<12} {'Test RMSE':<12} {'Time (s)':<10}")
print("-" * 95)

# Train regular models
for name, model in models.items():
    result_row = _evaluate_model(name, model)
    model_results.append(result_row)
    _print_result_row(result_row)

# Train scaled versions
print(f"\n{'Scaled Models':<22} {'Train R²':<10} {'Test R²':<10} {'Train RMSE':<12} {'Test RMSE':<12} {'Time (s)':<10}")
print("-" * 95)

for name, model in scaled_models.items():
    result_row = _evaluate_model(name, model)
    model_results.append(result_row)
    _print_result_row(result_row)

print(f"\n✅ All models trained successfully")

# Sort by Test R² score
model_results_df = pd.DataFrame(model_results)
model_results_df = model_results_df.sort_values("Test R²", ascending=False)

print(f"\nModel training complete - {len(model_results)} models evaluated")

In [None]:
# Model Performance Analysis
# Ranks the trained models by Test R², reports the train/test gap per model,
# groups models into R² bands, and stores the top-ranked fitted model in
# `best_trained_model`.
print("📊 Model Performance Analysis")
print("="*50)

# Add overfitting calculation (positive = better on train than on test)
model_results_df["Overfitting"] = model_results_df["Train R²"] - model_results_df["Test R²"]

# Sort by test R² score
results_sorted = model_results_df.sort_values("Test R²", ascending=False)

print("🏆 MODEL LEADERBOARD (by Test R²):")
# Model column is 22 wide so the longest model names stay aligned; inner quotes
# are single so the f-strings also parse on Python < 3.12.
print(f"{'Rank':<4} {'Model':<22} {'Test R²':<10} {'Test RMSE':<12} {'Overfitting':<12} {'Time (s)':<10}")
print("-" * 80)

for rank, (_, row) in enumerate(results_sorted.iterrows(), 1):
    print(f"{rank:<4} {row['Model']:<22} {row['Test R²']:<10.4f} {row['Test RMSE']:<12.4f} {row['Overfitting']:<12.4f} {row['Training Time']:<10.2f}")

# Identify best models
# NOTE: the winner is picked on the test split itself, so its reported test
# score is an optimistic estimate of performance on new data.
best_model = results_sorted.iloc[0]
least_overfit = model_results_df.loc[model_results_df["Overfitting"].abs().idxmin()]
fastest_model = model_results_df.loc[model_results_df["Training Time"].idxmin()]

print(f"\n🎯 KEY FINDINGS:")
print(f"  • Best Performance: {best_model['Model']} (R² = {best_model['Test R²']:.4f})")
print(f"  • Least Overfitting: {least_overfit['Model']} (Δ = {least_overfit['Overfitting']:.4f})")
print(f"  • Fastest Training: {fastest_model['Model']} ({fastest_model['Training Time']:.2f}s)")

# Performance categories (half-open bands on Test R²; "Poor" covers everything
# at or below 0.6 so every scored model is listed somewhere)
test_r2_column = model_results_df["Test R²"]
excellent_models = model_results_df[test_r2_column > 0.9]
good_models = model_results_df[(test_r2_column > 0.8) & (test_r2_column <= 0.9)]
fair_models = model_results_df[(test_r2_column > 0.6) & (test_r2_column <= 0.8)]
poor_models = model_results_df[test_r2_column <= 0.6]

print(f"\n📈 PERFORMANCE CATEGORIES:")
for category_label, category_models in (
    ("Excellent (R² > 0.9)", excellent_models),
    ("Good (R² 0.8-0.9)", good_models),
    ("Fair (R² 0.6-0.8)", fair_models),
    ("Poor (R² ≤ 0.6)", poor_models),
):
    print(f"  • {category_label}: {len(category_models)} models")
    if len(category_models) > 0:
        print(f"    - {', '.join(category_models['Model'].tolist())}")

# Best model details
print(f"\n🏅 RECOMMENDED MODEL: {best_model['Model']}")
print(f"  • Training R²: {best_model['Train R²']:.4f}")
print(f"  • Test R²: {best_model['Test R²']:.4f}")
print(f"  • Test RMSE: {best_model['Test RMSE']:.4f}")
print(f"  • Train RMSE: {best_model['Train RMSE']:.4f}")
print(f"  • Overfitting: {best_model['Overfitting']:.4f}")
print(f"  • Training Time: {best_model['Training Time']:.2f}s")

# Store best model for evaluation section
best_trained_model = best_model["Model Object"]

In [None]:
# Visualize Model Performance
# Draws a 2x2 comparison figure from model_results_df (R², RMSE, train/test gap,
# and Test R² against training time), then prints summary statistics.
# Requires the "Overfitting" column to already exist on model_results_df.
print("📈 Model Performance Visualization")
print("="*50)

# Create comprehensive visualization
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Model Performance Comparison', fontsize=16, fontweight='bold')

# 1. R² Score Comparison
models_list = model_results_df['Model'].tolist()
train_r2 = model_results_df['Train R²'].tolist()
test_r2 = model_results_df['Test R²'].tolist()

# One numeric slot per model; shared by all three bar charts so their tick
# positions and labels line up the same way.
x = np.arange(len(models_list))
width = 0.35

ax1.bar(x - width/2, train_r2, width, label='Train R²', alpha=0.8)
ax1.bar(x + width/2, test_r2, width, label='Test R²', alpha=0.8)
ax1.set_xlabel('Models')
ax1.set_ylabel('R² Score')
ax1.set_title('R² Score: Train vs Test')
ax1.set_xticks(x)
ax1.set_xticklabels(models_list, rotation=45, ha='right')
ax1.legend()
ax1.grid(True, alpha=0.3)

# 2. RMSE Comparison
train_rmse = model_results_df['Train RMSE'].tolist()
test_rmse = model_results_df['Test RMSE'].tolist()

ax2.bar(x - width/2, train_rmse, width, label='Train RMSE', alpha=0.8)
ax2.bar(x + width/2, test_rmse, width, label='Test RMSE', alpha=0.8)
ax2.set_xlabel('Models')
ax2.set_ylabel('RMSE')
ax2.set_title('RMSE: Train vs Test')
ax2.set_xticks(x)
ax2.set_xticklabels(models_list, rotation=45, ha='right')
ax2.legend()
ax2.grid(True, alpha=0.3)

# 3. Overfitting Analysis
overfitting = model_results_df['Overfitting'].tolist()
# Colour thresholds match the dashed reference lines drawn below.
colors = ['green' if gap <= 0.1 else 'orange' if gap <= 0.2 else 'red' for gap in overfitting]

# Plot against numeric positions and fix the ticks before labelling them, as
# for ax1/ax2; labelling an axis whose ticks are not fixed makes matplotlib warn.
bars = ax3.bar(x, overfitting, color=colors, alpha=0.7)
ax3.set_xlabel('Models')
ax3.set_ylabel('Overfitting (Train R² - Test R²)')
ax3.set_title('Overfitting Analysis')
ax3.set_xticks(x)
ax3.set_xticklabels(models_list, rotation=45, ha='right')
ax3.axhline(y=0.1, color='orange', linestyle='--', alpha=0.7, label='Moderate (0.1)')
ax3.axhline(y=0.2, color='red', linestyle='--', alpha=0.7, label='High (0.2)')
ax3.legend()
ax3.grid(True, alpha=0.3)

# 4. Performance vs Training Time
training_times = model_results_df['Training Time'].tolist()
test_r2_scores = model_results_df['Test R²'].tolist()

scatter = ax4.scatter(training_times, test_r2_scores, s=100, alpha=0.7)
# Label points with the full model name: truncated names are ambiguous for
# models that share a prefix (e.g. "K-Neighbors" / "K-Neighbors (Scaled)").
for model_name, time_value, r2_value in zip(models_list, training_times, test_r2_scores):
    ax4.annotate(model_name, (time_value, r2_value),
                 xytext=(5, 5), textcoords='offset points', fontsize=8)
ax4.set_xlabel('Training Time (seconds)')
ax4.set_ylabel('Test R² Score')
ax4.set_title('Performance vs Training Time')
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summary statistics
print(f"\n📊 SUMMARY STATISTICS:")
print(f"  • Average Test R²: {model_results_df['Test R²'].mean():.4f}")
print(f"  • Best Test R²: {model_results_df['Test R²'].max():.4f}")
print(f"  • Average RMSE: {model_results_df['Test RMSE'].mean():.4f}")
print(f"  • Best RMSE: {model_results_df['Test RMSE'].min():.4f}")
print(f"  • Average Training Time: {model_results_df['Training Time'].mean():.2f}s")
print(f"  • Models with R² > 0.8: {len(model_results_df[model_results_df['Test R²'] > 0.8])}/{len(model_results_df)}")

print(f"\n✅ Model training visualization complete")