# XGBoost Model for Popularity Prediction

This notebook trains an XGBoost model to predict track popularity.

**Objectives:**
1. Load processed ML-ready data
2. Train XGBoost regression model
3. Evaluate model performance
4. Analyze feature importance
5. Save trained model

In [None]:
import json
import os
import time
import warnings
from datetime import datetime

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
# Silence library warnings so cell output stays readable
warnings.filterwarnings(action='ignore')

# Plot defaults shared by the figures in this notebook
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("✅ Libraries loaded successfully!")

## 1. Load Processed Data

In [None]:
# Feature matrices, one parquet file per split
X_train = pd.read_parquet('../data/processed/X_train.parquet')
X_test = pd.read_parquet('../data/processed/X_test.parquet')

# Targets are stored as parquet tables; squeeze() drops length-1 axes
# (presumably yielding a Series for a single-column target — verify upstream)
y_train = pd.read_parquet('../data/processed/y_train.parquet').squeeze()
y_test = pd.read_parquet('../data/processed/y_test.parquet').squeeze()

# Report split sizes, feature count and a summary of the training target
print(
    f"Training set: X={X_train.shape}, y={y_train.shape}",
    f"Test set: X={X_test.shape}, y={y_test.shape}",
    f"\nNumber of features: {X_train.shape[1]}",
    "\nTarget distribution:",
    sep="\n",
)
print(y_train.describe())

## 2. Initialize XGBoost Model

In [None]:
# Hyperparameters for the regressor, gathered in one mapping so they can be
# inspected or tweaked in a single place before the estimator is built
xgb_params = dict(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=3,
    gamma=0.1,
    random_state=42,
    n_jobs=-1,
    early_stopping_rounds=20,
    eval_metric='rmse',
)
model = XGBRegressor(**xgb_params)

print("XGBoost Model Configuration:")
print(model)

## 3. Train Model

In [None]:
print("Training XGBoost model...")

# Early stopping must not look at the test set: XGBoost monitors the LAST
# entry of eval_set, so passing (X_test, y_test) there would choose the number
# of trees on the very data used for the final evaluation. Carve a validation
# slice out of the training data instead. X_train / y_train themselves are
# left untouched, so later cells that use them keep working unchanged.
VAL_FRACTION = 0.1
shuffled_rows = np.random.default_rng(42).permutation(len(X_train))
n_val = max(1, int(len(X_train) * VAL_FRACTION))
val_rows, fit_rows = shuffled_rows[:n_val], shuffled_rows[n_val:]

X_fit, y_fit = X_train.iloc[fit_rows], y_train.iloc[fit_rows]
X_val, y_val = X_train.iloc[val_rows], y_train.iloc[val_rows]
print(f"Fitting on {len(X_fit)} rows, early stopping on {len(X_val)} held-out training rows")

start_time = time.time()

# Fit with early stopping; the validation slice is listed last so it is the
# set whose RMSE drives the stopping decision
model.fit(
    X_fit, y_fit,
    eval_set=[(X_fit, y_fit), (X_val, y_val)],
    verbose=50
)

training_time = time.time() - start_time
print(f"\n✅ Training complete in {training_time:.2f} seconds")

## 4. Make Predictions

In [None]:
# Score both splits with the fitted model
y_train_pred, y_test_pred = (model.predict(split) for split in (X_train, X_test))

print("Predictions generated")
print(f"Train predictions: {y_train_pred.shape}")
print(f"Test predictions: {y_test_pred.shape}")

## 5. Evaluate Model Performance

In [None]:
# Calculate metrics
def evaluate_model(y_true, y_pred, dataset_name=""):
    """Compute RMSE, MAE and R² for a set of predictions, print and return them.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y_true``.
        dataset_name: Label used in the printed header.

    Returns:
        dict with the keys ``'rmse'``, ``'mae'`` and ``'r2'``.
    """
    scores = {
        # RMSE is the square root of sklearn's mean squared error
        'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
        'mae': mean_absolute_error(y_true, y_pred),
        'r2': r2_score(y_true, y_pred),
    }

    print(f"\n{dataset_name} Metrics:")
    print(f"  RMSE: {scores['rmse']:.4f}")
    print(f"  MAE:  {scores['mae']:.4f}")
    print(f"  R²:   {scores['r2']:.4f}")

    return scores

# Score each split with the shared helper
train_metrics = evaluate_model(y_train, y_train_pred, "Training Set")
test_metrics = evaluate_model(y_test, y_test_pred, "Test Set")

# Train-vs-test gaps: a large positive value on either line points at overfitting
r2_gap = train_metrics['r2'] - test_metrics['r2']
rmse_gap = test_metrics['rmse'] - train_metrics['rmse']

print(f"\nOverfitting Check:")
print(f"  R² difference: {r2_gap:.4f}")
print(f"  RMSE difference: {rmse_gap:.4f}")

## 6. Visualize Results

In [None]:
# Prediction vs Actual plots

# savefig does not create missing folders, so make sure the plot directory
# exists before the first figure of the notebook is written
os.makedirs('../outputs/plots', exist_ok=True)

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# One entry per panel: (title prefix, actual, predicted, metrics, scatter kwargs)
panels = [
    ('Training Set', y_train, y_train_pred, train_metrics, {}),
    ('Test Set', y_test, y_test_pred, test_metrics, {'color': 'orange'}),
]

for ax, (split_label, actual, predicted, metrics, scatter_kwargs) in zip(axes, panels):
    ax.scatter(actual, predicted, alpha=0.3, s=10, **scatter_kwargs)
    # Dashed diagonal marks perfect predictions (predicted == actual)
    value_range = [actual.min(), actual.max()]
    ax.plot(value_range, value_range, 'r--', lw=2)
    ax.set_xlabel('Actual Popularity')
    ax.set_ylabel('Predicted Popularity')
    ax.set_title(f'{split_label} (R² = {metrics["r2"]:.4f})')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../outputs/plots/xgboost_predictions.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Residuals on the test split: actual minus predicted
residuals = y_test - y_test_pred

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
scatter_ax, hist_ax = axes

# Left panel: residuals against predictions, with a zero reference line
scatter_ax.scatter(y_test_pred, residuals, alpha=0.3, s=10)
scatter_ax.axhline(y=0, color='r', linestyle='--', lw=2)
scatter_ax.set_xlabel('Predicted Popularity')
scatter_ax.set_ylabel('Residuals')
scatter_ax.set_title('Residual Plot')

# Right panel: histogram of the residuals, with a zero reference line
hist_ax.hist(residuals, bins=50, edgecolor='black', alpha=0.7)
hist_ax.axvline(x=0, color='r', linestyle='--', lw=2)
hist_ax.set_xlabel('Residuals')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Residual Distribution')

for panel in (scatter_ax, hist_ax):
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../outputs/plots/xgboost_residuals.png', dpi=300, bbox_inches='tight')
plt.show()

## 7. Feature Importance Analysis

In [None]:
# Pair every input column with the model's importance score, highest first
importance_by_feature = {
    'feature': X_train.columns,
    'importance': model.feature_importances_,
}
feature_importance = pd.DataFrame(importance_by_feature).sort_values(
    'importance', ascending=False
)

print("Top 20 Most Important Features:")
print(feature_importance.head(20))

In [None]:
# Horizontal bar chart of the 20 highest-ranked features
top_features = feature_importance.head(20)
bar_positions = range(len(top_features))

imp_fig, imp_ax = plt.subplots(figsize=(10, 8))
imp_ax.barh(bar_positions, top_features['importance'], color='steelblue')
imp_ax.set_yticks(bar_positions)
imp_ax.set_yticklabels(top_features['feature'])
imp_ax.set_xlabel('Feature Importance')
imp_ax.set_title('Top 20 Feature Importances (XGBoost)', fontweight='bold')
# Flip the y axis so the most important feature sits at the top
imp_ax.invert_yaxis()

imp_fig.tight_layout()
imp_fig.savefig('../outputs/plots/xgboost_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

## 8. Save Model and Metadata

In [None]:
import os

# Target locations for the two serialized copies of the model
model_path = '../outputs/models/xgboost_popularity_model.joblib'
model_path_native = '../outputs/models/xgboost_popularity_model.json'

# Create the model directory if it is missing
os.makedirs('../outputs/models', exist_ok=True)

# Copy 1: joblib dump of the estimator object
joblib.dump(model, model_path)
print(f"✅ Model saved to {model_path}")

# Copy 2: XGBoost's own save_model format
model.save_model(model_path_native)
print(f"✅ Model saved to {model_path_native} (XGBoost native format)")

In [None]:
# Save model metadata

# json.dump only understands builtin numbers, so numeric values are cast to
# builtin float/int explicitly instead of relying on pandas/numpy to unbox them
top_10_features = [
    {'feature': feature_name, 'importance': float(score)}
    for feature_name, score in feature_importance.head(10)[['feature', 'importance']].itertuples(
        index=False, name=None
    )
]

tracked_hyperparameters = (
    'n_estimators',
    'max_depth',
    'learning_rate',
    'subsample',
    'colsample_bytree',
    'min_child_weight',
    'gamma',
)

metadata = {
    'model_type': 'XGBoost Regressor',
    'target_variable': 'popularity',
    'n_features': int(X_train.shape[1]),
    'n_train_samples': len(X_train),
    'n_test_samples': len(X_test),
    'training_date': datetime.now().isoformat(),
    'training_time_seconds': float(training_time),
    'hyperparameters': {name: getattr(model, name) for name in tracked_hyperparameters},
    # n_estimators is only the configured value; with early stopping the
    # selected round can differ, so record it when the model exposes it
    'best_iteration': getattr(model, 'best_iteration', None),
    'performance': {
        'train': {metric: float(value) for metric, value in train_metrics.items()},
        'test': {metric: float(value) for metric, value in test_metrics.items()},
    },
    'top_10_features': top_10_features,
}

metadata_path = '../outputs/models/model_metadata.json'
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"✅ Metadata saved to {metadata_path}")

In [None]:
# Write the full importance ranking next to the model files (index omitted)
feature_importance.to_csv(
    '../outputs/models/feature_importance.csv',
    index=False,
)
print("✅ Feature importance saved to feature_importance.csv")

## 9. Model Summary

In [None]:
# Final recap: headline test metrics, top features and the artifacts written
banner = "=" * 80

print("\n" + banner)
print("XGBOOST MODEL TRAINING COMPLETE")
print(banner)
print(f"\nModel Performance:")
print(f"  Test R²: {test_metrics['r2']:.4f}")
print(f"  Test RMSE: {test_metrics['rmse']:.4f}")
print(f"  Test MAE: {test_metrics['mae']:.4f}")
print(f"\nTop 5 Important Features:")
# feature_importance keeps its original index labels after sorting, so the
# rank must come from enumerate rather than from the row index
for rank, row in enumerate(feature_importance.head(5).itertuples(index=False), start=1):
    print(f"  {rank}. {row.feature}: {row.importance:.4f}")
print(f"\nFiles Saved:")
print(f"  ✓ xgboost_popularity_model.joblib")
print(f"  ✓ xgboost_popularity_model.json")
print(f"  ✓ model_metadata.json")
print(f"  ✓ feature_importance.csv")
print(f"  ✓ Visualization plots (PNG)")
print("\n" + banner)