## 03. Modeling and Evaluation

### 1. Setup and Data Loading

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# Load processed data
try:
    df = pd.read_csv('data/processed/processed_data.csv')
    print(f"Dataset Shape: {df.shape}")
except FileNotFoundError:
    print("Processed data not found. Please run 01_Data_Preprocessing.ipynb first.")

Dataset Shape: (5000, 36)


### 2. Data Preparation

In [2]:
# Define Features and Targets
# For Regression: Predict Engagement_Rate. We must drop Engagement_Rate (target) and Engagement_Level (leakage/irrelevant).
X_reg = df.drop(columns=['Engagement_Rate', 'Engagement_Level_Encoded'])
y_reg = df['Engagement_Rate']

# For Classification: Predict Engagement_Level. We can use Engagement_Rate as a feature as it aggregates interactions.
X_clf = df.drop(columns=['Engagement_Level_Encoded'])
y_clf = df['Engagement_Level_Encoded']

# Split Data for Regression
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# Split Data for Classification
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=42
)

# Scale Features (Important for Linear/Logistic Regression)
scaler_r = StandardScaler()
X_train_r_scaled = scaler_r.fit_transform(X_train_r)
X_test_r_scaled = scaler_r.transform(X_test_r)

scaler_c = StandardScaler()
X_train_c_scaled = scaler_c.fit_transform(X_train_c)
X_test_c_scaled = scaler_c.transform(X_test_c)

print("Data Split and Scaled.")

Data Split and Scaled.


### 3. Regression Modeling (Predicting Engagement Rate)

In [4]:
reg_models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42)
}

reg_params = {
    'Linear Regression': {},
    'Random Forest': {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5]
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [3, 5]
    },
    'XGBoost': {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [3, 5]
    }
}

In [5]:
reg_results = {}
best_reg_models = {}

print("Training and Tuning Regression Models...")
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for name, model in reg_models.items():
    print(f"\nProcessing {name}...")
    
    # Select data
    if name == 'Linear Regression':
        X_train = X_train_r_scaled
        X_test = X_test_r_scaled
    else:
        X_train = X_train_r
        X_test = X_test_r
        
    # K-Fold CV (Baseline)
    cv_scores = cross_val_score(model, X_train, y_train_r, cv=kf, scoring='r2')
    print(f"  Baseline CV R2: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")
    
    # GridSearchCV
    if reg_params[name]:
        print(f"  Tuning {name}...")
        grid = GridSearchCV(model, reg_params[name], cv=3, scoring='r2', n_jobs=-1)
        grid.fit(X_train, y_train_r)
        best_model = grid.best_estimator_
        print(f"  Best Params: {grid.best_params_}")
    else:
        best_model = model
        best_model.fit(X_train, y_train_r)
        
    best_reg_models[name] = best_model
    
    # Evaluate on Test Set
    preds = best_model.predict(X_test)
    mse = mean_squared_error(y_test_r, preds)
    r2 = r2_score(y_test_r, preds)
    
    reg_results[name] = {
        'CV Mean R2': cv_scores.mean(),
        'Test MSE': mse, 
        'Test R2': r2
    }
    print(f"  Test MSE: {mse:.4f}, Test R2: {r2:.4f}")

Training and Tuning Regression Models...

Processing Linear Regression...
  Baseline CV R2: 1.0000 (+/- 0.0000)
  Test MSE: 0.0000, Test R2: 1.0000

Processing Random Forest...
  Baseline CV R2: 0.9301 (+/- 0.0731)
  Tuning Random Forest...
  Best Params: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}
  Test MSE: 493854.1665, Test R2: 0.4605

Processing Gradient Boosting...
  Baseline CV R2: 0.9333 (+/- 0.0741)
  Tuning Gradient Boosting...
  Best Params: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200}
  Test MSE: 439839.7384, Test R2: 0.5195

Processing XGBoost...
  Baseline CV R2: 0.5327 (+/- 0.2102)
  Tuning XGBoost...
  Best Params: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100}
  Test MSE: 661703.2962, Test R2: 0.2771


### 4. Classification Modeling (Predicting Engagement Level)

In [7]:
clf_models = {
    'Logistic Regression': LogisticRegression(max_iter=2000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
}

clf_params = {
    'Logistic Regression': {'C': [0.1, 1, 10]},
    'Random Forest': {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None]
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1]
    },
    'XGBoost': {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [3, 5]
    }
}