## 03. Modeling and Evaluation

### 1. Setup and Data Loading

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report, confusion_matrix, f1_score

# Load processed data.
# Every later cell depends on `df`, so a missing file must stop the run here:
# print the actionable hint, then re-raise instead of continuing with `df`
# undefined (which would only surface later as an unrelated NameError).
try:
    df = pd.read_csv('data/processed/processed_data.csv')
except FileNotFoundError:
    print("Processed data not found. Please run 01_Data_Preprocessing.ipynb first.")
    raise

print(f"Dataset Shape: {df.shape}")

Dataset Shape: (5000, 36)


### 2. Data Preparation

In [2]:
# Define Features and Targets
# The two targets are treated as leaking into each other, so neither may appear
# in the feature matrix of the other task.
# NOTE(review): this assumes Engagement_Level_Encoded is derived from
# Engagement_Rate (e.g. by binning) -- confirm against 01_Data_Preprocessing.ipynb.

# For Regression: predict Engagement_Rate. Drop the target itself and the
# encoded engagement level (leakage).
X_reg = df.drop(columns=['Engagement_Rate', 'Engagement_Level_Encoded'])
y_reg = df['Engagement_Rate']

# For Classification: predict Engagement_Level_Encoded. Engagement_Rate is
# dropped as well; keeping it would let the classifier read the label almost
# directly from that single column and inflate every reported metric.
X_clf = df.drop(columns=['Engagement_Level_Encoded', 'Engagement_Rate'])
y_clf = df['Engagement_Level_Encoded']

# Split Data for Regression
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# Split Data for Classification
# stratify keeps the class proportions of y_clf identical in train and test,
# so accuracy/F1 are not distorted by an unlucky draw of a minority class.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=42, stratify=y_clf
)

# Scale Features (Important for Linear/Logistic Regression)
# Each scaler is fitted on its training split only and then applied to the
# matching test split, so no test-set statistics leak into training.
scaler_r = StandardScaler()
X_train_r_scaled = scaler_r.fit_transform(X_train_r)
X_test_r_scaled = scaler_r.transform(X_test_r)

scaler_c = StandardScaler()
X_train_c_scaled = scaler_c.fit_transform(X_train_c)
X_test_c_scaled = scaler_c.transform(X_test_c)

print("Data Split and Scaled.")

Data Split and Scaled.
