In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

In [6]:
# Processed-data folder: <parent of the current working directory>/data/processed.
# Resolved from the working directory, so the result depends on where the kernel was started.
processed_dir = Path.cwd().parent.joinpath('data', 'processed')

In [7]:
# Load both processed CSVs from the processed-data folder:
#   df_clean <- data_clean_processed.csv
#   df       <- data_processed.csv
df_clean = pd.read_csv(processed_dir.joinpath("data_clean_processed.csv"))
df = pd.read_csv(processed_dir.joinpath("data_processed.csv"))

In [None]:
# Restore the two persisted scaler objects saved next to the processed data.
# NOTE(review): joblib.load unpickles the file, so only load artifacts produced by this project.
scaler = joblib.load(processed_dir.joinpath('scaler_original.pkl'))
scaler_clean = joblib.load(processed_dir.joinpath('scaler_clean.pkl'))

In [9]:
# Remove the "Unnamed: 0" column from both frames
# (presumably the index column written out when the CSVs were saved - TODO confirm).
#
# The frames are rebound instead of mutated in place, and errors="ignore" is passed,
# so this cell is idempotent: re-running it after the column is already gone no longer
# raises KeyError. `columns=` already selects the axis, so the redundant `axis=1` is dropped.
df = df.drop(columns="Unnamed: 0", errors="ignore")
df_clean = df_clean.drop(columns="Unnamed: 0", errors="ignore")

In [12]:
# Predictor columns: every column of `df` except the target 'BeatsPerMinute'.
# The same column list is applied to both frames below.
features = [name for name in df.columns if name != 'BeatsPerMinute']

# Original dataset: feature matrix and target vector
X, y = df[features], df['BeatsPerMinute']

# Cleaned dataset: feature matrix and target vector
X_clean, y_clean = df_clean[features], df_clean['BeatsPerMinute']

In [18]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split

In [19]:
# Split the original dataset into train and test parts with a fixed seed.
# test_size=0.25 is scikit-learn's default when neither size is given; it is spelled out for readers.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Candidate regressors, keyed by the label used in the summary printed below
models = {
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(random_state=42, n_jobs=-1),
    'XGBoost': XGBRegressor(random_state=42, n_jobs=-1, verbosity=1),  # n_jobs=-1 uses all cores
}

# Fit each candidate on the training split and score it on the held-out split
results = {}
for name, model in models.items():
    # Single-step pipeline wrapping the estimator
    pipe = Pipeline(steps=[('model', model)])
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = pipe.score(X_test, y_test)  # score() of a regressor is R^2 on the given data

    results[name] = dict(RMSE=rmse, R2=r2)

# One summary line per model
for name, metrics in results.items():
    print(f"{name} - RMSE: {metrics['RMSE']:.2f}, R2: {metrics['R2']:.3f}")


GradientBoosting - RMSE: 26.51, R2: 0.000
RandomForest - RMSE: 26.75, R2: -0.018
XGBoost - RMSE: 26.61, R2: -0.007
