### Train base models and compare performance

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read the CSV, add an `hour` column taken from the parsed `date` column,
# and drop `date` so only the derived feature remains.
df = (
    pd.read_csv('../data/energydata_complete.csv')
    .assign(hour=lambda frame: pd.to_datetime(frame['date']).dt.hour)
    .drop(columns=['date'])
)

# Names of the columns whose dtype is one of the listed integer/float types.
numeric_cols = df.select_dtypes(include=['int64', 'float64', 'int32']).columns
print(numeric_cols)

Index(['Appliances', 'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4',
       'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9',
       'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility',
       'Tdewpoint', 'rv1', 'rv2', 'hour'],
      dtype='object')


In [3]:
# Separate the response column from the predictors.
target = "Appliances"
y = df[target]
X = df.drop(columns=[target])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
# NOTE(review): train_test_split shuffles rows by default, and the source file
# carried a `date` column before it was dropped -- confirm that a random
# (non-chronological) split is what is intended here.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Performance is judged by RMSE (error magnitude) and R^2 (overall trend).
def evaluate(model, name):
    """Fit ``model`` on the training split and print its validation metrics.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.
    The model is fitted in place; nothing is returned.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        Estimator or pipeline to train and score.
    name : str
        Label printed on the first line of the report.
    """
    model.fit(X_train, y_train)
    y_hat = model.predict(X_val)

    # Build the whole report first, then emit it in one write.
    report = [
        f"Model: {name}",
        f"  RMSE: {np.sqrt(mean_squared_error(y_val, y_hat)):.3f}",
        f"  R^2:  {r2_score(y_val, y_hat):.3f}",
        "-" * 40,
    ]
    print("\n".join(report))

In [4]:
# Linear-family baselines. Each estimator sits behind a StandardScaler step,
# so the scaler is fitted only on the data passed to `fit`.
linear_pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", LinearRegression()),
])
ridge_pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", Ridge(alpha=1.0)),
])
lasso_pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", Lasso(alpha=0.01)),
])

# Fit and report the three pipelines in the same order as they are defined.
for _label, _pipe in (
    ("Linear Regression", linear_pipeline),
    ("Ridge Regression", ridge_pipeline),
    ("Lasso Regression", lasso_pipeline),
):
    evaluate(_pipe, _label)

Model: Linear Regression
  RMSE: 91.055
  R^2:  0.171
----------------------------------------
Model: Ridge Regression
  RMSE: 91.054
  R^2:  0.172
----------------------------------------
Model: Lasso Regression
  RMSE: 91.050
  R^2:  0.172
----------------------------------------


In [5]:

# Non-linear baselines, each fitted and scored with `evaluate`.
# The tree-based models are fitted on the unscaled features; only SVR is
# wrapped in a pipeline with a StandardScaler step.
dt_model = DecisionTreeRegressor(max_depth=None, random_state=42)
evaluate(dt_model, "Decision Tree")

rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    random_state=42
)
evaluate(rf_model, "Random Forest")

# Seeded like the other tree models in this cell: without random_state the
# per-split feature permutation is unseeded, so the reported score is not
# guaranteed to be reproducible across runs.
gb_model = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.1,
    random_state=42
)
evaluate(gb_model, "Gradient Boosting")

svr_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", SVR(kernel="rbf", C=10, epsilon=0.1))
])
evaluate(svr_pipeline, "SVR (RBF)")


Model: Decision Tree
  RMSE: 88.978
  R^2:  0.209
----------------------------------------
Model: Random Forest
  RMSE: 67.308
  R^2:  0.547
----------------------------------------
Model: Gradient Boosting
  RMSE: 80.524
  R^2:  0.352
----------------------------------------
Model: SVR (RBF)
  RMSE: 93.824
  R^2:  0.120
----------------------------------------


### Fine-tune the random forest

In [7]:
# Reference point for tuning: the random-forest settings used for the base-model
# comparison (200 trees, unlimited depth, seed 42), here with n_jobs=-1 so
# fitting can use all available cores.
# No imports in this cell: RandomForestRegressor, the two metrics and numpy
# are already imported in the notebook's first cell.
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)
preds = rf.predict(X_val)

# Same two validation metrics as the base-model comparison.
rmse = np.sqrt(mean_squared_error(y_val, preds))
r2 = r2_score(y_val, preds)

print(f"RMSE: {rmse:.3f}, R2: {r2:.3f}")

RMSE: 67.308, R2: 0.547


In [8]:
# Exhaustive search over random-forest hyperparameters.
# GridSearchCV is imported with the other sklearn names in the first cell.
# Grid size: 3 * 3 * 3 * 3 * 2 = 162 candidates; with cv=3 that is 486 fits
# plus one final refit of the best candidate on all of X_train.
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Template estimator; GridSearchCV fits clones of it, one per candidate and fold.
rf = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    scoring='r2',   # candidates are ranked by mean cross-validated R^2
    n_jobs=-1       # run the candidate fits in parallel
)

# Only the training split is searched; X_val stays untouched until scoring below.
grid_search.fit(X_train, y_train)

print("Best params:", grid_search.best_params_)
best_rf = grid_search.best_estimator_

# Score the refitted best model on the held-out validation split.
preds = best_rf.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, preds))
r2 = r2_score(y_val, preds)

print(f"Tuned RMSE: {rmse:.3f}, R2: {r2:.3f}")




Best params: {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
Tuned RMSE: 63.679, R2: 0.595
