In [6]:
!pip install scikit-learn




[notice] A new release of pip available: 22.3 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Baseline forecast of the "Appliances" column: a persistence ("naive")
# forecast compared against a linear regression on calendar features and
# lagged values of the target.

TARGET = "Appliances"
TRAIN_FRACTION = 0.8  # leading share of rows used for fitting; the remainder is the test set

df = pd.read_csv("data/raw/energydata_complete.csv", parse_dates=["date"], skipinitialspace=True)

# shift() looks back by row position, so the lag features below are only
# meaningful when rows are in chronological order. The stable sort leaves an
# already-ordered file unchanged.
df = df.sort_values("date", kind="stable").reset_index(drop=True)

# Calendar features derived from the timestamp.
df["hour"] = df["date"].dt.hour
df["dow"] = df["date"].dt.dayofweek  # Monday=0 ... Sunday=6
df["month"] = df["date"].dt.month

# Lags are counted in rows, not in time units.
# NOTE(review): presumably the data is sampled every 10 minutes, which would
# make these 10 min / 1 h / 4 h / 1 day look-backs — confirm against the dataset.
lags = [1, 6, 24, 144]
for L in lags:
    df[f"lag_{L}"] = df[TARGET].shift(L)

# Removes every row with a missing value in any column; this includes the
# first max(lags) rows, where the longest lag is undefined.
df = df.dropna().reset_index(drop=True)

cols = ["hour", "dow", "month"] + [f"lag_{L}" for L in lags]
X = df[cols]
y = df[TARGET]

# Chronological split without shuffling: the model is fitted on the earliest
# rows and evaluated on the rows that follow them.
split = int(len(df) * TRAIN_FRACTION)
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

# Persistence baseline: predict the previous observation. lag_1 was computed
# before any rows were dropped, so it always holds the preceding row's value.
# Re-shifting y at this point would pair each row with the wrong predecessor
# whenever dropna() removed a row from the middle of the series.
naive_pred = X_test["lag_1"]
mae_naive = mean_absolute_error(y_test, naive_pred)
rmse_naive = np.sqrt(mean_squared_error(y_test, naive_pred))

model = LinearRegression()
model.fit(X_train, y_train)
pred = model.predict(X_test)

mae_lr = mean_absolute_error(y_test, pred)
rmse_lr = np.sqrt(mean_squared_error(y_test, pred))

# Side-by-side error metrics for the two forecasts on the held-out rows.
print(pd.DataFrame({
    "Model": ["Naive", "Linear Regression"],
    "MAE": [mae_naive, mae_lr],
    "RMSE": [rmse_naive, rmse_lr]
}))


               Model        MAE       RMSE
0              Naive  26.047461  65.185076
1  Linear Regression  25.910229  59.930032
