Preprocessing

In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

def preprocessing(csv_path="HomeC.csv", test_size=0.2, random_state=42):
    """
    Load the CSV, keep the modelling columns, and build scaled train/test arrays.

    Args:
        csv_path: Path of the CSV file to read (defaults to "HomeC.csv").
        test_size: Fraction of rows held out for testing, forwarded to train_test_split.
        random_state: Seed forwarded to train_test_split so the split is reproducible.
    Returns:
        A tuple (df, X_train, X_test, y_train, y_test) where df is the cleaned
        DataFrame restricted to the selected columns (rows with NaN removed), the X
        arrays are standardized feature matrices and the y arrays hold "use [kW]".
    """
    target_col = "use [kW]"
    selected_cols = [target_col, "Solar [kW]", "Furnace 1 [kW]", "Furnace 2 [kW]", "dewPoint", "apparentTemperature"]

    # dropna() without inplace returns a fresh frame, so we never mutate a
    # slice of the raw DataFrame (which can raise SettingWithCopyWarning).
    df = pd.read_csv(csv_path)[selected_cols].dropna()

    X = df.drop(columns=target_col).to_numpy()
    y = df[target_col].to_numpy()

    # Split BEFORE scaling: fitting the scaler on all rows would leak the test
    # set's mean/variance into training. The split itself depends only on the
    # row count and the seed, so the same rows land in each partition as before.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)  # statistics learned from training rows only
    X_test = scaler.transform(X_test)        # reuse the training statistics on the test rows

    return df, X_train, X_test, y_train, y_test

In [14]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

def evaluate_model(y_true, y_pred, model_name):
    """
    Print R², MSE, RMSE, MAE and MAPE for a set of regression predictions.

    Args:
        y_true: Array-like of ground-truth target values.
        y_pred: Array-like of predicted values with the same number of elements.
        model_name: Label printed in the report header.
    Notes:
        MAPE is undefined where the true value is zero, so it is averaged over
        the non-zero targets only; it is reported as nan when every target is zero.
    """
    # Work on plain float arrays: this accepts lists and prevents pandas from
    # aligning two Series by index label instead of by position.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Guard against (n,) vs (n, 1): subtracting those would broadcast to (n, n).
    if y_pred.shape != y_true.shape and y_pred.size == y_true.size:
        y_pred = y_pred.reshape(y_true.shape)

    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)

    # Dividing by a zero target would yield inf/nan and poison the whole mean.
    nonzero = y_true != 0
    if nonzero.any():
        mape = np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100
    else:
        mape = float("nan")

    print(f"Model: {model_name}")
    print(f"R² Score: {r2:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")
    print("="*50)

Multiple Linear Regression from scratch

In [15]:
# @title 1.2 - build linear model + learning update (gradient descent + learning rate)
class LinearRegressionScratch:
    """
    Custom implementation of linear regression refined with batch gradient descent.

    The parameters are warm-started from a least-squares solve (without an
    intercept column) and then updated for n_iterations gradient steps on the
    mean-squared-error objective.
    """
    def __init__(self, learning_rate=0.01, n_iterations=1000):
        # Learned parameters; both stay None until fit() has been called.
        self.coef_ = None
        self.intercept_ = None
        # Step size and number of gradient-descent updates performed by fit().
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations

    def fit(self, X, y):
        """
        Fits the linear regression model to the given data using gradient descent.
        Args:
            X: A numpy array of shape (n_samples, n_features) representing the input data.
            y: A numpy array of shape (n_samples,) representing the target values.
        Raises:
            ValueError: If X is not 2-D, is empty, or its row count differs from y.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        if X.ndim != 2 or y.ndim == 0 or X.shape[0] == 0 or X.shape[0] != y.shape[0]:
            raise ValueError(
                "X must be a non-empty 2-D array whose number of rows matches len(y)"
            )

        # Warm start: least-squares coefficients for y ~ X (no intercept column).
        # lstsq replaces inv(X.T @ X) @ X.T @ y: it gives the same answer when the
        # Gram matrix is invertible, but also copes with singular or
        # ill-conditioned inputs (e.g. duplicated/collinear columns) instead of
        # raising LinAlgError.
        self.coef_ = np.linalg.lstsq(X, y, rcond=None)[0]
        # The intercept starts as the mean residual left by those coefficients.
        self.intercept_ = np.mean(y - X @ self.coef_)

        m = len(y)  # number of training examples

        for _ in range(self.n_iterations):
            # Residuals of the current model on the training data
            error = self.predict(X) - y

            # Gradients of (1/2m) * sum(error^2) w.r.t. the intercept and the
            # coefficients; both are computed from the same residuals before
            # either parameter is updated.
            intercept_gradient = (1/m) * np.sum(error)
            coef_gradient = (1/m) * np.dot(X.T, error)

            # Take one step against the gradient
            self.intercept_ -= self.learning_rate * intercept_gradient
            self.coef_ -= self.learning_rate * coef_gradient

    def predict(self, X):
        """
        Predicts the target values for new data.
        Args:
            X: A numpy array of shape (n_samples, n_features) representing the new input data.
        Returns:
            A numpy array of shape (n_samples,) representing the predicted target values.
        """
        return np.dot(X, self.coef_) + self.intercept_

Data Preparation

In [29]:
# Load the CSV and obtain the cleaned DataFrame plus standardized train/test
# feature arrays and their targets (see preprocessing() defined earlier).
df, X_train, X_test, y_train, y_test = preprocessing()
# vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
# Polynomial feature expansion, consumed by the scikit-learn model in the training cell.
from sklearn.preprocessing import PolynomialFeatures
n_power = 7  # polynomial degree; with the 5 input features this produces 791 columns
# include_bias=False leaves out the constant column (LinearRegression fits its own intercept).
poly_features = PolynomialFeatures(degree=n_power, include_bias=False)
# Fit the expansion on the training split, then apply the same mapping to the test split.
X_poly_train = poly_features.fit_transform(X_train)
X_poly_test = poly_features.transform(X_test)
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  df = pd.read_csv("HomeC.csv")


In [30]:
# Pairwise correlation matrix (pandas default: Pearson) of the target and the selected features.
df.corr()

Unnamed: 0,use [kW],Solar [kW],Furnace 1 [kW],Furnace 2 [kW],dewPoint,apparentTemperature
use [kW],1.0,-0.131635,0.314776,0.489414,0.01604,0.00556
Solar [kW],-0.131635,1.0,-0.020461,-0.107684,0.089664,0.093793
Furnace 1 [kW],0.314776,-0.020461,1.0,0.240445,-0.301091,-0.308621
Furnace 2 [kW],0.489414,-0.107684,0.240445,1.0,-0.233404,-0.242814
dewPoint,0.01604,0.089664,-0.301091,-0.233404,1.0,0.900818
apparentTemperature,0.00556,0.093793,-0.308621,-0.242814,0.900818,1.0


Model Training

In [31]:
from sklearn.linear_model import LinearRegression

# Hyperparameters for the from-scratch model: gradient-descent step size and number of updates.
lr, n_iterations = 0.1, 1000
multi_lin_reg_scratch = LinearRegressionScratch(lr, n_iterations)
# The from-scratch model is trained on the standardized original features.
multi_lin_reg_scratch.fit(X_train, y_train)

# scikit-learn's LinearRegression is trained on the polynomial expansion of the same training split.
poly_sklearn = LinearRegression()
poly_sklearn.fit(X_poly_train,y_train)

Model Evaluation

In [32]:
# Score both models on the held-out test split; each model receives the feature
# representation it was trained on (raw scaled features vs. polynomial expansion).
evaluate_model(y_test, multi_lin_reg_scratch.predict(X_test), "Multiple Linear Regression")
evaluate_model(y_test, poly_sklearn.predict(X_poly_test), "Polynomial Regression")

Model: Multiple Linear Regression
R² Score: 0.3276
Mean Squared Error (MSE): 0.7465
Root Mean Squared Error (RMSE): 0.8640
Mean Absolute Error (MAE): 0.4849
Mean Absolute Percentage Error (MAPE): 247.20%
Model: Polynomial Regression
R² Score: 0.7124
Mean Squared Error (MSE): 0.3193
Root Mean Squared Error (RMSE): 0.5650
Mean Absolute Error (MAE): 0.3299
Mean Absolute Percentage Error (MAPE): 157.25%
