Building a Linear Regression model from scratch

In [2]:
import random

In [3]:
class LinearRegressionModel:
    """Linear regression trained with batch gradient descent, in pure Python.

    The fitted function is
        y_hat = theta[0]*x[0] + ... + theta[d-1]*x[d-1] + theta[-1]
    so the bias (intercept) is stored as the LAST element of ``theta``.
    """

    def __init__(self):
        #Holds the value of coefficients (weights first, bias last)
        self.theta = []
        #Holds the predicted labels produced by the most recent predict() call
        self.predicted = []
        #Number of rows the dataset has
        self.rows = 0
        #Learning rate
        self.alpha = 0.01
        #Actual Labels
        self.label = []
        #Actual Features (value of the data that will be given)
        self.features = []
        #Dimension of the input set (number of feature columns)
        self.dimension = 0

    def initialize_theta(self, method):
        """Reset theta to ``dimension + 1`` values (weights followed by the bias).

        method: 'zeroinitialization'   -> all zeros
                'randominitialization' -> uniform random values in [-1, 1]
                'normalinitialization' -> Gaussian random values (mean 0, std 1)

        Raises ValueError for any other name, instead of silently leaving theta
        unchanged and failing later inside linear_function().
        """
        size = self.dimension + 1
        if method == 'zeroinitialization':
            self.theta = [0 for _ in range(size)]
        elif method == 'randominitialization':
            self.theta = [random.uniform(-1, 1) for _ in range(size)]
        elif method == 'normalinitialization':
            self.theta = [random.gauss(0, 1) for _ in range(size)]
        else:
            raise ValueError(
                f'unknown init method {method!r}; expected one of '
                "'zeroinitialization', 'randominitialization', 'normalinitialization'"
            )

    #Splitting Labels from the dataset
    def feature_label_split(self, dataset):
        """Store every row's leading values in self.features and its last value in self.label."""
        features, labels = zip(*[(data[:-1], data[-1]) for data in dataset])
        self.features, self.label = list(features), list(labels)

    #Linear Equation
    def linear_function(self, row):
        """Return the model output for one feature row: weighted sum of the row plus the bias theta[-1]."""
        return sum(self.theta[index] * value for index, value in enumerate(row)) + self.theta[-1]

    #Predicted value using the current coefficients (theta)
    def predict(self, features):
        """Predict a label for every row in ``features``.

        The result is stored in self.predicted (replacing any earlier predictions)
        and is also returned, so callers do not have to go through get_predictions().
        """
        self.predicted = [self.linear_function(row) for row in features]
        return self.predicted

    def _residuals(self):
        """Return (predicted - actual) per training row, after checking the two line up.

        Raises ValueError when self.predicted does not hold exactly one value per
        training label, e.g. because predict() was last called on other data.
        """
        if len(self.predicted) != self.rows or len(self.label) != self.rows:
            raise ValueError(
                f'expected {self.rows} predictions and labels, got {len(self.predicted)} '
                f'predictions and {len(self.label)} labels; '
                'call predict(self.features) before computing training metrics'
            )
        return [self.predicted[i] - self.label[i] for i in range(self.rows)]

    #Half of the Mean Squared Error
    def loss(self):
        """Return sum(squared error) / (2 * rows) for the current training predictions.

        The extra factor 1/2 makes the gradient used in gradient_descent() come
        out without a factor of 2.
        """
        squared_errors = [error ** 2 for error in self._residuals()]
        return sum(squared_errors) / (2 * self.rows)

    #R-Squared
    def r_squared(self):
        """Return 1 - (residual sum of squares / total sum of squares).

        When all labels are identical the total sum of squares is zero and the
        ratio is undefined; in that case 1.0 is returned for a perfect fit and
        0.0 otherwise.
        """
        residuals = self._residuals()
        mean_label = sum(self.label) / self.rows
        total_sum_squares = sum((y - mean_label) ** 2 for y in self.label)
        residual_sum_squares = sum(error ** 2 for error in residuals)
        if total_sum_squares == 0:
            return 1.0 if residual_sum_squares == 0 else 0.0
        return 1 - (residual_sum_squares / total_sum_squares)

    #Updating the value of Coefficients (theta)
    def gradient_descent(self):
        """Take one batch gradient-descent step on theta using the current predictions."""
        residuals = self._residuals()
        gradients = [0 for _ in range(self.dimension + 1)]

        for i, error in enumerate(residuals):
            for j in range(self.dimension):
                gradients[j] += error * self.features[i][j]
            #The bias behaves like a weight on a constant feature of 1
            gradients[-1] += error

        gradients = [g / self.rows for g in gradients]

        self.theta = [self.theta[i] - self.alpha * gradients[i] for i in range(self.dimension + 1)]

    #Data training session
    def fit(self, dataset, learning_rate=1, init_method='zeroinitialization', epochs=1):
        """Train on ``dataset`` and print R-squared and loss once per epoch.

        dataset:       sequence of rows; the last value of each row is the label,
                       everything before it is a feature.
        learning_rate: step size for gradient descent.
                       NOTE(review): the default of 1 looks too large for unscaled
                       features and is kept only for backward compatibility.
        init_method:   name understood by initialize_theta().
        epochs:        number of full passes over the dataset.

        Raises ValueError if the dataset is empty or init_method is unknown.
        """
        if len(dataset) == 0:
            raise ValueError('dataset must contain at least one row')

        self.rows = len(dataset)
        self.alpha = learning_rate
        self.dimension = len(dataset[0]) - 1
        self.feature_label_split(dataset)
        self.initialize_theta(init_method)

        for epoch in range(epochs):
            self.predict(self.features)
            #Metrics describe the predictions made at the START of the epoch,
            #i.e. before this epoch's update of theta
            current_rs = self.r_squared()
            current_loss = self.loss()
            self.gradient_descent()
            print(f'Epoch {epoch + 1}/{epochs}: R-squared -> {current_rs:.4f}, Loss -> {current_loss:.4f}')

        #Refresh the stored predictions so get_predictions() matches the final theta
        self.predict(self.features)

    def get_predictions(self):
        """Return the predictions stored by the most recent predict() or fit() call."""
        return self.predicted



In [4]:
#Building Dataset
alpha = 0.0001
#Fixed seed so that Restart Kernel -> Run All regenerates exactly the same data
SEED = 42


def make_sum_dataset(n_rows, seed=None):
    """Return ``n_rows`` tuples (x1, x2, x1 + x2) with x1 and x2 drawn uniformly from 0..100.

    A private random.Random(seed) is used, so the global random state is not
    touched and the same seed always yields the same rows.
    """
    rng = random.Random(seed)
    rows = []
    for _ in range(n_rows):
        x1 = rng.randint(0, 100)
        x2 = rng.randint(0, 100)
        #The label is the exact sum of the two features
        rows.append((x1, x2, x1 + x2))
    return rows


dataset = make_sum_dataset(1000, seed=SEED)

In [5]:
#Create an untrained model; its coefficients (theta) are only initialised inside fit()
model = LinearRegressionModel()

In [6]:
#Train for 10 epochs of batch gradient descent; fit() prints R-squared and loss once per epoch
model.fit(dataset, learning_rate=alpha, epochs=10)

Epoch 1/10: R-squared -> -5.9902, Loss -> 5769.5260
Epoch 2/10: R-squared -> -0.2505, Loss -> 1032.1170
Epoch 3/10: R-squared -> 0.7763, Loss -> 184.6427
Epoch 4/10: R-squared -> 0.9600, Loss -> 33.0371
Epoch 5/10: R-squared -> 0.9928, Loss -> 5.9155
Epoch 6/10: R-squared -> 0.9987, Loss -> 1.0628
Epoch 7/10: R-squared -> 0.9998, Loss -> 0.1940
Epoch 8/10: R-squared -> 1.0000, Loss -> 0.0380
Epoch 9/10: R-squared -> 1.0000, Loss -> 0.0096
Epoch 10/10: R-squared -> 1.0000, Loss -> 0.0041


In [7]:
#Predict the label for a single feature row; the exact answer would be 245 + 55 = 300
model.predict([(245, 55)])
single_prediction = model.get_predictions()[0]
print(single_prediction)

300.24868185159994


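IMPORTANT!
Do we actually need ML to solve this problem? No: every label in this dataset is defined as exactly x1 + x2, so plain addition gives the exact answer, while the trained model only approximates it. The example is useful for seeing how gradient descent recovers a known rule, not as a case where ML is the right tool.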