Building a Logistic Regression model from scratch

In [1]:
import random
import math

In [6]:
class LogisticRegressionModel:
    """Binary logistic regression trained with batch gradient descent.

    Every dataset row is a sequence whose last element is the 0/1 label and
    whose preceding elements are the numeric features. ``theta`` holds one
    weight per feature followed by the bias term as its last element.
    """

    #Probabilities are kept at least this far from 0 and 1 inside log_loss
    _EPSILON = 1e-15

    def __init__(self):
        #Holds the value of coefficients
        self.theta = []
        #Holds the value of predicted labels
        self.predicted = []
        #Number of rows the dataset has
        self.rows = 0
        #Learning rate
        self.alpha = 0.01
        #Actual Labels
        self.label = []
        #Actual Features (value of the data that will be given)
        self.features = []
        #Dimension of the input set
        self.dimension = 0

    def initialize_theta(self, method):
        """Set ``self.theta`` to ``self.dimension + 1`` starting coefficients.

        Args:
            method: 'zeroinitialization', 'randominitialization' or
                'normalinitialization'.

        Raises:
            ValueError: if ``method`` is not one of the names above.
        """
        size = self.dimension + 1
        #Value of the Coefficients will be zero initially
        if method == 'zeroinitialization':
            self.theta = [0 for _ in range(size)]
        #Value of the Coefficients will be random
        elif method == 'randominitialization':
            self.theta = [random.uniform(-1, 1) for _ in range(size)]
        #Value of the Coefficients will be normalized random numbers
        elif method == 'normalinitialization':
            self.theta = [random.gauss(0, 1) for _ in range(size)]
        else:
            #Previously an unknown name left theta empty (or stale from an earlier fit)
            raise ValueError(
                f"Unknown init method {method!r}; expected 'zeroinitialization', "
                "'randominitialization' or 'normalinitialization'"
            )

    #Splitting Labels from the dataset
    def feature_label_split(self, dataset):
        """Store each row's leading values in ``self.features`` and its last value in ``self.label``."""
        features, labels = zip(*[(data[:-1], data[-1]) for data in dataset])
        self.features, self.label = list(features), list(labels)

    #Linear Equation
    def linear_function(self, row):
        """Return the weighted sum of ``row`` with ``self.theta`` plus the bias ``self.theta[-1]``."""
        return sum(self.theta[index] * value for index, value in enumerate(row)) + self.theta[-1]

    #Logistic Function
    def logistic_function(self, row):
        """Return the sigmoid of the linear score for ``row`` as a float in [0, 1].

        The branch is chosen so that math.exp only ever receives a
        non-positive argument; a large-magnitude score therefore saturates to
        0.0 or 1.0 instead of raising OverflowError.
        """
        z = self.linear_function(row)
        if z >= 0:
            return 1 / (1 + math.exp(-z))
        #Equivalent form for negative scores: exp(z) underflows to 0.0 rather than overflowing
        exp_z = math.exp(z)
        return exp_z / (1 + exp_z)

    #Predicted value using the continuous coefficients (theta)
    def predict(self, features):
        """Compute a probability for every row in ``features`` and store them in ``self.predicted``."""
        self.predicted = [self.logistic_function(row) for row in features]

    #Log Loss
    def log_loss(self):
        """Return the mean binary cross-entropy over the first ``self.rows`` predictions.

        Predictions are clipped to [_EPSILON, 1 - _EPSILON] before the
        logarithm so a saturated prediction of exactly 0.0 or 1.0 yields a
        large finite loss instead of a math domain error.
        """
        eps = self._EPSILON
        log_errors = []
        for i in range(self.rows):
            probability = min(max(self.predicted[i], eps), 1 - eps)
            log_errors.append(
                (self.label[i] * math.log(probability))
                + ((1 - self.label[i]) * math.log(1 - probability))
            )
        return sum(log_errors) / -self.rows

    #Accuracy
    def accuracy(self, threshold=0.5):
        """Return the share of stored predictions whose thresholded class equals the label.

        A prediction counts as class 1 when it is >= ``threshold``. The count
        of matches is divided by ``self.rows``.
        """
        correct_predictions = 0

        for true, pred in zip(self.label, self.predicted):
            predicted_class = 1 if pred >= threshold else 0
            if predicted_class == true: correct_predictions += 1

        accuracy_value = (correct_predictions / self.rows)
        return accuracy_value

    #Updating the value of Coefficients (theta)
    def gradient_descent(self):
        """Apply one gradient step to ``self.theta`` using the stored predictions.

        The gradient of each weight is the mean of (prediction - label) times
        the matching feature; the bias gradient is the mean of
        (prediction - label). Each coefficient moves by ``-self.alpha`` times
        its gradient.
        """
        gradients = [0 for _ in range(self.dimension + 1)]

        for i in range(self.rows):
            error = self.predicted[i] - self.label[i]
            for j in range(self.dimension):
                gradients[j] += error * self.features[i][j]
            #Bias term sits in the last slot and has an implicit feature of 1
            gradients[-1] += error

        gradients = [g / self.rows for g in gradients]

        self.theta = [self.theta[i] - self.alpha * gradients[i] for i in range(self.dimension + 1)]

    #Data training session
    def fit(self, dataset, learning_rate=1, init_method='zeroinitialization', epochs=1):
        """Train on ``dataset`` and print accuracy and loss after every epoch.

        Args:
            dataset: sequence of rows; the last element of each row is the label.
            learning_rate: step size stored in ``self.alpha``.
            init_method: name passed to ``initialize_theta``.
            epochs: number of predict / update passes over the whole dataset.

        Raises:
            ValueError: if ``init_method`` is not a known initialization name.
        """
        self.rows = len(dataset)
        self.alpha = learning_rate
        self.dimension = len(dataset[0]) - 1
        self.feature_label_split(dataset)
        self.initialize_theta(init_method)

        for epoch in range(epochs):
            self.predict(self.features)
            self.gradient_descent()
            #Metrics below read self.predicted, i.e. the predictions made before this epoch's update
            current_accuracy = self.accuracy()
            current_loss = self.log_loss()
            print(f'Epoch {epoch + 1}/{epochs}: Accuracy -> {current_accuracy:.4f}, Loss -> {current_loss:.4f}')

    def get_predictions(self, threshold=0.5):
        """Return the stored predictions as 0/1 classes (1 when the probability is >= ``threshold``)."""
        return [1 if pred >= threshold else 0 for pred in self.predicted]



In [7]:
#Building Dataset
#Each row is (x1, x2, label) with x1 and x2 drawn uniformly from 0..100 and
#label 1 when x1 > x2, otherwise 0. No seed is set in this cell, so the rows
#differ from run to run.
alpha = 0.0001
dataset = []
for _sample in range(1000):
    x1 = random.randint(0, 100)
    x2 = random.randint(0, 100)
    dataset.append((x1, x2, 1 if x1 > x2 else 0))

In [8]:
#Create an untrained model; its coefficients are only initialised when fit() is called
model = LogisticRegressionModel()

In [13]:
#Train for 10 epochs with fit's default zero initialisation; accuracy and loss are printed per epoch
model.fit(dataset, learning_rate=alpha, epochs=10)

Epoch 1/10: Accuracy -> 0.5180, Loss -> 0.6931
Epoch 2/10: Accuracy -> 0.9350, Loss -> 0.6790
Epoch 3/10: Accuracy -> 0.9390, Loss -> 0.6654
Epoch 4/10: Accuracy -> 0.9400, Loss -> 0.6524
Epoch 5/10: Accuracy -> 0.9420, Loss -> 0.6400
Epoch 6/10: Accuracy -> 0.9440, Loss -> 0.6281
Epoch 7/10: Accuracy -> 0.9480, Loss -> 0.6166
Epoch 8/10: Accuracy -> 0.9510, Loss -> 0.6057
Epoch 9/10: Accuracy -> 0.9540, Loss -> 0.5952
Epoch 10/10: Accuracy -> 0.9560, Loss -> 0.5851


In [18]:
#Ask the trained model to compare a single pair of numbers
x1, x2 = 500, 280
model.predict([(x1, x2)])
#Index 0 is the label for "x1 is not greater than x2", index 1 for "x1 > x2"
classes = ['less', 'greater']
predicted_class = model.get_predictions()[0]
print(f'{x1} is {classes[predicted_class]} than {x2}')

500 is greater than 280


Note: traditional programming (a direct comparison of the two numbers) is a better way to solve this problem than machine learning.