# Logistic Regression

Implementation of logistic regression from scratch

Dataset used available on: https://www.kaggle.com/datasets/merishnasuwal/breast-cancer-prediction-dataset?resource=download

In [334]:
import numpy as np
import pandas as pd

#### Class implementation

In [335]:
def sigmoid(z):
    """
    Computes the sigmoid function.

    :param z: The input value or array.
    :return: The sigmoid of the input.
    :rtype: float or np.ndarray
    """
    z = np.clip(z, -500, 500)
    return 1 / (1 + np.exp(-z))

def log_loss(y, y_hat):
    """
    Computes the log loss (cross-entropy loss) for binary classification.

    :param y: The true label.
    :param y_hat: The predicted probability.
    :return: The log loss.
    :rtype: float
    """
    epsilon = 1e-15
    y_hat = np.clip(y_hat, epsilon, 1 - epsilon)
    return -(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))

class LogisticRegression:
    """
    Represents a logistic regression model for binary classification.
    """
    def __init__(self, X, y, epochs, learning_rate=0.01):
        """
        Initializes the logistic regression model.

        :param X: The feature matrix (training data).
        :type X: np.ndarray
        :param y: The label vector (training labels).
        :type y: np.ndarray
        :param epochs: The number of training epochs.
        :type epochs: int
        :param learning_rate: The learning rate for gradient descent.
        :type learning_rate: float
        """
        self.num_data = X.shape[0]
        self.num_features = X.shape[1]
        self.weights = np.random.randn(self.num_features) * 0.001
        self.bias = 0.0
        self.X = X
        self.y = y
        self.epochs = epochs
        self.learning_rate = learning_rate
        
    def compute_accuracy(self):
        """
        Computes the accuracy of the model on the training data.

        :return: The accuracy of the model.
        :rtype: float
        """
        correct = 0
        for i in range(self.num_data):
            x_i = self.X[i]
            y_i = self.y[i]
            y_hat = self.classify(x_i)
            if y_hat == y_i:
                correct += 1
        return correct / self.num_data
        
    def compute_loss(self):
        """
        Computes the average log loss over the training data.

        :return: The average log loss.
        :rtype: float
        """
        loss = 0
        for i in range(self.num_data):
            x_i = self.X[i]
            y_i = self.y[i]
            z_i = np.dot(self.weights, x_i) + self.bias
            y_hat = sigmoid(z_i)
            loss += log_loss(y_i, y_hat)
        return loss / self.num_data
        
    def calculate_gradients(self):
        """
        Calculates the gradients of the loss function with respect to the model parameters.

        :return: The gradients for the weights and bias.
        :rtype: tuple(np.ndarray, float)
        """
        grad_w = np.zeros(self.num_features)
        grad_b = 0
        for i in range(self.num_data):
            x_i = self.X[i]
            y_i = self.y[i]
            z_i = np.dot(self.weights, x_i) + self.bias
            diff = sigmoid(z_i) - y_i
            grad_w += diff * x_i
            grad_b += diff
        grad_w, grad_b = grad_w / self.num_data, grad_b / self.num_data
        return grad_w, grad_b
        
    def update_weights(self):
        """
        Updates the model parameters (weights and bias) using gradient descent.
        """
        grad_w, grad_b = self.calculate_gradients()
        self.weights -= self.learning_rate * grad_w
        self.bias -= self.learning_rate * grad_b
    
    def predict(self, X):
        """
        Predicts the probability of the positive class for the given input data.

        :param X: The input feature matrix.
        :type X: np.ndarray
        :return: The predicted probabilities for the positive class.
        :rtype: np.ndarray
        """
        z = np.dot(self.weights, X) + self.bias
        y_hat = sigmoid(z)
        return y_hat
    
    def classify(self, X, threshold=0.5):
        """
        Classifies the input data into binary labels based on the given threshold.

        :param X: The input feature matrix.
        :type X: np.ndarray
        :param threshold: The threshold for classification.
        :type threshold: float
        :return: The predicted binary label (0 or 1).
        :rtype: int
        """
        y_hat = self.predict(X)
        return int(y_hat > threshold) 
      
    def train(self):
        """
        Trains the logistic regression model using gradient descent for a specified number of epochs.
        """
        for epoch in range(self.epochs):
            self.update_weights()
            loss = self.compute_loss()
            accuracy = self.compute_accuracy()
            print(f'Epoch {epoch + 1}/{self.epochs} - accuracy: {accuracy:.4f} - loss: {loss:.4f}')
        
    def test(self, X_test, y_test):
        """
        Evaluates the model's accuracy on the test data.

        :param X_test: The test feature matrix.
        :type X_test: np.ndarray
        :param y_test: The test label vector.
        :type y_test: np.ndarray
        :return: The accuracy of the model on the test data.
        :rtype: float
        """
        correct = 0
        for i in range(len(X_test)):
            y_pred = self.classify(X_test[i])
            if y_pred == y_test[i]:
                correct += 1
        accuracy = correct / len(X_test)
        print(f'Test set accuracy: {accuracy:.4f}')

#### Importing the dataset and pre processing

In [336]:
X = pd.read_csv('Breast_cancer_data.csv')
X.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0
1,20.57,17.77,132.9,1326.0,0.08474,0
2,19.69,21.25,130.0,1203.0,0.1096,0
3,11.42,20.38,77.58,386.1,0.1425,0
4,20.29,14.34,135.1,1297.0,0.1003,0


In [337]:
X.shape

(569, 6)

In [338]:
diagnosticated = X[X['diagnosis'] == 1]
not_diagnosticated = X[X['diagnosis'] == 0]


In [339]:
diagnosticated.shape

(357, 6)

In [340]:
# Split 80% of 'not_diagnosticated' data for training
not_diagnosticated_train = not_diagnosticated.sample(frac=0.8, random_state=42)

# Select the same number of samples from 'diagnosticated' as in 'not_diagnosticated_train'
diagnosticated_train = diagnosticated.sample(n=len(not_diagnosticated_train), random_state=42)

# Combine the diagnosed and not diagnosed data to create the training set
train_data = pd.concat([not_diagnosticated_train, diagnosticated_train])

# Optional: shuffle the rows to avoid an obvious sequence of classes
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)

train_data.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
0,10.8,9.71,68.77,357.6,0.09594,1
1,16.26,21.88,107.5,826.8,0.1165,0
2,12.83,22.33,85.26,503.2,0.1088,0
3,23.09,19.83,152.1,1682.0,0.09342,0
4,17.95,20.01,114.2,982.0,0.08402,0


In [341]:
# Step 1: Select the remaining 20% of 'not_diagnosticated' for the test set
not_diagnosticated_test = not_diagnosticated.drop(not_diagnosticated_train.index)

# Step 2: Select the remaining samples from 'diagnosticated' for the test set
diagnosticated_test = diagnosticated.drop(diagnosticated_train.index)

# Step 3: Combine the diagnosed and not diagnosed data to create the test set
test_data = pd.concat([not_diagnosticated_test, diagnosticated_test])

# Optional: shuffle the rows to avoid an obvious sequence of classes
test_data = test_data.sample(frac=1, random_state=42).reset_index(drop=True)

test_data.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
0,11.67,20.02,75.21,416.2,0.1016,1
1,8.95,15.76,58.74,245.2,0.09462,1
2,19.1,26.29,129.1,1132.0,0.1215,0
3,12.83,15.73,82.89,506.9,0.0904,1
4,17.47,24.68,116.1,984.6,0.1049,0


In [342]:
# dataset
X_train = train_data.drop(columns='diagnosis').values
y_train = train_data['diagnosis'].values
X_test = test_data.drop(columns='diagnosis').values
y_test = test_data['diagnosis'].values

In [343]:
train_mean, train_std = np.mean(X_train, axis=0), np.std(X_train, axis=0)
test_mean, test_std = np.mean(X_test, axis=0), np.std(X_test, axis=0)

def normalize(X, mean, std):
    X_norm =  (X - mean) / std
    return X_norm

X_train = normalize(X_train, train_mean, train_std)
X_test = normalize(X_test, test_mean, test_std)

#### Model

In [344]:
# Model 
model = LogisticRegression(X_train, y_train, epochs=100, learning_rate=0.1)

model.train()


Epoch 1/100 - accuracy: 0.8971 - loss: 0.6505
Epoch 2/100 - accuracy: 0.9000 - loss: 0.6142
Epoch 3/100 - accuracy: 0.9000 - loss: 0.5832
Epoch 4/100 - accuracy: 0.9000 - loss: 0.5564
Epoch 5/100 - accuracy: 0.9000 - loss: 0.5332
Epoch 6/100 - accuracy: 0.9000 - loss: 0.5129
Epoch 7/100 - accuracy: 0.9000 - loss: 0.4950
Epoch 8/100 - accuracy: 0.9000 - loss: 0.4791
Epoch 9/100 - accuracy: 0.9000 - loss: 0.4650
Epoch 10/100 - accuracy: 0.9029 - loss: 0.4523
Epoch 11/100 - accuracy: 0.9029 - loss: 0.4408
Epoch 12/100 - accuracy: 0.9029 - loss: 0.4304
Epoch 13/100 - accuracy: 0.9029 - loss: 0.4208
Epoch 14/100 - accuracy: 0.9029 - loss: 0.4121
Epoch 15/100 - accuracy: 0.9029 - loss: 0.4041
Epoch 16/100 - accuracy: 0.9029 - loss: 0.3967
Epoch 17/100 - accuracy: 0.9029 - loss: 0.3899
Epoch 18/100 - accuracy: 0.9029 - loss: 0.3835
Epoch 19/100 - accuracy: 0.9029 - loss: 0.3775
Epoch 20/100 - accuracy: 0.9029 - loss: 0.3720
Epoch 21/100 - accuracy: 0.9029 - loss: 0.3668
Epoch 22/100 - accurac

#### Testing the model

In [345]:
model.test(X_test, y_test)

Test set accuracy: 0.7773
