## Logistic Regression with Softmax

In [65]:
import numpy as np
import pandas as pd

In [66]:
from sklearn.datasets import load_iris
dataset = load_iris()
X = dataset.data
y = dataset.target

target_names = list(dataset.target_names)
print(target_names)

['setosa', 'versicolor', 'virginica']


In [79]:
class SoftmaxReg:
    """
    Softmax Regression for multiclass.
    """
    def __init__(self, 
                 epochs=100, 
                 tolerance = 1e-10, 
                 alpha=0.001, 
                 lambd=0, 
                 threshold=0.5, 
                 verbose=False,
                 minibatch_size=30,
                ):
        self.epochs = epochs
        self.alpha = alpha # Learning rate
        self.lambd = lambd # Regularization parameter
        self.tolerance = tolerance
        self.threshold = threshold
        self.verbose = verbose
        self.minibatch_size = minibatch_size
        
    def add_ones(self, X):
        return np.concatenate((np.ones((len(X),1)), X), axis = 1)
    
    def softmax(self, X, theta):
        z = X@theta
        z -= np.max(z) # to prevent overflow
        return np.exp(z) / np.sum(np.exp(z), axis=1, keepdims=True)
    
    def compute_cost(self, X, y_true):
        n, d = X.shape
        y_hat = self.softmax(X, self.theta)
        
        temp_theta = self.theta[:, 1:].copy()
        
        if self.no_classes > 2:  
            temp_theta = self.theta.copy()
            temp_theta[0, :] = 0 # Not including bias in regularization
        
        Cost = (-1 / n) * np.sum(y_true * np.log(y_hat)) + (self.lambd/2)*np.sum(temp_theta*temp_theta)
        
        return Cost
    
    def compute_grad(self, X, y):
        n, _ = X.shape 
        theta_wo_bias = self.theta.copy()
        theta_wo_bias[0, :] = 0
        grad = (-1 / n) * X.T@(y - self.softmax(X, self.theta)) + self.lambd * theta_wo_bias
        if self.no_classes <= 2:
            grad= grad.reshape(-1, 1)
        return grad
    
    def get_minibatch(self, X, y,  minibatch):
        X_mb = X[minibatch*self.minibatch_size: (minibatch+1)*self.minibatch_size]
        y_mb = y[minibatch*self.minibatch_size: (minibatch+1)*self.minibatch_size]
        return X_mb, y_mb
    
    def one_hot_encode(self, y):
        n = len(y)
        if self.no_classes > 2:
            # Turn y into one-hot-labels if number of classes is greater than 2
            y_encode = np.zeros((n, self.no_classes))
            y_encode[range(n), y] = 1 #numpy advanced indexing
            y = y_encode
        else:
            y = y.reshape(-1, 1)
        return y
    
    def fit(self, X, y):
        X = X.copy()
        X = self.add_ones(X)
        
        n, d = X.shape
        
        self.classes = np.unique(y)
        self.no_classes = len(self.classes)
        
        y = self.one_hot_encode(y)
        
        if self.no_classes > 2:
            self.theta = np.zeros((d, self.no_classes))
        else:
            self.theta = np.zeros((d, 1))
            
        current_epoch = 1
        norm = 1

        no_of_minibatch = int(n/self.minibatch_size)
        
        while (norm >= self.tolerance and current_epoch < self.epochs):
            # Shuffle X for minibatch gradient descent
            shuffled_index = np.random.permutation(n)
            X_shuffled = X[shuffled_index]
            y_shuffled = y[shuffled_index]
            
            old_theta = self.theta.copy()        
            
            for mb in range(no_of_minibatch):
                X_mb, y_mb = self.get_minibatch(X, y, mb)
                
                theta_wo_bias = self.theta[:, 1:].copy()
                if self.no_classes > 2:
                    grad = self.compute_grad(X_mb, y_mb)
            
                self.theta = self.theta - self.alpha*grad
            
            if self.verbose and (current_epoch%10 == 0):
                print(f'cost for {current_epoch} epoch : {self.compute_cost(X, y)}')
            norm = np.linalg.norm(old_theta - self.theta)
            current_epoch += 1
            
        return self.theta
    
    def evaluate(self, X, y):
        """
        Returns mse loss for a dataset evaluated on the hypothesis
        """
        X = self.add_ones(X)
        y = self.one_hot_encode(y)
        return self.compute_cost(X, y)
    
    def predict(self, X):
        proba = self.predict_proba(X)
        if self.no_classes > 2:
            # Multiclass classification
            y_hat = np.argmax(proba, axis=1)
        elif self.no_classes == 2:
            # Binary classification
            y_hat = (proba >= self.threshold).astype(int)
        return y_hat
        
    def predict_proba(self, X):
        """
        Returns probability of predictions.
        """
        X = self.add_ones(X)
        
        return self.softmax(X, self.theta)

In [80]:
logreg = SoftmaxReg(epochs=9000, 
                alpha=0.001, 
                verbose=True, 
                minibatch_size=20,
               )

In [81]:
logreg.fit(X, y)

cost for 10 epoch : 1.0405393269354362
cost for 20 epoch : 0.9971981162708727
cost for 30 epoch : 0.9587096726587906
cost for 40 epoch : 0.923934111226926
cost for 50 epoch : 0.89242308351885
cost for 60 epoch : 0.8638280402482029
cost for 70 epoch : 0.8378377100145463
cost for 80 epoch : 0.8141697597635079
cost for 90 epoch : 0.7925694843009948
cost for 100 epoch : 0.7728089104045688
cost for 110 epoch : 0.7546853551472168
cost for 120 epoch : 0.7380195087848012
cost for 130 epoch : 0.7226532665277198
cost for 140 epoch : 0.7084474996450751
cost for 150 epoch : 0.6952798942623154
cost for 160 epoch : 0.6830429333145113
cost for 170 epoch : 0.6716420592393022
cost for 180 epoch : 0.6609940300949891
cost for 190 epoch : 0.6510254664892866
cost for 200 epoch : 0.6416715780106838
cost for 210 epoch : 0.6328750534977968
cost for 220 epoch : 0.624585097856329
cost for 230 epoch : 0.616756598147818
cost for 240 epoch : 0.6093494026119763
cost for 250 epoch : 0.6023276976868681
cost for 260 e

array([[ 0.36043894,  0.49106346, -0.8515024 ],
       [ 0.74121438,  0.62128861, -1.36250298],
       [ 1.77970306, -0.24955262, -1.53015043],
       [-2.44103432, -0.12703503,  2.56806935],
       [-1.11607952, -0.69511384,  1.81119335]])

In [82]:
predictions = logreg.predict(X)

In [83]:
np.sum(y == predictions.squeeze()) / len(y)

0.9866666666666667

In [84]:
predictions.squeeze()

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])