## Logistic Regression with Softmax

In [20]:
import numpy as np
import pandas as pd

In [21]:
from sklearn.datasets import load_iris
dataset = load_iris()
X = dataset.data
y = dataset.target

target_names = list(dataset.target_names)
print(target_names)

['setosa', 'versicolor', 'virginica']


In [13]:
class LogReg:
    """
    This implementation of Logistic Regression uses batch gradient descent with regularization.
    """
    def __init__(self, num_iters=1000, tolerance = 1e-10, alpha=0.001, lambd=0, threshold=0.5, verbose=False):
        self.num_iters = num_iters
        self.alpha = alpha # Learning rate
        self.lambd = lambd # Regularization parameter
        self.tolerance = tolerance
        self.threshold = threshold
        self.verbose = verbose
        
    def add_ones(self, X):
        return np.concatenate((np.ones((len(X),1)), X), axis = 1)
      
    def sigmoid(self, X, theta):
        return 1/(1 + np.exp(X@theta))
    
    def cost(self, X, y_true):
        m = X.shape[0]
        y_hat = self.sigmoid(X, self.theta)
        temp_theta = self.theta[:, 1:].copy()
        
        Cost = np.sum(-1*y_true*np.log(y_hat)-(1-y_true)*np.log(1-y_hat)) + self.lambd * np.sum(temp_theta**2)
        
        return Cost
    
    def fit(self, X, y):
        X = X.copy()
        X = self.add_ones(X)
        
        n, d = X.shape
        
        self.classes = np.unique(y)
        self.no_classes = len(self.classes)
        
        # Turn y into one-hot-labels if number of classes is greater than 2
        if self.no_classes > 2:
            y_encode = np.zeros((n, self.no_classes))
            y_encode[range(n), y] = 1 #numpy advanced indexing
            y = y_encode
        else:
            y = y.reshape(-1, 1)        
        
        if self.no_classes > 2:
            self.theta = np.zeros((d, self.no_classes))
        else:
            self.theta = np.zeros((d, 1))
        
        current_iter = 1
        norm = 1
        while (norm >= self.tolerance and current_iter < self.num_iters):
            old_theta = self.theta.copy()
            
            temp_theta = self.theta[:, 1:].copy()
            grad = X.T@(y - self.sigmoid(X, self.theta)) + self.lambd * np.sum(temp_theta)
            
            if self.no_classes <= 2:
                grad= grad.reshape(-1, 1)
            
            self.theta = self.theta - self.alpha*grad
            
            if self.verbose and (current_iter%500 == 0):
                print(f'cost for {current_iter} iteration : {self.cost(X, y)}')
            norm = np.linalg.norm(old_theta - self.theta)
            current_iter += 1
            
        return self.theta
    
    def evaluate(self, X, y):
        """
        Returns mse loss for a dataset evaluated on the hypothesis
        """
        X = self.add_ones(X)
        return self.cost(X, y)
    
    def predict(self, X):
        proba = self.predict_proba(X)
        if self.no_classes > 2:
            # Multiclass classification
            y_hat = np.argmax(proba, axis=1)
        elif self.no_classes == 2:
            # Binary classification
            y_hat = (proba >= self.threshold).astype(int)
        return y_hat
        
    def predict_proba(self, X):
        """
        Returns probability of predictions.
        """
        X = self.add_ones(X)
        
        return self.sigmoid(X, self.theta)

## Logistic Regression with Mini-Batch GD

In [36]:
class LogReg:
    """
    This implementation of Logistic Regression uses batch gradient descent with regularization.
    """
    def __init__(self, 
                 epochs=100, 
                 tolerance = 1e-10, 
                 alpha=0.001, 
                 lambd=0, 
                 threshold=0.5, 
                 verbose=False,
                 minibatch_size=30,
                ):
        self.epochs = epochs
        self.alpha = alpha # Learning rate
        self.lambd = lambd # Regularization parameter
        self.tolerance = tolerance
        self.threshold = threshold
        self.verbose = verbose
        self.minibatch_size = minibatch_size
        
    def add_ones(self, X):
        return np.concatenate((np.ones((len(X),1)), X), axis = 1)
      
    def sigmoid(self, X, theta):
        return 1/(1 + np.exp(X@theta))
    
    def cost(self, X, y_true):
        m = X.shape[0]
        y_hat = self.sigmoid(X, self.theta)
        temp_theta = self.theta[:, 1:].copy()
        
        Cost = np.sum(-1*y_true*np.log(y_hat)-(1-y_true)*np.log(1-y_hat)) + self.lambd * np.sum(temp_theta**2)
        
        return Cost
    
    def get_minibatch(self, X, y,  minibatch):
        X_mb = X[minibatch*self.minibatch_size: (minibatch+1)*self.minibatch_size]
        y_mb = y[minibatch*self.minibatch_size: (minibatch+1)*self.minibatch_size]
        return X_mb, y_mb
    
    def fit(self, X, y):
        X = X.copy()
        X = self.add_ones(X)
        
        n, d = X.shape
        
        self.classes = np.unique(y)
        self.no_classes = len(self.classes)
        
        # Turn y into one-hot-labels if number of classes is greater than 2
        if self.no_classes > 2:
            y_encode = np.zeros((n, self.no_classes))
            y_encode[range(n), y] = 1 #numpy advanced indexing
            y = y_encode
        else:
            y = y.reshape(-1, 1)        
        
        if self.no_classes > 2:
            self.theta = np.zeros((d, self.no_classes))
        else:
            self.theta = np.zeros((d, 1))
        
        current_epoch = 1
        norm = 1
        
        no_of_minibatch = int(n/self.minibatch_size)
        
        while (norm >= self.tolerance and current_epoch < self.epochs):
            # Shuffle X for minibatch gradient descent
            shuffled_index = np.random.permutation(n)
            X_shuffled = X[shuffled_index]
            y_shuffled = y[shuffled_index]
            
            old_theta = self.theta.copy()
            theta_wo_bias = self.theta[:, 1:].copy()
            
            for mb in range(no_of_minibatch):
                X_mb, y_mb = self.get_minibatch(X, y, mb)
                
                grad = X_mb.T@(y_mb - self.sigmoid(X_mb, self.theta)) + self.lambd * np.sum(theta_wo_bias)
                
                if self.no_classes <= 2:
                    grad= grad.reshape(-1, 1)
            
                self.theta = self.theta - self.alpha*grad
            
            if self.verbose and (current_epoch%100 == 0):
                print(f'cost for {current_epoch} epoch : {self.cost(X, y)}')
            norm = np.linalg.norm(old_theta - self.theta)
            current_epoch += 1
            
        return self.theta
    
    def evaluate(self, X, y):
        """
        Returns mse loss for a dataset evaluated on the hypothesis
        """
        X = self.add_ones(X)
        return self.cost(X, y)
    
    def predict(self, X):
        proba = self.predict_proba(X)
        if self.no_classes > 2:
            # Multiclass classification
            y_hat = np.argmax(proba, axis=1)
        elif self.no_classes == 2:
            # Binary classification
            y_hat = (proba >= self.threshold).astype(int)
        return y_hat
        
    def predict_proba(self, X):
        """
        Returns probability of predictions.
        """
        X = self.add_ones(X)
        
        return self.sigmoid(X, self.theta)

In [56]:
logreg = LogReg(epochs=9000, 
                alpha=0.001, 
                verbose=True, 
                minibatch_size=20,
               )

In [57]:
logreg.fit(X, y)

cost for 100 epoch : 135.59525985436156
cost for 200 epoch : 121.23009983356812
cost for 300 epoch : 113.79842079017831
cost for 400 epoch : 109.12447470126725
cost for 500 epoch : 105.89970812105123
cost for 600 epoch : 103.53725844231357
cost for 700 epoch : 101.72803765520975
cost for 800 epoch : 100.29308217606992
cost for 900 epoch : 99.12191340628333
cost for 1000 epoch : 98.14301442596208
cost for 1100 epoch : 97.30832151431306
cost for 1200 epoch : 96.58451643840516
cost for 1300 epoch : 95.9478825413426
cost for 1400 epoch : 95.38114079198942
cost for 1500 epoch : 94.87143794739103
cost for 1600 epoch : 94.40903094350807
cost for 1700 epoch : 93.98640555707738
cost for 1800 epoch : 93.5976733701057
cost for 1900 epoch : 93.23815132045026
cost for 2000 epoch : 92.90406354016872
cost for 2100 epoch : 92.59232661496557
cost for 2200 epoch : 92.3003926991976
cost for 2300 epoch : 92.02613336297057
cost for 2400 epoch : 91.7677525139346
cost for 2500 epoch : 91.52372033949058
cost 

array([[-0.52603372, -5.82116714,  5.08083386],
       [-0.82769709, -0.28662784,  4.14958303],
       [-2.94848302,  2.82319963,  4.1482948 ],
       [ 4.5690382 , -0.72198485, -6.41348991],
       [ 2.08991479,  2.40110809, -6.87615605]])

In [58]:
predictions = logreg.predict(X)

(150, 3)


In [59]:
np.sum(y == predictions.squeeze()) / len(y)

0.98

In [60]:
predictions.squeeze()

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [10]:
import numpy as np
np.random.seed(0)
x = np.array([2, 4, 5, 9, 10])

it = np.random.permutation(5)
it

array([2, 0, 1, 3, 4])

In [12]:
x[it]

array([ 5,  2,  4,  9, 10])