## Logistic Regression with Softmax

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import load_iris
dataset = load_iris()
X = dataset.data
y = dataset.target

target_names = list(dataset.target_names)
print(target_names)

['setosa', 'versicolor', 'virginica']


In [3]:
#y = (y>1).astype(int)

In [4]:
# Model: Linear Regression
class LogReg:
    """
    This implementation of Logistic Regression uses batch gradient descent with regularization.
    """
    def __init__(self, num_iters=1000, tolerance = 1e-10, alpha=0.001, lambd=0, threshold=0.5, verbose=False):
        self.num_iters = num_iters
        self.alpha = alpha # Learning rate
        self.lambd = lambd # Regularization parameter
        self.tolerance = tolerance
        self.threshold = threshold
        self.verbose = verbose
        
    def add_ones(self, X):
        return np.concatenate((np.ones((len(X),1)), X), axis = 1)
      
    def sigmoid(self, X, theta):
        return 1/(1 + np.exp(X@theta))
    
    def cost(self, X, y_true):
        m = X.shape[0]
        y_hat = self.sigmoid(X, self.theta)
        temp_theta = self.theta[:, 1:].copy()
        
        Cost = np.sum(-1*y_true*np.log(y_hat)-(1-y_true)*np.log(1-y_hat)) + self.lambd * np.sum(temp_theta**2)
        
        return Cost
    
    def fit(self, X, y):
        X = X.copy()
        X = self.add_ones(X)
        
        n, d = X.shape
        
        self.classes = np.unique(y)
        self.no_classes = len(self.classes)
        
        # Turn y into one-hot-labels if number of classes is greater than 2
        if self.no_classes > 2:
            y_encode = np.zeros((n, self.no_classes))
            y_encode[range(n), y] = 1 #numpy advanced indexing
            y = y_encode
        else:
            y = y.reshape(-1, 1)        
        
        if self.no_classes > 2:
            self.theta = np.zeros((d, self.no_classes))
        else:
            self.theta = np.zeros((d, 1))
        
        current_iter = 1
        norm = 1
        while (norm >= self.tolerance and current_iter < self.num_iters):
            old_theta = self.theta.copy()
            
            temp_theta = self.theta[:, 1:].copy()
            grad = X.T@(y - self.sigmoid(X, self.theta)) + self.lambd * np.sum(temp_theta)
            
            if self.no_classes <= 2:
                grad= grad.reshape(-1, 1)
            
            self.theta = self.theta - self.alpha*grad
            
            if self.verbose and (current_iter%500 == 0):
                print(f'cost for {current_iter} iteration : {self.cost(X, y)}')
            norm = np.linalg.norm(old_theta - self.theta)
            current_iter += 1
            
        return self.theta
    
    def evaluate(self, X, y):
        """
        Returns mse loss for a dataset evaluated on the hypothesis
        """
        X = self.add_ones(X)
        return self.cost(X, y)
    
    def predict(self, X):
        proba = self.predict_proba(X)
        print(proba.shape)
        if self.no_classes > 2:
            # Multiclass classification
            y_hat = np.argmax(proba, axis=1)
        elif self.no_classes == 2:
            # Binary classification
            y_hat = (proba >= self.threshold).astype(int)
        return y_hat
        
    def predict_proba(self, X):
        """
        Returns probability of predictions.
        """
        X = self.add_ones(X)
        
        return self.sigmoid(X, self.theta)

In [5]:
logreg = LogReg(num_iters=30000, alpha=0.001, verbose=True)

In [6]:
logreg.fit(X, y)

cost for 500 iteration : 100.8893466253054
cost for 1000 iteration : 93.55043991296125
cost for 1500 iteration : 90.4218146105867
cost for 2000 iteration : 88.55315404898136
cost for 2500 iteration : 87.26301539831763
cost for 3000 iteration : 86.30342396795784
cost for 3500 iteration : 85.55702321969936
cost for 4000 iteration : 84.95868830753454
cost for 4500 iteration : 84.46840852565802
cost for 5000 iteration : 84.05981985747101
cost for 5500 iteration : 83.71463780861362
cost for 6000 iteration : 83.41967801922067
cost for 6500 iteration : 83.16514118749294
cost for 7000 iteration : 82.94356714012383
cost for 7500 iteration : 82.74916598970448
cost for 8000 iteration : 82.57737306956281
cost for 8500 iteration : 82.42454263861949
cost for 9000 iteration : 82.28773098078418
cost for 9500 iteration : 82.16453903552011
cost for 10000 iteration : 82.05299583890223
cost for 10500 iteration : 81.9514706587301
cost for 11000 iteration : 81.85860575462758
cost for 11500 iteration : 81.77

array([[-0.62738981, -7.31876769,  9.99124547],
       [-0.99903598,  0.23820684,  4.1390738 ],
       [-3.48622877,  2.78764351,  5.06871866],
       [ 5.45243564, -1.31128207, -6.85186537],
       [ 2.55914269,  2.77782575, -9.96363615]])

In [7]:
predictions = logreg.predict(X)

(150, 3)


In [8]:
np.sum(y == predictions.squeeze()) / len(y)

0.98

In [9]:
predictions.squeeze()

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])