## Logistic Regression with L1 Regularization

In [1]:
import numpy as np
import pandas as pd

np.set_printoptions(precision=4, suppress=True)

In [2]:
from sklearn.datasets import load_iris
dataset = load_iris()
X = dataset.data
y = dataset.target

target_names = list(dataset.target_names)
print(target_names)


['setosa', 'versicolor', 'virginica']


In [3]:
# Change to binary class
y = (y > 0).astype(int)
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [4]:
# Model: Linear Regression
class LogReg:
    """
    This implementation of Logistic Regression uses batch gradient descent with regularization.
    """
    def __init__(self, num_iters=100, tolerance = 1e-10, alpha=0.00001, lambd=10, threshold=0.5, verbose=False):
        self.num_iters = num_iters
        self.alpha = alpha # Learning rate
        self.lambd = lambd # Regularization parameter
        self.tolerance = tolerance
        self.threshold = threshold
        self.verbose = verbose
        
    def add_ones(self, X):
        return np.concatenate((np.ones((len(X),1)), X), axis = 1)
      
    def sigmoid(self, X, theta):
        return 1/(1 + np.exp(X@theta))
    
    def cost(self, X, y_true):
        m = X.shape[0]
        y_hat = self.sigmoid(X, self.theta)
        temp_theta = self.theta[1:].copy()
        
        Cost = np.sum(-1*y_true*np.log(y_hat)-(1-y_true)*np.log(1-y_hat)) + self.lambd * np.sum(np.abs(temp_theta))
        
        return Cost
    
    def fit(self, X, y):
        X = X.copy()
        X = self.add_ones(X)
        y = y.reshape(-1, 1)
        
        self.theta = np.zeros((len(X[0]), 1))
        
        current_iter = 1
        norm = 1
        while (norm >= self.tolerance and current_iter < self.num_iters):
            old_theta = self.theta.copy()
            #grad = np.dot(np.transpose(y_hat-self.y), self.X)
            temp_theta = self.theta[1:].copy()
            
            grad = X.T@(y - self.sigmoid(X, self.theta)) + self.lambd * np.sum(np.sign(temp_theta))
            grad= grad.reshape(-1, 1)
            
            self.theta = self.theta - self.alpha*grad
            
            if self.verbose and (current_iter%100 == 0):
                print(f'cost for {current_iter} iteration : {self.cost(X, y)}')
            norm = np.linalg.norm(old_theta - self.theta)
            current_iter += 1
            
        return self.theta
    
    def evaluate(self, X, y):
        """
        Returns mse loss for a dataset evaluated on the hypothesis
        """
        X = self.add_ones(X)
        return self.cost(X, y)
    
    def predict(self, X):
        prob = self.predict_proba(X)
        return (prob > self.threshold).astype(int)
        
    def predict_proba(self, X):
        """
        Returns probability of predictions.
        """
        X = self.add_ones(X)  
        return self.sigmoid(X, self.theta)

In [5]:
logreg = LogReg(verbose=True, num_iters=5500)

In [6]:
logreg.fit(X, y)

cost for 100 iteration : 81.86845645452154
cost for 200 iteration : 75.69639362386795
cost for 300 iteration : 70.86446051377749
cost for 400 iteration : 66.75641561522221
cost for 500 iteration : 63.229183428839
cost for 600 iteration : 60.19046541639731
cost for 700 iteration : 57.565248049021044
cost for 800 iteration : 55.29047068140731
cost for 900 iteration : 53.313106222444105
cost for 1000 iteration : 51.58870719451656
cost for 1100 iteration : 50.08008738480574
cost for 1200 iteration : 48.75614167476681
cost for 1300 iteration : 47.59082189285532
cost for 1400 iteration : 46.56226869803543
cost for 1500 iteration : 45.65208702420064
cost for 1600 iteration : 44.844747154127276
cost for 1700 iteration : 44.1270924149807
cost for 1800 iteration : 43.487935655357575
cost for 1900 iteration : 42.917728778764314
cost for 2000 iteration : 42.4082919663456
cost for 2100 iteration : 41.9525914749114
cost for 2200 iteration : 41.54455689633133
cost for 2300 iteration : 41.178930467754

array([[ 0.8499],
       [-0.11  ],
       [ 1.1056],
       [-1.322 ],
       [-0.    ]])

In [7]:
predictions = logreg.predict(X)
predictions = predictions.squeeze()

In [8]:
np.sum(y == predictions) / len(y)

1.0

In [9]:
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])