In [43]:
import numpy as np
from sklearn.preprocessing import scale

class LogisticClassifierGradient:
    """Logistic classifier trained with finite-difference gradient descent.

    The model standardizes the features, prepends an intercept column of ones,
    and minimizes the sum of squared differences between ``sigmoid(X @ w)`` and
    the labels. The gradient of that loss is estimated numerically with forward
    differences rather than derived analytically.

    Attributes:
        w: Learned weight vector (intercept first), or ``None`` before ``fit``.
        mean_: Per-feature mean of the training data, set by ``fit``.
        std_: Per-feature standard deviation of the training data (zeros
            replaced by 1), set by ``fit``.
    """

    def __init__(self):
        self.w = None
        # Training-set statistics; predict() reuses them so that new data is
        # put on the same scale the weights were learned on.
        self.mean_ = None
        self.std_ = None

    def fit(self, X, y, w0, alpha, h, tolerance, max_it):
        """Learn the weights of the model.

        Args:
            X: 2-D array of features, one row per sample.
            y: 1-D array of targets, one per row of ``X``.
            w0: Initial weight vector of length ``X.shape[1] + 1`` (the extra
                entry is the intercept). It is not modified.
            alpha: Learning rate (step size) for gradient descent.
            h: Step used for the forward-difference gradient estimate.
            tolerance: Gradient descent stops once the gradient norm drops
                below this value.
            max_it: Maximum number of gradient descent iterations.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        n = X.shape[0]

        # Remember the training statistics so predict() can apply the very
        # same transformation instead of re-estimating it from new data.
        self.mean_, self.std_ = self._column_stats(X)
        X = (X - self.mean_) / self.std_
        X = np.hstack((np.ones([n, 1]), X))  # intercept column first

        def L(w):
            # Sum of squared residuals; the sigmoid is evaluated only once.
            residual = self.sigmoid(X @ w) - y
            return residual.T @ residual

        # learn weights
        self.w = self.gradientDescent(L, w0, alpha, h, tolerance, max_it)

    def predict(self, X):
        """Return ``sigmoid(X @ w)`` for each row of ``X``.

        The features are standardized with the statistics stored by ``fit``.
        If no statistics are stored (for example ``w`` was assigned by hand),
        the features are standardized with their own mean and standard
        deviation instead.

        Raises:
            RuntimeError: If the model has no weights yet.
        """
        if self.w is None:
            raise RuntimeError("Model has no weights; call fit() before predict().")

        X = np.asarray(X, dtype=float)
        n = X.shape[0]

        mean = getattr(self, "mean_", None)
        std = getattr(self, "std_", None)
        if mean is None or std is None:
            X = self.standardize(X)
        else:
            X = (X - mean) / std

        X = np.hstack((np.ones([n, 1]), X))  # intercept column first
        return self.sigmoid(X @ self.w)

    def sigmoid(self, z):
        """Logistic function ``1 / (1 + exp(-z))``, evaluated without overflow."""
        z = np.asarray(z, dtype=float)
        # exp is only ever taken of a non-positive number, so it cannot
        # overflow; the two branches are algebraically the same function.
        e = np.exp(-np.abs(z))
        result = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
        # Hand back a scalar for scalar input, an array otherwise.
        return result if result.ndim else result[()]

    def gradientDescent(self, f, x0, alpha, h, tolerance, max_iterations):
        """Run gradient descent to minimize ``f``.

        Args:
            f: Function of a 1-D array returning a scalar.
            x0: Initial guess. It is copied, never modified.
            alpha: Learning rate.
            h: Step for the forward-difference gradient estimate.
            tolerance: Stop when the gradient norm is below this value.
            max_iterations: Maximum number of iterations.

        Returns:
            The final iterate as a float array. If the iteration limit is
            reached first, the last iterate is still returned (a message is
            printed); with ``max_iterations <= 0`` this is a copy of ``x0``.
        """
        # Float copy: keeps the caller's array intact and allows integer-typed
        # initial guesses, which cannot be updated in place with float steps.
        x = np.array(x0, dtype=float)

        # take up to max_iterations number of steps
        for counter in range(max_iterations):
            gradient = self.computeGradient(f, x, h)
            grad_norm = np.linalg.norm(gradient)

            if counter % 1000 == 0:
                print(f"iter {counter:5d} | loss {f(x):.6f} | grad norm {grad_norm:.6f}")

            # stop if the norm of the gradient is near 0
            if grad_norm < tolerance:
                print('Gradient descent took', counter, 'iterations to converge')
                print('The norm of the gradient is', grad_norm)
                return x

            # if we do not converge, print a message
            if counter == max_iterations - 1:
                print("Gradient descent failed")
                print('The gradient is', gradient)
                # return x, sometimes it is still pretty good
                return x

            # take a step in the opposite direction as the gradient
            x -= alpha * gradient

        # Only reached when no iteration ran at all.
        return x

    def computeGradient(self, f, x, h):
        """Estimate the gradient of ``f`` at ``x`` with forward differences.

        Args:
            f: Function of a 1-D array returning a scalar.
            x: Point at which to estimate the gradient. It is not modified.
            h: Step added to one coordinate at a time.

        Returns:
            1-D float array with one partial-derivative estimate per
            coordinate of ``x``.
        """
        x = np.asarray(x, dtype=float)
        n = len(x)
        gradient = np.zeros(n)

        # compute f at current point
        fx = f(x)

        # One scratch copy; each coordinate is nudged and then restored.
        xUp = x.copy()
        for counter in range(n):
            xUp[counter] += h
            gradient[counter] = (f(xUp) - fx) / h
            xUp[counter] = x[counter]

        return gradient

    def standardize(self, X):
        """Return ``X`` with each column shifted to mean 0 and scaled to std 1.

        The statistics are computed from ``X`` itself. Columns with zero
        standard deviation are only centered.
        """
        X = np.asarray(X, dtype=float)
        X_mean, X_std = self._column_stats(X)
        return (X - X_mean) / X_std

    @staticmethod
    def _column_stats(X):
        """Return per-column mean and std of ``X``, with zero stds set to 1."""
        X_mean = X.mean(axis=0)
        X_std = X.std(axis=0)
        # Guard against division by zero for constant columns.
        X_std = np.where(X_std == 0, 1.0, X_std)
        return X_mean, X_std

In [45]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the breast-cancer dataset and pull out the feature matrix and labels.
data = load_breast_cancer()
X, y = data['data'], data['target']

# Hold out 25% of the samples for testing, stratified on the labels, with a
# fixed seed so the split is the same on every run.
trainX, testX, trainY, testY = train_test_split(
    X, y, test_size=0.25, random_state=0, stratify=y
)

# The tolerance below is fairly loose: this finite-difference gradient descent
# converges slowly on this problem, so a different optimization algorithm may
# be a better choice. The iteration cap is set high enough to be effectively
# unlimited, so the tolerance is what ends the run.
initial_weights = np.zeros(X.shape[1] + 1, dtype=float)  # one per feature + intercept

model = LogisticClassifierGradient()
model.fit(
    trainX,
    trainY,
    initial_weights,
    alpha=0.001,
    h=1e-4,
    tolerance=0.1,
    max_it=10000000000000,
)

iter     0 | loss 106.500000 | grad norm 306.144048
iter  1000 | loss 4.589987 | grad norm 1.031394
iter  2000 | loss 3.937777 | grad norm 0.648580
iter  3000 | loss 3.613718 | grad norm 0.503511
iter  4000 | loss 3.401874 | grad norm 0.422362
iter  5000 | loss 3.246229 | grad norm 0.369360
iter  6000 | loss 3.124085 | grad norm 0.331117
iter  7000 | loss 3.024326 | grad norm 0.301491
iter  8000 | loss 2.940724 | grad norm 0.277408
iter  9000 | loss 2.869394 | grad norm 0.257194
iter 10000 | loss 2.807715 | grad norm 0.239856
iter 11000 | loss 2.753814 | grad norm 0.224754
iter 12000 | loss 2.706296 | grad norm 0.211447
iter 13000 | loss 2.664093 | grad norm 0.199617
iter 14000 | loss 2.626365 | grad norm 0.189020
iter 15000 | loss 2.592446 | grad norm 0.179469
iter 16000 | loss 2.561794 | grad norm 0.170813
iter 17000 | loss 2.533966 | grad norm 0.162932
iter 18000 | loss 2.508597 | grad norm 0.155725
iter 19000 | loss 2.485380 | grad norm 0.149110
iter 20000 | loss 2.464057 | grad no