# Logistic Regression from scratch without using scikit learn

**Importing the dataset**

In [33]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
data = sm.datasets.fair.load_pandas().data
data['affairs'] = (data.affairs > 0).astype(int)
x = data.drop('affairs', axis = 1)
y = data['affairs']

** Defining logistic regression class, with all functions ** 

In [34]:
import numpy as np
import pandas as pd
class logisticRegression():
    
    def __init__(self):
        self.coefficient = 0
        
    def sigmoid(self, array):
        return 1 / (1 + np.exp(-array)).round(3)
        
    def fit(self, x, y, alpha, threshold):
        
        """ x: input pandas dataframe
        y: output pandas series (Class labels)
 
        step1: Check if the number of rows in x and y are the same. If not raise a value error with a message."""
        
        if x.shape[0] != y.shape[0]:
            raise ValueError("The number of rows in input dataframe and output series is not equal")
        else:
            pass
        
        """step2: Check if there is any missing value in the dataset (both for x and y). 
               If there is, raise a value error with a message."""
        
        if x.isnull().values.any() == True:
            raise ValueError("The input dataframe should not have missing values")
        else:
            pass
        
        if y.isnull().values.any() == True:
            raise ValueError("The output should not have missing values")
        else:
            pass
        
        """step3: Check if there is any categorical value in x or y. If there is, raise a value error with a message."""
        for c in x.columns:
            if x[c].dtype == 'object':
                raise ValueError("Input dataframe should not have categorical variables")
            else:
                pass
        
        if y.dtype == 'object':
                raise ValueError("The output should not be categorical")
        else:
            pass
        
        """step4: Transform both x and y into numpy.arrays (it is easier to work with arrays for matrix operations)."""
        x = x.values
        y = y.values
        y = y.reshape(6366,1)
        
        """step5: Add bias to the input vector x. bias means add a column which is 1 across al the rows.
               This will increase the number of columns of x by 1. x.shape[1] will increase by 1."""
        bias = np.ones((x.shape[0],1))
        x = np.concatenate((bias, x), axis = 1)
        """step6: initialize self.coef.
               You can initialize the coef randomly. 
               Use numpy.random.rand(size) or np.random.uniform(low=-1, high=1, size=(x.shape[1])).
               Think about the size of the coefficent array. 
               Logically, you need to have a coefficient for each input variable as well as the bias.""" 
        
        coefficient = np.random.uniform(low=-1, high=1, size=(x.shape[1],1))
        
        """step7: create a list to save the cost values for each iteration."""
        cost = []
        
        """step8: while not converged and iteration number > 10000
                    calculate the predicted values
                    calculate the error 
                    calculate the cost function and append it to the cost list
                    calculate the gradient in a way that gradient is 
                                      gradient = (t(x) * (error))/(size_of_x) (number of rows)
                    adjust the coef in a way that
                                        coef = coef - alpha*gradient
                    adjust alpha in a way that
                                        alpha = alpha*0.95"""
        iteration = 1
        not_converged = True
        
        while not_converged:
            product = np.dot(x,coefficient)
            y_pred = self.sigmoid(product)
            error = y - y_pred
            a = np.sum((-y*np.log(y_pred+0.000001)) - ((1-y)*np.log(1-y_pred+0.00001)))/ x.shape[0]
            cost = np.append(cost,a)
            gradient = np.dot(x.T,error)/x.shape[0]
            coefficient = coefficient - (alpha*gradient)
            alpha = alpha * 0.95
            iteration = iteration + 1
            
            """step 8: Check if the convergence criteria is satisfied:
                if you iterate at least as many times 10000
                if the difference between the average of the last 5 cost values and the last cost value 
                is less than the threshold.
                You will not need to return anything because you are working on the coefs, which are class attributes
            """
            iteration = iteration + 1
            b = abs(cost[-5:].mean() - cost[-1])
            if b < threshold and iteration >5:
                not_converged = False
            else:
                pass
            
        self.coefficient = coefficient
    
    
    def predict_prob(self, x):
        
        """Convert x into numpy aray and add bias
        Check if size of self.coef is the same with the number of columns in x
        Using x and self.coef, make the predictions
        """
        
        x = x.values
        bias = np.ones((x.shape[0],1))
        x = np.concatenate((bias, x), axis = 1)
        product = np.dot(x,self.coefficient)
        y_pred = (1/(1+np.exp(-product))).round(3)      
        
        
        return y_pred
    
    
    def predict_class(self, x):
        
        """Make discrete predictions. Instead of returning probabilities return 0 or 1.
        """
        
        y_pred = self.predict_prob(x) 
        y_pred[y_pred>=0.5] = 1
        y_pred[y_pred<0.5] = 0
        
        
        
        return y_pred
    
    def get_accuracy(self, x, y):
        
        """Calculate the accuracy rate
        number of true classification/total number of instances
        number of true classification is True positive + True negative
        """
        
        y = y.values
        y = y.reshape(6366,1)
        y_pred = self.predict_class(x)
        accuracy = float(np.sum(y==y_pred)/float(x.shape[0])) * 100
        
        
        return accuracy

** Testing the Accuracy **

In [36]:
data['affairs'] = (data.affairs > 0).astype(int)
x = data.drop('affairs', axis = 1)
y = data['affairs']
LR = logisticRegression()
LR.fit(x, y, 0.99, 0.001) 
print(LR.get_accuracy(x, y))
class_labels_pred = LR.predict_class(x)
print(class_labels_pred)

67.75054979579014
[[ 0.]
 [ 0.]
 [ 0.]
 ..., 
 [ 0.]
 [ 0.]
 [ 0.]]


  if __name__ == '__main__':
  if __name__ == '__main__':
