In [135]:
import math
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as ns
from tqdm import tqdm
import matplotlib.pyplot as plt


In [181]:
class Logistic_Regression:
    """Binary logistic regression trained with full-batch gradient descent.

    The model is ``P(y = 1 | x) = sigmoid(x . w)``. No intercept column is
    added automatically: ``fit`` trains directly on ``X_train`` as given.
    ``addX0`` is available to build a design matrix with a bias column, but
    it is not called by ``fit``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training features.
    y : array-like of shape (n_samples,)
        Training labels; ``evaluate`` treats the value 1 as the positive class.
    learningRate : float
        Step size used for each gradient-descent update.
    tolerance : float
        Stored on the instance. The optimiser currently always runs
        ``maxIteration`` steps and does not stop early based on this value.
    maxIteration : int
        Number of gradient-descent updates performed by ``fit``.
    """

    # Axis ranges shared by all plotting helpers (x1 axis, x2 axis).
    _X1_RANGE = (55, 80)
    _X2_RANGE = (80, 240)

    def __init__(self, X, y, learningRate = 0.00001, tolerance = 0.00005,maxIteration = 5000 ):
        self.X_train = X
        self.y_train = y
        self.tolerance = tolerance
        self.maxIteration = maxIteration
        self.learningRate = learningRate

    def addX0(self):
        """Return the training features with a leading column of ones."""
        # Use the instance's data; the bare name `X` is not defined in this scope.
        features = self.X_train
        return np.column_stack([np.ones([features.shape[0], 1]), features])

    def sigmoid(self, z):
        """Return the logistic function 1 / (1 + exp(-z)), elementwise."""
        return 1 / (1 + np.exp(-z))

    def costFunction (self, X, y):
        """Return the negative log-likelihood of ``y`` given ``X`` and ``self.w``.

        Computed as ``sum(log(1 + exp(z_i)) - y_i * z_i)`` with ``z = X . w``.
        """
        z = X.dot(self.w)
        # logaddexp(0, z) == log(1 + exp(z)) without overflowing for large z.
        # The label term must be elementwise (y * z); a dot product here would
        # yield a scalar that is subtracted once per sample.
        return (np.logaddexp(0, z) - y * z).sum()

    def gradient(self, X, y):
        """Return the gradient of the cost with respect to ``self.w``."""
        residual = self.sigmoid(X.dot(self.w)) - y
        return residual.dot(X)

    def gradientDescent(self, X, y):
        """Run ``maxIteration`` gradient-descent updates on ``self.w``.

        The cost after every update is recorded in ``self.errors`` so that
        convergence can be inspected after training.
        """
        errors = []
        for _ in tqdm(range(self.maxIteration)):
            self.w = self.w - self.learningRate * self.gradient(X, y)
            errors.append(self.costFunction(X, y))
        self.errors = errors

    def predict(self, X):
        """Return class predictions (0. or 1.) by rounding the probabilities."""
        return np.around(self.sigmoid(X.dot(self.w)))

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Return numerator / denominator, or 0.0 when the denominator is 0."""
        return numerator / denominator if denominator else 0.0

    def evaluate(self, y, y_hat):
        """Print and return ``(recall, precision, accuracy)``.

        Both arrays are compared against 1 to obtain boolean masks. A metric
        whose denominator is zero (e.g. precision when nothing is predicted
        positive) is reported as 0.0 instead of nan.
        """
        y = (y == 1)
        y_hat = (y_hat == 1)

        true_positives = (y & y_hat).sum()
        accuracy = self._safe_ratio((y == y_hat).sum(), y.size)
        precision = self._safe_ratio(true_positives, y_hat.sum())
        recall = self._safe_ratio(true_positives, y.sum())

        print('accuracy was:' +str(accuracy) )
        print('precision was:' +str(precision) )
        print('recall was:' +str(recall) )
        return recall, precision, accuracy

    def fit(self):
        """Train on the stored training data, then print weights and metrics."""
        # Start from an all-zero weight vector, one weight per feature column.
        self.w = np.zeros(self.X_train.shape[1], dtype=np.float64)
        self.gradientDescent(self.X_train, self.y_train)

        print(self.w)

        y_hat_train = self.predict(self.X_train)
        self.evaluate(self.y_train, y_hat_train)

    def _probabilityGrid(self, num=250):
        """Return ``(xx, yy, probs)`` sampled on a ``num`` x ``num`` mesh.

        ``probs`` holds the predicted probability at every mesh point. The
        mesh has two coordinates, so ``self.w`` must contain exactly two
        weights.
        """
        x_min, x_max = self._X1_RANGE
        y_min, y_max = self._X2_RANGE
        xx, yy = np.meshgrid(np.linspace(x_min, x_max, num),
                             np.linspace(y_min, y_max, num))
        grid = np.c_[xx.ravel(), yy.ravel()]
        # Apply the sigmoid so the values really are probabilities; the raw
        # linear score would put the 0.5 contour in the wrong place.
        probs = self.sigmoid(grid.dot(self.w)).reshape(xx.shape)
        return xx, yy, probs

    def plot(self):
        """3D scatter of the training points against their predicted probability."""
        plt.figure(figsize=(12, 8))
        ax = plt.axes(projection='3d')

        # Height of each point is the model's probability for that sample.
        ax.scatter3D(self.X_train[:, 0], self.X_train[:, 1],
                     self.sigmoid(self.X_train.dot(self.w)),
                     c=self.y_train[:], cmap='viridis', s=100)

        ax.set_xlim3d(*self._X1_RANGE)
        ax.set_ylim3d(*self._X2_RANGE)
        plt.xlabel('$x_1$ feature', fontsize=15)
        plt.ylabel('$x_2$ feature', fontsize=15, )
        ax.set_zlabel('$P(Y = 1|x_1, x_2)$', fontsize=15, rotation = 0)

    def scatterPlt(self):
        """2D scatter of the training data with the P = 0.5 decision boundary."""
        xx, yy, probs = self._probabilityGrid()

        f, ax = plt.subplots(figsize=(14,12))

        # The single contour level 0.5 is the decision boundary.
        ax.contour(xx, yy, probs, levels=[0.5], cmap="Greys", vmin=0, vmax=.6)

        ax.scatter(self.X_train[:, 0], self.X_train[:, 1],
                   c=self.y_train[:], s=50,
                   cmap="RdBu", vmin=-.2, vmax=1.2,
                   edgecolor="white", linewidth=1)

        plt.xlabel('x1 feature')
        plt.ylabel('x2 feature')

    def plot3D(self):
        """3D contour of the predicted probability surface plus the training data."""
        xx, yy, probs = self._probabilityGrid()

        plt.figure(figsize=(14,12))
        ax = plt.axes(projection='3d')
        ax.contour3D(xx, yy, probs, 50, cmap='binary')

        # No z values are passed, so the points use scatter3D's default height.
        ax.scatter3D(self.X_train[:, 0], self.X_train[:, 1],
                   c=self.y_train[:], s=50,
                   cmap="RdBu", vmin=-.2, vmax=1.2,
                   edgecolor="white", linewidth=1)

        ax.set_xlabel('x1')
        ax.set_ylabel('x2')
        ax.set_zlabel('probs')
        ax.set_title('3D contour')
        plt.show()
            
         
        
        

In [182]:
# Load the two worksheets of the lab workbook into DataFrames.
train_df = pd.read_excel(io='Lab3_data.xls', sheet_name='2004--2005 Data')
test_df = pd.read_excel(io='Lab3_data.xls', sheet_name='2004--2007 Data')

In [183]:
# np.asarray returns an ndarray unchanged, so re-running this cell is safe;
# a second `.values` access would fail because ndarrays have no such attribute.
train_df = np.asarray(train_df)

In [184]:
# Column 0 is taken as the label vector; all remaining columns are the features.
y_train = train_df[:, 0]
X_train = train_df[:, 1:]

In [185]:
# tolerance is passed as 0.0; learning rate and iteration count keep their defaults.
lr = Logistic_Regression(X=X_train, y=y_train, tolerance=0.0)

In [186]:
# Train on the stored training data; prints the learned weights and the
# accuracy / precision / recall measured on that same training data.
lr.fit()

100%|██████████| 5000/5000 [00:00<00:00, 36484.84it/s]

[-0.47599252  0.23073142]
accuracy was:0.9565217391304348
precision was:0.9722222222222222
recall was:0.9722222222222222



