In [None]:
## SOLIS NAVARRO LUIS FERNANDO
## @LSOLIS

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.optimize import fmin_tnc

In [29]:
def sigmoid(x):
    return (1/(1+ np.exp(-x)))

def net_input(theta,x):
    return np.dot(x,theta)

def probability(theta,x):
    return sigmoid(net_input(theta, x)) 

def cost_function(theta,x,y):
    m=x.shape[0]
    total_cost=-(1/m)*np.sum(y*np.log(probability(theta, x)) + (1-y)*np.log(1-probability(theta, x)))
    return total_cost

def gradient(theta, x, y):
    # Computes the gradient of the cost function at the point theta
    m = x.shape[0]
    return (1 / m) * np.dot(x.T, sigmoid(net_input(theta,   x)) - y)


def fit(x, y,theta):
    opt_weights = fmin_tnc(func=cost_function, x0=theta,
                  fprime=gradient,args=(x, y.flatten()))
    return opt_weights[0]

In [28]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

class LogisticRegression:
    def __init__(self, df, X_cols, y_col, iterations=10000, learning_rate=0.001):
        self.df = df
        self.X_cols = X_cols
        self.y_col = y_col
        self.iterations = iterations
        self.learning_rate = learning_rate
    
    def sigmoid(self,z):
        return 1/(1+np.exp(-z))
    
    def train(self):
        # We initialize our W and b as zeros
        w = np.zeros(len(self.X_cols)).reshape(len(self.X_cols),1)
        b = 0
        
        X = self.df[self.X_cols].to_numpy().T
        y = self.df[self.y_col].to_numpy().T
        m = self.df.shape[0] #Number of samples
        
        loss = [] #Keeping track of the cost function values
        
        for i in range(self.iterations):
            #Computes our predictions
            z = np.dot(w.T, X)+b
            pred = self.sigmoid(z)
            
            #Computes our cost function
            cost = (-1/m)*np.sum(np.dot(y,np.log(pred).T) + np.dot(1-y,np.log(1-pred).T ))
            loss.append(cost)
            
            
            #Computes the gradient 
            dw = (1/m)*np.dot(X,(pred-y).T)
            db = (1/m)*np.sum(pred-y, axis=1)
            
            #Updates the W and b
            w = w - self.learning_rate*dw
            b = b - self.learning_rate*db
        return {"W":w, "b": b, "loss": loss}

In [30]:
col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
diabetes_data = pd.read_csv("dataset/diabetes.csv", header=None, names=col_names)
diabetes_data.head()

lg = LogisticRegression(diabetes_data, ['pregnant', 'insulin', 'bmi', 'age','glucose','bp','pedigree'], ['label'])
model = lg.train()


In [31]:
model["W"]

array([[ 0.49544415],
       [ 0.0088088 ],
       [-0.0022681 ],
       [-0.04862071],
       [ 0.05689882],
       [-0.09014763],
       [ 0.07616258]])

In [32]:
data = [[6, 0, 33.6, 50, 148, 72, 0.627]] 
  
# Create the pandas DataFrame 
df = pd.DataFrame(data, columns = ['pregnant','insulin','bmi','age','glucose','bp','pedigree']) 

In [33]:
def predict(X, W, b):
    pred = sigmoid(np.dot(W.T,X)+b)
    pred = [1 if prob >= 0.5 else 0 for prob in pred[0]]
    return pred
X = df[['pregnant','insulin','bmi','age','glucose','bp','pedigree']].to_numpy().T
pred = predict(X, model['W'], model['b'])
print(np.array(diabetes_data.label==pred[0]).sum()/500)

0.536


In [None]:
#Implementation With sklearn Model

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
diabetes_data = pd.read_csv("dataset/diabetes.csv", header=None, names=col_names)
diabetes_data.head()

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [14]:
#SPLIT DATASET IN FEATURES AND TARGETS VARIABLE
feature_cols = ['pregnant', 'insulin', 'bmi', 'age','glucose','bp','pedigree']
X = diabetes_data[feature_cols] # Features
y = diabetes_data.label # Target variable

In [15]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)

In [16]:
from sklearn.linear_model import LogisticRegression

# instantiate the model (using the default parameters)
logreg = LogisticRegression(max_iter = 1000)

# fit the model with data
logreg.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
#predicting the output for out test set
y_pred=logreg.predict(X_test)
y_pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0], dtype=int64)

In [23]:
X_test;

In [19]:
# import the metrics class to create confusion matrics
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

array([[118,  12],
       [ 26,  36]], dtype=int64)

In [20]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8020833333333334


In [21]:
import pickle
import os
#Saving the model
if not os.path.exists('models'):
    os.makedirs('models')
    
MODEL_PATH = "models/logistic_reg.sav"
pickle.dump(logreg, open(MODEL_PATH, 'wb'))

In [22]:
# initialize list of lists 
data = [[6, 0, 33.6, 50, 148, 72, 0.627]] 
  
# Create the pandas DataFrame 
df = pd.DataFrame(data, columns = ['pregnant','insulin','bmi','age','glucose','bp','pedigree']) 

#Predict On new Data
new_pred = logreg.predict(df)
new_pred

array([1], dtype=int64)

In [24]:
Path_Model="models/logistic_reg.sav"

In [28]:
model=pickle.load(open(Path_Model,"rb"))
#model=LoadModel(Path_Model)

new_predict=model.predict(df)
print(new_predict)

[1]


In [34]:
# credits: https://medium.com/swlh/logistic-regression-simple-python-implementation-f3c2f8a8ee80