In [1]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("diabetes.csv")
df.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [49]:
# Do with Logistic Regression
# Function
import copy
import math

def sigmoid(z):
    return 1/(1+np.exp(-z))
def compute_cost(x,y,w,b):
    m = x.shape[0]
    cost = 0.0
    for i in range(m):
        z_i = np.dot(x[i], w) + b
        f_wb_i = sigmoid(z_i)
        cost += -y[i]*np.log(f_wb_i) - (1-y[i])*np.log(1-f_wb_i)
    
    cost = cost/m
    return cost

def compute_gradient_descent(x,y,w,b):
    m,n = x.shape
    dj_w = np.zeros((n,))
    dj_b = 0
    for i in range(m):
        f_wb_i = sigmoid(np.dot(x[i],w) + b)
        err_i = f_wb_i - y[i]
        for j in range(n):
            dj_w[j] = dj_w[j] + err_i* x[i,j]
        dj_b = dj_b + err_i
    dj_w /= m
    dj_b /= m
    
    return dj_w, dj_b

def gradient_descent(x,y,w_in,b_in,alpha,epochs):
    j_history = []
    w = copy.deepcopy(w_in)
    b = b_in
    
    initial_cost = compute_cost(x, y, w, b)
    j_history.append(initial_cost)
    
    for i in range(epochs):
        dj_w, dj_b = compute_gradient_descent(x,y,w,b)
        
        w = w - alpha * dj_w
        b = b - alpha * dj_b
        
        if i<100000:
            j_history.append( compute_cost(x,y,w,b) )
        if i % math.ceil(epochs/10) == 0:
            print(f"Iteration {i:4d}: Cost {j_history[-1]}")
    return w, b, j_history

In [50]:
x = df.drop('Outcome', axis=1)
y = df['Outcome']
x_train = x.loc[0:600,:].values
x_test = x.loc[600:,:].values
y_train = y.loc[0:600].values
y_test = y.loc[600:].values
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((601, 8), (601,), (168, 8), (168,))

In [82]:
w_init = np.zeros_like(x_train[0])
b_init = 0.
alpha = 0.0001
epochs = 3000
w,b,j_history = gradient_descent(x_train, y_train, w_init, b_init, alpha, epochs)
w,b

Iteration    0: Cost 0.674443019397975
Iteration  300: Cost 0.6218515970864751
Iteration  600: Cost 0.6202832308762841
Iteration  900: Cost 0.6191592579035591
Iteration 1200: Cost 0.6181947315804824
Iteration 1500: Cost 0.6173403545717032
Iteration 1800: Cost 0.6165738702602932
Iteration 2100: Cost 0.6158817179143111
Iteration 2400: Cost 0.6152542825955406
Iteration 2700: Cost 0.6146840920716914


(array([ 0.04724783,  0.01111487, -0.02900922, -0.00335571,  0.00117795,
         0.007211  ,  0.00169261, -0.00792097]),
 -0.011581725329604036)

In [66]:
def predict(x,w,b):
    m = x.shape[0]
    pred = []
    for i in range(m):
        z_i = np.dot(x[i], w) + b
        # therehold
        if sigmoid(z_i) <= 0.5:
            pred.append(0)
        elif sigmoid(z_i) > 0.5:
            pred.append(1)
    return np.array(pred)
y_pred = predict(x_test, w,b)
y_pred

array([0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0])

In [68]:
pd.DataFrame({'Actual Values: ' : y_test, 'Predicted Values: ': y_pred})

Unnamed: 0,Actual Values:,Predicted Values:
0,0,0
1,0,1
2,0,0
3,1,0
4,1,1
...,...,...
163,0,0
164,0,0
165,0,0
166,1,0


In [77]:
def accuracy(y_true, y_pred):
    correct_predictions = np.sum(y_pred == y_true)
    accuracy = correct_predictions / len(y_true)
    return "{:.2f}%".format(accuracy * 100)
accuracy(y_test, y_pred)
# Accuracy is good let's do it with sklearn

'67.26%'

In [79]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Create a logistic regression model
model = LogisticRegression(max_iter=3000)  # Increase max_iter for convergence

# Train the model on the training data
model.fit(x_train, y_train)

# Make predictions on the testing data
y_pred = model.predict(x_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.7797619047619048
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.90      0.84       108
           1       0.76      0.57      0.65        60

    accuracy                           0.78       168
   macro avg       0.77      0.73      0.74       168
weighted avg       0.78      0.78      0.77       168



In [80]:
print(f"Computer w,b: {model.coef_}, {model.intercept_}")
print(f"My w,b: {w}, {b}")

Computer w,b: [[ 1.18478236e-01  3.22860968e-02 -1.02285718e-02 -2.12132609e-03
  -8.36661468e-04  1.01653043e-01  9.24124920e-01  8.70083948e-03]], [-8.38876762]
My w,b: [ 0.04724783  0.01111487 -0.02900922 -0.00335571  0.00117795  0.007211
  0.00169261 -0.00792097], -0.011581725329604036
