In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random
import csv

In [2]:
def read_csv(file_path):
    with open(file_path, 'r') as file:
        reader = csv.reader(file)
        data = list(reader)
        columns = data[0]
        data = data[1:]
        return data, columns

data, columns = read_csv('./diabetes.csv')
print(columns)
for row in data:
    print(row)

['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
['6', '148', '72', '35', '0', '33.6', '0.627', '50', '1']
['1', '85', '66', '29', '0', '26.6', '0.351', '31', '0']
['8', '183', '64', '0', '0', '23.3', '0.672', '32', '1']
['1', '89', '66', '23', '94', '28.1', '0.167', '21', '0']
['0', '137', '40', '35', '168', '43.1', '2.288', '33', '1']
['5', '116', '74', '0', '0', '25.6', '0.201', '30', '0']
['3', '78', '50', '32', '88', '31', '0.248', '26', '1']
['10', '115', '0', '0', '0', '35.3', '0.134', '29', '0']
['2', '197', '70', '45', '543', '30.5', '0.158', '53', '1']
['8', '125', '96', '0', '0', '0', '0.232', '54', '1']
['4', '110', '92', '0', '0', '37.6', '0.191', '30', '0']
['10', '168', '74', '0', '0', '38', '0.537', '34', '1']
['10', '139', '80', '0', '0', '27.1', '1.441', '57', '0']
['1', '189', '60', '23', '846', '30.1', '0.398', '59', '1']
['5', '166', '72', '19', '175', '25.8', '0.587', '51', '1']
['7', '100

In [3]:
def train_test_splitter(X, y, test_size = 0.2):
    indices = list(range(len(X)))
    random.shuffle(indices)
    
    test_set_size = int(len(X) * test_size)
    
    test_indices = indices[:test_set_size]
    train_indices = indices[test_set_size:]
    
    X_train = np.array([X[i] for i in train_indices])
    X_test = np.array([X[i] for i in test_indices])
    y_train = np.array([y[i] for i in train_indices])
    y_test = np.array([y[i] for i in test_indices])
    
    return X_test, y_train, X_train, y_test

In [24]:
X = [row[:-1] for row in data]  # All columns except the last one
y = [row[-1] for row in data]   # Only the last column

print("First few rows of X before conversion:", X[:5])
print("First few rows of y before conversion:", y[:5])

X = [[float(value) for value in row] for row in X]
y = [float(value) for value in y]

print("First few rows of X after conversion:", X[:5])
print("First few rows of y after conversion:", y[:5])

X_test, y_train, X_train, y_test = train_test_splitter(X,y,1/3)

print("X_test:", X_test)
print("y_test:", y_test)
print("X_train:", X_train)
np.set_printoptions(suppress=True)  # Suppress scientific notation for small numbers
print("X_train after suppression:",X_train)
print("y_train:", y_train)

First few rows of X before conversion: [['6', '148', '72', '35', '0', '33.6', '0.627', '50'], ['1', '85', '66', '29', '0', '26.6', '0.351', '31'], ['8', '183', '64', '0', '0', '23.3', '0.672', '32'], ['1', '89', '66', '23', '94', '28.1', '0.167', '21'], ['0', '137', '40', '35', '168', '43.1', '2.288', '33']]
First few rows of y before conversion: ['1', '0', '1', '0', '1']
First few rows of X after conversion: [[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0], [1.0, 85.0, 66.0, 29.0, 0.0, 26.6, 0.351, 31.0], [8.0, 183.0, 64.0, 0.0, 0.0, 23.3, 0.672, 32.0], [1.0, 89.0, 66.0, 23.0, 94.0, 28.1, 0.167, 21.0], [0.0, 137.0, 40.0, 35.0, 168.0, 43.1, 2.288, 33.0]]
First few rows of y after conversion: [1.0, 0.0, 1.0, 0.0, 1.0]
X_test: [[  1.     89.     66.     23.     94.     28.1     0.167  21.   ]
 [  1.    103.     30.     38.     83.     43.3     0.183  33.   ]
 [  6.    144.     72.     27.    228.     33.9     0.255  40.   ]
 [ 10.    125.     70.     26.    115.     31.1     0.205  41. 

In [25]:
#Logistic Regression
class Logistic_Regression():
    def __init__(self,learning_rate,iterations):
        self.learning_rate = learning_rate
        self.iterations = iterations
        
    #Function for model training
    def fit(self, X, Y):
        # no_of_training_examples, no_of_features
        self.m, self.n = X.shape
        #weight initialization
        self.W = np.zeros(self.n)
        self.b = 0
        self.X = X
        self.Y = Y
        
        #gradient descent learning
        for i in range(self.iterations):
            self.update_weights()
        return self

    #Helper functions to update weights in gradient descent
    
    def update_weights(self):
        A = 1/(1 + np.exp(-(self.X.dot(self.W) + self.b)))
        
        #claculate gradients
        # slf.Y.T => transpose of Y
        #self.m => no. of training examples
        tmp = (A - self.Y.T) #diff between predicted prob. A and actual labels
        tmp = np.reshape(tmp,self.m) # reshape tmp to shape determined by self.m
        dW = np.dot(self.X.T, tmp)/self.m # gradient of weights by dot prod b/w transpose of X and tmp
        db = np.sum(tmp) / self.m # gradient of bias (db) by taking the sum of all elements in tmp 
                                  # and then dividing by self.m, the number of training examples
        
        self.W = self.W - self.learning_rate*dW
        self.b = self.b - self.learning_rate*db
        
        return self
    
    def predict(self, X):
        Z = 1/(1 + np.exp(- (X.dot(self.W) + self.b)))
        Y = np.where(Z>0.5,1,0)
        return Y

In [26]:
model = Logistic_Regression(learning_rate=0.01, iterations=1000)

In [27]:
model.fit(X_train, y_train)
Y_pred = model.predict(X_test)
correctly_classified = 0

count = 0
for count in range(np.size(Y_pred)):

    if y_test[count] == Y_pred[count]:
        correctly_classified += 1

    count += 1

print( "Accuracy on test set by our model       :  ", (  
  correctly_classified / count ) * 100 )

Accuracy on test set by our model       :   61.111111111111114
