In [32]:
# Data from https://archive.ics.uci.edu/ml/datasets/Pima+Indians+Diabetes

   #1. Number of times pregnant
   #2. Plasma glucose concentration a 2 hours in an oral glucose tolerance test
   #3. Diastolic blood pressure (mm Hg)
   #4. Triceps skin fold thickness (mm)
   #5. 2-Hour serum insulin (mu U/ml)
   #6. Body mass index (weight in kg/(height in m)^2)
   #7. Diabetes pedigree function
   #8. Age (years)
   #9. Class variable (0 or 1)

In [33]:
import pandas as pd 
import numpy as np
from math import exp

In [34]:
pima_indians_file = "C:/Anaconda3/Notebooks/logistic regression/pima-indians-diabetes.data"

In [35]:
data = pd.read_csv(pima_indians_file,sep=",",names=['times_pregnant','plasma_glucose','blood_pressure','skin_fold_thickness','insulin','body_mass','diabetes_pedigree','age','yes_no'],header=0)

In [36]:
data.shape

(767, 9)

In [116]:
data.tail(5)

Unnamed: 0,times_pregnant,plasma_glucose,blood_pressure,skin_fold_thickness,insulin,body_mass,diabetes_pedigree,age,yes_no
762,10,101,76,48,180,32.9,0.171,63,0
763,2,122,70,27,0,36.8,0.34,27,0
764,5,121,72,23,112,26.2,0.245,30,0
765,1,126,60,0,0,30.1,0.349,47,1
766,1,93,70,31,0,30.4,0.315,23,0


In [38]:
X=data.loc[0:,['times_pregnant','plasma_glucose','blood_pressure','skin_fold_thickness','insulin','body_mass','diabetes_pedigree','age']]

In [39]:
Y=data.loc[0:,['yes_no']]

In [43]:
#Normalize the features
for i in X.columns:
    min = X[i].min()
    max = X[i].max()
    X[i]=(X[i]-min)/(max-min)

In [44]:
X.head(5)

Unnamed: 0,times_pregnant,plasma_glucose,blood_pressure,skin_fold_thickness,insulin,body_mass,diabetes_pedigree,age
0,0.058824,0.427136,0.540984,0.292929,0.0,0.396423,0.116567,0.166667
1,0.470588,0.919598,0.52459,0.0,0.0,0.347243,0.253629,0.183333
2,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,0.0
3,0.0,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,0.2
4,0.294118,0.582915,0.606557,0.0,0.0,0.38152,0.052519,0.15


In [45]:
#Get the matrix from dataframe using numpy
features = np.matrix(X)
labels = np.matrix(Y)

In [46]:
bias = np.ones((features.shape[0],1))
features =  np.append(bias, features,axis=1)
features[0]

matrix([[ 1.        ,  0.05882353,  0.42713568,  0.54098361,  0.29292929,
          0.        ,  0.39642325,  0.11656704,  0.16666667]])

In [67]:
def sigmoid(featureValues, theta):
    #print(featureValues)
    z=np.dot(featureValues,theta.T)
    #print(z)
    return 1/(1+np.exp(-z))

In [71]:
#Cost function which gives rmse - root mean squared error
def costfunction(featureValues, actualValues, theta):
    m = featureValues.shape[0] # number of observations
    A = sigmoid(featureValues, theta)
    cost = -1/m * np.sum(actualValues.T * np.log(A) + (1-actualValues).T * (np.log(1-A)))
    return cost

In [72]:
def gradientDescent(featureValues, actualValues, learning_rate, num_iterations):
    num_features = featureValues.shape[1] #size of theta is determined by number of features
    theta = np.matrix([0.0 for i in range(num_features)]) # initialize theta values by 0.0
    
    cost = [0.0 for i in range(num_iterations)]
    
    for it in range(num_iterations):
        predictedValues = sigmoid(featureValues, theta)
        error = np.repeat((predictedValues-actualValues), num_features, axis=1)
       
        error_derivative = np.sum(np.multiply(featureValues,error),axis=0)
        theta = theta - (learning_rate*(1/len(actualValues)))*error_derivative
        
        cost[it] = costfunction(featureValues, actualValues, theta)
    return theta, cost

In [77]:
learning_rate=0.1
num_iterations=50000
theta, cost = gradientDescent(features, labels, learning_rate, num_iterations)

In [78]:
theta

matrix([[-7.98309118,  2.09077891,  6.96518857, -1.61063466,  0.03845261,
         -0.97221919,  5.98639637,  2.20060121,  0.87693448]])

In [79]:
cost[0],cost[num_iterations-1]

(0.68952524424367168, 0.47118003505001571)

In [107]:
#Test
test_index=1
features[test_index]

predictedValue = sigmoid(theta, features[test_index][0])

actualValue = labels[test_index]
predictedValue.shape, actualValue.shape
predicted = predictedValue[0,0]
print(predicated)
#np.subtract(actualValue,predicatedValue)

0.795547890969


In [114]:
def accuracy():
    matched = list()
    for i in range(features.shape[0]):
        predictedValue = sigmoid(theta, features[i][0])
        predicted = predictedValue[0,0]
        if predicted > 0.5:
            predicted = 1
        else:
            predicted = 0
        actualValue = labels[i]
        actual = actualValue[0,0]
        if predicted == actual:
            matched.append(predicted)
    return (len(matched) /features.shape[0])*100       

In [115]:
a=accuracy()
print(a)

78.22685788787483
