In [1]:
# Dependencies

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

In [3]:
# Load the data

data = pd.read_csv("../data/diabetes.csv")

In [4]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [8]:
print((data[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']] == 0).sum())

Glucose            5
BloodPressure     35
SkinThickness    227
Insulin          374
BMI               11
dtype: int64


In [11]:
# Mark zero values as missing or NaN
#data[[1,2,3,4,5]] = data[[1,2,3,4,5]].replace(0, np.NaN)
data[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']] = \
    data[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']].replace(0, np.NaN)
# Count the number of NaN values in each column
print(data.isnull().sum())

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64


In [12]:
# Confirm that the zeroes are now NaNs
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [13]:
# Fill missing values with mean column values
data.fillna(data.mean(), inplace=True)
# Count the number of NaN values in each column
print(data.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [15]:
# Split dataset into inputs and outputs
values = data.values
X = values[:,0:8]
y = values[:,8]

In [25]:
# Initiate the LR model with random hyperparameters
lr = LogisticRegression(penalty='l2',dual=False,max_iter=400)

In [26]:
# Pass data to the LR model
lr.fit(X,y)

LogisticRegression(max_iter=400)

In [27]:
lr.score(X,y)

0.7747395833333334

In [30]:
# Cross-validation
from sklearn.model_selection import cross_val_score
result = cross_val_score(lr, X, y, cv=3, scoring='accuracy')
print(result.mean())

0.7734375


In [43]:
# Let's define the grid values of the hyperparameters used above.

dual=[True, False]
max_iter=[100,110,120,130,140]
#max_iter=[400, 450, 500, 550]
param_grid = dict(dual=dual, max_iter=max_iter)

In [44]:
import time
from sklearn.model_selection import GridSearchCV

lr = LogisticRegression(penalty='l2') # leave out penalty
grid = GridSearchCV(estimator=lr, param_grid=param_grid, cv=3, n_jobs=-1)

start_time = time.time()
grid_result = grid.fit(X, y)
# Summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
print("Execution time: " + str((time.time() - start_time)) + ' s')

Best: 0.773438 using {'dual': False, 'max_iter': 140}
Execution time: 0.6874678134918213 s


 0.76171875 0.765625   0.77213542 0.7734375 ]


In [48]:
dual=[True,False]
max_iter=[400,410,420,430,440]
C = [1.0,1.5,2.0,2.5]
param_grid = dict(dual=dual,max_iter=max_iter,C=C)

lr = LogisticRegression(penalty='l2')
grid = GridSearchCV(estimator=lr, param_grid=param_grid, cv = 3, n_jobs=-1)

start_time = time.time()
grid_result = grid.fit(X, y)
# Summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
print("Execution time: " + str((time.time() - start_time)) + ' s')

Best: 0.773438 using {'C': 1.0, 'dual': False, 'max_iter': 400}
Execution time: 2.7554914951324463 s


 0.7734375  0.7734375  0.7734375  0.7734375         nan        nan
        nan        nan        nan 0.77213542 0.77213542 0.77213542
 0.77213542 0.77213542        nan        nan        nan        nan
        nan 0.77213542 0.77213542 0.77213542 0.77213542 0.77213542
        nan        nan        nan        nan        nan 0.77213542
 0.77213542 0.77213542 0.77213542 0.77213542]


In [49]:
from sklearn.model_selection import RandomizedSearchCV
random = RandomizedSearchCV(estimator=lr, param_distributions=param_grid, cv = 3, n_jobs=-1)

start_time = time.time()
random_result = random.fit(X, y)
# Summarize results
print("Best: %f using %s" % (random_result.best_score_, random_result.best_params_))
print("Execution time: " + str((time.time() - start_time)) + ' s')

Best: 0.773438 using {'max_iter': 420, 'dual': False, 'C': 1.0}
Execution time: 0.7826457023620605 s


        nan 0.77213542        nan        nan]
