#  Logistic regression using scikit-learn : diabetes dataset

## Import necessary libraries

In [153]:
import numpy
import pandas as pd
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

%matplotlib inline

from subprocess import check_output
print(check_output(["ls", "data"]).decode("utf8"))


hourly_wages.csv
pima-indians-diabetes-data.csv



## Load and Explore data

Data Description : https://www.kaggle.com/uciml/pima-indians-diabetes-database

In [102]:
data_loc = "data/pima-indians-diabetes-data.csv"
data = pd.read_csv(data_loc)

In [103]:
data.head()
# data.head(10)

Unnamed: 0,num_pregnant,plasma_glucose,blood_pressure,skin_thickness,insulin,bmi,pedigree,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [104]:
data.describe(include='all')

Unnamed: 0,num_pregnant,plasma_glucose,blood_pressure,skin_thickness,insulin,bmi,pedigree,age,diabetes
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [105]:
data.dtypes

num_pregnant        int64
plasma_glucose      int64
blood_pressure      int64
skin_thickness      int64
insulin             int64
bmi               float64
pedigree          float64
age                 int64
diabetes            int64
dtype: object

In [106]:
data['diabetes'].value_counts()

0    500
1    268
Name: diabetes, dtype: int64

## Missing value treatment

The following columns have an invalid zero minimum value:

* Plasma glucose concentration
* Diastolic blood pressure
* Triceps skinfold thickness
* 2-Hour serum insulin
* Body mass index

In [107]:
# Lets cross check
data.head(20)

Unnamed: 0,num_pregnant,plasma_glucose,blood_pressure,skin_thickness,insulin,bmi,pedigree,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [108]:
# Read the data with out headers
data = pd.read_csv("data/pima-indians-diabetes-data.csv", header=None, skiprows=1)
print((data[[1,2,3,4,5]] == 0.0).sum())

1      5
2     35
3    227
4    374
5     11
dtype: int64


In [109]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [110]:
# Mark zero values as missing or NaN
data[[1,2,3,4,5]] = data[[1,2,3,4,5]].replace(0, np.NaN)
# Count the number of NaN values in each column
print(data.isnull().sum())

0      0
1      5
2     35
3    227
4    374
5     11
6      0
7      0
8      0
dtype: int64


In [111]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [112]:
# Impute the values

In [113]:
# Fill missing values with mean column values
data.fillna(data.mean(), inplace=True)
# Count the number of NaN values in each column
print(data.isnull().sum())

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
dtype: int64


In [114]:
data.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148.0,72.0,35.0,155.548223,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.548223,26.6,0.351,31,0
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,5,116.0,74.0,29.15342,155.548223,25.6,0.201,30,0
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,10,115.0,72.405184,29.15342,155.548223,35.3,0.134,29,0
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,8,125.0,96.0,29.15342,155.548223,32.457464,0.232,54,1


## Train and test split

In [115]:
X = data.drop(columns=[8])
# X

In [116]:
Y = data[8]
# Y

In [117]:
(trainX, testX, trainY, testY) = train_test_split(X, Y, test_size=0.25, random_state=42)

## Build a simple model

In [118]:
lr = LogisticRegression()

In [142]:
lr.fit(trainX,trainY)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [120]:
print(f"Training score : {lr.score(trainX,trainY) * 100}")
print(f"Testing score : {lr.score(testX,testY) * 100}")

Training score : 76.73611111111111
Testing score : 76.04166666666666


In [149]:
from sklearn.metrics import classification_report,confusion_matrix

In [144]:
test_pred = lr.predict(testX)
test_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0])

In [147]:
print(classification_report(testY, test_pred))

              precision    recall  f1-score   support

           0       0.79      0.85      0.82       123
           1       0.69      0.59      0.64        69

   micro avg       0.76      0.76      0.76       192
   macro avg       0.74      0.72      0.73       192
weighted avg       0.76      0.76      0.76       192



In [150]:
print(confusion_matrix(testY, test_pred))

[[105  18]
 [ 28  41]]


## Cross Validation

In [121]:
# You will need the following dependencies for applying Cross-validation and evaluating the cross-validated score

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [122]:
# Build the k-fold cross-validator

kfold = KFold(n_splits=3, random_state=7)

In [123]:
result = cross_val_score(lr, X, Y, cv=kfold, scoring='accuracy')
print(result.mean())

0.7513020833333334




## Hyper parameter Tuning

### Grid Search

In [124]:
from sklearn.model_selection import GridSearchCV

In [125]:
dual=[True,False]
max_iter=[100,110,120,130,140]
C = [1.0,1.5,2.0,2.5]
param_grid = dict(dual=dual,max_iter=max_iter,C=C)

lr = LogisticRegression(penalty='l2')
grid = GridSearchCV(estimator=lr, param_grid=param_grid, cv = 3, n_jobs=-1)

start_time = time.time()
grid_result = grid.fit(X, Y)
# Summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
print("Execution time: " + str((time.time() - start_time)) + ' ms')

Best: 0.763021 using {'C': 2.0, 'dual': False, 'max_iter': 100}
Execution time: 2.247950792312622 ms




In [126]:
print(grid.best_estimator_.score(trainX, trainY))
print(grid.best_estimator_.score(testX, testY))

0.7708333333333334
0.75


### Random Search

In [127]:
from sklearn.model_selection import RandomizedSearchCV

random = RandomizedSearchCV(estimator=lr, param_distributions=param_grid, cv = 3, n_jobs=-1)

start_time = time.time()
random_result = random.fit(trainX, trainY)
# Summarize results
print("Best: %f using %s" % (random_result.best_score_, random_result.best_params_))
print("Execution time: " + str((time.time() - start_time)) + ' ms')

Best: 0.758681 using {'max_iter': 100, 'dual': False, 'C': 2.0}
Execution time: 0.23199987411499023 ms




In [128]:
print(grid.best_estimator_.score(trainX, trainY))
print(grid.best_estimator_.score(testX, testY))

0.7708333333333334
0.75


## Save and Load the model / Prediction

In [129]:
import pickle

In [133]:
Xnew = [[3, 78.0, 50.000000, 32.0000, 88.000000, 31.000000, 0.248,26]]
ynew = grid.best_estimator_.predict(Xnew)
ynew

array([0])

In [136]:
# save the model to disk
filename = 'logr_model'
pickle.dump(grid.best_estimator_, open(filename, 'wb'))

In [137]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))

In [138]:
ynew = loaded_model.predict(Xnew)
ynew

array([0])