# Diabetes Database 

# Context 

This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old.

# Content 

The datasets consists of several medical predictor variables and one target variable, Outcome. Predictor variables includes the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.

# We setup with all the required packages and load the data set. 

In [41]:
# Python libraries
# Classic,data manipulation and linear algebra
import pandas as pd
import numpy as np

# Plots
%matplotlib inline
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.tools as tls
import plotly.figure_factory as ff
py.offline.init_notebook_mode(connected=True)
py.init_notebook_mode(connected=True)



# Data processing, metrics and modeling
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, confusion_matrix,  roc_curve, precision_recall_curve, accuracy_score, roc_auc_score
#import lightgbm as lgbm
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve,auc
from sklearn.model_selection import cross_val_predict
from yellowbrick.classifier import DiscriminationThreshold



#ignore warning messages 
import warnings
warnings.filterwarnings('ignore') 

In [42]:
df = pd.read_csv("diabetes.csv", sep=",")

df = df.reindex(
    np.random.permutation(df.index))

In [43]:
df.shape

(768, 9)

In [44]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
66,0,109,88,30,0,32.5,0.855,38,1
566,1,99,72,30,18,38.6,0.412,21,0
471,0,137,70,38,0,33.2,0.17,22,0
519,6,129,90,7,326,19.6,0.582,60,0
191,9,123,70,44,94,33.1,0.374,40,0


In [45]:
df.info(null_counts = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 768 entries, 66 to 213
Data columns (total 9 columns):
Pregnancies                 768 non-null int64
Glucose                     768 non-null int64
BloodPressure               768 non-null int64
SkinThickness               768 non-null int64
Insulin                     768 non-null int64
BMI                         768 non-null float64
DiabetesPedigreeFunction    768 non-null float64
Age                         768 non-null int64
Outcome                     768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 60.0 KB


In [46]:
missing_values_count = df.isnull().sum()

In [47]:
missing_values_count[0:10]

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [48]:
D = df[(df['Outcome'] != 0)]
H = df[(df['Outcome'] == 0)]

#------------COUNT-----------------------
def target_count():
    trace = go.Bar( x = df['Outcome'].value_counts().values.tolist(), 
                    y = ['healthy','diabetic' ], 
                    orientation = 'h', 
                    text=df['Outcome'].value_counts().values.tolist(), 
                    textfont=dict(size=15),
                    textposition = 'auto',
                    opacity = 0.8,marker=dict(
                    color=['lightskyblue', 'gold'],
                    line=dict(color='#000000',width=1.5)))

    layout = dict(title =  'Count of Outcome variable')

    fig = dict(data = [trace], layout=layout)
    py.iplot(fig)

#------------PERCENTAGE-------------------
def target_percent():
    trace = go.Pie(labels = ['healthy','diabetic'], values = df['Outcome'].value_counts(), 
                   textfont=dict(size=15), opacity = 0.8,
                   marker=dict(colors=['lightskyblue', 'gold'], 
                               line=dict(color='#000000', width=1.5)))


    layout = dict(title =  'Distribution of Outcome variable')

    fig = dict(data = [trace], layout=layout)
    py.iplot(fig)

# I wasn't able to resolve the issue with graph plotting. I have tried installing the extension for matplot lib in conda and tried I still couldn't get the display of the graph. Some sources mentioned to update the browser which didn't work as well. I have to still look into it.All the iplot weren't displaying in my notebook. I'm uploading my notebook with the issue as I couldn't resolve it in time. 

In [63]:
target_count()
target_percent()

In [64]:
df[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']] = df[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']].replace(0,np.NaN)

In [76]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
66,0,109.0,88.0,30.0,,32.5,0.855,38,1
566,1,99.0,72.0,30.0,18.0,38.6,0.412,21,0
471,0,137.0,70.0,38.0,,33.2,0.17,22,0
519,6,129.0,90.0,7.0,326.0,19.6,0.582,60,0
191,9,123.0,70.0,44.0,94.0,33.1,0.374,40,0


In [65]:
# Define missing plot to detect all missing values in dataset
def missing_plot(dataset, key) :
    null_feat = pd.DataFrame(len(dataset[key]) - dataset.isnull().sum(), columns = ['Count'])
    percentage_null = pd.DataFrame((len(dataset[key]) - (len(dataset[key]) - dataset.isnull().sum()))/len(dataset[key])*100, columns = ['Count'])
    percentage_null = percentage_null.round(2)

    trace = go.Bar(x = null_feat.index, y = null_feat['Count'] ,opacity = 0.8, text = percentage_null['Count'],  textposition = 'auto',marker=dict(color = '#7EC0EE',
            line=dict(color='#000000',width=1.5)))

    layout = dict(title =  "Missing Values (count & %)")

    fig = dict(data = [trace], layout=layout)
    py.iplot(fig)

In [66]:
# Plotting 
missing_plot(df, 'Outcome')

# Logistic Regression 

In [67]:
from sklearn.datasets import fetch_openml
import numpy as np
db = fetch_openml('diabetes', version=1)  # load dataset from https://openml.org/
db.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'details', 'categories', 'url'])

In [68]:
X, y = db["data"],db["target"]

In [69]:
X_train , X_test , y_train , y_test = X[:200], X[200:],y[:200],y[200:]

In [70]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

In [71]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_predict = lr.predict(X_test)

In [72]:
#print confusion matrix and accuracy score
lr_conf_matrix = confusion_matrix(y_test, lr_predict)
lr_acc_score = accuracy_score(y_test, lr_predict)
print(lr_conf_matrix)
print(lr_acc_score*100)

[[329  46]
 [ 85 108]]
76.93661971830986


# let's use SVC classifier and compare the accuracy 

In [73]:
from sklearn.svm import SVC

In [74]:
lin_svc = SVC()
lin_svc.fit(X_train, y_train)
lin_svc_predict=lin_svc.predict(X_test) 

In [75]:
lin_svc_conf_matrix = confusion_matrix(y_test, lin_svc_predict)
lin_svc_acc_score = accuracy_score(y_test, lin_svc_predict)
print(lin_svc_conf_matrix)
print(lin_svc_acc_score*100)

[[375   0]
 [193   0]]
66.02112676056338
