# Diabetes Prediction using ML

## Importing important libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC                                  #importing support vector regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix , classification_report
from sklearn.pipeline import make_pipeline
import pickle
%matplotlib inline

# Data collection and analysis

## Pima dataset

In [2]:
db_df = pd.read_csv('https://raw.githubusercontent.com/1Abneesh/DiagnosticAI/master/Datasets/diabetes.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'diabetes.csv'

In [None]:
#Finding the number of Number of rows and colmns of dataset
db_df.shape

In [None]:
#Looking the top 5 data of dataset
db_df.head()

In [None]:
#Getting the statistical measure of data
db_df.describe()

In [None]:
data = db_df
fig, ax = plt.subplots(4, 2, figsize = (20, 15))
plt.suptitle('Distribution of Numerical features based on target variable', fontsize = 25, color = 'teal')
sn.histplot(x = data['Pregnancies'], hue= data['Outcome'], kde= True, ax= ax[0,0], palette = 'ocean')
ax[0,0].set(xlabel = 'Pregnancies')
sn.histplot(x = data['Glucose'], hue= data['Outcome'], kde= True, ax= ax[0,1], palette = 'twilight')
ax[0,1].set(xlabel = 'Glucose')
sn.histplot(x = data['BloodPressure'], hue= data['Outcome'], kde= True, ax= ax[1,0], palette = 'viridis')
ax[1,0].set(xlabel = 'Blood Pressure')

sn.histplot(x = data['SkinThickness'], hue= data['Outcome'], kde= True, ax= ax[1,1], palette = 'Pastel2_r')
ax[1,1].set(xlabel = 'Skin Thickness')
sn.histplot(x = data['Insulin'], hue= data['Outcome'], kde= True, ax= ax[2,0], palette = 'gnuplot')
ax[2,0].set(xlabel = 'Insulin')
sn.histplot(x = data['BMI'], hue= data['Outcome'], kde= True, ax= ax[2,1], palette = 'twilight_shifted')
ax[2,1].set(xlabel = 'BMI')

sn.histplot(x = data['DiabetesPedigreeFunction'], hue= data['Outcome'], kde= True, ax= ax[3,0], palette = 'RdPu_r')
ax[3,0].set(xlabel = 'Diabetes Pedigree Function')
sn.histplot(x = data['Age'], hue= data['Outcome'], kde= True, ax= ax[3,1], palette = 'mako')
ax[3,1].set(xlabel = 'Age')


plt.show()

In [None]:
#cheching for empty value in dataset
db_df.isnull().sum()

In [None]:
#plotting a heatmap to check for null value
plt.figure(figsize=(15,15))
sn.heatmap(db_df.isnull()) #since we get heatmap of uniform colour so no data is missing

In [None]:
#checking data in dataset for true or false data
db_df_shpe = db_df['Outcome'].value_counts()
print('The total data not having diabetes:-{}\nThe total data having diabetes:-{}'.format(db_df_shpe[0],db_df_shpe[1]))

In [None]:
sn.countplot(x = data['Outcome'], palette= 'winter')
plt.xlabel('Outcome')

In [None]:
#Getting more analysis of our data
db_df.groupby('Outcome').mean()

In [None]:
#seprating the data and labels
X = db_df.drop(['Outcome'],axis=1)
y = db_df['Outcome']

In [None]:
X

# Creating a correlational matrix

In [None]:
plt.figure(figsize=(15,15))
heatmap = sn.heatmap(db_df.corr(),annot=True,cmap="YlGnBu");
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12);

## Data Standardization

In [None]:
scaler =StandardScaler()
X_standardised = scaler.fit_transform(X)

In [None]:
X_standardised

## Train test split

In [None]:
X_train ,X_test,y_train,y_test = train_test_split(X_standardised,y,test_size=0.2,random_state=12,stratify=y)

In [None]:
print('The original data shape is {}. Test data shape {} and train data shape is {}'.format(X.shape,X_train.shape,X_test.shape))

## Selecting the best possible model with hyperperameters between logestic_regression and support vector classifier.

In [None]:
model_params = {
    'logestic_regression' :{
        'model' : LogisticRegression(),
         'params' :{
             'penalty':['l1', 'l2', 'elasticnet', None],
             'C':[-7,1e-2,0,1,2,3,4,5,6,7,8,9,10,20,30,40,50],
             'max_iter':[10,50,100,200,300,500],
             'tol':[1e-5,1e-4,1e-6,1e-8]
      }
  },
    'SVC':{
     'model' : SVC(),
     'params':{
         'gamma':['auto','scale'],
         'C':[-7,1e-2,0,1,2,3,4,5,6,7,8,9,10,20,30,40,50],
         'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
         'coef0':[0.0,0.5,0.7,0.9,1.0,2.0]  
     }
   }
}

In [None]:
scores = []
for model_name,mp in model_params.items():
    clf = GridSearchCV(mp['model'],mp['params'],cv =5,return_train_score=False)
    clf.fit(X_train,y_train)
    scores.append({
        'model':model_name,
        'best_score':clf.best_score_,
        'best_params':clf.best_params_,
    })

In [None]:
t_df = pd.DataFrame(scores,columns=['model','best_score','best_params'])

In [None]:
print(t_df)

In [None]:
clf.best_score_

# Therefore on comparing the perfomance of both the model we conlude that svc work better

### Model evalution

#### Accuracy score

In [3]:
y_test_predict = clf.predict(X_test)
y_train_predict = clf.predict(X_train)
res1= accuracy_score(y_test_predict,y_test)
res2 = accuracy_score(y_train_predict,y_train)

NameError: name 'clf' is not defined

In [4]:
print('Accuracy score on test is {} and train data for our model is {}'.format(res1,res2))

NameError: name 'res1' is not defined

In [5]:
#creating the confusion matrix for train data checking the accuracy of our model
cm= confusion_matrix(y_true = y_train,y_pred=y_train_predict)

plt.figure(figsize=(10,7))
sn.heatmap(cm,annot=True,fmt='d')
plt.xlabel('Predected')
plt.ylabel('Truth')

NameError: name 'y_train' is not defined

## Making a predictive system

In [6]:
input_data = (5,187,76,27,207,43.6,1.034,53)

# changing the input_data to numpy array
input_data = np.asarray(input_data)


# reshape the array as we are predicting for one instance
input_data = input_data.reshape(1,-1)

#standarised the data
input_data_std = scaler.transform(input_data)
print(input_data_std)

prediction = clf.predict(input_data_std)
if prediction[0] == 1:
    print('The patient has Diabetes')
else:
    print('The patient has not Diabatese')


NameError: name 'scaler' is not defined

## Saving the model

In [28]:
#Saving the scaler and model
filename = 'diabetes_model.sav'
pickle.dump(clf, open(filename, 'wb'))
scalerfile = 'scaler.sav'
pickle.dump(scaler, open(scalerfile, 'wb'))

In [29]:
# loading the saved model
load_model = pickle.load(open('diabetes_model.sav', 'rb'))
load_scaler = pickle.load(open('scaler.sav','rb'))

In [30]:
input_data = (5,104,74,0,0,28.8,0.153,48)

# changing the input_data to numpy array
input_data = np.asarray(input_data)


# reshape the array as we are predicting for one instance
input_data = input_data.reshape(1,-1)

#standarised the data
print('before input data',input_data)
input_data_std = load_scaler.transform(input_data)
print('after std input data',input_data_std)
print()
prediction = clf.predict(input_data_std)
if prediction[0] == 1:
    print('The patient has Diabetes')
else:
    print('The patient has not Diabatese')

before input data [[  5.    104.     74.      0.      0.     28.8     0.153  48.   ]]
after std input data [[ 0.3429808  -0.5287506   0.25303625 -1.28821221 -0.69289057 -0.40519961
  -0.96304428  1.2558199 ]]

The patient has not Diabatese




In [31]:
for col in db_df:
    print(col)

Pregnancies
Glucose
BloodPressure
SkinThickness
Insulin
BMI
DiabetesPedigreeFunction
Age
Outcome
