# Diabetes Prediction

In [24]:
!pip install --upgrade scikit-learn



In [25]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

PIMA Diabetes Dataset

In [26]:
# Load the dataset into a DataFrame.
# NOTE(review): absolute path - presumably the Google Colab upload location;
# adjust it when running the notebook elsewhere.
data = pd.read_csv('/content/diabetes.csv')

In [27]:
# Preview the first rows to check the column names and value formats.
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [28]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(768, 9)

In [29]:
# Summary statistics for every numeric column.
# NOTE(review): the recorded output shows a minimum of 0 for Glucose,
# BloodPressure, SkinThickness, Insulin and BMI - presumably placeholders for
# missing measurements; confirm against the dataset description.
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [30]:
# Count missing (NaN/None) entries per column.
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [31]:
# Mean of every feature within each Outcome class, to compare the two groups.
data.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [32]:
# Class balance: number of rows for each Outcome value.
data['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

0 --> Non-Diabetic

1 --> Diabetic

In [33]:
# Separate the feature matrix (every column except the label) from the label
# vector. 'Outcome' is the target: 0 = non-diabetic, 1 = diabetic.
X = data.drop(columns='Outcome')
Y = data['Outcome']

In [34]:
# Convert features and labels to plain NumPy arrays.
# np.asarray is used instead of .to_numpy() so the cell can be re-run safely:
# on a second run X and Y are already ndarrays, which have no .to_numpy()
# method, while np.asarray simply returns them unchanged.
X = np.asarray(X)
Y = np.asarray(Y)

# Hold out 20% of the rows for testing. stratify=Y keeps the class ratio the
# same in both splits; random_state fixes the shuffle so the split is
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=2
)
print(X.shape, X_test.shape, X_train.shape)

(768, 8) (154, 8) (614, 8)


In [35]:
# Standardize the features. The scaler is fitted on the training split only,
# and the statistics it learned are then applied to both splits, so nothing
# about the test rows influences the scaling.
# NOTE(review): this cell overwrites X_train/X_test in place; running it twice
# would fit a fresh scaler on already-scaled data. Re-run the split cell first.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [36]:
# Show the first five standardized training rows as a sanity check.
print(X_train[:5])

[[-1.13796489 -0.07971099 -3.5556072  -1.29075209 -0.70188945  0.02825037
  -0.98159708 -0.7885233 ]
 [ 0.64067858 -0.52091877  0.02549599  0.70066503 -0.12789497 -0.17184452
  -1.03823795  0.31879426]
 [-0.84152431  2.12632792 -0.48609018  0.14057897  6.43927725 -0.25938604
  -0.21545477  2.19271628]
 [-0.54508373 -0.52091877  0.53708216  1.50967824  0.91035977  0.19082747
   0.71762906 -0.36263193]
 [-1.13796489  1.84269435 -0.17913848  1.13628753 -0.70188945  1.22881971
   4.24128748 -0.70334503]]


In [37]:
# Support vector classifier with a linear kernel.
classifier = svm.SVC(kernel='linear')

In [38]:
# Train the classifier on the standardized training features and their labels.
classifier.fit(X_train, Y_train)

In [39]:
def accuracy_precision_recall_f1(true_labels, pred_labels):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Every score is computed first; the results are then printed as
    percentages rounded to two decimal places, one metric per line.

    Args:
        true_labels: Ground-truth class labels.
        pred_labels: Predicted class labels, aligned with ``true_labels``.

    Returns:
        None. The scores are written to standard output.
    """
    scorers = (
        ('Accuracy score =', accuracy_score),
        ('Precision score =', precision_score),
        ('Recall score =', recall_score),
        ('F1 score =', f1_score),
    )
    # Evaluate all metrics before printing anything, so a failure in any one
    # of them leaves no partial report behind.
    results = [
        (caption, scorer(true_labels, pred_labels))
        for caption, scorer in scorers
    ]
    for caption, value in results:
        print(caption, round(value * 100, 2), '%')

In [40]:
# Predict on both splits up front, then report the same metrics for each so
# training and testing performance can be compared side by side.
X_train_pred = classifier.predict(X_train)
X_test_pred = classifier.predict(X_test)

for heading, truth, preds in (
    ('For training Data:', Y_train, X_train_pred),
    ('For testing Data:', Y_test, X_test_pred),
):
    print(heading)
    accuracy_precision_recall_f1(truth, preds)
    print('---------------------------------------')

For training Data:
Accuracy score = 78.66 %
Precision score = 74.56 %
Recall score = 58.88 %
F1 score = 65.8 %
---------------------------------------
For testing Data:
Accuracy score = 77.27 %
Precision score = 75.68 %
Recall score = 51.85 %
F1 score = 61.54 %
---------------------------------------


Making a predictive system

In [41]:
# Read one patient's measurements as whitespace-separated numbers, in the same
# column order as the training features: Pregnancies, Glucose, BloodPressure,
# SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age.
raw_text = input('Enter the data separated by spaces : ')
feature_values = [float(token) for token in raw_text.split()]

# The scaler and classifier were fitted on X_train, so the sample must have
# exactly as many values as X_train has columns. Checking here gives a clear
# message instead of an error from deep inside scikit-learn.
expected_count = X_train.shape[1]
if len(feature_values) != expected_count:
    raise ValueError(
        f'Expected {expected_count} values but got {len(feature_values)}.'
    )

# Estimators expect 2-D input (one row per sample), hence the reshape. The
# sample is scaled with the statistics learned from the training split.
input_data = np.asarray(feature_values).reshape(1, -1)
input_data = scaler.transform(input_data)
print(input_data)

prediction = classifier.predict(input_data)
print(prediction)
# predict() returns one label per input row; index the single label explicitly
# rather than relying on the truthiness of a one-element array comparison.
print('The person is ' + ('Diabetic!' if prediction[0] == 1 else 'Non-Diabetic!'))

Enter the data separated by spaces : 6 148 72 35 0 33.6 0.627 50
[[ 0.64067858  0.83421941  0.12781322  0.88736039 -0.70188945  0.17832154
   0.46721679  1.42611182]]
[1]
The person is Diabetic!


# Saving the trained model

In [42]:
import pickle

In [43]:
# Serialize the trained classifier to disk.
# The `with` block closes the file handle deterministically, so the pickle is
# fully flushed to disk even if dump() raises part-way through.
# NOTE(review): only the classifier is saved. It was trained on features
# transformed by `scaler`, so whoever loads this file must apply the same
# scaling to new inputs - consider persisting `scaler` as well.
filename = 'diabetes_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)