In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

In [2]:
# Read the raw records; the relative path is resolved against the working directory.
diabetes_dataset = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [3]:
# Preview the first five rows of the freshly loaded frame.
diabetes_dataset.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# (rows, columns) of the frame before any cleaning is applied.
diabetes_dataset.shape

(768, 9)

In [5]:
# Keep only rows whose BMI, blood pressure and glucose are at least 10 and
# whose insulin is at least 13.
# NOTE(review): presumably these floors weed out zero/placeholder readings —
# confirm the intended thresholds.
rows_to_keep = (
    (diabetes_dataset['BMI'] >= 10)
    & (diabetes_dataset['BloodPressure'] >= 10)
    & (diabetes_dataset['Glucose'] >= 10)
    & (diabetes_dataset['Insulin'] >= 13)
)

# Columns that are not used as model features further down.
unused_columns = ['SkinThickness', 'BloodPressure', 'Pregnancies', 'Age']

# This rebinds diabetes_dataset, so re-running the cell without reloading the
# CSV fails with a KeyError on the columns that were already dropped.
diabetes_dataset = diabetes_dataset.loc[rows_to_keep].drop(columns=unused_columns)


In [6]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the cleaned frame.
diabetes_dataset.describe()

Unnamed: 0,Glucose,Insulin,BMI,DiabetesPedigreeFunction,Outcome
count,392.0,392.0,392.0,392.0,392.0
mean,122.627551,156.056122,33.086224,0.523046,0.331633
std,30.860781,118.84169,7.027659,0.345488,0.471401
min,56.0,14.0,18.2,0.085,0.0
25%,99.0,76.75,28.4,0.26975,0.0
50%,119.0,125.5,33.2,0.4495,0.0
75%,143.0,190.0,37.1,0.687,1.0
max,198.0,846.0,67.1,2.42,1.0


In [7]:
# Class balance: number of remaining rows for each Outcome value.
diabetes_dataset['Outcome'].value_counts()

0    262
1    130
Name: Outcome, dtype: int64

In [8]:
# Average of every remaining feature, computed separately for each Outcome value.
diabetes_dataset.groupby(by='Outcome').mean()

Unnamed: 0_level_0,Glucose,Insulin,BMI,DiabetesPedigreeFunction
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,111.431298,130.854962,31.750763,0.472168
1,145.192308,206.846154,35.777692,0.625585


In [9]:
# Split the cleaned frame into target labels (Y) and feature columns (X).
Y = diabetes_dataset['Outcome']
X = diabetes_dataset.drop(columns=['Outcome'])

In [10]:
# Scaler used to standardize the feature columns; it is fitted in the next cell.
scaler = StandardScaler()

In [11]:
# Learn the scaling statistics of every feature column in X.
# NOTE(review): X is the whole cleaned dataset at this point — the train/test
# split only happens in a later cell — so the statistics also reflect rows
# that end up in the test set (preprocessing leakage).
scaler.fit(X)

In [12]:
# Apply the fitted scaling to X; the result is the standardized feature array.
standardized_data = scaler.transform(X)

In [13]:
# Quick visual check of the standardized values.
print(standardized_data)

[[-1.09104581 -0.52284201 -0.710421   -1.03187632]
 [ 0.46631407  0.10063086  1.42673006  5.11511079]
 [-1.44794079 -0.57339386 -0.29723846 -0.79712575]
 ...
 [-1.12349081 -1.18001611 -0.66767798  0.70411863]
 [-0.70170584  0.20173457 -0.02653266 -1.0202837 ]
 [-0.05280589 -0.37118644 -0.9811268  -0.80582021]]


In [14]:
# From here on X is the standardized feature array; Y remains the Outcome column.
X, Y = standardized_data, diabetes_dataset['Outcome']

In [15]:
# Hold out 40% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.4,
    random_state=3,
)

In [16]:
# Shapes of the full, training and test feature arrays, in that order.
print(*(features.shape for features in (X, X_train, X_test)))

(392, 4) (235, 4) (157, 4)


In [17]:
# Support-vector classifier with regularization strength C=12; the kernel is
# spelled out but is the library default (RBF).
classifier = svm.SVC(kernel='rbf', C=12)

In [18]:
# Train the classifier on the training split only.
classifier.fit(X_train, Y_train)

In [19]:
# Accuracy on the training split. accuracy_score is declared as
# accuracy_score(y_true, y_pred), so the ground-truth labels are passed first
# and the model's predictions second.
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [20]:
# Report the accuracy measured on the training split.
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.851063829787234


In [21]:
# Accuracy on the held-out test split. accuracy_score is declared as
# accuracy_score(y_true, y_pred), so the ground-truth labels are passed first
# and the model's predictions second.
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [22]:
# Report the accuracy measured on the held-out test split.
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.7834394904458599


In [23]:
# Feature values for one person, ordered like the feature columns the scaler
# was fitted on: (Glucose, Insulin, BMI, DiabetesPedigreeFunction).
input_data = (122.62,118.84,33.20,0.687)

# Wrap the single sample in a one-row DataFrame labelled with the feature
# columns. The scaler was fitted on a DataFrame, and scikit-learn (1.0+)
# warns when such an estimator is later handed an unlabelled array; labelling
# the values also makes the expected column order explicit.
feature_names = diabetes_dataset.columns.drop('Outcome')
input_frame = pd.DataFrame([input_data], columns=feature_names)

# standardize the input data with the statistics learned when the scaler was fitted
std_data = scaler.transform(input_frame)
print(std_data)

# predict() returns one label per input row; there is exactly one row here.
prediction = classifier.predict(std_data)
print(prediction)

# Outcome 0 is reported as non-diabetic, any other label as diabetic.
if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')

[[-2.44992840e-04 -3.13557331e-01  1.62103635e-02  4.75164369e-01]]
[0]
The person is not diabetic


