In [2]:
# Smoke-test cell: prints a fixed greeting and touches no other notebook state.
print("Hello world")

Hello world


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score


In [4]:
# Load the dataset from a path relative to the notebook's working directory.
diabetes_data = pd.read_csv("data/diabetes.csv")

# Only the last expression of a cell is rendered, so the bare
# `diabetes_data.head()` that used to sit between these two lines was evaluated
# and silently discarded. It has been removed; put `diabetes_data.head()` in
# its own cell if a preview of the rows is wanted.
# Cell output: (number of rows, number of columns).
diabetes_data.shape

(768, 9)

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
# NOTE(review): the output shows a minimum of 0.0 for Glucose, BloodPressure,
# SkinThickness, Insulin and BMI — presumably placeholders for missing
# measurements rather than real readings; confirm against the data source.
diabetes_data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


Let us get the count of diabetic and non-diabetic people.

In [6]:
# Number of rows per label value in the Outcome column
# (the final cell of this notebook reports 0 as "does not have Diabetes").
diabetes_data["Outcome"].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

Get the per-group statistics for people who are diabetic and people who are non-diabetic.

In [7]:
# Mean of every feature column, computed separately for each Outcome value.
diabetes_data.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


We see that the mean Glucose of people who are diabetic (about 141.3) is higher than that of people who are not (about 110.0).

The analysis of the data shows that the mean vitals differ noticeably between the two groups. This information can be used to decide which model to use, and in this case a Support Vector Machine is a reasonable choice.

In [12]:
# Separate the feature matrix from the label vector.
# The columns are selected by name rather than by position
# (`iloc[:, :-1]` / `iloc[:, -1]`), so the split stays correct even if the
# column order of the CSV changes. With "Outcome" as the last column the
# result is identical to the positional version. The stray trailing
# semicolons on the assignments were also removed.
diabetes_vitals_values = diabetes_data.drop(columns="Outcome")
print("diabetes_vitals_values : ", diabetes_vitals_values.shape)
diabetes_outcomes = diabetes_data["Outcome"]
print("diabetes_outcomes : ", diabetes_outcomes.shape)


diabetes_vitals_values :  (768, 8)
diabetes_outcomes :  (768,)


In [17]:
# Standardize the feature columns: learn the scaling parameters first, then
# apply them to the same rows.
# NOTE(review): the scaler is fitted on every row here, before the train/test
# split performed in a later cell, so statistics from the test rows influence
# the scaling. Consider fitting on the training rows only.
scaler = StandardScaler().fit(diabetes_vitals_values)
standardized_diabetes_values = scaler.transform(diabetes_vitals_values)
standardized_diabetes_values

array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505, -0.87137393]])

In [25]:
# Short aliases used by the modelling cells below:
# X is the standardized feature matrix, Y is the Outcome label column.
X, Y = standardized_diabetes_values, diabetes_data['Outcome']
print(X, Y, sep="\n")


[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]
0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


In [26]:
# Hold out 20% of the rows for testing. `stratify=Y` splits in a stratified
# fashion using the labels, and the fixed `random_state` makes the split
# repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [28]:
# Report the shape of each part of the split, one line per array.
for _label, _split in (
    ("X_Train : ", X_train),
    ("Y_Train : ", Y_train),
    ("X_Test : ", X_test),
    ("Y_Test : ", Y_test),
):
    print(_label, _split.shape)

X_Train :  (614, 8)
Y_Train :  (614,)
X_Test :  (154, 8)
Y_Test :  (154,)


In [31]:
# Support Vector Classifier with a linear kernel; no other SVC arguments are passed.
classifier = svm.SVC(kernel='linear')

In [32]:
# Train the classifier on the training split only; the test split is kept for evaluation.
classifier.fit(X_train, Y_train)

Training model accuracy

In [40]:
# Accuracy on the rows the model was trained on.
X_train_prediction = classifier.predict(X_train)
# accuracy_score is documented as accuracy_score(y_true, y_pred); the two
# arguments were previously passed the other way round. Plain accuracy gives
# the same number either way, but the documented order keeps this cell correct
# if the metric is later replaced by one that is not symmetric.
training_accuracy_score = accuracy_score(Y_train, X_train_prediction)
print("Accuracy Score of training model : {:.2f}%".format(training_accuracy_score*100))

Accuracy Score of training model : 78.66%


In [41]:
# Accuracy on the held-out test rows.
X_test_prediction = classifier.predict(X_test)
# accuracy_score is documented as accuracy_score(y_true, y_pred); the two
# arguments were previously passed the other way round. Plain accuracy gives
# the same number either way, but the documented order keeps this cell correct
# if the metric is later replaced by one that is not symmetric.
testing_accuracy_score = accuracy_score(Y_test, X_test_prediction)
print("Accuracy Score of testing model : {:.2f}%".format(testing_accuracy_score*100))

Accuracy Score of testing model : 77.27%


If we had *overfitting*, then the accuracy on the *training data* would be **very high** and that on the *testing data* would be **very low**.
Our model does not appear to be overfitting, as the training accuracy (78.66%) and the testing accuracy (77.27%) are close to each other.

In [59]:
# Hand-picked rows from the CSV used to spot-check the trained model. Each tuple
# lists the eight feature values in the same order as the columns of
# `diabetes_vitals_values`.
# These used to be three successive assignments to `input_data`, so the first
# two were dead code. They are kept as named alternatives instead; change the
# key below to try another row. The trailing notes are the author's own
# observations for each row.
sample_inputs = {
    "csv_line_18": (0, 118, 84, 47, 230, 45.8, 0.551, 31),   # has diabetes in the CSV; prediction was incorrect
    "csv_line_22": (3, 126, 88, 41, 235, 39.3, 0.704, 27),   # prediction was accurate and correct
    "csv_line_45": (9, 171, 110, 24, 240, 45.4, 0.721, 54),  # prediction was accurate and correct
}
input_data = sample_inputs["csv_line_45"]  # the row the original cell ended up using

input_data_as_NP_array = np.asarray(input_data)
# reshape(1, -1): a single sample as one row with all features as columns.
input_data_reshaped = input_data_as_NP_array.reshape(1, -1)

# The scaler was fitted on a DataFrame, so give it a DataFrame carrying the
# same column names; passing a bare array makes scikit-learn warn that the
# input has no valid feature names. The transformed values are unchanged.
input_data_frame = pd.DataFrame(input_data_reshaped, columns=diabetes_vitals_values.columns)
standardized_input_data = scaler.transform(input_data_frame)
print("Standardized Input Data : ", standardized_input_data)

Standardized Input Data :  [[1.53084665 1.56815814 2.11415525 0.21726125 1.39100445 1.70165987
  0.75238313 1.76634642]]




In [60]:
# Classify the standardized sample; the printed result is an array holding one label.
prediction = classifier.predict(standardized_input_data)
print(prediction)


[1]


The prediction is correct: the model outputs `1` (diabetic), and the Person **is Diabetic** according to the given data.

In [61]:
# Turn the numeric label into a readable verdict:
# 0 means no diabetes, any other value is reported as diabetes.
print("Person does not have Diabetes" if prediction[0] == 0 else "Person has Diabetes")

Person has Diabetes
