DIABETES PREDICTION MODEL USING SUPPORT VECTOR CLASSIFIER

IMPORTING THE DEPENDENCIES

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

DATA COLLECTION

--> dataset link : https://www.kaggle.com/uciml/pima-indians-diabetes-database
:: The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.

In [None]:
# Read the diabetes records from the CSV file in the working directory.
dataset = pd.read_csv(filepath_or_buffer="diabetes_dataset.csv")


In [None]:
# Preview the first five rows of the loaded table.
dataset.head(n=5)

In [None]:
# Per-column count of null entries (isna performs the same check as isnull).
dataset.isna().sum()

In [None]:

# Number of records, taken from the first axis of the frame's shape.
print("total number of rows : {0}".format(dataset.shape[0]))


Checking the number of 0 values in each column (a 0 in these columns is treated as a missing reading)

In [None]:
# For each listed column, report how many rows hold a literal 0.
for column in ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'Age'):
    zero_rows = dataset.loc[dataset[column] == 0]
    print("number of rows missing {0}: {1}".format(column, len(zero_rows)))

In [None]:
# Class balance: number of rows for each value of the Outcome column.
dataset.loc[:, 'Outcome'].value_counts()

1 --> patient is diabetic
0 --> patient is not diabetic

SEPARATING THE FEATURES
X --> INDEPENDENT VARIABLES (FEATURES)
Y --> DEPENDENT VARIABLE (OUTCOME)

In [None]:
# Every column except the last is a feature; the last column is the target.
x, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

In [None]:
# Preview the first five rows of the feature table.
x.head(n=5)

In [None]:
# Preview the first five target values.
y.head(n=5)

Filling 0 values with mean value of the column

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

# Measurement columns in which a 0 stands for "not recorded". Pregnancies is
# deliberately excluded: 0 is a legitimate count there and must not be
# overwritten with the column mean.
ZERO_MEANS_MISSING = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# x is every dataset column except the last, so a column's position in
# `dataset` is also its position in the feature matrix.
zero_missing_idx = [dataset.columns.get_loc(name) for name in ZERO_MEANS_MISSING]


def mark_missing_zeros(features):
    """Return a float copy of `features` with 0 replaced by NaN in the zero-as-missing columns."""
    marked = np.array(features, dtype=float)  # np.array copies, so the input is left untouched
    block = marked[:, zero_missing_idx]
    marked[:, zero_missing_idx] = np.where(block == 0, np.nan, block)
    return marked


# The pipeline takes and returns all feature columns, so the fitted `imputer`
# can be applied to new samples with imputer.transform(...).
imputer = make_pipeline(
    FunctionTransformer(mark_missing_zeros),
    SimpleImputer(missing_values=np.nan, strategy='mean'),
)

# Fit on a plain float array so later transform calls on NumPy input match.
x = imputer.fit_transform(np.asarray(x, dtype=float))


In [None]:
# Show the feature matrix returned by imputer.fit_transform.
print(x)

DATA STANDARDISATION

In [None]:
# Learn per-column mean and standard deviation from x, then replace x with
# its standardised version. The fitted scaler is kept for later transforms.
scalar = StandardScaler()
scalar.fit(x)
x = scalar.transform(x)

In [None]:
# Show the standardised feature matrix returned by the scaler.
print(x)

SPLITTING THE DATA INTO TRAIN AND TEST

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions equal in both parts, and the fixed seed makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=52,
)

In [None]:
# Show the training portion of the feature matrix.
print(xtrain)

In [None]:
# Show the training portion of the target values.
print(ytrain)

In [None]:
# Shapes of the train and test feature matrices.
print(xtrain.shape, xtest.shape)

In [None]:
# Shapes of the train and test target vectors.
print(ytrain.shape, ytest.shape)

TRAINING THE MODEL

-->USING SUPPORT VECTOR MACHINE MODEL

In [None]:
# Support vector classifier with a linear kernel.
svmmodel = svm.SVC(kernel="linear")

In [None]:
# Fit the support vector classifier on the training split.
svmmodel.fit(X=xtrain, y=ytrain)

-->USING RANDOM FOREST CLASSIFIER

In [None]:

# Random forest with a fixed seed so the fitted trees are reproducible.
random_forest_model = RandomForestClassifier(random_state=10)

# ytrain is already one-dimensional, so it is passed as-is. Series.ravel() is
# deprecated in recent pandas releases and is not needed here.
random_forest_model.fit(xtrain, ytrain)

-->USING DECISION TREE CLASSIFIER

In [None]:

# Shallow decision tree (at most three levels) that splits on entropy; the
# fixed seed keeps the fitted tree reproducible.
modeldc = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    random_state=0,
)
modeldc.fit(X=xtrain, y=ytrain)

ACCURACY SCORE

-->ACCURACY FOR SUPPORT VECTOR MACHINE

In [None]:
#ACCURACY ON TRAINING DATASET
x_train_predict =  svmmodel.predict(xtrain)
train_accuracy = accuracy_score(x_train_predict, ytrain)

print("Accuracy =", train_accuracy*100)

In [None]:
#ACCURACY ON TESTING DATASET
x_test_predict =  svmmodel.predict(xtest)
test_accuracy = accuracy_score(x_test_predict, ytest)

print("Accuracy =", test_accuracy*100)

-->ACCURACY FOR RANDOM FOREST CLASSIFIER

In [None]:
# NOTE: despite its name, predict_train_data holds predictions for the TEST split.
predict_train_data = random_forest_model.predict(xtest)

# accuracy_score is already imported in the notebook's import cell, so no
# additional mid-notebook import is needed.
print("Accuracy = ",accuracy_score(ytest, predict_train_data)*100)

-->ACCURACY FOR DECISION TREE CLASSIFIER

In [None]:
# Predictions of the fitted decision tree on the held-out rows.
ypred = modeldc.predict(xtest)

# accuracy_score is already imported in the notebook's import cell, so no
# additional mid-notebook import is needed.
print("Accuracy = ",accuracy_score(ytest, ypred)*100)

PREDICTION

SINCE WE GET THE BEST ACCURACY USING SUPPORT VECTOR MACHINE MODEL WE USE IT TO PREDICT THE TEST CASES

CASE 1 :PREDICTION :: 
VALUES -- > 5,116,74,0,0,25.6,0.201,30 :: 
OUTCOME -- > 0

-->Transforming the input data

In [None]:
inputdata = (5,116,74,0,0,25.6,0.201,30) #OUTCOME SHOULD BE NON DIABETIC

# conversion to a float NumPy array holding a single row (one sample)
inputnp = np.asarray(inputdata, dtype=float).reshape(1, -1)

# Apply the fitted imputer before scaling, exactly as was done for the training
# features. Without this step the 0 readings in the sample would be scaled as
# real measurements, although the model never saw such values during training.
data = scalar.transform(imputer.transform(inputnp))



-->Prediction

In [None]:
# Classify the prepared sample and print the label for its predicted class.
prediction = svmmodel.predict(data)
verdict = "Diabetec" if prediction[0] != 0 else "Not Diabetec"
print(verdict)

CASE 2 :PREDICTION :: 
VALUES -- > 5,166,72,19,175,25,0.587,51 :: 
OUTCOME -- > 1

-->Transforming the input data

In [None]:
inputdata2 = (5,166,72,19,175,25,0.587,51) #OUTCOME SHOULD BE DIABETIC

# conversion to a float NumPy array holding a single row (one sample)
inputnp2 = np.asarray(inputdata2, dtype=float).reshape(1, -1)

# Apply the fitted imputer before scaling, exactly as was done for the training
# features, so new samples go through the same preprocessing as the data the
# model was trained on.
data2 = scalar.transform(imputer.transform(inputnp2))



-->Prediction

In [None]:
# Classify the second prepared sample and print the label for its predicted class.
prediction = svmmodel.predict(data2)
verdict = "Diabetec" if prediction[0] != 0 else "Not Diabetec"
print(verdict)

--END--