<a href="https://colab.research.google.com/github/manjotmb20/Diabetes-prediction-using-Machine-Learning/blob/master/fdiabetes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [0]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.naive_bayes import GaussianNB

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

In [0]:
from sklearn.model_selection import GridSearchCV

In [0]:
from sklearn.feature_selection import RFECV
import lightgbm as lgb


In [0]:
import xgboost as xgb

In [0]:
diabetes=pd.read_csv("diabetes.csv")

Feature labels description-: 
preg-: Pregnancies
plas-: Glucose
pres-: BloodPressure
skin-: SkinThickness
insu-: Insulin
mass-: BMI
pedi-: DiabetesPedigreeFunction
age-:  Age

In [75]:
diabetes.head()

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Removing outlier from skinthickness

In [0]:
max_skinthickness=diabetes.skin.max()
diabetes=diabetes[diabetes.skin!=max_skinthickness]

Replacing Zero values in Pressure,SkinThickness,Insulin,BMI with mean

In [0]:
def replace_zero(df,field,target):
  mean=df.loc[df[field]!=0,[field,target]].groupby(target).mean()
  diabetes.loc[(df[field]==0)&(df[target]==0),field]=mean.iloc[0][0]
  diabetes.loc[(df[field]==0)&(df[target]==1),field]=mean.iloc[1][0]

for col in ['plas','pres','skin','insu','mass']:
  replace_zero(diabetes,col,'class')

In [103]:
diabetes.groupby('class').size()

class
0    500
1    267
dtype: int64

In [104]:
diabetes.isnull().sum()

preg     0
plas     0
pres     0
skin     0
insu     0
mass     0
pedi     0
age      0
class    0
dtype: int64

In [109]:
diabetes[diabetes.mass==0].shape

(0, 9)

In [0]:
diabetes=diabetes[(diabetes.pres!=0)&(diabetes.mass!=0)&(diabetes.plas!=0)]

In [0]:
feature_names=['preg','plas','pres','skin','insu','mass','pedi','age']
X=diabetes[feature_names]
y=diabetes['class']

In [0]:
y.head()

0    tested_positive
1    tested_negative
2    tested_positive
3    tested_negative
4    tested_positive
Name: class, dtype: object

In [0]:
models=[]
models.append(('KNN',KNeighborsClassifier()))
models.append(('SVC',SVC()))
models.append(('LR',LogisticRegression()))
models.append(('DT',DecisionTreeClassifier()))
models.append(('GNB',GaussianNB()))
models.append(('RF',RandomForestClassifier()))
models.append(('GB',GradientBoostingClassifier()))

In [0]:
X_train, X_test, y_train, y_test=train_test_split(X,y,stratify=diabetes['class'],random_state=0)

In [113]:
len(X_test)

192

In [115]:
names=[]
scores=[]

for name, model in models:
  model.fit(X_train,y_train)
  y_pred=model.predict(X_test)
  scores.append(accuracy_score(y_test,y_pred))
  names.append(name)

tr_split=pd.DataFrame({'Name': names, 'Score': scores})
print(tr_split)

  Name     Score
0  KNN  0.807292
1  SVC  0.651042
2   LR  0.781250
3   DT  0.812500
4  GNB  0.776042
5   RF  0.817708
6   GB  0.869792




In [128]:
strat_k_fold = StratifiedKFold(n_splits=10, random_state=10)
names=[]
scores=[]

for name, model in models:
  score=cross_val_score(model,X,y,cv=strat_k_fold,scoring='accuracy').mean()
  names.append(name)
  scores.append(score)

kf_cross_val = pd.DataFrame({'Name': names, 'Score': scores})
print(kf_cross_val)    



  Name     Score
0  KNN  0.846172
1  SVC  0.651914
2   LR  0.784962
3   DT  0.848855
4  GNB  0.777187
5   RF  0.848821
6   GB  0.894446


GRID SEARCH on XGB Parameters-: As from the above algorithms we see that Gradient Boosting algorithm performed well with 89% accuracy, so we will further explore XGboost Algorithm.

In [0]:
target='class'
predictors=['preg','plas','pres','skin','insu','mass','pedi','age']
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch1.fit(diabetes[predictors],diabetes[target])


Implementing XGBoost Classifier


In [134]:
clf3=xgb.XGBClassifier(learning_rate=0.1,n_estimators=200,max_depth=5,min_child_weight=5,gamma=0,subsample=0.8,colsample_bytree=0.8,reg_alpha=0.005,objective='binary:logistic',n_thread=4,scale_pos_weight=1)
clf3.fit(X_train,y_train)
y_pred=clf3.predict(X_test)
print("Final Best Accuracy-: {}%".format(accuracy_score(y_test,y_pred)*100))

Final Best Accuracy-: 89.58333333333334%
