In [100]:
import pandas as pd
import seaborn as sns
import numpy as np

In [101]:
df = pd.read_csv(r"diabetes.csv")

In [102]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [103]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
Pregnancies                 768 non-null int64
Glucose                     768 non-null int64
BloodPressure               768 non-null int64
SkinThickness               768 non-null int64
Insulin                     768 non-null int64
BMI                         768 non-null float64
DiabetesPedigreeFunction    768 non-null float64
Age                         768 non-null int64
Outcome                     768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [104]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Managing Missing Values

In [105]:
df["Age_cat"] = pd.cut(df.Age, bins = [20, 28, 35, 45, 60, 81], labels = [1,2,3,4,5])
df.Age_cat = df.Age_cat.astype(int)

Glucose: filling it wrt median of respective Age category

In [106]:
df.Glucose = df.Glucose.where(df["Glucose"] != 0, np.nan)
df.Glucose.fillna(df.groupby(['Age_cat']).transform('median').Glucose, inplace=True)
df["Glucose_cat"] = pd.cut(df.Glucose, bins = [0, 140, 200], labels = [1,2])
df.Glucose_cat = df.Glucose_cat.astype(int)

In [107]:
df.Insulin = df.Insulin.where(df["Insulin"] != 0, np.nan)
df.Insulin.fillna(df.groupby(['Glucose_cat']).transform('median').Insulin, inplace=True)

In [108]:
df.BMI = df.BMI.where(df["BMI"] != 0, 32)
df["BMI_cat"] = pd.cut(df.BMI, bins = [15, 16,18.5, 25, 30, 35, np.inf], labels = [1,2,3,4,5,6])
df.BMI_cat = df.BMI_cat.astype(int)

In [109]:
df.BloodPressure = df.BloodPressure.where(df.BloodPressure != 0, df[df["BloodPressure"] != 0]["BloodPressure"].median())
df.BloodPressure = df.BloodPressure.where(df["BloodPressure"] > 40, 40)

In [110]:
df.SkinThickness = df.SkinThickness.where(df["SkinThickness"] < 60, 60)
df.SkinThickness = df.SkinThickness.where(df["SkinThickness"] != 0, np.nan)
df.SkinThickness = df.SkinThickness.fillna(df.SkinThickness.median())

In [111]:
df["Preg_cat"] = pd.cut(df.Pregnancies, bins = [-1, 2, 5, np.inf], labels = [1,2,3])
df.Pred_cat = df.Pred_cat.astype(int)

# Trainging Model

In [112]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.25, random_state = 42)
for i,j in split.split(df, df["Outcome"]):
    X = df.loc[i]
    test = df.loc[j]

Random Forest

In [123]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

model = RandomForestClassifier(n_estimators = 75) 
model.fit(X.drop("Outcome", axis = 1), X["Outcome"])
pred = model.predict(test.drop("Outcome", axis = 1))
print(confusion_matrix(test.Outcome, pred)," Accuracy: ", accuracy_score(test.Outcome, pred)) 

[[106  19]
 [ 27  40]]  Accuracy:  0.7604166666666666


Applying Grid Search and using the best model

In [210]:
from sklearn.model_selection import GridSearchCV
n_estimators = [75, 100, 125, 150, 200 , 300]
min_samples_split = [2, 5, 10, 15, 20, 100]
min_samples_leaf = [2, 5, 10, 15]
max_depth = [2, 4, 6, 8]
param_grid = [{"n_estimators":n_estimators,  "min_samples_split": min_samples_split,
               "min_samples_leaf": min_samples_leaf,"max_depth" : max_depth}]

model_forest = RandomForestClassifier()
grid_search = GridSearchCV(model_forest, param_grid, cv = 5)
grid_search.fit(X.drop("Outcome", axis = 1), X["Outcome"])

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

selecting the best model

In [211]:
final_model = grid_search.best_estimator_
grid_search.best_params_

{'max_depth': 8,
 'min_samples_leaf': 5,
 'min_samples_split': 5,
 'n_estimators': 100}

In [213]:
final_model.fit(X.drop("Outcome", axis = 1), X["Outcome"])
pred = final_model.predict(test.drop("Outcome", axis = 1))
print(confusion_matrix(test.Outcome, pred)," Accuracy: ", accuracy_score(test.Outcome, pred)) 

[[110  15]
 [ 30  37]]  Accuracy:  0.765625


Random Forest without Outliers

In [193]:
#REMOVING OUTLIERS AND PREDICTING
df_temp = df.copy()
y = df_temp.Outcome
df_temp.drop("Outcome", axis = 1, inplace = True)
IQR = df_temp.quantile(0.75) - df_temp.quantile(0.25)
df_temp = df_temp[~((df_temp < (df_temp.quantile(0.25) - 1.5*IQR)) | (df_temp > (df_temp.quantile(0.75) +1.5*IQR)))
                  .any(axis=1)]
y = y.loc[df_temp.index]
df_temp["Outcome"] = y
df_temp.index = np.arange(0, len(df_temp))

In [194]:
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.25)
for i,j in split.split(df_temp, df_temp["Outcome"]):
    XX = df_temp.loc[i]
    test_temp = df_temp.loc[j]

In [196]:
model = RandomForestClassifier(n_estimators = 75) 
model.fit(XX.drop("Outcome", axis = 1), XX["Outcome"])
pred = model.predict(test_temp.drop("Outcome", axis = 1))
print(confusion_matrix(test_temp.Outcome, pred)," Accuracy: ", accuracy_score(test_temp.Outcome, pred)) 

[[89  2]
 [19  8]]  Accuracy:  0.8220338983050848


SVC

In [199]:
from sklearn.svm import SVC
model = SVC() 
model.fit(X.drop("Outcome", axis = 1), X["Outcome"])
pred = model.predict(test.drop("Outcome", axis = 1))
print(confusion_matrix(test.Outcome, pred)," Accuracy: ", accuracy_score(test.Outcome, pred)) 

[[125   0]
 [ 67   0]]  Accuracy:  0.6510416666666666




Logistic Regression

In [201]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression() 
model.fit(X.drop("Outcome", axis = 1), X["Outcome"])
pred = model.predict(test.drop("Outcome", axis = 1))
print(confusion_matrix(test.Outcome, pred)," Accuracy: ", accuracy_score(test.Outcome, pred))     


[[103  22]
 [ 34  33]]  Accuracy:  0.7083333333333334




K Neighbours Classifier

In [202]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(X.drop("Outcome", axis = 1), X["Outcome"])
pred = model.predict(test.drop("Outcome", axis = 1))
print(confusion_matrix(test.Outcome, pred)," Accuracy: ", accuracy_score(test.Outcome, pred)) 

[[96 29]
 [34 33]]  Accuracy:  0.671875


Gaussian Nauve Baye's

In [204]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X.drop("Outcome", axis = 1), X["Outcome"])
pred = model.predict(test.drop("Outcome", axis = 1))
print(confusion_matrix(test.Outcome, pred)," Accuracy: ", accuracy_score(test.Outcome, pred)) 

[[94 31]
 [25 42]]  Accuracy:  0.7083333333333334


Caliberated Classifier 

In [205]:
from sklearn.calibration import CalibratedClassifierCV
model = CalibratedClassifierCV()
model.fit(X.drop("Outcome", axis = 1), X["Outcome"])
pred = model.predict(test.drop("Outcome", axis = 1))
print(confusion_matrix(test.Outcome, pred)," Accuracy: ", accuracy_score(test.Outcome, pred)) 

[[109  16]
 [ 39  28]]  Accuracy:  0.7135416666666666




Perceptron

In [206]:
from sklearn.linear_model import Perceptron
model = Perceptron(max_iter = 5)
model.fit(X.drop("Outcome", axis = 1), X["Outcome"])
pred = model.predict(test.drop("Outcome", axis = 1))
print(confusion_matrix(test.Outcome, pred)," Accuracy: ", accuracy_score(test.Outcome, pred)) 

[[103  22]
 [ 40  27]]  Accuracy:  0.6770833333333334




X G Boost

In [207]:
import xgboost as xgb
model = xgb.XGBClassifier()
model.fit(X.drop("Outcome", axis = 1), X["Outcome"])
pred = model.predict(test.drop("Outcome", axis = 1))
print(confusion_matrix(test.Outcome, pred)," Accuracy: ", accuracy_score(test.Outcome, pred)) 

[[105  20]
 [ 25  42]]  Accuracy:  0.765625
