In [1]:
import pandas as pd
import numpy as np

In [2]:
diabets=pd.read_csv("diabetes.csv" , decimal=',').copy()

In [3]:
diabets.columns

Index(['patient_number', 'cholesterol', 'glucose', 'hdl_chol',
       'chol_hdl_ratio', 'age', 'gender', 'height', 'weight', 'bmi',
       'systolic_bp', 'diastolic_bp', 'waist', 'hip', 'waist_hip_ratio',
       'diabetes'],
      dtype='object')

In [4]:
missing_values= [col for col in diabets.columns
                if diabets[col].isnull().any()]

In [5]:
print(missing_values)

[]


we have no missing values in our dataset

In [6]:
print(diabets.dtypes)

patient_number       int64
cholesterol          int64
glucose              int64
hdl_chol             int64
chol_hdl_ratio     float64
age                  int64
gender              object
height               int64
weight               int64
bmi                float64
systolic_bp          int64
diastolic_bp         int64
waist                int64
hip                  int64
waist_hip_ratio    float64
diabetes            object
dtype: object


In [7]:
diabets.drop('patient_number', inplace=True, axis=1)

we don't need patient_number column.

In [8]:
y=diabets.iloc[:,-1]
X=diabets.iloc[:, 0:-1]

In [9]:
cats=(X.dtypes==object)
categorical_values= list(cats[cats].index)
print(categorical_values)
for v in categorical_values:
    print(v+":\t"+str(X[v].nunique()))

['gender']
gender:	2


we have now devided the columns into categorical and numerical values

In [10]:
values=list(X.columns)

print(values)
numerical_values=values.copy()

for x in categorical_values:
    numerical_values.remove(x)
print(numerical_values)

['cholesterol', 'glucose', 'hdl_chol', 'chol_hdl_ratio', 'age', 'gender', 'height', 'weight', 'bmi', 'systolic_bp', 'diastolic_bp', 'waist', 'hip', 'waist_hip_ratio']
['cholesterol', 'glucose', 'hdl_chol', 'chol_hdl_ratio', 'age', 'height', 'weight', 'bmi', 'systolic_bp', 'diastolic_bp', 'waist', 'hip', 'waist_hip_ratio']


In [11]:
for x in categorical_values:
    print(x)
    print(X[x].unique())

gender
['female' 'male']


In [12]:
from sklearn.preprocessing import OrdinalEncoder
oe= OrdinalEncoder()
print(X.head(10))

   cholesterol  glucose  hdl_chol  chol_hdl_ratio  age  gender  height  \
0          193       77        49             3.9   19  female      61   
1          146       79        41             3.6   19  female      60   
2          217       75        54             4.0   20  female      67   
3          226       97        70             3.2   20  female      64   
4          164       91        67             2.4   20  female      70   
5          170       69        64             2.7   20  female      64   
6          149       77        49             3.0   20  female      62   
7          164       71        63             2.6   20    male      72   
8          230      112        64             3.6   20    male      67   
9          179      105        60             3.0   20  female      58   

   weight   bmi  systolic_bp  diastolic_bp  waist  hip  waist_hip_ratio  
0     119  22.5          118            70     32   38             0.84  
1     135  26.4          108         

Using OrdinalEncoder to transform categorical values to numerical values

In [13]:
oec=["gender"]
X[oec]=oe.fit_transform(X[oec])

In [14]:
print(X.head(10))

   cholesterol  glucose  hdl_chol  chol_hdl_ratio  age  gender  height  \
0          193       77        49             3.9   19     0.0      61   
1          146       79        41             3.6   19     0.0      60   
2          217       75        54             4.0   20     0.0      67   
3          226       97        70             3.2   20     0.0      64   
4          164       91        67             2.4   20     0.0      70   
5          170       69        64             2.7   20     0.0      64   
6          149       77        49             3.0   20     0.0      62   
7          164       71        63             2.6   20     1.0      72   
8          230      112        64             3.6   20     1.0      67   
9          179      105        60             3.0   20     0.0      58   

   weight   bmi  systolic_bp  diastolic_bp  waist  hip  waist_hip_ratio  
0     119  22.5          118            70     32   38             0.84  
1     135  26.4          108         

now it is time to train our dataset with multiple models and see the one with the best result (we are using 3 differents models)

In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

In [16]:
decisiontreemodel= DecisionTreeClassifier()
dt=cross_val_score(decisiontreemodel, X, y, cv=10)
print(dt)

[0.92307692 0.8974359  0.92307692 0.92307692 0.94871795 0.92307692
 0.8974359  0.82051282 0.74358974 0.25641026]


In [17]:
nee=[100, 200, 500,1000]
for ne in nee:
    randomforestmodel=RandomForestClassifier(n_estimators=ne)
    rf=cross_val_score(randomforestmodel, X, y, cv=10)
    print(rf.mean())

0.8948717948717949
0.9000000000000001
0.9025641025641027
0.9025641025641027


In [18]:
Xb=X.copy()
for xb in Xb.columns:
    Xb[xb]=pd.to_numeric(Xb[xb])
for ne in nee:
    xgboostmodel=XGBClassifier(n_estimators=ne, learning_rate=0.00001, n_jobs=4)
# xgboostmodel.fit(Xb,y)
    xg=cross_val_score(xgboostmodel, Xb, y, cv=10)
    print(ne)
    print(xg.mean())
# print(Xb.dtypes)



















100
0.9076923076923078




































200
0.9076923076923078




































500
0.9076923076923078




































1000
0.9076923076923078


In [19]:
allmodels=[dt, rf, xg]
for x in allmodels:
    print(x.mean())

0.8256410256410256
0.9025641025641027
0.9076923076923078


In [21]:
import pickle
filename = 'model.pkl'
model=XGBClassifier(n_estimators=1000, learning_rate=0.05, n_jobs=4)
model.fit(Xb,y)

pickle.dump(model, open(filename, 'wb'))
X_test = np.array([[120,40,46,3.3,26,0.0,70,170,60,136,88,33,39,15.0]])
#TEST
# load the model from disk
loaded_model = pickle.load(open("model.pkl", 'rb'))
result = loaded_model.predict(X_test)
print(result)

['No diabetes']
