In [85]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

In [86]:
df = pd.read_csv('diabetes_prediction_dataset.csv')

In [87]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [88]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [89]:
df = df[['gender', 'age', 'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes']]

In [90]:
df.head()

Unnamed: 0,gender,age,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,25.19,6.6,140,0
1,Female,54.0,27.32,6.6,80,0
2,Male,28.0,27.32,5.7,158,0
3,Female,36.0,23.45,5.0,155,0
4,Male,76.0,20.14,4.8,155,0


In [91]:
le_gender = LabelEncoder()
df['gender'] = le_gender.fit_transform(df['gender'])

In [92]:
df.head()

Unnamed: 0,gender,age,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,25.19,6.6,140,0
1,0,54.0,27.32,6.6,80,0
2,1,28.0,27.32,5.7,158,0
3,0,36.0,23.45,5.0,155,0
4,1,76.0,20.14,4.8,155,0


In [93]:
X = df.iloc[:, 0:-1]
Y = df.iloc[:, -1]

X.head(), Y.head()

(   gender   age    bmi  HbA1c_level  blood_glucose_level
 0       0  80.0  25.19          6.6                  140
 1       0  54.0  27.32          6.6                   80
 2       1  28.0  27.32          5.7                  158
 3       0  36.0  23.45          5.0                  155
 4       1  76.0  20.14          4.8                  155,
 0    0
 1    0
 2    0
 3    0
 4    0
 Name: diabetes, dtype: int64)

In [96]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)
X_train.shape, X_test.shape,Y_train.shape, Y_test.shape

((80000, 5), (20000, 5), (80000,), (20000,))

In [98]:
classifier = DecisionTreeClassifier()
model = classifier.fit(X_train, Y_train)
model.score(X_train, Y_train), model.score(X_test, Y_test)

(0.9968375, 0.95435)

In [100]:
model.predict(X_test[0:6]), Y_test.head()

(array([0, 0, 0, 0, 0, 0]),
 3582     0
 60498    0
 53227    0
 21333    0
 3885     0
 Name: diabetes, dtype: int64)

In [101]:
import pickle

In [None]:
data = {"model":classifier, "le_gender": le_gender}
with open('diabetes_prediction_model.pkl', 'wb') as file:
    pickle.dump(data, file)