# Diabetes classification

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the diabetes dataset CSV from the working directory, then preview
# the first five rows as the cell's displayed result.
data = pd.read_csv(filepath_or_buffer='Diabetes_dataset.csv')
data.head(n=5)

Unnamed: 0,Age,Gender,Family_Diabetes,PhysicalActivity,BMI,Smoking,Alcohol,Sleep,RegularMedicine,JunkFood,Stress,BPLevel,Pregnancies,UrinationFreq,Diabetic
0,50-59,Male,no,> 60 min,39.0,no,no,8,no,occasionally,sometimes,high,0.0,not much,no
1,50-59,Male,no,< 30 min,28.0,no,no,8,yes,very often,sometimes,normal,0.0,not much,no
2,40-49,Male,no,> 60 min,24.0,no,no,6,no,occasionally,sometimes,normal,0.0,not much,no
3,50-59,Male,no,> 60 min,23.0,no,no,8,no,occasionally,sometimes,normal,0.0,not much,no
4,40-49,Male,no,< 30 min,27.0,no,no,8,no,occasionally,sometimes,normal,0.0,not much,no


### Imputing missing values

In [3]:
# Count the NaN entries in every column, then show only the columns that
# actually contain at least one gap.
missing = data.isnull().sum()
missing.loc[missing > 0]

BMI             4
Pregnancies    42
dtype: int64

In [4]:
# Column mean of BMI (NaNs are skipped by pandas), rounded to a whole number.
BMImean = round(data['BMI'].mean())

In [5]:
# Fill the gaps by assigning the filled column back to the frame.
# Calling fillna(..., inplace=True) on `data.BMI` operates on an intermediate
# Series: recent pandas versions warn about this chained in-place update, and
# under Copy-on-Write it leaves `data` untouched, so the NaNs would survive.
data['BMI'] = data['BMI'].fillna(BMImean)          # missing BMI -> rounded mean BMI
data['Pregnancies'] = data['Pregnancies'].fillna(0)  # missing pregnancy count -> 0

### Identify inputs and targets

In [6]:
# The label to predict, and the feature columns: every column except the
# last one (this relies on the target being the final column of the frame).
target_col = 'Diabetic'
input_cols = data.columns[0:-1]

In [7]:
# Split the column names by storage type: 64-bit integer/float columns are
# treated as numeric, object-typed (string) columns as categorical.
numeric_frame = data.select_dtypes(include=['int64', 'float64'])
categorical_frame = data.select_dtypes(include='object')
numeric_cols = list(numeric_frame.columns)
categorical_cols = list(categorical_frame.columns)

### Encoding categorical data

In [8]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
map_dict = {}

for column in categorical_cols:
    # Refit the shared encoder on this column and overwrite the column with
    # the resulting integer codes.
    data[column] = le.fit_transform(data[column])

    # Record which original label ended up with which code so that encoded
    # values can be translated back (or new inputs encoded by hand) later.
    labels = le.classes_
    map_dict[column] = {
        label: code for label, code in zip(labels, le.transform(labels))
    }

print(map_dict)

{'Age': {'40-49': 0, '50-59': 1, '60 or older': 2, 'less than 40': 3}, 'Gender': {'Female': 0, 'Male': 1}, 'Family_Diabetes': {'no': 0, 'yes': 1}, 'PhysicalActivity': {'30-60 min': 0, '< 30 min': 1, '> 60 min': 2, 'none': 3}, 'Smoking': {'no': 0, 'yes': 1}, 'Alcohol': {'no': 0, 'yes': 1}, 'RegularMedicine': {'no': 0, 'yes': 1}, 'JunkFood': {'occasionally': 0, 'often': 1, 'very often': 2}, 'Stress': {'not at all': 0, 'sometimes': 1, 'very often': 2}, 'BPLevel': {'high': 0, 'low': 1, 'normal': 2}, 'UrinationFreq': {'not much': 0, 'quite often': 1}, 'Diabetic': {'no': 0, 'yes': 1}}


### Building the model

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

In [10]:
# Separate features from the label, then hold out 20% of the rows for
# evaluation. The fixed random_state makes the split repeatable.
X, y = data[input_cols], data[target_col]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

In [11]:
# Fit a decision tree on the training split and score it on the held-out split.
# random_state is pinned because the tree randomly permutes the candidate
# features at every split; without a seed, ties between equally good splits
# can resolve differently from run to run and the reported metrics drift.
model = DecisionTreeClassifier(random_state=21)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Per-class precision/recall/F1 followed by the overall accuracy.
print(classification_report(y_test, predictions))
print('Accuracy: ', accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97       136
           1       0.94      0.91      0.93        55

    accuracy                           0.96       191
   macro avg       0.95      0.94      0.95       191
weighted avg       0.96      0.96      0.96       191

Accuracy:  0.9581151832460733


### Pickle the model

In [12]:
import pickle

model_filename = 'diabetes-model.pkl'

# Serialize the fitted model. The context manager flushes and closes the file,
# so the artifact is complete on disk before it is read back below.
with open(model_filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# Round-trip check: reload the artifact that was just written.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load model files that come from a trusted source.
with open(model_filename, 'rb') as model_file:
    model = pickle.load(model_file)

# Score one hand-encoded sample. It is wrapped in a DataFrame with the training
# column names so predict() receives the same feature names the model was
# fitted with; a bare list makes scikit-learn warn about missing feature names.
sample = pd.DataFrame([[3,1,0,0,32,0,0,9,0,0,0,2,0,0]], columns=input_cols)
print(model.predict(sample))

[0]
