In [1]:
import pandas as pd

# Load the raw diabetes dataset; the path is resolved relative to the
# notebook's working directory.
csv_path = 'diabetes_prediction_dataset.csv'
diabetes_data = pd.read_csv(csv_path)

# Preview the first rows as a quick sanity check that the file was parsed.
preview = diabetes_data.head()
print(preview)

   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  


In [2]:
# smoking_history is dropped because some records carry no information for it;
# it is better to disregard the column and report this as a limitation.
columns_to_delete = ['smoking_history']

# drop() returns a new frame without the listed columns; rebind the name to it.
diabetes_data = diabetes_data.drop(columns_to_delete, axis=1)

In [3]:
# Encode gender numerically: 1 for male, 0 for female.
gender_mapping = {'Male': 1, 'Female': 0}

# Only encode while the column still holds the raw labels. Re-running this cell
# after the column is already numeric would otherwise map every value to NaN,
# and the dropna() below would then discard the whole dataset.
if not pd.api.types.is_numeric_dtype(diabetes_data['gender']):
    diabetes_data['gender'] = diabetes_data['gender'].map(gender_mapping)

# Check for missing values BEFORE dropping them, so the report is informative.
# Any gender label that is not a key of gender_mapping shows up here as a
# missing 'gender' value, because Series.map() turns unmapped labels into NaN.
print(diabetes_data.isnull().sum())

# All data fields must be filled out, so rows containing any null are removed.
diabetes_data = diabetes_data.dropna()

# Confirm that no missing values remain (every count should now be 0).
print(diabetes_data.isnull().sum())


gender                 0
age                    0
hypertension           0
heart_disease          0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64


In [4]:
from sklearn.model_selection import train_test_split


# Separate the predictors from the target: every column except 'diabetes'
# is a feature, and 'diabetes' itself is the label to predict.
y = diabetes_data['diabetes']
X = diabetes_data.drop('diabetes', axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [5]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature with MinMaxScaler (default range [0, 1], measured on
# the training data). Scaling matters for distance-based models such as KNN;
# all later cells work on these scaled features.
# train_test_split builds its outputs by indexing X, so take explicit copies
# before assigning columns; this avoids pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit one scaler per column on the TRAINING data only, and keep each fitted
# scaler so the exact same transformation can be reapplied later.
scalers = {}
for col in X_train.columns:
    scaler = MinMaxScaler()
    X_train[col] = scaler.fit_transform(X_train[col].values.reshape(-1, 1))
    scalers[col] = scaler

# Apply the training-set scaling to the test set (transform only, no refit),
# so no information from the test rows leaks into the preprocessing.
for col in X_test.columns:
    X_test[col] = scalers[col].transform(X_test[col].values.reshape(-1, 1))


In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Fix the seed so that refitting on the same data produces the same tree (and
# therefore the same accuracy); 0 matches the seed used for the train/test split.
model = DecisionTreeClassifier(random_state=0)

# Train the model on the training split
model.fit(X_train, y_train)

# Predict labels for the held-out test split
y_pred = model.predict(X_test)

# Evaluate: fraction of test rows whose predicted label equals the true label
acc = accuracy_score(y_test, y_pred)
print(acc)

0.9531929789468421


In [8]:
import joblib
import numpy as np

In [9]:
# The model was fitted on MinMax-scaled features, so whoever loads it needs the
# same fitted scalers to preprocess new inputs. Persist them next to the model.
# NOTE(review): this file name is new; confirm the consuming code loads it.
joblib.dump(scalers, './../savedModels/diabetes_prediction_scalers.joblib')

# Save the trained model (kept as the last expression so the cell output,
# the list of written file names for the model, is unchanged).
joblib.dump(model, './../savedModels/diabetes_prediction_dt_model.joblib')



['./../savedModels/diabetes_prediction_dt_model.joblib']