In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the raw dataset from the CSV file expected in the working directory.
dataset_path = 'diabetes_prediction_dataset.csv'
diabetes_data = pd.read_csv(dataset_path)

# A bare trailing expression makes the notebook render the frame.
diabetes_data

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [None]:
# Data cleaning: remove features that are not used for modelling.
columns_to_delete = [
    'smoking_history'
]

# errors='ignore' keeps this cell re-runnable: diabetes_data is reassigned from
# itself, so on a second run the column is already gone and a plain drop()
# would raise KeyError.
diabetes_data = diabetes_data.drop(columns = columns_to_delete, errors = 'ignore')

In [None]:
# Integer codes for the gender labels; saved later so prediction can reuse them.
gender_mapping = {'Male': 1, 'Female': 0}

# Encode only while the column still holds the raw text labels. Mapping an
# already-encoded (numeric) column would turn every value into NaN, and the
# dropna() below would then silently remove every row when this cell is re-run.
if not pd.api.types.is_numeric_dtype(diabetes_data['gender']):
  diabetes_data['gender'] = diabetes_data['gender'].map(gender_mapping)

# Any label that is not a key of gender_mapping became NaN above, so those rows
# are removed here together with rows that already had missing values.
diabetes_data = diabetes_data.dropna()

# Sanity check: every column should report 0 remaining nulls.
print(diabetes_data.isnull().sum())


gender                 0
age                    0
hypertension           0
heart_disease          0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64


In [None]:
from sklearn.model_selection import train_test_split

# Separate the label column from the feature columns.
target_column = 'diabetes'
y = diabetes_data[target_column]
X = diabetes_data.drop(columns=[target_column])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# Categorical encoding (gender) was already applied before the split.

# Scale every feature to [0, 1] with its own MinMaxScaler.
# Decision trees split on thresholds and do not need scaled inputs; the scaling
# is kept because the fitted per-column scalers are saved and reused by the
# prediction code further down.

# Rebuild both splits from the unscaled X, selecting the same rows by index
# label. This gives independent copies (assigning columns into the frames
# returned by train_test_split can raise SettingWithCopyWarning) and makes the
# cell safe to re-run: a second run no longer fits the scalers on data that was
# already scaled. Assumes the index labels of X are unique.
X_train = X.loc[X_train.index].copy()
X_test = X.loc[X_test.index].copy()

scalers = {}
for col in X_train.columns:
  scaler = MinMaxScaler()
  # Fit on the training split only, on a plain (n, 1) array so the scaler does
  # not record a feature name and can later transform single raw values.
  X_train[col] = scaler.fit_transform(X_train[col].to_numpy().reshape(-1, 1))[:, 0]
  scalers[col] = scaler

# Apply the training-set scaling to the test set (transform only, no refit).
for col in X_test.columns:
  X_test[col] = scalers[col].transform(X_test[col].to_numpy().reshape(-1, 1))[:, 0]

# Model Selection (Decision Tree)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Seed the tree so repeated runs build the same model. Without random_state the
# classifier permutes the candidate features at each split, so equally good
# splits can be chosen differently from run to run and the metrics drift.
# The seed matches the one used for train_test_split.
model = DecisionTreeClassifier(random_state=1)

# Train the model
model.fit(X_train, y_train)

# Predict on the held-out split
y_pred = model.predict(X_test)

# Evaluate: fraction of test rows classified correctly
acc = accuracy_score(y_test, y_pred)
print(acc)

0.9534430164524679


# Model Evaluation (Decision Tree)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Score the fitted tree on the held-out split and show the raw predictions.
y_pred = model.predict(X_test)
print(y_pred)

# Overall accuracy.
acc = accuracy_score(y_test, y_pred)
accuracy_line = 'Decision Tree Accuracy: {0}'.format(acc)
print(accuracy_line)

# Optional: uncomment to also print the confusion matrix.
#print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Per-class precision / recall / f1.
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

[0 0 0 ... 0 0 0]
Decision Tree Accuracy: 0.9534430164524679
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.97     18293
           1       0.72      0.73      0.73      1704

    accuracy                           0.95     19997
   macro avg       0.85      0.85      0.85     19997
weighted avg       0.95      0.95      0.95     19997



# Model Deployment (Decision Tree)

In [None]:
import joblib
import numpy as np

In [None]:
# Collect the preprocessing objects that have to travel with the model.
# Scalers are stored as a list ordered like the feature columns of X.
scalers_in_order = [scalers[feature] for feature in X.columns]

# Value mappings: gender label -> code, and predicted class -> readable label.
mappings = {
    'gender': gender_mapping,
    'Label': {0: 'No diabetes', 1: 'Diabetes'},
}

# Write the model, the scalers and the mappings to separate joblib files.
joblib.dump(model, 'diabetes_prediction_model.joblib')
joblib.dump(scalers_in_order, 'diabetes_prediction_model_scalers.joblib')
joblib.dump(mappings, 'diabetes_prediction_model_mappings.joblib')

['diabetes_prediction_model_mappings.joblib']

In [None]:
# Read the persisted model and its preprocessing objects back from disk.
model_file = 'diabetes_prediction_model.joblib'
scalers_file = 'diabetes_prediction_model_scalers.joblib'
mappings_file = 'diabetes_prediction_model_mappings.joblib'

new_model = joblib.load(model_file)
new_model_scalers = joblib.load(scalers_file)
new_model_mappings = joblib.load(mappings_file)

In [None]:

# Simple prediction model function
def predict_diabetes(datapoint:list):
  """Predict the diabetes class for one raw, unscaled record.

  Args:
    datapoint: Feature values in the order of ``new_model_scalers``.
      Position 0 is the gender label, looked up in
      ``new_model_mappings['gender']`` (surrounding whitespace and letter
      case are ignored); every other position is a number.

  Returns:
    The result of ``new_model.predict`` for the single scaled row; callers
    take element ``[0]`` to get the predicted class.

  Raises:
    KeyError: If the gender label is not a key of the saved gender mapping.
  """
  # Work on a plain list copy. Building one NumPy array from the mixed
  # str/number input would coerce every feature to a string.
  values = list(datapoint)

  # Encode gender first so that every remaining step is purely numeric.
  gender_codes = new_model_mappings['gender']
  label = values[0]
  if isinstance(label, str) and label not in gender_codes:
    # Accept e.g. "male" or " Female " by matching keys case-insensitively.
    folded = {str(key).lower(): key for key in gender_codes}
    label = folded.get(label.strip().lower(), label)
  values[0] = gender_codes[label]  # unknown labels still raise KeyError

  # Scale every feature, including the encoded gender at index 0, with the
  # scaler stored at the same position.
  for i, scaler in enumerate(new_model_scalers):
    values[i] = scaler.transform(np.array([[values[i]]], dtype=np.float64))[0, 0]

  features = np.array(values, dtype=np.float64).reshape(1, -1)

  # When the model recorded feature names at fit time, pass them back so the
  # estimator does not warn about missing feature names.
  feature_names = getattr(new_model, 'feature_names_in_', None)
  if feature_names is not None and len(feature_names) == features.shape[1]:
    features = pd.DataFrame(features, columns=feature_names)

  # Return the model's prediction for the single row
  return new_model.predict(features)


  # Example of integrating user input with prediction
# Collect one record interactively. Each numeric answer is parsed right at its
# prompt, so a malformed value fails where it was typed.
# Gender is normalised to the "Male"/"Female" spelling used as mapping keys, so
# answers such as "male" or "Female " do not end in a KeyError.
gender = input('Gender (Male/Female): ').strip().capitalize()
# Age is a float feature in the dataset, so fractional ages are accepted too.
age = float(input('Age: '))
hypertension = int(input('Hypertension (0 for No, 1 for Yes): '))
heart_disease = int(input('Heart Disease (0 for No, 1 for Yes): '))
bmi = float(input('BMI: '))
HbA1c_level = float(input('HbA1c level: '))
blood_glucose_level = float(input('Blood glucose level: '))

# Feature order must match the order expected by predict_diabetes.
datapoint = [gender, age, hypertension, heart_disease, bmi, HbA1c_level, blood_glucose_level]

# predict_diabetes returns one prediction per row; translate the class of the
# single row into its readable label.
predicted_class = predict_diabetes(datapoint)[0]
prediction = new_model_mappings['Label'][predicted_class]
print('Prediction: ', prediction)


KeyboardInterrupt: Interrupted by user