In [1]:
import pandas as pd

#reading data
diabetes_data = pd.read_csv('/content/diabetes_prediction_dataset.csv')

# Displaying the first few rows of the dataframe to ensure it's loaded correctly
print(diabetes_data.head())

   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  


# Data Cleaning and Feature Selection

In [2]:
#Removing smoking_history since some data have no information; best to dsiregard and add as limitations

columns_to_delete = [
    'smoking_history'
]

diabetes_data = diabetes_data.drop(columns = columns_to_delete)

In [3]:
# Mapping the Gender Column into 1 for male and 0 for female
gender_mapping = {'Male': 1, 'Female': 0}
diabetes_data['gender'] = diabetes_data['gender'].map(gender_mapping)


#All data fields must be filled out, thus remove nulls
# Check for missing values

diabetes_data = diabetes_data.dropna()
print(diabetes_data.isnull().sum())


gender                 0
age                    0
hypertension           0
heart_disease          0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64


# Split the Data
Split the data into training and testing sets.



In [4]:
from sklearn.model_selection import train_test_split


#Identify feature to be predicted & Split training and test set
X = diabetes_data.drop(columns=['diabetes'])
y = diabetes_data['diabetes']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Data Preprocessing
Encode categorical variables & scale/normalize numerical variables.

In [5]:
from sklearn.preprocessing import MinMaxScaler

# need to do scaling to all features
scalers = {}
for col in X_train.columns:
  scaler = MinMaxScaler()
  X_train[col] = scaler.fit_transform(X_train[col].values.reshape(-1, 1))
  scalers[col] = scaler

# Apply Scaling into the Test Set
for col in X_test.columns:
  X_test[col] = scalers[col].transform(X_test[col].values.reshape(-1, 1))


# Select Model

In [6]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=5)

# Train the model
model.fit(X_train, y_train)

# Evaluation of Model

In [7]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('KNN Accuracy: {0}'.format(acc))
#print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
#print("Classification Report:\n", classification_report(y_test, y_pred))

KNN Accuracy: 0.9668950342551382


# Predictor/Model Deployment


In [8]:
import joblib
import numpy as np

In [9]:
# Save the trained model
joblib.dump(model, 'diabetes_prediction_model.joblib')

# Save scalers used for numerical features
scalers_in_order = [scalers[col] for col in X.columns]
joblib.dump(scalers_in_order, 'diabetes_prediction_model_scalers.joblib')

# Save mappings or encoders used for categorical variables (if any)
mappings = {'gender': gender_mapping, 'Label': {0: 'No diabetes', 1: 'Diabetes'}}
joblib.dump(mappings, 'diabetes_prediction_model_mappings.joblib')

['diabetes_prediction_model_mappings.joblib']

In [10]:
# Load the saved model and preprocessing objects
new_model = joblib.load('diabetes_prediction_model.joblib')
new_model_scalers = joblib.load('diabetes_prediction_model_scalers.joblib')
new_model_mappings = joblib.load('diabetes_prediction_model_mappings.joblib')


In [13]:

# Simple prediction model function
def predict_diabetes(datapoint:list):
  datapoint_numpy = np.array(datapoint)

  # Perform Scaling for numerical features, starting from index 1
  for i, scaler in enumerate(new_model_scalers):
    if i == 0:
      datapoint_numpy[i] = new_model_mappings['gender'][datapoint_numpy[i]]

    datapoint_numpy[i] = scaler.transform(np.array([datapoint_numpy[i]]).reshape(-1, 1))[0, 0]
  # Return Predicted Model
  return new_model.predict(datapoint_numpy.astype(np.float64).reshape(1, -1))


  # Example of integrating user input with prediction
gender = input('Gender (Male/Female): ')
age = int(input('Age: '))
hypertension = int(input('Hypertension (0 for No, 1 for Yes): '))
heart_disease = int(input('Heart Disease (0 for No, 1 for Yes): '))
bmi = float(input('BMI: '))
HbA1c_level = float(input('HbA1c level: '))
blood_glucose_level = float(input('Blood glucose level: '))




prediction = new_model_mappings['Label'][predict_diabetes([gender, age, hypertension, heart_disease, bmi, HbA1c_level, blood_glucose_level])[0]]
print('Prediction: ', prediction)


Gender (Male/Female): Male
Age: 80
Hypertension (0 for No, 1 for Yes): 1
Heart Disease (0 for No, 1 for Yes): 1
BMI: 21
HbA1c level: 8
Blood glucose level: 160
Prediction:  Diabetes




In [None]:
1