# Diabetes Prediction Project

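This diabetes prediction project uses machine learning models — Random Forest, K-Nearest Neighbors, AdaBoost, Decision Tree, and Linear Regression — to classify whether individuals have diabetes based on features such as blood glucose level, HbA1c level, and BMI. 🩺 Linear Regression is a regression model rather than a classifier, so its continuous output is rounded to a class label before it is scored alongside the other models. The models are trained on a dataset containing demographic and health-related attributes to provide accurate predictions and insights. 📊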

## Step 1: Import Necessary Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report


## Step 2: Load and Prepare the Dataset

In [2]:
# Read the raw records into a DataFrame
data = pd.read_csv('diabetes_prediction_dataset.csv')  # Replace with your dataset file path

# Preview the leading rows as a quick sanity check of the columns and values
print(data.head())

# The 'diabetes' column is the label; every other column is a feature
y = data['diabetes']  # Target variable
X = data.drop(columns=['diabetes'])  # Features

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  


## Step 3: Data Preprocessing

In [9]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
# Column groups; each group is preprocessed differently below
numerical_cols = ['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level', 'blood_glucose_level']
categorical_cols = ['gender', 'smoking_history']

# Step 1: Handle numerical columns

# Impute missing values with the column mean. The means are learned from the
# training split only and then reused for the test split.
numerical_imputer = SimpleImputer(strategy='mean')
X_train_numerical = numerical_imputer.fit_transform(X_train[numerical_cols])
X_test_numerical = numerical_imputer.transform(X_test[numerical_cols])

# Standardize numerical features, again fitting on the training split only
scaler = StandardScaler()
X_train_numerical = scaler.fit_transform(X_train_numerical)
X_test_numerical = scaler.transform(X_test_numerical)

# Step 2: Handle categorical columns

# One-hot encode the categorical columns; drop='first' avoids multicollinearity.
# The encoder is left on its default (sparse) output and the result is densified
# explicitly with .toarray(): newer scikit-learn releases renamed the `sparse`
# keyword to `sparse_output` and later removed the old name, so passing
# `sparse=False` fails there, while this form works on old and new releases.
encoder = OneHotEncoder(drop='first')
X_train_categorical = encoder.fit_transform(X_train[categorical_cols]).toarray()
X_test_categorical = encoder.transform(X_test[categorical_cols]).toarray()

# Combine numerical and categorical features column-wise into one dense matrix
X_train_processed = np.hstack((X_train_numerical, X_train_categorical))
X_test_processed = np.hstack((X_test_numerical, X_test_categorical))




## Step 4: Model Selection and Training

In [11]:
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
import pandas as pd

# Expects `data` to be the DataFrame loaded earlier, with the columns shown above

# Step 1: Replace the two text columns with integer codes.
# One encoder object is refit for each column in turn, so afterwards it only
# holds the mapping learned for the last column.
label_encoder = LabelEncoder()
for column in ('gender', 'smoking_history'):
    data[column] = label_encoder.fit_transform(data[column])

# Step 2: Separate the label from the features
y = data['diabetes']  # Target variable
X = data.drop(columns=['diabetes'])  # Features excluding the target variable

# Step 3: Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 4: Standardize every feature column, learning the statistics from the
# training split only and applying them to both splits
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 5: Create the models with their default settings
rf = RandomForestClassifier()
knn = KNeighborsClassifier()
adaboost = AdaBoostClassifier()
dt = DecisionTreeClassifier()
# LinearRegression is a regression model, not a classifier. It is created here
# but deliberately left unfitted in this cell; fit it only when predicting a
# continuous variable (e.g. glucose levels).
lr = LinearRegression()

# Train the four classifiers on the scaled training data
for classifier in (rf, knn, adaboost, dt):
    classifier.fit(X_train_scaled, y_train)

# Evaluation and further steps...


## Step 5: Model Evaluation

In [13]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pandas as pd

# Expects `data` to be the DataFrame loaded earlier, with the columns shown above

# Step 1: Integer-encode the text columns, refitting one shared encoder per column
label_encoder = LabelEncoder()
for text_column in ['gender', 'smoking_history']:
    encoded_values = label_encoder.fit_transform(data[text_column])
    data[text_column] = encoded_values

# Step 2: Features are all columns except the 'diabetes' label
X = data.drop(columns=['diabetes'])  # Features excluding the target variable
y = data['diabetes']  # Target variable

# Step 3: 80/20 train-test split with a fixed seed
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Step 4: Standardize the features using statistics from the training split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 5: Create the models with their default settings
rf = RandomForestClassifier()
knn = KNeighborsClassifier()
adaboost = AdaBoostClassifier()
lr = LinearRegression()  # Note: LinearRegression is for regression, not classification
dt = DecisionTreeClassifier()

# Fit the four classifiers first, then the LinearRegression model, so that every
# model is trained before it is evaluated
for estimator in (rf, knn, adaboost, dt, lr):
    estimator.fit(X_train_scaled, y_train)

# Function to evaluate and print metrics
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data as if it were a classifier.

    Predictions are turned into class labels before scoring, so that a
    regressor (e.g. LinearRegression) can be compared with real classifiers.
    Continuous predictions are rounded to the nearest integer and then clipped
    to the range of labels present in ``y_test``. Rounding alone can produce
    values such as -1 or 2 for a 0/1 target, which would appear as spurious
    extra classes in the classification report. Non-float predictions (the
    output of the classifiers) are scored unchanged.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True class labels for ``X_test``.

    Returns:
        A tuple ``(accuracy, report)`` holding the accuracy score and the
        text classification report.
    """
    import numpy as np  # local import keeps this function self-contained

    y_pred = np.asarray(model.predict(X_test))
    labels = np.asarray(y_test)
    if np.issubdtype(y_pred.dtype, np.floating) and labels.size:
        # Round once, then keep the result inside the observed label range
        y_pred = np.clip(np.rint(y_pred), labels.min(), labels.max())
        if np.issubdtype(labels.dtype, np.integer):
            # Match the label dtype so the report lists 0/1 rather than 0.0/1.0
            y_pred = y_pred.astype(labels.dtype)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return accuracy, report

# Register every fitted model under its display name, in reporting order
models = {}
models['Random Forest'] = rf
models['K-Nearest Neighbors'] = knn
models['AdaBoost'] = adaboost
models['Linear Regression'] = lr
models['Decision Tree'] = dt

# Score each model on the scaled test split and print its metrics
for name, model in models.items():
    accuracy, report = evaluate_model(model, X_test_scaled, y_test)
    print(
        f"=== {name} ===",
        f"Accuracy: {accuracy}",
        f"Classification Report:\n{report}\n",
        sep="\n",
    )


=== Random Forest ===
Accuracy: 0.9701
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     18292
           1       0.95      0.69      0.80      1708

    accuracy                           0.97     20000
   macro avg       0.96      0.84      0.89     20000
weighted avg       0.97      0.97      0.97     20000


=== K-Nearest Neighbors ===
Accuracy: 0.96125
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     18292
           1       0.89      0.62      0.73      1708

    accuracy                           0.96     20000
   macro avg       0.93      0.81      0.86     20000
weighted avg       0.96      0.96      0.96     20000


=== AdaBoost ===
Accuracy: 0.9721
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     18292
           1       0.97      0.70      0.81    