In [2]:
# Import dependencies
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Read the data into a Pandas DataFrame
# Read the CSV file from the Resources folder into a Pandas DataFrame
file_path = Path("../diabetes_prediction_dataset.csv")
df_diabetes = pd.read_csv(file_path)

# Review the DataFrame
df_diabetes.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [4]:
df_diabetes['gender'].fillna(df_diabetes['gender'].mode()[0], inplace=True)

In [5]:
gender_mapping = {'Female': 0, 'Male': 1, 'Other': 2}
df_diabetes['gender'] = df_diabetes['gender'].map(gender_mapping)

In [11]:
sh_dummies = pd.get_dummies(df_diabetes['smoking_history'], dtype = float)
df_diabetes = pd.concat([df_diabetes, sh_dummies], axis = 1)
df_diabetes = df_diabetes.drop(["smoking_history"], axis = 1)
df_diabetes.head()

Unnamed: 0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,No Info,current,...,ever,former,never,not current,No Info.1,current.1,ever.1,former.1,never.1,not current.1
0,0,80.0,0,1,25.19,6.6,140,0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0,54.0,0,0,27.32,6.6,80,0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1,28.0,0,0,27.32,5.7,158,0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0,36.0,0,0,23.45,5.0,155,0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1,76.0,1,1,20.14,4.8,155,0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [12]:
print(df_diabetes.head(30))

    gender   age  hypertension  heart_disease    bmi  HbA1c_level  \
0        0  80.0             0              1  25.19          6.6   
1        0  54.0             0              0  27.32          6.6   
2        1  28.0             0              0  27.32          5.7   
3        0  36.0             0              0  23.45          5.0   
4        1  76.0             1              1  20.14          4.8   
5        0  20.0             0              0  27.32          6.6   
6        0  44.0             0              0  19.31          6.5   
7        0  79.0             0              0  23.86          5.7   
8        1  42.0             0              0  33.64          4.8   
9        0  32.0             0              0  27.32          5.0   
10       0  53.0             0              0  27.32          6.1   
11       0  54.0             0              0  54.70          6.0   
12       0  78.0             0              0  36.05          5.0   
13       0  67.0             0    

In [13]:
# Separate features and target variable
X = df_diabetes.drop(columns=['diabetes']) 
y = df_diabetes['diabetes']

In [14]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) 

In [15]:
# Initialize Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [16]:
# Fit the model to training data
rf_classifier.fit(X_train, y_train)

In [17]:
# Predict the target variable using testing data
y_pred = rf_classifier.predict(X_test)

In [18]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.96968
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     22850
           1       0.95      0.68      0.79      2150

    accuracy                           0.97     25000
   macro avg       0.96      0.84      0.89     25000
weighted avg       0.97      0.97      0.97     25000

