# Training a Diabetes Predictor Model

### Import necessary modules

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

### Load the dataset

In [2]:
# Read the diabetes prediction CSV into a DataFrame.
# The path is relative, so the file must be in the notebook's working directory.
file_path = 'diabetes_prediction_dataset.csv'
diabetes_data = pd.read_csv(filepath_or_buffer=file_path)

### About the dataset

The Diabetes prediction dataset is a collection of medical and demographic data from patients, along with their diabetes status (positive or negative). The data includes features such as age, gender, body mass index (BMI), hypertension, heart disease, smoking history, HbA1c level, and blood glucose level. This dataset can be used to build machine learning models to predict diabetes in patients based on their medical history and demographic information. The dataset can be downloaded at https://www.kaggle.com/datasets/iammustafatz/diabetes-prediction-dataset.


### Attributes of the dataset:

 - **Age**: Age is an important factor in predicting diabetes risk. As individuals get older, their risk of developing diabetes increases. This is partly due to factors such as reduced physical activity, changes in hormone levels, and a higher likelihood of developing other health conditions that can contribute to diabetes.

 - **Gender**: Gender can play a role in diabetes risk, although the effect may vary. For example, women with a history of gestational diabetes (diabetes during pregnancy) have a higher risk of developing type 2 diabetes later in life. Additionally, some studies have suggested that men may have a slightly higher risk of diabetes compared to women.

 - **Body Mass Index (BMI)**: BMI is a measure of body fat based on a person's height and weight. It is commonly used as an indicator of overall weight status and can be helpful in predicting diabetes risk. Higher BMI is associated with a greater likelihood of developing type 2 diabetes. Excess body fat, particularly around the waist, can lead to insulin resistance and impair the body's ability to regulate blood sugar levels.

 - **Hypertension**: Hypertension, or high blood pressure, is a condition that often coexists with diabetes. The two conditions share common risk factors and can contribute to each other's development. Having hypertension increases the risk of developing type 2 diabetes and vice versa. Both conditions can have detrimental effects on cardiovascular health.

 - **Heart Disease**: Heart disease, including conditions such as coronary artery disease and heart failure, is associated with an increased risk of diabetes. The relationship between heart disease and diabetes is bidirectional, meaning that having one condition increases the risk of developing the other. This is because they share many common risk factors, such as obesity, high blood pressure, and high cholesterol.

 - **Smoking History**: Smoking is a modifiable risk factor for diabetes. Cigarette smoking has been found to increase the risk of developing type 2 diabetes. Smoking can contribute to insulin resistance and impair glucose metabolism. Quitting smoking can significantly reduce the risk of developing diabetes and its complications.

 - **HbA1c Level**: HbA1c (glycated hemoglobin) is a measure of the average blood glucose level over the past 2-3 months. It provides information about long-term blood sugar control. Higher HbA1c levels indicate poorer glycemic control and are associated with an increased risk of developing diabetes and its complications.

 - **Blood Glucose Level**: Blood glucose level refers to the amount of glucose (sugar) present in the blood at a given time. Elevated blood glucose levels, particularly in the fasting state or after consuming carbohydrates, can indicate impaired glucose regulation and increase the risk of developing diabetes. Regular monitoring of blood glucose levels is important in the diagnosis and management of diabetes.

 - **Diabetes**: The target variable, indicating whether a person has diabetes (1) or not (0).

In [13]:
# Preview the first five rows of the frame.
diabetes_data.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,4,25.19,6.6,140,0
1,0,54.0,0,0,0,27.32,6.6,80,0
2,1,28.0,0,0,4,27.32,5.7,158,0
3,0,36.0,0,0,1,23.45,5.0,155,0
4,1,76.0,1,1,1,20.14,4.8,155,0


In [14]:
# Print the frame's column dtypes, non-null counts and memory usage.
diabetes_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  int64  
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  int64  
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(6)
memory usage: 6.9 MB


In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
diabetes_data.describe()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,0.41466,41.885856,0.07485,0.03942,2.17965,27.320767,5.527507,138.05806,0.085
std,0.493031,22.51684,0.26315,0.194593,1.889659,6.636783,1.070672,40.708136,0.278883
min,0.0,0.08,0.0,0.0,0.0,10.01,3.5,80.0,0.0
25%,0.0,24.0,0.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,0.0,43.0,0.0,0.0,3.0,27.32,5.8,140.0,0.0
75%,1.0,60.0,0.0,0.0,4.0,29.58,6.2,159.0,0.0
max,2.0,80.0,1.0,1.0,5.0,95.69,9.0,300.0,1.0


In [17]:
# Count the missing values in each column.
diabetes_data.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [21]:
# Encode categorical variables
# Convert 'gender' and 'smoking_history' to integer codes, overwriting those
# columns of `diabetes_data` in place.
# Each column gets its own LabelEncoder, stored in `label_encoders`, so the
# category <-> code mapping of every column stays available (for example via
# `label_encoders['gender'].classes_`) for decoding or for encoding new records.
# A single shared encoder would be refit on the second column and lose the
# mapping learned for 'gender'.
label_encoders = {}
for column in ('gender', 'smoking_history'):
    label_encoder = LabelEncoder()
    diabetes_data[column] = label_encoder.fit_transform(diabetes_data[column])
    label_encoders[column] = label_encoder
# After the loop `label_encoder` refers to the 'smoking_history' encoder.


In [19]:
# Define features and target variable
X = diabetes_data.drop(columns='diabetes')  # Features: every column except the label
y = diabetes_data['diabetes']               # Target: the 'diabetes' column
# Only the last expression of a cell is displayed, so just the target is shown here.
y

0        0
1        0
2        0
3        0
4        0
        ..
99995    0
99996    0
99997    0
99998    0
99999    0
Name: diabetes, Length: 100000, dtype: int64

In [5]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Standardize the feature columns: the scaler is fitted on the training split
# only and then applied unchanged to both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_test

array([[-8.42217184e-01, -1.28824683e+00, -2.86309234e-01, ...,
        -9.83570464e-01,  2.55560889e-01, -2.95858589e-01],
       [-8.42217184e-01, -1.73274126e+00, -2.86309234e-01, ...,
        -9.56361028e-01, -4.91573701e-01,  1.70590592e-01],
       [ 1.18538964e+00,  9.34225330e-01, -2.86309234e-01, ...,
        -3.03334543e-01, -1.89245106e+00,  1.52083822e+00],
       ...,
       [-8.42217184e-01,  3.11933125e-01, -2.86309234e-01, ...,
        -1.23449933e-01, -6.78357348e-01, -9.34157469e-01],
       [ 1.18538964e+00,  1.37871976e+00, -2.86309234e-01, ...,
        -1.00746719e-03,  1.00269548e+00, -9.34157469e-01],
       [-8.42217184e-01,  4.52364651e-02, -2.86309234e-01, ...,
        -5.24033309e-01,  2.55560889e-01,  1.70590592e-01]])

In [7]:
# Fit a random forest classifier on the training split, using a fixed random seed.
model = RandomForestClassifier(random_state=42)
model.fit(X=X_train, y=y_train)

In [22]:
# Score the held-out split: take column 1 of predict_proba as the per-row
# probability, then the hard class labels from predict.
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)
y_pred_proba

array([0.  , 0.  , 0.01, ..., 0.  , 0.02, 0.02])

In [9]:
# Compare the predictions with the true test labels.
# ROC AUC uses the predicted probabilities; the other two metrics use the hard labels.
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
classification_report_text = classification_report(y_true=y_test, y_pred=y_pred)

In [12]:
# Print the three evaluation results, one item per line.
print(
    f"Accuracy: {accuracy*100:.2f}%",
    "Classification Report:",
    classification_report_text,
    f"ROC AUC Score: {roc_auc*100:.2f}%",
    sep="\n",
)

Accuracy: 97.08%
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     18292
           1       0.95      0.69      0.80      1708

    accuracy                           0.97     20000
   macro avg       0.96      0.84      0.89     20000
weighted avg       0.97      0.97      0.97     20000

ROC AUC Score: 96.37%
