In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Reproducibility: seed NumPy's global random generator once, up front.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Plot appearance. The base style is applied first because it resets rcParams;
# the palette and default figure size are layered on top of it.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (10, 6)

# Confirmation banner for the setup cell.
for message in (
    "Libraries imported successfully!",
    f"Random seed set to: {RANDOM_SEED}",
):
    print(message)

✅ Libraries imported successfully!
📌 Random seed set to: 42


In [22]:
import time

# Wall-clock reference captured immediately before the dataset is read.
time_begin = time.time()

# Read the raw CSV and discard exact duplicate rows in a single chain.
df = pd.read_csv('../data/raw/diabetes_dataset.csv').drop_duplicates()

# Display (rows, columns) of the de-duplicated frame.
df.shape

(100000, 31)

In [42]:
# Preview the first ten records of the loaded frame.
df.head(n=10)

Unnamed: 0,age,gender,ethnicity,education_level,income_level,employment_status,smoking_status,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,...,hdl_cholesterol,ldl_cholesterol,triglycerides,glucose_fasting,glucose_postprandial,insulin_level,hba1c,diabetes_risk_score,diabetes_stage,diagnosed_diabetes
0,58,Male,Asian,Highschool,Lower-Middle,Employed,Never,0,215,5.7,...,41,160,145,136,236,6.36,8.18,29.6,Type 2,1
1,48,Female,White,Highschool,Middle,Employed,Former,1,143,6.7,...,55,50,30,93,150,2.0,5.63,23.0,No Diabetes,0
2,60,Male,Hispanic,Highschool,Middle,Unemployed,Never,1,57,6.4,...,66,99,36,118,195,5.07,7.51,44.7,Type 2,1
3,74,Female,Black,Highschool,Low,Retired,Never,0,49,3.4,...,50,79,140,139,253,5.28,9.03,38.2,Type 2,1
4,46,Male,White,Graduate,Middle,Retired,Never,1,109,7.2,...,52,125,160,137,184,12.74,7.2,23.5,Type 2,1
5,46,Female,White,Highschool,Upper-Middle,Employed,Never,2,124,9.0,...,61,119,179,100,133,8.77,6.03,23.5,Pre-Diabetes,0
6,75,Female,White,Graduate,Upper-Middle,Retired,Never,0,53,9.2,...,46,161,155,101,100,10.14,5.24,36.1,Pre-Diabetes,0
7,62,Male,White,Postgraduate,Middle,Unemployed,Current,1,75,4.1,...,49,159,120,110,189,8.96,7.04,34.2,Type 2,1
8,42,Male,Black,Highschool,Lower-Middle,Employed,Current,1,114,6.7,...,33,132,98,116,172,5.7,6.9,26.7,Type 2,1
9,59,Female,White,Graduate,Middle,Employed,Current,3,86,8.2,...,52,103,104,76,109,4.49,4.99,30.0,No Diabetes,0


```markdown 
# IDENTIFY AND HANDLE NULL VALUES
```

In [31]:
# Boolean mask with True wherever a cell of df is missing.
isnull = df.isna()

# Number of missing values in each column.
null_counts_per_column = isnull.sum()
print(null_counts_per_column)

age                                   0
gender                                0
ethnicity                             0
education_level                       0
income_level                          0
employment_status                     0
smoking_status                        0
alcohol_consumption_per_week          0
physical_activity_minutes_per_week    0
diet_score                            0
sleep_hours_per_day                   0
screen_time_hours_per_day             0
family_history_diabetes               0
hypertension_history                  0
cardiovascular_history                0
bmi                                   0
waist_to_hip_ratio                    0
systolic_bp                           0
diastolic_bp                          0
heart_rate                            0
cholesterol_total                     0
hdl_cholesterol                       0
ldl_cholesterol                       0
triglycerides                         0
glucose_fasting                       0


In [45]:
# Handling Null Values
#
# Reports the null count, prints mean/median for every numeric column, then
# imputes missing values: numeric columns get their column mean, all other
# columns get the literal string "Unknown".
#
# NOTE(review): imputation statistics are computed on the whole frame. If a
# train/test split is performed later, consider fitting them on the training
# portion only to avoid leakage - confirm against the rest of the notebook.

# Count nulls from the current state of df rather than from a mask built in a
# different cell, so the number is right regardless of which cells ran before.
total_null = df.isnull().sum().sum()
print(f"There are currently:        {total_null} null values") # Total null values before handling

# Detect numeric columns by dtype family instead of an exact float64/int64
# match, so e.g. float32/int32 columns are not mistaken for categorical ones
# (which would fill them with the string "Unknown"). Booleans are excluded so
# that true/false flags are not treated as measurements.
numeric_columns = [
    column
    for column in df.columns
    if pd.api.types.is_numeric_dtype(df[column])
    and not pd.api.types.is_bool_dtype(df[column])
]

for column in numeric_columns:
    print(f"{column} - Mean: {round(df[column].mean(),3)}, Median: {df[column].median()}")

for column in df.columns:
    null_count = df[column].isnull().sum()
    if null_count == 0:
        continue # Nothing to impute in this column

    if column in numeric_columns: # Numerical variables
        print(f"{column} contains {null_count} null values.")
        print(f"{column} - Mean: {df[column].mean()}, Median: {df[column].median()}, Std: {df[column].std()}")
        df[column] = df[column].fillna(df[column].mean())
    else:
        df[column] = df[column].fillna("Unknown") # Categorical variables


# Re-count on the imputed frame; the earlier mask predates the fillna calls
# and would always repeat the "before" number.
print(f"There are now currently:    {df.isnull().sum().sum()} null values")

There are currently:        0 null values
age - Mean: 50.12, Median: 50.0
alcohol_consumption_per_week - Mean: 2.0, Median: 2.0
physical_activity_minutes_per_week - Mean: 118.91, Median: 100.0
diet_score - Mean: 5.99, Median: 6.0
sleep_hours_per_day - Mean: 7.0, Median: 7.0
screen_time_hours_per_day - Mean: 6.0, Median: 6.0
family_history_diabetes - Mean: 0.22, Median: 0.0
hypertension_history - Mean: 0.25, Median: 0.0
cardiovascular_history - Mean: 0.08, Median: 0.0
bmi - Mean: 25.61, Median: 25.6
waist_to_hip_ratio - Mean: 0.86, Median: 0.86
systolic_bp - Mean: 115.8, Median: 116.0
diastolic_bp - Mean: 75.23, Median: 75.0
heart_rate - Mean: 69.63, Median: 70.0
cholesterol_total - Mean: 185.98, Median: 186.0
hdl_cholesterol - Mean: 54.04, Median: 54.0
ldl_cholesterol - Mean: 103.0, Median: 102.0
triglycerides - Mean: 121.46, Median: 121.0
glucose_fasting - Mean: 111.12, Median: 111.0
glucose_postprandial - Mean: 160.04, Median: 160.0
insulin_level - Mean: 9.06, Median: 8.79
hba1c - Me