In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC 
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, classification_report
from sklearn.model_selection import train_test_split

### Data Exploratory

In [5]:
stroke_df = pd.read_csv('../../cleaned_datasets/normalized_stroke_dataset.csv')
heart_disease_df = pd.read_csv('../../cleaned_datasets/heart-disease-cleaned.csv')
diabete_df = pd.read_csv('../../cleaned_datasets/cleaned_diabetes.csv')
hypertension_df = pd.read_csv('../../cleaned_datasets/hypertension_cleaned.csv')

# Each dateset should comes with a Label (1 || 0):
# - heart_disease: has_heart_disease
# - diabete: has_diabetes
# - stroke: has_stroke 
# - hypertension: has_hypertension

In [6]:
print("Stroke Dataset:")

# Normalized from float to int
stroke_df['sex'] = stroke_df['sex'].apply(lambda x: round(x) if isinstance(x, (int, float)) else x)
stroke_df['blood_glucose'] = stroke_df['blood_glucose'].apply(lambda x: round(x) if isinstance(x, (int, float)) else x)

stroke_df.rename(columns={'hypertension': 'has_hypertension', 'heart_disease': 'has_heart_disease', 'stroke': 'has_stroke'}, inplace=True)

print("Stroke Columns:", stroke_df.columns)
print("Stroke Shape:", stroke_df.shape)

stroke_df.info()
stroke_df.head()

Stroke Dataset:
Stroke Columns: Index(['age', 'has_hypertension', 'has_heart_disease', 'bmi', 'blood_glucose',
       'sex', 'smoking_status', 'has_stroke'],
      dtype='object')
Stroke Shape: (5109, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5109 entries, 0 to 5108
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                5109 non-null   int64  
 1   has_hypertension   5109 non-null   int64  
 2   has_heart_disease  5109 non-null   int64  
 3   bmi                5109 non-null   float64
 4   blood_glucose      5109 non-null   int64  
 5   sex                5109 non-null   int64  
 6   smoking_status     5109 non-null   object 
 7   has_stroke         5109 non-null   int64  
dtypes: float64(1), int64(6), object(1)
memory usage: 319.4+ KB


Unnamed: 0,age,has_hypertension,has_heart_disease,bmi,blood_glucose,sex,smoking_status,has_stroke
0,67,0,1,36.6,229,0,formerly smoked,1
1,61,0,0,28.893237,202,1,never smoked,1
2,80,0,1,32.5,106,0,never smoked,1
3,49,0,0,34.4,171,1,smokes,1
4,79,1,0,24.0,174,1,never smoked,1


In [7]:
print("Heart Disease Dataset:")

# Rename the target to heart_disease
heart_disease_df.rename(columns={'target': 'has_heart_disease'}, inplace=True)

print("Heart Disease Columns:", heart_disease_df.columns)
print("Heart Disease Shape:", heart_disease_df.shape)

heart_disease_df.info()
heart_disease_df.head()

Heart Disease Dataset:
Heart Disease Columns: Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'has_heart_disease'],
      dtype='object')
Heart Disease Shape: (2000, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                2000 non-null   int64  
 1   sex                2000 non-null   int64  
 2   cp                 2000 non-null   int64  
 3   trestbps           2000 non-null   int64  
 4   chol               2000 non-null   int64  
 5   fbs                2000 non-null   int64  
 6   restecg            2000 non-null   int64  
 7   thalach            2000 non-null   int64  
 8   exang              2000 non-null   int64  
 9   oldpeak            2000 non-null   float64
 10  slope              2000 non-null   int64  
 11  ca              

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,has_heart_disease
0,51,0,2,130,256,0,0,149,0,0.5,2,0,2,1
1,61,1,0,138,166,0,0,125,1,3.6,1,1,2,0
2,63,0,0,124,197,0,1,136,1,0.0,1,0,2,0
3,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
4,61,1,2,150,243,1,1,137,1,1.0,1,0,2,1


In [8]:
print("Hypertension Dataset:")

hypertension_df.rename(columns={'target': 'has_hypertension'}, inplace=True)

# Normalized from float to int
hypertension_df['age'] = hypertension_df['age'].apply(lambda x: round(x) if isinstance(x, (int, float)) else x)
hypertension_df['sex'] = hypertension_df['sex'].apply(lambda x: round(x) if isinstance(x, (int, float)) else x)

print("Hypertension Columns:", hypertension_df.columns)
print("Hypertension Shape:", hypertension_df.shape)

hypertension_df.info()
hypertension_df.head()

Hypertension Dataset:
Hypertension Columns: Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'has_hypertension'],
      dtype='object')
Hypertension Shape: (24422, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24422 entries, 0 to 24421
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               24422 non-null  int64  
 1   sex               24422 non-null  int64  
 2   cp                24422 non-null  int64  
 3   trestbps          24422 non-null  int64  
 4   chol              24422 non-null  int64  
 5   fbs               24422 non-null  int64  
 6   restecg           24422 non-null  int64  
 7   thalach           24422 non-null  int64  
 8   exang             24422 non-null  int64  
 9   oldpeak           24422 non-null  float64
 10  slope             24422 non-null  int64  
 11  ca                24422 non-nu

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,has_hypertension
0,57,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,64,0,2,130,250,0,1,187,0,3.5,0,0,2,1
2,52,1,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,0,1,120,236,0,1,178,0,0.8,2,0,2,1
4,66,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [9]:
print("Diabete Dataset:")

# Rename columns to normalized with other datasets
diabete_df.drop(columns='smoking_history', inplace=True)
diabete_df.rename(columns={'blood_glucose_level': 'blood_glucose', 
                           'gender': 'sex', 'hypertension': 'has_hypertension',
                           'heart_disease': 'has_heart_disease', 'diabetes': 'has_diabetes'}, inplace=True)

# Normalized from float to int
diabete_df['age'] = diabete_df['age'].apply(lambda x: round(x) if isinstance(x, (int, float)) else x)

print("Diabete Columns:", diabete_df.columns)
print("Diabete Shape:", diabete_df.shape)

diabete_df.info()
diabete_df.head()

Diabete Dataset:
Diabete Columns: Index(['sex', 'age', 'has_hypertension', 'has_heart_disease', 'bmi',
       'HbA1c_level', 'blood_glucose', 'has_diabetes'],
      dtype='object')
Diabete Shape: (88195, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88195 entries, 0 to 88194
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sex                88195 non-null  int64  
 1   age                88195 non-null  int64  
 2   has_hypertension   88195 non-null  int64  
 3   has_heart_disease  88195 non-null  int64  
 4   bmi                88195 non-null  float64
 5   HbA1c_level        88195 non-null  float64
 6   blood_glucose      88195 non-null  int64  
 7   has_diabetes       88195 non-null  int64  
dtypes: float64(2), int64(6)
memory usage: 5.4 MB


Unnamed: 0,sex,age,has_hypertension,has_heart_disease,bmi,HbA1c_level,blood_glucose,has_diabetes
0,0,80,0,1,25.19,6.6,140,0
1,0,54,0,0,27.32,6.6,80,0
2,1,28,0,0,27.32,5.7,158,0
3,0,36,0,0,23.45,5.0,155,0
4,1,76,1,1,20.14,4.8,155,0


In [10]:
# resampled_diabetes_df = resample(diabete_df, replace=True, n_samples=5109, random_state=42)
# resampled_diabetes_df['is_synthetic'] = [0] * len(diabete_df) + [1] * (5109 - len(diabete_df))

# resampled_diabetes_df = resampled_diabetes_df.reset_index(drop=True)

In [11]:
# original_data = resampled_diabetes_df[resampled_diabetes_df['is_synthetic'] == 0]

# def find_outliers_iqr(df, threshold=1.5):
#     outliers = {}
#     for column in df.select_dtypes(include=[np.number]).columns:
#         Q1 = df[column].quantile(0.25)
#         Q3 = df[column].quantile(0.75)
#         IQR = Q3 - Q1
#         lower_bound = Q1 - threshold * IQR
#         upper_bound = Q3 + threshold * IQR
#         # print(f"{column}: Lower Bound = {lower_bound}, Upper Bound = {upper_bound}")
#         outliers[column] = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
#     return outliers

# original_outliers = find_outliers_iqr(original_data)

# print(original_outliers)

In [12]:
# def impute_outliers(df, threshold=1.5):
#     for column in df.select_dtypes(include=[np.number]).columns:
#         Q1 = df[column].quantile(0.25)
#         Q3 = df[column].quantile(0.75)
#         IQR = Q3 - Q1
#         lower_bound = Q1 - threshold * IQR
#         upper_bound = Q3 + threshold * IQR

#         # Impute outliers with the median
#         median = df[column].median()
#         df[column] = df[column].apply(lambda x: median if x < lower_bound or x > upper_bound else x)
#     return df

# imputed_diabetes_df = impute_outliers(resampled_diabetes_df)

In [13]:
# final_outliers = find_outliers_iqr(imputed_diabetes_df)
# print(final_outliers)

In [14]:
# Plot boxplots for all features to visualize outliers
# plt.figure(figsize=(15, 10))
# for i, column in enumerate(imputed_diabetes_df.columns):
#     plt.subplot(4, 4, i+1)
#     sns.boxplot(y=imputed_diabetes_df[column])
#     plt.title(f'Boxplot of {column}')
# plt.tight_layout()
# plt.show()

### Combining Datasets

Below run only one or the other. Don't run both

##### 1. Merge by Concat Method

In [15]:
# # Apply source to keep track where it from
# stroke_df['source'] = 'stroke'
# heart_disease_df['source'] = 'heart_disease'
# diabete_df['source'] = 'diabete'

# # Merge through concat
# combined_df_concat = pd.concat([stroke_df, heart_disease_df], ignore_index=True)

##### 2. Merge by Pandas Merge Method

In [16]:
# Merge stroke and heart disease datasets
shared_columns_1 = list(set(stroke_df.columns).intersection(heart_disease_df.columns))
print(f"Shared Columns: {shared_columns_1}")

# Merge through the pandas merge function
combined_df_1 = pd.merge(stroke_df, heart_disease_df, on=shared_columns_1, how='inner')

print("Combined Dataset Shape: ", combined_df_1.shape)
combined_df_1.head()

Shared Columns: ['has_heart_disease', 'age', 'sex']
Combined Dataset Shape:  (37223, 19)


Unnamed: 0,age,has_hypertension,has_heart_disease,bmi,blood_glucose,sex,smoking_status,has_stroke,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,67,0,1,36.6,229,0,formerly smoked,1,2,152,277,0,1,172,0,0.0,2,1,2
1,67,0,1,36.6,229,0,formerly smoked,1,2,152,277,0,1,172,0,0.0,2,1,2
2,67,0,1,36.6,229,0,formerly smoked,1,2,152,277,0,1,172,0,0.0,2,1,2
3,67,0,1,36.6,229,0,formerly smoked,1,0,106,223,0,1,142,0,0.3,2,2,2
4,67,0,1,36.6,229,0,formerly smoked,1,2,152,277,0,1,172,0,0.0,2,1,2


In [17]:
# Merge (Stroke and Heart Disease) with Diabetes datasets
shared_columns_2 = list(set(combined_df_1.columns).intersection(diabete_df.columns))
print(f"Shared Columns: {shared_columns_2}")

# Merge through the pandas merge function
combined_df_2 = pd.merge(combined_df_1, diabete_df, on=shared_columns_2, how='outer')

print("Combined Dataset Shape: ", combined_df_2.shape)
combined_df_2.head()

Shared Columns: ['has_heart_disease', 'sex', 'bmi', 'has_hypertension', 'age', 'blood_glucose']
Combined Dataset Shape:  (125417, 21)


Unnamed: 0,age,has_hypertension,has_heart_disease,bmi,blood_glucose,sex,smoking_status,has_stroke,cp,trestbps,...,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,HbA1c_level,has_diabetes
0,67,0,1,36.6,229,0,formerly smoked,1.0,2.0,152.0,...,0.0,1.0,172.0,0.0,0.0,2.0,1.0,2.0,,
1,67,0,1,36.6,229,0,formerly smoked,1.0,2.0,152.0,...,0.0,1.0,172.0,0.0,0.0,2.0,1.0,2.0,,
2,67,0,1,36.6,229,0,formerly smoked,1.0,2.0,152.0,...,0.0,1.0,172.0,0.0,0.0,2.0,1.0,2.0,,
3,67,0,1,36.6,229,0,formerly smoked,1.0,0.0,106.0,...,0.0,1.0,142.0,0.0,0.3,2.0,2.0,2.0,,
4,67,0,1,36.6,229,0,formerly smoked,1.0,2.0,152.0,...,0.0,1.0,172.0,0.0,0.0,2.0,1.0,2.0,,


In [18]:
# Merge (Stroke, Heart Disease, and Diabetes) with Hypertension datasets
shared_columns = list(set(combined_df_2.columns).intersection(hypertension_df.columns))
print(f"Shared Columns: {shared_columns}")

# Merge through the pandas merge function
combined_df = pd.merge(combined_df_2, hypertension_df, on=shared_columns, how='outer')

print("Combined Dataset Shape: ", combined_df.shape)
combined_df.head()

Shared Columns: ['cp', 'oldpeak', 'ca', 'thal', 'sex', 'thalach', 'exang', 'slope', 'fbs', 'has_hypertension', 'restecg', 'trestbps', 'age', 'chol']
Combined Dataset Shape:  (149677, 21)


Unnamed: 0,age,has_hypertension,has_heart_disease,bmi,blood_glucose,sex,smoking_status,has_stroke,cp,trestbps,...,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,HbA1c_level,has_diabetes
0,67,0,1.0,36.6,229.0,0,formerly smoked,1.0,2.0,152.0,...,0.0,1.0,172.0,0.0,0.0,2.0,1.0,2.0,,
1,67,0,1.0,36.6,229.0,0,formerly smoked,1.0,2.0,152.0,...,0.0,1.0,172.0,0.0,0.0,2.0,1.0,2.0,,
2,67,0,1.0,36.6,229.0,0,formerly smoked,1.0,2.0,152.0,...,0.0,1.0,172.0,0.0,0.0,2.0,1.0,2.0,,
3,67,0,1.0,36.6,229.0,0,formerly smoked,1.0,2.0,152.0,...,0.0,1.0,172.0,0.0,0.0,2.0,1.0,2.0,,
4,67,0,1.0,36.6,229.0,0,formerly smoked,1.0,2.0,152.0,...,0.0,1.0,172.0,0.0,0.0,2.0,1.0,2.0,,


### Data Exploratory After Merging

In [19]:
# Check for missing values
print("Missing Values:\n", combined_df.isnull().sum())
print("\nDuplicated Values:\n", combined_df.duplicated().sum())

Missing Values:
 age                       0
has_hypertension          0
has_heart_disease     24260
bmi                   24260
blood_glucose         24260
sex                       0
smoking_status       112454
has_stroke           112454
cp                    88194
trestbps              88194
chol                  88194
fbs                   88194
restecg               88194
thalach               88194
exang                 88194
oldpeak               88194
slope                 88194
ca                    88194
thal                  88194
HbA1c_level           61473
has_diabetes          61473
dtype: int64

Duplicated Values:
 37031


In [20]:
# Drop duplicated values
combined_df.drop_duplicates(inplace=True)
combined_df.duplicated()

0         False
8         False
16        False
24        False
32        False
          ...  
149672    False
149673    False
149674    False
149675    False
149676    False
Length: 112646, dtype: bool

In [21]:
# Verify combined dataset
print("Combined Dataset Shape: ", combined_df.shape)
print("Combined Dataset Columns: ", combined_df.columns)

combined_df.info()
combined_df.describe().T

Combined Dataset Shape:  (112646, 21)
Combined Dataset Columns:  Index(['age', 'has_hypertension', 'has_heart_disease', 'bmi', 'blood_glucose',
       'sex', 'smoking_status', 'has_stroke', 'cp', 'trestbps', 'chol', 'fbs',
       'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
       'HbA1c_level', 'has_diabetes'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Index: 112646 entries, 0 to 149676
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   age                112646 non-null  int64  
 1   has_hypertension   112646 non-null  int64  
 2   has_heart_disease  88386 non-null   float64
 3   bmi                88386 non-null   float64
 4   blood_glucose      88386 non-null   float64
 5   sex                112646 non-null  int64  
 6   smoking_status     5078 non-null    object 
 7   has_stroke         5078 non-null    float64
 8   cp                 29338 non-null   float64

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,112646.0,44.729711,21.835943,0.0,28.0,47.0,61.0,98.0
has_hypertension,112646.0,0.180619,0.384704,0.0,0.0,0.0,0.0,1.0
has_heart_disease,88386.0,0.039769,0.195417,0.0,0.0,0.0,0.0,1.0
bmi,88386.0,26.54912,5.630241,13.71,22.95,27.32,29.41,66.8
blood_glucose,88386.0,133.280339,36.669739,55.0,100.0,140.0,158.0,272.0
sex,112646.0,0.45793,0.49855,0.0,0.0,0.0,1.0,2.0
has_stroke,5078.0,0.054352,0.226733,0.0,0.0,0.0,0.0,1.0
cp,29338.0,0.908174,1.024641,0.0,0.0,0.0,2.0,3.0
trestbps,29338.0,130.075295,15.403324,94.0,120.0,130.0,140.0,170.0
chol,29338.0,242.700866,45.001045,126.0,209.0,240.0,273.0,360.0


In [22]:
combined_df.head()

Unnamed: 0,age,has_hypertension,has_heart_disease,bmi,blood_glucose,sex,smoking_status,has_stroke,cp,trestbps,...,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,HbA1c_level,has_diabetes
0,67,0,1.0,36.6,229.0,0,formerly smoked,1.0,2.0,152.0,...,0.0,1.0,172.0,0.0,0.0,2.0,1.0,2.0,,
8,67,0,1.0,27.6,144.0,0,never smoked,0.0,2.0,152.0,...,0.0,1.0,172.0,0.0,0.0,2.0,1.0,2.0,,
16,67,0,1.0,31.9,96.0,0,Unknown,0.0,2.0,152.0,...,0.0,1.0,172.0,0.0,0.0,2.0,1.0,2.0,,
24,67,0,1.0,28.893237,97.0,0,Unknown,0.0,2.0,152.0,...,0.0,1.0,172.0,0.0,0.0,2.0,1.0,2.0,,
32,67,0,1.0,36.6,229.0,0,formerly smoked,1.0,0.0,106.0,...,0.0,1.0,142.0,0.0,0.3,2.0,2.0,2.0,,


In [23]:
combined_df['smoking_status'] = combined_df['smoking_status'].fillna('Unknown')
print("Combined Dataset Shape: ", combined_df.shape)
combined_df.info()
combined_df.head()

Combined Dataset Shape:  (112646, 21)
<class 'pandas.core.frame.DataFrame'>
Index: 112646 entries, 0 to 149676
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   age                112646 non-null  int64  
 1   has_hypertension   112646 non-null  int64  
 2   has_heart_disease  88386 non-null   float64
 3   bmi                88386 non-null   float64
 4   blood_glucose      88386 non-null   float64
 5   sex                112646 non-null  int64  
 6   smoking_status     112646 non-null  object 
 7   has_stroke         5078 non-null    float64
 8   cp                 29338 non-null   float64
 9   trestbps           29338 non-null   float64
 10  chol               29338 non-null   float64
 11  fbs                29338 non-null   float64
 12  restecg            29338 non-null   float64
 13  thalach            29338 non-null   float64
 14  exang              29338 non-null   float64
 15  oldpeak           

Unnamed: 0,age,has_hypertension,has_heart_disease,bmi,blood_glucose,sex,smoking_status,has_stroke,cp,trestbps,...,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,HbA1c_level,has_diabetes
0,67,0,1.0,36.6,229.0,0,formerly smoked,1.0,2.0,152.0,...,0.0,1.0,172.0,0.0,0.0,2.0,1.0,2.0,,
8,67,0,1.0,27.6,144.0,0,never smoked,0.0,2.0,152.0,...,0.0,1.0,172.0,0.0,0.0,2.0,1.0,2.0,,
16,67,0,1.0,31.9,96.0,0,Unknown,0.0,2.0,152.0,...,0.0,1.0,172.0,0.0,0.0,2.0,1.0,2.0,,
24,67,0,1.0,28.893237,97.0,0,Unknown,0.0,2.0,152.0,...,0.0,1.0,172.0,0.0,0.0,2.0,1.0,2.0,,
32,67,0,1.0,36.6,229.0,0,formerly smoked,1.0,0.0,106.0,...,0.0,1.0,142.0,0.0,0.3,2.0,2.0,2.0,,


### Model Training
- KNN
- SVM
- Naive Bayes
- Random Forest

In [24]:
combined_df_copy = combined_df.copy()

# KNN does not work with missing value 
# Since our goal is to have multiple target variables, we can assume that NaN is 0
target_columns = ['has_heart_disease', 'has_hypertension', 'has_diabetes', 'has_stroke']
combined_df_copy[target_columns] = combined_df[target_columns].fillna(0)

# Encode the smoking status
if 'smoking_status' in combined_df_copy.columns:
    le = LabelEncoder()
    combined_df_copy['smoking_status'] = le.fit_transform(combined_df_copy['smoking_status'])

# Normalized from float to int
combined_df_copy['has_stroke'] = combined_df_copy['has_stroke'].apply(lambda x: round(x) if isinstance(x, (int, float)) else x)
combined_df_copy['has_heart_disease'] = combined_df_copy['has_heart_disease'].apply(lambda x: round(x) if isinstance(x, (int, float)) else x)
combined_df_copy['has_diabetes'] = combined_df_copy['has_diabetes'].apply(lambda x: round(x) if isinstance(x, (int, float)) else x)

# Fill NaN value with mean
combined_df_copy = combined_df_copy.fillna(combined_df_copy.mean())

#### 1. KNN

In [27]:
def train_knn_by_target(target):
    # Check (X) and (y) features  
    X = combined_df_copy.drop(columns=target_columns)
    y = combined_df_copy[target]

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale the features as unscaled features can distort distances in KNN
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)

    # Train KNN model
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train, y_train)

    y_pred = knn.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    print("Target Variable:", target)
    print('KNN model accuracy:', accuracy)
    print('KNN model recall:', recall)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

In [None]:
train_knn_by_target('has_stroke')
train_knn_by_target('has_hypertension')
train_knn_by_target('has_heart_disease')
train_knn_by_target('has_diabetes')

In [35]:
# Import necessary libraries
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Define the K-Fold Cross Validator
kf = KFold(n_splits=5, shuffle=True, random_state=42)

def evaluate_knn_model(X, target_columns):
    for target_variable in target_columns:
        y = combined_df_copy[target_variable]
        # Initialize the KNN model
        knn = KNeighborsClassifier(n_neighbors=5)
        
        # Perform K-Fold Cross Validation for accurancy
        recall_accurancys = cross_val_score(knn, X, y, cv=kf, scoring="accurancy")

        # Perform K-Fold Cross Validation for recall
        recall_scores = cross_val_score(knn, X, y, cv=kf, scoring="recall")
        
        # Output the recall scores and their mean
        print(f"Target Variable: {target_variable}")
        print("K-Fold Cross-validation recall scores: ", recall_scores)
        print("Mean recall: ", np.mean(recall_scores))
        print("-" * 50)

# Prepare features (X)
X = combined_df_copy.drop(columns=target_columns)

# Call the function to evaluate all target variables
evaluate_knn_model(X, target_columns)


Target Variable: has_heart_disease
K-Fold Cross-validation recall scores:  [0.04022989 0.03165468 0.03310345 0.04227405 0.04207574]
Mean recall:  0.03786755967916918
--------------------------------------------------
Target Variable: has_hypertension
K-Fold Cross-validation recall scores:  [0.68078744 0.67480658 0.70259481 0.68253968 0.68612604]
Mean recall:  0.6853709096486265
--------------------------------------------------
Target Variable: has_diabetes
K-Fold Cross-validation recall scores:  [0.3046875  0.32423924 0.31910112 0.3        0.32209302]
Mean recall:  0.3140241782684801
--------------------------------------------------
Target Variable: has_stroke
K-Fold Cross-validation recall scores:  [0.01785714 0.         0.01818182 0.01694915 0.        ]
Mean recall:  0.010597622716266783
--------------------------------------------------


#### 2. SVM

In [24]:
def train_svm_by_target(target):
    # Check (X) and (y) features  
    X = combined_df_copy.drop(columns=target_columns)
    y = combined_df_copy[target]

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # # Find best parameters by using hyperparameter tuning through GridSearchCV
    # svm_parameters = {
    #     'C': [0.1, 1, 10],
    #     'gamma': ['scale', 'auto', 0.01, 0.1],
    #     'kernel': ['rbf', 'linear']
    # }

    # base_svm = SVC()
    # svm_grid = GridSearchCV(base_svm, svm_parameters, scoring='recall', return_train_score=True, cv=5, verbose=1)

    # # Train KNN model
    # svm_grid.fit(X_train, y_train)

    # best_model = svm_grid.best_estimator_
    # best_parameters = svm_grid.best_params_
    # best_recall = svm_grid.best_score_

    # print('The best model was:', best_model)
    # print('The best parameter values were:', best_parameters)
    # print('The best recall was:', best_recall)

    svm = SVC()
    svm.fit(X_train, y_train)

    y_pred = svm.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    print("Target Variable:", target)
    print('SVM model accuracy:', accuracy)
    print('SVM model recall:', recall)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

In [None]:
train_svm_by_target('has_stroke')
train_svm_by_target('has_hypertension')
train_svm_by_target('has_heart_disease')
train_svm_by_target('has_diabetes')

#### 3. Naive Bayes

In [26]:
def train_nb_by_target(target):
    # Check (X) and (y) features  
    X = combined_df_copy.drop(columns=target_columns)
    y = combined_df_copy[target]

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train KNN model
    svm = GaussianNB() 
    svm.fit(X_train, y_train)

    y_pred = svm.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    print("Target Variable:", target)
    print('Naive Bayes model accuracy:', accuracy)
    print('Naive Bayes model recall:', recall)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

In [None]:
train_nb_by_target('has_stroke')
train_nb_by_target('has_hypertension')
train_nb_by_target('has_heart_disease')
train_nb_by_target('has_diabetes')

#### 4. Random Forest

In [28]:
def train_rf_by_target(target):
    # Check (X) and (y) features  
    X = combined_df_copy.drop(columns=target_columns)
    y = combined_df_copy[target]

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train KNN model
    rf = RandomForestClassifier(n_estimators=100, random_state=42) 
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    print("Target Variable:", target)
    print('Random Forest model accuracy:', accuracy)
    print('Random Forest model recall:', recall)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

In [None]:
train_rf_by_target('has_stroke')
train_rf_by_target('has_hypertension')
train_rf_by_target('has_heart_disease')
train_rf_by_target('has_diabetes')