In [78]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# %pip install imblearn
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from collections import defaultdict
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC 
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, classification_report
from sklearn.model_selection import train_test_split

### Load the datasets

In [3]:
stroke_df = pd.read_csv('../../cleaned_datasets/normalized_stroke_dataset.csv')
heart_disease_df = pd.read_csv('../../cleaned_datasets/heart-disease-cleaned.csv')
diabete_df = pd.read_csv('../../cleaned_datasets/cleaned_diabetes.csv')
hypertension_df = pd.read_csv('../../cleaned_datasets/hypertension_cleaned.csv')

In [4]:
print("Stroke Dataset:")

# Normalized from float to int
stroke_df['sex'] = stroke_df['sex'].apply(lambda x: round(x) if isinstance(x, (int, float)) else x)
stroke_df['blood_glucose'] = stroke_df['blood_glucose'].apply(lambda x: round(x) if isinstance(x, (int, float)) else x)

stroke_df.rename(columns={'hypertension': 'has_hypertension', 'heart_disease': 'has_heart_disease', 'stroke': 'has_stroke'}, inplace=True)

print("Stroke Columns:", stroke_df.columns)
print("Stroke Shape:", stroke_df.shape)

stroke_df.info()
stroke_df.head()

Stroke Dataset:
Stroke Columns: Index(['age', 'has_hypertension', 'has_heart_disease', 'bmi', 'blood_glucose',
       'sex', 'smoking_status', 'has_stroke'],
      dtype='object')
Stroke Shape: (5109, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5109 entries, 0 to 5108
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                5109 non-null   int64  
 1   has_hypertension   5109 non-null   int64  
 2   has_heart_disease  5109 non-null   int64  
 3   bmi                5109 non-null   float64
 4   blood_glucose      5109 non-null   int64  
 5   sex                5109 non-null   int64  
 6   smoking_status     5109 non-null   object 
 7   has_stroke         5109 non-null   int64  
dtypes: float64(1), int64(6), object(1)
memory usage: 319.4+ KB


Unnamed: 0,age,has_hypertension,has_heart_disease,bmi,blood_glucose,sex,smoking_status,has_stroke
0,67,0,1,36.6,229,0,formerly smoked,1
1,61,0,0,28.893237,202,1,never smoked,1
2,80,0,1,32.5,106,0,never smoked,1
3,49,0,0,34.4,171,1,smokes,1
4,79,1,0,24.0,174,1,never smoked,1


In [5]:
print("Heart Disease Dataset:")

# Rename the target to heart_disease
heart_disease_df.rename(columns={'target': 'has_heart_disease'}, inplace=True)

print("Heart Disease Columns:", heart_disease_df.columns)
print("Heart Disease Shape:", heart_disease_df.shape)

heart_disease_df.info()
heart_disease_df.head()

Heart Disease Dataset:
Heart Disease Columns: Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'has_heart_disease'],
      dtype='object')
Heart Disease Shape: (2000, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                2000 non-null   int64  
 1   sex                2000 non-null   int64  
 2   cp                 2000 non-null   int64  
 3   trestbps           2000 non-null   int64  
 4   chol               2000 non-null   int64  
 5   fbs                2000 non-null   int64  
 6   restecg            2000 non-null   int64  
 7   thalach            2000 non-null   int64  
 8   exang              2000 non-null   int64  
 9   oldpeak            2000 non-null   float64
 10  slope              2000 non-null   int64  
 11  ca              

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,has_heart_disease
0,51,0,2,130,256,0,0,149,0,0.5,2,0,2,1
1,61,1,0,138,166,0,0,125,1,3.6,1,1,2,0
2,63,0,0,124,197,0,1,136,1,0.0,1,0,2,0
3,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
4,61,1,2,150,243,1,1,137,1,1.0,1,0,2,1


In [6]:
print("Hypertension Dataset:")

hypertension_df.rename(columns={'target': 'has_hypertension'}, inplace=True)

# Normalized from float to int
hypertension_df['age'] = hypertension_df['age'].apply(lambda x: round(x) if isinstance(x, (int, float)) else x)
hypertension_df['sex'] = hypertension_df['sex'].apply(lambda x: round(x) if isinstance(x, (int, float)) else x)

print("Hypertension Columns:", hypertension_df.columns)
print("Hypertension Shape:", hypertension_df.shape)

hypertension_df.info()
hypertension_df.head()

Hypertension Dataset:
Hypertension Columns: Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'has_hypertension'],
      dtype='object')
Hypertension Shape: (24422, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24422 entries, 0 to 24421
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               24422 non-null  int64  
 1   sex               24422 non-null  int64  
 2   cp                24422 non-null  int64  
 3   trestbps          24422 non-null  int64  
 4   chol              24422 non-null  int64  
 5   fbs               24422 non-null  int64  
 6   restecg           24422 non-null  int64  
 7   thalach           24422 non-null  int64  
 8   exang             24422 non-null  int64  
 9   oldpeak           24422 non-null  float64
 10  slope             24422 non-null  int64  
 11  ca                24422 non-nu

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,has_hypertension
0,57,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,64,0,2,130,250,0,1,187,0,3.5,0,0,2,1
2,52,1,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,0,1,120,236,0,1,178,0,0.8,2,0,2,1
4,66,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [7]:
print("Diabete Dataset:")

# Rename columns to normalized with other datasets
diabete_df.rename(columns={'blood_glucose_level': 'blood_glucose', 
                           'gender': 'sex', 'hypertension': 'has_hypertension',
                           'heart_disease': 'has_heart_disease', 'diabetes': 'has_diabetes'}, inplace=True)

# Normalized from float to int
diabete_df.drop(columns='smoking_history', inplace=True)
diabete_df['age'] = diabete_df['age'].apply(lambda x: round(x) if isinstance(x, (int, float)) else x)

print("Diabete Columns:", diabete_df.columns)
print("Diabete Shape:", diabete_df.shape)

diabete_df.info()
diabete_df.head()

Diabete Dataset:
Diabete Columns: Index(['sex', 'age', 'has_hypertension', 'has_heart_disease', 'bmi',
       'HbA1c_level', 'blood_glucose', 'has_diabetes'],
      dtype='object')
Diabete Shape: (88195, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88195 entries, 0 to 88194
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sex                88195 non-null  int64  
 1   age                88195 non-null  int64  
 2   has_hypertension   88195 non-null  int64  
 3   has_heart_disease  88195 non-null  int64  
 4   bmi                88195 non-null  float64
 5   HbA1c_level        88195 non-null  float64
 6   blood_glucose      88195 non-null  int64  
 7   has_diabetes       88195 non-null  int64  
dtypes: float64(2), int64(6)
memory usage: 5.4 MB


Unnamed: 0,sex,age,has_hypertension,has_heart_disease,bmi,HbA1c_level,blood_glucose,has_diabetes
0,0,80,0,1,25.19,6.6,140,0
1,0,54,0,0,27.32,6.6,80,0
2,1,28,0,0,27.32,5.7,158,0
3,0,36,0,0,23.45,5.0,155,0
4,1,76,1,1,20.14,4.8,155,0


#### Check for duplicates

In [8]:
print(heart_disease_df.duplicated().sum())
print(stroke_df.duplicated().sum())
print(hypertension_df.duplicated().sum())
print(diabete_df.duplicated().sum())

1717
1
0
4886


In [9]:
#drop duplicates
heart_disease_df.drop_duplicates(inplace=True)
stroke_df.drop_duplicates(inplace=True)
diabete_df.drop_duplicates(inplace=True)

print(heart_disease_df.duplicated().sum())
print(stroke_df.duplicated().sum())
print(hypertension_df.duplicated().sum())
print(diabete_df.duplicated().sum())

0
0
0
0


#### Checking data imbalance for target variables

In [10]:
print(stroke_df['has_stroke'].value_counts())
print(heart_disease_df['has_heart_disease'].value_counts())
print(hypertension_df['has_hypertension'].value_counts())
print(diabete_df['has_diabetes'].value_counts())


has_stroke
0    4859
1     249
Name: count, dtype: int64
has_heart_disease
1    158
0    125
Name: count, dtype: int64
has_hypertension
1    13762
0    10660
Name: count, dtype: int64
has_diabetes
0    78720
1     4589
Name: count, dtype: int64


#### Upsampling

In [93]:
def apply_oversampling(df, target_size, method='random'):
    """
    Apply oversampling to match target size using specified method
    """
    # Separate features and target
    # Separate features and target
    X = df.drop(columns=['has_heart_disease'])  # Drop the target column
    y = df['has_heart_disease']  # Use the target column directly
    
    # Calculate sampling strategy to reach exact target size
    n_samples = defaultdict(int)
    unique_labels = np.unique(y)
    base_size = target_size // len(unique_labels)
    remainder = target_size % len(unique_labels)
    
    for label in unique_labels:
        n_samples[label] = base_size + (1 if remainder > 0 else 0)
        remainder -= 1
    
    try:
        # Apply specified oversampling method
        if method == 'random':
            oversample = RandomOverSampler(sampling_strategy=n_samples, random_state=42)
            X_resampled, y_resampled = oversample.fit_resample(X, y)
            
        elif method == 'smote':
            oversample = SMOTE(sampling_strategy=n_samples, random_state=42, k_neighbors=min(5, len(df)-1))
            X_resampled, y_resampled = oversample.fit_resample(X, y)
            
        elif method == 'adasyn':
            # ADASYN doesn't support exact sample size, so we'll oversample and then randomly select
            oversample = ADASYN(random_state=42, n_neighbors=min(5, len(df)-1))
            X_resampled, y_resampled = oversample.fit_resample(X, y)
            
            # If we got more samples than needed, randomly select the target size
            if len(X_resampled) > target_size:
                indices = np.random.choice(len(X_resampled), target_size, replace=False)
                X_resampled = X_resampled.iloc[indices]
                y_resampled = y_resampled.iloc[indices]
            
        # Combine features and target back into a dataframe
        resampled_df = pd.concat([X_resampled, y_resampled], axis=1)
        return resampled_df
        
    except ValueError as e:
        print(f"Error with {method} for dataset: {e}")
        return None

heart_disease_resampled_df = apply_oversampling(heart_disease_df, 5108, method='smote')


In [94]:
print(heart_disease_resampled_df.duplicated().sum())
print(heart_disease_resampled_df.shape)
heart_disease_resampled_df['has_heart_disease'].value_counts()

110
(5108, 14)


has_heart_disease
1    2554
0    2554
Name: count, dtype: int64

In [95]:
heart_disease_resampled_df.drop_duplicates(inplace=True)
print(heart_disease_resampled_df.duplicated().sum())

0


In [96]:
heart_disease_resampled_df.shape

(4998, 14)

In [99]:
heart_disease_resampled_2_df = apply_oversampling(heart_disease_resampled_df, 5108, method='adasyn')
print(heart_disease_resampled_2_df.duplicated().sum())
print(heart_disease_resampled_2_df.shape)

0
(5015, 14)


In [108]:
heart_disease_resampled_3_df = apply_oversampling(heart_disease_resampled_2_df, 5108, method='smote')
print(heart_disease_resampled_3_df.duplicated().sum())
print(heart_disease_resampled_3_df.shape)

5
(5108, 14)


In [112]:
heart_disease_resampled_3_df.drop_duplicates(inplace=True)
print(heart_disease_resampled_3_df.duplicated().sum())
print(heart_disease_resampled_3_df.shape)

0
(5103, 14)


In [113]:
heart_disease_resampled_4_df = apply_oversampling(heart_disease_resampled_3_df, 5108, method='smote')
print(heart_disease_resampled_4_df.duplicated().sum())
print(heart_disease_resampled_4_df.shape)

0
(5108, 14)


## Downsampling Big Datasets

##### With the target variables data being imbalanced for all datasets, stratified sampling will be used so that the data can be downsampled proportionally.

In [114]:
# Define function to implement stratified sampling
def stratified_downsample(df, target_column, target_size):
    stratified_sample, _ = train_test_split(
        df,
        train_size=target_size,
        stratify=df[target_column],  # Stratification based on the target column
        random_state=42
    )
    return stratified_sample.reset_index(drop=True)

In [115]:
# Apply function to downsample datasets
hypertension_resampled_df = stratified_downsample(hypertension_df, 'has_hypertension', 5108)
diabete_resampled_df = stratified_downsample(diabete_df, 'has_diabetes', 5108)

stroke_resampled_df = stroke_df.copy()



In [116]:
print("Original Hypertension Dataset:")
print(hypertension_df['has_hypertension'].value_counts())

print("Downsampled Hypertension Dataset:")
print(hypertension_resampled_df['has_hypertension'].value_counts())

hypertension_resampled_df.shape


Original Hypertension Dataset:
has_hypertension
1    13762
0    10660
Name: count, dtype: int64
Downsampled Hypertension Dataset:
has_hypertension
1    2878
0    2230
Name: count, dtype: int64


(5108, 14)

In [117]:
print("Original Diabete Dataset:")
print(diabete_df['has_diabetes'].value_counts())

print("Downsampled Diabete Dataset:")
print(diabete_resampled_df['has_diabetes'].value_counts())

diabete_resampled_df.shape


Original Diabete Dataset:
has_diabetes
0    78720
1     4589
Name: count, dtype: int64
Downsampled Diabete Dataset:
has_diabetes
0    4827
1     281
Name: count, dtype: int64


(5108, 8)

In [118]:
print("Original Stroke Dataset:")
print(stroke_df['has_stroke'].value_counts())

print("Downsampled Stroke Dataset:")
print(stroke_resampled_df['has_stroke'].value_counts())

stroke_resampled_df.shape

Original Stroke Dataset:
has_stroke
0    4859
1     249
Name: count, dtype: int64
Downsampled Stroke Dataset:
has_stroke
0    4859
1     249
Name: count, dtype: int64


(5108, 8)

In [120]:
print("Original Heart Disease Dataset:")
print(heart_disease_df['has_heart_disease'].value_counts())

print("Oversampled Heart Disease Dataset:")
print(heart_disease_resampled_4_df['has_heart_disease'].value_counts())

heart_disease_resampled_4_df.shape

Original Heart Disease Dataset:
has_heart_disease
1    158
0    125
Name: count, dtype: int64
Oversampled Heart Disease Dataset:
has_heart_disease
1    2554
0    2554
Name: count, dtype: int64


(5108, 14)

## Combining Datasets

In [121]:
# Merge stroke and heart disease datasets
shared_resampled_columns_1 = list(set(stroke_resampled_df.columns).intersection(heart_disease_resampled_4_df.columns))
print(f"Shared Columns: {shared_resampled_columns_1}")

# Merge through the pandas merge function
combined_resampled_df_1 = pd.merge(stroke_resampled_df, heart_disease_resampled_4_df, on=shared_resampled_columns_1, how='inner')

print("Combined Dataset Shape: ", combined_resampled_df_1.shape)
combined_resampled_df_1.head(10)

Shared Columns: ['age', 'sex', 'has_heart_disease']
Combined Dataset Shape:  (102238, 19)


Unnamed: 0,age,has_hypertension,has_heart_disease,bmi,blood_glucose,sex,smoking_status,has_stroke,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,67,0,1,36.6,229,0,formerly smoked,1,2,152,277,0,1,172,0,0.0,2,1,2
1,67,0,1,36.6,229,0,formerly smoked,1,0,106,223,0,1,142,0,0.3,2,2,2
2,67,0,1,36.6,229,0,formerly smoked,1,1,155,300,0,0,159,0,0.615262,1,1,2
3,67,0,1,36.6,229,0,formerly smoked,1,1,155,259,0,0,145,0,0.467077,2,0,2
4,67,0,1,36.6,229,0,formerly smoked,1,1,114,264,0,0,132,0,0.085405,1,0,2
5,67,0,1,36.6,229,0,formerly smoked,1,2,138,237,0,1,150,0,1.8,1,2,2
6,67,0,1,36.6,229,0,formerly smoked,1,0,114,152,0,1,127,0,1.419871,1,0,2
7,67,0,1,36.6,229,0,formerly smoked,1,0,125,265,0,0,110,1,0.2,1,1,2
8,67,0,1,36.6,229,0,formerly smoked,1,1,155,300,0,0,159,0,0.605412,1,1,2
9,67,0,1,36.6,229,0,formerly smoked,1,1,157,297,0,0,162,0,0.544239,2,1,2


In [123]:
# Merge (Stroke and Heart Disease) with Diabetes datasets
shared_resampled_columns_2 = list(set(combined_resampled_df_1.columns).intersection(diabete_resampled_df.columns))
print(f"Shared Columns: {shared_resampled_columns_2}")

# Merge through the pandas merge function
combined_resampled_df_2 = pd.merge(combined_resampled_df_1, diabete_resampled_df, on=shared_resampled_columns_2, how='outer')

print("Combined Dataset Shape: ", combined_resampled_df_2.shape)
combined_resampled_df_2.head()

Shared Columns: ['age', 'has_heart_disease', 'has_hypertension', 'sex', 'blood_glucose', 'bmi']
Combined Dataset Shape:  (107346, 21)


Unnamed: 0,age,has_hypertension,has_heart_disease,bmi,blood_glucose,sex,smoking_status,has_stroke,cp,trestbps,...,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,HbA1c_level,has_diabetes
0,0,0,0,27.32,90,0,,,,,...,,,,,,,,,5.0,0.0
1,0,0,0,14.64,130,0,,,,,...,,,,,,,,,4.5,0.0
2,0,0,0,14.92,145,0,,,,,...,,,,,,,,,5.7,0.0
3,0,0,0,14.26,160,0,,,,,...,,,,,,,,,6.5,0.0
4,0,0,0,14.55,200,0,,,,,...,,,,,,,,,6.6,0.0


In [124]:
# Merge (Stroke, Heart Disease, and Diabetes) with Hypertension datasets
shared_resampled_columns = list(set(combined_resampled_df_2.columns).intersection(hypertension_resampled_df.columns))
print(f"Shared Columns: {shared_resampled_columns }")

# Merge through the pandas merge function
combined_resampled_df = pd.merge(combined_resampled_df_2, hypertension_resampled_df, on=shared_resampled_columns , how='outer')

print("Combined Dataset Shape: ", combined_resampled_df.shape)
combined_resampled_df.head()

Shared Columns: ['slope', 'trestbps', 'chol', 'oldpeak', 'exang', 'cp', 'thal', 'age', 'fbs', 'restecg', 'sex', 'thalach', 'has_hypertension', 'ca']
Combined Dataset Shape:  (112411, 21)


Unnamed: 0,age,has_hypertension,has_heart_disease,bmi,blood_glucose,sex,smoking_status,has_stroke,cp,trestbps,...,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,HbA1c_level,has_diabetes
0,51,0,0.0,29.5,67.0,1,never smoked,0.0,0.0,106.0,...,0.0,1.0,163.0,0.0,0.665998,0.0,0.0,3.0,,
1,51,0,0.0,33.1,67.0,1,formerly smoked,0.0,0.0,106.0,...,0.0,1.0,163.0,0.0,0.665998,0.0,0.0,3.0,,
2,51,0,0.0,29.4,68.0,1,smokes,0.0,0.0,106.0,...,0.0,1.0,163.0,0.0,0.665998,0.0,0.0,3.0,,
3,51,0,0.0,33.5,76.0,1,formerly smoked,0.0,0.0,106.0,...,0.0,1.0,163.0,0.0,0.665998,0.0,0.0,3.0,,
4,51,0,0.0,34.1,81.0,1,smokes,0.0,0.0,106.0,...,0.0,1.0,163.0,0.0,0.665998,0.0,0.0,3.0,,


In [125]:
# Check for missing values
print("Missing Values:\n", combined_resampled_df.isnull().sum())
print("\nDuplicated Values:\n", combined_resampled_df.duplicated().sum())

Missing Values:
 age                       0
has_hypertension          0
has_heart_disease      5065
bmi                    5065
blood_glucose          5065
sex                       0
smoking_status        10173
has_stroke            10173
cp                     5108
trestbps               5108
chol                   5108
fbs                    5108
restecg                5108
thalach                5108
exang                  5108
oldpeak                5108
slope                  5108
ca                     5108
thal                   5108
HbA1c_level          107303
has_diabetes         107303
dtype: int64

Duplicated Values:
 0


In [126]:
# Verify combined dataset
print("Combined Dataset Shape: ", combined_resampled_df.shape)
print("Combined Dataset Columns: ", combined_resampled_df.columns)

combined_resampled_df.info()
combined_resampled_df.describe().T
combined_resampled_df.head()

Combined Dataset Shape:  (112411, 21)
Combined Dataset Columns:  Index(['age', 'has_hypertension', 'has_heart_disease', 'bmi', 'blood_glucose',
       'sex', 'smoking_status', 'has_stroke', 'cp', 'trestbps', 'chol', 'fbs',
       'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
       'HbA1c_level', 'has_diabetes'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112411 entries, 0 to 112410
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   age                112411 non-null  int64  
 1   has_hypertension   112411 non-null  int64  
 2   has_heart_disease  107346 non-null  float64
 3   bmi                107346 non-null  float64
 4   blood_glucose      107346 non-null  float64
 5   sex                112411 non-null  int64  
 6   smoking_status     102238 non-null  object 
 7   has_stroke         102238 non-null  float64
 8   cp                 107303 non-null  fl

Unnamed: 0,age,has_hypertension,has_heart_disease,bmi,blood_glucose,sex,smoking_status,has_stroke,cp,trestbps,...,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,HbA1c_level,has_diabetes
0,51,0,0.0,29.5,67.0,1,never smoked,0.0,0.0,106.0,...,0.0,1.0,163.0,0.0,0.665998,0.0,0.0,3.0,,
1,51,0,0.0,33.1,67.0,1,formerly smoked,0.0,0.0,106.0,...,0.0,1.0,163.0,0.0,0.665998,0.0,0.0,3.0,,
2,51,0,0.0,29.4,68.0,1,smokes,0.0,0.0,106.0,...,0.0,1.0,163.0,0.0,0.665998,0.0,0.0,3.0,,
3,51,0,0.0,33.5,76.0,1,formerly smoked,0.0,0.0,106.0,...,0.0,1.0,163.0,0.0,0.665998,0.0,0.0,3.0,,
4,51,0,0.0,34.1,81.0,1,smokes,0.0,0.0,106.0,...,0.0,1.0,163.0,0.0,0.665998,0.0,0.0,3.0,,


In [127]:
combined_resampled_df['smoking_status'] = combined_resampled_df['smoking_status'].fillna('Unknown')
print("Combined Dataset Shape: ", combined_resampled_df.shape)
combined_resampled_df.info()
combined_resampled_df.head()

Combined Dataset Shape:  (112411, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112411 entries, 0 to 112410
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   age                112411 non-null  int64  
 1   has_hypertension   112411 non-null  int64  
 2   has_heart_disease  107346 non-null  float64
 3   bmi                107346 non-null  float64
 4   blood_glucose      107346 non-null  float64
 5   sex                112411 non-null  int64  
 6   smoking_status     112411 non-null  object 
 7   has_stroke         102238 non-null  float64
 8   cp                 107303 non-null  float64
 9   trestbps           107303 non-null  float64
 10  chol               107303 non-null  float64
 11  fbs                107303 non-null  float64
 12  restecg            107303 non-null  float64
 13  thalach            107303 non-null  float64
 14  exang              107303 non-null  float64
 15  oldpeak      

Unnamed: 0,age,has_hypertension,has_heart_disease,bmi,blood_glucose,sex,smoking_status,has_stroke,cp,trestbps,...,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,HbA1c_level,has_diabetes
0,51,0,0.0,29.5,67.0,1,never smoked,0.0,0.0,106.0,...,0.0,1.0,163.0,0.0,0.665998,0.0,0.0,3.0,,
1,51,0,0.0,33.1,67.0,1,formerly smoked,0.0,0.0,106.0,...,0.0,1.0,163.0,0.0,0.665998,0.0,0.0,3.0,,
2,51,0,0.0,29.4,68.0,1,smokes,0.0,0.0,106.0,...,0.0,1.0,163.0,0.0,0.665998,0.0,0.0,3.0,,
3,51,0,0.0,33.5,76.0,1,formerly smoked,0.0,0.0,106.0,...,0.0,1.0,163.0,0.0,0.665998,0.0,0.0,3.0,,
4,51,0,0.0,34.1,81.0,1,smokes,0.0,0.0,106.0,...,0.0,1.0,163.0,0.0,0.665998,0.0,0.0,3.0,,


### Training

In [128]:
combined_resampled_df_copy = combined_resampled_df.copy()

# KNN does not work with missing values
# Since our goal is to have multiple target variables, we can assume that NaN is 0
target_columns = ['has_heart_disease', 'has_hypertension', 'has_diabetes', 'has_stroke']
combined_resampled_df_copy[target_columns] = combined_resampled_df[target_columns].fillna(0)

# Encode the smoking status
if 'smoking_status' in combined_resampled_df_copy.columns:
    le = LabelEncoder()
    combined_resampled_df_copy['smoking_status'] = le.fit_transform(combined_resampled_df_copy['smoking_status'])

# Normalized from float to int
combined_resampled_df_copy['has_stroke'] = combined_resampled_df_copy['has_stroke'].apply(lambda x: round(x) if isinstance(x, (int, float)) else x)
combined_resampled_df_copy['has_heart_disease'] = combined_resampled_df_copy['has_heart_disease'].apply(lambda x: round(x) if isinstance(x, (int, float)) else x)
combined_resampled_df_copy['has_diabetes'] = combined_resampled_df_copy['has_diabetes'].apply(lambda x: round(x) if isinstance(x, (int, float)) else x)

# Fill NaN values with the column mean
combined_resampled_df_copy = combined_resampled_df_copy.fillna(combined_resampled_df_copy.mean())


#### 1. KNN

In [129]:
def train_knn_by_target(target):
    # Check (X) and (y) features  
    X = combined_resampled_df_copy.drop(columns=target_columns)
    y = combined_resampled_df_copy[target]

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale the features as unscaled features can distort distances in KNN
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)  # Use transform (not fit_transform) to avoid data leakage

    # Train KNN model
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train, y_train)

    y_pred = knn.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    print("Target Variable:", target)
    print('KNN model accuracy:', accuracy)
    print('KNN model recall:', recall)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))


In [130]:
train_knn_by_target('has_stroke')
train_knn_by_target('has_hypertension')
train_knn_by_target('has_heart_disease')
train_knn_by_target('has_diabetes')

Target Variable: has_stroke
KNN model accuracy: 0.9545879108659876
KNN model recall: 0.23202301054650049
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     21440
           1       0.52      0.23      0.32      1043

    accuracy                           0.95     22483
   macro avg       0.74      0.61      0.65     22483
weighted avg       0.94      0.95      0.95     22483

Target Variable: has_hypertension
KNN model accuracy: 0.8889383089445358
KNN model recall: 0.4410763996288277
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.96      0.94     19250
           1       0.67      0.44      0.53      3233

    accuracy                           0.89     22483
   macro avg       0.79      0.70      0.74     22483
weighted avg       0.88      0.89      0.88     22483

Target Variable: has_heart_disease
KNN model accuracy: 0.9931503802873282
KNN model r

### SVM

In [131]:
def train_svm_by_target(target):
    # Check (X) and (y) features  
    X = combined_resampled_df_copy.drop(columns=target_columns)
    y = combined_resampled_df_copy[target]

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # # Find best parameters by using hyperparameter tuning through GridSearchCV
    # svm_parameters = {
    #     'C': [0.1, 1, 10],
    #     'gamma': ['scale', 'auto', 0.01, 0.1],
    #     'kernel': ['rbf', 'linear']
    # }

    # base_svm = SVC()
    # svm_grid = GridSearchCV(base_svm, svm_parameters, scoring='recall', return_train_score=True, cv=5, verbose=1)

    # # Train KNN model
    # svm_grid.fit(X_train, y_train)

    # best_model = svm_grid.best_estimator_
    # best_parameters = svm_grid.best_params_
    # best_recall = svm_grid.best_score_

    # print('The best model was:', best_model)
    # print('The best parameter values were:', best_parameters)
    # print('The best recall was:', best_recall)

    svm = SVC()
    svm.fit(X_train, y_train)

    y_pred = svm.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    print("Target Variable:", target)
    print('SVM model accuracy:', accuracy)
    print('SVM model recall:', recall)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

In [132]:
train_svm_by_target('has_stroke')
train_svm_by_target('has_hypertension')
train_svm_by_target('has_heart_disease')
train_svm_by_target('has_diabetes')

Target Variable: has_stroke
SVM model accuracy: 0.9536093937641774
SVM model recall: 0.0
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     21440
           1       0.00      0.00      0.00      1043

    accuracy                           0.95     22483
   macro avg       0.48      0.50      0.49     22483
weighted avg       0.91      0.95      0.93     22483



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Target Variable: has_hypertension
SVM model accuracy: 0.8562024640839746
SVM model recall: 0.0
Classification Report:
              precision    recall  f1-score   support

           0       0.86      1.00      0.92     19250
           1       0.00      0.00      0.00      3233

    accuracy                           0.86     22483
   macro avg       0.43      0.50      0.46     22483
weighted avg       0.73      0.86      0.79     22483



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Target Variable: has_heart_disease
SVM model accuracy: 0.95307565716319
SVM model recall: 0.0
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     21428
           1       0.00      0.00      0.00      1055

    accuracy                           0.95     22483
   macro avg       0.48      0.50      0.49     22483
weighted avg       0.91      0.95      0.93     22483



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Target Variable: has_diabetes
SVM model accuracy: 0.9977760974958858
SVM model recall: 0.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     22433
           1       0.00      0.00      0.00        50

    accuracy                           1.00     22483
   macro avg       0.50      0.50      0.50     22483
weighted avg       1.00      1.00      1.00     22483



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Naive Bayes

In [133]:
def train_nb_by_target(target):
    # Check (X) and (y) features  
    X = combined_resampled_df_copy.drop(columns=target_columns)
    y = combined_resampled_df_copy[target]

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train KNN model
    svm = GaussianNB() 
    svm.fit(X_train, y_train)

    y_pred = svm.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    print("Target Variable:", target)
    print('Naive Bayes model accuracy:', accuracy)
    print('Naive Bayes model recall:', recall)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

In [134]:
train_nb_by_target('has_stroke')
train_nb_by_target('has_hypertension')
train_nb_by_target('has_heart_disease')
train_nb_by_target('has_diabetes')

Target Variable: has_stroke
Naive Bayes model accuracy: 0.2049103767290842
Naive Bayes model recall: 0.9731543624161074
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.17      0.29     21440
           1       0.05      0.97      0.10      1043

    accuracy                           0.20     22483
   macro avg       0.52      0.57      0.19     22483
weighted avg       0.95      0.20      0.28     22483

Target Variable: has_hypertension
Naive Bayes model accuracy: 0.8061201796913223
Naive Bayes model recall: 0.24497370862975565
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.90      0.89     19250
           1       0.29      0.24      0.27      3233

    accuracy                           0.81     22483
   macro avg       0.58      0.57      0.58     22483
weighted avg       0.79      0.81      0.80     22483

Target Variable: has_heart_disease
Naive Bayes model 

#### 4. Random Forest

In [135]:
def train_rf_by_target(target):
    # Check (X) and (y) features  
    X = combined_resampled_df_copy.drop(columns=target_columns)
    y = combined_resampled_df_copy[target]

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train KNN model
    rf = RandomForestClassifier(n_estimators=100, random_state=42) 
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    print("Target Variable:", target)
    print('Random Forest model accuracy:', accuracy)
    print('Random Forest model recall:', recall)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

In [136]:
train_rf_by_target('has_stroke')
train_rf_by_target('has_hypertension')
train_rf_by_target('has_heart_disease')
train_rf_by_target('has_diabetes')

Target Variable: has_stroke
Random Forest model accuracy: 0.9971089267446516
Random Forest model recall: 0.9376797698945349
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     21440
           1       1.00      0.94      0.97      1043

    accuracy                           1.00     22483
   macro avg       1.00      0.97      0.98     22483
weighted avg       1.00      1.00      1.00     22483

Target Variable: has_hypertension
Random Forest model accuracy: 0.9903482631321443
Random Forest model recall: 0.9365914011753789
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     19250
           1       1.00      0.94      0.97      3233

    accuracy                           0.99     22483
   macro avg       0.99      0.97      0.98     22483
weighted avg       0.99      0.99      0.99     22483

Target Variable: has_heart_disease
Random Fore