In [1]:
# Import modules

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier
import time
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
optimization_time_in_seconds = -time.time()

In [3]:
# Load and prepare data

df = pd.read_csv("heart_2022_with_nans.csv")

df.drop_duplicates(inplace=True)

for col in df.columns:
    if df[col].dtype == 'object':
        min_value = df[col].dropna().mode().iloc[0]
        df[col] = df[col].fillna(min_value)
    else:
        mean_value = df[col].mean()
        df[col] = df[col].fillna(mean_value)

def bmi_category(bmi):
    if bmi > 24.9:
        return 'Above Normal'
    elif bmi >= 18.5 and bmi <= 24.9:
        return 'Normal'
    else:
        return 'Below Normal'

# Apply the function to create a new column 'bmi_category'
df['bmi_category'] = df['BMI'].apply(bmi_category)

for col in df.select_dtypes(include='object').columns:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])

correlation_matrix = df.corr()
strong_correlation_cols = [col for col in correlation_matrix.columns if abs(correlation_matrix.loc['HadHeartAttack', col]) >= 0.06 and col != 'HadHeartAttack']

# Select features (X) and target (y)
X = df[strong_correlation_cols]
y = df['HadHeartAttack']

s = MinMaxScaler()
X = s.fit_transform(X)

# Split data to train and test sets

x_train,x_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=12)


In [4]:
# Defining objective function

def objective(trial: optuna.Trial):
    global x_train, x_test, y_train, y_test

    n_layers = trial.suggest_int('n_layers', 1, 4)

    per_layer = 100 // n_layers

    layers = [trial.suggest_int(f'n_units_{i}', per_layer, per_layer) for i in range(n_layers)]
    
    for i in range(n_layers, 4):
        trial.suggest_int(f'n_units_{i}', 0, 0)

    activation = trial.suggest_categorical('activation', ['identity', 'logistic', 'tanh', 'relu'])
    solver = trial.suggest_categorical('solver', ['lbfgs', 'sgd', 'adam'])
    alpha = trial.suggest_float('alpha', 0.00001, 0.3)
    learning_rate = trial.suggest_categorical('learning_rate', ['constant', 'invscaling', 'adaptive'])
    
    clf =  MLPClassifier(hidden_layer_sizes=tuple(layers), activation=activation, solver=solver, alpha=alpha, learning_rate=learning_rate, max_iter=200)
    clf.fit(x_train, y_train)

    y_pred = clf.predict(x_test)
    
    return f1_score(y_test, y_pred)


In [5]:
study = optuna.create_study(study_name="MLPClassifier", direction='maximize')

#optimization_time_in_seconds = -time.time()

study.optimize(objective, n_trials=50)

[I 2024-08-12 08:56:18,238] A new study created in memory with name: MLPClassifier
[I 2024-08-12 08:59:10,786] Trial 0 finished with value: 0.24295432458697766 and parameters: {'n_layers': 3, 'n_units_0': 33, 'n_units_1': 33, 'n_units_2': 33, 'n_units_3': 0, 'activation': 'logistic', 'solver': 'adam', 'alpha': 0.01243609576889099, 'learning_rate': 'constant'}. Best is trial 0 with value: 0.24295432458697766.
[I 2024-08-12 09:00:26,323] Trial 1 finished with value: 0.28972675927339336 and parameters: {'n_layers': 3, 'n_units_0': 33, 'n_units_1': 33, 'n_units_2': 33, 'n_units_3': 0, 'activation': 'relu', 'solver': 'sgd', 'alpha': 0.001312567669799098, 'learning_rate': 'adaptive'}. Best is trial 1 with value: 0.28972675927339336.
[I 2024-08-12 09:00:48,442] Trial 2 finished with value: 0.0 and parameters: {'n_layers': 4, 'n_units_0': 25, 'n_units_1': 25, 'n_units_2': 25, 'n_units_3': 25, 'activation': 'logistic', 'solver': 'lbfgs', 'alpha': 0.22643358778933215, 'learning_rate': 'constant'

In [6]:
optimization_time_in_seconds += time.time()

In [7]:
print("Best params:", study.best_params)
print("Best value:", study.best_value)
hours = int(optimization_time_in_seconds) // 3600
minutes = (int(optimization_time_in_seconds) // 60) % 60
seconds = int(optimization_time_in_seconds)
seconds = seconds % 60 + (optimization_time_in_seconds - int(optimization_time_in_seconds))    
print("Optimization time: {} h {} m {} s".format(hours, minutes, seconds))

Best params: {'n_layers': 3, 'n_units_0': 33, 'n_units_1': 33, 'n_units_2': 33, 'n_units_3': 0, 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.06792533083487824, 'learning_rate': 'constant'}
Best value: 0.389967845659164
Optimization time: 1 h 11 m 39.588736057281494 s
