## Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from skopt import BayesSearchCV

import warnings

# Suppress all warnings for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the libraries used below; narrow the filter if those matter.
warnings.filterwarnings("ignore")

## Dataset

In [None]:
# Load the dataset from a CSV file addressed by a relative path.
# NOTE(review): the file name suggests it was preprocessed elsewhere; confirm.
df = pd.read_csv('df_processed.csv')
# Notebook display: preview the first rows of the loaded frame.
df.head()

In [None]:
# Notebook display: (number of rows, number of columns) of the dataset.
df.shape

In [None]:
# Print every column name as a plain Python list.
print(df.columns.tolist())

## Modelagem


In [None]:
# Seed passed as random_state to the estimators created further down.
R_STATE = 42
# Name of the label column: removed from the features and used as y.
TARGET = 'Heart_Disease'

In [None]:
def model_train_test(
    df: pd.DataFrame,
    target: str,
    test_size: float = 0.3,
    random_state: int = 42,
    stratify: bool = False,
) -> tuple:
    """Split a dataset into train/test features and labels.

    Args:
        df: Full dataset, including the label column.
        target: Name of the label column; it is removed from the features.
        test_size: Forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            repeatable.
        stratify: When True, the split is stratified on the label column so
            both parts keep the label proportions. Defaults to False, which
            is a plain random split.

    Returns:
        The 4-tuple ``(X_train, X_test, y_train, y_test)``.
    """
    X, y = df.drop(columns=target), df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )

    return X_train, X_test, y_train, y_test

def model_pre_process(df: pd.DataFrame, target: str, X_train: pd.DataFrame, X_test: pd.DataFrame) -> tuple:
    """Scale numeric columns and one-hot encode the remaining ones.

    The column lists are derived from the dtypes of ``df`` (without the
    label column). The transformer is fitted on ``X_train`` only and then
    applied to ``X_test``, so no test-set statistics reach the fit.

    Args:
        df: Full dataset, used only to decide which columns are numeric.
        target: Name of the label column, excluded from the feature lists.
        X_train: Training features; the transformer is fitted on these.
        X_test: Test features; transformed with the fitted transformer.

    Returns:
        The 2-tuple ``(X_train_processed, X_test_processed)`` as produced by
        ``ColumnTransformer``.
    """
    # drop() already returns a new frame, so no extra copy is needed.
    features = df.drop(columns=target)

    numerical_features = features.select_dtypes(include=[np.number]).columns.tolist()
    categorical_features = features.select_dtypes(exclude=[np.number]).columns.tolist()

    preprocessor = ColumnTransformer(
        transformers=[
            # Numeric columns are scaled with RobustScaler.
            ('num', RobustScaler(), numerical_features),
            # Categories unseen during fit are ignored at transform time
            # instead of raising.
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ]
    )

    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    return X_train_processed, X_test_processed

### Supervisionada

In [None]:
# Candidate hyperparameter values for the XGBClassifier search.
PARAM_GRID_XGB = {
    "n_estimators": [50, 100, 200],
    "learning_rate": [0.01, 0.1, 0.2],
    "max_depth": [3, 5, 10],
    "subsample": [0.6, 0.8, 1.0],
}

# Candidate hyperparameter values for the AdaBoostClassifier search.
PARAM_GRID_ADABOOST = {
    "n_estimators": [50, 100, 200],
    "learning_rate": [0.01, 0.1, 1, 10],
}

# Candidate hyperparameter values for the GradientBoostingClassifier search.
PARAM_GRID_GRADIENTBOOST = {
    "n_estimators": [50, 100, 200],
    "learning_rate": [0.01, 0.1, 0.2, 0.5],
    "max_depth": [3, 5, 10],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "subsample": [0.6, 0.8, 1.0],
}

In [None]:
def supervised_rand_search_cv(model, param_grid, X_train, X_test, y_train, y_test, cv=5, n_jobs=-1, verbose=1, random_state=42):
    """Tune a classifier with a randomized search and report test accuracy.

    Args:
        model: Unfitted estimator to tune.
        param_grid: Parameter candidates forwarded to ``RandomizedSearchCV``.
        X_train: Training features used for the cross-validated search.
        X_test: Held-out features used only for the final accuracy.
        y_train: Training labels.
        y_test: Held-out labels.
        cv: Cross-validation setting forwarded to ``RandomizedSearchCV``.
        n_jobs: Parallelism setting forwarded to ``RandomizedSearchCV``.
        verbose: Verbosity forwarded to ``RandomizedSearchCV``.
        random_state: Seed for sampling candidates, so repeated runs
            evaluate the same parameter combinations.

    Returns:
        The best estimator found by the search (``best_estimator_``).
    """
    rand_search = RandomizedSearchCV(
        model,
        param_grid,
        cv=cv,
        n_jobs=n_jobs,
        verbose=verbose,
        random_state=random_state,
    )

    rand_search.fit(X_train, y_train)

    best_model = rand_search.best_estimator_
    predictions = best_model.predict(X_test)
    # accuracy_score takes (y_true, y_pred) in that order.
    accuracy = accuracy_score(y_test, predictions)

    print(f'Melhores parâmetros: {rand_search.best_params_}')
    print(f'Precisão (acurácia): {accuracy}')

    return best_model

In [None]:
# Hold out part of the rows for evaluation, using the split defaults of
# model_train_test.
X_train, X_test, y_train, y_test = model_train_test(df=df, target=TARGET)

In [None]:
# Fit the preprocessing on the training split and apply it to both splits.
X_train_processed, X_test_processed = model_pre_process(
    df=df,
    target=TARGET,
    X_train=X_train,
    X_test=X_test,
)

In [None]:
# Unfitted base classifiers handed to the randomized searches below.
# All three are seeded with R_STATE so they are configured consistently.
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=R_STATE)
adaboost_model = AdaBoostClassifier(random_state=R_STATE)
gradientboost_model = GradientBoostingClassifier(random_state=R_STATE)

### XGBoost

In [None]:
# Tune XGBoost on the preprocessed training data; accuracy is printed for the test split.
best_xgb_model = supervised_rand_search_cv(
    model=xgb_model,
    param_grid=PARAM_GRID_XGB,
    X_train=X_train_processed,
    X_test=X_test_processed,
    y_train=y_train,
    y_test=y_test,
)

### Adaboost

In [None]:
# Tune AdaBoost on the preprocessed training data; accuracy is printed for the test split.
best_adaboost_model = supervised_rand_search_cv(
    model=adaboost_model,
    param_grid=PARAM_GRID_ADABOOST,
    X_train=X_train_processed,
    X_test=X_test_processed,
    y_train=y_train,
    y_test=y_test,
)

### GradientBoost

In [None]:
# Tune gradient boosting on the preprocessed training data; accuracy is printed for the test split.
best_gradientboost_model = supervised_rand_search_cv(
    model=gradientboost_model,
    param_grid=PARAM_GRID_GRADIENTBOOST,
    X_train=X_train_processed,
    X_test=X_test_processed,
    y_train=y_train,
    y_test=y_test,
)

### Não supervisionada

In [None]:
# Candidate hyperparameter values for the KMeans search.
PARAM_GRID_KMEANS = {
    'n_clusters': [2, 3, 5, 10],
    'init': ['k-means++', 'random'],
    'n_init': [10, 20, 30],
    'max_iter': [300, 500]
}

# Candidate hyperparameter values for the DBSCAN search.
PARAM_GRID_DBSCAN = {
    'eps': [0.3, 0.5, 0.7, 1.0],
    'min_samples': [5, 10, 15],
    'metric': ['euclidean', 'manhattan', 'cosine']
}

# Candidate hyperparameter values for the AgglomerativeClustering search.
# Ward linkage only accepts the euclidean distance, so it gets its own grid;
# a single combined grid would sample invalid ward/non-euclidean pairs.
# The search accepts a list of grids in place of a single dict.
# NOTE(review): scikit-learn 1.2 deprecated `affinity` in favour of `metric`
# and 1.4 removed it; rename the key if the installed version requires it.
PARAM_GRID_HIERARCHICAL = [
    {
        'n_clusters': [2, 3, 5, 10],
        'linkage': ['ward'],
        'affinity': ['euclidean'],
    },
    {
        'n_clusters': [2, 3, 5, 10],
        'linkage': ['complete', 'average', 'single'],
        'affinity': ['euclidean', 'l1', 'l2', 'manhattan', 'cosine'],
    },
]

In [None]:
def _silhouette_scorer(estimator, X, y=None):
    """Score a clustering estimator by the silhouette of its labels on X.

    Fallback ``scoring`` callable for estimators that have no ``score``
    method (e.g. DBSCAN, AgglomerativeClustering). The estimator is
    re-clustered on ``X`` with ``fit_predict``, since those estimators offer
    no ``predict`` to label held-out samples.
    """
    from sklearn.metrics import silhouette_score  # local import: only used here

    labels = estimator.fit_predict(X)
    n_labels = len(set(labels))
    # silhouette_score requires 2 <= n_labels <= n_samples - 1; return the
    # worst possible value for degenerate clusterings instead of raising.
    if not 2 <= n_labels <= len(labels) - 1:
        return -1.0
    return silhouette_score(X, labels)


def unsupervised_rand_search_cv(model, param_grid, df, cv=5, n_jobs=-1, verbose=1, scoring=None, random_state=42):
    """Tune a clustering estimator with a randomized search.

    Args:
        model: Unfitted clustering estimator to tune.
        param_grid: Parameter candidates forwarded to ``RandomizedSearchCV``.
        df: Data to cluster; the search is fitted on exactly this argument.
        cv: Cross-validation setting forwarded to ``RandomizedSearchCV``.
        n_jobs: Parallelism setting forwarded to ``RandomizedSearchCV``.
        verbose: Verbosity forwarded to ``RandomizedSearchCV``.
        scoring: Optional scoring forwarded to ``RandomizedSearchCV``. When
            omitted, the estimator's own ``score`` method is used if it has
            one; otherwise a silhouette-based scorer is substituted.
        random_state: Seed for sampling candidates, so repeated runs
            evaluate the same parameter combinations.

    Returns:
        The best estimator found by the search (``best_estimator_``).
    """
    # Without a scorer the search needs model.score(); fall back to the
    # silhouette scorer for estimators that do not provide one.
    if scoring is None and not hasattr(model, 'score'):
        scoring = _silhouette_scorer

    rand_search = RandomizedSearchCV(
        model,
        param_grid,
        cv=cv,
        n_jobs=n_jobs,
        verbose=verbose,
        scoring=scoring,
        random_state=random_state,
    )

    # Fit on the data passed in, not on a global variable.
    rand_search.fit(df)
    best_model = rand_search.best_estimator_

    print(f'Melhores parâmetros: {rand_search.best_params_}')

    return best_model

In [None]:
# Unfitted clustering estimators handed to the randomized searches below.
kmeans_model = KMeans(random_state=R_STATE)  # seeded with R_STATE
dbscan_model = DBSCAN()
hierarchical_model = AgglomerativeClustering()

### Kmeans

In [None]:
# Cluster the scaled/encoded training features returned by model_pre_process,
# rather than the raw frame, which still contains the target column.
best_kmeans_model = unsupervised_rand_search_cv(kmeans_model, PARAM_GRID_KMEANS, X_train_processed)

### DBSCAN

In [None]:
# Cluster the scaled/encoded training features returned by model_pre_process,
# rather than the raw frame, which still contains the target column.
best_dbscan_model = unsupervised_rand_search_cv(dbscan_model, PARAM_GRID_DBSCAN, X_train_processed)

### Hierarchical

In [None]:
# Cluster the scaled/encoded training features returned by model_pre_process,
# rather than the raw frame, which still contains the target column.
# NOTE(review): AgglomerativeClustering needs dense input; confirm the
# preprocessed matrix is not sparse before running this cell.
best_hierarchical_model = unsupervised_rand_search_cv(hierarchical_model, PARAM_GRID_HIERARCHICAL, X_train_processed)