<a href="https://colab.research.google.com/github/luiseduaardo/NeuroPrev/blob/main/treinamento.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
!git clone https://github.com/luiseduaardo/NeuroPrev.git
%cd NeuroPrev

Cloning into 'NeuroPrev'...
remote: Enumerating objects: 39, done.[K
remote: Counting objects: 100% (39/39), done.[K
remote: Compressing objects: 100% (33/33), done.[K
remote: Total 39 (delta 11), reused 18 (delta 4), pack-reused 0 (from 0)[K
Receiving objects: 100% (39/39), 631.45 KiB | 2.46 MiB/s, done.
Resolving deltas: 100% (11/11), done.
/content/NeuroPrev/NeuroPrev/NeuroPrev/NeuroPrev/NeuroPrev


In [1]:
!pip install optuna



In [2]:
import pandas as pd
import numpy as np
import joblib
import os
import optuna
from imblearn.over_sampling import SMOTENC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import f1_score

In [3]:
df = pd.read_csv('dataset/healthcare-dataset-stroke-data.csv')


In [4]:
df = df.drop('id', axis = 1) #remove coluna id

#separacao do target
X = df.drop("stroke", axis=1)
y = df["stroke"]


#split treino e teste
x_treino, x_teste, y_treino, y_teste = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=6,
    stratify=y
)

#imputacao da mediana do bmi no treino
median_bmi = x_treino["bmi"].median()

x_treino["bmi"] = x_treino["bmi"].fillna(median_bmi)
x_teste["bmi"] = x_teste["bmi"].fillna(median_bmi)


In [5]:
#one hot encoding
x_treino = pd.get_dummies(x_treino)
x_teste = pd.get_dummies(x_teste)

In [6]:
#alinhar colunas
x_treino, x_teste = x_treino.align(x_teste, join="left", axis=1, fill_value=0)

#XGBOOST

#RANDOM FOREST


In [33]:
#identificar categorias
categorias = []
for i, col in enumerate(x_treino.columns):
    if x_treino[col].nunique() <= 2: #se a coluna for binaria ou for int
        categorias.append(i)

smotenc = SMOTENC(categorical_features = categorias, random_state = 6) #não quebra os valores de categorias
x_treino_blc, y_treino_blc = smotenc.fit_resample(x_treino, y_treino)

In [35]:
#teste com optuna

def objetivo(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500), #num arvores
        'max_depth': trial.suggest_int('max_depth', 5, 30), #profundidade max
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 15), #min de exemplos pra dividir um nó
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10), #min de exemplos numa folha
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'class_weight': 'balanced', #fixo para ajudar no desbalanceamento
        'random_state': 6,
        'n_jobs': -1
    }

    #cria e treina modelo
    modelo = RandomForestClassifier(**params)
    modelo.fit(x_treino_blc, y_treino_blc)

    preds = modelo.predict(x_teste) #testa
    return f1_score(y_teste, preds, pos_label=1)

#estudo e otimização
estudo_rf = optuna.create_study(direction = 'maximize')
estudo_rf.optimize(objetivo, n_trials = 100)

[I 2026-01-29 22:29:45,289] A new study created in memory with name: no-name-86934374-1c28-4461-8d1f-ca6b81a2cee2
[I 2026-01-29 22:29:47,404] Trial 0 finished with value: 0.26153846153846155 and parameters: {'n_estimators': 269, 'max_depth': 28, 'min_samples_split': 11, 'min_samples_leaf': 2, 'bootstrap': True}. Best is trial 0 with value: 0.26153846153846155.
[I 2026-01-29 22:29:49,618] Trial 1 finished with value: 0.22988505747126436 and parameters: {'n_estimators': 464, 'max_depth': 5, 'min_samples_split': 4, 'min_samples_leaf': 3, 'bootstrap': True}. Best is trial 0 with value: 0.26153846153846155.
[I 2026-01-29 22:29:50,903] Trial 2 finished with value: 0.2549019607843137 and parameters: {'n_estimators': 153, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 2, 'bootstrap': False}. Best is trial 0 with value: 0.26153846153846155.
[I 2026-01-29 22:29:54,891] Trial 3 finished with value: 0.2125 and parameters: {'n_estimators': 406, 'max_depth': 28, 'min_samples_split': 3,

In [None]:
#pegando os melhores parâmetros do Optuna
melhores_params = estudo_rf.best_params
melhores_params['class_weight'] = 'balanced'
melhores_params['random_state'] = 6
melhores_params['n_jobs'] = -1

#treina modelo final
modelo_final = RandomForestClassifier(**melhores_params)
modelo_final.fit(x_treino_blc, y_treino_blc)

#salvando os pesos do treinamento
if not os.path.exists('weights'):
    os.makedirs('weights')

joblib.dump(modelo_final, 'weights/modelo_avc_final.pkl')
print("Sucesso! Modelo salvo em weights/modelo_avc_final.pkl")