In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import cross_val_predict, cross_val_score

import pandas as pd
import numpy as np

## Carrega as duas bases para testes

In [16]:
# Com categorização das colunas bmi e glicose
stroke_cat = pd.read_csv('stroke_df_cat')


## Entrada para modelagem

In [17]:
#features

X = (stroke_cat.drop(['stroke'], axis=1)).values

#target

y = (stroke_cat['stroke']).values

## Criação dos modelos

In [18]:
ad_clf = DecisionTreeClassifier()
knn_clf = KNeighborsClassifier(n_neighbors = 5)
rf_clf = RandomForestClassifier(n_estimators=100)
lg_clf = LogisticRegression(random_state=0, max_iter=1000)
mlp_clf = MLPClassifier(random_state=1, max_iter=300)

## Amostragem Holdout teste dos modelos

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33,stratify=y)

## Oversampling

In [28]:
from imblearn.over_sampling import SMOTE

s=SMOTE()
X_res,y_res =s.fit_resample(X_train,y_train)
X_resk,y_resk = s.fit_resample(X,y) 

## Undersampling

In [20]:
from imblearn.under_sampling import ClusterCentroids,RandomUnderSampler

rus= RandomUnderSampler()
cc = ClusterCentroids()

X_res,y_res = rus.fit_resample(X_train,y_train)
X_resk,y_resk = rus.fit_resample(X,y)

## Treinar modelos

In [29]:
modelos = [ad_clf, knn_clf, rf_clf, lg_clf, mlp_clf]

# Dicionário para armazenar métricas
metrics = {"Modelo": [], "Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

for modelo in modelos:
    modelo.fit(X_res, y_res)
    y_pred = modelo.predict(X_test)
    
    modelo_name = modelo.__class__.__name__
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    metrics["Modelo"].append(modelo_name)
    metrics["Accuracy"].append(accuracy)
    metrics["Precision"].append(precision)
    metrics["Recall"].append(recall)
    metrics["F1-Score"].append(f1)

# Exiba as métricas

metrics_df = pd.DataFrame(metrics)
print(metrics_df)


                   Modelo  Accuracy  Precision    Recall  F1-Score
0  DecisionTreeClassifier  0.884752   0.135189  0.316279  0.189415
1    KNeighborsClassifier  0.887129   0.173112  0.437209  0.248021
2  RandomForestClassifier  0.895446   0.149888  0.311628  0.202417
3      LogisticRegression  0.805347   0.132887  0.646512  0.220460
4           MLPClassifier  0.799406   0.117816  0.572093  0.195393


## Treinar modelos com  o KFold

In [22]:
modelos = [ad_clf, knn_clf, rf_clf, lg_clf, mlp_clf]


metrics = {"Modelo": [], "Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

# Realize a validação cruzada e calcule as métricas para cada modelo
for modelo in modelos:
    
    modelo_name = modelo.__class__.__name__    
   
    y_pred = cross_val_predict(modelo, X, y, cv=5)
    
    # Calcule as métricas
    
    accuracy = cross_val_score(modelo, X, y, cv=5, scoring='accuracy').mean()
    precision = cross_val_score(modelo, X, y, cv=5, scoring='precision').mean()
    recall = cross_val_score(modelo, X, y, cv=5, scoring='recall').mean()
    f1 = cross_val_score(modelo, X, y, cv=5, scoring='f1').mean()
    
    # Armazene as métricas no dicionário
    metrics["Modelo"].append(modelo_name)
    metrics["Accuracy"].append(accuracy)
    metrics["Precision"].append(precision)
    metrics["Recall"].append(recall)
    metrics["F1-Score"].append(f1)

# Exiba as métricas
import pandas as pd
metrics_df = pd.DataFrame(metrics)
print(metrics_df)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                   Modelo  Accuracy  Precision    Recall  F1-Score
0  DecisionTreeClassifier  0.932105   0.171209  0.183427  0.184351
1    KNeighborsClassifier  0.955760   0.306128  0.060067  0.100375
2  RandomForestClassifier  0.948899   0.202442  0.088576  0.119536
3      LogisticRegression  0.958309   0.100000  0.003175  0.006154
4           MLPClassifier  0.958636   0.000000  0.000000  0.000000


In [23]:
modelos = [ad_clf, knn_clf, rf_clf, lg_clf, mlp_clf]


metrics = {"Modelo": [], "Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

# Realize a validação cruzada e calcule as métricas para cada modelo
for modelo in modelos:
    modelo_name = modelo.__class__.__name__
    
    # Realize a validação cruzada com previsões
    y_pred = cross_val_predict(modelo, X_resk, y_resk, cv=5)
    
    # Calcule as métricas
    accuracy = cross_val_score(modelo, X_resk, y_resk, cv=5, scoring='accuracy').mean()
    precision = cross_val_score(modelo, X_resk, y_resk, cv=5, scoring='precision').mean()
    recall = cross_val_score(modelo, X_resk, y_resk, cv=5, scoring='recall').mean()
    f1 = cross_val_score(modelo, X_resk, y_resk, cv=5, scoring='f1').mean()
    
    # Armazene as métricas no dicionário
    metrics["Modelo"].append(modelo_name)
    metrics["Accuracy"].append(accuracy)
    metrics["Precision"].append(precision)
    metrics["Recall"].append(recall)
    metrics["F1-Score"].append(f1)

# Exiba as métricas
import pandas as pd
metrics_df = pd.DataFrame(metrics)
print(metrics_df)


                   Modelo  Accuracy  Precision    Recall  F1-Score
0  DecisionTreeClassifier  0.716761   0.729177  0.707337  0.717085
1    KNeighborsClassifier  0.784827   0.787793  0.780177  0.783060
2  RandomForestClassifier  0.787182   0.771844  0.816523  0.789376
3      LogisticRegression  0.806189   0.795353  0.824484  0.809235
4           MLPClassifier  0.780102   0.781226  0.780077  0.780279


In [None]:
pipelines = {
    'Decision Tree': Pipeline([
        ('classifier', DecisionTreeClassifier())
    ]),
    'Random Forest': Pipeline([
        ('classifier', RandomForestClassifier())
    ]),
    'MLP Classifier': Pipeline([
        ('classifier', MLPClassifier())
    ]),
    'KNN': Pipeline([
        ('classifier', KNeighborsClassifier())
    ]),
}

param_grids = {
    'Random Forest':{
        'classifier__n_estimators': [50,100,200],
        'classifier__max_depth': [2,4,6]
    },
    'Decision Tree':{
        'classifier__min_samples_split': [2,3,4],
        'classifier__max_depth': [2,4,6]
    },
    'KNN':{
        'classifier__n_neighbors': [3,5,7],
        'classifier__weights': ['uniform', 'distance']
    },
    'MLP Classifier':{
        'classifier__hidden_layer_sizes': [(50,), (100,50), (100,100,50)],
        'classifier__alpha': [0.0001, 0.001, 0.01]
    }
}

for model_name, pipeline in pipelines.items():
  grid_search = GridSearchCV(pipeline, param_grids[model_name], scoring='accuracy')
  grid_search.fit(X_train, y_train)

  best_params = grid_search.best_params_
  best_model = grid_search.best_estimator_

  y_pred = best_model.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)

  print("Modelo:", model_name)
  print("Melhores Parâmetros:", best_params)
  print("Accuracy:", accuracy)

In [35]:
from sklearn.model_selection import StratifiedKFold
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Supondo que você já carregou seu conjunto de dados em X e y

# Defina o número de folds para a validação cruzada
n_splits = 5

# Crie uma instância do modelo que você deseja avaliar, por exemplo, uma árvore de decisão
model = DecisionTreeClassifier()

# Crie uma instância de StratifiedKFold para estratificação durante a validação cruzada
cv = StratifiedKFold(n_splits=n_splits)

# Crie um objeto RandomUnderSampler para realizar o undersampling
under_sampler = RandomUnderSampler()
#s=SMOTE()


# Inicialize uma lista para armazenar as métricas, como a acurácia, para cada fold
accuracies = []
precision = []
recalls = []

# Realize a validação cruzada com undersampling
for train_idx, test_idx in cv.split(X, y):
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # Realize o undersampling nos dados de treinamento
    X_resampled, y_resampled =under_sampler.fit_resample(X_train, y_train)

    # Treine o modelo no conjunto de dados de treinamento resampleado
    model.fit(X_resampled, y_resampled)

    # Faça previsões no conjunto de dados de teste
    y_pred = model.predict(X_test)

    # Calcule a métrica desejada (por exemplo, acurácia)
    accuracy = accuracy_score(y_test, y_pred)
    precisao = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    accuracies.append(accuracy)
    precision.append(precisao)
    recalls.append(recall)

# Calcule a média das acurácias de todos os folds
mean_accuracy = sum(accuracies) / n_splits
mean_precisao = sum(precision) / n_splits
mean_recall = sum(recalls) / n_splits

print("Acurácia Média:", mean_accuracy)
print("Precisao Média:", mean_precisao)
print("Recall Média:", mean_recall)


Acurácia Média: 0.7427967493215297
Precisao Média: 0.1035899699312536
Recall Média: 0.6834770653668292
