In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import cross_val_predict, cross_val_score, StratifiedKFold

import pandas as pd
import numpy as np

## Carrega base

In [2]:
# Com categorização das colunas bmi e glicose
stroke_cat = pd.read_csv('stroke_df_cat')


## Entrada para modelagem

In [3]:
#features

X = (stroke_cat.drop(['stroke'], axis=1)).values

#target

y = (stroke_cat['stroke']).values

## Criação dos modelos

In [4]:
ad_clf = DecisionTreeClassifier()
knn_clf = KNeighborsClassifier(n_neighbors = 5)
rf_clf = RandomForestClassifier(n_estimators=100)
lg_clf = LogisticRegression(random_state=0, max_iter=1000)
mlp_clf = MLPClassifier(random_state=1, max_iter=300)

## Amostragem Holdout teste dos modelos

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33)

In [6]:
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X, y, test_size = 0.33,stratify=y)

#stratify, a função train_test_split garante que a divisão dos dados em conjuntos de treinamento e teste mantém a mesma distribuição das classes em y nos dois conjuntos

## Oversampling

In [7]:
from imblearn.over_sampling import SMOTE

s=SMOTE()
X_reso,y_reso =s.fit_resample(X_train,y_train)


## Undersampling

In [8]:
from imblearn.under_sampling import ClusterCentroids,RandomUnderSampler

rus= RandomUnderSampler()
cc = ClusterCentroids()

X_resu,y_resu = rus.fit_resample(X_train,y_train)


## Treinar modelos - com Holdout

In [9]:
modelos = [ad_clf, knn_clf, rf_clf, lg_clf, mlp_clf]

# Dicionário para armazenar métricas
metrics = {"Modelo": [], "Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

for modelo in modelos:
    modelo.fit(X_train, y_train)
    y_pred = modelo.predict(X_test)
    
    modelo_name = modelo.__class__.__name__
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    metrics["Modelo"].append(modelo_name)
    metrics["Accuracy"].append(accuracy)
    metrics["Precision"].append(precision)
    metrics["Recall"].append(recall)
    metrics["F1-Score"].append(f1)

# Exiba as métricas

metrics_df = pd.DataFrame(metrics)
print(metrics_df)


                   Modelo  Accuracy  Precision    Recall  F1-Score
0  DecisionTreeClassifier  0.934455   0.204762  0.207729  0.206235
1    KNeighborsClassifier  0.955050   0.272727  0.057971  0.095618
2  RandomForestClassifier  0.950099   0.241379  0.101449  0.142857
3      LogisticRegression  0.958812   0.444444  0.019324  0.037037
4           MLPClassifier  0.959010   0.000000  0.000000  0.000000


  _warn_prf(average, modifier, msg_start, len(result))


## Treinar modelos - com Holdout stratify

In [10]:
modelos = [ad_clf, knn_clf, rf_clf, lg_clf, mlp_clf]

# Dicionário para armazenar métricas
metrics = {"Modelo": [], "Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

for modelo in modelos:
    modelo.fit(X_train_s, y_train_s)
    y_pred = modelo.predict(X_test_s)
    
    modelo_name = modelo.__class__.__name__
    accuracy = accuracy_score(y_test_s, y_pred)
    precision = precision_score(y_test_s, y_pred)
    recall = recall_score(y_test_s, y_pred)
    f1 = f1_score(y_test_s, y_pred)
    
    metrics["Modelo"].append(modelo_name)
    metrics["Accuracy"].append(accuracy)
    metrics["Precision"].append(precision)
    metrics["Recall"].append(recall)
    metrics["F1-Score"].append(f1)

# Exiba as métricas

metrics_df = pd.DataFrame(metrics)
print(metrics_df)

                   Modelo  Accuracy  Precision    Recall  F1-Score
0  DecisionTreeClassifier  0.930495   0.185841  0.200957  0.193103
1    KNeighborsClassifier  0.956634   0.395833  0.090909  0.147860
2  RandomForestClassifier  0.948515   0.206897  0.086124  0.121622
3      LogisticRegression  0.958020   0.333333  0.014354  0.027523
4           MLPClassifier  0.957624   0.272727  0.014354  0.027273


## Treinar modelos com  o KFold

In [15]:
modelos = [ad_clf, knn_clf, rf_clf, lg_clf, mlp_clf]


metrics = {"Modelo": [], "Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

# Realize a validação cruzada e calcule as métricas para cada modelo
for modelo in modelos:
    
    modelo_name = modelo.__class__.__name__    
   
    y_pred = cross_val_predict(modelo, X, y, cv=StratifiedKFold(n_splits=5))
    
    # Calcule as métricas
    
    accuracy = cross_val_score(modelo, X, y, cv=StratifiedKFold(n_splits=5), scoring='accuracy').mean()
    precision = cross_val_score(modelo, X, y, cv=StratifiedKFold(n_splits=5), scoring='precision').mean()
    recall = cross_val_score(modelo, X, y, cv=StratifiedKFold(n_splits=5), scoring='recall').mean()
    f1 = cross_val_score(modelo, X, y, cv=StratifiedKFold(n_splits=5), scoring='f1').mean()
    
    # Armazene as métricas no dicionário
    metrics["Modelo"].append(modelo_name)
    metrics["Accuracy"].append(accuracy)
    metrics["Precision"].append(precision)
    metrics["Recall"].append(recall)
    metrics["F1-Score"].append(f1)

# Exiba as métricas
import pandas as pd
metrics_df = pd.DataFrame(metrics)
print(metrics_df)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                   Modelo  Accuracy  Precision    Recall  F1-Score
0  DecisionTreeClassifier  0.931321   0.176610  0.177078  0.164534
1    KNeighborsClassifier  0.955760   0.306128  0.060067  0.100375
2  RandomForestClassifier  0.949356   0.213365  0.091676  0.115012
3      LogisticRegression  0.958309   0.100000  0.003175  0.006154
4           MLPClassifier  0.958636   0.000000  0.000000  0.000000


## Treinar modelos com  Holdout e undersampling

In [13]:
modelos = [ad_clf, knn_clf, rf_clf, lg_clf, mlp_clf]

# Dicionário para armazenar métricas
metrics = {"Modelo": [], "Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

for modelo in modelos:
    modelo.fit(X_resu, y_resu)
    y_pred = modelo.predict(X_test)
    
    modelo_name = modelo.__class__.__name__
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    metrics["Modelo"].append(modelo_name)
    metrics["Accuracy"].append(accuracy)
    metrics["Precision"].append(precision)
    metrics["Recall"].append(recall)
    metrics["F1-Score"].append(f1)

# Exiba as métricas

metrics_df = pd.DataFrame(metrics)
print(metrics_df)

                   Modelo  Accuracy  Precision    Recall  F1-Score
0  DecisionTreeClassifier  0.721386   0.098930  0.714976  0.173811
1    KNeighborsClassifier  0.762970   0.126697  0.811594  0.219178
2  RandomForestClassifier  0.753663   0.126710  0.850242  0.220551
3      LogisticRegression  0.757624   0.127473  0.840580  0.221374
4           MLPClassifier  0.744752   0.120617  0.830918  0.210655


## Treinar modelos com  Holdout e oversampling

In [14]:
modelos = [ad_clf, knn_clf, rf_clf, lg_clf, mlp_clf]

# Dicionário para armazenar métricas
metrics = {"Modelo": [], "Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

for modelo in modelos:
    modelo.fit(X_reso, y_reso)
    y_pred = modelo.predict(X_test)
    
    modelo_name = modelo.__class__.__name__
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    metrics["Modelo"].append(modelo_name)
    metrics["Accuracy"].append(accuracy)
    metrics["Precision"].append(precision)
    metrics["Recall"].append(recall)
    metrics["F1-Score"].append(f1)

# Exiba as métricas

metrics_df = pd.DataFrame(metrics)
print(metrics_df)

                   Modelo  Accuracy  Precision    Recall  F1-Score
0  DecisionTreeClassifier  0.875248   0.122995  0.333333  0.179688
1    KNeighborsClassifier  0.885545   0.150659  0.386473  0.216802
2  RandomForestClassifier  0.895644   0.149123  0.328502  0.205128
3      LogisticRegression  0.793465   0.130742  0.714976  0.221060
4           MLPClassifier  0.821782   0.130203  0.589372  0.213287


## Treinar modelos com  Kfold e undersampling

In [23]:
modelos = [ad_clf, knn_clf, rf_clf, lg_clf, mlp_clf]
metrics = {"Modelo": [], "Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

n_splits = 5
cv = StratifiedKFold(n_splits=n_splits)
under = RandomUnderSampler()


for modelo in modelos:

    accuracies = []
    precisions = []
    recalls = []
    f1s = []    

    # Realize a validação cruzada com undersampling
    for train_idx, test_idx in cv.split(X, y):
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]

    # Realize o undersampling nos dados de treinamento
        X_resampled, y_resampled =under.fit_resample(X_train, y_train)
    
        modelo_name = modelo.__class__.__name__  
        modelo.fit(X_resampled, y_resampled)
        y_pred = modelo.predict(X_test)
    
        modelo_name = modelo.__class__.__name__
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)
        
    mean_accuracy = sum(accuracies) / n_splits
    mean_precision = sum(precisions) / n_splits
    mean_recall = sum(recalls) / n_splits
    mean_f1 = sum(f1s) / n_splits

    # Armazene as métricas no dicionário
    metrics["Modelo"].append(modelo_name)
    metrics["Accuracy"].append(mean_accuracy)    
    metrics["Precision"].append(mean_precision)
    metrics["Recall"].append(mean_recall)
    metrics["F1-Score"].append(mean_f1)

# Exiba as métricas

metrics_df = pd.DataFrame(metrics)
print(metrics_df)



                   Modelo  Accuracy  Precision    Recall  F1-Score
0  DecisionTreeClassifier  0.740443   0.105679  0.707237  0.183873
1    KNeighborsClassifier  0.777757   0.126019  0.737383  0.215205
2  RandomForestClassifier  0.760961   0.127641  0.819610  0.220835
3      LogisticRegression  0.774946   0.133659  0.810086  0.229439
4           MLPClassifier  0.692285   0.108763  0.876653  0.193134


## Treinar modelos com  Kfold e oversampling

In [24]:
modelos = [ad_clf, knn_clf, rf_clf, lg_clf, mlp_clf]
metrics = {"Modelo": [], "Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

n_splits = 5
cv = StratifiedKFold(n_splits=n_splits)
over = SMOTE()


for modelo in modelos:

    accuracies = []
    precisions = []
    recalls = []
    f1s = []    

    # Realize a validação cruzada com undersampling
    for train_idx, test_idx in cv.split(X, y):
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]

    # Realize o undersampling nos dados de treinamento
        X_resampled, y_resampled =over.fit_resample(X_train, y_train)
    
        modelo_name = modelo.__class__.__name__  
        modelo.fit(X_resampled, y_resampled)
        y_pred = modelo.predict(X_test)
    
        modelo_name = modelo.__class__.__name__
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)
        
    mean_accuracy = sum(accuracies) / n_splits
    mean_precision = sum(precisions) / n_splits
    mean_recall = sum(recalls) / n_splits
    mean_f1 = sum(f1s) / n_splits

    # Armazene as métricas no dicionário
    metrics["Modelo"].append(modelo_name)
    metrics["Accuracy"].append(mean_accuracy)    
    metrics["Precision"].append(mean_precision)
    metrics["Recall"].append(mean_recall)
    metrics["F1-Score"].append(mean_f1)

# Exiba as métricas

metrics_df = pd.DataFrame(metrics)
print(metrics_df)



                   Modelo  Accuracy  Precision    Recall  F1-Score
0  DecisionTreeClassifier  0.877409   0.122715  0.319648  0.177194
1    KNeighborsClassifier  0.889237   0.171308  0.436733  0.245996
2  RandomForestClassifier  0.888714   0.136014  0.316448  0.190114
3      LogisticRegression  0.799908   0.133637  0.701037  0.224464
4           MLPClassifier  0.822387   0.135607  0.606049  0.221138




In [None]:
pipelines = {
    'Decision Tree': Pipeline([
        ('classifier', DecisionTreeClassifier())
    ]),
    'Random Forest': Pipeline([
        ('classifier', RandomForestClassifier())
    ]),
    'MLP Classifier': Pipeline([
        ('classifier', MLPClassifier())
    ]),
    'KNN': Pipeline([
        ('classifier', KNeighborsClassifier())
    ]),
}

param_grids = {
    'Random Forest':{
        'classifier__n_estimators': [50,100,200],
        'classifier__max_depth': [2,4,6]
    },
    'Decision Tree':{
        'classifier__min_samples_split': [2,3,4],
        'classifier__max_depth': [2,4,6]
    },
    'KNN':{
        'classifier__n_neighbors': [3,5,7],
        'classifier__weights': ['uniform', 'distance']
    },
    'MLP Classifier':{
        'classifier__hidden_layer_sizes': [(50,), (100,50), (100,100,50)],
        'classifier__alpha': [0.0001, 0.001, 0.01]
    }
}

for model_name, pipeline in pipelines.items():
  grid_search = GridSearchCV(pipeline, param_grids[model_name], scoring='accuracy')
  grid_search.fit(X_train, y_train)

  best_params = grid_search.best_params_
  best_model = grid_search.best_estimator_

  y_pred = best_model.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)

  print("Modelo:", model_name)
  print("Melhores Parâmetros:", best_params)
  print("Accuracy:", accuracy)