In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import cross_val_predict, cross_val_score, StratifiedKFold

import pandas as pd
import numpy as np

## Carrega base

In [34]:
stroke_cat = pd.read_csv('stroke_df_cat')


In [35]:
stroke_cat.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,28,0,0,1,2,1,1,1,2,0
1,1,33,0,0,1,2,0,1,4,1,0
2,0,42,0,0,1,2,0,2,3,0,0
3,1,56,0,0,1,2,1,1,5,2,0
4,0,24,0,0,0,2,0,1,5,2,0


In [36]:
def categorizar_idade(value):
  if value <= 18:
    return 0
  elif 18 < value <= 30:
    return 1
  elif 30 < value <= 60:
    return 2
  else:
    return 3

stroke_cat['age'] = stroke_cat['age'].apply(categorizar_idade)

In [37]:
stroke_cat.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,1,0,0,1,2,1,1,1,2,0
1,1,2,0,0,1,2,0,1,4,1,0
2,0,2,0,0,1,2,0,2,3,0,0
3,1,2,0,0,1,2,1,1,5,2,0
4,0,1,0,0,0,2,0,1,5,2,0


## Entrada para modelagem

In [38]:
#features

X = (stroke_cat.drop(['stroke','gender','ever_married','work_type','Residence_type','smoking_status'], axis=1)).values

#target

y = (stroke_cat['stroke']).values

## Criação dos modelos

In [39]:
ad_clf = DecisionTreeClassifier()
knn_clf = KNeighborsClassifier(n_neighbors = 5)
rf_clf = RandomForestClassifier(n_estimators=100)
lg_clf = LogisticRegression(random_state=0, max_iter=1000)
mlp_clf = MLPClassifier(random_state=1, max_iter=300)

## Amostragem Holdout teste dos modelos

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33)

In [41]:
y_train_v =  pd.DataFrame(y_train)
print(y_train_v.value_counts()/y_train_v.shape[0]*100)
print(y_train_v.shape[0])

0    95.845119
1     4.154881
Name: count, dtype: float64
10253


In [42]:
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X, y, test_size = 0.33,stratify=y)

#stratify, a função train_test_split garante que a divisão dos dados em conjuntos de treinamento e teste mantém a mesma distribuição das classes em y nos dois conjuntos

In [43]:
y_train_v =  pd.DataFrame(y_train_s)
print(y_train_v.value_counts()/y_train_v.shape[0]*100)
print(y_train_v.shape[0])

0    95.874378
1     4.125622
Name: count, dtype: float64
10253


## Oversampling

In [44]:
from imblearn.over_sampling import SMOTE

s=SMOTE()
X_reso,y_reso =s.fit_resample(X_train,y_train)


In [45]:
y_train_v =  pd.DataFrame(y_reso)
print(y_train_v.value_counts()/y_train_v.shape[0]*100)
print(y_train_v.shape[0])

0    50.0
1    50.0
Name: count, dtype: float64
19654


## Undersampling

In [46]:
from imblearn.under_sampling import ClusterCentroids,RandomUnderSampler

rus= RandomUnderSampler()
cc = ClusterCentroids()

X_resu,y_resu = rus.fit_resample(X_train,y_train)


In [47]:
y_train_v =  pd.DataFrame(y_resu)
print(y_train_v.value_counts()/y_train_v.shape[0]*100)
print(y_train_v.shape[0])

0    50.0
1    50.0
Name: count, dtype: float64
852


## Treinar modelos - com Holdout

In [48]:
modelos = [ad_clf, knn_clf, rf_clf, lg_clf, mlp_clf]

# Dicionário para armazenar métricas
metrics = {"Modelo": [], "Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

for modelo in modelos:
    modelo.fit(X_train, y_train)
    y_pred = modelo.predict(X_test)
    
    modelo_name = modelo.__class__.__name__
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    metrics["Modelo"].append(modelo_name)
    metrics["Accuracy"].append(accuracy)
    metrics["Precision"].append(precision)
    metrics["Recall"].append(recall)
    metrics["F1-Score"].append(f1)

# Exiba as métricas

metrics_df = pd.DataFrame(metrics)
print(metrics_df)


                   Modelo  Accuracy  Precision    Recall  F1-Score
0  DecisionTreeClassifier  0.957822   0.230769  0.014563  0.027397
1    KNeighborsClassifier  0.943762   0.157895  0.087379  0.112500
2  RandomForestClassifier  0.956832   0.166667  0.014563  0.026786
3      LogisticRegression  0.959406   1.000000  0.004854  0.009662
4           MLPClassifier  0.958812   0.375000  0.014563  0.028037


## Treinar modelos - com Holdout stratify

In [49]:
modelos = [ad_clf, knn_clf, rf_clf, lg_clf, mlp_clf]

# Dicionário para armazenar métricas
metrics = {"Modelo": [], "Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

for modelo in modelos:
    modelo.fit(X_train_s, y_train_s)
    y_pred = modelo.predict(X_test_s)
    
    modelo_name = modelo.__class__.__name__
    accuracy = accuracy_score(y_test_s, y_pred)
    precision = precision_score(y_test_s, y_pred)
    recall = recall_score(y_test_s, y_pred)
    f1 = f1_score(y_test_s, y_pred)
    
    metrics["Modelo"].append(modelo_name)
    metrics["Accuracy"].append(accuracy)
    metrics["Precision"].append(precision)
    metrics["Recall"].append(recall)
    metrics["F1-Score"].append(f1)

# Exiba as métricas

metrics_df = pd.DataFrame(metrics)
print(metrics_df)

  _warn_prf(average, modifier, msg_start, len(result))


                   Modelo  Accuracy  Precision    Recall  F1-Score
0  DecisionTreeClassifier  0.957228   0.315789  0.028708  0.052632
1    KNeighborsClassifier  0.950099   0.179104  0.057416  0.086957
2  RandomForestClassifier  0.957228   0.315789  0.028708  0.052632
3      LogisticRegression  0.958614   0.000000  0.000000  0.000000
4           MLPClassifier  0.958614   0.000000  0.000000  0.000000


  _warn_prf(average, modifier, msg_start, len(result))


## Treinar modelos com  o KFold

In [50]:
modelos = [ad_clf, knn_clf, rf_clf, lg_clf, mlp_clf]


metrics = {"Modelo": [], "Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

# Realize a validação cruzada e calcule as métricas para cada modelo
for modelo in modelos:
    
    modelo_name = modelo.__class__.__name__ 
    skf = StratifiedKFold(n_splits=5, shuffle=True)   
   
    y_pred = cross_val_predict(modelo, X, y, cv=skf)
    
    # Calcule as métricas
    
    accuracy = cross_val_score(modelo, X, y, cv=skf, scoring='accuracy').mean()
    precision = cross_val_score(modelo, X, y, cv=skf, scoring='precision').mean()
    recall = cross_val_score(modelo, X, y, cv=skf, scoring='recall').mean()
    f1 = cross_val_score(modelo, X, y, cv=skf, scoring='f1').mean()
    
    # Armazene as métricas no dicionário
    metrics["Modelo"].append(modelo_name)
    metrics["Accuracy"].append(accuracy)
    metrics["Precision"].append(precision)
    metrics["Recall"].append(recall)
    metrics["F1-Score"].append(f1)

# Exiba as métricas
import pandas as pd
metrics_df = pd.DataFrame(metrics)
print(metrics_df)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                   Modelo  Accuracy  Precision    Recall  F1-Score
0  DecisionTreeClassifier  0.957786   0.121429  0.011061  0.014820
1    KNeighborsClassifier  0.955630   0.333869  0.037958  0.083627
2  RandomForestClassifier  0.957786   0.164286  0.014211  0.015130
3      LogisticRegression  0.958701   0.000000  0.000000  0.000000
4           MLPClassifier  0.958636   0.150000  0.004762  0.006154


## Treinar modelos com  Holdout e undersampling

In [51]:
modelos = [ad_clf, knn_clf, rf_clf, lg_clf, mlp_clf]

# Dicionário para armazenar métricas
metrics = {"Modelo": [], "Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

for modelo in modelos:
    modelo.fit(X_resu, y_resu)
    y_pred = modelo.predict(X_test)
    
    modelo_name = modelo.__class__.__name__
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    metrics["Modelo"].append(modelo_name)
    metrics["Accuracy"].append(accuracy)
    metrics["Precision"].append(precision)
    metrics["Recall"].append(recall)
    metrics["F1-Score"].append(f1)

# Exiba as métricas

metrics_df = pd.DataFrame(metrics)
print(metrics_df)

                   Modelo  Accuracy  Precision    Recall  F1-Score
0  DecisionTreeClassifier  0.782970   0.122241  0.699029  0.208092
1    KNeighborsClassifier  0.905347   0.166667  0.330097  0.221498
2  RandomForestClassifier  0.778614   0.125616  0.742718  0.214888
3      LogisticRegression  0.809703   0.142180  0.728155  0.237906
4           MLPClassifier  0.785545   0.133054  0.771845  0.226981


## Treinar modelos com  Holdout e oversampling

In [52]:
modelos = [ad_clf, knn_clf, rf_clf, lg_clf, mlp_clf]

# Dicionário para armazenar métricas
metrics = {"Modelo": [], "Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

for modelo in modelos:
    modelo.fit(X_reso, y_reso)
    y_pred = modelo.predict(X_test)
    
    modelo_name = modelo.__class__.__name__
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    metrics["Modelo"].append(modelo_name)
    metrics["Accuracy"].append(accuracy)
    metrics["Precision"].append(precision)
    metrics["Recall"].append(recall)
    metrics["F1-Score"].append(f1)

# Exiba as métricas

metrics_df = pd.DataFrame(metrics)
print(metrics_df)

                   Modelo  Accuracy  Precision    Recall  F1-Score
0  DecisionTreeClassifier  0.772673   0.117073  0.699029  0.200557
1    KNeighborsClassifier  0.940000   0.160839  0.111650  0.131805
2  RandomForestClassifier  0.772079   0.116788  0.699029  0.200139
3      LogisticRegression  0.823564   0.147996  0.699029  0.244275
4           MLPClassifier  0.797822   0.128532  0.684466  0.216424


## Treinar modelos com  Kfold e undersampling

In [54]:
modelos = [ad_clf, knn_clf, rf_clf, lg_clf, mlp_clf]
metrics = {"Modelo": [], "Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

n_splits = 5
cv = StratifiedKFold(n_splits=n_splits,shuffle=True)

under = RandomUnderSampler()


for modelo in modelos:

    accuracies = []
    precisions = []
    recalls = []
    f1s = []    

    # Realize a validação cruzada com undersampling
    for train_idx, test_idx in cv.split(X, y):
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]
        
    # Realize o undersampling nos dados de treinamento
        X_resampled, y_resampled =under.fit_resample(X_train, y_train)
    
        modelo_name = modelo.__class__.__name__  
        modelo.fit(X_resampled, y_resampled)
        y_pred = modelo.predict(X_test)
    
        modelo_name = modelo.__class__.__name__
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)
        
    mean_accuracy = sum(accuracies) / n_splits
    mean_precision = sum(precisions) / n_splits
    mean_recall = sum(recalls) / n_splits
    mean_f1 = sum(f1s) / n_splits

    # Armazene as métricas no dicionário
    metrics["Modelo"].append(modelo_name)
    metrics["Accuracy"].append(mean_accuracy)    
    metrics["Precision"].append(mean_precision)
    metrics["Recall"].append(mean_recall)
    metrics["F1-Score"].append(mean_f1)

# Exiba as métricas

metrics_df = pd.DataFrame(metrics)
print(metrics_df)



                   Modelo  Accuracy  Precision    Recall  F1-Score
0  DecisionTreeClassifier  0.790174   0.136155  0.764104  0.230935
1    KNeighborsClassifier  0.911324   0.182873  0.333883  0.236010
2  RandomForestClassifier  0.781938   0.133397  0.778428  0.227702
3      LogisticRegression  0.812520   0.149945  0.757818  0.250345
4           MLPClassifier  0.806117   0.147501  0.772291  0.247610


## Treinar modelos com  Kfold e oversampling

In [55]:
modelos = [ad_clf, knn_clf, rf_clf, lg_clf, mlp_clf]
metrics = {"Modelo": [], "Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

n_splits = 5
cv = StratifiedKFold(n_splits=n_splits)
over = SMOTE()


for modelo in modelos:

    accuracies = []
    precisions = []
    recalls = []
    f1s = []    

    # Realize a validação cruzada com undersampling
    for train_idx, test_idx in cv.split(X, y):
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]

    # Realize o undersampling nos dados de treinamento
        X_resampled, y_resampled =over.fit_resample(X_train, y_train)
    
        modelo_name = modelo.__class__.__name__  
        modelo.fit(X_resampled, y_resampled)
        y_pred = modelo.predict(X_test)
    
        modelo_name = modelo.__class__.__name__
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)
        
    mean_accuracy = sum(accuracies) / n_splits
    mean_precision = sum(precisions) / n_splits
    mean_recall = sum(recalls) / n_splits
    mean_f1 = sum(f1s) / n_splits

    # Armazene as métricas no dicionário
    metrics["Modelo"].append(modelo_name)
    metrics["Accuracy"].append(mean_accuracy)    
    metrics["Precision"].append(mean_precision)
    metrics["Recall"].append(mean_recall)
    metrics["F1-Score"].append(mean_f1)

# Exiba as métricas

metrics_df = pd.DataFrame(metrics)
print(metrics_df)

                   Modelo  Accuracy  Precision    Recall  F1-Score
0  DecisionTreeClassifier  0.807685   0.140133  0.707287  0.233808
1    KNeighborsClassifier  0.953343   0.272929  0.071216  0.111885
2  RandomForestClassifier  0.806900   0.139430  0.707324  0.232858
3      LogisticRegression  0.822649   0.154407  0.734171  0.255126
4           MLPClassifier  0.803436   0.142650  0.745232  0.239322


In [56]:
from imblearn.under_sampling import ClusterCentroids,RandomUnderSampler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, stratify=y)
under = RandomUnderSampler()
X_resampled, y_resampled =under.fit_resample(X_train, y_train)

pipelines = {
    'Decision Tree': Pipeline([
        ('classifier', DecisionTreeClassifier())
    ]),
    'Random Forest': Pipeline([
        ('classifier', RandomForestClassifier())
    ]),
    'MLP Classifier': Pipeline([
        ('classifier', MLPClassifier())
    ]),
    'KNN': Pipeline([
        ('classifier', KNeighborsClassifier())
    ]),
    'Logistic Regression': Pipeline([
        ('classifier', LogisticRegression())
    ]),
}

param_grids = {
    'Decision Tree':{
        'classifier__min_samples_split': [2,5,10],
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__criterion': ['gini', 'entropy']
    },
    'Random Forest':{
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__n_estimators': [50,100,150],
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__criterion': ['gini', 'entropy']
    },
    'MLP Classifier':{
        'classifier__hidden_layer_sizes': [(50,), (100,50), (100,100,50)],
        'classifier__activation': ['identity', 'logistic', 'tanh', 'relu'],
        'classifier__alpha': [0.0001, 0.001, 0.01],
        'classifier__max_iter': [500],
        'classifier__learning_rate': ['adaptive']
    },
    'KNN':{
        'classifier__n_neighbors': [3,5,7,9],
        'classifier__weights': ['uniform', 'distance'],
        'classifier__metric': ['euclidean', 'manhattan']
    },
    'Logistic Regression':{
        'classifier__max_iter': [1000,2000],
        'classifier__class_weight': ['none', 'balanced']
    }
}

scoring = ['precision', 'recall']
for model_name, pipeline in pipelines.items():
  
  #grid_search = GridSearchCV(pipeline, param_grids[model_name], scoring=scoring, refit=False)  
  grid_search = GridSearchCV(pipeline, param_grids[model_name], scoring='recall')
  grid_search.fit(X_resampled, y_resampled)

  best_params = grid_search.best_params_
  best_model = grid_search.best_estimator_

  y_pred = best_model.predict(X_test)
  recall = recall_score(y_test, y_pred)

  print("Modelo:", model_name)
  print("Melhores Parâmetros:", best_params)
  print("Recall:", recall)

Modelo: Decision Tree
Melhores Parâmetros: {'classifier__criterion': 'gini', 'classifier__max_depth': None, 'classifier__min_samples_split': 10}
Recall: 0.784688995215311
Modelo: Random Forest
Melhores Parâmetros: {'classifier__criterion': 'gini', 'classifier__max_depth': None, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 100}
Recall: 0.7990430622009569




Modelo: MLP Classifier
Melhores Parâmetros: {'classifier__activation': 'tanh', 'classifier__alpha': 0.01, 'classifier__hidden_layer_sizes': (100, 100, 50), 'classifier__learning_rate': 'adaptive', 'classifier__max_iter': 500}
Recall: 0.8038277511961722
Modelo: KNN
Melhores Parâmetros: {'classifier__metric': 'euclidean', 'classifier__n_neighbors': 9, 'classifier__weights': 'distance'}
Recall: 0.430622009569378
Modelo: Logistic Regression
Melhores Parâmetros: {'classifier__class_weight': 'balanced', 'classifier__max_iter': 1000}
Recall: 0.7703349282296651


10 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 427, in fit
    self._final_estimator.fit(Xt, y, **fit_param