In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import cross_val_predict, cross_val_score, StratifiedKFold

import pandas as pd
import numpy as np

## Carrega base

In [3]:
# Com categorização das colunas bmi e glicose
stroke_cat = pd.read_csv('stroke_df_cat')


In [4]:
stroke_cat.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,28,0,0,1,2,1,1,1,2,0
1,1,33,0,0,1,2,0,1,4,1,0
2,0,42,0,0,1,2,0,2,3,0,0
3,1,56,0,0,1,2,1,1,5,2,0
4,0,24,0,0,0,2,0,1,5,2,0


## Entrada para modelagem

In [5]:
#features

X = (stroke_cat.drop(['stroke'], axis=1)).values

#target

y = (stroke_cat['stroke']).values

## Criação dos modelos

In [6]:
ad_clf = DecisionTreeClassifier()
knn_clf = KNeighborsClassifier(n_neighbors = 5)
rf_clf = RandomForestClassifier(n_estimators=100)
lg_clf = LogisticRegression(random_state=0, max_iter=1000)
mlp_clf = MLPClassifier(random_state=1, max_iter=300)

## Amostragem Holdout teste dos modelos

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33)

In [25]:
y_train_v =  pd.DataFrame(y_train)
print(y_train_v.value_counts()/y_train_v.shape[0]*100)
print(y_train_v.shape[0])

0    95.757339
1     4.242661
Name: count, dtype: float64
10253


In [26]:
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X, y, test_size = 0.33,stratify=y)

#stratify, a função train_test_split garante que a divisão dos dados em conjuntos de treinamento e teste mantém a mesma distribuição das classes em y nos dois conjuntos

In [27]:
y_train_v =  pd.DataFrame(y_train_s)
print(y_train_v.value_counts()/y_train_v.shape[0]*100)
print(y_train_v.shape[0])

0    95.874378
1     4.125622
Name: count, dtype: float64
10253


## Oversampling

In [28]:
from imblearn.over_sampling import SMOTE

s=SMOTE()
X_reso,y_reso =s.fit_resample(X_train,y_train)


In [29]:
y_train_v =  pd.DataFrame(y_reso)
print(y_train_v.value_counts()/y_train_v.shape[0]*100)
print(y_train_v.shape[0])

0    50.0
1    50.0
Name: count, dtype: float64
19636


## Undersampling

In [32]:
from imblearn.under_sampling import ClusterCentroids,RandomUnderSampler

rus= RandomUnderSampler()
cc = ClusterCentroids()

X_resu,y_resu = rus.fit_resample(X_train,y_train)


In [33]:
y_train_v =  pd.DataFrame(y_resu)
print(y_train_v.value_counts()/y_train_v.shape[0]*100)
print(y_train_v.shape[0])

0    50.0
1    50.0
Name: count, dtype: float64
870


## Treinar modelos - com Holdout

In [34]:
modelos = [ad_clf, knn_clf, rf_clf, lg_clf, mlp_clf]

# Dicionário para armazenar métricas
metrics = {"Modelo": [], "Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

for modelo in modelos:
    modelo.fit(X_train, y_train)
    y_pred = modelo.predict(X_test)
    
    modelo_name = modelo.__class__.__name__
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    metrics["Modelo"].append(modelo_name)
    metrics["Accuracy"].append(accuracy)
    metrics["Precision"].append(precision)
    metrics["Recall"].append(recall)
    metrics["F1-Score"].append(f1)

# Exiba as métricas

metrics_df = pd.DataFrame(metrics)
print(metrics_df)


                   Modelo  Accuracy  Precision    Recall  F1-Score
0  DecisionTreeClassifier  0.937228   0.208738  0.218274  0.213400
1    KNeighborsClassifier  0.956436   0.232558  0.050761  0.083333
2  RandomForestClassifier  0.953663   0.265823  0.106599  0.152174
3      LogisticRegression  0.960594   0.250000  0.005076  0.009950
4           MLPClassifier  0.960990   0.500000  0.010152  0.019900


## Treinar modelos - com Holdout stratify

In [35]:
modelos = [ad_clf, knn_clf, rf_clf, lg_clf, mlp_clf]

# Dicionário para armazenar métricas
metrics = {"Modelo": [], "Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

for modelo in modelos:
    modelo.fit(X_train_s, y_train_s)
    y_pred = modelo.predict(X_test_s)
    
    modelo_name = modelo.__class__.__name__
    accuracy = accuracy_score(y_test_s, y_pred)
    precision = precision_score(y_test_s, y_pred)
    recall = recall_score(y_test_s, y_pred)
    f1 = f1_score(y_test_s, y_pred)
    
    metrics["Modelo"].append(modelo_name)
    metrics["Accuracy"].append(accuracy)
    metrics["Precision"].append(precision)
    metrics["Recall"].append(recall)
    metrics["F1-Score"].append(f1)

# Exiba as métricas

metrics_df = pd.DataFrame(metrics)
print(metrics_df)

                   Modelo  Accuracy  Precision    Recall  F1-Score
0  DecisionTreeClassifier  0.937624   0.229592  0.215311  0.222222
1    KNeighborsClassifier  0.956040   0.290323  0.043062  0.075000
2  RandomForestClassifier  0.955248   0.350877  0.095694  0.150376
3      LogisticRegression  0.958020   0.285714  0.009569  0.018519
4           MLPClassifier  0.958614   0.000000  0.000000  0.000000


  _warn_prf(average, modifier, msg_start, len(result))


## Treinar modelos com  o KFold

In [36]:
modelos = [ad_clf, knn_clf, rf_clf, lg_clf, mlp_clf]


metrics = {"Modelo": [], "Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

# Realize a validação cruzada e calcule as métricas para cada modelo
for modelo in modelos:
    
    modelo_name = modelo.__class__.__name__ 
    skf = StratifiedKFold(n_splits=5, shuffle=True)   
   
    y_pred = cross_val_predict(modelo, X, y, cv=skf)
    
    # Calcule as métricas
    
    accuracy = cross_val_score(modelo, X, y, cv=skf, scoring='accuracy').mean()
    precision = cross_val_score(modelo, X, y, cv=skf, scoring='precision').mean()
    recall = cross_val_score(modelo, X, y, cv=skf, scoring='recall').mean()
    f1 = cross_val_score(modelo, X, y, cv=skf, scoring='f1').mean()
    
    # Armazene as métricas no dicionário
    metrics["Modelo"].append(modelo_name)
    metrics["Accuracy"].append(accuracy)
    metrics["Precision"].append(precision)
    metrics["Recall"].append(recall)
    metrics["F1-Score"].append(f1)

# Exiba as métricas
import pandas as pd
metrics_df = pd.DataFrame(metrics)
print(metrics_df)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                   Modelo  Accuracy  Precision    Recall  F1-Score
0  DecisionTreeClassifier  0.933150   0.195232  0.202500  0.189867
1    KNeighborsClassifier  0.956283   0.383065  0.064879  0.106190
2  RandomForestClassifier  0.951121   0.278423  0.090164  0.132966
3      LogisticRegression  0.958505   0.170000  0.003162  0.015531
4           MLPClassifier  0.958636   0.050000  0.020585  0.030149


## Treinar modelos com  Holdout e undersampling

In [37]:
modelos = [ad_clf, knn_clf, rf_clf, lg_clf, mlp_clf]

# Dicionário para armazenar métricas
metrics = {"Modelo": [], "Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

for modelo in modelos:
    modelo.fit(X_resu, y_resu)
    y_pred = modelo.predict(X_test)
    
    modelo_name = modelo.__class__.__name__
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    metrics["Modelo"].append(modelo_name)
    metrics["Accuracy"].append(accuracy)
    metrics["Precision"].append(precision)
    metrics["Recall"].append(recall)
    metrics["F1-Score"].append(f1)

# Exiba as métricas

metrics_df = pd.DataFrame(metrics)
print(metrics_df)

                   Modelo  Accuracy  Precision    Recall  F1-Score
0  DecisionTreeClassifier  0.721584   0.095652  0.725888  0.169031
1    KNeighborsClassifier  0.761584   0.114767  0.761421  0.199468
2  RandomForestClassifier  0.758812   0.114717  0.771574  0.199737
3      LogisticRegression  0.778614   0.131305  0.832487  0.226833
4           MLPClassifier  0.733465   0.108918  0.812183  0.192077


## Treinar modelos com  Holdout e oversampling

In [38]:
modelos = [ad_clf, knn_clf, rf_clf, lg_clf, mlp_clf]

# Dicionário para armazenar métricas
metrics = {"Modelo": [], "Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

for modelo in modelos:
    modelo.fit(X_reso, y_reso)
    y_pred = modelo.predict(X_test)
    
    modelo_name = modelo.__class__.__name__
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    metrics["Modelo"].append(modelo_name)
    metrics["Accuracy"].append(accuracy)
    metrics["Precision"].append(precision)
    metrics["Recall"].append(recall)
    metrics["F1-Score"].append(f1)

# Exiba as métricas

metrics_df = pd.DataFrame(metrics)
print(metrics_df)

                   Modelo  Accuracy  Precision    Recall  F1-Score
0  DecisionTreeClassifier  0.882574   0.133333  0.365482  0.195387
1    KNeighborsClassifier  0.886337   0.157895  0.441624  0.232620
2  RandomForestClassifier  0.891683   0.133891  0.324873  0.189630
3      LogisticRegression  0.799604   0.129882  0.725888  0.220339
4           MLPClassifier  0.786733   0.118056  0.690355  0.201631


## Treinar modelos com  Kfold e undersampling

In [39]:
modelos = [ad_clf, knn_clf, rf_clf, lg_clf, mlp_clf]
metrics = {"Modelo": [], "Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

n_splits = 5
cv = StratifiedKFold(n_splits=n_splits,shuffle=True)

under = RandomUnderSampler()


for modelo in modelos:

    accuracies = []
    precisions = []
    recalls = []
    f1s = []    

    # Realize a validação cruzada com undersampling
    for train_idx, test_idx in cv.split(X, y):
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]
        
    # Realize o undersampling nos dados de treinamento
        X_resampled, y_resampled =under.fit_resample(X_train, y_train)
    
        modelo_name = modelo.__class__.__name__  
        modelo.fit(X_resampled, y_resampled)
        y_pred = modelo.predict(X_test)
    
        modelo_name = modelo.__class__.__name__
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)
        
    mean_accuracy = sum(accuracies) / n_splits
    mean_precision = sum(precisions) / n_splits
    mean_recall = sum(recalls) / n_splits
    mean_f1 = sum(f1s) / n_splits

    # Armazene as métricas no dicionário
    metrics["Modelo"].append(modelo_name)
    metrics["Accuracy"].append(mean_accuracy)    
    metrics["Precision"].append(mean_precision)
    metrics["Recall"].append(mean_recall)
    metrics["F1-Score"].append(mean_f1)

# Exiba as métricas

metrics_df = pd.DataFrame(metrics)
print(metrics_df)



                   Modelo  Accuracy  Precision    Recall  F1-Score
0  DecisionTreeClassifier  0.741747   0.107666  0.718248  0.187168
1    KNeighborsClassifier  0.783048   0.133626  0.770604  0.227475
2  RandomForestClassifier  0.760701   0.125615  0.800662  0.217063
3      LogisticRegression  0.773312   0.134220  0.822772  0.230772
4           MLPClassifier  0.694175   0.110553  0.872016  0.195144


## Treinar modelos com  Kfold e oversampling

In [40]:
modelos = [ad_clf, knn_clf, rf_clf, lg_clf, mlp_clf]
metrics = {"Modelo": [], "Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

n_splits = 5
cv = StratifiedKFold(n_splits=n_splits)
over = SMOTE()


for modelo in modelos:

    accuracies = []
    precisions = []
    recalls = []
    f1s = []    

    # Realize a validação cruzada com undersampling
    for train_idx, test_idx in cv.split(X, y):
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]

    # Realize o undersampling nos dados de treinamento
        X_resampled, y_resampled =over.fit_resample(X_train, y_train)
    
        modelo_name = modelo.__class__.__name__  
        modelo.fit(X_resampled, y_resampled)
        y_pred = modelo.predict(X_test)
    
        modelo_name = modelo.__class__.__name__
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)
        
    mean_accuracy = sum(accuracies) / n_splits
    mean_precision = sum(precisions) / n_splits
    mean_recall = sum(recalls) / n_splits
    mean_f1 = sum(f1s) / n_splits

    # Armazene as métricas no dicionário
    metrics["Modelo"].append(modelo_name)
    metrics["Accuracy"].append(mean_accuracy)    
    metrics["Precision"].append(mean_precision)
    metrics["Recall"].append(mean_recall)
    metrics["F1-Score"].append(mean_f1)

# Exiba as métricas

metrics_df = pd.DataFrame(metrics)
print(metrics_df)

                   Modelo  Accuracy  Precision    Recall  F1-Score
0  DecisionTreeClassifier  0.876494   0.122614  0.322835  0.177573
1    KNeighborsClassifier  0.884728   0.165067  0.439870  0.239989
2  RandomForestClassifier  0.890021   0.139924  0.322835  0.194988
3      LogisticRegression  0.799320   0.133249  0.701062  0.223902
4           MLPClassifier  0.822978   0.136884  0.617085  0.223584


In [7]:
from imblearn.under_sampling import ClusterCentroids,RandomUnderSampler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, stratify=y)
under = RandomUnderSampler()
X_resampled, y_resampled =under.fit_resample(X_train, y_train)

pipelines = {
    'Decision Tree': Pipeline([
        ('classifier', DecisionTreeClassifier())
    ]),
    'Random Forest': Pipeline([
        ('classifier', RandomForestClassifier())
    ]),
    'MLP Classifier': Pipeline([
        ('classifier', MLPClassifier())
    ]),
    'KNN': Pipeline([
        ('classifier', KNeighborsClassifier())
    ]),
    'Logistic Regression': Pipeline([
        ('classifier', LogisticRegression())
    ]),
}

param_grids = {
    'Decision Tree':{
        'classifier__min_samples_split': [2,5,10],
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__criterion': ['gini', 'entropy']
    },
    'Random Forest':{
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__n_estimators': [50,100,150],
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__criterion': ['gini', 'entropy']
    },
    'MLP Classifier':{
        'classifier__hidden_layer_sizes': [(50,), (100,50), (100,100,50)],
        'classifier__activation': ['identity', 'logistic', 'tanh', 'relu'],
        'classifier__alpha': [0.0001, 0.001, 0.01],
        'classifier__max_iter': [500],
        'classifier__learning_rate': ['adaptive']
    },
    'KNN':{
        'classifier__n_neighbors': [3,5,7,9],
        'classifier__weights': ['uniform', 'distance'],
        'classifier__metric': ['euclidean', 'manhattan']
    },
    'Logistic Regression':{
        'classifier__max_iter': [1000,2000],
        'classifier__class_weight': ['none', 'balanced']
    }
}

scoring = ['precision', 'recall']
for model_name, pipeline in pipelines.items():
  
  #grid_search = GridSearchCV(pipeline, param_grids[model_name], scoring=scoring, refit=False)  
  grid_search = GridSearchCV(pipeline, param_grids[model_name], scoring='recall')
  grid_search.fit(X_resampled, y_resampled)

  best_params = grid_search.best_params_
  best_model = grid_search.best_estimator_

  y_pred = best_model.predict(X_test)
  recall = recall_score(y_test, y_pred)

  print("Modelo:", model_name)
  print("Melhores Parâmetros:", best_params)
  print("Recall:", recall)

Modelo: Decision Tree
Melhores Parâmetros: {'classifier__criterion': 'entropy', 'classifier__max_depth': 5, 'classifier__min_samples_split': 2}
Recall: 0.8803827751196173
Modelo: Random Forest
Melhores Parâmetros: {'classifier__criterion': 'entropy', 'classifier__max_depth': 5, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 100}
Recall: 0.8995215311004785




Modelo: MLP Classifier
Melhores Parâmetros: {'classifier__activation': 'identity', 'classifier__alpha': 0.01, 'classifier__hidden_layer_sizes': (100, 100, 50), 'classifier__learning_rate': 'adaptive', 'classifier__max_iter': 500}
Recall: 0.9521531100478469
Modelo: KNN
Melhores Parâmetros: {'classifier__metric': 'euclidean', 'classifier__n_neighbors': 9, 'classifier__weights': 'uniform'}
Recall: 0.784688995215311
Modelo: Logistic Regression
Melhores Parâmetros: {'classifier__class_weight': 'balanced', 'classifier__max_iter': 1000}
Recall: 0.8133971291866029


10 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 427, in fit
    self._final_estimator.fit(Xt, y, **fit_param