<a href="https://colab.research.google.com/github/matcgoes/scripts-uteis/blob/main/threshold_tunning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import plotly.express as px
import numpy as np
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve, auc
import pandas as pd

def find_optimal_threshold(model, X, y):
    """Sweep decision thresholds for a binary classifier and pick the F1-optimal one.

    For every threshold on a 0.00-1.00 grid (step 0.01) the positive-class
    probabilities from ``model.predict_proba`` are binarised and accuracy,
    precision, recall and F1 are computed against ``y``. The function then:

    * prints the confusion matrix and metrics at the threshold with the highest F1,
    * shows a plotly figure with the precision / recall / F1 curves,
    * prints the ROC threshold maximising ``tpr - fpr`` and the Gini
      coefficient (``2 * AUC - 1``),
    * shows a plotly figure with the ROC curve.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict_proba(X)``; column 1 of its output
        is used as the positive-class probability.
    X : array-like
        Feature matrix passed unchanged to ``model.predict_proba``.
    y : array-like
        True labels. Must contain only 0 and 1, because predictions are
        encoded as 0/1 integers.

    Returns
    -------
    optimal_threshold : float
        Grid threshold with the highest F1 (the first one in case of ties).
    df_metrics : pandas.DataFrame
        One row per threshold with columns ``threshold``, ``accuracy``,
        ``precision``, ``recall`` and ``f1_score``.

    Raises
    ------
    ValueError
        If ``y`` contains values other than 0 and 1.
    """
    y_true = np.asarray(y)
    # Predictions below are 0/1 integers, so only 0/1 labels can be scored.
    # Failing here is clearer than an unpacking error deep inside the loop.
    if not np.isin(y_true, [0, 1]).all():
        raise ValueError("y must contain only the binary labels 0 and 1")

    # Predicted probability of the positive class
    y_proba = model.predict_proba(X)[:, 1]

    # linspace guarantees exactly 101 points including 1.0, and rounding removes
    # float noise (e.g. 0.5700000000000001) so lookups by threshold value work.
    threshold_grid = np.round(np.linspace(0.0, 1.0, 101), 2)

    f1_scores = []
    accuracies = []
    precisions = []
    recalls = []

    # Compute the metrics for each threshold
    for threshold in threshold_grid:
        y_pred = (y_proba >= threshold).astype(int)
        # labels=[0, 1] forces a 2x2 matrix even when only one class is present
        # in both y and y_pred; otherwise the 4-way unpacking would fail.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        # Undefined ratios (no predicted / no actual positives) are reported as 0
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1_score = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
        f1_scores.append(f1_score)
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)

    # Threshold that maximises F1 (np.argmax returns the first maximum)
    optimal_threshold_idx = np.argmax(f1_scores)
    optimal_threshold = threshold_grid[optimal_threshold_idx]

    # Confusion matrix and headline metrics at the chosen threshold
    tn, fp, fn, tp = confusion_matrix(
        y_true, (y_proba >= optimal_threshold).astype(int), labels=[0, 1]
    ).ravel()
    confusion = np.array([[tn, fp], [fn, tp]])
    print(f"Matriz de Confusão:\n{confusion}")
    print(f"Threshold ótimo: {optimal_threshold:.3f}")
    print(f"Acurácia: {accuracies[optimal_threshold_idx]:.3f}")
    print(f"Precisão: {precisions[optimal_threshold_idx]:.3f}")
    print(f"Recall: {recalls[optimal_threshold_idx]:.3f}")
    print(f"F1-Score: {f1_scores[optimal_threshold_idx]:.3f}")

    # Per-threshold metrics table (also returned to the caller)
    df_metrics = pd.DataFrame({'threshold': threshold_grid,
                               'accuracy': accuracies,
                               'precision': precisions,
                               'recall': recalls,
                               'f1_score': f1_scores})

    # Precision, recall and F1 as functions of the threshold
    fig = px.line(df_metrics, x='threshold', y=['precision', 'recall', 'f1_score'],
                  labels={'threshold': 'Threshold', 'value': 'Métrica'},
                  title='Curvas de Precisão, Recall e F1-Score')
    fig.add_vline(x=optimal_threshold, line_dash="dash", line_color="black",
                  annotation_text="Threshold ótimo", annotation_position="bottom right")
    fig.update_layout(showlegend=True)
    fig.show()

    # ROC curve and AUC. A separate name keeps the sweep grid from being shadowed.
    fpr, tpr, roc_thresholds = roc_curve(y_true, y_proba)
    auc_score = auc(fpr, tpr)
    # roc_curve prepends a sentinel threshold (inf, or max score + 1 in older
    # scikit-learn) for the (0, 0) point; skip it so it is never reported as
    # the "optimal" cut-off when no point beats the diagonal.
    first_real = 1 if len(roc_thresholds) > 1 else 0
    optimal_idx_roc = first_real + int(np.argmax((tpr - fpr)[first_real:]))
    optimal_threshold_roc = roc_thresholds[optimal_idx_roc]
    print("Threshold otimo ROC: ", optimal_threshold_roc)
    print(f"Gini: {auc_score*2 -1:.3f}")

    # ROC curve with the chance diagonal for reference
    fig = px.line(x=fpr, y=tpr, labels={'x':'Taxa de Falsos Positivos', 'y':'Taxa de Verdadeiros Positivos'},
                title=f"Curva ROC (AUC = {auc_score:.3f})")
    fig.add_shape(type='line', x0=0, x1=1, y0=0, y1=1, line=dict(dash='dash'), opacity=0.7)
    fig.update_traces(mode='lines', line_width=2, line_color='navy')
    fig.update_layout(xaxis_range=[0, 1], yaxis_range=[0, 1], width=600, height=500)
    fig.show()

    # Return the F1-optimal threshold and the per-threshold metrics table
    return optimal_threshold, df_metrics


In [None]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Named settings so the seed and split size are defined once and easy to tune
SEED = 42
TEST_SIZE = 0.2

# Generate a synthetic binary classification dataset
X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, random_state=SEED)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=SEED)

# Fit a logistic regression model
model = LogisticRegression(random_state=SEED)
model.fit(X_train, y_train)

# Search for the optimal threshold.
# NOTE: the threshold is selected on the same test split it is evaluated on, so
# the printed metrics are optimistic; use a separate validation split when the
# numbers matter.
optimal_threshold, df_metrics = find_optimal_threshold(model, X_test, y_test)

# Metrics for every threshold; a bare last expression gets the notebook's rich
# table rendering instead of print()'s plain text
df_metrics


Matriz de Confusão:
[[81 31]
 [ 2 86]]
Threshold ótimo: 0.280
Acurácia: 0.835
Precisão: 0.735
Recall: 0.977
F1-Score: 0.839


Threshold otimo ROC:  0.327639680323879
Gini: 0.824


     threshold  accuracy  precision    recall  f1_score
0         0.00     0.440   0.440000  1.000000  0.611111
1         0.01     0.490   0.463158  1.000000  0.633094
2         0.02     0.505   0.470588  1.000000  0.640000
3         0.03     0.555   0.497175  1.000000  0.664151
4         0.04     0.570   0.505747  1.000000  0.671756
..         ...       ...        ...       ...       ...
96        0.96     0.595   0.888889  0.090909  0.164948
97        0.97     0.580   0.833333  0.056818  0.106383
98        0.98     0.570   0.750000  0.034091  0.065217
99        0.99     0.555   0.000000  0.000000  0.000000
100       1.00     0.560   0.000000  0.000000  0.000000

[101 rows x 5 columns]


In [None]:
# Thresholds are computed floats, so match with a tolerance: an exact `==`
# silently returns no row whenever the grid value carries rounding noise.
df_metrics.loc[np.isclose(df_metrics['threshold'], 0.33)]

Unnamed: 0,threshold,accuracy,precision,recall,f1_score
33,0.33,0.835,0.743363,0.954545,0.835821
