In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
import seaborn as sns
# Load the Titanic passenger table through seaborn's dataset loader.
titanic = sns.load_dataset('titanic')

# Report how many passengers fall into each value of the target column.
survival_counts = titanic['survived'].value_counts()
print("Distribuição das classes:\n", survival_counts)

# Preview the first rows of the raw table.
titanic.head()

Distribuição das classes:
 survived
0    549
1    342
Name: count, dtype: int64


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [None]:
# Columns that must be non-null for a row to be kept (rows are dropped, for simplicity).
required_columns = ['age', 'embarked', 'sex', 'pclass', 'fare', 'survived']
# Columns treated as redundant and removed from the table.
redundant_columns = ["class", "who", "adult_male", "deck", "alive", "alone"]
# Predictors handed to the model.
feature_columns = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']

# Clean the table in one chain: drop incomplete rows, then the redundant columns.
# errors='ignore' keeps the drop from failing when those columns are already gone.
titanic = (
    titanic
    .dropna(subset=required_columns)
    .drop(columns=redundant_columns, errors='ignore')
)

# Target vector.
y = titanic['survived']

# Feature matrix with categorical columns one-hot encoded; drop_first removes
# one dummy level per encoded column.
X = pd.get_dummies(titanic[feature_columns], drop_first=True)

titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,embark_town
0,0,3,male,22.0,1,0,7.25,S,Southampton
1,1,1,female,38.0,1,0,71.2833,C,Cherbourg
2,1,3,female,26.0,0,0,7.925,S,Southampton
3,1,1,female,35.0,1,0,53.1,S,Southampton
4,0,3,male,35.0,0,0,8.05,S,Southampton


In [None]:
from sklearn.linear_model import LogisticRegression # Import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score

# Stratified 70/30 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# Standardize the features: the scaler is fit on the training rows only and
# then applied unchanged to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hold-out evaluation with a logistic-regression classifier.
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)  # predicted class labels

# Precision, recall and F1 on the held-out rows (scikit-learn's default averaging).
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("\nMétodo Hold-Out:")
print(f"Precisão: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
# This matrix belongs to the hold-out split, so it is labelled as such
# (the label previously said "Leave-One-Out").
print(f"Matriz de Confusão (Hold-Out):\n {confusion_matrix(y_test, y_pred)}")


Método Hold-Out:
Precisão: 0.7294
Recall: 0.7126
F1-Score: 0.7209
Matriz de Confusão (Leave-One-Out):
 [[104  23]
 [ 25  62]]


In [None]:
# Leave-One-Out cross-validation (LOO): each row is held out exactly once while
# the model is trained on all remaining rows.
print("\nMétodo Leave-One-Out:")
loo = LeaveOneOut()
y_true = []
y_pred = []

# Split on X directly so this cell does not depend on a variable that is only
# created by a later cell.
for train_index, test_index in loo.split(X):
    # Fit the scaler on the training rows only, so the held-out row never
    # influences the standardization it is evaluated with.
    fold_scaler = StandardScaler()
    X_train = fold_scaler.fit_transform(X.iloc[train_index])
    X_test = fold_scaler.transform(X.iloc[test_index])
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)
    y_pred_single = model.predict(X_test)

    # Each fold contributes exactly one (truth, prediction) pair.
    y_true.append(y_test.values[0])
    y_pred.append(y_pred_single[0])

# A one-sample fold cannot produce a meaningful F1 (it is always 0 or 1), so
# F1 is computed once over the pooled predictions instead of averaged per fold.
f1_loo = f1_score(y_true, y_pred, average='macro')
# Per-fold accuracy is 0 or 1; its mean equals the fraction of correct predictions.
accuracy_loo = np.mean(np.array(y_true) == np.array(y_pred))

# Final LOO results.
print(f"F1-Score (Leave-One-Out): {f1_loo:.4f}")
print(f"Média do Accuracy (Leave-One-Out): {accuracy_loo:.4f}")
print(f"Matriz de Confusão (Leave-One-Out):\n {confusion_matrix(y_true, y_pred)}")


Método Leave-One-Out:


NameError: name 'X_scaled' is not defined

In [None]:
from sklearn.metrics import f1_score, confusion_matrix, precision_score, accuracy_score
import numpy as np
from sklearn.model_selection import KFold

# Globally standardized copy of the features. It is NOT used for the K-Fold
# evaluation below (that would leak test-fold statistics into training); it is
# kept because other cells in this notebook read X_scaled.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 5-fold cross-validation with shuffling; change n_splits to use another K.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
f1_scores_kf = []
precision_scores_kf = []
accuracy_scores_kf = []
y_true_kf = []
y_pred_kf = []

for train_index, test_index in kf.split(X):
    # Fit the scaler on the training fold only and reuse it for the test fold,
    # matching how the hold-out evaluation standardizes its data.
    fold_scaler = StandardScaler()
    X_train = fold_scaler.fit_transform(X.iloc[train_index])
    X_test = fold_scaler.transform(X.iloc[test_index])
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)
    y_pred_single = model.predict(X_test)

    # Pool truths and predictions across folds for the overall confusion matrix.
    y_true_kf.extend(y_test)
    y_pred_kf.extend(y_pred_single)

    # Per-fold macro F1.
    f1 = f1_score(y_test, y_pred_single, average='macro')
    f1_scores_kf.append(f1)

    # Per-fold macro precision.
    precision = precision_score(y_test, y_pred_single, average='macro')
    precision_scores_kf.append(precision)

    # Per-fold accuracy.
    accuracy = accuracy_score(y_test, y_pred_single)
    accuracy_scores_kf.append(accuracy)

# Final K-Fold results: fold averages plus the pooled confusion matrix.
print(f"Média do F1-Score (K-Fold): {np.mean(f1_scores_kf):.4f}")
print(f"Média da Precisão (K-Fold): {np.mean(precision_scores_kf):.4f}")
print(f"Média da Acurácia (K-Fold): {np.mean(accuracy_scores_kf):.4f}")
print(f"Matriz de Confusão (K-Fold):\n {confusion_matrix(y_true_kf, y_pred_kf)}")

Média do F1-Score (K-Fold): 0.7850
Média da Precisão (K-Fold): 0.7959
Média da Acurácia (K-Fold): 0.7964
Matriz de Confusão (K-Fold):
 [[364  60]
 [ 85 203]]


In [None]:
from sklearn.metrics import f1_score, confusion_matrix, precision_score, accuracy_score
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
import numpy as np

f1_scores_bootstrap = []
precision_scores_bootstrap = []
accuracy_scores_bootstrap = []
y_true_bootstrap = []
y_pred_bootstrap = []

# Number of bootstrap iterations.
n_iterations = 100
all_indices = np.arange(X_scaled.shape[0])

for iteration in range(n_iterations):
    # Draw row indices with replacement. The seed changes on every iteration:
    # a constant seed would reproduce the exact same sample each time, making
    # all iterations identical while still looking like 100 independent runs.
    boot_indices = resample(all_indices, replace=True, random_state=42 + iteration)

    # Evaluate on the out-of-bag rows (those never drawn in this iteration).
    # Splitting the resampled set itself would place copies of the same row in
    # both the training and the test partition.
    oob_indices = np.setdiff1d(all_indices, boot_indices)

    X_train, y_train = X_scaled[boot_indices], y.iloc[boot_indices]
    X_test, y_test = X_scaled[oob_indices], y.iloc[oob_indices]

    model.fit(X_train, y_train)
    y_pred_single = model.predict(X_test)

    # Pool truths and predictions across iterations for the overall confusion matrix.
    y_true_bootstrap.extend(y_test)
    y_pred_bootstrap.extend(y_pred_single)

    # Per-iteration macro F1.
    f1 = f1_score(y_test, y_pred_single, average='macro')
    f1_scores_bootstrap.append(f1)

    # Per-iteration macro precision.
    precision = precision_score(y_test, y_pred_single, average='macro')
    precision_scores_bootstrap.append(precision)

    # Per-iteration accuracy.
    accuracy = accuracy_score(y_test, y_pred_single)
    accuracy_scores_bootstrap.append(accuracy)

# Final bootstrap results: iteration averages plus the pooled confusion matrix.
print(f"Média do F1-Score (Bootstrap): {np.mean(f1_scores_bootstrap):.4f}")
print(f"Média da Precisão (Bootstrap): {np.mean(precision_scores_bootstrap):.4f}")
print(f"Média da Acurácia (Bootstrap): {np.mean(accuracy_scores_bootstrap):.4f}")
print(f"Matriz de Confusão (Bootstrap):\n {confusion_matrix(y_true_bootstrap, y_pred_bootstrap)}")


Média do F1-Score (Bootstrap): 0.7853
Média da Precisão (Bootstrap): 0.7930
Média da Acurácia (Bootstrap): 0.7944
Matriz de Confusão (Bootstrap):
 [[10700  1700]
 [ 2700  6300]]
