In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import json
import numpy as np

In [5]:
print("="*70)
print("LOGISTIC REGRESSION - 5-FOLD CROSS VALIDATION")
print("="*70)

# ===== LOAD DATA =====
# np.load on an .npz archive returns an NpzFile that keeps the underlying
# file open and only reads an array when it is indexed. Pull every split out
# inside the `with` block so the file handle is closed as soon as we are done.
with np.load("brain_mri_preprocessed.npz") as data:
    X_train_flat = data["X_train"]
    y_train      = data["y_train"]

    X_val_flat   = data["X_val"]
    y_val        = data["y_val"]

    X_test_flat  = data["X_test"]
    y_test       = data["y_test"]

# ===== CONFIGURE MODEL =====
# Fixed random_state keeps the fit reproducible across runs.
lr_model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=5000,
    random_state=42
)

LOGISTIC REGRESSION - 5-FOLD CROSS VALIDATION


In [6]:
# ===== K-FOLD CROSS VALIDATION =====
print("\n[1/4] Running 5-Fold Cross Validation on training set...")

# Stratified folds keep the class proportions the same in every split
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross validation (takes ~1-2 min)
cv_scores = cross_val_score(
    estimator=lr_model,
    X=X_train_flat,  # flattened feature matrix
    y=y_train,
    cv=skf,
    scoring='accuracy',
    n_jobs=-1,  # use every CPU core
)

# Per-fold accuracy followed by summary statistics over the folds
print("\n5-Fold CV Results:")
for i, score in enumerate(cv_scores, start=1):
    print(f"  Fold {i}: {score:.4f}")
print("  " + "-"*40)
print(
    f"  Mean:    {cv_scores.mean():.4f}",
    f"  Std Dev: {cv_scores.std():.4f}",
    f"  Min:     {cv_scores.min():.4f}",
    f"  Max:     {cv_scores.max():.4f}",
    sep="\n",
)


[1/4] Running 5-Fold Cross Validation on training set...

5-Fold CV Results:
  Fold 1: 0.9734
  Fold 2: 0.9627
  Fold 3: 0.9808
  Fold 4: 0.9733
  Fold 5: 0.9744
  ----------------------------------------
  Mean:    0.9729
  Std Dev: 0.0058
  Min:     0.9627
  Max:     0.9808


In [7]:
# ===== TRAIN ON THE FULL TRAINING SET =====
print("\n[2/4] Training on full training set...")
# fit() returns the estimator itself, so rebinding keeps the same fitted object
lr_model = lr_model.fit(X_train_flat, y_train)
print("✓ Model trained")


[2/4] Training on full training set...
✓ Model trained


In [8]:
# ===== EVALUATE ON THE VALIDATION SET =====
print("\n[3/4] Evaluating on validation set...")
y_val_pred = lr_model.predict(X_val_flat)

# Score the same predictions with each metric, in the order they are reported
val_acc, val_prec, val_rec, val_f1 = (
    metric(y_val, y_val_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    "Validation Set Performance:",
    f"  Accuracy:  {val_acc:.4f}",
    f"  Precision: {val_prec:.4f}",
    f"  Recall:    {val_rec:.4f}",
    f"  F1-score:  {val_f1:.4f}",
    sep="\n",
)


[3/4] Evaluating on validation set...
Validation Set Performance:
  Accuracy:  0.9626
  Precision: 0.9787
  Recall:    0.9707
  F1-score:  0.9747


In [9]:
# ===== EVALUATE ON THE TEST SET =====
print("\n[4/4] Final evaluation on test set...")
y_test_pred = lr_model.predict(X_test_flat)

# Same four metrics as for the validation set, computed on the held-out test split
test_acc, test_prec, test_rec, test_f1 = (
    metric(y_test, y_test_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    "\nTest Set Performance:",
    f"  Accuracy:  {test_acc:.4f}",
    f"  Precision: {test_prec:.4f}",
    f"  Recall:    {test_rec:.4f}",
    f"  F1-score:  {test_f1:.4f}",
    sep="\n",
)


[4/4] Final evaluation on test set...

Test Set Performance:
  Accuracy:  0.9826
  Precision: 0.9989
  Recall:    0.9777
  F1-score:  0.9882


In [10]:
# Confusion matrix for the test predictions
# (scikit-learn convention: rows are true labels, columns are predicted labels)
cm_test = confusion_matrix(y_true=y_test, y_pred=y_test_pred)
print("\nConfusion Matrix (Test):", cm_test, sep="\n")


Confusion Matrix (Test):
[[308   1]
 [ 20 876]]


In [11]:
# ===== OVERFITTING CHECK =====
# Gap between accuracy on the data the model was fitted on and accuracy on
# the held-out test set; a large positive gap suggests overfitting.
train_acc = lr_model.score(X_train_flat, y_train)
gap = train_acc - test_acc

banner = "="*70
print("\n" + banner)
print("OVERFITTING ANALYSIS")
print(banner)
print(f"  Training Accuracy: {train_acc:.4f}")
print(f"  Test Accuracy:     {test_acc:.4f}")
print(f"  Gap:               {gap:.4f} ({gap*100:.2f}%)")

# Thresholds are checked in ascending order; the first one the gap falls
# under decides the label, otherwise the pessimistic default stays.
status = "✗ Possible overfitting (gap > 3%)"
for threshold, label in (
    (0.01, "✓ Excellent generalization (gap < 1%)"),
    (0.03, "⚠ Good generalization (gap < 3%)"),
):
    if gap < threshold:
        status = label
        break
print(f"  Status: {status}")



OVERFITTING ANALYSIS
  Training Accuracy: 1.0000
  Test Accuracy:     0.9826
  Gap:               0.0174 (1.74%)
  Status: ⚠ Good generalization (gap < 3%)


In [12]:
# ===== SAVE RESULTS =====
# Read the hyperparameters back from the estimator instead of repeating the
# literals, so the saved report cannot drift from the model that was trained.
model_params = lr_model.get_params()

# Metrics are cast to built-in float / list because numpy scalars and arrays
# are not JSON-serializable.
lr_results = {
    'model': 'Logistic Regression',
    'hyperparameters': {
        'solver': model_params['solver'],
        'C': model_params['C'],
        'max_iter': model_params['max_iter']
    },
    'cross_validation': {
        'mean_accuracy': float(cv_scores.mean()),
        'std_dev': float(cv_scores.std()),
        'min': float(cv_scores.min()),
        'max': float(cv_scores.max()),
        'fold_scores': cv_scores.tolist()
    },
    'validation_set': {
        'accuracy': float(val_acc),
        'precision': float(val_prec),
        'recall': float(val_rec),
        'f1_score': float(val_f1)
    },
    'test_set': {
        'accuracy': float(test_acc),
        'precision': float(test_prec),
        'recall': float(test_rec),
        'f1_score': float(test_f1)
    },
    'overfitting': {
        'train_accuracy': float(train_acc),
        'test_accuracy': float(test_acc),
        'gap': float(gap),
        'status': status
    }
}

# Save as JSON; the encoding is explicit so the file does not depend on the
# platform's default locale encoding.
with open('lr_cv_results.json', 'w', encoding='utf-8') as f:
    json.dump(lr_results, f, indent=4)

print("\n✓ Results saved to: lr_cv_results.json")
print("="*70)
print("LOGISTIC REGRESSION - COMPLETED ✓")
print("="*70)



✓ Results saved to: lr_cv_results.json
LOGISTIC REGRESSION - COMPLETED ✓
