## Imports

In [None]:
# Standard
import pandas as pd
import numpy as np
import pathlib
import matplotlib.pyplot as plt
import seaborn as sns

# ML
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score, roc_curve, auc, confusion_matrix

## Constantes

In [None]:
# Seed passed as random_state throughout the notebook so runs are repeatable.
SEED = 42

# Hex colours shared by the plots.
COLOR_GENERAL = '#95a5a6'
COLOR_DISASTER = '#e74c3c'
COLOR_NO_DISASTER = '#3498db'

## Datos

In [None]:
# Directory holding the raw CSV files, relative to the working directory.
data_path = pathlib.Path("../.data/raw")

# df carries the `target` column used for fitting; test_df is only used to
# produce the submission predictions.
df, test_df = (
    pd.read_csv(data_path / csv_name) for csv_name in ("train.csv", "test.csv")
)

In [None]:
# Mean of the `target` column, reported below as the percentage of disasters.
# NOTE(review): that reading assumes `target` is coded 0/1 -- confirm.
target_mean = df['target'].mean()
print(
    f'Shape del dataset: {df.shape}',
    f'Porcentaje de desastres en el target: {target_mean*100:.2f}%',
    sep='\n',
)
# Trailing expression: the notebook renders five seeded sample rows.
df.sample(n=5, random_state=SEED)

## Feature Engineering

In [None]:
# The DummyClassifier does not need real features, only the target,
# so the `id` column is passed purely as a placeholder feature matrix.
y = df['target'].values
X = df[['id']]

# 80/20 hold-out split, stratified on the target and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
    stratify=y,
)

print(f'Shape de X_train: {X_train.shape}')

## Entrenamiento del modelo

In [None]:
# Dummy model: always predicts the most frequent class seen during fit.
# random_state is not used by strategy="most_frequent"; it is kept so the cell
# stays reproducible if the strategy is switched to a randomised one.
dummy_clf = DummyClassifier(strategy="most_frequent", random_state=SEED)
dummy_clf.fit(X_train, y_train)

# predict() already returns a class label, so print it directly instead of
# using it as a positional index into classes_ (which is only correct when the
# labels happen to be 0..k-1). A one-row DataFrame slice keeps the input the
# same type as the one used in fit().
most_frequent_class = dummy_clf.predict(X_train.iloc[:1])[0]
print(f'Clase más frecuente: {most_frequent_class}')

## Evaluación y Visualización

In [None]:
# Predictions on the hold-out split: hard labels, plus the probability column
# at index 1 of predict_proba (the second entry of classes_; label 1 for a
# 0/1 target).
y_pred = dummy_clf.predict(X_test)
y_probs = dummy_clf.predict_proba(X_test)[:, 1]

# F1 score
# A constant classifier may never predict the positive class, which leaves
# precision undefined. zero_division=0 returns the same 0.0 the default would,
# but without raising UndefinedMetricWarning for this expected situation.
f1 = f1_score(y_test, y_pred, zero_division=0)
print(f'F1-Score: {f1:.4f}')

# 1. ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)

# Draw on an explicit Axes instead of the implicit pyplot state.
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, color=COLOR_DISASTER, lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
roc_ax.plot([0, 1], [0, 1], color=COLOR_GENERAL, lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) - Dummy Classifier')
roc_ax.legend(loc="lower right")
plt.show()

# 2. Confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Annotated heatmap of the raw counts, drawn on an explicit Axes.
cm_fig, cm_ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=cm_ax)
cm_ax.set_title('Confusion Matrix - Dummy Classifier')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
plt.show()

## Submission

In [None]:
# Build the submission: predict on the test ids with the fitted dummy model.
X_submission = test_df[['id']]
y_pred_sub = dummy_clf.predict(X_submission)

submission = pd.DataFrame({'id': test_df['id'], 'target': y_pred_sub})

# Create the output directory (and any missing parents) if needed.
submission_path = pathlib.Path("../.data/submission")
submission_path.mkdir(parents=True, exist_ok=True)

# NOTE(review): the file name says "random", but the classifier fitted in this
# notebook uses strategy="most_frequent" -- confirm the intended name.
submission_file = submission_path / "random_prediction_submission.csv"
submission.to_csv(submission_file, index=False)

print(f"Submission guardada en {submission_file}")