## Imports

## Imports

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys

# Agregar path para importar ml_utils
sys.path.append(str(Path('.').resolve()))
from ml_utils import evaluate_model, plot_feature_importance, compare_models

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, make_scorer

import matplotlib.pyplot as plt
import seaborn as sns

# Seed for reproducibility: reused as random_state below and applied to NumPy's global RNG
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

## Cargar Datos

In [None]:
# Directory holding the preprocessed pickles, relative to the notebook's working directory
DATA_PATH = Path("../.data/processed/")

# NOTE(review): unpickling can execute arbitrary code; only load pickles produced
# by this project's own preprocessing step.
train_df = pd.read_pickle(DATA_PATH / "train.pkl")

# Quick sanity report: size, class counts and positive-class share
print(f"Train dataset: {train_df.shape}")
print("\nDistribución del target:")
print(train_df['target'].value_counts().sort_index())
print(f"\nProporción de disasters: {train_df['target'].mean():.2%}")

## Preparar Features

Usaremos:
- **7 features numéricas**: text_length, word_count, hashtag_count, mention_count, url_count, uppercase_percentage, punctuation_percentage
- **TF-IDF**: 150 features del texto lematizado (incluye bigrams)
- **Mean Encoding**: 1 feature codificando keywords por su relación con el target

In [None]:
# Numeric feature columns read from train_df
numeric_features = [
    'text_length',
    'word_count',
    'hashtag_count',
    'mention_count',
    'url_count',
    'uppercase_percentage',
    'punctuation_percentage'
]

print(f"Features numéricas: {len(numeric_features)}")

# Feature groups and target. Missing lemmatized text becomes '' and missing
# keywords become the placeholder category 'unknown'.
X_numeric = train_df[numeric_features]
X_text = train_df['text_lemmatized'].fillna('')
X_keyword = train_df['keyword_clean'].fillna('unknown')
y = train_df['target']

print("\nShapes:")
print(f"  X_numeric: {X_numeric.shape}")
print(f"  X_text: {X_text.shape}")
print(f"  X_keyword: {X_keyword.shape}")
print(f"  y: {y.shape}")

## Split Train/Validation

80/20 split estratificado.

In [None]:
# Split positional row indices first so every feature group shares the same
# stratified 80/20 partition.
train_idx, val_idx = train_test_split(
    np.arange(len(train_df)),
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE
)

# Slice each feature group and the target with the shared indices
X_numeric_train = X_numeric.iloc[train_idx]
X_numeric_val = X_numeric.iloc[val_idx]

X_text_train = X_text.iloc[train_idx]
X_text_val = X_text.iloc[val_idx]

X_keyword_train = X_keyword.iloc[train_idx]
X_keyword_val = X_keyword.iloc[val_idx]

y_train = y.iloc[train_idx]
y_val = y.iloc[val_idx]

print(f"Train set: {len(y_train)} samples")
print(f"Val set: {len(y_val)} samples")
print(f"\nDistribución en train: {y_train.value_counts().to_dict()}")
print(f"Distribución en val: {y_val.value_counts().to_dict()}")

## Mean Encoding para Keywords

El **Mean Encoding** (Target Encoding) reemplaza cada categoría por la media del target para esa categoría.

Ventajas vs OneHot:
- Reduce dimensionalidad (1 columna vs N columnas)
- Captura relación directa con el target
- Funciona mejor con XGBoost

⚠️ Importante: Solo calculamos las medias en el train set para evitar data leakage.

In [None]:
# Per-keyword mean of the target, computed on the training split only so that
# validation/test rows never contribute to their own encoding.
# Grouping uses X_keyword_train (missing keywords already replaced by 'unknown')
# instead of the raw column: groupby drops NaN keys, so the raw column never
# produced an 'unknown' entry and those rows silently fell back to the global mean.
# Grouping by the underlying array is positional, so it does not depend on the
# DataFrame index being unique.
keyword_means = (
    y_train.groupby(X_keyword_train.to_numpy())
    .mean()
    .rename_axis('keyword_clean')
)

print("Mean Encoding por Keyword (Top 10):")
print(keyword_means.sort_values(ascending=False).head(10))
print("\nKeywords con menor mean (Top 10):")
print(keyword_means.sort_values().head(10))

# Keywords absent from the training split fall back to the global positive rate
global_mean = y_train.mean()

# NOTE(review): training rows are encoded with means that include their own
# target (no out-of-fold estimate or smoothing), which can overstate this
# feature's usefulness on the training split.
X_keyword_train_encoded = X_keyword_train.map(keyword_means).fillna(global_mean).values.reshape(-1, 1)
X_keyword_val_encoded = X_keyword_val.map(keyword_means).fillna(global_mean).values.reshape(-1, 1)

print("\n✅ Mean Encoding aplicado")
print(f"  Train shape: {X_keyword_train_encoded.shape}")
print(f"  Val shape: {X_keyword_val_encoded.shape}")
print(f"  Global mean (para keywords no vistos): {global_mean:.4f}")

## Feature Engineering Pipeline

1. StandardScaler para features numéricas
2. TfidfVectorizer para texto (max 150 features, bigrams)
3. Mean Encoding para keywords (ya aplicado)
4. Combinar todo

In [None]:
# 1. Scale numeric features; the scaler is fit on the training split only
scaler = StandardScaler()
X_numeric_train_scaled = scaler.fit_transform(X_numeric_train)
X_numeric_val_scaled = scaler.transform(X_numeric_val)

# 2. TF-IDF on the lemmatized text (the original note says this uses more
#    features than Model 1); vocabulary is learned from the training split only
tfidf = TfidfVectorizer(
    max_features=150,
    min_df=2,
    max_df=0.8,
    ngram_range=(1, 2)  # unigrams and bigrams
)
X_text_train_tfidf = tfidf.fit_transform(X_text_train).toarray()
X_text_val_tfidf = tfidf.transform(X_text_val).toarray()

print("Features transformadas:")
print(f"  Numéricas escaladas: {X_numeric_train_scaled.shape}")
print(f"  TF-IDF (texto + bigrams): {X_text_train_tfidf.shape}")
print(f"  Mean Encoding (keywords): {X_keyword_train_encoded.shape}")

# 3. Stack column-wise: numeric, TF-IDF, mean-encoded keyword.
#    The feature-name list built later relies on this exact order.
X_train_combined = np.hstack([
    X_numeric_train_scaled,
    X_text_train_tfidf,
    X_keyword_train_encoded
])

X_val_combined = np.hstack([
    X_numeric_val_scaled,
    X_text_val_tfidf,
    X_keyword_val_encoded
])

print("\n✅ Features combinadas:")
print(f"  Train: {X_train_combined.shape}")
print(f"  Validation: {X_val_combined.shape}")

## GridSearchCV - Búsqueda de Hiperparámetros

Buscaremos los mejores hiperparámetros para XGBoost.
Usaremos un grid reducido para hacer pruebas más rápidas.

In [None]:
# Hyperparameter grid (reduced for the demo)
param_grid = {
    'n_estimators': [200, 300],
    'max_depth': [5, 7, 9],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'gamma': [0, 0.1],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [1]
}

# Base estimator.
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases;
# confirm against the installed version before removing it.
xgb_base = XGBClassifier(
    random_state=RANDOM_STATE,
    eval_metric='logloss',
    use_label_encoder=False
)

# F1 scorer used to rank the candidates
f1_scorer = make_scorer(f1_score)

print("Iniciando GridSearchCV para XGBoost...")
print(f"Combinaciones a probar: {np.prod([len(v) for v in param_grid.values()])}")
print("⚠️ Esto puede tomar 15-30 minutos...\n")

# 5-fold cross-validated grid search.
# NOTE(review): the scaler, TF-IDF vocabulary and keyword means were fit on the
# whole training split before this search, so each CV fold's held-out rows have
# already influenced the features; CV scores are therefore likely optimistic.
grid_search = GridSearchCV(
    estimator=xgb_base,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=5,
    n_jobs=-1,
    verbose=2,
    return_train_score=True
)

grid_search.fit(X_train_combined, y_train)

print("\n✅ GridSearchCV completado")

## Resultados de GridSearchCV

In [None]:
# Report the best cross-validated score and the parameters that achieved it
print("=" * 70)
print("RESULTADOS DE GRIDSEARCHCV - XGBOOST".center(70))
print("=" * 70)

print(f"\n🏆 Mejor F1 Score (CV): {grid_search.best_score_:.4f}")
print("\n📋 Mejores hiperparámetros:")
for param, value in grid_search.best_params_.items():
    print(f"  {param}: {value}")

# Five best configurations by mean test-fold score
results_df = pd.DataFrame(grid_search.cv_results_)
top_5 = results_df.nlargest(5, 'mean_test_score')[['params', 'mean_test_score', 'std_test_score']]

print("\n🔝 Top 5 Configuraciones:")
for idx, row in top_5.iterrows():
    print(f"\n  F1 Score: {row['mean_test_score']:.4f} (±{row['std_test_score']:.4f})")
    print(f"  Params: {row['params']}")

print("=" * 70)

## Evaluación en Validación

In [None]:
# Best estimator found by the grid search
best_xgb = grid_search.best_estimator_

# Predict on the held-out validation split
y_pred_val = best_xgb.predict(X_val_combined)

# Evaluate with the project helper from ml_utils
results_xgb = evaluate_model(y_val, y_pred_val, "XGBoost (Best from GridSearchCV)")

## Feature Importance

XGBoost proporciona importancia basada en ganancia (gain).

In [None]:
# Column names in the same order the blocks were stacked into X_train_combined:
# numeric features, TF-IDF terms (prefixed with 'text_'), then the single
# mean-encoded keyword column.
tfidf_names = [f"text_{word}" for word in tfidf.get_feature_names_out()]
feature_names = [*numeric_features, *tfidf_names, 'keyword_mean_encoding']

print(f"Total feature names: {len(feature_names)}")
print(f"Total features en modelo: {X_train_combined.shape[1]}")

# Guard against names drifting out of sync with the stacked matrix
assert len(feature_names) == X_train_combined.shape[1], \
    "Mismatch entre feature names y número de features"

In [None]:
# Gain-based importance as a dict keyed by the booster's feature names
importance = best_xgb.get_booster().get_score(importance_type='gain')

# Align with feature_names by position; features missing from the dict get 0.
# assumes the booster names columns 'f0', 'f1', ... — TODO confirm for the
# installed XGBoost version
importance_array = np.array([importance.get(f'f{i}', 0) for i in range(len(feature_names))])

# Plot with the ml_utils helper; its return value is printed below
importance_df = plot_feature_importance(
    feature_names,
    importance_array,
    "XGBoost",
    top_n=20
)

# Show the first 10 rows of the returned table
print("\n🔝 Top 10 Features Más Importantes:")
print(importance_df.head(10).to_string())

## Análisis de Overfitting

In [None]:
# Predict on the training split to compare against validation
y_pred_train = best_xgb.predict(X_train_combined)

# Metrics for both splits (printing suppressed)
results_train = evaluate_model(y_train, y_pred_train, print_results=False)
results_val = evaluate_model(y_val, y_pred_val, print_results=False)

# Side-by-side table with the train-minus-validation gap per metric
comparison = pd.DataFrame({
    'Train': results_train,
    'Validation': results_val,
    'Diferencia': {k: results_train[k] - results_val[k] for k in results_train.keys()}
})

print("=" * 60)
print("COMPARACIÓN TRAIN vs VALIDATION".center(60))
print("=" * 60)
print(comparison.T.to_string())
print("=" * 60)

# Classify the F1 gap: more than 0.05 is flagged as possible overfitting,
# a negative gap means validation outperformed train.
diff_f1 = results_train['f1'] - results_val['f1']
if diff_f1 > 0.05:
    print(f"\n⚠️ Posible overfitting detectado (diferencia F1: {diff_f1:.4f})")
elif diff_f1 < 0:
    print(f"\n⚠️ Performance mejor en validación que train (diferencia F1: {diff_f1:.4f})")
else:
    print(f"\n✅ Buen balance entre train y validation (diferencia F1: {diff_f1:.4f})")

## Resumen del Modelo

In [None]:
# Final summary: scores, target check, configuration and best hyperparameters
print("=" * 70)
print("RESUMEN: XGBOOST CON GRIDSEARCHCV Y MEAN ENCODING".center(70))
print("=" * 70)

print("\n📊 Performance:")
print(f"  F1 Score (CV):         {grid_search.best_score_:.4f}")
print(f"  F1 Score (Train):      {results_train['f1']:.4f}")
print(f"  F1 Score (Validation): {results_val['f1']:.4f}  ⭐")

# Project goal: validation F1 strictly above 0.80
print("\n🎯 Target: F1 > 0.80")
if results_val['f1'] > 0.80:
    print(f"  ✅ OBJETIVO CUMPLIDO (F1 = {results_val['f1']:.4f})")
else:
    print(f"  ❌ Por debajo del objetivo (F1 = {results_val['f1']:.4f})")
    print(f"     Falta: {0.80 - results_val['f1']:.4f}")

print("\n🔧 Configuración:")
print("  Algoritmo: XGBoost (Gradient Boosting)")
print(f"  Features totales: {X_train_combined.shape[1]}")
print("  Encoding keywords: Mean Encoding (Target Encoding)")
print("  Búsqueda: GridSearchCV (5-fold CV)")
print(f"  Combinaciones probadas: {len(results_df)}")

print("\n🏆 Mejores hiperparámetros:")
for param, value in grid_search.best_params_.items():
    print(f"  {param}: {value}")

print("\n" + "=" * 70)

## Predicciones para Competencia

In [None]:
# Load the preprocessed test set
test_df = pd.read_pickle(DATA_PATH / "test.pkl")

print(f"Test dataset: {test_df.shape}")

# Same feature groups and missing-value handling as for the training data
X_test_numeric = test_df[numeric_features]
X_test_text = test_df['text_lemmatized'].fillna('')
X_test_keyword = test_df['keyword_clean'].fillna('unknown')

# Apply the transformers fitted on the training split (no refitting on test)
X_test_numeric_scaled = scaler.transform(X_test_numeric)
X_test_text_tfidf = tfidf.transform(X_test_text).toarray()
X_test_keyword_encoded = X_test_keyword.map(keyword_means).fillna(global_mean).values.reshape(-1, 1)

# Stack in the same column order used for training
X_test_combined = np.hstack([
    X_test_numeric_scaled,
    X_test_text_tfidf,
    X_test_keyword_encoded
])

print(f"Test features combinadas: {X_test_combined.shape}")

# Predict with the best estimator from the grid search
test_predictions = best_xgb.predict(X_test_combined)

print(f"\nPredicciones generadas: {len(test_predictions)}")
print("Distribución de predicciones:")
print(pd.Series(test_predictions).value_counts().sort_index())

In [None]:
# Build the submission: ids come from the raw CSV, targets from the predictions.
# NOTE(review): this pairs ids and predictions purely by position, so it assumes
# the processed test pickle kept the raw test.csv row order — confirm upstream.
test_raw = pd.read_csv(Path("../.data/raw/") / "test.csv")
submission = pd.DataFrame({
    'id': test_raw['id'],
    'target': test_predictions
})

# Create the output directory if needed and save
OUTPUT_PATH = Path("../.data/submissions/")
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

submission_file = OUTPUT_PATH / "model2_xgboost.csv"
submission.to_csv(submission_file, index=False)

print(f"✅ Submission guardado en: {submission_file}")
print("\nPrimeras 5 filas:")
print(submission.head())
print("\nÚltimas 5 filas:")
print(submission.tail())

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys

# Agregar path para importar ml_utils
sys.path.append(str(Path('.').resolve()))
from ml_utils import evaluate_model, plot_feature_importance, compare_models, COLOR_NO_DISASTER, COLOR_DISASTER, COLOR_GENERAL

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, make_scorer

import matplotlib.pyplot as plt
import seaborn as sns

# Seed for reproducibility: reused as random_state below and applied to NumPy's global RNG
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

## Cargar Datos Avanzados

In [None]:
# Directory holding the preprocessed pickles, relative to the notebook's working directory
DATA_PATH = Path("../.data/processed/")

# Load the training data that includes the advanced features.
# NOTE(review): unpickling can execute arbitrary code; only load pickles produced
# by this project's own preprocessing step.
train_df = pd.read_pickle(DATA_PATH / "train_advanced.pkl")

# Quick sanity report: size, class counts and positive-class share
print(f"Train dataset: {train_df.shape}")
print("\nDistribución del target:")
print(train_df['target'].value_counts().sort_index())
print(f"\nProporción de disasters: {train_df['target'].mean():.2%}")

## Preparar Features

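Usaremos las features numéricas básicas de Model 1 más las features avanzadas (sentimiento, emojis, diversidad léxica, etc.) y las coordenadas de location, con **Mean Encoding** para keywords.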

In [None]:
# Basic numeric feature columns
numeric_features_basic = [
    'text_length',
    'word_count',
    'hashtag_count',
    'mention_count',
    'url_count',
    'uppercase_percentage',
    'punctuation_percentage'
]

# Advanced numeric feature columns
numeric_features_advanced = [
    'sentiment_polarity',
    'sentiment_subjectivity',
    'emoji_count',
    'uppercase_word_count',
    'lexical_diversity',
    'number_count',
    'urgency_word_count',
    'intensity_word_count',
    'has_valid_location'
]

# Location coordinate columns (handled separately because they can be missing)
location_features = ['location_lat', 'location_lon']

# All numeric columns that go through the main scaler
numeric_features = numeric_features_basic + numeric_features_advanced

print(f"Features numéricas: {len(numeric_features)}")
print(f"Features de location: {len(location_features)}")

# Feature groups and target. Missing coordinates are replaced by the sentinel
# -999, missing lemmatized text by '' and missing keywords by 'unknown'.
X_numeric = train_df[numeric_features]
X_location = train_df[location_features].fillna(-999)
X_text = train_df['text_lemmatized'].fillna('')
X_keyword = train_df['keyword_clean'].fillna('unknown')
y = train_df['target']

print("\nShapes:")
print(f"  X_numeric: {X_numeric.shape}")
print(f"  X_location: {X_location.shape}")
print(f"  X_text: {X_text.shape}")
print(f"  X_keyword: {X_keyword.shape}")
print(f"  y: {y.shape}")

## Split Train/Validation

80/20 split estratificado.

In [None]:
# Split positional row indices first so every feature group shares the same
# stratified 80/20 partition.
train_idx, val_idx = train_test_split(
    np.arange(len(train_df)),
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE
)

# Slice each feature group and the target with the shared indices
X_numeric_train = X_numeric.iloc[train_idx]
X_numeric_val = X_numeric.iloc[val_idx]

X_location_train = X_location.iloc[train_idx]
X_location_val = X_location.iloc[val_idx]

X_text_train = X_text.iloc[train_idx]
X_text_val = X_text.iloc[val_idx]

X_keyword_train = X_keyword.iloc[train_idx]
X_keyword_val = X_keyword.iloc[val_idx]

y_train = y.iloc[train_idx]
y_val = y.iloc[val_idx]

print(f"Train set: {len(y_train)} samples")
print(f"Val set: {len(y_val)} samples")
print(f"\nDistribución en train: {y_train.value_counts().to_dict()}")
print(f"Distribución en val: {y_val.value_counts().to_dict()}")

## Mean Encoding para Keywords

El **Mean Encoding** (Target Encoding) reemplaza cada categoría por la media del target para esa categoría.

Ventajas vs OneHot:
- Reduce dimensionalidad (1 columna vs N columnas)
- Captura relación directa con el target
- Funciona mejor con XGBoost

⚠️ Importante: Solo calculamos las medias en el train set para evitar data leakage.

In [None]:
# Per-keyword mean of the target, computed on the training split only so that
# validation/test rows never contribute to their own encoding.
# Grouping uses X_keyword_train (missing keywords already replaced by 'unknown')
# instead of the raw column: groupby drops NaN keys, so the raw column never
# produced an 'unknown' entry and those rows silently fell back to the global mean.
# Grouping by the underlying array is positional, so it does not depend on the
# DataFrame index being unique.
keyword_means = (
    y_train.groupby(X_keyword_train.to_numpy())
    .mean()
    .rename_axis('keyword_clean')
)

print("Mean Encoding por Keyword:")
print(keyword_means.sort_values(ascending=False).head(10))
print("\nKeywords con menor mean:")
print(keyword_means.sort_values().head(10))

# Keywords absent from the training split fall back to the global positive rate
global_mean = y_train.mean()

# NOTE(review): training rows are encoded with means that include their own
# target (no out-of-fold estimate or smoothing), which can overstate this
# feature's usefulness on the training split.
X_keyword_train_encoded = X_keyword_train.map(keyword_means).fillna(global_mean).values.reshape(-1, 1)
X_keyword_val_encoded = X_keyword_val.map(keyword_means).fillna(global_mean).values.reshape(-1, 1)

print("\n✅ Mean Encoding aplicado")
print(f"  Train shape: {X_keyword_train_encoded.shape}")
print(f"  Val shape: {X_keyword_val_encoded.shape}")
print(f"  Global mean (para keywords no vistos): {global_mean:.4f}")

## Feature Engineering Pipeline

1. StandardScaler para features numéricas
2. TfidfVectorizer para texto (max 150 features, incluyendo bigrams)
3. Mean Encoding para keywords (ya aplicado)
4. Combinar todo

In [None]:
# 1. Scale numeric features; the scaler is fit on the training split only
scaler = StandardScaler()
X_numeric_train_scaled = scaler.fit_transform(X_numeric_train)
X_numeric_val_scaled = scaler.transform(X_numeric_val)

# 2. Scale location coordinates with their own scaler.
#    NOTE(review): the -999 sentinel used for missing coordinates is included
#    in the mean/std this scaler learns.
location_scaler = StandardScaler()
X_location_train_scaled = location_scaler.fit_transform(X_location_train)
X_location_val_scaled = location_scaler.transform(X_location_val)

# 3. TF-IDF on the lemmatized text (the original note says this uses more
#    features than Model 1); vocabulary is learned from the training split only
tfidf = TfidfVectorizer(
    max_features=150,
    min_df=2,
    max_df=0.8,
    ngram_range=(1, 2)  # unigrams and bigrams
)
X_text_train_tfidf = tfidf.fit_transform(X_text_train).toarray()
X_text_val_tfidf = tfidf.transform(X_text_val).toarray()

print("Features transformadas:")
print(f"  Numéricas escaladas: {X_numeric_train_scaled.shape}")
print(f"  Location escaladas: {X_location_train_scaled.shape}")
print(f"  TF-IDF (texto + bigrams): {X_text_train_tfidf.shape}")
print(f"  Mean Encoding (keywords): {X_keyword_train_encoded.shape}")

# 4. Stack column-wise: numeric, location, TF-IDF, mean-encoded keyword.
#    The feature-name list built later relies on this exact order.
X_train_combined = np.hstack([
    X_numeric_train_scaled,
    X_location_train_scaled,
    X_text_train_tfidf,
    X_keyword_train_encoded
])

X_val_combined = np.hstack([
    X_numeric_val_scaled,
    X_location_val_scaled,
    X_text_val_tfidf,
    X_keyword_val_encoded
])

print("\n✅ Features combinadas:")
print(f"  Train: {X_train_combined.shape}")
print(f"  Validation: {X_val_combined.shape}")

## GridSearchCV - Búsqueda de Hiperparámetros

Buscaremos los mejores hiperparámetros para XGBoost:
- **n_estimators**: número de boosting rounds
- **max_depth**: profundidad máxima de árboles
- **learning_rate**: tasa de aprendizaje
- **subsample**: fracción de muestras por árbol
- **colsample_bytree**: fracción de features por árbol
- **gamma**: mínima reducción de loss para split
- **reg_alpha**: regularización L1
- **reg_lambda**: regularización L2

In [None]:
# Full hyperparameter grid; only used below to report how many combinations it
# would take. The search itself runs on param_grid_reduced.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2]
}

# Base estimator.
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases;
# confirm against the installed version before removing it.
xgb_base = XGBClassifier(
    random_state=RANDOM_STATE,
    eval_metric='logloss',
    use_label_encoder=False
)

# F1 scorer used to rank the candidates
f1_scorer = make_scorer(f1_score)

print("Iniciando GridSearchCV para XGBoost...")
print(f"Combinaciones totales: {np.prod([len(v) for v in param_grid.values()])}")
print("⚠️ NOTA: Esto puede tomar MUCHO tiempo. Reduciendo grid para demo...")

# Reduced grid for the demo (pass param_grid to GridSearchCV instead if you
# have time for the full search)
param_grid_reduced = {
    'n_estimators': [200, 300],
    'max_depth': [5, 7],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'gamma': [0, 0.1],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [1]
}

print(f"Usando grid reducido: {np.prod([len(v) for v in param_grid_reduced.values()])} combinaciones")
print("Esto puede tomar varios minutos...\n")

# 5-fold cross-validated grid search.
# NOTE(review): the scalers, TF-IDF vocabulary and keyword means were fit on the
# whole training split before this search, so each CV fold's held-out rows have
# already influenced the features; CV scores are therefore likely optimistic.
grid_search = GridSearchCV(
    estimator=xgb_base,
    param_grid=param_grid_reduced,
    scoring=f1_scorer,
    cv=5,
    n_jobs=-1,
    verbose=2,
    return_train_score=True
)

grid_search.fit(X_train_combined, y_train)

print("\n✅ GridSearchCV completado")

## Resultados de GridSearchCV

In [None]:
# Report the best cross-validated score and the parameters that achieved it
print("=" * 70)
print("RESULTADOS DE GRIDSEARCHCV - XGBOOST".center(70))
print("=" * 70)

print(f"\n🏆 Mejor F1 Score (CV): {grid_search.best_score_:.4f}")
print("\n📋 Mejores hiperparámetros:")
for param, value in grid_search.best_params_.items():
    print(f"  {param}: {value}")

# Five best configurations by mean test-fold score
results_df = pd.DataFrame(grid_search.cv_results_)
top_5 = results_df.nlargest(5, 'mean_test_score')[['params', 'mean_test_score', 'std_test_score']]

print("\n🔝 Top 5 Configuraciones:")
for idx, row in top_5.iterrows():
    print(f"\n  F1 Score: {row['mean_test_score']:.4f} (±{row['std_test_score']:.4f})")
    print(f"  Params: {row['params']}")

print("=" * 70)

## Entrenar Modelo Final con Mejores Hiperparámetros

In [None]:
# Best estimator found by the grid search
best_xgb = grid_search.best_estimator_

print(f"Modelo final: {best_xgb}")
print(f"\nNúmero de features utilizadas: {X_train_combined.shape[1]}")

## Evaluación en Validación

In [None]:
# Predict on the held-out validation split
y_pred_val = best_xgb.predict(X_val_combined)

# Evaluate with the project helper from ml_utils
results_xgb = evaluate_model(y_val, y_pred_val, "XGBoost (Best from GridSearchCV)")

## Feature Importance

XGBoost proporciona importancia basada en ganancia (gain).

In [None]:
# Column names in the same order the blocks were stacked into X_train_combined:
# numeric features, location coordinates, TF-IDF terms (prefixed with 'text_'),
# then the single mean-encoded keyword column.
# location_features is reused here (rather than re-typing the column names) so
# the names cannot drift from the columns that were actually scaled and stacked.
tfidf_names = [f"text_{word}" for word in tfidf.get_feature_names_out()]
feature_names = [
    *numeric_features,
    *location_features,
    *tfidf_names,
    'keyword_mean_encoding',
]

print(f"Total feature names: {len(feature_names)}")
print(f"Total features en modelo: {X_train_combined.shape[1]}")

# Guard against names drifting out of sync with the stacked matrix
assert len(feature_names) == X_train_combined.shape[1], \
    "Mismatch entre feature names y número de features"

In [None]:
# Gain-based importance as a dict keyed by the booster's feature names
importance = best_xgb.get_booster().get_score(importance_type='gain')

# Align with feature_names by position; features missing from the dict get 0.
# assumes the booster names columns 'f0', 'f1', ... — TODO confirm for the
# installed XGBoost version
importance_array = np.array([importance.get(f'f{i}', 0) for i in range(len(feature_names))])

# Plot with the ml_utils helper; its return value is printed below
importance_df = plot_feature_importance(
    feature_names,
    importance_array,
    "XGBoost",
    top_n=20
)

# Show the first 10 rows of the returned table
print("\n🔝 Top 10 Features Más Importantes:")
print(importance_df.head(10).to_string())

## Análisis de Overfitting

In [None]:
# Predict on the training split to compare against validation
y_pred_train = best_xgb.predict(X_train_combined)

# Metrics for both splits (printing suppressed)
results_train = evaluate_model(y_train, y_pred_train, print_results=False)
results_val = evaluate_model(y_val, y_pred_val, print_results=False)

# Side-by-side table with the train-minus-validation gap per metric
comparison = pd.DataFrame({
    'Train': results_train,
    'Validation': results_val,
    'Diferencia': {k: results_train[k] - results_val[k] for k in results_train.keys()}
})

print("=" * 60)
print("COMPARACIÓN TRAIN vs VALIDATION".center(60))
print("=" * 60)
print(comparison.T.to_string())
print("=" * 60)

# Classify the F1 gap: more than 0.05 is flagged as possible overfitting,
# a negative gap means validation outperformed train.
diff_f1 = results_train['f1'] - results_val['f1']
if diff_f1 > 0.05:
    print(f"\n⚠️ Posible overfitting detectado (diferencia F1: {diff_f1:.4f})")
elif diff_f1 < 0:
    print(f"\n⚠️ Performance mejor en validación que train (diferencia F1: {diff_f1:.4f})")
else:
    print(f"\n✅ Buen balance entre train y validation (diferencia F1: {diff_f1:.4f})")

## Resumen del Modelo

In [None]:
# Final summary: scores, target check, configuration and best hyperparameters
print("=" * 70)
print("RESUMEN: XGBOOST CON GRIDSEARCHCV Y MEAN ENCODING".center(70))
print("=" * 70)

print("\n📊 Performance:")
print(f"  F1 Score (CV):         {grid_search.best_score_:.4f}")
print(f"  F1 Score (Train):      {results_train['f1']:.4f}")
print(f"  F1 Score (Validation): {results_val['f1']:.4f}  ⭐")

# Project goal: validation F1 strictly above 0.80
print("\n🎯 Target: F1 > 0.80")
if results_val['f1'] > 0.80:
    print(f"  ✅ OBJETIVO CUMPLIDO (F1 = {results_val['f1']:.4f})")
else:
    print(f"  ❌ Por debajo del objetivo (F1 = {results_val['f1']:.4f})")
    print(f"     Falta: {0.80 - results_val['f1']:.4f}")

print("\n🔧 Configuración:")
print("  Algoritmo: XGBoost (Gradient Boosting)")
print(f"  Features totales: {X_train_combined.shape[1]}")
print("  Encoding keywords: Mean Encoding (Target Encoding)")
print("  Búsqueda: GridSearchCV (5-fold CV)")
print(f"  Combinaciones probadas: {len(results_df)}")

print("\n🏆 Mejores hiperparámetros:")
for param, value in grid_search.best_params_.items():
    print(f"  {param}: {value}")

print("\n" + "=" * 70)

## Predicciones para Competencia

In [None]:
# Load the preprocessed test set with advanced features
test_df = pd.read_pickle(DATA_PATH / "test_advanced.pkl")

print(f"Test dataset: {test_df.shape}")

# Same feature groups and missing-value handling as for the training data
X_test_numeric = test_df[numeric_features]
X_test_location = test_df[location_features].fillna(-999)
X_test_text = test_df['text_lemmatized'].fillna('')
X_test_keyword = test_df['keyword_clean'].fillna('unknown')

# Apply the transformers fitted on the training split (no refitting on test)
X_test_numeric_scaled = scaler.transform(X_test_numeric)
X_test_location_scaled = location_scaler.transform(X_test_location)
X_test_text_tfidf = tfidf.transform(X_test_text).toarray()
X_test_keyword_encoded = X_test_keyword.map(keyword_means).fillna(global_mean).values.reshape(-1, 1)

# Stack in the same column order used for training
X_test_combined = np.hstack([
    X_test_numeric_scaled,
    X_test_location_scaled,
    X_test_text_tfidf,
    X_test_keyword_encoded
])

print(f"Test features combinadas: {X_test_combined.shape}")

# Predict with the best estimator from the grid search
test_predictions = best_xgb.predict(X_test_combined)

print(f"\nPredicciones generadas: {len(test_predictions)}")
print("Distribución de predicciones:")
print(pd.Series(test_predictions).value_counts().sort_index())

In [None]:
# Build the submission: ids come from the raw CSV, targets from the predictions.
# NOTE(review): this pairs ids and predictions purely by position, so it assumes
# the processed test pickle kept the raw test.csv row order — confirm upstream.
test_raw = pd.read_csv(Path("../.data/raw/") / "test.csv")
submission = pd.DataFrame({
    'id': test_raw['id'],
    'target': test_predictions
})

# Create the output directory if needed and save.
# NOTE(review): an earlier cell in this file writes to the same
# "model2_xgboost.csv" path, so running the notebook top to bottom overwrites
# that first submission with this one. Consider a distinct filename.
OUTPUT_PATH = Path("../.data/submissions/")
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

submission_file = OUTPUT_PATH / "model2_xgboost.csv"
submission.to_csv(submission_file, index=False)

print(f"✅ Submission guardado en: {submission_file}")
print("\nPrimeras 5 filas:")
print(submission.head())
print("\nÚltimas 5 filas:")
print(submission.tail())