## Imports

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys

# Agregar path para importar ml_utils
sys.path.append(str(Path('.').resolve()))
from ml_utils import evaluate_model, plot_feature_importance, compare_models, COLOR_NO_DISASTER, COLOR_DISASTER, COLOR_GENERAL

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, make_scorer

import matplotlib.pyplot as plt
import seaborn as sns

# Single seed value for reproducibility; it is passed to the random_state
# arguments used further down in this notebook.
RANDOM_STATE: int = 42
# Seed NumPy's global random generator with that same value.
np.random.seed(RANDOM_STATE)

## Cargar Datos Avanzados

In [None]:
# Directory holding the preprocessed pickles (relative to the working directory).
DATA_PATH = Path("../.data/processed/")

# Training frame that already contains the advanced features.
# NOTE(review): unpickling can execute arbitrary code; only load files
# produced by this project's own preprocessing.
train_df = pd.read_pickle(DATA_PATH / "train_advanced.pkl")

print(f"Train dataset: {train_df.shape}")

# Numbered listing of every available column (numbering starts at 1).
print("\nColumnas disponibles:")
for position, column_name in enumerate(train_df.columns, start=1):
    print(f"{position:2d}. {column_name}")

# Class balance: per-class row counts, then the mean of `target` shown as a
# percentage.
target_counts = train_df['target'].value_counts().sort_index()
disaster_share = train_df['target'].mean()

print("\nDistribuci√≥n del target:")
print(target_counts)
print(f"\nProporci√≥n de disasters: {disaster_share:.2%}")

## Preparar Features

Usaremos todas las features disponibles:
- **Numéricas básicas**: 7 features de texto (length, word_count, etc.)
- **Numéricas avanzadas**: sentiment (2), linguistic (4), intensity (2), location (2) = 10 features
- **Texto**: TF-IDF del texto lematizado
- **Keywords**: OneHotEncoder de keywords
- **Location válida**: feature binaria

In [None]:
# Basic numeric descriptors of the tweet text.
numeric_features_basic = [
    'text_length', 'word_count',
    'hashtag_count', 'mention_count', 'url_count',
    'uppercase_percentage', 'punctuation_percentage',
]

# Advanced numeric descriptors, plus the has_valid_location flag.
numeric_features_advanced = [
    'sentiment_polarity', 'sentiment_subjectivity',
    'emoji_count', 'uppercase_word_count',
    'lexical_diversity', 'number_count',
    'urgency_word_count', 'intensity_word_count',
    'has_valid_location',
]

# Coordinate columns; kept apart because they need missing-value handling.
location_features = ['location_lat', 'location_lon']

# Every numeric column that goes through the scaler: basic first, then advanced.
numeric_features = [*numeric_features_basic, *numeric_features_advanced]

print(f"Features num√©ricas b√°sicas: {len(numeric_features_basic)}")
print(f"Features num√©ricas avanzadas: {len(numeric_features_advanced)}")
print(f"Features de location: {len(location_features)}")
print(f"Total features num√©ricas: {len(numeric_features)}")

# Build one view per feature family, plus the target.
y = train_df['target']
X_numeric = train_df[numeric_features]
# Missing coordinates are replaced by the sentinel value -999.
X_location = train_df[location_features].fillna(-999)
# Missing lemmatized text becomes the empty string.
X_text = train_df['text_lemmatized'].fillna('')
# Double brackets keep a one-column DataFrame; missing keywords become the
# literal category 'unknown'.
X_keyword = train_df[['keyword_clean']].fillna('unknown')

print("\nShapes:")
for view_name, view in (
    ("X_numeric", X_numeric),
    ("X_location", X_location),
    ("X_text", X_text),
    ("X_keyword", X_keyword),
    ("y", y),
):
    print(f"  {view_name}: {view.shape}")

## Split Train/Validation

80/20 split estratificado para mantener proporción de clases.

In [None]:
# Split every feature view and the target in a single call so that all of
# them receive the same row partition. 20% of the rows go to validation and
# the split is stratified on the target.
(
    X_numeric_train, X_numeric_val,
    X_location_train, X_location_val,
    X_text_train, X_text_val,
    X_keyword_train, X_keyword_val,
    y_train, y_val,
) = train_test_split(
    X_numeric,
    X_location,
    X_text,
    X_keyword,
    y,
    stratify=y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Report the size and class counts of each partition.
train_class_counts = y_train.value_counts().to_dict()
val_class_counts = y_val.value_counts().to_dict()

print(f"Train set: {len(y_train)} samples")
print(f"Val set: {len(y_val)} samples")
print(f"\nDistribuci√≥n en train: {train_class_counts}")
print(f"Distribuci√≥n en val: {val_class_counts}")

## Feature Engineering Pipeline

1. StandardScaler para features numéricas
2. TfidfVectorizer para texto (max 100 features)
3. OneHotEncoder para keywords (max 100 categorías)
4. Combinar todo en una matriz

In [None]:
# 1. Standardize the numeric features. The scaler is fitted on the training
#    split only and then applied unchanged to the validation split.
scaler = StandardScaler()
X_numeric_train_scaled = scaler.fit_transform(X_numeric_train)
X_numeric_val_scaled = scaler.transform(X_numeric_val)

# 2. Standardize the location coordinates with their OWN scaler.
#    Calling fit_transform on `scaler` a second time would discard the
#    statistics it learned from the numeric columns, leaving it unable to
#    transform numeric data afterwards. Keeping two objects preserves both
#    sets of statistics.
#    Missing coordinates were already replaced by the -999 sentinel when
#    X_location was built, so that sentinel takes part in the mean/std here.
location_scaler = StandardScaler()
X_location_train_scaled = location_scaler.fit_transform(X_location_train)
X_location_val_scaled = location_scaler.transform(X_location_val)

# 3. TF-IDF over the lemmatized text: unigrams and bigrams, capped at 100
#    terms, ignoring terms seen in fewer than 2 documents or in more than 80%
#    of them. The sparse result is densified so it can be stacked with NumPy.
tfidf = TfidfVectorizer(
    max_features=100,
    min_df=2,
    max_df=0.8,
    ngram_range=(1, 2)
)
X_text_train_tfidf = tfidf.fit_transform(X_text_train).toarray()
X_text_val_tfidf = tfidf.transform(X_text_val).toarray()

# 4. One-hot encode the keyword column, limited to 100 output categories.
#    Categories unseen during fit are handled by the
#    'infrequent_if_exist' policy instead of raising.
onehot_encoder = OneHotEncoder(
    sparse_output=False,
    handle_unknown='infrequent_if_exist',
    max_categories=100
)
X_keyword_train_encoded = onehot_encoder.fit_transform(X_keyword_train)
X_keyword_val_encoded = onehot_encoder.transform(X_keyword_val)

print("Features transformadas:")
print(f"  Num√©ricas escaladas: {X_numeric_train_scaled.shape}")
print(f"  Location escaladas: {X_location_train_scaled.shape}")
print(f"  TF-IDF (texto + bigrams): {X_text_train_tfidf.shape}")
print(f"  OneHot (keywords): {X_keyword_train_encoded.shape}")

# 5. Stack all feature families column-wise. The order
#    (numeric, location, TF-IDF, keywords) must be identical for every split.
X_train_combined = np.hstack([
    X_numeric_train_scaled,
    X_location_train_scaled,
    X_text_train_tfidf,
    X_keyword_train_encoded
])

X_val_combined = np.hstack([
    X_numeric_val_scaled,
    X_location_val_scaled,
    X_text_val_tfidf,
    X_keyword_val_encoded
])

print("\n‚úÖ Features combinadas:")
print(f"  Train: {X_train_combined.shape}")
print(f"  Validation: {X_val_combined.shape}")

## GridSearchCV - Búsqueda de Hiperparámetros

Buscaremos los mejores hiperparámetros para Random Forest:
- **n_estimators**: número de árboles
- **max_depth**: profundidad máxima
- **min_samples_split**: mínimo de samples para split
- **min_samples_leaf**: mínimo de samples en hoja
- **max_features**: número de features por split

Usaremos F1 Score como métrica de evaluación.

In [None]:
# Candidate values for each Random Forest hyperparameter; the search tries
# every combination.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
    class_weight=['balanced', 'balanced_subsample'],
)

# Unfitted estimator that the search clones for each candidate.
rf_base = RandomForestClassifier(random_state=RANDOM_STATE)

# Scorer wrapping f1_score, used to rank the candidates.
f1_scorer = make_scorer(f1_score)

# Size of the grid = product of the number of candidates per hyperparameter.
n_combinations = np.prod([len(candidates) for candidates in param_grid.values()])

print("Iniciando GridSearchCV...")
print(f"Combinaciones a probar: {n_combinations}")
print("Esto puede tomar varios minutos...\n")

# Exhaustive search with 5-fold cross-validation on all available cores.
# Train-fold scores are recorded as well (return_train_score=True).
grid_search = GridSearchCV(
    rf_base,
    param_grid,
    cv=5,
    scoring=f1_scorer,
    return_train_score=True,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the combined training matrix.
grid_search.fit(X_train_combined, y_train)

print("\n‚úÖ GridSearchCV completado")

## Resultados de GridSearchCV

In [None]:
# Report header: title centred between two 70-character rules.
print("=" * 70)
print("RESULTADOS DE GRIDSEARCHCV".center(70))
print("=" * 70)

# Best cross-validated score and the hyperparameters that achieved it.
print(f"\nüèÜ Mejor F1 Score (CV): {grid_search.best_score_:.4f}")
print(f"\nüìã Mejores hiperpar√°metros:")
for name, setting in grid_search.best_params_.items():
    print(f"  {name}: {setting}")

# Tabulate every candidate and keep the five with the highest mean test score.
results_df = pd.DataFrame(grid_search.cv_results_)
report_columns = ['params', 'mean_test_score', 'std_test_score']
top_5 = results_df.nlargest(5, 'mean_test_score')[report_columns]

print(f"\nüîù Top 5 Configuraciones:")
# Plain tuples follow the column order of `report_columns`.
for params, mean_score, std_score in top_5.itertuples(index=False, name=None):
    print(f"\n  F1 Score: {mean_score:.4f} (¬±{std_score:.4f})")
    print(f"  Params: {params}")

print("=" * 70)

## Entrenar Modelo Final con Mejores Hiperparámetros

In [None]:
# Take the estimator stored by the search for its best configuration.
best_rf = grid_search.best_estimator_

# Width of the training matrix = number of features the model was given.
n_features_used = X_train_combined.shape[1]

print(f"Modelo final: {best_rf}")
print(f"\nN√∫mero de features utilizadas: {n_features_used}")

## Evaluación en Validación

In [None]:
# Predict the validation labels with the selected model.
y_pred_val = best_rf.predict(X_val_combined)

# Score the predictions with the ml_utils helper; the third positional
# argument is the label passed for this model.
model_label = "Random Forest (Best from GridSearchCV)"
results_rf = evaluate_model(y_val, y_pred_val, model_label)

## Feature Importance

Random Forest proporciona importancia de features basada en Gini impurity.

In [None]:
# Rebuild the column labels of the combined matrix in the same order the
# blocks were stacked: numeric -> location -> TF-IDF -> keyword one-hot.

# TF-IDF terms, tagged so they cannot collide with other feature names.
tfidf_names = [f"text_{word}" for word in tfidf.get_feature_names_out()]

# One-hot columns come back as "<input column>_<category>". Strip that prefix
# only where it leads the name; str.replace would also delete any later
# occurrence inside the category text itself.
keyword_prefix = 'keyword_clean_'
keyword_names = [
    f"keyword_{cat[len(keyword_prefix):] if cat.startswith(keyword_prefix) else cat}"
    for cat in onehot_encoder.get_feature_names_out()
]

feature_names = [
    *numeric_features,
    # Reuse the list that selected the location columns instead of repeating
    # the literals, so the labels cannot drift from the data.
    *location_features,
    *tfidf_names,
    *keyword_names,
]

print(f"Total feature names: {len(feature_names)}")
print(f"Total features en modelo: {len(best_rf.feature_importances_)}")

# Sanity check: one label per model feature, otherwise the importance plot
# would pair names with the wrong values.
assert len(feature_names) == len(best_rf.feature_importances_), \
    "Mismatch entre feature names y feature importances"

In [None]:
# Plot the 20 highest importances through the ml_utils helper and keep the
# object it returns.
model_importances = best_rf.feature_importances_
importance_df = plot_feature_importance(
    feature_names,
    model_importances,
    "Random Forest",
    top_n=20,
)

# Print the first ten rows of the returned table.
# NOTE(review): presumably already sorted by importance - confirm in ml_utils.
top_ten = importance_df.head(10)
print("\nüîù Top 10 Features M√°s Importantes:")
print(top_ten.to_string())

## Análisis de Overfitting

Comparar performance en train vs validation:

In [None]:
# Predictions on the data the model was fitted on.
y_pred_train = best_rf.predict(X_train_combined)

# Metrics for both splits, requested with print_results=False.
results_train = evaluate_model(y_train, y_pred_train, print_results=False)
results_val = evaluate_model(y_val, y_pred_val, print_results=False)

# Per-metric gap, train minus validation.
metric_gaps = {
    metric: train_value - results_val[metric]
    for metric, train_value in results_train.items()
}

# Columns are Train / Validation / Diferencia; the table is printed transposed.
comparison = pd.DataFrame({
    'Train': results_train,
    'Validation': results_val,
    'Diferencia': metric_gaps,
})

print("=" * 60)
print("COMPARACI√ìN TRAIN vs VALIDATION".center(60))
print("=" * 60)
print(comparison.T.to_string())
print("=" * 60)

# Verdict from the F1 gap: a negative gap means validation beat train, a gap
# above 0.05 is flagged as possible overfitting, anything else is accepted.
diff_f1 = results_train['f1'] - results_val['f1']
if diff_f1 < 0:
    print(f"\n‚ö†Ô∏è Performance mejor en validaci√≥n que train (diferencia F1: {diff_f1:.4f})")
    print("   Esto puede indicar que el validation set es m√°s f√°cil.")
elif diff_f1 > 0.05:
    print(f"\n‚ö†Ô∏è Posible overfitting detectado (diferencia F1: {diff_f1:.4f})")
else:
    print(f"\n‚úÖ Buen balance entre train y validation (diferencia F1: {diff_f1:.4f})")

## Resumen del Modelo

In [None]:
# Report header: title centred between two 70-character rules.
print("=" * 70)
print("RESUMEN: RANDOM FOREST CON GRIDSEARCHCV".center(70))
print("=" * 70)

# F1 on the three views of the data: cross-validation, train, validation.
train_f1 = results_train['f1']
val_f1 = results_val['f1']

print(f"\nüìä Performance:")
print(f"  F1 Score (CV):         {grid_search.best_score_:.4f}")
print(f"  F1 Score (Train):      {train_f1:.4f}")
print(f"  F1 Score (Validation): {val_f1:.4f}  ‚≠ê")

# Compare the validation F1 against the 0.80 target; when it falls short,
# also print the remaining distance.
print(f"\nüéØ Target: F1 > 0.80")
if val_f1 > 0.80:
    print(f"  ‚úÖ OBJETIVO CUMPLIDO (F1 = {val_f1:.4f})")
else:
    print(f"  ‚ùå Por debajo del objetivo (F1 = {val_f1:.4f})")
    print(f"     Falta: {0.80 - val_f1:.4f}")

# Configuration recap: feature count and number of evaluated candidates.
n_features_total = X_train_combined.shape[1]
n_candidates = len(results_df)

print(f"\nüîß Configuraci√≥n:")
print(f"  Algoritmo: Random Forest")
print(f"  Features totales: {n_features_total}")
print(f"  B√∫squeda: GridSearchCV (5-fold CV)")
print(f"  Combinaciones probadas: {n_candidates}")

print(f"\nüèÜ Mejores hiperpar√°metros:")
for name, setting in grid_search.best_params_.items():
    print(f"  {name}: {setting}")

print("\n" + "=" * 70)

## Predicciones para Competencia

Generamos predicciones en el test set para Kaggle.

In [None]:
# Load the preprocessed test frame.
# NOTE(review): unpickling can execute arbitrary code; only load files
# produced by this project's own preprocessing.
test_df = pd.read_pickle(DATA_PATH / "test_advanced.pkl")

print(f"Test dataset: {test_df.shape}")

# Select and impute the same column groups, with the same fill values, as
# were used for the training frame.
X_test_numeric = test_df[numeric_features]
X_test_location = test_df[location_features].fillna(-999)
X_test_text = test_df['text_lemmatized'].fillna('')
X_test_keyword = test_df[['keyword_clean']].fillna('unknown')

# A StandardScaler only accepts the columns it was fitted on, so one shared
# scaler object cannot transform both the numeric frame and the location
# frame. Fit one scaler per column group here, on the training split: this
# gives the training statistics for each group and makes this cell
# independent of which columns any earlier scaler was last fitted on.
test_numeric_scaler = StandardScaler().fit(X_numeric_train)
test_location_scaler = StandardScaler().fit(X_location_train)

# Apply the fitted transformers; nothing is fitted on the test data.
X_test_numeric_scaled = test_numeric_scaler.transform(X_test_numeric)
X_test_location_scaled = test_location_scaler.transform(X_test_location)
X_test_text_tfidf = tfidf.transform(X_test_text).toarray()
X_test_keyword_encoded = onehot_encoder.transform(X_test_keyword)

# Stack in the same column order as the training matrix:
# numeric, location, TF-IDF, keywords.
X_test_combined = np.hstack([
    X_test_numeric_scaled,
    X_test_location_scaled,
    X_test_text_tfidf,
    X_test_keyword_encoded
])

print(f"Test features combinadas: {X_test_combined.shape}")

# Predict with the model selected by the grid search.
test_predictions = best_rf.predict(X_test_combined)

print(f"\nPredicciones generadas: {len(test_predictions)}")
print(f"Distribuci√≥n de predicciones:")
print(pd.Series(test_predictions).value_counts().sort_index())

In [None]:
# Read the ids from the raw test file and attach the predictions as `target`.
# NOTE(review): ids and predictions are paired purely by row order; this
# assumes test.csv and test_advanced.pkl list the rows in the same order -
# confirm against the preprocessing step.
raw_test_file = Path("../.data/raw/") / "test.csv"
test_raw = pd.read_csv(raw_test_file)
submission = test_raw[['id']].assign(target=test_predictions)

# Make sure the output directory exists, then write the CSV without the index.
OUTPUT_PATH = Path("../.data/submissions/")
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

submission_file = OUTPUT_PATH / "model1_random_forest.csv"
submission.to_csv(submission_file, index=False)

# Show where the file went and preview both ends of it.
print(f"‚úÖ Submission guardado en: {submission_file}")
print("\nPrimeras 5 filas:")
print(submission.head())
print("\n√öltimas 5 filas:")
print(submission.tail())