# Random Forest

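Este notebook entrena un clasificador Random Forest que predice si un texto describe un desastre (`target`), combinando features estadísticas del texto, mean encoding de la keyword y TF-IDF (palabras y caracteres) reducido con SVD.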

## Imports

In [1]:
# Standard library
import pathlib

# Data & plotting
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# ML / NLP
import spacy
from emoji import demojize
from emoji import emoji_count
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, make_scorer, roc_curve, auc, confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion
from spacy.cli import download
from spacy.lang.en.stop_words import STOP_WORDS

def _load_spacy_model(name):
    """Load the spaCy pipeline ``name``, downloading it first if loading fails.

    spacy.load raises OSError when it cannot load the package; in that case
    the model is fetched with spacy.cli.download and loaded again.
    """
    try:
        return spacy.load(name)
    except OSError:
        download(name)
        return spacy.load(name)


# Shared pipeline used for lemmatization and entity recognition below.
nlp = _load_spacy_model("en_core_web_sm")

## Constantes

In [None]:
# Seed passed as random_state everywhere a random choice is made.
SEED = 42

# Hex colours for plots: one per class plus a neutral grey.
COLOR_GENERAL = '#95a5a6'
COLOR_DISASTER = '#e74c3c'
COLOR_NO_DISASTER = '#3498db'

## Datos

In [None]:
# Directory holding the raw CSVs, relative to the notebook.
data_path = pathlib.Path("../.data/raw")

# df is the labelled set (it carries the 'target' column used below);
# test_df is the set that gets scored for the submission file.
df, test_df = (pd.read_csv(data_path / csv_name) for csv_name in ("train.csv", "test.csv"))

In [None]:
# Mean of the binary target, i.e. the share of rows labelled 1.
target_mean = df['target'].mean()

print(
    f'Shape del dataset: {df.shape}',
    f'Porcentaje de desastres en el target: {target_mean*100:.2f}%',
    sep='\n',
)

# Reproducible peek at five rows (rendered as the cell output).
df.sample(5, random_state=SEED)

## Feature Engineering

In [None]:
def _lemmatize(texts):
    """Return each text rewritten as its space-joined spaCy lemmas.

    Args:
        texts: list of strings to run through the ``nlp`` pipeline.

    Returns:
        list[str] with one lemmatized string per input, in the same order.
    """
    # nlp.pipe processes the texts as a stream/batches instead of paying the
    # per-call overhead of nlp(text) once for every row.
    return [' '.join(token.lemma_ for token in doc) for doc in nlp.pipe(texts)]


def extract_features(df):
    """Return a copy of ``df`` with the engineered feature columns added.

    Args:
        df: DataFrame with at least the columns 'text', 'location' and
            'keyword'. It is not modified.

    Returns:
        A new DataFrame with these extra columns:
        clean_text, word_count, char_count, stopwords_count, hashtag_count,
        mention_count, url_count, avg_word_length, uppercase_ratio,
        punct_count, number_count, emoji_count, has_location, valid_location
        and keyword_lemma.
    """
    df = df.copy()

    # Lower-cased, lemmatized text
    df['clean_text'] = _lemmatize(df['text'].str.lower().tolist())

    # Statistical features
    df['word_count'] = df['text'].str.split().str.len()
    df['char_count'] = df['text'].str.len()
    df['stopwords_count'] = df['text'].str.lower().str.split().apply(lambda x: sum(w in STOP_WORDS for w in x))
    df['hashtag_count'] = df['text'].str.count(r'#\w+')
    df['mention_count'] = df['text'].str.count(r'@\w+')
    df['url_count'] = df['text'].str.count(r'http\S+|www\S+')

    # Mean token length; 0 when the text has no whitespace-separated tokens
    word_lengths = df['text'].str.split().apply(lambda x: np.mean([len(w) for w in x]) if len(x) > 0 else 0)
    df['avg_word_length'] = word_lengths

    # Share of A-Z characters; np.where guards the empty-text case
    char_lengths = df['text'].str.len()
    uppercase_counts = df['text'].str.count(r'[A-Z]')
    df['uppercase_ratio'] = np.where(char_lengths > 0, uppercase_counts / char_lengths, 0)

    df['punct_count'] = df['text'].str.count(r'[^\w\s]')
    df['number_count'] = df['text'].str.count(r'\d+')
    # Number of emoji in the text. Testing each character for membership in
    # demojize(text) does not work: demojize leaves ordinary characters in
    # place, so that test counts the non-emoji characters.
    df['emoji_count'] = df['text'].apply(emoji_count)

    # Location
    has_location = df['location'].notna().to_numpy()
    df['has_location'] = df['location'].notna().astype(int)

    # A location is "valid" when spaCy finds at least one GPE entity in it.
    # Only non-null locations are sent through the pipeline; the rest stay 0.
    valid_location = np.zeros(len(df), dtype=int)
    known_locations = df.loc[has_location, 'location'].astype(str).tolist()
    valid_location[has_location] = [
        any(ent.label_ == 'GPE' for ent in doc.ents)
        for doc in nlp.pipe(known_locations)
    ]
    df['valid_location'] = valid_location

    # Lemmatized keywords ('none' stands in for a missing keyword). Each
    # distinct keyword is lemmatized once and the result mapped back to rows.
    keywords = df['keyword'].fillna('none').astype(str)
    unique_keywords = keywords.unique().tolist()
    lemma_by_keyword = dict(zip(unique_keywords, _lemmatize(unique_keywords)))
    df['keyword_lemma'] = keywords.map(lemma_by_keyword)

    return df

print("Extrayendo features de train...")
df = extract_features(df)
print("Extrayendo features de test...")
test_df = extract_features(test_df)

In [None]:
# Text pipeline: word TF-IDF + char TF-IDF, reduced to dense features with SVD
word_tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=2
)

char_tfidf = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 5),
    max_features=3000,
    min_df=2
)

text_pipeline = Pipeline([
    ('union', FeatureUnion([
        ('word', word_tfidf),
        ('char', char_tfidf),
    ])),
    ('svd', TruncatedSVD(n_components=300, random_state=SEED))
])

# Fit the vocabulary, IDF weights and SVD basis on the training rows only, so
# the rows later held out for evaluation do not shape the feature extractor.
# This reproduces the train/hold-out split made in the training cell: same
# arguments and seed give the same indices, so keep the two calls in sync.
text_fit_idx, _ = train_test_split(
    np.arange(len(df)),
    test_size=0.2,
    random_state=SEED,
    stratify=df['target'].values
)

print("Generando features densas con SVD...")
text_pipeline.fit(df['clean_text'].iloc[text_fit_idx])

# Transform every labelled row (so the array can still be indexed by row
# position downstream) and the submission set with the fitted pipeline.
X_text_dense = text_pipeline.transform(df['clean_text'])
X_text_dense_test = text_pipeline.transform(test_df['clean_text'])

print(f'Shape de texto denso (SVD): {X_text_dense.shape}')

## Entrenamiento del modelo

In [None]:
# Numeric feature columns taken from the engineered DataFrames
feature_cols = [
    'word_count', 'char_count', 'stopwords_count', 'hashtag_count', 'mention_count',
    'url_count', 'avg_word_length', 'uppercase_ratio', 'punct_count', 'number_count',
    'emoji_count', 'has_location', 'valid_location',
]

X_numeric = df[feature_cols].to_numpy()
X_numeric_test = test_df[feature_cols].to_numpy()

y = df['target'].to_numpy()

# Stratified 80/20 split over row positions (train / hold-out)
X_train_idx, X_test_idx = train_test_split(
    np.arange(len(df)),
    test_size=0.2,
    random_state=SEED,
    stratify=y,
)

# Mean encoding of the keyword, learned from the training rows only
train_data, test_data = df.iloc[X_train_idx], df.iloc[X_test_idx]

global_mean = train_data['target'].mean()
keyword_means = train_data.groupby('keyword_lemma')['target'].mean()


def _encode_keyword(frame):
    """Map keyword_lemma to its training-set target mean as an (n, 1) column.

    Keywords with no entry in keyword_means fall back to global_mean.
    """
    encoded = frame['keyword_lemma'].map(keyword_means).fillna(global_mean)
    return encoded.to_numpy().reshape(-1, 1)


train_keyword_encoded = _encode_keyword(train_data)
test_keyword_encoded = _encode_keyword(test_data)
submission_keyword_encoded = _encode_keyword(test_df)


def _assemble(numeric, keyword, text):
    """Concatenate the numeric, keyword and text blocks column-wise."""
    return np.concatenate([numeric, keyword, text], axis=1)


# Final design matrices: train split, hold-out split, submission set
X_train = _assemble(X_numeric[X_train_idx], train_keyword_encoded, X_text_dense[X_train_idx])
X_test = _assemble(X_numeric[X_test_idx], test_keyword_encoded, X_text_dense[X_test_idx])
X_submission = _assemble(X_numeric_test, submission_keyword_encoded, X_text_dense_test)

y_train, y_test = y[X_train_idx], y[X_test_idx]

print(f'Shape de X_train: {X_train.shape}')

# Random Forest hyper-parameter search space
param_distributions = dict(
    n_estimators=[200, 300, 500],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

rf = RandomForestClassifier(random_state=SEED, n_jobs=-1)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# NOTE(review): the keyword encoding above is computed once from all of
# train_data, so it is not re-estimated per CV fold — the CV score may be
# optimistic; confirm against the hold-out split.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_distributions,
    n_iter=20,
    scoring=make_scorer(f1_score),
    cv=skf,
    n_jobs=-1,
    verbose=1,
    random_state=SEED,
)
random_search.fit(X_train, y_train)

print(f'Mejores parámetros: {random_search.best_params_}')
print(f'Mejor F1-Score (CV): {random_search.best_score_:.4f}')

## Análisis Final

In [None]:
# Estimator selected by the search. RandomizedSearchCV refits the best
# configuration on all of X_train (refit=True is its default).
best_model = random_search.best_estimator_

# Predicted probability of class 1 on the hold-out split
y_probs_test = best_model.predict_proba(X_test)[:, 1]

# Pick the decision threshold with the highest hold-out F1.
# NOTE: the threshold is tuned on the same rows it is then reported on, so the
# F1 printed here is an optimistic estimate.
threshold_grid = np.linspace(0.10, 0.90, 81)
f1_by_threshold = [
    f1_score(y_test, (y_probs_test > threshold).astype(int))
    for threshold in threshold_grid
]
best_thresh = float(threshold_grid[int(np.argmax(f1_by_threshold))])

# Hold-out predictions and confusion matrix at the chosen threshold
y_pred_test = (y_probs_test > best_thresh).astype(int)
cm = confusion_matrix(y_test, y_pred_test)

print(f'Mejor threshold: {best_thresh:.2f} | F1 (hold-out): {max(f1_by_threshold):.4f}')

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Reds', cbar=False, ax=ax)
ax.set_title(f'Confusion Matrix (Threshold = {best_thresh:.2f})')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

## Submission

In [None]:
# Build the submission file.
# NOTE(review): best_model and best_thresh are not assigned in this cell; they
# must already exist in the kernel when it runs — confirm where they are set.
y_probs_sub = best_model.predict_proba(X_submission)[:, 1]

# Class 1 when the probability is strictly above the threshold
y_pred_sub = np.where(y_probs_sub > best_thresh, 1, 0)

submission = test_df[['id']].assign(target=y_pred_sub)

# Create the output directory if needed, then write the CSV without the index
submission_path = pathlib.Path("../.data/submission")
submission_path.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path / "random_forest_submission.csv", index=False)

print(f"Submission guardada en {submission_path / 'random_forest_submission.csv'}")