## Imports

In [None]:
# Standard
import pandas as pd
import numpy as np
import pathlib
import matplotlib.pyplot as plt
import seaborn as sns

# ML
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, make_scorer, roc_curve, auc, confusion_matrix
from sklearn.preprocessing import StandardScaler
from emoji import demojize
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.cli import download

# Load the small English spaCy model into the module-level `nlp` object.
# If loading raises OSError (presumably because the model package is not
# installed), download it through spaCy's CLI helper and load it again.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

## Constantes

In [None]:
# Hex colours for plots.
# COLOR_NO_DISASTER is not referenced in the cells visible here; by its name it
# is the colour for the negative class.
COLOR_NO_DISASTER = '#3498db'
# Used for the ROC curve line.
COLOR_DISASTER = '#e74c3c'
# Used for the chance-level diagonal of the ROC plot.
COLOR_GENERAL = '#95a5a6'

# Seed passed as `random_state` to sampling, splitting, SVD, CV and the model.
SEED = 42

## Datos

In [None]:
# Directory (relative to the notebook) that holds the raw CSV files.
data_path = pathlib.Path("../.data/raw")

# Read the training table into `df` and the test table into `test_df`.
df, test_df = (
    pd.read_csv(data_path.joinpath(csv_name))
    for csv_name in ("train.csv", "test.csv")
)

In [None]:
# Mean of the `target` column (reported below as a percentage).
target_mean = df['target'].mean()

# Report the table shape and the target percentage, one per line.
print(
    f'Shape del dataset: {df.shape}',
    f'Porcentaje de desastres en el target: {target_mean*100:.2f}%',
    sep='\n',
)

# Last expression of the cell: a reproducible 5-row sample for display.
df.sample(5, random_state=SEED)

## Feature Engineering

In [None]:
def _lemmatize(text):
    """Return the space-joined spaCy lemmas of ``text`` (uses the module-level ``nlp``)."""
    return ' '.join([token.lemma_ for token in nlp(text)])


def extract_features(df):
    """Return a copy of ``df`` with text-derived feature columns added.

    The input frame is not modified. It must provide the columns ``text``,
    ``location`` and ``keyword``.

    Columns added, in order: ``clean_text`` (lower-cased, lemmatised text),
    ``word_count``, ``char_count``, ``stopwords_count``, ``hashtag_count``,
    ``mention_count``, ``url_count``, ``avg_word_length``,
    ``uppercase_ratio``, ``punct_count``, ``number_count``, ``emoji_count``,
    ``has_location`` (1 when ``location`` is not null) and ``keyword_lemma``
    (lemmatised keyword, ``'none'`` when the keyword is missing).

    Args:
        df: DataFrame with ``text``, ``location`` and ``keyword`` columns.

    Returns:
        A new DataFrame containing the original columns plus the features.
    """
    # Imported here because the notebook's import cell only pulls `demojize`
    # from the emoji package.
    from emoji import emoji_count as count_emojis

    df = df.copy()

    # Shared intermediates, computed once instead of once per feature.
    text = df['text']
    words = text.str.split()
    char_lengths = text.str.len()

    # Clean, lemmatised text
    df['clean_text'] = text.str.lower().apply(_lemmatize)

    # Statistical features
    df['word_count'] = words.str.len()
    df['char_count'] = char_lengths
    df['stopwords_count'] = text.str.lower().str.split().apply(
        lambda ws: sum(w in STOP_WORDS for w in ws)
    )
    df['hashtag_count'] = text.str.count(r'#\w+')
    df['mention_count'] = text.str.count(r'@\w+')
    df['url_count'] = text.str.count(r'http\S+|www\S+')

    # Mean token length; 0 for texts without any whitespace-separated token.
    df['avg_word_length'] = words.apply(
        lambda ws: np.mean([len(w) for w in ws]) if len(ws) > 0 else 0
    )

    # Share of ASCII upper-case letters; np.where guards against empty texts.
    uppercase_counts = text.str.count(r'[A-Z]')
    df['uppercase_ratio'] = np.where(char_lengths > 0, uppercase_counts / char_lengths, 0)

    df['punct_count'] = text.str.count(r'[^\w\s]')
    df['number_count'] = text.str.count(r'\d+')

    # Number of emoji in the text. The previous expression counted every
    # character that survived `demojize`, i.e. the NON-emoji characters, and
    # re-ran `demojize` once per character.
    df['emoji_count'] = text.apply(count_emojis)

    # Location
    df['has_location'] = df['location'].notna().astype(int)

    # Lemmatised keywords: each distinct keyword goes through spaCy once and
    # the result is mapped back onto the rows.
    keywords = df['keyword'].fillna('none').astype(str)
    lemma_by_keyword = {kw: _lemmatize(kw) for kw in keywords.unique()}
    df['keyword_lemma'] = keywords.map(lemma_by_keyword)

    return df

# Replace both tables with their feature-enriched copies. Each call runs the
# spaCy model over every row, so progress messages are printed beforehand.
print("Extrayendo features de train...")
df = extract_features(df)
print("Extrayendo features de test...")
test_df = extract_features(test_df)

In [None]:
# Text pipeline: word-level TF-IDF + character-level TF-IDF, reduced with SVD

# Word unigrams and bigrams, limited to 3000 features seen in >= 2 documents.
word_tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_features=3000)

# Character 3- to 5-grams, limited to 2000 features seen in >= 2 documents.
char_tfidf = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), min_df=2, max_features=2000)

# Concatenate both TF-IDF blocks, then project them onto 200 SVD components.
text_pipeline = Pipeline(steps=[
    ('union', FeatureUnion(transformer_list=[('word', word_tfidf), ('char', char_tfidf)])),
    ('svd', TruncatedSVD(n_components=200, random_state=SEED)),
])

print("Generando features densas con SVD...")
# NOTE(review): the pipeline is fitted on every row of `df`, i.e. before any
# train/hold-out split made in later cells — confirm this is intended.
X_text_dense = text_pipeline.fit_transform(df['clean_text'])
# The test table is only transformed with the already-fitted pipeline.
X_text_dense_test = text_pipeline.transform(test_df['clean_text'])

print(f'Shape de texto denso (SVD): {X_text_dense.shape}')

## Entrenamiento del modelo

In [None]:
# Hand-crafted numeric features produced by extract_features
numeric_cols = ['word_count', 'char_count', 'stopwords_count', 'hashtag_count', 'mention_count',
                'url_count', 'avg_word_length', 'uppercase_ratio', 'punct_count', 'number_count',
                'emoji_count', 'has_location']

y = df['target'].values

# Pick the hold-out rows BEFORE learning any statistic from the data, so the
# scaler moments and the keyword target means below never see hold-out rows
# or labels. Same test_size, seed and stratification as splitting the
# feature matrix directly.
row_positions = np.arange(len(df))
train_idx, test_idx = train_test_split(
    row_positions, test_size=0.2, random_state=SEED, stratify=y
)

# Scale numeric features (moments estimated on the training rows only)
numeric_values = df[numeric_cols].values
scaler = StandardScaler()
scaler.fit(numeric_values[train_idx])
X_numeric = scaler.transform(numeric_values)
X_numeric_test = scaler.transform(test_df[numeric_cols].values)

# Mean (target) encoding for keywords, learnt from the training rows only.
# Keywords unseen in the training rows fall back to the training target mean.
# NOTE: training rows are still encoded with means that include their own
# label, so the cross-validated score below remains somewhat optimistic.
train_rows = df.iloc[train_idx]
keyword_means = train_rows.groupby('keyword_lemma')['target'].mean()
global_mean = train_rows['target'].mean()

train_keyword_encoded = df['keyword_lemma'].map(keyword_means).fillna(global_mean).values.reshape(-1, 1)
test_keyword_encoded = test_df['keyword_lemma'].map(keyword_means).fillna(global_mean).values.reshape(-1, 1)

# Combine features. Every part is a dense array (SVD output, scaled numerics,
# encoded keyword), so a dense horizontal stack is used instead of a sparse one.
X = np.hstack([X_text_dense, X_numeric, train_keyword_encoded])
X_submission = np.hstack([X_text_dense_test, X_numeric_test, test_keyword_encoded])

# Train / hold-out split using the row positions chosen above
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

print(f'Shape de X_train: {X_train.shape}')

# Model with randomized hyper-parameter search
param_distributions = {
    'C': [0.01, 0.1, 1.0, 10.0],
    'solver': ['liblinear', 'lbfgs'],
    'max_iter': [1000, 2000]
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
lr = LogisticRegression(random_state=SEED)

# 10 sampled configurations, scored by F1 over 5 stratified folds.
random_search = RandomizedSearchCV(
    lr,
    param_distributions,
    n_iter=10,
    cv=skf,
    scoring=make_scorer(f1_score),
    n_jobs=-1,
    verbose=1,
    random_state=SEED
)
random_search.fit(X_train, y_train)

print(f'Mejores parámetros: {random_search.best_params_}')
print(f'Mejor F1-Score (CV): {random_search.best_score_:.4f}')

## Evaluación y Visualización

In [None]:
best_model = random_search.best_estimator_

# 1. ROC curve on the hold-out split
y_probs = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)

# Best threshold: the ROC point maximising Youden's J = TPR - FPR.
# NOTE: the threshold is tuned on the same hold-out rows it is evaluated on,
# so the confusion matrix below is an optimistic estimate.
J = tpr - fpr
ix = np.argmax(J)
best_thresh = thresholds[ix]
print(f'Best Threshold: {best_thresh:.4f}')

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color=COLOR_DISASTER, lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color=COLOR_GENERAL, lw=2, linestyle='--')
plt.scatter(fpr[ix], tpr[ix], marker='o', color='black', label=f'Best Threshold = {best_thresh:.2f}')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

# 2. Confusion matrix at the selected threshold.
# roc_curve computes each (fpr, tpr) point by predicting positive when
# score >= threshold, so `>=` is needed for the matrix to match the marked
# ROC point (a strict `>` drops the sample sitting exactly on the threshold).
y_pred_opt = (y_probs >= best_thresh).astype(int)
cm = confusion_matrix(y_test, y_pred_opt)

plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title(f'Confusion Matrix (Threshold = {best_thresh:.2f})')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# 3. Feature importance (top 20 coefficients by absolute value)
# The model was trained on the SVD components, not on the raw TF-IDF terms,
# so the text columns are named after their component index. Column order
# mirrors the stacking order used to build X: text, numeric, keyword.
svd_cols = [f'svd_{i}' for i in range(X_text_dense.shape[1])]
all_features = svd_cols + numeric_cols + ['keyword_encoded']

coefs = best_model.coef_[0]
if len(all_features) != len(coefs):
    raise ValueError(
        f'Feature name count ({len(all_features)}) does not match '
        f'coefficient count ({len(coefs)})'
    )
indices = np.argsort(np.abs(coefs))[::-1][:20]

plt.figure(figsize=(10, 6))
sns.barplot(x=coefs[indices], y=np.array(all_features)[indices], palette="viridis")
plt.title("Top 20 Feature Importance (Logistic Regression Coefficients)")
plt.xlabel("Coefficient Value")
plt.show()

## Submission

In [None]:
# Generate the submission file
y_probs_sub = best_model.predict_proba(X_submission)[:, 1]
# `>=` matches how roc_curve defines its thresholds (positive when
# score >= threshold), which is where `best_thresh` comes from.
y_pred_sub = (y_probs_sub >= best_thresh).astype(int)

submission = pd.DataFrame({'id': test_df['id'], 'target': y_pred_sub})

# Create the output directory if needed and build the file path once, so the
# written file and the printed message cannot drift apart.
submission_path = pathlib.Path("../.data/submission")
submission_path.mkdir(parents=True, exist_ok=True)
submission_file = submission_path / "linear_regression_final_submission.csv"
submission.to_csv(submission_file, index=False)

print(f"Submission guardada en {submission_file}")