# Parte 3: Machine Learning

In [None]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.preprocessing import TargetEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

SEED = 42
np.random.seed(SEED)


In [None]:
train_path = 'data/train.csv'
df = pd.read_csv(train_path)

## 1) Feature engineering (común a ambos modelos)

In [None]:
# Limpieza y preprocesamiento básico de texto
def clean_text(s):
    if pd.isna(s):
        return ''
    s = str(s)
    s = s.lower()
    s = re.sub(r'http\S+', ' ', s)
    s = re.sub(r'www\S+', ' ', s)
    s = re.sub(r'[^\w\s#@]', ' ', s)
    s = re.sub(r'[\s_]+', ' ', s).strip()
    return s

df['text_clean'] = df['text'].apply(clean_text)
df['keyword'] = df['keyword'].fillna('no_keyword_contained')
df['location'] = df['location'].apply(clean_text)
df['location'] = df['location'].fillna('no_location_contained')

# Creación de features numéricas adicionales
df['word_count'] = df['text_clean'].apply(lambda s: len(s.split()))
df['text_len'] = df['text_clean'].apply(lambda s: sum(len(w) for w in s.split()))
df['mean_word_len'] = df.apply(lambda row: row['text_len'] / row['word_count'] if row['word_count'] > 0 else 0, axis=1)
df['num_hashtags'] = df['text'].apply(lambda s: 0 if pd.isna(s) else s.count('#'))
df['num_mentions'] = df['text'].apply(lambda s: 0 if pd.isna(s) else s.count('@'))

# Creación de features booleanas adicionales
df['has_url'] = df['text'].apply(lambda s: 0 if pd.isna(s) else (1 if 'http' in s or 'www.' in s else 0))
df['has_hashtag'] = df['num_hashtags'].apply(lambda x: 1 if x > 0 else 0)
df['has_mention'] = df['num_mentions'].apply(lambda x: 1 if x > 0 else 0)
df['location_mentioned'] = df.apply(lambda row: 1 if row['location'].lower() in row['text_clean'] else 0, axis=1)

disaster_terms = df['keyword'].dropna().unique().tolist()
def count_terms(s, terms=disaster_terms):
    s = s.lower()
    cnt = 0
    for t in terms:
        if t in s:
            cnt += 1
    return cnt

# Feature engineering adicional 
df['disaster_terms_count'] = df['text_clean'].apply(count_terms)
df['all_caps_count'] = df['text'].apply(lambda s: sum(1 for w in str(s).split() if w.isupper())).fillna(0)

analyzer = SentimentIntensityAnalyzer()

def vader_scores(text):
    if not isinstance(text, str) or text.strip() == "":
        return {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
    return analyzer.polarity_scores(text)

scores = [vader_scores(t) for t in df['text'].astype(str).tolist()]
scores_df = pd.DataFrame(scores)

df = pd.concat([df.reset_index(drop=True), scores_df.reset_index(drop=True)], axis=1)

In [49]:
location_appearings = df['location'].value_counts().to_dict()
df['location_clean'] = df['location'].map(lambda loc: loc if location_appearings[loc] > 2 else "other")

keyword_counts = df['keyword'].value_counts()
kw_Q1 = keyword_counts.quantile(0.25)
low_frequency_keywords = keyword_counts[keyword_counts < kw_Q1].to_dict()
df['keyword_clean'] = df['keyword'].map(lambda kw: kw if kw not in low_frequency_keywords else "other")

In [None]:
num_features = ['text_len', 'word_count', 'mean_word_len',
                'num_hashtags', 'num_mentions', 'disaster_terms_count',
                'all_caps_count', 'neg', 'neu', 'pos', 'compound']
cat_features = ['location_clean', 'keyword_clean']
bool_features = ['has_url', 'has_hashtag', 'has_mention', 'location_mentioned']
emb_features = ['text_clean']
X = df[num_features + cat_features + bool_features + emb_features]
y = df['target'].astype(int).values

X_train, X_val, y_train, y_val = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=SEED
)

In [None]:
loc_encoding = TargetEncoder(random_state=SEED)
loc_encoding.fit(X_train[['location_clean']], y_train)
X_train_loc_enc = loc_encoding.transform(X_train[['location_clean']])
X_val_loc_enc = loc_encoding.transform(X_val[['location_clean']])

kyw_encoding = TargetEncoder(random_state=SEED)
kyw_encoding.fit(X_train[['keyword_clean']], y_train)
X_train_kyw_enc = kyw_encoding.transform(X_train[['keyword_clean']])
X_val_kyw_enc = kyw_encoding.transform(X_val[['keyword_clean']])

X_train['location_enc'] = X_train_loc_enc
X_train['keyword_enc'] = X_train_kyw_enc
X_val['location_enc'] = X_val_loc_enc
X_val['keyword_enc'] = X_val_kyw_enc
num_features += ['location_enc', 'keyword_enc']


preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_features),
    ('bool', 'passthrough', bool_features)
], remainder='drop')

X_train_proc = preprocessor.fit_transform(X_train[num_features + bool_features])
X_val_proc = preprocessor.transform(X_val[num_features + bool_features])


## Primer modelo: KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()

### KNN sin escalar features numéricas (y sin embeddings)

In [84]:
param_grid = {
    'n_neighbors': [1,5,7,8,9,10,20,30,50,100,150],
    'weights': ['uniform', 'distance'],
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
grid = GridSearchCV(knn, param_grid, scoring='f1', cv=cv, n_jobs=-1, verbose=1)

grid.fit(X_train[num_features + bool_features], y_train)

print("Mejores parámetros:", grid.best_params_)
print("Mejor puntaje F1 en Cross Validation:", grid.best_score_)

best_knn = grid.best_estimator_

y_val_pred = best_knn.predict(X_val[num_features + bool_features])
print("F1 en validación:", f1_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))

Fitting 5 folds for each of 22 candidates, totalling 110 fits
Mejores parámetros: {'n_neighbors': 8, 'weights': 'distance'}
Mejor puntaje F1 en Cross Validation: 0.615239877661397
F1 en validación: 0.6285714285714286
              precision    recall  f1-score   support

           0       0.72      0.76      0.74       869
           1       0.65      0.61      0.63       654

    accuracy                           0.69      1523
   macro avg       0.69      0.68      0.68      1523
weighted avg       0.69      0.69      0.69      1523



### KNN Escalando features numéricas

In [89]:
param_grid = {
    'n_neighbors': [50,100,150,175,178,180,181,182,183,185,200,500,1000],
    'weights': ['uniform', 'distance'],
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
grid = GridSearchCV(knn, param_grid, scoring='f1', cv=cv, n_jobs=-1, verbose=1)

grid.fit(X_train_proc, y_train)

print("Mejores parámetros:", grid.best_params_)
print("Mejor puntaje F1 en Cross Validation:", grid.best_score_)

best_knn = grid.best_estimator_

y_val_pred = best_knn.predict(X_val_proc)
print("F1 en validación:", f1_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))

Fitting 5 folds for each of 26 candidates, totalling 130 fits
Mejores parámetros: {'n_neighbors': 181, 'weights': 'distance'}
Mejor puntaje F1 en Cross Validation: 0.7056738974906606
F1 en validación: 0.7116666666666667
              precision    recall  f1-score   support

           0       0.77      0.86      0.81       869
           1       0.78      0.65      0.71       654

    accuracy                           0.77      1523
   macro avg       0.77      0.76      0.76      1523
weighted avg       0.77      0.77      0.77      1523



### KNN Escalando features numéricas + TFIDF

In [None]:
vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
vectorizer.fit(X_train['text_clean'])
X_train_tfidf = vectorizer.transform(X_train['text_clean']).toarray()
X_val_tfidf = vectorizer.transform(X_val['text_clean']).toarray()

In [None]:
param_grid = {
    'n_neighbors': [1,5,10,13,14,15,16,17,18,19,20,21,22,23,25,30,50,100,500],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'cosine', 'minkowski'],
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
grid = GridSearchCV(knn, param_grid, scoring='f1', cv=cv, n_jobs=-1, verbose=1)

grid.fit(np.hstack((X_train_proc, X_train_tfidf)), y_train)

print("Mejores parámetros:", grid.best_params_)
print("Mejor puntaje F1 en Cross Validation:", grid.best_score_)

best_knn = grid.best_estimator_

y_val_pred = best_knn.predict(np.hstack((X_val_proc, X_val_tfidf)))
print("F1 en validación:", f1_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))

Fitting 5 folds for each of 168 candidates, totalling 840 fits
Mejores parámetros: {'metric': 'cosine', 'n_neighbors': 22, 'weights': 'distance'}
Mejor puntaje F1 en Cross Validation: 0.7154319868702279
F1 en validación: 0.7125506072874493
              precision    recall  f1-score   support

           0       0.77      0.84      0.80       869
           1       0.76      0.67      0.71       654

    accuracy                           0.77      1523
   macro avg       0.77      0.76      0.76      1523
weighted avg       0.77      0.77      0.76      1523



### KNN Escalando features numéricas + Word2Vec preentrenado (300 dimensiones)

In [92]:
from gensim.models import KeyedVectors
w2v_path = "GoogleNews-vectors-negative300.bin"
w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)

In [None]:
token_pattern = re.compile(r"\w+")
def tokenize(text):
    return token_pattern.findall(text)

def sentence_vector_avg(tokens, model, dim):
    if not tokens:
        return np.zeros(dim, dtype=float)
    vecs = []
    for w in tokens:
        if w in model:
            vecs.append(model[w])
        elif w.lower() in model:
            vecs.append(model[w.lower()])
    if len(vecs) == 0:
        return np.zeros(dim, dtype=float)
    return np.mean(vecs, axis=0)

X_train_tokenized =  [tokenize(t) for t in X_train['text_clean'].astype(str).tolist()]
X_val_tokenized =  [tokenize(t) for t in X_val['text_clean'].astype(str).tolist()]

vecs = []
labels = []
for i in range(len(X_train_tokenized)):
    tokens = X_train_tokenized[i]
    v = sentence_vector_avg(tokens, w2v, dim=w2v.vector_size)
    vecs.append(v)
    labels.append(int(df.iloc[i]['target']))

w2v_emb = np.vstack(vecs)
w2v_emb.shape

(7613, 300)

In [None]:
param_grid = {
    'n_neighbors': [1,5,10,13,14,15,16,17,18,19,20,21,22,23,25,30,50,100,500],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'cosine', 'minkowski'],
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
grid = GridSearchCV(knn, param_grid, scoring='f1', cv=cv, n_jobs=-1, verbose=1)

grid.fit(np.hstack((X_train_proc, X_train_w2v)), y_train)

print("Mejores parámetros:", grid.best_params_)
print("Mejor puntaje F1 en Cross Validation:", grid.best_score_)

best_knn = grid.best_estimator_

y_val_pred = best_knn.predict(np.hstack((X_val_proc, X_val_w2v)))
print("F1 en validación:", f1_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))