# Tema 4: Redes neuronales recurrentes para detección de noticias falsas

## Ejercicio 1 - RNN
Entrenar RNNs (SimpleRNN) utilizando Keras de TensorFlow para detectar noticias falsas.

### Apartado a
Importar librerías y cargar modelos.

In [None]:
import numpy as np
import pandas as pd
import spacy
from pathlib import Path

from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tqdm import tqdm

# Data and model folders are siblings of the current working directory.
PATH_DATA = Path.cwd().parent.joinpath('data')
PATH_MODELS = Path.cwd().parent.joinpath('models')

In [None]:
# spaCy pipeline used by the indexing helpers to tokenise text.
nlp = spacy.load('es_core_news_sm')

# Pre-trained word vectors stored in the plain-text word2vec format.
w2v = KeyedVectors.load_word2vec_format(
    str(PATH_MODELS.joinpath('SBW-vectors-300-min5.txt')),
    binary=False,
)

### Apartado b
Cargar y explorar el dataset.

In [None]:
# Read the training spreadsheet and preview its first rows.
df = pd.read_excel(
    str(PATH_DATA.joinpath('train.xlsx')),
    engine="openpyxl",
)

df.head()

In [None]:
# Number of rows per class label.
df.loc[:, 'Category'].value_counts()

### Apartado c
Análisis rápido de longitud de textos.

In [None]:
# Summary statistics of the number of whitespace-separated words per headline.
# The .str accessor leaves missing cells as NaN (ignored by describe) instead
# of raising AttributeError, which a plain x.split() does on a NaN value.
(df['Headline']
 .str.split()
 .str.len()
 .describe())

In [None]:
# Summary statistics of the number of whitespace-separated words per article.
# The .str accessor leaves missing cells as NaN (ignored by describe) instead
# of raising AttributeError, which a plain x.split() does on a NaN value.
(df['Text']
 .str.split()
 .str.len()
 .describe())

In [None]:
# Raw inputs as plain Python lists.
texts = df['Text'].tolist()
headlines = df['Headline'].tolist()

# Binary target: 1 for rows whose category is 'Fake', 0 for everything else.
labels = np.where(df['Category'] == 'Fake', 1, 0)

len(headlines), len(texts), len(labels)

### Apartado d
Funciones de indexación y transformación.

In [None]:
def build_indexer(corpus, vocab_size):
    """Build a token -> integer index mapping from a corpus.

    Every sentence is tokenised with the module-level spaCy pipeline ``nlp``.
    Tokens are lower-cased and receive consecutive indices starting at 1, in
    order of first appearance. Index 0 is never assigned here.

    Args:
        corpus: Iterable of strings to scan.
        vocab_size: Exclusive upper bound for assigned indices, so at most
            ``vocab_size - 1`` tokens are indexed.

    Returns:
        dict mapping each indexed lower-cased token to its integer index.
    """
    token_to_index = {}
    next_index = 1

    for sentence in tqdm(corpus):
        # Once the vocabulary is full no later sentence can add a token, so
        # stop instead of running the spaCy pipeline on the rest of the corpus.
        if next_index >= vocab_size:
            break
        for token in nlp(sentence):
            clean_token = token.text.lower()
            if clean_token in token_to_index:
                continue
            token_to_index[clean_token] = next_index
            next_index += 1
            if next_index >= vocab_size:
                break

    print("\nVocabulario (", len(token_to_index), "):")
    print(token_to_index)

    return token_to_index


def transform_text(corpus, token_to_index, max_length=10):
    """Encode sentences as fixed-length rows of token indices.

    Each sentence is tokenised with the module-level spaCy pipeline ``nlp``
    and every lower-cased token is replaced by its index in
    ``token_to_index``; tokens missing from the mapping become 0. The rows are
    then padded or truncated at the end to ``max_length`` with
    ``pad_sequences``.

    Args:
        corpus: Iterable of strings to encode.
        token_to_index: Mapping from lower-cased token to integer index.
        max_length: Length of every output row.

    Returns:
        The array produced by ``pad_sequences``, one row per sentence.
    """
    encoded_sentences = [
        [token_to_index.get(token.text.lower(), 0) for token in nlp(sentence)]
        for sentence in tqdm(corpus)
    ]

    prepared_sentences = pad_sequences(
        encoded_sentences,
        maxlen=max_length,
        padding='post',
        truncating='post',
    )
    print("Oraciones procesadas(", len(prepared_sentences), "):")
    print(prepared_sentences)

    return prepared_sentences

In [None]:
def build_embedding_matrix(w2v_model, vocab, emb_dim):
    """Build an embedding matrix with one row per vocabulary index.

    Row ``idx`` holds the pre-trained vector of the token mapped to ``idx``
    when ``w2v_model`` contains that token, and a vector drawn from a normal
    distribution (standard deviation 0.6) otherwise. Rows whose index is not
    used by ``vocab`` (including row 0 when indices start at 1) stay all-zero.

    Args:
        w2v_model: Object supporting ``token in w2v_model`` and
            ``w2v_model[token]`` returning a vector of length ``emb_dim``.
        vocab: Mapping from token to non-negative integer row index.
        emb_dim: Number of columns of the matrix.

    Returns:
        ``numpy.ndarray`` of shape ``(rows, emb_dim)``, where ``rows`` is large
        enough to hold every index in ``vocab`` and never smaller than
        ``len(vocab) + 1``.
    """
    # Size by the largest index actually used so a vocabulary with gaps in its
    # numbering cannot index past the end of the matrix; for contiguous
    # 1..len(vocab) indices this equals the previous len(vocab) + 1.
    highest_index = max(vocab.values(), default=0)
    n_rows = max(len(vocab), highest_index) + 1

    embedding_matrix = np.zeros((n_rows, emb_dim))
    for token, idx in vocab.items():
        if token in w2v_model:
            embedding_matrix[idx] = w2v_model[token]
        else:
            # Out-of-vocabulary token: start from random noise.
            embedding_matrix[idx] = np.random.normal(scale=0.6, size=(emb_dim,))
    return embedding_matrix

### Apartado e
Preparar datos: separación train/validation/test.

In [None]:
vocab_size_headlines = 5000

# First split keeps 70 % for training; the held-out 30 % is then halved into
# validation and test sets.
X_train_headlines, X_val_headlines, y_train_headlines, y_val_headlines = train_test_split(
    headlines, labels, test_size=0.3, random_state=42,
)
X_val_headlines, X_test_headlines, y_val_headlines, y_test_headlines = train_test_split(
    X_val_headlines, y_val_headlines, test_size=0.5, random_state=42,
)

# The vocabulary is built from the training split only.
token_to_index_headlines = build_indexer(corpus=X_train_headlines, vocab_size=vocab_size_headlines)

print(*map(len, (X_train_headlines, X_val_headlines, X_test_headlines)))
print(*map(len, (y_train_headlines, y_val_headlines, y_test_headlines)))

In [None]:
vocab_size_texts = 20000

# First split keeps 70 % for training; the held-out 30 % is then halved into
# validation and test sets.
X_train_texts, X_val_texts, y_train_texts, y_val_texts = train_test_split(
    texts, labels, test_size=0.3, random_state=42,
)
X_val_texts, X_test_texts, y_val_texts, y_test_texts = train_test_split(
    X_val_texts, y_val_texts, test_size=0.5, random_state=42,
)

# The vocabulary is built from the training split only.
token_to_index_texts = build_indexer(corpus=X_train_texts, vocab_size=vocab_size_texts)

print(*map(len, (X_train_texts, X_val_texts, X_test_texts)))
print(*map(len, (y_train_texts, y_val_texts, y_test_texts)))

### Apartado f
Obtener matrices de índices.

In [None]:
max_length_headlines = 15

# Replace each split of raw headlines by its matrix of token indices, using
# the vocabulary built from the training headlines.
X_train_headlines = transform_text(
    X_train_headlines, token_to_index_headlines, max_length=max_length_headlines)
X_val_headlines = transform_text(
    X_val_headlines, token_to_index_headlines, max_length=max_length_headlines)
X_test_headlines = transform_text(
    X_test_headlines, token_to_index_headlines, max_length=max_length_headlines)

In [None]:
max_length_texts = 500

# Replace each split of raw article texts by its matrix of token indices,
# using the vocabulary built from the training texts.
X_train_texts = transform_text(
    X_train_texts, token_to_index_texts, max_length=max_length_texts)
X_val_texts = transform_text(
    X_val_texts, token_to_index_texts, max_length=max_length_texts)
X_test_texts = transform_text(
    X_test_texts, token_to_index_texts, max_length=max_length_texts)

### Apartado g
Entrenar RNN con headlines (embeddings desde cero).

In [None]:
# Size of the embeddings learned from scratch.
vector_size = 128

# Embedding -> SimpleRNN(8) -> single sigmoid unit.
model = Sequential([
    Embedding(input_dim=vocab_size_headlines, output_dim=vector_size),
    SimpleRNN(8, recurrent_dropout=0.5),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.build(input_shape=(None, max_length_headlines))
model.summary()

In [None]:
batch_size = 32
epochs = 5

# Stop after 3 epochs without validation-loss improvement and restore the
# weights of the best epoch, so the test score below is not computed from a
# later, worse epoch.
earlyStopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=3,
    verbose=0,
    mode='auto',
    restore_best_weights=True,
)

history = model.fit(
    X_train_headlines,
    y_train_headlines,
    validation_data=(X_val_headlines, y_val_headlines),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[earlyStopping]
)

y_pred_headlines = model.predict(X_test_headlines)
# Flatten the predicted probabilities to 1-D so they match the label vector.
print(f"Accuracy test: {accuracy_score(y_test_headlines, y_pred_headlines.ravel() > 0.5)}")

### Apartado h
Entrenar RNN con headlines (inicialización con Word2Vec).

In [None]:
# Take the dimension from the loaded vectors instead of hard-coding 300, so
# the matrix always matches the Word2Vec file that was actually loaded.
embedding_matrix = build_embedding_matrix(w2v, token_to_index_headlines, w2v.vector_size)
embedding_matrix.shape

In [None]:
# Embedding initialised from embedding_matrix (still trainable)
# -> SimpleRNN(8) -> single sigmoid unit.
model = Sequential([
    Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        trainable=True,
    ),
    SimpleRNN(8, recurrent_dropout=0.5),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.build(input_shape=(None, max_length_headlines))
model.summary()

In [None]:
batch_size = 32
epochs = 5

# Stop after 3 epochs without validation-loss improvement and restore the
# weights of the best epoch, so the test score below is not computed from a
# later, worse epoch.
earlyStopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=3,
    verbose=0,
    mode='auto',
    restore_best_weights=True,
)

history = model.fit(
    X_train_headlines,
    y_train_headlines,
    validation_data=(X_val_headlines, y_val_headlines),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[earlyStopping]
)

y_pred_headlines = model.predict(X_test_headlines)
# Flatten the predicted probabilities to 1-D so they match the label vector.
print(f"Accuracy test: {accuracy_score(y_test_headlines, y_pred_headlines.ravel() > 0.5)}")

### Apartado i
Entrenar RNN con textos (embeddings desde cero).

In [None]:
# Size of the embeddings learned from scratch.
vector_size = 128

# Embedding -> SimpleRNN(8) -> single sigmoid unit.
model = Sequential([
    Embedding(input_dim=vocab_size_texts, output_dim=vector_size),
    SimpleRNN(8, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.build(input_shape=(None, max_length_texts))
model.summary()

In [None]:
batch_size = 32
epochs = 5

# Stop after 3 epochs without validation-loss improvement and restore the
# weights of the best epoch, so the test score below is not computed from a
# later, worse epoch.
earlyStopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=3,
    verbose=0,
    mode='auto',
    restore_best_weights=True,
)

history = model.fit(
    X_train_texts,
    y_train_texts,
    validation_data=(X_val_texts, y_val_texts),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[earlyStopping]
)

y_pred_texts = model.predict(X_test_texts)
# Flatten the predicted probabilities to 1-D so they match the label vector.
print(f"Accuracy test: {accuracy_score(y_test_texts, y_pred_texts.ravel() > 0.5)}")

### Apartado j
Entrenar RNN con textos (inicialización con Word2Vec).

In [None]:
# Take the dimension from the loaded vectors instead of hard-coding 300, so
# the matrix always matches the Word2Vec file that was actually loaded.
embedding_matrix = build_embedding_matrix(w2v, token_to_index_texts, w2v.vector_size)
embedding_matrix.shape

In [None]:
# Embedding initialised from embedding_matrix (still trainable)
# -> SimpleRNN(8) -> single sigmoid unit.
model = Sequential([
    Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        trainable=True,
    ),
    SimpleRNN(8, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.build(input_shape=(None, max_length_texts))
model.summary()

In [None]:
batch_size = 32
epochs = 5

# Stop after 3 epochs without validation-loss improvement and restore the
# weights of the best epoch, so the test score below is not computed from a
# later, worse epoch.
earlyStopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=3,
    verbose=0,
    mode='auto',
    restore_best_weights=True,
)

history = model.fit(
    X_train_texts,
    y_train_texts,
    validation_data=(X_val_texts, y_val_texts),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[earlyStopping]
)

y_pred_texts = model.predict(X_test_texts)
# Flatten the predicted probabilities to 1-D so they match the label vector.
print(f"Accuracy test: {accuracy_score(y_test_texts, y_pred_texts.ravel() > 0.5)}")

## Ejercicio 2 - LSTM
Entrenar LSTMs utilizando Keras de TensorFlow para detectar noticias falsas.

### Apartado a
Entrenar LSTM con headlines (embeddings desde cero).

In [None]:
# Size of the embeddings learned from scratch.
vector_size = 128

# Embedding -> LSTM(8) -> single sigmoid unit.
model = Sequential([
    Embedding(input_dim=vocab_size_headlines, output_dim=vector_size),
    LSTM(8, recurrent_dropout=0.5),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.build(input_shape=(None, max_length_headlines))
model.summary()

In [None]:
batch_size = 32
epochs = 5

# Stop after 3 epochs without validation-loss improvement and restore the
# weights of the best epoch, so the test score below is not computed from a
# later, worse epoch.
earlyStopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=3,
    verbose=0,
    mode='auto',
    restore_best_weights=True,
)

history = model.fit(
    X_train_headlines,
    y_train_headlines,
    validation_data=(X_val_headlines, y_val_headlines),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[earlyStopping]
)

y_pred_headlines = model.predict(X_test_headlines)
# Flatten the predicted probabilities to 1-D so they match the label vector.
print(f"Accuracy test: {accuracy_score(y_test_headlines, y_pred_headlines.ravel() > 0.5)}")

### Apartado b
Entrenar LSTM con headlines (inicialización con Word2Vec).

In [None]:
# Take the dimension from the loaded vectors instead of hard-coding 300, so
# the matrix always matches the Word2Vec file that was actually loaded.
embedding_matrix = build_embedding_matrix(w2v, token_to_index_headlines, w2v.vector_size)
embedding_matrix.shape

In [None]:
# Embedding initialised from embedding_matrix (still trainable)
# -> LSTM(8) -> single sigmoid unit.
model = Sequential([
    Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        trainable=True,
    ),
    LSTM(8, recurrent_dropout=0.5),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.build(input_shape=(None, max_length_headlines))
model.summary()

In [None]:
batch_size = 32
epochs = 5

# Stop after 3 epochs without validation-loss improvement and restore the
# weights of the best epoch, so the test score below is not computed from a
# later, worse epoch.
earlyStopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=3,
    verbose=0,
    mode='auto',
    restore_best_weights=True,
)

history = model.fit(
    X_train_headlines,
    y_train_headlines,
    validation_data=(X_val_headlines, y_val_headlines),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[earlyStopping]
)

y_pred_headlines = model.predict(X_test_headlines)
# Flatten the predicted probabilities to 1-D so they match the label vector.
print(f"Accuracy test: {accuracy_score(y_test_headlines, y_pred_headlines.ravel() > 0.5)}")

### Apartado c
Entrenar LSTM con textos (embeddings desde cero).

In [None]:
# Size of the embeddings learned from scratch.
vector_size = 128

# Embedding -> LSTM(8) -> single sigmoid unit.
model = Sequential([
    Embedding(input_dim=vocab_size_texts, output_dim=vector_size),
    LSTM(8, recurrent_dropout=0.5),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.build(input_shape=(None, max_length_texts))
model.summary()

In [None]:
batch_size = 32
epochs = 5

# Stop after 3 epochs without validation-loss improvement and restore the
# weights of the best epoch, so the test score below is not computed from a
# later, worse epoch.
earlyStopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=3,
    verbose=0,
    mode='auto',
    restore_best_weights=True,
)

history = model.fit(
    X_train_texts,
    y_train_texts,
    validation_data=(X_val_texts, y_val_texts),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[earlyStopping]
)

y_pred_texts = model.predict(X_test_texts)
# Flatten the predicted probabilities to 1-D so they match the label vector.
print(f"Accuracy test: {accuracy_score(y_test_texts, y_pred_texts.ravel() > 0.5)}")

### Apartado d
Entrenar LSTM con textos (inicialización con Word2Vec).

In [None]:
# Take the dimension from the loaded vectors instead of hard-coding 300, so
# the matrix always matches the Word2Vec file that was actually loaded.
embedding_matrix = build_embedding_matrix(w2v, token_to_index_texts, w2v.vector_size)
embedding_matrix.shape

In [None]:
# Embedding initialised from embedding_matrix (still trainable)
# -> LSTM(8) -> single sigmoid unit.
model = Sequential([
    Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        trainable=True,
    ),
    LSTM(8, recurrent_dropout=0.5),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.build(input_shape=(None, max_length_texts))
model.summary()

In [None]:
batch_size = 32
epochs = 5

# Stop after 3 epochs without validation-loss improvement and restore the
# weights of the best epoch, so the test score below is not computed from a
# later, worse epoch.
earlyStopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=3,
    verbose=0,
    mode='auto',
    restore_best_weights=True,
)

history = model.fit(
    X_train_texts,
    y_train_texts,
    validation_data=(X_val_texts, y_val_texts),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[earlyStopping]
)

y_pred_texts = model.predict(X_test_texts)
# Flatten the predicted probabilities to 1-D so they match the label vector.
print(f"Accuracy test: {accuracy_score(y_test_texts, y_pred_texts.ravel() > 0.5)}")

## Ejercicio 3 - Bi-LSTM
Entrenar redes neuronales recurrentes LSTM Bidireccionales.

### Apartado a
Entrenar Bi-LSTM con headlines (embeddings desde cero).

In [None]:
# Dropout rate applied after each bidirectional layer.
DROPOUT = 0.4

# Size of the embeddings learned from scratch.
vector_size = 128

# Embedding -> two stacked Bi-LSTM layers (each followed by dropout)
# -> two ReLU dense layers -> single sigmoid unit.
model = Sequential([
    Embedding(input_dim=vocab_size_headlines, output_dim=vector_size),
    Bidirectional(LSTM(60, return_sequences=True, recurrent_dropout=0.2)),
    Dropout(DROPOUT),
    Bidirectional(LSTM(32, recurrent_dropout=0.2)),
    Dropout(DROPOUT),
    Dense(60, activation='relu'),
    Dense(50, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.build(input_shape=(None, max_length_headlines))
model.summary()

In [None]:
batch_size = 32
epochs = 5

# Stop after 3 epochs without validation-loss improvement and restore the
# weights of the best epoch, so the test score below is not computed from a
# later, worse epoch.
earlyStopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=3,
    verbose=0,
    mode='auto',
    restore_best_weights=True,
)

history = model.fit(
    X_train_headlines,
    y_train_headlines,
    validation_data=(X_val_headlines, y_val_headlines),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[earlyStopping]
)

y_pred_headlines = model.predict(X_test_headlines)
# Flatten the predicted probabilities to 1-D so they match the label vector.
print(f"Accuracy test: {accuracy_score(y_test_headlines, y_pred_headlines.ravel() > 0.5)}")

### Apartado b
Entrenar Bi-LSTM con headlines (inicialización con Word2Vec).

In [None]:
# Take the dimension from the loaded vectors instead of hard-coding 300, so
# the matrix always matches the Word2Vec file that was actually loaded.
embedding_matrix = build_embedding_matrix(w2v, token_to_index_headlines, w2v.vector_size)

In [None]:
# Dropout rate applied after each bidirectional layer.
DROPOUT = 0.4

# Embedding initialised from embedding_matrix (still trainable)
# -> two stacked Bi-LSTM layers (each followed by dropout)
# -> two ReLU dense layers -> single sigmoid unit.
model = Sequential([
    Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        trainable=True,
    ),
    Bidirectional(LSTM(60, return_sequences=True, recurrent_dropout=0.2)),
    Dropout(DROPOUT),
    Bidirectional(LSTM(32, recurrent_dropout=0.2)),
    Dropout(DROPOUT),
    Dense(60, activation='relu'),
    Dense(50, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.build(input_shape=(None, max_length_headlines))
model.summary()

In [None]:
batch_size = 32
epochs = 5

# Stop after 3 epochs without validation-loss improvement and restore the
# weights of the best epoch, so the test score below is not computed from a
# later, worse epoch.
earlyStopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=3,
    verbose=0,
    mode='auto',
    restore_best_weights=True,
)

history = model.fit(
    X_train_headlines,
    y_train_headlines,
    validation_data=(X_val_headlines, y_val_headlines),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[earlyStopping]
)

y_pred_headlines = model.predict(X_test_headlines)
# Flatten the predicted probabilities to 1-D so they match the label vector.
print(f"Accuracy test: {accuracy_score(y_test_headlines, y_pred_headlines.ravel() > 0.5)}")

### Apartado c
Entrenar Bi-LSTM con textos (embeddings desde cero).

In [None]:
# Dropout rate applied after each bidirectional layer.
DROPOUT = 0.4

# Size of the embeddings learned from scratch.
vector_size = 128

# Embedding -> two stacked Bi-LSTM layers (each followed by dropout)
# -> two ReLU dense layers -> single sigmoid unit.
model = Sequential([
    Embedding(input_dim=vocab_size_texts, output_dim=vector_size),
    Bidirectional(LSTM(60, return_sequences=True, recurrent_dropout=0.2)),
    Dropout(DROPOUT),
    Bidirectional(LSTM(32, recurrent_dropout=0.2)),
    Dropout(DROPOUT),
    Dense(60, activation='relu'),
    Dense(50, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.build(input_shape=(None, max_length_texts))
model.summary()

In [None]:
batch_size = 32
epochs = 5

# Stop after 3 epochs without validation-loss improvement and restore the
# weights of the best epoch, so the test score below is not computed from a
# later, worse epoch.
earlyStopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=3,
    verbose=0,
    mode='auto',
    restore_best_weights=True,
)

history = model.fit(
    X_train_texts,
    y_train_texts,
    validation_data=(X_val_texts, y_val_texts),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[earlyStopping]
)

y_pred_texts = model.predict(X_test_texts)
# Flatten the predicted probabilities to 1-D so they match the label vector.
print(f"Accuracy test: {accuracy_score(y_test_texts, y_pred_texts.ravel() > 0.5)}")

### Apartado d
Entrenar Bi-LSTM con textos (inicialización con Word2Vec).

In [None]:
# Take the dimension from the loaded vectors instead of hard-coding 300, so
# the matrix always matches the Word2Vec file that was actually loaded.
embedding_matrix = build_embedding_matrix(w2v, token_to_index_texts, w2v.vector_size)

In [None]:
# Dropout rate applied after each bidirectional layer.
DROPOUT = 0.4

# Embedding initialised from embedding_matrix (still trainable)
# -> two stacked Bi-LSTM layers (each followed by dropout)
# -> two ReLU dense layers -> single sigmoid unit.
model = Sequential([
    Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        trainable=True,
    ),
    Bidirectional(LSTM(60, return_sequences=True, recurrent_dropout=0.2)),
    Dropout(DROPOUT),
    Bidirectional(LSTM(32, recurrent_dropout=0.2)),
    Dropout(DROPOUT),
    Dense(60, activation='relu'),
    Dense(50, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.build(input_shape=(None, max_length_texts))
model.summary()

In [None]:
batch_size = 32
epochs = 5

# Stop after 3 epochs without validation-loss improvement and restore the
# weights of the best epoch, so the test score below is not computed from a
# later, worse epoch.
earlyStopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=3,
    verbose=0,
    mode='auto',
    restore_best_weights=True,
)

history = model.fit(
    X_train_texts,
    y_train_texts,
    validation_data=(X_val_texts, y_val_texts),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[earlyStopping]
)

y_pred_texts = model.predict(X_test_texts)
# Flatten the predicted probabilities to 1-D so they match the label vector.
print(f"Accuracy test: {accuracy_score(y_test_texts, y_pred_texts.ravel() > 0.5)}")