# Tema 4: Redes neuronales para clasificación de texto

In [None]:
from pathlib import Path

# Directory expected to hold pretrained model files: the 'models' folder that
# sits beside the current working directory (i.e. <cwd>/../models).
PATH_MODELS = Path.cwd().parent / 'models'

## Ejercicio 1
Entrenar un modelo simple de red neuronal (MLP) utilizando Keras de TensorFlow.

### Apartado a
Carga de datos.

In [None]:
# Toy training corpus kept as (sentence, label) pairs so each text stays next
# to its class. Label convention: 1 = positive, 0 = negative.
labelled_examples = [
    ('Estoy un poco harto del día a día , nada mejora', 0),
    ('Hoy es un buen día', 1),
    ('No se te ve satisfecho con el trabajo', 0),
    ('Este paisaje es hermoso y bonito', 1),
]

# Split the pairs back into the two parallel lists used by the rest of the notebook.
sentences = [text for text, _ in labelled_examples]
labels = [label for _, label in labelled_examples]

### Apartado b
Funciones de normalización.

In [None]:
def token_filtered(token):
    """Decide whether a token survives the normalization filter.

    Args:
        token: Object exposing the boolean attributes ``is_punct``,
            ``is_space`` and ``is_stop`` plus a string attribute ``text``
            (e.g. a spaCy token).

    Returns:
        bool: ``True`` when the token should be kept, i.e. it is not
        punctuation, not whitespace, not a stop word and its text has at
        least 4 characters; ``False`` otherwise.
    """
    # Use boolean `or` rather than bitwise `|`: `|` binds tighter than `<`, so
    # `a | b | c | len(text) < 4` is parsed as `(a | b | c | len(text)) < 4`,
    # which silently ignores the flags for any token of 4+ characters.
    is_noise = token.is_punct or token.is_space or token.is_stop
    return not (is_noise or len(token.text) < 4)

def spacy_processing(doc, filtering, lematization):
    """Render a tokenized document as a single space-separated string.

    Args:
        doc: Iterable of tokens exposing ``text`` and ``lemma_`` attributes.
        filtering: When truthy, only tokens accepted by ``token_filtered``
            are emitted.
        lematization: When truthy, each token contributes its ``lemma_``;
            otherwise its ``text``.

    Returns:
        str: The selected token strings joined with single spaces.
    """
    # Pick the token stream and the attribute once, instead of branching on
    # every filtering/lemmatization combination.
    kept = (tok for tok in doc if token_filtered(tok)) if filtering else doc
    attribute = "lemma_" if lematization else "text"
    return " ".join(getattr(tok, attribute) for tok in kept)

### Apartado c
Preparar datos de entrenamiento.

In [None]:
import spacy
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the spaCy pipeline 'es_core_news_sm' once; the helper functions below
# call `nlp(...)` to tokenize every sentence.
nlp = spacy.load('es_core_news_sm')

def prepare_vocabulary(corpus, vocab_size, normalize):
    """Build a token -> integer index mapping from a corpus.

    Indices are handed out in order of first appearance, starting at 1
    (0 is never assigned), and only while the next index is strictly below
    ``vocab_size``; later unseen tokens are ignored.

    Args:
        corpus: Iterable of raw sentences (strings).
        vocab_size: Exclusive upper bound for the assigned indices.
        normalize: When truthy, each sentence is first reduced with
            ``spacy_processing(doc, True, True)`` and re-tokenized.

    Returns:
        dict: Mapping from token text to its integer index.
    """
    index_by_token = {}

    for text in corpus:
        parsed = nlp(text)
        if normalize:
            # Filter + lemmatize, then tokenize the normalized string again.
            parsed = nlp(spacy_processing(parsed, True, True))
        for tok in parsed:
            word = tok.text
            # The next free index is always one past the current vocabulary size.
            next_index = len(index_by_token) + 1
            if next_index < vocab_size and word not in index_by_token:
                index_by_token[word] = next_index

    return index_by_token


def prepare_sentences(corpus, vocabulary, max_length, normalize):
    """Encode sentences as fixed-length integer sequences.

    Each token is replaced by its index in ``vocabulary``; tokens missing
    from the vocabulary are encoded as 0. The sequences are then passed to
    ``pad_sequences`` with ``maxlen=max_length`` and post padding/truncation.
    The original corpus and the encoded result are printed.

    Args:
        corpus: Iterable of raw sentences (strings); must support ``len``.
        vocabulary: Mapping from token text to integer index.
        max_length: Target length passed to ``pad_sequences``.
        normalize: When truthy, each sentence is first reduced with
            ``spacy_processing(doc, True, True)`` and re-tokenized.

    Returns:
        The value returned by ``pad_sequences`` for the encoded sentences.
    """
    rows = []

    for text in corpus:
        parsed = nlp(text)
        if normalize:
            # Filter + lemmatize, then tokenize the normalized string again.
            parsed = nlp(spacy_processing(parsed, True, True))
        # Unknown tokens fall back to index 0.
        rows.append([vocabulary.get(tok.text, 0) for tok in parsed])

    padded = pad_sequences(rows, maxlen=max_length, padding='post', truncating='post')

    print("Oraciones originales(", len(corpus), "):")
    print(corpus)
    print("Oraciones procesadas(", len(padded), "):")
    print(padded)
    return padded


# Hyperparameters for the toy example; both are reused by later cells.
vocab_size = 50  # exclusive upper bound for token indices (also the Embedding input_dim below)
max_length = 10  # every encoded sentence is padded/truncated to this many tokens

# Build the vocabulary from the normalized (filtered + lemmatized) sentences
# and show it, then encode the same sentences with that vocabulary.
vocabulary_train = prepare_vocabulary(sentences, vocab_size, True)
print("\nVocabulario (", len(vocabulary_train), "):")
print(vocabulary_train)
prepared_sentences = prepare_sentences(sentences, vocabulary_train, max_length, True)

### Apartado d
Configurar el modelo MLP.

In [None]:
import tensorflow as tf
# Show the installed TensorFlow version as the cell output.
tf.__version__

In [None]:
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding

# Minimal classifier: embedding lookup -> flatten -> single sigmoid unit.
model = Sequential()

# Dimension of each learned token vector.
vector_size = 8
# One vector per token index in [0, vocab_size).
model.add(Embedding(vocab_size, vector_size))
# Concatenate the per-token vectors of a sentence into one flat feature vector.
model.add(Flatten())
# Single sigmoid output, matching the binary labels (1 = positive, 0 = negative).
model.add(Dense(1, activation='sigmoid'))

print("Red diseñada correctamente")

### Apartado e
Compilar y entrenar el modelo.

In [None]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Build explicitly with the padded sentence length so the summary can report
# concrete output shapes and parameter counts.
model.build(input_shape=(None, max_length))
model.summary()

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Convert inputs and labels to NumPy arrays before splitting.
prepared_sentences = np.array(prepared_sentences)
labels = np.array(labels)

# Hold out 20% of the examples; random_state fixes the split.
# NOTE(review): with only four sentences the held-out set is tiny, so the
# validation metrics are not meaningful — this cell is a smoke test.
X_train, X_test, y_train, y_test = train_test_split(prepared_sentences, labels, test_size=0.2, random_state=42)

batch_size = 32
epochs = 5
# The held-out split is used directly as validation data during training.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=batch_size, epochs=epochs)

### Apartado f
Evaluar el modelo con nuevas frases.

In [None]:
# Unseen sentences used to probe the trained model.
test_sentences = [
    "No fui al estreno de la película porque nadie me quería acompañar",
    "Envidio de buena manera a los que tienen la oportunidad de ir mañana al estadio",
    "Se nos está volviendo costumbre del domingo por la noche, ver el episodio anterior de SNL y eso me hace recibir el lunes con mejor humor",
    "Al final decidí no ir al cine porque estaba cansada",
    "Todo es maravilloso y formidable, muy bonito"
]

# Show the training vocabulary again for reference: tokens absent from it
# are encoded as 0 by prepare_sentences.
print("\nVocabulario (", len(vocabulary_train), "):")
print(vocabulary_train)

# Encode the new sentences with the training vocabulary and the same normalization.
prepared_test = prepare_sentences(test_sentences, vocabulary_train, max_length, True)

predictions = model.predict(prepared_test)

print("Predicciones detalladas:")
for i, sentence in enumerate(test_sentences):
    # Each prediction row holds the single sigmoid output of the model.
    pred = predictions[i][0]
    # Scores above 0.5 are reported as positive, everything else as negative.
    sentiment = "Positivo" if pred > 0.5 else "Negativo"
    print(f"\nTexto: {sentence}")
    print(f"Predicción numérica: {pred:.4f}")
    print(f"Sentimiento predicho: {sentiment}")

## Ejercicio 2
Entrenar una CNN con capas convolucionales para clasificación de sentimiento en tweets.

### Apartado a
Cargar dataset de tweets multilingüe y preparar datos.

In [None]:
from datasets import load_dataset

# Load the training split of the Spanish configuration of the dataset.
dataset = load_dataset("cardiffnlp/tweet_sentiment_multilingual", "spanish", split='train')

print(dataset.column_names)

# Keep only the examples labelled 0 or 2, turning the task into a binary one.
# NOTE(review): presumably 0 = negative, 1 = neutral, 2 = positive — confirm
# against the dataset card.
filtered_data = dataset.filter(lambda x: x['label'] in [0, 2])

# Remap to the notebook's convention: original label 2 -> 1, original label 0 -> 0.
labels = [1 if label == 2 else 0 for label in filtered_data['label']]
texts = filtered_data['text']
# These rebind the globals set for the toy example in Ejercicio 1.
vocab_size = 5000
max_length = 100

print(f"Tamaño total del dataset: {len(texts)}")
print(f"Distribución de etiquetas: Positivos={labels.count(1)}, Negativos={labels.count(0)}")

# Build the vocabulary and encode the tweets without normalization (raw token text).
# NOTE(review): both the vocabulary and the corpus are printed in full here,
# which can produce very long cell output.
vocabulary_train = prepare_vocabulary(texts, vocab_size, False)
print("\nVocabulario (", len(vocabulary_train), "):")
print(vocabulary_train)
prepared_sentences = prepare_sentences(texts, vocabulary_train, max_length, False)

### Apartado b
Configurar modelo CNN con capas convolucionales.

In [None]:
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding, Conv1D, MaxPooling1D

# CNN text classifier: embedding -> three Conv1D/MaxPooling1D stages -> dense head.
model = Sequential()

embedding_dim = 128
# `input_length` is intentionally not passed: it is deprecated in recent Keras
# releases, and the sequence length is already fixed by the explicit
# `model.build(input_shape=(None, max_length))` call below. This also matches
# how the other Embedding layers in this notebook are declared.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))

# Each stage uses 'same' padding so only the pooling halves the sequence length,
# while the number of filters and the kernel size shrink stage by stage.
model.add(Conv1D(filters=128, kernel_size=4, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))

model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))

model.add(Conv1D(filters=32, kernel_size=2, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))

# Dense head ending in a single sigmoid unit for the binary label.
model.add(Flatten())
model.add(Dense(50, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Build with the padded tweet length so Flatten/Dense get concrete shapes
# and the summary can report parameter counts.
model.build(input_shape=(None, max_length))
model.summary()

print("Red diseñada correctamente")

### Apartado c
Entrenar el modelo.

In [None]:
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping
import numpy as np

# Convert inputs and labels to NumPy arrays before splitting.
prepared_sentences = np.array(prepared_sentences)
labels = np.array(labels)

# Hold out 20% of the data as a test set; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(prepared_sentences, labels, test_size=0.2, random_state=42)

# Stop training once 'val_loss' has not improved for 3 consecutive epochs.
earlyStopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')

epochs = 10
batch_size = 16

# validation_split carves the validation data out of X_train/y_train, so
# X_test/y_test are not used during training.
history = model.fit(X_train, y_train,
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_split=0.2,
                    callbacks=[earlyStopping])

### Apartado d
Curva de aprendizaje.

In [None]:
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by the last `model.fit` call.
history_dict = history.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
accuracy = history_dict['accuracy']
val_accuracy = history_dict['val_accuracy']

# X axis: 1-based epoch numbers actually run (early stopping may cut training
# short). A dedicated name is used so the `epochs` training hyperparameter is
# not rebound to a range object by this cell.
epoch_axis = range(1, len(loss_values) + 1)
fig, ax = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: accuracy on training vs. validation data.
ax[0].plot(epoch_axis, accuracy, 'b', label='Training accuracy')
ax[0].plot(epoch_axis, val_accuracy, 'red', label='Validation accuracy')
ax[0].set_title('Training & Validation Accuracy', fontsize=16)
ax[0].set_xlabel('Epochs', fontsize=16)
ax[0].set_ylabel('Accuracy', fontsize=16)
ax[0].legend()

# Right panel: loss on training vs. validation data.
ax[1].plot(epoch_axis, loss_values, 'b', label='Training loss')
ax[1].plot(epoch_axis, val_loss_values, 'red', label='Validation loss')
ax[1].set_title('Training & Validation Loss', fontsize=16)
ax[1].set_xlabel('Epochs', fontsize=16)
ax[1].set_ylabel('Loss', fontsize=16)
# Trailing semicolon keeps the Legend repr out of the cell output.
ax[1].legend();

### Apartado e
Evaluar el modelo en test.

In [None]:
# Final loss and accuracy on the held-out test split, which was not passed to
# `model.fit`. Note this rebinds `accuracy` from the per-epoch list used in the
# plotting cell to a single value.
loss, accuracy = model.evaluate(X_test, y_test, batch_size=batch_size)

## Ejercicio 3
Introducir word embeddings preentrenados (Word2Vec).

### Apartado a
Cargar los vectores Word2Vec preentrenados con gensim:

```python
from gensim.models import KeyedVectors

w2v = KeyedVectors.load_word2vec_format(str(PATH_MODELS / 'SBW-vectors-300-min5.txt'), binary=False)
```

In [None]:
from pathlib import Path
from gensim.models import KeyedVectors

# Same location as the PATH_MODELS defined at the top of the notebook;
# repeated here so this cell can run on its own.
PATH_MODELS = Path.cwd().parent / 'models'

# Load the pretrained vectors from a text-format word2vec file (binary=False).
# NOTE(review): the file name suggests 300-dimensional vectors — confirm; the
# embedding-matrix cell hard-codes 300.
w2v = KeyedVectors.load_word2vec_format(str(PATH_MODELS / 'SBW-vectors-300-min5.txt'), binary=False)

### Apartado b
Calcular matriz de embeddings para todo el vocabulario.

In [None]:
import numpy as np

def build_embedding_matrix(w2v_model, vocab, emb_dim, seed=None):
    """Build an embedding matrix aligned with a token -> index vocabulary.

    Row ``idx`` holds the pretrained vector of the token mapped to ``idx``
    when the token is present in ``w2v_model``; otherwise it is filled with
    values drawn from a normal distribution with standard deviation 0.6.
    Rows that no token maps to (e.g. row 0) stay all-zero.

    Args:
        w2v_model: Object supporting ``token in w2v_model`` and
            ``w2v_model[token]`` (returning a vector of length ``emb_dim``).
        vocab: Mapping from token text to a non-negative integer row index.
        emb_dim: Dimension of each embedding vector.
        seed: Optional seed for the random rows of out-of-vocabulary tokens.
            When ``None`` (default) NumPy's global random state is used,
            as before.

    Returns:
        numpy.ndarray: Float matrix with ``emb_dim`` columns and at least
        ``len(vocab) + 1`` rows.
    """
    # Size by the largest index as well as by the vocabulary length, so a
    # vocabulary with gaps in its indices cannot index past the last row.
    # For contiguous 1-based vocabularies this is exactly len(vocab) + 1.
    highest_index = max(vocab.values(), default=0)
    n_rows = max(len(vocab), highest_index) + 1
    embedding_matrix = np.zeros((n_rows, emb_dim))

    # A private generator makes the random rows reproducible when a seed is given.
    rng = np.random if seed is None else np.random.RandomState(seed)

    for token, idx in vocab.items():
        if token in w2v_model:
            embedding_matrix[idx] = w2v_model[token]
        else:
            # Out-of-vocabulary token: random initialization instead of zeros.
            embedding_matrix[idx] = rng.normal(scale=0.6, size=(emb_dim,))
    return embedding_matrix

# Build the matrix for the current `vocabulary_train` with 300-dimensional rows.
# NOTE(review): 300 must match the dimension of the loaded w2v vectors — confirm.
embedding_matrix = build_embedding_matrix(w2v, vocabulary_train, 300)
# Display the resulting (rows, columns) shape as the cell output.
embedding_matrix.shape

### Apartado c
Entrenar con la matriz de embedding preentrenada.

In [None]:
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding, Conv1D, MaxPooling1D

# Same CNN layout as in Ejercicio 2, but the embedding layer starts from the
# pretrained matrix instead of a random initialization.
model = Sequential()

# Layer dimensions are taken from the matrix itself so they always agree with
# the initial weights; trainable=True lets training update the vectors.
model.add(Embedding(
    input_dim=embedding_matrix.shape[0],
    output_dim=embedding_matrix.shape[1],
    weights=[embedding_matrix],
    trainable=True
))

# Three convolution + pooling stages with 'same' padding.
model.add(Conv1D(filters=128, kernel_size=4, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))

model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))

model.add(Conv1D(filters=32, kernel_size=2, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))

# Dense head ending in a single sigmoid unit for the binary label.
model.add(Flatten())
model.add(Dense(50, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Build with the padded sequence length so the summary shows concrete shapes.
model.build(input_shape=(None, max_length))
model.summary()

print("Red diseñada correctamente")

In [None]:
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping
import numpy as np

# Convert inputs and labels to NumPy arrays before splitting.
prepared_sentences = np.array(prepared_sentences)
labels = np.array(labels)

# Same split parameters (test_size, random_state) as the Ejercicio 2 training cell.
X_train, X_test, y_train, y_test = train_test_split(prepared_sentences, labels, test_size=0.2, random_state=42)

# Stop training once 'val_loss' has not improved for 3 consecutive epochs.
earlyStopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')

epochs = 10
batch_size = 16

# validation_split carves the validation data out of X_train/y_train, so
# X_test/y_test are not used during training.
history = model.fit(X_train, y_train,
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_split=0.2,
                    callbacks=[earlyStopping])