# Laboratorio 4: NLP
### Data Science - Sección 20
Pablo Andrés Zamora Vásquez - 21780 <br>
Diego Andrés Morales Aquino - 21762

In [1]:
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Concatenate
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from sklearn.preprocessing import StandardScaler

## Importación de datos

In [2]:

max_features = 50000  # Usar las 50,000 palabras más frecuentes
maxlen = 200  # Longitud máxima de secuencias (rellenaremos a esta longitud)

# Cargar los datos del dataset IMDB
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features)

# Secuenciar y rellenar las reseñas para tener una longitud uniforme
X_train = pad_sequences(X_train, maxlen=maxlen)
X_test = pad_sequences(X_test, maxlen=maxlen)

print(f"Train data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Train data shape: (25000, 200)
Test data shape: (25000, 200)


## Preprocesamiento

In [3]:
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

id_to_word = {index: word for word, index in imdb.get_word_index().items()}

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [4]:
# Función para calcular la longitud de cada reseña
def get_review_lengths(reviews):
    return np.array([len(review) for review in reviews])

# Función para calcular la proporción de palabras positivas y negativas en cada
# reseña usando el analizador VADER.
def get_sentiment_ratios(reviews):
    positive_ratios = []
    negative_ratios = []

    for review in reviews:
        pos_count = 0
        neg_count = 0
        total_words = len(review)

        # Evaluar cada palabra con VADER
        for word_index in review:
            word = id_to_word.get(word_index, None)  # Convertir índice a palabra
            if word == None:
              continue

            sentiment = sid.polarity_scores(word)
            if sentiment['compound'] > 0:
                pos_count += 1
            elif sentiment['compound'] < 0:
                neg_count += 1

        if total_words == 0:
            positive_ratios.append(0)
            negative_ratios.append(0)
        else:
            # Proporción de palabras positivas y negativas
            positive_ratios.append(pos_count / total_words)
            negative_ratios.append(neg_count / total_words)

    return np.array(positive_ratios), np.array(negative_ratios)

def get_lexical_diversity(reviews):
    return np.array([len(set(review)) / len(review) if len(review) > 0 else 0 for review in reviews])

### Características adicionales

In [5]:
train_positive_ratios, train_negative_ratios = get_sentiment_ratios(X_train)
test_positive_ratios, test_negative_ratios = get_sentiment_ratios(X_test)

In [6]:
train_review_lengths = get_review_lengths(X_train)
test_review_lengths = get_review_lengths(X_test)

In [7]:
train_lexical_diversity = get_lexical_diversity(X_train)
test_lexical_diversity = get_lexical_diversity(X_test)

## Modelo

In [8]:
# Parámetros del modelo
embedding_size = 128
lstm_units = 128
dropout_rate = 0.5

# Entrada para las secuencias de palabras (reseñas)
input_seq = Input(shape=(maxlen,), name="input_sequence")
embedding_layer = Embedding(input_dim=max_features, output_dim=embedding_size, input_length=maxlen)(input_seq)

# Capas LSTM
lstm_layer = LSTM(lstm_units, return_sequences=True)(embedding_layer)
lstm_layer = LSTM(lstm_units)(lstm_layer)
lstm_layer = Dropout(dropout_rate)(lstm_layer)

# Entrada para las características adicionales
input_features = Input(shape=(4,), name="input_features")

# Concatenar la salida LSTM con las características adicionales
concatenated = Concatenate()([lstm_layer, input_features])

# Añadir capas densamente conectadas
dense_layer = Dense(64, activation="relu")(concatenated)
dense_layer = Dropout(dropout_rate)(dense_layer)
output_layer = Dense(1, activation="sigmoid")(dense_layer)  # Clasificación binaria (positivo/negativo)

# Definir el modelo
model = Model(inputs=[input_seq, input_features], outputs=output_layer)

# Compilar el modelo
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

model.summary()




## Entrenamiento

In [9]:
NUM_EPOCHS = 15

# Concatenar las características adicionales en un solo array
train_additional_features = np.column_stack((train_positive_ratios, train_negative_ratios, train_review_lengths, train_lexical_diversity))
test_additional_features = np.column_stack((test_positive_ratios, test_negative_ratios, test_review_lengths, test_lexical_diversity))

# Normalizar las características adicionales
scaler = StandardScaler()
train_additional_features = scaler.fit_transform(train_additional_features)
test_additional_features = scaler.transform(test_additional_features)

# Entrenamiento del modelo
history = model.fit(
    [X_train, train_additional_features],
    y_train,
    epochs=NUM_EPOCHS,
    batch_size=64,
    validation_data=([X_test, test_additional_features], y_test)
)


Epoch 1/15
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 34ms/step - accuracy: 0.6420 - loss: 0.6101 - val_accuracy: 0.8398 - val_loss: 0.3682
Epoch 2/15
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 32ms/step - accuracy: 0.9017 - loss: 0.2584 - val_accuracy: 0.8283 - val_loss: 0.4866
Epoch 3/15
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 32ms/step - accuracy: 0.9539 - loss: 0.1370 - val_accuracy: 0.8432 - val_loss: 0.3985
Epoch 4/15
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 32ms/step - accuracy: 0.9730 - loss: 0.0844 - val_accuracy: 0.8497 - val_loss: 0.4374
Epoch 5/15
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 38ms/step - accuracy: 0.9869 - loss: 0.0443 - val_accuracy: 0.8510 - val_loss: 0.6013
Epoch 6/15
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 32ms/step - accuracy: 0.9899 - loss: 0.0332 - val_accuracy: 0.8484 - val_loss: 0.7830
Epoch 7/15
[1m3

## Evaluación

In [10]:
# Evaluación del modelo en el conjunto de prueba
test_loss, test_accuracy = model.evaluate(
    [X_test, test_additional_features],
    y_test,
    verbose=2
)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")



# Comparación con un modelo simple

# Modelo simple LSTM sin características adicionales
simple_input_seq = Input(shape=(maxlen,))
simple_embedding_layer = Embedding(input_dim=max_features, output_dim=embedding_size, input_length=maxlen)(simple_input_seq)
simple_lstm_layer = LSTM(lstm_units)(simple_embedding_layer)
simple_lstm_layer = Dropout(dropout_rate)(simple_lstm_layer)
simple_output_layer = Dense(1, activation="sigmoid")(simple_lstm_layer)

simple_model = Model(inputs=simple_input_seq, outputs=simple_output_layer)
simple_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Entrenamiento del modelo simple
simple_history = simple_model.fit(
    X_train, y_train,  # Solo secuencias de palabras
    epochs=NUM_EPOCHS,
    batch_size=64,
    validation_data=(X_test, y_test)
)

# Evaluación del modelo simple
simple_test_loss, simple_test_accuracy = simple_model.evaluate(X_test, y_test, verbose=2)

print(f"Simple Model Test Loss: {simple_test_loss}")
print(f"Simple Model Test Accuracy: {simple_test_accuracy}")


# Comparación de la precisión y la pérdida
print(f"Modelo con características adicionales - Precisión: {test_accuracy}, Pérdida: {test_loss}")
print(f"Modelo simple - Precisión: {simple_test_accuracy}, Pérdida: {simple_test_loss}")


782/782 - 5s - 7ms/step - accuracy: 0.8476 - loss: 0.8467
Test Loss: 0.8466650247573853
Test Accuracy: 0.8476399779319763
Epoch 1/15




[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 20ms/step - accuracy: 0.7219 - loss: 0.5204 - val_accuracy: 0.8488 - val_loss: 0.3637
Epoch 2/15
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 21ms/step - accuracy: 0.9221 - loss: 0.2179 - val_accuracy: 0.8694 - val_loss: 0.3348
Epoch 3/15
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 21ms/step - accuracy: 0.9590 - loss: 0.1218 - val_accuracy: 0.8476 - val_loss: 0.4649
Epoch 4/15
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 20ms/step - accuracy: 0.9699 - loss: 0.0901 - val_accuracy: 0.8544 - val_loss: 0.4050
Epoch 5/15
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 20ms/step - accuracy: 0.9783 - loss: 0.0672 - val_accuracy: 0.8424 - val_loss: 0.5103
Epoch 6/15
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 21ms/step - accuracy: 0.9888 - loss: 0.0367 - val_accuracy: 0.8539 - val_loss: 0.6094
Epoch 7/15
[1m391/391[0m [32m