# Final Assignment

In [6]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gensim
from sklearn.model_selection import train_test_split

In [7]:
df = pd.read_csv('b2w-10k.csv')
df3 = df[pd.to_numeric(df['overall_rating'], errors='coerce').between(0, 5, inclusive='both')]
df3 = df3[['review_text', 'overall_rating']]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df3['review_text'],
                                                    df3['overall_rating'],
                                                    test_size=0.25, random_state=42)

# Tokenizar os textos
X_train_tokenized = [text.split() for text in X_train]
X_test_tokenized = [text.split() for text in X_test]

# Passo 2: Treinar o modelo Word2Vec no conjunto de treino
w2v_model = gensim.models.Word2Vec(sentences=X_train_tokenized, vector_size=100, window=5, min_count=1, workers=4)

# Passo 3: Criar função para transformar o texto em vetores utilizando o modelo Word2Vec
def text_to_vector(text, model, max_len):
    vectorized_text = []
    for word in text:
        if word in model.wv:
            vectorized_text.append(model.wv[word])
        else:
            vectorized_text.append(np.zeros(model.vector_size))  # Se a palavra não estiver no vocabulário
    # Padding/truncamento para garantir que o comprimento seja fixo (max_len)
    if len(vectorized_text) < max_len:
        vectorized_text.extend([np.zeros(model.vector_size)] * (max_len - len(vectorized_text)))
    else:
        vectorized_text = vectorized_text[:max_len]
    return np.array(vectorized_text)

# Passo 4: Aplicar a transformação para todo o conjunto de treino e teste
max_len = 50  # Definido anteriormente

X_train_vectorized = np.array([text_to_vector(text, w2v_model, max_len) for text in X_train_tokenized])
X_test_vectorized = np.array([text_to_vector(text, w2v_model, max_len) for text in X_test_tokenized])

# Verificar os shapes resultantes
print(f"Tamanho do conjunto de treino vectorizado: {X_train_vectorized.shape}")
print(f"Tamanho do conjunto de teste vectorizado: {X_test_vectorized.shape}")

Tamanho do conjunto de treino vectorizado: (7499, 50, 100)
Tamanho do conjunto de teste vectorizado: (2500, 50, 100)


Uni-direcional LSTM

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

max_features = 15000  # Número máximo de palavras no vocabulário
max_len = 50  # Comprimento máximo da sequência (ajustado com base nos dados)

model_lstm_uni = Sequential([
    Embedding(input_dim=max_features, output_dim=100, input_length=max_len),
    LSTM(64),  # Unidirecional
    Dense(1, activation='sigmoid')
])

model_lstm_uni.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



Bi-direcional LSTM

In [10]:
from tensorflow.keras.layers import Bidirectional

model_lstm_bi = Sequential([
    Embedding(input_dim=max_features, output_dim=100, input_length=max_len),
    Bidirectional(LSTM(64)),
    Dense(1, activation='sigmoid')
])

model_lstm_bi.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

Uni-direcional GRU

In [11]:
from tensorflow.keras.layers import GRU

model_gru_uni = Sequential([
    Embedding(input_dim=max_features, output_dim=100, input_length=max_len),
    GRU(64),  # Unidirecional
    Dense(1, activation='sigmoid')
])

model_gru_uni.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


Bi-direcional GRU

In [12]:
model_gru_bi = Sequential([
    Embedding(input_dim=max_features, output_dim=100, input_length=max_len),
    Bidirectional(GRU(64)),
    Dense(1, activation='sigmoid')
])

model_gru_bi.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

Treino

In [13]:
# Função para treinar e avaliar os modelos
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, epochs=5, batch_size=32):
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), verbose=2)
    test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
    return test_acc

# Definir os modelos sem a camada de Embedding
def create_lstm_uni_model():
    model = tf.keras.Sequential([
        tf.keras.layers.LSTM(64, input_shape=(max_len, 100)),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    return model

def create_lstm_bi_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64), input_shape=(max_len, 100)),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    return model

def create_gru_uni_model():
    model = tf.keras.Sequential([
        tf.keras.layers.GRU(64, input_shape=(max_len, 100)),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    return model

def create_gru_bi_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64), input_shape=(max_len, 100)),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    return model

# Treinando os quatro modelos
results = {}

# LSTM Unidirecional
model_lstm_uni = create_lstm_uni_model()
accuracy_lstm_uni = train_and_evaluate_model(model_lstm_uni, X_train_vectorized, y_train, X_test_vectorized, y_test, epochs=10)
results['LSTM uni-direcional'] = accuracy_lstm_uni

# LSTM Bidirecional
model_lstm_bi = create_lstm_bi_model()
accuracy_lstm_bi = train_and_evaluate_model(model_lstm_bi, X_train_vectorized, y_train, X_test_vectorized, y_test, epochs=10)
results['LSTM bi-direcional'] = accuracy_lstm_bi

# GRU Unidirecional
model_gru_uni = create_gru_uni_model()
accuracy_gru_uni = train_and_evaluate_model(model_gru_uni, X_train_vectorized, y_train, X_test_vectorized, y_test, epochs=10)
results['GRU uni-direcional'] = accuracy_gru_uni

# GRU Bidirecional
model_gru_bi = create_gru_bi_model()
accuracy_gru_bi = train_and_evaluate_model(model_gru_bi, X_train_vectorized, y_train, X_test_vectorized, y_test, epochs=10)
results['GRU bi-direcional'] = accuracy_gru_bi

  super().__init__(**kwargs)


Epoch 1/10
235/235 - 6s - 26ms/step - accuracy: 0.1964 - loss: -3.6156e+01 - val_accuracy: 0.1780 - val_loss: -6.2386e+01
Epoch 2/10
235/235 - 3s - 12ms/step - accuracy: 0.1964 - loss: -8.0972e+01 - val_accuracy: 0.1780 - val_loss: -1.0453e+02
Epoch 3/10
235/235 - 3s - 12ms/step - accuracy: 0.1964 - loss: -1.2084e+02 - val_accuracy: 0.1780 - val_loss: -1.4507e+02
Epoch 4/10
235/235 - 3s - 12ms/step - accuracy: 0.1964 - loss: -1.5979e+02 - val_accuracy: 0.1780 - val_loss: -1.8498e+02
Epoch 5/10
235/235 - 3s - 13ms/step - accuracy: 0.1964 - loss: -1.9838e+02 - val_accuracy: 0.1780 - val_loss: -2.2466e+02
Epoch 6/10
235/235 - 3s - 12ms/step - accuracy: 0.1964 - loss: -2.3672e+02 - val_accuracy: 0.1780 - val_loss: -2.6418e+02
Epoch 7/10
235/235 - 3s - 12ms/step - accuracy: 0.1964 - loss: -2.7501e+02 - val_accuracy: 0.1780 - val_loss: -3.0449e+02
Epoch 8/10
235/235 - 3s - 12ms/step - accuracy: 0.1964 - loss: -3.1476e+02 - val_accuracy: 0.1780 - val_loss: -3.4538e+02
Epoch 9/10
235/235 - 3s 

  super().__init__(**kwargs)


Epoch 1/10
235/235 - 6s - 24ms/step - accuracy: 0.1958 - loss: -6.9422e+01 - val_accuracy: 0.1780 - val_loss: -1.1882e+02
Epoch 2/10
235/235 - 4s - 15ms/step - accuracy: 0.1964 - loss: -1.5458e+02 - val_accuracy: 0.1780 - val_loss: -2.0047e+02
Epoch 3/10
235/235 - 3s - 15ms/step - accuracy: 0.1964 - loss: -2.3261e+02 - val_accuracy: 0.1780 - val_loss: -2.8003e+02
Epoch 4/10
235/235 - 3s - 14ms/step - accuracy: 0.1964 - loss: -3.0922e+02 - val_accuracy: 0.1780 - val_loss: -3.5875e+02
Epoch 5/10
235/235 - 3s - 14ms/step - accuracy: 0.1964 - loss: -3.8534e+02 - val_accuracy: 0.1780 - val_loss: -4.3712e+02
Epoch 6/10
235/235 - 3s - 15ms/step - accuracy: 0.1964 - loss: -4.6113e+02 - val_accuracy: 0.1780 - val_loss: -5.1524e+02
Epoch 7/10
235/235 - 3s - 14ms/step - accuracy: 0.1964 - loss: -5.3678e+02 - val_accuracy: 0.1780 - val_loss: -5.9326e+02
Epoch 8/10
235/235 - 3s - 14ms/step - accuracy: 0.1964 - loss: -6.1233e+02 - val_accuracy: 0.1780 - val_loss: -6.7116e+02
Epoch 9/10
235/235 - 3s 

In [14]:
# Identificar o melhor modelo
best_model = max(results, key=results.get)

# Criar uma lista de dicionários com os dados dos modelos
data = []
for model_name, accuracy in results.items():
    is_best = "S" if model_name == best_model else "N"
    data.append({"Método": model_name, "Acurácia": accuracy, "Melhor (S/N)": is_best})

# Criar um DataFrame com os resultados
df_results = pd.DataFrame(data)

# Exibir o DataFrame com os resultados
print(df_results)

                Método  Acurácia Melhor (S/N)
0  LSTM uni-direcional     0.178            S
1   LSTM bi-direcional     0.178            N
2   GRU uni-direcional     0.178            N
3    GRU bi-direcional     0.178            N
