In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec

In [48]:
# Load the reviews CSV from the notebook's working directory into a DataFrame.
imdb_df = pd.read_csv(filepath_or_buffer='./IMDB_reviews_cleaned.csv')

In [49]:
# Separate the target variable (sentiment) from the review texts.
# X stays a pandas Series; y is copied into a NumPy array.
X, y = imdb_df['review'], np.array(imdb_df['sentiment'])

In [50]:
# Build the training and evaluation sets: 80% train / 20% test, with a fixed
# random_state so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [51]:
# Text-preprocessing settings shared by the tokenization and padding cells.
oov_tok = "<OOV>"        # placeholder token for out-of-vocabulary words
trunc_type = 'post'      # truncation mode handed to pad_sequences
padding_type = 'post'    # padding mode handed to pad_sequences
max_seq_length = 1000    # fixed sequence length used when padding

In [52]:
# Initialize the Tokenizer class


In [53]:
# Tokenization: split the texts into smaller pieces (words) and assign a
# unique integer code to each of them.
# num_words=5000 is the vocabulary cap passed to the Keras Tokenizer and
# oov_tok is the token it substitutes for words outside that vocabulary.
tokenizer = Tokenizer(
    num_words=5000,
    oov_token=oov_tok,
)

# Build the word index from the training reviews only.
tokenizer.fit_on_texts(x_train)

# Replace the raw texts of both splits with their integer-id sequences.
# No padding happens in this cell; the sequences still have varying lengths.
x_train, x_test = (
    tokenizer.texts_to_sequences(split) for split in (x_train, x_test)
)

In [54]:
# Pad or truncate every tokenized review to exactly max_seq_length ids, using
# the configured padding_type / trunc_type, so each split has a fixed width.
# The test split must get the same treatment as the training split; otherwise
# x_test stays a ragged list of id lists and cannot be passed to a model that
# expects fixed-length inputs.
x_train = pad_sequences(x_train, maxlen=max_seq_length, padding=padding_type, truncating=trunc_type)
x_test = pad_sequences(x_test, maxlen=max_seq_length, padding=padding_type, truncating=trunc_type)

In [55]:
# Whitespace-tokenize every review in X (the full review column, i.e. train
# and test together) and fit a 100-dimensional Word2Vec model on the result.
# NOTE(review): because X includes the test reviews, the embeddings see test
# text during training -- confirm this is intended.
# NOTE(review): no seed is passed and workers=8, so vectors may differ between
# runs -- verify against the gensim docs if reproducibility matters.
sentences = []
for review in X:
    sentences.append(review.split())

model = Word2Vec(
    sentences,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=5,      # drop words seen fewer than 5 times
    workers=8,        # training threads
    sg=0,             # 0 selects CBOW in gensim (1 would be skip-gram)
)

In [56]:
# Build one fixed-size vector per review by averaging the Word2Vec vectors of
# its words, using the model trained in this notebook. Words missing from the
# model vocabulary are skipped; a review with no known words gets an all-zero
# vector of length model.vector_size.
# NOTE(review): despite its name, embedding_matrix has one row per review, not
# one row per vocabulary word -- confirm this is what downstream code expects.
vocabulary = model.wv.key_to_index
vectors = []
for tokens in sentences:
    known = [model.wv.get_vector(token) for token in tokens if token in vocabulary]
    if known:
        review_vector = np.mean(known, axis=0)
    else:
        review_vector = np.zeros(model.vector_size)
    vectors.append(review_vector)
embedding_matrix = np.array(vectors)

[ 0.04827379 -0.06985364  0.16005489  0.29827058 -0.09532056 -0.09384312
  0.44769517  0.03625048 -0.35462138 -0.11501411  0.10521481  0.06062239
  0.20488    -0.04550808 -0.25560886 -0.14145426  0.05348225 -0.08228648
 -0.4239182  -0.57626235  0.24634247 -0.3239937  -0.03933471 -0.27822173
 -0.22978428 -0.10450473 -0.30146003  0.1437652   0.4607057  -0.00098847
  0.2425458   0.23770556 -0.06206258 -0.22123651 -0.5344737   0.48707914
  0.14440753 -0.05623389  0.08839573 -0.04574089  0.3391397  -0.52163273
 -0.07640072  0.31453443 -0.02103771 -0.43906474 -0.08852153 -0.20163462
  0.09329267  0.41993594  0.06478628 -0.2502373   0.09069575  0.16391395
 -0.5702982   0.34577227 -0.23715968 -0.17989661 -0.22709465  0.06151568
 -0.00814888 -0.06365754  0.22962062  0.13933055 -0.09162042  0.08809141
  0.18052064  0.16083223 -0.50842947 -0.21516995 -0.2472593   0.28001684
 -0.15539628  0.4842375   0.705848    0.19922784 -0.20424418  0.44850245
  0.08992673  0.00338358 -0.27608138 -0.06852435 -0

In [57]:
# Sanity check: length of the longest sequence in x_train after padding.
max_len = max(map(len, x_train))
max_len

1000