# Sentiment analysis with Word2Vec


- Convert words to vectors with Word2Vec
- Use the word representations produced by Word2Vec as the input of an RNN


In [15]:
import numpy as np
from gensim.models import Word2Vec
from tensorflow.keras import layers, Sequential
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import EarlyStopping
import gensim.downloader as api

In [2]:
#Load the data
def load_data(percentage_of_sentences=None):
    """Load the IMDB reviews dataset and split every review into words.

    Args:
        percentage_of_sentences: Optional percentage in (0, 100]. When given,
            only the leading slice of that size is kept from both the train
            and the test split. When None, the full splits are used.

    Returns:
        A tuple (X_train, y_train, X_test, y_test). X_* are lists of word
        lists produced by text_to_word_sequence; y_* are the labels as
        returned by tfds.as_numpy (sliced like the sentences).
    """
    # batch_size=-1 returns each split as full tensors; as_supervised=True
    # gives every split an (input, label) 2-tuple structure.
    train_data, test_data = tfds.load(
        name="imdb_reviews",
        split=["train", "test"],
        batch_size=-1,
        as_supervised=True,
    )

    train_sentences, y_train = tfds.as_numpy(train_data)
    test_sentences, y_test = tfds.as_numpy(test_data)

    # Optionally keep only the first N% of each split.
    if percentage_of_sentences is not None:
        assert 0 < percentage_of_sentences <= 100

        fraction = percentage_of_sentences / 100

        n_train = int(fraction * len(train_sentences))
        train_sentences, y_train = train_sentences[:n_train], y_train[:n_train]

        n_test = int(fraction * len(test_sentences))
        test_sentences, y_test = test_sentences[:n_test], y_test[:n_test]

    def tokenize(raw_reviews):
        # Reviews arrive as bytes: decode them, then split into words.
        return [text_to_word_sequence(review.decode("utf-8")) for review in raw_reviews]

    return tokenize(train_sentences), y_train, tokenize(test_sentences), y_test

# Work with the first 5% of the train and test splits only.
X_train, y_train, X_test, y_test = load_data(percentage_of_sentences=5)

2022-12-15 12:00:23.003394: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-12-15 12:00:23.003444: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2022-12-15 12:00:23.003463: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (DESKTOP-ETLHI67): /proc/driver/nvidia/version does not exist
2022-12-15 12:00:23.003893: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
#Train a word2vec model on our sentences
word2vec = Word2Vec(
    sentences=X_train,
    vector_size=30,  # dimension of every word vector
    window=2,        # context window size (max distance between current and predicted word)
    min_count=5,     # ignore words that appear fewer than 5 times
)
print(word2vec)

Word2Vec<vocab=4871, vector_size=30, alpha=0.025>


In [4]:
# Function to convert a sentence (list of words) into a matrix representing the words in the embedding space
def embed_sentence(word2vec, sentence):
    """Convert a sentence (list of words) into a matrix of word vectors.

    Args:
        word2vec: Either a trained model exposing its vectors as ``.wv`` or
            the keyed-vectors object itself. The vectors object must support
            ``word in vectors``, ``vectors[word]`` and ``vectors.vector_size``.
        sentence: Iterable of word strings.

    Returns:
        np.ndarray of shape (n_known_words, vector_size), one row per word
        found in the vocabulary, in sentence order. Unknown words are
        skipped. When no word is known the result has shape
        (0, vector_size) rather than (0,), so every output is 2-D.
    """
    # Accept both a model (vectors live under `.wv`) and bare keyed vectors.
    vectors = getattr(word2vec, "wv", word2vec)

    known_vectors = [vectors[word] for word in sentence if word in vectors]

    if not known_vectors:
        # np.array([]) would be 1-D; keep the embedding axis so callers can
        # always rely on shape[-1] == vector_size.
        return np.zeros((0, vectors.vector_size), dtype="float32")

    return np.array(known_vectors)

# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    """Embed every sentence with embed_sentence.

    Args:
        word2vec: Model passed through unchanged to embed_sentence.
        sentences: Iterable of sentences (each a list of words).

    Returns:
        A list with one embedded matrix per sentence, in input order.
    """
    return [embed_sentence(word2vec, sentence) for sentence in sentences]

# Embed the training and test sentences
X_train_embed = embedding(word2vec, X_train)
X_test_embed = embedding(word2vec, X_test)

# Pad the embedded sentences to a common length of 200 timesteps.
# padding="post" appends the padding after the real words; float32 keeps the
# embedding values instead of casting them to the default integer dtype.
X_train_pad = pad_sequences(
    X_train_embed,
    dtype="float32",
    padding="post",
    maxlen=200,
)
X_test_pad = pad_sequences(
    X_test_embed,
    dtype="float32",
    padding="post",
    maxlen=200,
)

In [5]:
# Sanity-check the padded inputs. Each one must be a 3-D NumPy array whose
# last axis has the size of the Word2Vec embedding space and whose first axis
# has one entry per sentence of the corresponding split.
for X in [X_train_pad, X_test_pad]:
    assert isinstance(X, np.ndarray)
    assert X.ndim == 3  # (sentences, timesteps, embedding) - previously only described, never checked
    assert X.shape[-1] == word2vec.wv.vector_size

assert X_train_pad.shape[0] == len(X_train)
assert X_test_pad.shape[0] == len(X_test)

# Baseline accuracy
The two labels are roughly balanced, so always predicting the same class gives a baseline accuracy of about 0.5.

In [6]:
# Baseline = accuracy of always predicting the majority class.
# The share of positive labels alone is only a valid baseline when positives
# are the majority, so take the larger of the two class shares.
positive_rate = y_train.sum() / y_train.shape[0]
baseline_accuracy = max(positive_rate, 1 - positive_rate)
baseline_accuracy

0.5048

# RNN Model (LSTM) - without transfer learning


In [7]:
# Number of words that received a vector (4871 on 5% of imdb_reviews).
vocab_size = len(word2vec.wv.vectors)

In [8]:
def init_model():
    """Build and compile the LSTM binary classifier used in this notebook.

    Architecture: Masking -> LSTM(20, tanh) -> Dense(10, relu) ->
    Dense(1, sigmoid). Compiled with binary cross-entropy loss, the rmsprop
    optimizer and the accuracy metric.

    Returns:
        The compiled (untrained) Sequential model.
    """
    network = Sequential(
        [
            # Masking with its default mask value, so padded timesteps can be
            # skipped by the LSTM.
            layers.Masking(),
            layers.LSTM(20, activation="tanh"),
            layers.Dense(10, activation="relu"),
            # Single sigmoid unit: probability of the positive class.
            layers.Dense(1, activation="sigmoid"),
        ]
    )

    network.compile(
        loss="binary_crossentropy",
        optimizer="rmsprop",
        metrics=["accuracy"],
    )
    return network

# Fresh, untrained model for the embeddings trained on our own sentences.
model = init_model()

In [11]:
# Early stopping has to watch held-out data: training accuracy keeps rising
# while the model overfits, so monitoring it gives no useful stopping signal.
# Hold out the last 20% of the training data, stop when its loss has not
# improved for 3 epochs, and keep the best weights seen.
callback = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
model.fit(
    X_train_pad,
    y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=128,
    callbacks=[callback],
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f91098338e0>

In [12]:
# Show the layers, output shapes and parameter counts of the trained model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking (Masking)           (None, 200, 30)           0         
                                                                 
 lstm (LSTM)                 (None, 20)                4080      
                                                                 
 dense (Dense)               (None, 10)                210       
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 4,301
Trainable params: 4,301
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Sigmoid outputs for the test reviews: one value in [0, 1] per review.
model.predict(X_test_pad)



array([[0.6017388 ],
       [0.6507066 ],
       [0.45992807],
       ...,
       [0.40447983],
       [0.46417248],
       [0.49000248]], dtype=float32)

In [14]:
# Returns [loss, accuracy] on the test set.
model.evaluate(X_test_pad, y_test)
# Accuracy is around 0.55 on this run, barely above the 0.5 baseline.



[0.6814813613891602, 0.5544000267982483]

# Same model - with transfer learning (pre-trained GloVe vectors, `glove-wiki-gigaword-50`)

In [17]:
# Load the pre-trained 'glove-wiki-gigaword-50' embeddings through the gensim downloader.
# The model name was picked from the list printed by:
#     print(list(api.info()['models'].keys()))
# NOTE: despite the variable name these are GloVe vectors, not a Word2Vec model;
# the object is indexed directly (no `.wv`) in the cells below.
word2vec_transfer = api.load('glove-wiki-gigaword-50')

In [18]:
# Size of the pre-trained vocabulary and dimension of each of its vectors.
vocab_size_transfer = len(word2vec_transfer.vectors)
embedding_size_transfer = word2vec_transfer.vector_size

In [19]:
# Function to convert a sentence (list of words) into a matrix representing the words in the embedding space
def embed_sentence_with_TF(word2vec, sentence):
    """Convert a sentence (list of words) into a matrix of pre-trained vectors.

    Args:
        word2vec: Keyed-vectors object indexed directly by word. It must
            support ``word in word2vec``, ``word2vec[word]`` and expose
            ``vector_size``.
        sentence: Iterable of word strings.

    Returns:
        np.ndarray of shape (n_known_words, vector_size), one row per word
        found in the vocabulary, in sentence order. Unknown words are
        skipped. When no word is known the result has shape
        (0, vector_size) rather than (0,), so every output is 2-D.
    """
    known_vectors = [word2vec[word] for word in sentence if word in word2vec]

    if not known_vectors:
        # np.array([]) would be 1-D; keep the embedding axis so callers can
        # always rely on shape[-1] == vector_size.
        return np.zeros((0, word2vec.vector_size), dtype="float32")

    return np.array(known_vectors)

# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    """Embed every sentence with embed_sentence_with_TF.

    This definition replaces the `embedding` function defined earlier in the
    notebook. To keep earlier calls working after this cell has run, a model
    that stores its vectors under ``.wv`` is unwrapped first; bare keyed
    vectors are used as they are.

    Args:
        word2vec: Keyed vectors, or a model exposing them as ``.wv``.
        sentences: Iterable of sentences (each a list of words).

    Returns:
        A list with one embedded matrix per sentence, in input order.
    """
    # NOTE(review): a gensim 4 Word2Vec model presumably cannot be indexed
    # directly; going through `.wv` avoids depending on that.
    vectors = getattr(word2vec, "wv", word2vec)
    return [embed_sentence_with_TF(vectors, sentence) for sentence in sentences]

# Embed the training and test sentences with the pre-trained vectors.
# Words missing from the pre-trained vocabulary are dropped by embed_sentence_with_TF.
X_train_embed_transfer = embedding(word2vec_transfer, X_train)
X_test_embed_transfer = embedding(word2vec_transfer, X_test)

In [21]:
# Pad the pre-trained embeddings exactly like the first experiment:
# 200 timesteps, padding appended after the real words, float32 values.
X_train_pad_transfer = pad_sequences(
    X_train_embed_transfer,
    dtype="float32",
    padding="post",
    maxlen=200,
)
X_test_pad_transfer = pad_sequences(
    X_test_embed_transfer,
    dtype="float32",
    padding="post",
    maxlen=200,
)

In [22]:
# Same architecture as before, re-initialised so only the input embeddings differ.
model_transfer = init_model()

In [24]:
# Without a callback this always ran the full 100 epochs with nothing held out.
# Keep 100 as an upper bound only: hold out the last 20% of the training data,
# stop when its loss has not improved for 3 epochs, and keep the best weights.
callback_transfer = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
model_transfer.fit(
    X_train_pad_transfer,
    y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=128,
    callbacks=[callback_transfer],
    verbose=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f90dc9ad210>

In [25]:
# Sigmoid outputs for the test reviews: one value in [0, 1] per review.
model_transfer.predict(X_test_pad_transfer)



array([[0.5925275 ],
       [0.50295913],
       [0.48237646],
       ...,
       [0.5291006 ],
       [0.6257042 ],
       [0.502998  ]], dtype=float32)

In [27]:
# res holds [loss, accuracy] on the test set.
res = model_transfer.evaluate(X_test_pad_transfer, y_test, verbose=2)
# Accuracy of about 0.60 on this run.
# Conclusion: the larger pre-trained embedding vocabulary gives better accuracy here,
# at the price of more time per training iteration.

40/40 - 2s - loss: 0.6708 - accuracy: 0.6000 - 2s/epoch - 55ms/step
