# Part 0 : Dataset Preparation

### Import essential libraries needed

##### Gensim is selected as it offers the ability to use pre-trained embeddings and compatibility with other libraries such as Scikit-learn and Pandas for downstream tasks in machine learning workflows.

In [24]:
#Installing packages
!pip install datasets
!pip install gensim
!pip install nltk



### Load in dataset

In [2]:
import gensim
from gensim.utils import simple_preprocess
import numpy as np
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Fetch the Rotten Tomatoes review dataset and keep a handle on each of the
# three splits it exposes.
dataset = load_dataset("rotten_tomatoes")
train_dataset, validation_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

In [4]:
# Show the first five records of every split, in train / validation / test order
for split in (train_dataset, validation_dataset, test_dataset):
    print(split[:5])

{'text': ['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .', 'effective but too-tepid biopic', 'if you sometimes like to go to the movies to have fun , wasabi is a good place to start .', "emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one ."], 'label': [1, 1, 1, 1, 1]}
{'text': ['compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .', 'the soundtrack alone is worth the price of admission .', 'rodriguez does a splendid job of racial profiling hollywood style--casting excellent

### Text Preprocessing for Word2Vec Training

##### We initialize a list to store the processed words from our training dataset. Each sentence is preprocessed by performing the following steps:

##### Convert each document into a list of lowercase tokens, ignoring tokens that are too short or too long (by default, shorter than 2 or longer than 15 characters). `simple_preprocess` uses `gensim.utils.tokenize` internally.

In [5]:
# Tokenise every training review with gensim's simple_preprocess and collect the
# resulting token lists, one list per review, in the original dataset order.
processed_sentences = [simple_preprocess(text) for text in train_dataset['text']]

In [6]:
# Inspect the tokenised form of the first five training reviews
first_five_processed = processed_sentences[:5]
print(first_five_processed)

[['the', 'rock', 'is', 'destined', 'to', 'be', 'the', 'st', 'century', 'new', 'conan', 'and', 'that', 'he', 'going', 'to', 'make', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', 'jean', 'claud', 'van', 'damme', 'or', 'steven', 'segal'], ['the', 'gorgeously', 'elaborate', 'continuation', 'of', 'the', 'lord', 'of', 'the', 'rings', 'trilogy', 'is', 'so', 'huge', 'that', 'column', 'of', 'words', 'cannot', 'adequately', 'describe', 'co', 'writer', 'director', 'peter', 'jackson', 'expanded', 'vision', 'of', 'tolkien', 'middle', 'earth'], ['effective', 'but', 'too', 'tepid', 'biopic'], ['if', 'you', 'sometimes', 'like', 'to', 'go', 'to', 'the', 'movies', 'to', 'have', 'fun', 'wasabi', 'is', 'good', 'place', 'to', 'start'], ['emerges', 'as', 'something', 'rare', 'an', 'issue', 'movie', 'that', 'so', 'honest', 'and', 'keenly', 'observed', 'that', 'it', 'doesn', 'feel', 'like', 'one']]


# Part 1. Preparing Word Embeddings

### Gensim comes with several already pre-trained models, in the Gensim-data repository:

- `conceptnet-numberbatch-17-06-300`
- `word2vec-ruscorpora-300`
- `word2vec-google-news-300`
- `glove-wiki-gigaword-50`
- `glove-wiki-gigaword-100`
- `glove-wiki-gigaword-200`
- `glove-wiki-gigaword-300`
- `glove-twitter-25`
- `glove-twitter-50`
- `glove-twitter-100`
- `glove-twitter-200`
- `__testing_word2vec-matrix-synopsis`

##### In the model names, the trailing number is the dimensionality of the word vectors (embeddings) of that pre-trained model. Higher dimensions can capture more complex relationships between words but require more memory and computational resources.


##### Model name examples:

- **`word2vec-google-news-300`**: A Word2Vec model trained on Google News, providing embeddings with 300 dimensions.
- **`glove-twitter-100`**: A GloVe model trained on Twitter data, with 100-dimensional embeddings specialized for social media language.

##### We will explore a few of these pre-trained models to see which one does best at mitigating the number of OOV words in our dataset. To keep the comparison fair, we will select the model with the highest dimensionality from each category.


In [None]:
from pathlib import Path

from gensim.models import KeyedVectors, Word2Vec
import gensim.downloader as api

# Specify path to save models
save_paths = {
    'word2vec_ruscorpora_300': './word2vec_ruscorpora_300.model',
    'word2vec_google_300': './word2vec_google_news_300.model',
    'glove_wiki_300': './glove_wiki_gigaword_300.model',
    'glove_twitter_200': './glove_twitter_200.model'
}


def _load_or_download(gensim_data_name, save_path):
    """Return the vectors for `gensim_data_name`, reusing a local copy when present.

    If `save_path` already exists, the vectors are loaded from it so that
    re-running the notebook does not repeat the slow fetch-and-save step.
    Otherwise the model is obtained through the gensim downloader and written
    to `save_path` for later runs.
    """
    if Path(save_path).exists():
        return KeyedVectors.load(save_path)
    vectors = api.load(gensim_data_name)
    vectors.save(save_path)
    return vectors


# Obtain each model (the first run can take a few minutes per model)
word2vec_ruscorpora_300 = _load_or_download(
    'word2vec-ruscorpora-300', save_paths['word2vec_ruscorpora_300']
)
word2vec_google_300 = _load_or_download(
    'word2vec-google-news-300', save_paths['word2vec_google_300']
)
glove_wiki_300 = _load_or_download(
    'glove-wiki-gigaword-300', save_paths['glove_wiki_300']
)
glove_twitter_200 = _load_or_download(
    'glove-twitter-200', save_paths['glove_twitter_200']
)


In [10]:
from gensim.models import KeyedVectors

# Read the four previously saved embedding models back from the project
# directory, in the same order as the names on the left-hand side.
(
    word2vec_ruscorpora_300,
    word2vec_google_300,
    glove_wiki_300,
    glove_twitter_200,
) = (
    KeyedVectors.load(model_file)
    for model_file in (
        './word2vec_ruscorpora_300.model',
        './word2vec_google_news_300.model',
        './glove_wiki_gigaword_300.model',
        './glove_twitter_200.model',
    )
)

print("Models loaded from local files.")

Models loaded from local files.


##### Question 1(a): Calculate vocabulary size from processed sentences previously in part 0

In [11]:
# Merge the tokens of all processed sentences into one set of unique words
train_vocab = set().union(*processed_sentences)

# (1a) Vocabulary size
vocab_size = len(train_vocab)
print("Size of vocabulary in training data:", vocab_size)

Size of vocabulary in training data: 16288


##### Question 1(b) Identify out-of-vocabulary (OOV) words while taking into account various models

In [12]:
# Number of entries held by the ruscorpora model
word2vec_ruscorpora_300_vocab_size = len(word2vec_ruscorpora_300.key_to_index)
print("Size of vocabulary in word2vec_ruscorpora_300 model:", word2vec_ruscorpora_300_vocab_size)

# (b) Keep only the training words the model has no vector for, then count them
oov_words = list(filter(lambda token: token not in word2vec_ruscorpora_300, train_vocab))
oov_count = len(oov_words)
print("Number of OOV words:", oov_count)

Size of vocabulary in word2vec_ruscorpora_300 model: 184973
Number of OOV words: 16288


In [13]:
# Number of entries held by the word2vec_google_300 model
word2vec_google_300_vocab_size = len(word2vec_google_300.key_to_index)
print("Size of vocabulary in word2vec_google_300 model:", word2vec_google_300_vocab_size)

# (b) Keep only the training words the model has no vector for, then count them
oov_words = list(filter(lambda token: token not in word2vec_google_300, train_vocab))
oov_count = len(oov_words)
print("Number of OOV words:", oov_count)

Size of vocabulary in word2vec_google_300 model: 3000000
Number of OOV words: 1473


In [14]:
# Number of entries held by the glove_wiki_300 model
glove_wiki_300_vocab_size = len(glove_wiki_300.key_to_index)
print("Size of vocabulary in glove_wiki_300 model:", glove_wiki_300_vocab_size)

# (b) Keep only the training words the model has no vector for, then count them
oov_words = list(filter(lambda token: token not in glove_wiki_300, train_vocab))
oov_count = len(oov_words)
print("Number of OOV words:", oov_count)

Size of vocabulary in glove_wiki_300 model: 400000
Number of OOV words: 580


In [15]:
# Number of entries held by the glove_twitter_200 model
glove_twitter_200_vocab_size = len(glove_twitter_200.key_to_index)
print("Size of vocabulary in glove_twitter_200 model:", glove_twitter_200_vocab_size)

# (b) Keep only the training words the model has no vector for, then count them
oov_words = list(filter(lambda token: token not in glove_twitter_200, train_vocab))
oov_count = len(oov_words)
print("Number of OOV words:", oov_count)

Size of vocabulary in glove_twitter_200 model: 1193514
Number of OOV words: 1473


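##### From this analysis, we can tell that the glove_wiki_300 model is the best-performing model on the training dataset: although it has a smaller vocabulary than the word2vec_google_300 and glove_twitter_200 models, it still produced the fewest OOV words. (The word2vec_ruscorpora_300 model covers none of our vocabulary, with all 16288 words being OOV, because it was trained on a Russian corpus.) We will use the glove_wiki_300 model from now onwards.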

### Step 3: Prepare the Embedding Matrix

##### Create an embedding matrix for later parts, each row corresponds to the embedding of a specific word in the training vocabulary. For OOV words, we will use a zero vector.

In [16]:
# Build the embedding matrix for the training vocabulary: row i holds the
# GloVe vector of the word that `word_to_index` maps to i.

# Define the embedding dimension
embedding_dim = glove_wiki_300.vector_size

# Get the size of the vocabulary
train_vocab_size = len(train_vocab)

# Start from an all-zero matrix; rows of OOV words are simply left untouched,
# so they keep the zero vector.
embedding_matrix = np.zeros((train_vocab_size, embedding_dim))

# Map each word to a row index. The vocabulary is sorted first because the
# iteration order of a set of strings can differ from one kernel session to the
# next; sorting makes the word -> row assignment reproducible across runs.
word_to_index = {word: idx for idx, word in enumerate(sorted(train_vocab))}

# Copy the pre-trained vector of every in-vocabulary word into its row
for word, idx in word_to_index.items():
    if word in glove_wiki_300:
        embedding_matrix[idx] = glove_wiki_300[word]


In [17]:
# Report the matrix dimensions as (number of rows, embedding size)
matrix_shape = embedding_matrix.shape
print("Shape of the embedding matrix:", matrix_shape)

# Show the leading rows under a heading (widen the slice to see more)
print("First 5 embeddings:", embedding_matrix[:5], sep="\n")

Shape of the embedding matrix: (16288, 300)
First 5 embeddings:
[[ 0.24978     0.15938     0.19845    ...  1.00559998  0.19495
   0.31422001]
 [ 0.26120999  0.31194001 -0.14826    ... -0.67054999 -0.28485
   0.13755   ]
 [ 0.038951   -0.12122    -0.29510999 ...  0.50962001 -0.68910003
  -0.25566   ]
 [-0.61173999  0.34871    -0.92996001 ...  0.31099999  0.91194999
  -0.31082001]
 [ 0.63892001  0.027604    0.13714001 ...  0.086945   -0.35929
   0.33945   ]]


In [18]:
# Spot-check that a few words from the training set were assigned a row index
words_to_check = ['the', 'rock', 'is', 'destined', 'to']

for word in words_to_check:
    message = (
        f"'{word}' is in the vocabulary."
        if word in word_to_index
        else f"'{word}' is not in the vocabulary."
    )
    print(message)


'the' is in the vocabulary.
'rock' is in the vocabulary.
'is' is in the vocabulary.
'destined' is in the vocabulary.
'to' is in the vocabulary.


In [21]:
# Round-trip the first processed sentence through the vocabulary mapping
# (word -> row index -> word) to confirm the index assignment is consistent.
first_sentence = processed_sentences[0]  # a list of tokens

# Inverse of word_to_index, so that a row index can be turned back into its word
index_to_word = {idx: word for word, idx in word_to_index.items()}

# Encode: keep the row index of every word that is in the vocabulary
sentence_indices = [word_to_index[word] for word in first_sentence if word in word_to_index]

# Every index must address an existing row of the embedding matrix
assert all(0 <= idx < len(embedding_matrix) for idx in sentence_indices)

# Decode: recover the words from their row indices
reconstructed_sentence = [index_to_word[idx] for idx in sentence_indices]

# Join the words to form the reconstructed sentence
reconstructed_sentence_str = ' '.join(reconstructed_sentence)

# Output the reconstructed sentence
print("Reconstructed first sentence from embeddings:")
print(reconstructed_sentence_str)



Reconstructed first sentence from embeddings:
the rock is destined to be the st century new conan and that he going to make splash even greater than arnold schwarzenegger jean claud van damme or steven segal


##### Question 1(c) Implementing a strategy to handle OOV words

##### Strategy 1: Calculates a sentence vector by averaging in-vocabulary word embeddings, effectively ignoring OOV words.

##### This approach: 

- This approach creates a vector representation for a sentence by taking the average of the word embeddings for all in-vocabulary words in the sentence.
- By ignoring OOV words (words without embeddings), the strategy reduces the potential impact of missing vectors.


In [17]:
def get_word_vector(word):
    """Look up `word` in the GloVe vectors; an out-of-vocabulary word yields an all-zero vector."""
    if word not in glove_wiki_300:
        return np.zeros(embedding_dim)
    return glove_wiki_300[word]

def sentence_vector(sentence, vectors=None, dim=None):
    """Average the embeddings of the in-vocabulary words of a sentence.

    Out-of-vocabulary words are skipped entirely, so they do not pull the
    average towards zero.

    Args:
        sentence: Iterable of word tokens.
        vectors: Mapping-like object supporting `word in vectors` and
            `vectors[word]`. Defaults to the `glove_wiki_300` model.
        dim: Length of the zero vector returned when no word of the sentence
            has an embedding. Defaults to `embedding_dim` for the default
            model, otherwise to `vectors.vector_size`.

    Returns:
        The element-wise mean of the found word vectors, or a zero vector of
        length `dim` if the sentence is empty or contains only OOV words.
    """
    if vectors is None:
        vectors = glove_wiki_300
        if dim is None:
            dim = embedding_dim
    elif dim is None:
        dim = vectors.vector_size

    # Single membership test per word; OOV words contribute nothing to the mean
    known_vectors = [vectors[word] for word in sentence if word in vectors]
    if not known_vectors:
        return np.zeros(dim)
    return np.mean(known_vectors, axis=0)

# Demonstration: embed the first processed training sentence and show the result
sample_sentence = processed_sentences[0]
sentence_vec = sentence_vector(sample_sentence)
print("Sentence vector:", sentence_vec)


Sentence vector: [-1.53414346e-02 -9.83752590e-03 -5.52846305e-02 -9.99514684e-02
  3.01311947e-02  1.24417685e-01  3.51512656e-02 -8.86158645e-02
  5.73788676e-03 -1.08949292e+00  1.85584605e-01 -5.60377762e-02
 -9.68933664e-03  1.03667580e-01  6.70564100e-02  5.24014123e-02
 -1.68635145e-01 -3.12670171e-02  4.08079997e-02  7.22704735e-03
  3.13387737e-02  1.88141063e-01  1.58004612e-01  7.59189427e-02
 -1.81370601e-01  7.10277483e-02  1.25491977e-01 -1.95886679e-02
  5.73181361e-02  2.00815815e-02  7.70360678e-02  2.06905693e-01
 -2.18780972e-02  8.56238753e-02 -7.37673342e-01 -2.27190293e-02
 -5.49495742e-02 -4.15924937e-02  4.21326384e-02 -2.35538818e-02
 -1.13561312e-02 -8.40994269e-02 -1.04256593e-01 -5.45769371e-02
  1.49288565e-01  9.94461104e-02  1.03147775e-01  7.92050287e-02
 -6.59514889e-02 -2.47636642e-02  3.56394015e-02 -1.05243325e-01
 -2.51182504e-02  8.16771686e-02  3.56279649e-02  1.22403152e-01
 -1.23865858e-01  3.06245804e-01  1.17730126e-01 -1.11946091e-01
  2.5853

##### Strategy 2: Using subword embeddings, such as those generated by FastText.

##### This approach:

-  Address the OOV problem by creating word embeddings based on subword components (like character n-grams).
- This allows the model to create a vector representation for a word even if it hasn’t encountered that exact word in the training data.
- For instance, if the model has seen “exciting” and “amazing” but not “excited,” it can still generate a meaningful vector for “excited” based on its subword parts.

##### For this part, download the fasttext model from this link: https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.vec and place it in the project directory

In [19]:
# NOTE(review): nothing visible below imports the `fasttext` package (the vectors
# are read with gensim's KeyedVectors.load_word2vec_format), and the recorded
# output shows this build failing -- confirm whether this install is still needed.
!pip install fasttext

Collecting fasttext
  Using cached fasttext-0.9.3.tar.gz (73 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml): started
  Building wheel for fasttext (pyproject.toml): finished with status 'error'
Failed to build fasttext


  error: subprocess-exited-with-error
  
  × Building wheel for fasttext (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [31 lines of output]
      !!
      
              ********************************************************************************
              Usage of dash-separated 'description-file' will not be supported in future
              versions. Please use the underscore name 'description_file' instead.
      
              By 2025-Mar-03, you need to update your project and remove deprecated calls
              or your builds will no longer be supported.
      
              See https://setuptools.pypa.io/en/latest/userguide/declarative_config.html for details.
              ********************************************************************************
      
      !!
        opt = self.warn_dash_deprecation(opt, section)
      running bdist_wheel
      running build
      running build_py
      creating build\lib.win-amd64-cpython-311\fasttext
  

##### Load model for usage, will take a couple of minutes

In [20]:
# Parse the text-format FastText vectors that were placed in the project directory
fasttext_vec_path = './wiki.en.vec'
fasttext_model = KeyedVectors.load_word2vec_format(fasttext_vec_path, binary=False)

In [21]:
# Tally the training-vocabulary words that have no entry in the FastText vectors
oov_count = sum(1 for word in train_vocab if word not in fasttext_model)

# Output the total number of OOV words
print("Number of OOV words:", oov_count)

Number of OOV words: 239


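##### Using the FastText vectors, the number of OOV words is much lower (239, compared with 580 for glove_wiki_300). Note that `wiki.en.vec` stores only whole-word vectors, so words missing from the file are still OOV; building vectors for unseen words from character n-grams requires the binary FastText model (`wiki.en.bin`).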