# Part 0 Dataset Preparation

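Exploratory notebook: loads the Rotten Tomatoes dataset, tokenizes the training text, and builds a pretrained word-embedding lookup for the training vocabulary.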

In [1]:
%pip install datasets

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np

In [3]:
from datasets import load_dataset

# Load the Rotten Tomatoes dataset and bind each of its three splits to its own name.
dataset = load_dataset("rotten_tomatoes")
train_dataset, validation_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

In [4]:
from gensim.utils import simple_preprocess

# Preprocess: run gensim's simple_preprocess over every training text.
# `sentences` holds one preprocessed result per entry of train_dataset['text'], in order.
sentences = [simple_preprocess(line) for line in train_dataset['text']]

# Part 1 Preparing Word Embeddings

In [5]:
%pip install gensim
import gensim
from gensim.models import Word2Vec

Note: you may need to restart the kernel to use updated packages.


In [6]:
import gensim.downloader as api
# Alternative pretrained model that was also tried:
# wv = api.load('word2vec-google-news-300')
# The GloVe model below gave fewer out-of-vocabulary words (1938 vs ~2680, per the
# original note) even though it has a smaller vocabulary.
# NOTE(review): the reason for this has not been investigated — TODO confirm.
wv = api.load('glove-wiki-gigaword-300')

In [7]:
# Split the unique training tokens into two lists:
#   vocabulary -> tokens that have a vector in the pretrained model `wv`
#   oov        -> tokens that do not (out-of-vocabulary)
# Both lists keep first-occurrence order, exactly as the nested-loop version did.

# dict.fromkeys de-duplicates in O(1) per token while preserving insertion order,
# replacing the quadratic `word not in list` scans.
unique_words = dict.fromkeys(
    word for sentence in sentences for word in sentence
)

vocabulary = []
oov = []
for word in unique_words:
    # Each distinct token is looked up in the model only once.
    if word in wv:
        vocabulary.append(word)
    else:
        oov.append(word)

In [8]:
# Report the size and the contents of each word list: OOV words first, then the vocabulary.
for word_list in (oov, vocabulary):
    print(len(word_list))
    print(word_list)

580
['wisegirls', 'enrapturing', 'compleja', 'retadora', 'orquídeas', 'originalidad', 'suspenser', 'obviation', 'gorefests', 'waydowntown', 'makmalbaf', 'exhilarate', 'nuttgens', 'petin', 'provocatuers', 'jirí', 'hubac', 'shapelessly', 'addessi', 'seldahl', 'wollter', 'mullinski', 'avventura', 'needn', 'narcotizing', 'precollegiate', 'sparklingly', 'superlarge', 'destinees', 'margolo', 'dominatrixes', 'scuzbag', 'idoosyncratic', 'flatula', 'denlopp', 'updatings', 'watstein', 'sappier', 'condensada', 'divertida', 'visualmente', 'entretenida', 'sorprenderá', 'exporing', 'capturou', 'sarcástica', 'demencial', 'predecesora', 'complejos', 'cadness', 'shagster', 'powaqqatsi', 'policiales', 'últimos', 'kaputschnik', 'kickass', 'travil', 'splittingly', 'aborbing', 'monkeyfun', 'bierbichler', 'crummles', 'bustingly', 'stultifyingly', 'deutchland', 'datedness', 'inhospitability', 'næs', 'hastier', 'estava', 'existência', 'papai', 'fato', 'inquestionável', 'talancón', 'drippiness', 'oesn', 'monti

In [9]:
# Create the weight matrix and lookup dict for words in the training docs:
#   embedding_matrix[i]  -> vector of vocabulary[i] (row order follows `vocabulary`)
#   embedding_dict[word] -> the same vector, keyed by the word itself
# The row width is read from the loaded model instead of being hard-coded to 300.
embedding_matrix = np.zeros((len(vocabulary), wv.vector_size))
embedding_dict = {}

for i, word in enumerate(vocabulary):
    # Every entry of `vocabulary` already passed the `word in wv` check, so this
    # lookup always yields a vector; the former `is not None` guard was dead code.
    embedding_vector = wv[word]
    embedding_matrix[i] = embedding_vector
    embedding_dict[word] = embedding_vector

In [10]:
# Quick visual check of the embedding matrix built in the previous cell.
print(embedding_matrix)

[[ 0.04656     0.21318001 -0.0074364  ...  0.0090611  -0.20988999
   0.053913  ]
 [-0.14924     0.021244   -0.34240001 ...  0.64679998 -0.37239
  -0.085055  ]
 [-0.1749      0.22956     0.24924    ... -0.24131    -0.40402001
   0.054744  ]
 ...
 [ 0.33013001  0.47231999  0.11104    ...  0.43649    -0.31005001
   0.12175   ]
 [ 0.28295001  0.059428    0.12142    ... -0.24597999 -0.24743
  -0.46906   ]
 [ 0.30191001  0.095044    0.68882    ...  0.23462     0.044753
  -0.83178002]]


In [11]:
# Save the word -> vector dictionary to 'embeddings.npy'.
# After loading it back, get the embedding of a word by accessing embedding_dict[word].
# NOTE: embedding_dict is a plain Python dict, so NumPy stores it as a pickled object;
# reading it back requires np.load(..., allow_pickle=True) followed by .item().
np.save('embeddings.npy', embedding_dict)


In [12]:
# Sample code for how to load and read the saved file.
# allow_pickle is passed as the boolean True: the previous string 'TRUE' only worked
# because any non-empty string is truthy. The file holds the dict wrapped in a
# 0-d object array, and .item() unwraps it back into a regular dict.
# SECURITY: unpickling can execute arbitrary code — only load files you created yourself.
read_dictionary = np.load('embeddings.npy', allow_pickle=True).item()
print(read_dictionary["word"])

[-4.3036e-01 -4.0622e-01 -1.8279e-01 -1.2548e-01  4.3490e-02 -2.1216e-01
 -3.6509e-01  3.2598e-02 -1.0828e-01 -1.3537e+00 -1.9152e-01  1.8976e-01
 -4.3755e-01  2.4337e-01  2.9676e-01 -4.2886e-02 -4.9444e-02  3.7994e-01
  5.5679e-02  1.0740e-01 -1.5195e-01  2.7901e-03 -5.0005e-02  1.6948e-01
 -1.7819e-01 -1.7449e-01  1.8066e-01 -3.5179e-01  4.3839e-01  2.3936e-01
 -1.2200e-01 -1.3631e-02 -2.7989e-01  4.4950e-01 -6.5760e-01 -7.7317e-02
  6.9246e-02 -7.7499e-01 -4.3509e-01  1.6177e-01 -9.4878e-02  2.5968e-01
 -3.6336e-02  2.0131e-01  1.6860e-01 -4.3616e-01 -2.1700e-02  6.6751e-02
 -3.5336e-01  3.5581e-01  4.3044e-01 -1.2607e-01  9.4664e-01  2.5349e-01
 -6.7874e-01  1.0727e-01  1.1090e-01 -2.2619e-02  5.4648e-01  6.4194e-01
  7.1169e-01  4.4902e-02  1.0753e-01  4.5971e-01 -4.1282e-02  2.1160e-01
  3.2395e-01  1.9663e-01  4.1871e-01  3.7204e-01  4.2732e-02 -1.6376e-01
 -1.3316e-01  6.7047e-02 -1.2618e-01 -6.9014e-02  1.0433e+00  2.7489e-01
 -8.0330e-03 -1.7116e-01  4.8991e-02  3.0116e-02 -1