# Making the word embedding

Running the cells below reads in a text file called "clean_data.txt" of the form

    agent_name_speaker1 cleaned_quote1 
    agent_name_speaker2 cleaned_quote2 
    ...
    
and trains a word embedding from the text, using the gensim package. 

The word embedding is stored as "word_embedding.emb" using gensim's native model-saving function, and can be loaded back into a gensim embedding via `gensim.models.KeyedVectors.load("word_embedding.emb")`.

In [1]:
import logging
from random import random, seed, shuffle

import numpy as np
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# Random seeds for the two sources of randomness in this notebook:
#   data_seed  - default seed of DataGenerator, which shuffles the lines fed to training
#   model_seed - passed to Word2Vec as its `seed` argument
# Changing either value will produce a different embedding.

data_seed = 42 
model_seed = 42

## 1. Define some components used for training

In [6]:
# This class is needed to print out the loss during training, which can be useful to monitor
# the convergence behaviour: if the loss decreases significantly between two epochs, the 
# model may not be fully trained yet.

class PrintLoss(CallbackAny2Vec):
    """Training callback that reports the loss of every epoch.

    The value returned by ``model.get_latest_training_loss()`` is treated as a
    running total, so the loss of a single epoch is obtained by subtracting
    the total recorded at the end of the previous epoch.

    Adapted from
    https://stackoverflow.com/questions/54888490/gensim-word2vec-print-log-loss

    Attributes:
        epoch: 1-based number of the epoch that will be reported next.
        loss_to_be_subed: running loss total seen at the end of the last epoch.
        log: every reported line, concatenated into a single string.
    """

    def __init__(self):
        self.epoch, self.loss_to_be_subed, self.log = 1, 0, ""

    def on_epoch_end(self, model):
        """Print and record the loss accumulated during the epoch that just ended."""
        running_total = model.get_latest_training_loss()
        epoch_loss = running_total - self.loss_to_be_subed

        print('Loss after epoch {}: {}'.format(self.epoch, epoch_loss))
        self.log = self.log + 'Loss after epoch {}: {} \n'.format(self.epoch, epoch_loss)

        # remember the total so that the next epoch can be reported on its own
        self.loss_to_be_subed = running_total
        self.epoch = self.epoch + 1

In [9]:
# This class is an iterable: every pass over it (e.g. one training epoch) re-opens the file stored under
# "path_to_data" and yields its lines (i.e. data samples) one at a time. Most of the logic is a trick to
# perform pseudo-randomisation for large input files.

class DataGenerator(object):
    """Iterable over the tokenised lines of a text file, in mildly randomised order.

    Every call to ``__iter__`` re-opens the file, so the object can be iterated
    repeatedly (e.g. once per training epoch).
    """

    def __init__(self, path_to_data,
                 share_data=1.,
                 chunk_size=10000,
                 random_buffer_size=100000,
                 data_seed=data_seed):
        
        """Iterator that loads lines from a (possibly large) file in a mildly randomised fashion. 
        
        We want to feed the data randomly to avoid training to depend on the order of the quotes in the 
        file. However, true randomisation is very costly when text files become larger than a few GB.
        
        Here, a buffer stores a set of lines from the text file. The buffer is shuffled, and the first chunk 
        of lines is returned (that is, one such line is yielded at each iteration step). The buffer 
        is filled up again with fresh lines and shuffled. This continues until no lines are left to fill the 
        buffer with, at which point the remaining lines are returned.
        
        Args:
            path_to_data (str): Full path to a data file with one preprocessed sentence/document per line.
            share_data (float): Each line is kept with this probability, which effectively results in a
                dataset with approx n_data*share_data samples. With the default of 1.0 every line is kept.
            chunk_size (int): Return so many lines from the random buffer at once before filling it up again. Larger
                chunk sizes speed up training, but decrease randomness.
            random_buffer_size (int): Keep so many lines from the data file in a buffer which is shuffled before
                returning the samples in a chunk. Higher values take more RAM but lead to more randomness
                when sampling the data. A value equal to the number of all samples would lead to perfectly
                random samples.
            data_seed: Seed for Python's global ``random`` generator, which drives both the shuffling and
                the subsampling controlled by ``share_data``.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1 or larger than ``random_buffer_size``.
        """
        if chunk_size < 1:
            # a chunk size of 0 would never consume the buffer and loop forever
            raise ValueError("Chunk size must be at least 1.")
        if chunk_size > random_buffer_size:
            raise ValueError("Chunk size cannot be larger than the buffer size.")

        self.path_to_data = path_to_data
        self.share_of_original_data = share_data
        self.chunk_size = chunk_size
        self.random_buffer_size = random_buffer_size
        seed(data_seed)

    @staticmethod
    def _read_sentences(handle, n_sentences):
        """Read up to ``n_sentences`` non-blank lines from ``handle``.

        Returns:
            tuple: ``(sentences, reached_end)`` where ``sentences`` is a list of token lists
            (each line split on single spaces) and ``reached_end`` is True once the file is exhausted.
        """
        sentences = []
        while len(sentences) < n_sentences:
            raw_line = handle.readline()
            # readline() returns "" only at end of file; this has to be tested on the
            # raw line, because "".split(" ") is the truthy list [""]
            if not raw_line:
                return sentences, True
            stripped = raw_line.strip()
            # blank lines would otherwise become [""] sentences with a bogus empty token
            if stripped:
                sentences.append(stripped.split(" "))
        return sentences, False

    def _keep_line(self):
        """Decide whether the next line belongs to the (possibly subsampled) dataset."""
        share = self.share_of_original_data
        # no random draw in the non-bootstrap case, for speed and to leave
        # the shuffling sequence untouched
        if share >= 1.0:
            return True
        # drawn from the generator seeded with data_seed, so subsampling is reproducible
        return random() <= share

    def __iter__(self):
        """Yield the sentences (lists of tokens) of the file in pseudo-random order."""
        with open(self.path_to_data, "r") as f:

            # fill buffer for the first time
            buffer, reached_end = self._read_sentences(f, self.random_buffer_size)

            while not reached_end:

                # randomise the buffer
                shuffle(buffer)

                # remove the leading chunk in one slice operation
                # (popping single elements from the front of a list is O(len(buffer)) each)
                chunk = buffer[:self.chunk_size]
                del buffer[:self.chunk_size]
                for sentence in chunk:
                    if self._keep_line():
                        yield sentence

                # fill up the buffer with a fresh chunk
                fresh, reached_end = self._read_sentences(f, self.chunk_size)
                buffer.extend(fresh)

            # end of file has been reached: yield all elements left in the
            # buffer in random order, applying the same subsampling as above
            shuffle(buffer)
            for sentence in buffer:
                if self._keep_line():
                    yield sentence

## 2. Training the embedding

Training can now be done in a few lines. The hyperparameters we use were found by empirically optimising various embeddings.

In [10]:
# Stream the training sentences from disk; the generator is iterated once to
# build the vocabulary and then once per training epoch.
training_generator = DataGenerator("clean_data.txt")

model = Word2Vec(
    sentences = training_generator, 
    vector_size = 250, # number of dimensions that the word vectors will have
    window = 10,  # maximum distance between the current and predicted word
    sg = 1,  # use skip-gram (semantic learning) as training algorithm
    workers = 8,  # number of threads for training the model
    min_count = 1,  # ignores all words with total frequency lower than this  # TODO: change back to 5
    sorted_vocab = 1, # sort the words in the resulting embedding
    seed = model_seed, # use a random seed for reproducibility (gensim documents that fully
                       # deterministic runs additionally require workers = 1)
    epochs = 30,  # number of times training goes through the data
    compute_loss = True, # track the training loss so that the callback can print it in each epoch
    callbacks = [PrintLoss()])
    
# precompute the L2 norms of the word vectors; the stored vectors themselves are left unchanged
# (fill_norms() is the gensim 4 replacement for the deprecated init_sims())
model.wv.fill_norms()

# extract and save embedding
emb = model.wv
emb.save("word_embedding.emb")

2022-05-06 15:52:04,053 : INFO : collecting all words and their counts
2022-05-06 15:52:04,368 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-05-06 15:52:04,649 : INFO : PROGRESS: at sentence #10000, processed 11024 words, keeping 684 word types
2022-05-06 15:52:04,656 : INFO : PROGRESS: at sentence #20000, processed 21965 words, keeping 1171 word types
2022-05-06 15:52:04,664 : INFO : PROGRESS: at sentence #30000, processed 32880 words, keeping 1552 word types
2022-05-06 15:52:04,672 : INFO : PROGRESS: at sentence #40000, processed 43804 words, keeping 1957 word types
2022-05-06 15:52:04,679 : INFO : PROGRESS: at sentence #50000, processed 54805 words, keeping 2363 word types
2022-05-06 15:52:04,688 : INFO : PROGRESS: at sentence #60000, processed 65802 words, keeping 2789 word types
2022-05-06 15:52:04,696 : INFO : PROGRESS: at sentence #70000, processed 76744 words, keeping 3150 word types
2022-05-06 15:52:04,706 : INFO : PROGRESS: at sentence #80000

Loss after epoch 1: 104621.3984375


2022-05-06 15:52:06,288 : INFO : EPOCH 1: training on 109548 raw words (13378 effective words) took 0.7s, 18099 effective words/s


Loss after epoch 2: 93834.2109375


2022-05-06 15:52:06,970 : INFO : EPOCH 2: training on 109548 raw words (13341 effective words) took 0.7s, 19674 effective words/s


Loss after epoch 3: 112565.390625


2022-05-06 15:52:07,643 : INFO : EPOCH 3: training on 109548 raw words (13420 effective words) took 0.7s, 20156 effective words/s


Loss after epoch 4: 57122.375


2022-05-06 15:52:08,337 : INFO : EPOCH 4: training on 109548 raw words (13423 effective words) took 0.7s, 19409 effective words/s


Loss after epoch 5: 67451.78125


2022-05-06 15:52:09,031 : INFO : EPOCH 5: training on 109548 raw words (13333 effective words) took 0.7s, 19306 effective words/s


Loss after epoch 6: 50952.71875


2022-05-06 15:52:09,717 : INFO : EPOCH 6: training on 109548 raw words (13408 effective words) took 0.7s, 19631 effective words/s


Loss after epoch 7: 61800.8125


2022-05-06 15:52:10,452 : INFO : EPOCH 7: training on 109548 raw words (13389 effective words) took 0.7s, 18297 effective words/s


Loss after epoch 8: 50515.25


2022-05-06 15:52:11,143 : INFO : EPOCH 8: training on 109548 raw words (13393 effective words) took 0.7s, 19500 effective words/s


Loss after epoch 9: 53052.25


2022-05-06 15:52:11,868 : INFO : EPOCH 9: training on 109548 raw words (13352 effective words) took 0.7s, 18504 effective words/s


Loss after epoch 10: 54611.0625


2022-05-06 15:52:12,632 : INFO : EPOCH 10: training on 109548 raw words (13356 effective words) took 0.8s, 17562 effective words/s


Loss after epoch 11: 96952.25


2022-05-06 15:52:13,349 : INFO : EPOCH 11: training on 109548 raw words (13289 effective words) took 0.7s, 18622 effective words/s


Loss after epoch 12: 74864.6875


2022-05-06 15:52:14,204 : INFO : EPOCH 12: training on 109548 raw words (13395 effective words) took 0.9s, 15732 effective words/s


Loss after epoch 13: 71958.0


2022-05-06 15:52:14,907 : INFO : EPOCH 13: training on 109548 raw words (13325 effective words) took 0.7s, 19047 effective words/s


Loss after epoch 14: 96457.875


2022-05-06 15:52:15,588 : INFO : EPOCH 14: training on 109548 raw words (13437 effective words) took 0.7s, 19832 effective words/s


Loss after epoch 15: 75718.0625


2022-05-06 15:52:16,285 : INFO : EPOCH 15: training on 109548 raw words (13412 effective words) took 0.7s, 19363 effective words/s


Loss after epoch 16: 43618.75


2022-05-06 15:52:16,958 : INFO : EPOCH 16: training on 109548 raw words (13345 effective words) took 0.7s, 19928 effective words/s


Loss after epoch 17: 82219.5


2022-05-06 15:52:17,614 : INFO : EPOCH 17: training on 109548 raw words (13436 effective words) took 0.7s, 20632 effective words/s


Loss after epoch 18: 61734.0


2022-05-06 15:52:18,260 : INFO : EPOCH 18: training on 109548 raw words (13314 effective words) took 0.6s, 20703 effective words/s


Loss after epoch 19: 53094.0


2022-05-06 15:52:18,908 : INFO : EPOCH 19: training on 109548 raw words (13325 effective words) took 0.6s, 20662 effective words/s


Loss after epoch 20: 96475.125


2022-05-06 15:52:19,537 : INFO : EPOCH 20: training on 109548 raw words (13322 effective words) took 0.6s, 21295 effective words/s


Loss after epoch 21: 58901.625


2022-05-06 15:52:20,177 : INFO : EPOCH 21: training on 109548 raw words (13387 effective words) took 0.6s, 21025 effective words/s


Loss after epoch 22: 60677.375


2022-05-06 15:52:20,798 : INFO : EPOCH 22: training on 109548 raw words (13330 effective words) took 0.6s, 21576 effective words/s


Loss after epoch 23: 91356.875


2022-05-06 15:52:21,450 : INFO : EPOCH 23: training on 109548 raw words (13402 effective words) took 0.6s, 20688 effective words/s


Loss after epoch 24: 64995.25


2022-05-06 15:52:22,094 : INFO : EPOCH 24: training on 109548 raw words (13444 effective words) took 0.6s, 20963 effective words/s


Loss after epoch 25: 69830.125


2022-05-06 15:52:22,784 : INFO : EPOCH 25: training on 109548 raw words (13326 effective words) took 0.7s, 19410 effective words/s


Loss after epoch 26: 58318.125


2022-05-06 15:52:23,421 : INFO : EPOCH 26: training on 109548 raw words (13345 effective words) took 0.6s, 21057 effective words/s


Loss after epoch 27: 45836.75


2022-05-06 15:52:24,132 : INFO : EPOCH 27: training on 109548 raw words (13398 effective words) took 0.7s, 18943 effective words/s


Loss after epoch 28: 34906.0


2022-05-06 15:52:24,782 : INFO : EPOCH 28: training on 109548 raw words (13366 effective words) took 0.6s, 20633 effective words/s


Loss after epoch 29: 49213.875


2022-05-06 15:52:25,426 : INFO : EPOCH 29: training on 109548 raw words (13367 effective words) took 0.6s, 20855 effective words/s
2022-05-06 15:52:25,427 : INFO : Word2Vec lifecycle event {'msg': 'training on 3286440 raw words (401100 effective words) took 20.6s, 19483 effective words/s', 'datetime': '2022-05-06T15:52:25.427292', 'gensim': '4.2.0', 'python': '3.8.10 (default, Mar 15 2022, 12:22:08) \n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-109-generic-x86_64-with-glibc2.29', 'event': 'train'}
2022-05-06 15:52:25,428 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=4143, vector_size=250, alpha=0.025>', 'datetime': '2022-05-06T15:52:25.428039', 'gensim': '4.2.0', 'python': '3.8.10 (default, Mar 15 2022, 12:22:08) \n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-109-generic-x86_64-with-glibc2.29', 'event': 'created'}
  model.wv.init_sims()
2022-05-06 15:52:25,432 : INFO : KeyedVectors lifecycle event {'fname_or_handle': 'word_embedding.emb', 'separately': 'None', 'sep_limit': 10485760

Loss after epoch 30: 75453.875
