# Training word2vec

In [None]:
# Location of the pre-processed corpus, split into file name and directory.
# `path` keeps its trailing slash because the two parts are joined by plain
# string concatenation when the corpus is opened.
filename = 'roman_processed.txt'
path = 'data/'

## Loading the corpus

In [None]:
import time
import logging
from gensim.models.word2vec import LineSentence

# Configure the root logger so that messages at INFO level and above are
# printed with a short HH:MM:SS timestamp.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%H:%M:%S',
    format="%(levelname)s - %(asctime)s: %(message)s",
)

# Corpus reader over the file at `path + filename`; it is handed to the model
# both for vocabulary building and for training.
lines = LineSentence(path + filename)

## Training a word2vec model

### Usage
```python
Word2Vec(sentences=None, corpus_file=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None,
         sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1,
         hashfxn=<built-in function hash>, iter=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000,
         compute_loss=False, callbacks=(), max_final_vocab=None)
```

### Important parameters
1. **sentences** (iterable of iterables, optional) – The sentences iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network.
2. **corpus_file** (str, optional) – Path to a corpus file in LineSentence format. You may use this argument instead of sentences to get a performance boost. Only one of the sentences or corpus_file arguments needs to be passed (or neither, in which case the model is left uninitialized).
3. **size** (int, optional) – Dimensionality of the word vectors.
4. **window** (int, optional) – Maximum distance between the current and predicted word within a sentence.
5. **min_count** (int, optional) – Ignores all words with total frequency lower than this.
6. **workers** (int, optional) – Use these many worker threads to train the model (=faster training with multicore machines).
7. **sg** ({0, 1}, optional) – Training algorithm: 1 for skip-gram; otherwise CBOW.
8. **hs** ({0, 1}, optional) – If 1, hierarchical softmax will be used for model training. If 0, and negative is non-zero, negative sampling will be used.
9. **negative** (int, optional) – If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used.
10. **ns_exponent** (float, optional) – The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion to the frequencies, 0.0 samples all words equally, while a negative value samples low-frequency words more than high-frequency words.
11. **cbow_mean** ({0, 1}, optional) – If 0, use the sum of the context word vectors. If 1, use the mean. Only applies when CBOW is used.
12. **alpha** (float, optional) – The initial learning rate.
13. **min_alpha** (float, optional) – Learning rate will linearly drop to min_alpha as training progresses.
14. **seed** (int, optional) – Seed for the random number generator.
15. **max_vocab_size** (int, optional) – Limits the RAM during vocabulary building; if there are more unique words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM. Set to None for no limit.
16. **max_final_vocab** (int, optional) – Limits the vocab to a target vocab size by automatically picking a matching min_count. If the specified min_count is more than the calculated min_count, the specified min_count will be used. Set to None if not required.
17. **sample** (float, optional) – The threshold for configuring which higher-frequency words are randomly downsampled, useful range is (0, 1e-5).
18. **iter** (int, optional) – Number of iterations (epochs) over the corpus.

In [None]:
import multiprocessing
from gensim.models import Word2Vec

# Hyper-parameters for this run. They stay as module-level names because
# other cells read them (for example to derive the saved model's name).
sg = 1                                   # Training algorithm: 1 = skip-gram, 0 = CBOW
size = 500                               # Dimensionality of the word vectors
window = 5                               # Maximum distance between current and predicted word
min_count = 1                            # Words rarer than this are dropped (1 keeps every word)
sample = 0.001                           # Down-sampling threshold for frequent words (library default)
workers = multiprocessing.cpu_count()    # One worker thread per available CPU

# No corpus is passed to the constructor, so vocabulary building and training
# are carried out explicitly in the following cells.
model = Word2Vec(
    size=size,
    window=window,
    min_count=min_count,
    sg=sg,
    sample=sample,
    workers=workers,
)

In [None]:
def get_model_name(corpus_filename=None, skip_gram=None, vector_size=None):
    """Build the name under which the trained model is saved.

    The result has the form ``word2vec_<script>_<algorithm>_<dimension>``,
    for example ``word2vec_roman_sg_500``.

    Every argument is optional. An argument left as ``None`` falls back to the
    corresponding notebook-level setting, so calling ``get_model_name()`` with
    no arguments describes the currently configured run.

    Args:
        corpus_filename: Name of the corpus file. If it contains the substring
            ``'roman'`` the script tag is ``roman``, otherwise ``urdu``.
            Defaults to the global ``filename``.
        skip_gram: Training-algorithm flag. ``0`` yields the ``cbow`` tag, any
            other value yields ``sg``. Defaults to the global ``sg``.
        vector_size: Vector dimensionality appended to the name. Defaults to
            the global ``size``.

    Returns:
        The model name as a string.
    """
    # The globals are only looked up when an argument is omitted, so the
    # function can also be used without the notebook settings being defined.
    if corpus_filename is None:
        corpus_filename = filename
    if skip_gram is None:
        skip_gram = sg
    if vector_size is None:
        vector_size = size

    script = 'roman' if 'roman' in corpus_filename else 'urdu'
    algorithm = 'cbow' if skip_gram == 0 else 'sg'

    return '_'.join(['word2vec', script, algorithm, str(vector_size)])

In [None]:
# Build the model's vocabulary from the corpus and report how long it took.
t = time.time()
model.build_vocab(lines, progress_per=1000000)

elapsed_seconds = time.time() - t
print('Time to build vocab: {} mins'.format(round(elapsed_seconds / 60, 2)))

In [None]:
# Derive the model name first so the output states which configuration is
# about to be trained; the same name is reused when the model is saved.
model_name = get_model_name()
print("About to train {}".format(model_name))

# Train over the corpus and report the wall-clock duration in minutes.
t = time.time()
model.train(
    lines,
    total_examples=model.corpus_count,   # Example count stored on the model
    epochs=5,                            # Same value as the constructor default `iter=5`
    report_delay=10,
)

elapsed_seconds = time.time() - t
print('Time to train the model: {} mins'.format(round(elapsed_seconds / 60, 2)))

## Saving the model

In [None]:
# Persist the trained model under the name computed by get_model_name().
# That name has no directory component, so the target is relative to the
# current working directory.
model.save(model_name)