## Imports

In [None]:
import glob
import pickle
import re
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from itertools import product
from stop_words import get_stop_words
from nltk import tokenize

## Configuration

*input_dir:* The path to the directory that contains your text files. Please make sure the path ends with a '/' (slash), for example: path/to/texts/.

*language*: The language of your texts. This is used to get the right list of stop words.

*num_processes*: The number of processes to use. This depends on your hardware. The more cores you can use, the faster the training of the models.

*models_filename:* The filename for the resulting trained models. You may use the **.p** extension indicating a pickled file, but you are free to use whatever you like. Just make sure this is consistent in the evaluation step.

In [None]:
# Directory holding the input .txt files. It is concatenated directly with
# "*.txt" when the files are collected, so it has to end with "/".
input_dir = "../data/texts/"
# Language of the texts; lower-cased and passed to get_stop_words().
language = "french"
# Passed to Word2Vec as `workers`.
num_processes = 2
# File the dictionary of trained models is pickled to.
models_filename = "models.p"

### Grid search parameters
You should provide possible values for the hyperparameters of the word2vec model. Please refer to the [gensim documentation](https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec) for a list of all hyperparameters. The following values serve as an example and may be adjusted to your needs.

In [None]:
# Candidate values for each Word2Vec hyperparameter. The training cell reads
# every combination positionally, in exactly this order:
# (size, sg, hs, window, negative, iter).
vector_sizes = [100, 200, 300]
skip_grams = [0, 1]
hs = [0, 1]
windows = [5, 10]
negatives = [5, 10]
iters = [5, 10]

# Cartesian product of all candidate lists: one tuple per model to train.
hyperparameters = [
    *product(vector_sizes, skip_grams, hs, windows, negatives, iters)
]
num_hyperparameters = len(hyperparameters)
print("number of hyperparameter combinations: {}".format(num_hyperparameters))

## Grid Search

### Loading texts

In [None]:
# Collect every .txt file whose path starts with input_dir. glob returns
# matches in arbitrary, filesystem-dependent order, so the paths are sorted
# to build the combined corpus in the same order on every machine and run.
text_file_names = sorted(glob.glob("{}*.txt".format(input_dir)))
print("found {} texts".format(len(text_file_names)))

# Read each file completely as UTF-8 text.
texts = []
for text_file_name in text_file_names:
    with open(text_file_name, "r", encoding="utf-8") as input_file:
        texts.append(input_file.read())
print("loaded {} texts".format(len(texts)))

# A single string with all texts, separated by one space, for tokenization.
combined_text = " ".join(texts)

### Conduct grid search

In [None]:
# Maps each hyperparameter tuple to the Word2Vec model trained with it.
models = {}

stop_words = get_stop_words(language.lower())
# Membership tests against a list are linear in its length; a frozenset makes
# the per-token stop word check constant time. `stop_words` itself stays a
# list so anything else that uses it is unaffected.
stop_word_set = frozenset(stop_words)

# Split the corpus into sentences with the Punkt model of the configured
# language instead of the English default. NLTK raises LookupError when it has
# no model under that name (e.g. the language is unsupported or given as a
# code), in which case the default model is used as before.
try:
    sentences = tokenize.sent_tokenize(combined_text, language=language.lower())
except LookupError:
    print("no sentence tokenizer for '{}', using the default".format(language))
    sentences = tokenize.sent_tokenize(combined_text)

# Keep only lower-cased tokens of at least three word characters.
reg_exp_tok = tokenize.RegexpTokenizer(r"\w{3,}")
split_sentences = [reg_exp_tok.tokenize(s.lower()) for s in sentences]

# Remove stop words and drop sentences that end up empty.
split_sentences_wo_sw = []
for s in split_sentences:
    cleaned_tokens = [t for t in s if t not in stop_word_set]
    if cleaned_tokens:
        split_sentences_wo_sw.append(cleaned_tokens)

# Train one model per hyperparameter combination.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4
# renamed them to `vector_size` and `epochs` - confirm the installed version.
for hp in hyperparameters:
    size_, sg_, hs_, window_, negative_, iter_ = hp
    model = Word2Vec(
        sentences=split_sentences_wo_sw,
        workers=num_processes,
        size=size_,
        sg=sg_,
        hs=hs_,
        window=window_,
        negative=negative_,
        iter=iter_,
    )
    models[hp] = model

### Save models

In [None]:
# Serialize the complete {hyperparameters: model} dictionary into one file.
with open(models_filename, mode="wb") as models_file:
    pickle.dump(models, models_file)