# Training a custom CBOW model using Keras

In [None]:
from os import listdir
from os.path import isfile, join

import numpy as np

from tensorflow import keras
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras.utils import get_file, to_categorical
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

import gensim
import gensim.downloader
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from gensim.utils import tokenize

In [None]:
# Fetch the text of "Alice's Adventures in Wonderland" (get_file caches the
# download locally and returns the path to the cached copy).
path = get_file('alice.txt', origin='http://www.gutenberg.org/files/11/11-0.txt')

# Read only the first 300 lines to keep the demo small. The file is opened in
# a context manager so the handle is always released, and decoded explicitly:
# 'utf-8-sig' reads UTF-8 on every platform and drops a leading byte-order
# mark, which would otherwise be glued onto the first token.
with open(path, encoding='utf-8-sig') as corpus_file:
    corpus = corpus_file.readlines()[:300]

# Keep only lines containing at least two spaces (i.e. roughly three words).
corpus = [sentence for sentence in corpus if sentence.count(' ') >= 2]

# Map every word to an integer id and convert each line to a list of ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
corpus = tokenizer.texts_to_sequences(corpus)

# Total number of tokens across all kept lines.
nb_samples = sum(len(s) for s in corpus)
# Vocabulary size; +1 because the tokenizer's word ids start at 1, leaving
# id 0 free for the padding value used when building context windows.
V = len(tokenizer.word_index) + 1
# Embedding dimensionality.
dim = 100
# Number of context words taken on each side of the target word.
window_size = 2

In [None]:
def generate_data(corpus, window_size, V):
    """Yield one CBOW training example per token in ``corpus``.

    Args:
        corpus: iterable of sentences, each a sequence of integer word ids.
        window_size: number of neighbouring ids taken on each side of the
            target position.
        V: number of classes used to one-hot encode the target id.

    Yields:
        ``(x, y)`` where ``x`` is the batch-of-one context window produced by
        ``sequence.pad_sequences`` (padded to ``2 * window_size`` entries) and
        ``y`` is the batch-of-one encoding of the target id produced by
        ``to_categorical``.
    """
    context_width = window_size * 2
    for sentence in corpus:
        n_tokens = len(sentence)
        for center, target in enumerate(sentence):
            # Clamp the window to the sentence boundaries; positions outside
            # the sentence contribute nothing and are filled in by padding.
            first = max(center - window_size, 0)
            stop = min(center + window_size + 1, n_tokens)
            neighbours = [sentence[pos] for pos in range(first, stop) if pos != center]

            x = sequence.pad_sequences([neighbours], maxlen=context_width)
            y = to_categorical([target], V)
            yield (x, y)

In [None]:
# CBOW network: embed the context word ids, average the embeddings over the
# context axis, then predict the target word with a softmax over V classes.
cbow = Sequential([
    Embedding(input_dim=V, output_dim=dim, input_length=window_size*2),
    Lambda(lambda embedded: K.mean(embedded, axis=1), output_shape=(dim,)),
    Dense(V, activation='softmax'),
])
cbow.summary()

In [None]:
# Targets are one-hot vectors, hence categorical cross-entropy.
cbow.compile(optimizer='adam', loss='categorical_crossentropy')

In [None]:
# Make three passes over the corpus, updating the model on one
# (context, target) example at a time, and print the summed loss per pass.
for ite in range(3):
    loss = 0.
    batches = generate_data(corpus, window_size, V)
    for x, y in batches:
        batch_loss = cbow.train_on_batch(x, y)
        loss = loss + batch_loss

    print(ite, loss)

In [None]:
# Export the learned embeddings in the plain-text word2vec format: a header
# line "<word count> <dimension>" followed by one "<word> <v1> <v2> ..." line
# per word.
#
# The first weight array of the model is the Embedding matrix (the Embedding
# layer is the first layer added to `cbow`); row i is the vector of word id i.
vectors = cbow.get_weights()[0]

# Header and rows are written inside a single `with` block so the file is
# always flushed and closed, even if writing a row fails. The encoding is
# explicit so the output does not depend on the platform default.
with open('vectors.txt', 'w', encoding='utf-8') as f:
    # V - 1 == len(tokenizer.word_index): row 0 (padding id) is not exported.
    f.write('{} {}\n'.format(V-1, dim))
    for word, i in tokenizer.word_index.items():
        str_vec = ' '.join(map(str, vectors[i]))
        f.write('{} {}\n'.format(word, str_vec))

In [None]:
# Load the exported text-format vectors back as gensim KeyedVectors.
w2v = gensim.models.KeyedVectors.load_word2vec_format(fname='./vectors.txt', binary=False)

In [None]:
# Nearest neighbours of "the" in the custom CBOW embedding space.
w2v.most_similar(positive=["the"], topn=10)

In [None]:
# Nearest neighbours of "alice" in the custom CBOW embedding space.
w2v.most_similar(positive=["alice"], topn=10)

# Gensim pre-trained Word2Vec models

See the [documentation](https://radimrehurek.com/gensim/models/word2vec.html) for the Word2Vec models that **gensim** offers.

In [None]:
# List the names of every model published in gensim-data.
available_models = gensim.downloader.info()['models']
print(list(available_models))

In [None]:
# Fetch the pre-trained "glove-wiki-gigaword-50" embeddings via gensim-data.
glove_vectors = gensim.downloader.load(name='glove-wiki-gigaword-50')

In [None]:
# Nearest neighbours of "love" in the pre-trained GloVe space.
glove_vectors.most_similar(positive=['love'])

# Gensim custom-trained Word2Vec model for Serbian songs

In [None]:
# Genres to load; each one is a sub-directory of ./data holding one text file
# per song.
CORPUS_MODES = ["folk", "pop", "rock"]
# One token list per song, across all genres.
texts = []

for MODEL_MODE in CORPUS_MODES:
    # Build the genre directory once instead of re-concatenating it per use.
    genre_dir = join('./data', MODEL_MODE)
    for file_name in listdir(genre_dir):
        song_path = join(genre_dir, file_name)
        # Skip sub-directories and other non-regular entries.
        if not isfile(song_path):
            continue
        # Decode explicitly so non-ASCII lyrics are read the same way on every
        # platform instead of depending on the locale's default encoding.
        # NOTE(review): assumes the song files are UTF-8 — confirm.
        with open(song_path, 'r', encoding='utf-8') as song_text_file:
            tokenized_text = list(tokenize(song_text_file.read().lower()))
            texts.append(tokenized_text)

In [None]:
# Sanity check: print every tokenized song that contains the word "doberman".
for song_tokens in texts:
    if "doberman" in song_tokens:
        print(song_tokens)

In [None]:
# Train a Word2Vec model on the tokenized songs and persist it to disk.
serbian_model = Word2Vec(
    sentences=texts,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    epochs=100,
)
serbian_model.save("word2vec.model")

In [None]:
# Nearest neighbours of "ljubav" in the song-trained embedding space.
serbian_model.wv.most_similar(positive=['ljubav'], topn=10)

In [None]:
# Raw embedding vector learned for "ljubav".
serbian_model.wv.get_vector('ljubav')