In [1]:
from gensim.models import Word2Vec

from mwe_discov_eval.utils import ConllIterator, w2v_files
from mwe_discov_eval.databases.EmbeddingsDb import W2VReader
from mwe_discov_eval.databases import EmbeddingsDb, NgramDb
from mwe_discov_eval.databases.EmbeddingsDb import W2VReader

In [None]:
# Open the sample CoNLL-U corpus and configure how it will be iterated.
corpus = "mwe_discov_eval/samples/corpora/example.conllu"
# Presumably maps each field name to its column index in a token line --
# confirm against ConllIterator.
idx_dict = dict(id=0, surface=1, lemma=2, pos=3)
sentences = ConllIterator(corpus, idx_dict, codec='utf8')
# Sentence mode using the lemma and pos fields joined with '/'
# (presumably yielding "lemma/pos" tokens -- confirm against set_itermode).
sentences.set_itermode(
    'sent',
    keys=["lemma", "pos"],
    join_values='/',
    ignore_compound=True,
)

In [None]:
# Dump the corpus iterator to a plain-text file. The ALaCarte cells further
# down read this file through their -c option.
txt_file = "mwe_discov_eval/samples/corpora/lemma_pos.txt"
sentences.save_as_text(
    txt_file,
    keys=["lemma", "pos"],
    join_values='/',
    ignore_compound=True,
)

In [None]:
# Train word2vec embeddings for single words and write them to disk.
model_name = "mwe_discov_eval/samples/embeddings/w2v"
# NOTE(review): 'size' and 'iter' are the gensim 3.x keyword names; gensim >= 4
# renamed them to 'vector_size' and 'epochs' -- confirm the installed version.
parameters = dict(size=200, window=2, min_count=100, sg=1, iter=10)
model = Word2Vec(sentences, workers=5, **parameters)
# Persist the full model.
model.save(model_name + '.model')
# Persist only the word vectors, in word2vec format.
word_vectors = model.wv
word_vectors.save_word2vec_format(model_name + '.txt')

In [None]:
# Remove the heading from the word2vec file so that ALaCarte can read it as a
# GloVe-format file.
# NOTE(review): this edits the file written by the word2vec cell in place;
# presumably it should run only once per freshly written file -- verify
# against w2v_files.remove_heading.
w2v_files.remove_heading("mwe_discov_eval/samples/embeddings/w2v.txt")

In [None]:
# Building ALaCarte transform matrix
# Inputs: the heading-less w2v.txt embeddings (-s) and the lemma_pos.txt corpus
# dump (-c), with -w 2 (presumably the context window, matching the word2vec
# window -- confirm against alacarte.py). The trailing path is the output root;
# the embeddings cell below reads the result back as matrix_transform.bin.
%run ALaCarte/alacarte.py -v -s mwe_discov_eval/samples/embeddings/w2v.txt -c mwe_discov_eval/samples/corpora/lemma_pos.txt -w 2  mwe_discov_eval/samples/embeddings/alacarte/matrix

In [None]:
# Extracting targets from ngram_db for ALaCarte: every n-gram order from 1 up
# to ngram_db.n_max is written to a single targets file.
ngram_db = NgramDb.load('mwe_discov_eval/samples/databases/sample_db')
txt_file = "mwe_discov_eval/samples/embeddings/alacarte/targets.txt"
ngram_db.connect()
try:
    # The first export uses mode='w' and the following ones mode='a'
    # (presumably open()-style modes, so the file starts fresh and then grows).
    # sub=('\t', '/') presumably swaps the tab between words for '/'; a later
    # cell changes the separator back to '\t'.
    ngram_db.to_text(txt_file, 'lemma_pos_counts', 'lemma_pos', 1, sub=('\t', '/'), mode='w')
    for n in range(2, ngram_db.n_max + 1):
        ngram_db.to_text(txt_file, 'lemma_pos_counts', 'lemma_pos', n, sub=('\t', '/'), mode='a')
finally:
    # Always release the database connection, even if an export fails midway.
    ngram_db.disconnect()

In [None]:
# Building ALaCarte embeddings
# Reuses the transform matrix (-m), the source embeddings (-s) and the corpus
# dump (-c) from the previous cells, and adds the targets file (-t). The
# trailing path is the output root; the next cell reads the result as
# vec_alacarte.txt.
# NOTE(review): --create-new presumably also produces vectors for targets that
# are missing from the source vocabulary -- confirm against alacarte.py.
%run ALaCarte/alacarte.py -v -m mwe_discov_eval/samples/embeddings/alacarte/matrix_transform.bin -s mwe_discov_eval/samples/embeddings/w2v.txt -w 2 -c mwe_discov_eval/samples/corpora/lemma_pos.txt -t mwe_discov_eval/samples/embeddings/alacarte/targets.txt mwe_discov_eval/samples/embeddings/vec --create-new

In [2]:
# Convert the ALaCarte output back to word2vec format: the heading is added
# and the word separator is changed back from '/' to '\t'.
# NOTE(review): both calls edit the file in place, so re-running this cell
# would presumably add a second heading -- verify before running it again.
wv_file = "mwe_discov_eval/samples/embeddings/vec_alacarte.txt"
w2v_files.add_heading(wv_file)
w2v_files.sub_separator(wv_file, '/', '\t')

100%|█████████████████████████████████████████████████████████████████████| 2005630/2005630 [01:33<00:00, 21448.53it/s]


In [3]:
# Build the embeddings database from the word2vec-format ALaCarte vectors.
# fileroot is the path root handed to EmbeddingsDb.from_w2v for its output.
fileroot = "mwe_discov_eval/samples/databases/alacarte_emb"
wv_file = "mwe_discov_eval/samples/embeddings/vec_alacarte.txt"
emb_db = EmbeddingsDb.from_w2v(
    wv_file,
    fileroot,
)

Saving data into db...
Done!
Generating info file...
Done!
