In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import logging
import gensim
from gensim.models import word2vec
from gensim.models.callbacks import CallbackAny2Vec

In [2]:
# Expose only GPU index 2 to CUDA-based libraries loaded later in this kernel.
# NOTE(review): none of the visible cells use a GPU - confirm this is still needed.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [3]:
# Root folder of the agent-benchmark corpus. Every other path is derived from
# it so that pointing the notebook at another dataset is a one-line change.
DATA_PATH = '../../data/agent-benchmark'

# Plain-text corpus (one utterance per line) that the notebook writes and then
# feeds to word2vec.
VOCABULARY_FILE = os.path.join(DATA_PATH, 'vocabulary.csv')

# Semicolon-separated "utterance;label" splits read by read_data().
TRAIN_DATASET = os.path.join(DATA_PATH, 'train.csv')
VAL_DATASET = os.path.join(DATA_PATH, 'val.csv')
TEST_DATASET = os.path.join(DATA_PATH, 'test.csv')

In [4]:
# Dimensionality of the learned word vectors; passed as `size` to Word2Vec.
EMBEDDING_DIM = 300

In [5]:
def read_data(filename):
    """Load a headerless, semicolon-separated file as a two-column DataFrame.

    Parameters
    ----------
    filename : str
        Path (or any source accepted by ``pandas.read_csv``) of the file.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``utterance`` and ``label``, both read as
        strings. Missing values are left in place for the caller to handle.
    """
    column_names = ['utterance', 'label']
    return pd.read_csv(
        filename,
        sep=";",
        header=None,
        names=column_names,
        dtype={column: str for column in column_names},
    )

In [6]:
# Load the three splits and discard any row that has a missing value.
train_dataset, val_dataset, test_dataset = (
    read_data(split_path).dropna()
    for split_path in (TRAIN_DATASET, VAL_DATASET, TEST_DATASET)
)

In [7]:
# Only the raw utterance text is needed to train embeddings; labels are ignored.
train_posts, val_posts = (
    train_dataset['utterance'],
    val_dataset['utterance'],
)

In [8]:
# Build the embedding corpus from the train and validation utterances and dump
# it to VOCABULARY_FILE, one utterance per line.
vocab_data = np.concatenate((train_posts, val_posts), axis=0)
with open(VOCABULARY_FILE, 'wb') as f:
    for utterance in vocab_data:
        # The file is opened in binary mode, so text must be encoded explicitly.
        # `b'%s\n' % text` raises TypeError on Python 3 and falls back to an
        # implicit ASCII encode for `unicode` on Python 2. Values that are
        # already bytes (Python 2 `str`) are written unchanged.
        if not isinstance(utterance, bytes):
            utterance = utterance.encode('utf-8')
        f.write(utterance + b'\n')

In [9]:
# Surface gensim's INFO-level progress messages in the cell output.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

# Iterate over the vocabulary file written earlier (one utterance per line).
sentences = word2vec.LineSentence(VOCABULARY_FILE)

# Training hyper-parameters, gathered in one place so they are easy to tune.
w2v_params = dict(
    size=EMBEDDING_DIM,
    window=10,
    min_count=3,
    workers=10,
    iter=10,
)
model = word2vec.Word2Vec(sentences, **w2v_params)

2021-04-09 14:57:15,362 : INFO : collecting all words and their counts
2021-04-09 14:57:15,369 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-04-09 14:57:15,458 : INFO : PROGRESS: at sentence #10000, processed 63479 words, keeping 5106 word types
2021-04-09 14:57:15,555 : INFO : PROGRESS: at sentence #20000, processed 127400 words, keeping 7272 word types
2021-04-09 14:57:15,561 : INFO : collected 7370 word types from a corpus of 130471 raw words and 20462 sentences
2021-04-09 14:57:15,562 : INFO : Loading a fresh vocabulary
2021-04-09 14:57:15,579 : INFO : effective_min_count=3 retains 2705 unique words (36% of original 7370, drops 4665)
2021-04-09 14:57:15,580 : INFO : effective_min_count=3 leaves 124785 word corpus (95% of original 130471, drops 5686)
2021-04-09 14:57:15,595 : INFO : deleting the raw counts dictionary of 7370 items
2021-04-09 14:57:15,598 : INFO : sample=0.001 downsamples 73 most-common words
2021-04-09 14:57:15,599 : INFO : downsamp

2021-04-09 14:57:19,995 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-04-09 14:57:20,004 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-04-09 14:57:20,012 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-04-09 14:57:20,013 : INFO : EPOCH - 7 : training on 130471 raw words (85450 effective words) took 0.5s, 160212 effective words/s
2021-04-09 14:57:20,580 : INFO : worker thread finished; awaiting finish of 9 more threads
2021-04-09 14:57:20,592 : INFO : worker thread finished; awaiting finish of 8 more threads
2021-04-09 14:57:20,593 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-04-09 14:57:20,595 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-04-09 14:57:20,596 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-04-09 14:57:20,598 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-04-09 14:57:20,606 : INFO : worker thread 

In [10]:
# Sanity check: list the nearest neighbours of a word from the corpus.
probe_words = ['volume']
model.wv.most_similar_cosmul(positive=probe_words)

2021-04-09 14:57:53,560 : INFO : precomputing L2-norms of word weight vectors


[(u'increase', 0.9808063507080078),
 (u'speakers', 0.9806278944015503),
 (u'brightness', 0.9732304215431213),
 (u'garage', 0.9722849130630493),
 (u'lower', 0.9704989194869995),
 (u'down', 0.9691586494445801),
 (u'speaker', 0.9677348732948303),
 (u'raise', 0.9657253623008728),
 (u'switch', 0.9609561562538147),
 (u'socket', 0.9592718482017517)]

In [14]:
# Export the trained vectors in word2vec format to DATA_PATH/vocab.
# The target folder is created first so a fresh checkout does not fail here.
# os.makedirs(..., exist_ok=True) is avoided to stay compatible with Python 2.
model_dir = os.path.join(DATA_PATH, 'vocab')
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)

# NOTE(review): no `binary=True` is passed, so per gensim's documented default
# this should be the text format despite the `.bin` suffix - confirm that
# downstream consumers expect that.
model_file = os.path.join(model_dir, 'word2vec-model-agent-benchmark.bin')
model.wv.save_word2vec_format(model_file)

2021-04-09 14:58:51,693 : INFO : storing 2705x300 projection weights into ../../data/agent-benchmark/vocab/word2vec-model-agent-benchmark.bin


In [15]:
!tail -n 10 ../../data/virtual-operator/vocab/word2vec-model-virtural-operator.bin

patas -0.0014272118 0.0019288043 -0.040097624 -0.061160143 -0.0045883656 0.043964684 0.017257297 0.039947383 0.020025706 0.004470046 -0.031035079 -0.039248925 0.018947026 0.011011288 0.044187214 0.03965991 -0.023142641 0.03243842 0.04040606 0.016655501 -0.06299851 0.031413954 0.029765792 -0.0064665484 0.010933491 0.06293159 -0.016881099 -0.0012014469 0.019711385 0.008653525 -0.0027381051 0.01804984 -0.03875595 -0.0119994925 0.03325943 0.05967363 -0.03882809 -0.028015615 -0.0035998593 -0.008467932 0.032131754 0.0034180963 -0.012455963 -0.04348119 -0.024119103 0.0059879497 0.012339806 -0.011204941 0.020261697 0.03127929 0.06176789 -0.030937301 -0.007627971 0.03737832 0.005531746 -0.017434793 0.01099796 -0.032762587 0.00881455 0.028472126 0.012862526 0.005032312 -0.06392629 -0.07016575 -0.035004105 3.4107376e-05 0.015196012 -0.054792736 0.0073507098 -0.019180952 0.032480672 -0.061447892 0.035760812 0.028222706 0.028436132 0.026326528 0.019838635 -0.028305968 0.0369279 0.013009287 0.008283