## Learning word embeddings from a group of topic-related sentences using Gensim (Word2Vec)

**NOTE**: The ipython notebook from which this html page was created can be downloaded [here](word_embedding_task.ipynb).

In [1]:
# Load the utterances stored in a text file; iterating over the
# LineSentence object yields each utterance as a list of string tokens
from gensim.models.word2vec import LineSentence

corpus_path = 'small_dataset.txt'
sentences = LineSentence(corpus_path)

# Echo every tokenized utterance to confirm the file was read as expected
for sentence in sentences:
    print(sentence)

['What', 'are', 'the', 'main', 'symptoms', 'of', 'diabetes', '?']
['Tell', 'me', 'how', 'I', 'may', 'treat', 'bruxism', '?']
['What', 'is', 'the', 'cause', 'of', 'a', 'stroke', '?']


In [2]:
# Train a Word2Vec model on the sentences read from the text file
from gensim.models import Word2Vec

# min_count=1 keeps every word occurring in the sentences (even words seen
# only once), so the whole vocabulary is used to learn the topic-specific model
model = Word2Vec(
    sentences=sentences,
    min_count=1,
)

Once the model has been trained, it can be used for word similarity search. In this case, however, we are only interested in the individual word vector representations (word embeddings) of the words that were part of the dataset.

In [3]:
# Report the dimensionality of the learned word vectors; no size was passed
# to Word2Vec above, so this is the library default rather than anything
# specific to the English language
vector_dim = model.vector_size
print("English word vector dimension: {} [default]".format(vector_dim))

# Look up and show the embedding learned for one word of the vocabulary
what_vector = model.wv['What']
print("What WV:\n", what_vector)

English word vector dimension: 100 [default]
What WV:
 [ 3.8179313e-03  2.4054307e-04  1.4165390e-03  4.6917102e-03
  9.0145570e-04 -3.3321411e-03  4.4127158e-03  1.0163705e-03
 -3.0541401e-03  3.3260447e-03 -3.3608312e-03 -2.4186056e-03
  3.4223178e-03 -5.4589680e-05 -1.2973493e-03  2.5586255e-03
  3.6581676e-03  2.7903772e-03 -4.8657446e-03 -1.4178132e-03
  4.0518804e-03  1.2574471e-03 -1.3775246e-04  5.1502854e-04
 -4.2253044e-03  2.3082788e-03 -2.3886915e-03 -3.8120357e-04
  2.6817231e-03 -1.5355331e-03  4.4533387e-03  4.9725585e-03
 -4.8908647e-03 -7.5012585e-04  2.1398852e-03 -4.4348165e-03
  4.9365517e-03  1.3714336e-03 -2.1389878e-04  2.3822451e-04
  3.3364524e-03  7.2594936e-04 -2.0612867e-03  5.9130415e-04
 -4.8737540e-03 -8.4016996e-04 -2.1013622e-03  4.9492731e-03
 -4.0631974e-03 -1.4872494e-03 -4.1368478e-03  3.5485523e-03
 -4.7037241e-04 -2.2351947e-04 -4.0245340e-03  3.3300796e-03
  2.3225050e-03 -2.8624914e-03  3.7211452e-03  9.9605444e-05
  3.7177138e-03 -3.0952804e-03

In [4]:
# Print the vocabulary of words in the list of sentences.
# gensim < 4.0 exposes the vocabulary as the dict `model.wv.vocab`; gensim 4.0
# removed that attribute (accessing it raises AttributeError) in favour of
# `model.wv.key_to_index`, so fall back to it to keep the cell runnable on
# both major versions.
# NOTE(review): on gensim >= 4.0 the word order may differ from the output
# recorded below (keys are presumably ordered by frequency) -- confirm.
try:
    words = list(model.wv.vocab)
except AttributeError:
    words = list(model.wv.key_to_index)
print(words)

['What', 'are', 'the', 'main', 'symptoms', 'of', 'diabetes', '?', 'Tell', 'me', 'how', 'I', 'may', 'treat', 'bruxism', 'is', 'cause', 'a', 'stroke']


In [5]:
# Gather the embedding of every vocabulary word, in vocabulary order
vocab_words = [model.wv[word] for word in words]

# Display each word followed by its learned vector
for word, word_vector in zip(words, vocab_words):
    print("{}:\n{}".format(word, word_vector))

What:
[ 3.8179313e-03  2.4054307e-04  1.4165390e-03  4.6917102e-03
  9.0145570e-04 -3.3321411e-03  4.4127158e-03  1.0163705e-03
 -3.0541401e-03  3.3260447e-03 -3.3608312e-03 -2.4186056e-03
  3.4223178e-03 -5.4589680e-05 -1.2973493e-03  2.5586255e-03
  3.6581676e-03  2.7903772e-03 -4.8657446e-03 -1.4178132e-03
  4.0518804e-03  1.2574471e-03 -1.3775246e-04  5.1502854e-04
 -4.2253044e-03  2.3082788e-03 -2.3886915e-03 -3.8120357e-04
  2.6817231e-03 -1.5355331e-03  4.4533387e-03  4.9725585e-03
 -4.8908647e-03 -7.5012585e-04  2.1398852e-03 -4.4348165e-03
  4.9365517e-03  1.3714336e-03 -2.1389878e-04  2.3822451e-04
  3.3364524e-03  7.2594936e-04 -2.0612867e-03  5.9130415e-04
 -4.8737540e-03 -8.4016996e-04 -2.1013622e-03  4.9492731e-03
 -4.0631974e-03 -1.4872494e-03 -4.1368478e-03  3.5485523e-03
 -4.7037241e-04 -2.2351947e-04 -4.0245340e-03  3.3300796e-03
  2.3225050e-03 -2.8624914e-03  3.7211452e-03  9.9605444e-05
  3.7177138e-03 -3.0952804e-03 -2.6811736e-03 -4.7585592e-04
 -2.7382355e-03 -4

In [6]:
# Turn the Python list of word vectors into a single numpy array
# (one row per vocabulary word) and report its shape; the array is
# written to CSV in the next cell
import numpy as np

vocab_words_arr = np.array(vocab_words)
array_shape = vocab_words_arr.shape
print("Shape: {}".format(array_shape))

Shape: (19, 100)


In [7]:
# Write the word-vector array to a CSV file: one row per word, values
# separated by commas and formatted with nine decimal places
# REF: https://thispointer.com/how-to-save-numpy-array-to-a-csv-file-using-numpy-savetxt-in-python/
np.savetxt(
    'word_vectors.csv',
    vocab_words_arr,
    fmt='%10.9f',
    delimiter=',',
)

In [8]:
# Verify the word vectors have been saved as expected (CSV format).
# The context manager guarantees the file handle is closed even if reading
# or printing raises, which the manual open()/close() pair did not.
with open('word_vectors.csv') as csv_file:
    for line in csv_file:
        # Each line keeps its trailing newline, so rows are printed
        # separated by a blank line.
        print(line)

0.003817931,0.000240543,0.001416539,0.004691710,0.000901456,-0.003332141,0.004412716,0.001016370,-0.003054140,0.003326045,-0.003360831,-0.002418606,0.003422318,-0.000054590,-0.001297349,0.002558626,0.003658168,0.002790377,-0.004865745,-0.001417813,0.004051880,0.001257447,-0.000137752,0.000515029,-0.004225304,0.002308279,-0.002388692,-0.000381204,0.002681723,-0.001535533,0.004453339,0.004972558,-0.004890865,-0.000750126,0.002139885,-0.004434817,0.004936552,0.001371434,-0.000213899,0.000238225,0.003336452,0.000725949,-0.002061287,0.000591304,-0.004873754,-0.000840170,-0.002101362,0.004949273,-0.004063197,-0.001487249,-0.004136848,0.003548552,-0.000470372,-0.000223519,-0.004024534,0.003330080,0.002322505,-0.002862491,0.003721145,0.000099605,0.003717714,-0.003095280,-0.002681174,-0.000475856,-0.002738236,-0.000456624,0.004250429,-0.000523769,0.001175903,-0.003073386,0.003857980,0.003220433,0.001414227,-0.002455805,-0.003524332,-0.003382101,-0.004817462,-0.004495983,0.000933415,0.002115293,