
Make word2vec2tensor script compatible with python3 #2147

Merged: 8 commits, Aug 14, 2018
Changes from 5 commits
17 changes: 10 additions & 7 deletions gensim/scripts/word2vec2tensor.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #
+# Copyright (C) 2018 Vimig Socrates <vimig.socrates@gmail.com>
 # Copyright (C) 2016 Loreto Parisi <loretoparisi@gmail.com>
 # Copyright (C) 2016 Silvio Olivastri <silvio.olivastri@gmail.com>
 # Copyright (C) 2016 Radim Rehurek <radim@rare-technologies.com>
@@ -43,8 +44,11 @@
 import logging
 import argparse
 
+from smart_open import smart_open
 import gensim
 
+import numpy as np
 
 logger = logging.getLogger(__name__)


@@ -63,16 +67,15 @@ def word2vec2tensor(word2vec_model_path, tensor_filename, binary=False):
         True if input file in binary format.
 
     """
-    model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_model_path, binary=binary)
+    model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_model_path, binary=binary, datatype=np.float64)
Contributor: Why np.float64 instead of np.float32? That's the default parameter.

Contributor Author: For some reason, when the word2vec model is loaded with the default dtype, using the test data file word2vec_pre_kv_c I run into parsing issues. For instance, for the word "the", I get array([-0.56110603, -1.97569799, 1.66395497, -1.23224604, 0.75475103, 0.98576403, 2.26144099, -0.59829003, -0.47433099, -1.41610503], dtype=float32) instead of -0.561106 -1.975698 1.663955 -1.232246 0.754751 0.985764 2.261441 -0.598290 -0.474331 -1.416105.

Contributor: And what's the problem here? I still don't follow, sorry.

Contributor Author: Sorry, I should have been clearer. In the first dimension, for example, the original word2vec model has -0.561106, but without np.float64 it reads back as -0.56110603, so the test's equality check fails. I'm not sure of the reason, though I assume it's something in the load_word2vec_format internals.

Contributor: Which assertion do you mean? Can you link to the concrete line of code?

Also, you can fix the assertion (slightly relax the "almost equal" condition), but either way you shouldn't change the dtype here.
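The mismatch described in this thread can be reproduced without gensim; it is a property of binary floating point, not of the loader. A minimal sketch with plain NumPy, using the literal -0.561106 from the comments above:

```python
import numpy as np

text = "-0.561106"  # one coordinate as stored in the text-format word2vec file

as32 = np.float32(text)
as64 = np.float64(text)

# The decimal is not exactly representable in either type, and the nearest
# float32 differs from the nearest float64, so formatting the float32 back
# to a string produces extra digits (e.g. -0.56110603) and strict equality
# against the file contents fails.
print(float(as32) == float(as64))  # False

# The reviewer's suggested fix: relax the comparison instead of changing dtype.
np.testing.assert_allclose(float(as32), float(as64), atol=1e-6)
```

The two values agree to about 1e-8, so an "almost equal" assertion passes while exact equality does not.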

     outfiletsv = tensor_filename + '_tensor.tsv'
     outfiletsvmeta = tensor_filename + '_metadata.tsv'
 
-    with open(outfiletsv, 'w+') as file_vector:
-        with open(outfiletsvmeta, 'w+') as file_metadata:
-            for word in model.index2word:
-                file_metadata.write(gensim.utils.to_utf8(word) + gensim.utils.to_utf8('\n'))
-                vector_row = '\t'.join(str(x) for x in model[word])
-                file_vector.write(vector_row + '\n')
+    with smart_open(outfiletsv, 'wb') as file_vector, smart_open(outfiletsvmeta, 'wb') as file_metadata:
+        for word in model.index2word:
+            file_metadata.write(gensim.utils.to_utf8(word) + gensim.utils.to_utf8('\n'))
+            vector_row = '\t'.join(str(x) for x in model[word])
+            file_vector.write(gensim.utils.to_utf8(vector_row) + gensim.utils.to_utf8('\n'))
 
     logger.info("2D tensor file saved to %s", outfiletsv)
     logger.info("Tensor metadata file saved to %s", outfiletsvmeta)
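The essence of the Python 3 fix above is opening the output files in binary mode and encoding every row to UTF-8 bytes before writing. A self-contained sketch of the pattern, using plain open instead of smart_open and a local to_utf8 helper standing in for gensim.utils.to_utf8 (assumed here to be a plain UTF-8 encode), with toy words and vectors:

```python
import os
import tempfile

def to_utf8(text):
    # Stand-in for gensim.utils.to_utf8: encode unicode text as UTF-8 bytes.
    return text.encode('utf-8')

words = ['the', 'of']
vectors = {'the': [0.1, 0.2], 'of': [0.3, 0.4]}

out_dir = tempfile.mkdtemp()
tensor_path = os.path.join(out_dir, 'model_tensor.tsv')
meta_path = os.path.join(out_dir, 'model_metadata.tsv')

# Binary mode + explicit encoding: on Python 3, writing str to a 'wb' file
# raises TypeError, so every piece is converted to bytes first. The same
# code also runs unchanged on Python 2.
with open(tensor_path, 'wb') as file_vector, open(meta_path, 'wb') as file_metadata:
    for word in words:
        file_metadata.write(to_utf8(word) + to_utf8('\n'))
        vector_row = '\t'.join(str(x) for x in vectors[word])
        file_vector.write(to_utf8(vector_row) + to_utf8('\n'))

with open(tensor_path, 'rb') as f:
    print(f.read())  # b'0.1\t0.2\n0.3\t0.4\n'
```

The original code wrote str to a text-mode file, which broke under Python 3 once to_utf8 (returning bytes) entered the mix; writing bytes everywhere removes the ambiguity.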
294 changes: 186 additions & 108 deletions gensim/test/test_scripts.py
@@ -1,108 +1,186 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2018 Vimig Socrates <vimig.socrates@gmail.com> heavily influenced from @AakaashRao
# Copyright (C) 2018 Manos Stergiadis <em.stergiadis@gmail.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Automated tests for checking the output of gensim.scripts.
"""

from __future__ import unicode_literals

import json
import logging
import os.path
import unittest

from smart_open import smart_open
import numpy as np

from gensim.scripts.segment_wiki import segment_all_articles, segment_and_write_all_articles
from gensim.test.utils import datapath, get_tmpfile

from gensim.scripts.word2vec2tensor import word2vec2tensor
from gensim.utils import to_utf8
from gensim.models import KeyedVectors


class TestSegmentWiki(unittest.TestCase):

    def setUp(self):
        self.fname = datapath('enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2')
        self.expected_title = 'Anarchism'
        self.expected_section_titles = [
            'Introduction',
            'Etymology and terminology',
            'History',
            'Anarchist schools of thought',
            'Internal issues and debates',
            'Topics of interest',
            'Criticisms',
            'References',
            'Further reading',
            'External links'
        ]

    def tearDown(self):
        # remove all temporary test files
        fname = get_tmpfile('script.tst')
        extensions = ['', '.json']
        for ext in extensions:
            try:
                os.remove(fname + ext)
            except OSError:
                pass

    def test_segment_all_articles(self):
        title, sections, interlinks = next(segment_all_articles(self.fname, include_interlinks=True))

        # Check title
        self.assertEqual(title, self.expected_title)

        # Check section titles
        section_titles = [s[0] for s in sections]
        self.assertEqual(section_titles, self.expected_section_titles)

        # Check text
        first_section_text = sections[0][1]
        first_sentence = "'''Anarchism''' is a political philosophy that advocates self-governed societies"
        self.assertTrue(first_sentence in first_section_text)

        # Check interlinks
        self.assertTrue(interlinks['self-governance'] == 'self-governed')
        self.assertTrue(interlinks['Hierarchy'] == 'hierarchical')
        self.assertTrue(interlinks['Pierre-Joseph Proudhon'] == 'Proudhon')

    def test_generator_len(self):
        expected_num_articles = 106
        num_articles = sum(1 for x in segment_all_articles(self.fname))

        self.assertEqual(num_articles, expected_num_articles)

    def test_json_len(self):
        tmpf = get_tmpfile('script.tst.json')
        segment_and_write_all_articles(self.fname, tmpf, workers=1)

        expected_num_articles = 106
        num_articles = sum(1 for line in smart_open(tmpf))
        self.assertEqual(num_articles, expected_num_articles)

    def test_segment_and_write_all_articles(self):
        tmpf = get_tmpfile('script.tst.json')
        segment_and_write_all_articles(self.fname, tmpf, workers=1, include_interlinks=True)

        # Get the first line from the text file we created.
        with open(tmpf) as f:
            first = next(f)

        # decode JSON line into a Python dictionary object
        article = json.loads(first)
        title, section_titles, interlinks = article['title'], article['section_titles'], article['interlinks']

        self.assertEqual(title, self.expected_title)
        self.assertEqual(section_titles, self.expected_section_titles)

        # Check interlinks
        self.assertTrue(interlinks['self-governance'] == 'self-governed')
        self.assertTrue(interlinks['Hierarchy'] == 'hierarchical')
        self.assertTrue(interlinks['Pierre-Joseph Proudhon'] == 'Proudhon')


class TestWord2Vec2Tensor(unittest.TestCase):
    def setUp(self):
        self.datapath = datapath('word2vec_pre_kv_c')
        self.output_folder = get_tmpfile('')
Contributor: Better to add an explicit name (for example w2v2t_test).
        self.metadata_file = self.output_folder + '_metadata.tsv'
        self.tensor_file = self.output_folder + '_tensor.tsv'
        self.vector_file = self.output_folder + '_vector.tsv'

    def testConversion(self):
        word2vec2tensor(word2vec_model_path=self.datapath, tensor_filename=self.output_folder)

        try:
Contributor: No need for try/except in the current test. If an exception is raised, the test fails; that is the expected behavior.

Contributor Author: Without try/except, should we give developers more feedback on which part of the test went wrong? If so, how would I do that without the except block?

Contributor: Not in the current case; the stack trace (if something goes wrong) has enough information in these tests.

            with smart_open(self.metadata_file, 'rb') as f:
                metadata = f.readlines()
        except Exception:
            if not os.path.isfile(os.path.join(self.metadata_file)):
                self.fail(
                    'Metadata file %s creation failed. \
                    Check the parameters and input file format.' % self.metadata_file
                )
        try:
            with smart_open(self.tensor_file, 'rb') as f:
                vectors = f.readlines()
        except Exception:
            if not os.path.isfile(os.path.join(self.tensor_file)):
                self.fail(
                    'Tensor file %s creation failed. \
                    Check the parameters and input file format.' % self.tensor_file
                )

        # check if number of words and vector size in tensor file line up with word2vec
        with smart_open(self.datapath, 'rb') as f:
            first_line = f.readline().strip()

        number_words, vector_size = map(int, first_line.split(b' '))
        if not len(metadata) == len(vectors) == number_words:
Contributor: Prefer assert <CONDITION>, <MESSAGE> here.

            self.fail(
                'Metadata file %s and tensor file %s \
Contributor: We use a 120-character limit; no reason to make the lines this short.

                imply different number of rows.' % (self.metadata_file, self.tensor_file)
            )

        # write word2vec to file
Contributor: The strange part of this test: you copy part of the code from the script in order to test the script. Why?

Contributor Author: I'm sorry, could you be a bit more specific? Do you mean lines 154-157? If so, the idea was to take the metadata and tensor components that were separated and put them back together in word2vec style, without any gensim functions, and make sure they match what the word2vec2tensor.py script created. I guess it basically just tests to_utf8, though. Is it fine to remove it?

Contributor: I think it's fine to remove it, yes.

        metadata = [word.strip() for word in metadata]
        vectors = [vector.replace(b'\t', b' ') for vector in vectors]
        word2veclines = [metadata[i] + b' ' + vectors[i] for i in range(len(metadata))]
        with smart_open(self.vector_file, 'wb') as f:
            # write header
            f.write(to_utf8(str(number_words) + ' ' + str(vector_size) + '\n'))
            f.writelines(word2veclines)

        # test that the converted model loads successfully
        test_model = KeyedVectors.load_word2vec_format(self.vector_file, binary=False)

        # test vocabularies are the same
        orig_model = KeyedVectors.load_word2vec_format(self.datapath, binary=False)

        if not orig_model.vocab.keys() == test_model.vocab.keys():
            self.fail(
                'Original word2vec model %s and tensor model %s have \
                different vocabularies.' % (self.datapath, self.vector_file)
            )

        # test vectors for each word are the same
        for word in orig_model.vocab.keys():
            if not np.array_equal(orig_model[word], test_model[word]):
                self.fail(
                    'Original word2vec model %s and tensor model %s store different \
                    vectors for word %s.' % (self.datapath, self.vector_file, word)
                )


if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    unittest.main()
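For reference, the round-trip this test performs can be sketched without gensim or real model files: split a toy text-format word2vec body into the metadata (words) and tensor (tab-separated vectors) parts that word2vec2tensor writes, then reassemble and compare. All data below is illustrative:

```python
# Toy word2vec text format: "<num_words> <vector_size>" header,
# then one "<word> <v1> <v2> ..." line per word.
w2v_lines = [
    "2 3",
    "the 0.1 0.2 0.3",
    "of 0.4 0.5 0.6",
]

number_words, vector_size = map(int, w2v_lines[0].split(' '))

# Split into the two files word2vec2tensor produces.
metadata = []   # one word per line            -> *_metadata.tsv
vectors = []    # tab-separated coordinates    -> *_tensor.tsv
for line in w2v_lines[1:]:
    word, coords = line.split(' ', 1)
    metadata.append(word)
    vectors.append(coords.replace(' ', '\t'))

# Reassemble, mirroring the test: tabs back to spaces, word + vector per line.
rebuilt = ['%s %s' % (w, v.replace('\t', ' ')) for w, v in zip(metadata, vectors)]

assert len(metadata) == len(vectors) == number_words
assert rebuilt == w2v_lines[1:]
print(rebuilt)  # ['the 0.1 0.2 0.3', 'of 0.4 0.5 0.6']
```

This is the same consistency check the test encodes; the real test additionally loads the reassembled file through KeyedVectors.load_word2vec_format to verify the vocabularies and vectors survive the conversion.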