-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Make word2vec2tensor
script compatible with python3
#2147
Changes from 5 commits
06ad270
940ee83
ad6b52f
4c849dd
2fb3ec2
186d6e8
9fc2ad6
d94e41d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,108 +1,186 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2018 Manos Stergiadis <em.stergiadis@gmail.com> | ||
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html | ||
|
||
""" | ||
Automated tests for checking the output of gensim.scripts. | ||
""" | ||
|
||
from __future__ import unicode_literals | ||
|
||
import json | ||
import logging | ||
import os.path | ||
import unittest | ||
|
||
from gensim.scripts.segment_wiki import segment_all_articles, segment_and_write_all_articles | ||
from smart_open import smart_open | ||
from gensim.test.utils import datapath, get_tmpfile | ||
|
||
|
||
class TestSegmentWiki(unittest.TestCase): | ||
|
||
def setUp(self): | ||
self.fname = datapath('enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2') | ||
self.expected_title = 'Anarchism' | ||
self.expected_section_titles = [ | ||
'Introduction', | ||
'Etymology and terminology', | ||
'History', | ||
'Anarchist schools of thought', | ||
'Internal issues and debates', | ||
'Topics of interest', | ||
'Criticisms', | ||
'References', | ||
'Further reading', | ||
'External links' | ||
] | ||
|
||
def tearDown(self): | ||
# remove all temporary test files | ||
fname = get_tmpfile('script.tst') | ||
extensions = ['', '.json'] | ||
for ext in extensions: | ||
try: | ||
os.remove(fname + ext) | ||
except OSError: | ||
pass | ||
|
||
def test_segment_all_articles(self): | ||
title, sections, interlinks = next(segment_all_articles(self.fname, include_interlinks=True)) | ||
|
||
# Check title | ||
self.assertEqual(title, self.expected_title) | ||
|
||
# Check section titles | ||
section_titles = [s[0] for s in sections] | ||
self.assertEqual(section_titles, self.expected_section_titles) | ||
|
||
# Check text | ||
first_section_text = sections[0][1] | ||
first_sentence = "'''Anarchism''' is a political philosophy that advocates self-governed societies" | ||
self.assertTrue(first_sentence in first_section_text) | ||
|
||
# Check interlinks | ||
self.assertTrue(interlinks['self-governance'] == 'self-governed') | ||
self.assertTrue(interlinks['Hierarchy'] == 'hierarchical') | ||
self.assertTrue(interlinks['Pierre-Joseph Proudhon'] == 'Proudhon') | ||
|
||
def test_generator_len(self): | ||
expected_num_articles = 106 | ||
num_articles = sum(1 for x in segment_all_articles(self.fname)) | ||
|
||
self.assertEqual(num_articles, expected_num_articles) | ||
|
||
def test_json_len(self): | ||
tmpf = get_tmpfile('script.tst.json') | ||
segment_and_write_all_articles(self.fname, tmpf, workers=1) | ||
|
||
expected_num_articles = 106 | ||
num_articles = sum(1 for line in smart_open(tmpf)) | ||
self.assertEqual(num_articles, expected_num_articles) | ||
|
||
def test_segment_and_write_all_articles(self): | ||
tmpf = get_tmpfile('script.tst.json') | ||
segment_and_write_all_articles(self.fname, tmpf, workers=1, include_interlinks=True) | ||
|
||
# Get the first line from the text file we created. | ||
with open(tmpf) as f: | ||
first = next(f) | ||
|
||
# decode JSON line into a Python dictionary object | ||
article = json.loads(first) | ||
title, section_titles, interlinks = article['title'], article['section_titles'], article['interlinks'] | ||
|
||
self.assertEqual(title, self.expected_title) | ||
self.assertEqual(section_titles, self.expected_section_titles) | ||
|
||
# Check interlinks | ||
self.assertTrue(interlinks['self-governance'] == 'self-governed') | ||
self.assertTrue(interlinks['Hierarchy'] == 'hierarchical') | ||
self.assertTrue(interlinks['Pierre-Joseph Proudhon'] == 'Proudhon') | ||
|
||
|
||
if __name__ == '__main__': | ||
logging.basicConfig(level=logging.DEBUG) | ||
unittest.main() | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2018 Vimig Socrates <vimig.socrates@gmail.com> heavily influenced from @AakaashRao | ||
# Copyright (C) 2018 Manos Stergiadis <em.stergiadis@gmail.com> | ||
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html | ||
|
||
""" | ||
Automated tests for checking the output of gensim.scripts. | ||
""" | ||
|
||
from __future__ import unicode_literals | ||
|
||
import json | ||
import logging | ||
import os.path | ||
import unittest | ||
|
||
from smart_open import smart_open | ||
import numpy as np | ||
|
||
from gensim.scripts.segment_wiki import segment_all_articles, segment_and_write_all_articles | ||
from gensim.test.utils import datapath, get_tmpfile | ||
|
||
from gensim.scripts.word2vec2tensor import word2vec2tensor | ||
from gensim.utils import to_utf8 | ||
from gensim.models import KeyedVectors | ||
|
||
|
||
class TestSegmentWiki(unittest.TestCase): | ||
|
||
def setUp(self): | ||
self.fname = datapath('enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2') | ||
self.expected_title = 'Anarchism' | ||
self.expected_section_titles = [ | ||
'Introduction', | ||
'Etymology and terminology', | ||
'History', | ||
'Anarchist schools of thought', | ||
'Internal issues and debates', | ||
'Topics of interest', | ||
'Criticisms', | ||
'References', | ||
'Further reading', | ||
'External links' | ||
] | ||
|
||
def tearDown(self): | ||
# remove all temporary test files | ||
fname = get_tmpfile('script.tst') | ||
extensions = ['', '.json'] | ||
for ext in extensions: | ||
try: | ||
os.remove(fname + ext) | ||
except OSError: | ||
pass | ||
|
||
def test_segment_all_articles(self): | ||
title, sections, interlinks = next(segment_all_articles(self.fname, include_interlinks=True)) | ||
|
||
# Check title | ||
self.assertEqual(title, self.expected_title) | ||
|
||
# Check section titles | ||
section_titles = [s[0] for s in sections] | ||
self.assertEqual(section_titles, self.expected_section_titles) | ||
|
||
# Check text | ||
first_section_text = sections[0][1] | ||
first_sentence = "'''Anarchism''' is a political philosophy that advocates self-governed societies" | ||
self.assertTrue(first_sentence in first_section_text) | ||
|
||
# Check interlinks | ||
self.assertTrue(interlinks['self-governance'] == 'self-governed') | ||
self.assertTrue(interlinks['Hierarchy'] == 'hierarchical') | ||
self.assertTrue(interlinks['Pierre-Joseph Proudhon'] == 'Proudhon') | ||
|
||
def test_generator_len(self): | ||
expected_num_articles = 106 | ||
num_articles = sum(1 for x in segment_all_articles(self.fname)) | ||
|
||
self.assertEqual(num_articles, expected_num_articles) | ||
|
||
def test_json_len(self): | ||
tmpf = get_tmpfile('script.tst.json') | ||
segment_and_write_all_articles(self.fname, tmpf, workers=1) | ||
|
||
expected_num_articles = 106 | ||
num_articles = sum(1 for line in smart_open(tmpf)) | ||
self.assertEqual(num_articles, expected_num_articles) | ||
|
||
def test_segment_and_write_all_articles(self): | ||
tmpf = get_tmpfile('script.tst.json') | ||
segment_and_write_all_articles(self.fname, tmpf, workers=1, include_interlinks=True) | ||
|
||
# Get the first line from the text file we created. | ||
with open(tmpf) as f: | ||
first = next(f) | ||
|
||
# decode JSON line into a Python dictionary object | ||
article = json.loads(first) | ||
title, section_titles, interlinks = article['title'], article['section_titles'], article['interlinks'] | ||
|
||
self.assertEqual(title, self.expected_title) | ||
self.assertEqual(section_titles, self.expected_section_titles) | ||
|
||
# Check interlinks | ||
self.assertTrue(interlinks['self-governance'] == 'self-governed') | ||
self.assertTrue(interlinks['Hierarchy'] == 'hierarchical') | ||
self.assertTrue(interlinks['Pierre-Joseph Proudhon'] == 'Proudhon') | ||
|
||
|
||
class TestWord2Vec2Tensor(unittest.TestCase): | ||
def setUp(self): | ||
self.datapath = datapath('word2vec_pre_kv_c') | ||
self.output_folder = get_tmpfile('') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Better to add an explicit name (for example |
||
self.metadata_file = self.output_folder + '_metadata.tsv' | ||
self.tensor_file = self.output_folder + '_tensor.tsv' | ||
self.vector_file = self.output_folder + '_vector.tsv' | ||
|
||
def testConversion(self): | ||
word2vec2tensor(word2vec_model_path=self.datapath, tensor_filename=self.output_folder) | ||
|
||
try: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No need for try/except in the current test. If an exception is raised, the test fails — this is the expected behavior. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Without try/except, should we give the developers more feedback about which part of the test went wrong? If so, how would I do that without the except block? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not in the current case; the stack trace (if something goes wrong) has enough information in the current tests. |
||
with smart_open(self.metadata_file, 'rb') as f: | ||
metadata = f.readlines() | ||
except Exception: | ||
if not os.path.isfile(os.path.join(self.metadata_file)): | ||
self.fail( | ||
'Metadata file %s creation failed. \ | ||
Check the parameters and input file format.' % self.metadata_file | ||
) | ||
try: | ||
with smart_open(self.tensor_file, 'rb') as f: | ||
vectors = f.readlines() | ||
except Exception: | ||
if not os.path.isfile(os.path.join(self.tensor_file)): | ||
self.fail( | ||
'Tensor file %s creation failed. \ | ||
Check the parameters and input file format.' % self.tensor_file | ||
) | ||
|
||
# check if number of words and vector size in tensor file line up with word2vec | ||
with smart_open(self.datapath, 'rb') as f: | ||
first_line = f.readline().strip() | ||
|
||
number_words, vector_size = map(int, first_line.split(b' ')) | ||
if not len(metadata) == len(vectors) == number_words: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
self.fail( | ||
'Metadata file %s and tensor file %s \ | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We are using a 120-character limit; there is no reason to make the lines short. |
||
imply different number of rows.' % (self.metadata_file, self.tensor_file) | ||
) | ||
|
||
# write word2vec to file | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The strange part of the test: you copy part of the code from the script in order to test that script — why? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm sorry, could you be a bit more specific? Do you mean the lines from 154-157? If so, the idea was to take the metadata and tensor components that were separated and put them back together in w2v style without any gensim functions and make sure they are the same as what is created by the There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think it's fine to remove it, yes |
||
metadata = [word.strip() for word in metadata] | ||
vectors = [vector.replace(b'\t', b' ') for vector in vectors] | ||
word2veclines = [metadata[i] + b' ' + vectors[i] for i in range(len(metadata))] | ||
with smart_open(self.vector_file, 'wb') as f: | ||
# write header | ||
f.write(to_utf8(str(number_words) + ' ' + str(vector_size) + '\n')) | ||
f.writelines(word2veclines) | ||
|
||
# test that the converted model loads successfully | ||
test_model = KeyedVectors.load_word2vec_format(self.vector_file, binary=False) | ||
|
||
# test vocabularies are the same | ||
orig_model = KeyedVectors.load_word2vec_format(self.datapath, binary=False) | ||
|
||
if not orig_model.vocab.keys() == test_model.vocab.keys(): | ||
self.fail( | ||
'Original word2vec model %s and tensor model %s have \ | ||
different vocabularies.' % (self.datapath, self.vector_file) | ||
) | ||
|
||
# test vectors for each word are the same | ||
for word in orig_model.vocab.keys(): | ||
if not np.array_equal(orig_model[word], test_model[word]): | ||
self.fail( | ||
'Original word2vec model %s and tensor model %s store different \ | ||
vectors for word %s.' % (self.datapath, self.vector_file, word) | ||
) | ||
|
||
|
||
if __name__ == '__main__': | ||
logging.basicConfig(level=logging.DEBUG) | ||
unittest.main() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why
np.float64
instead of np.float32
(that's the default parameter)?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For some reason, when the word2vec model is loaded, if we leave the default, using the test data file
word2vec_pre_kv_c
I run into parsing issues. For instance, for the word the
, I get array([-0.56110603, -1.97569799, 1.66395497, -1.23224604, 0.75475103, 0.98576403, 2.26144099, -0.59829003, -0.47433099, -1.41610503], dtype=float32)
instead of -0.561106 -1.975698 1.663955 -1.232246 0.754751 0.985764 2.261441 -0.598290 -0.474331 -1.416105
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
And what's the problem here? I still don't get it, sorry.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry, I should have been clearer. In the first dimension, as an example, the original word2vec model has
-0.561106
but for some reason, without the np.float64
when it is read in, it displays -0.56110603
, so the test fails the equality check. I'm not sure of the reason, though I assume it is something to do with the load_word2vec_format
internals.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What assertion do you mean? Can you link to a concrete line of code?
Also, you can fix the assertion (slightly relax the "almost equal" condition), but in any case, you shouldn't change the dtype here.