From 809f2ba186eb5fecb812340259d0751f26ff9d99 Mon Sep 17 00:00:00 2001 From: Pranay Mathur Date: Sun, 25 Sep 2016 18:18:53 +0530 Subject: [PATCH 01/17] Updated n_similarity method Fixes Issue #743, n_similarity method now raises ZeroDivisionError if atleast one empty list is passed to it. --- gensim/models/word2vec.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index bf7793524e..469180d2ad 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1539,9 +1539,15 @@ def n_similarity(self, ws1, ws2): True """ - v1 = [self[word] for word in ws1] - v2 = [self[word] for word in ws2] - return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0))) + if len(ws1) > 0 and len(ws2) > 0: + v1 = [self[word] for word in ws1] + v2 = [self[word] for word in ws2] + return dot(matutils.unitvec(array(v1).mean(axis=0)), + matutils.unitvec(array(v2).mean(axis=0))) + else: + raise ZeroDivisionError('Atleast one of the passed list is empty.') + return + def init_sims(self, replace=False): """ From 3084f6baa94130b40a890707139ee624a30c3ea1 Mon Sep 17 00:00:00 2001 From: Pranay Mathur Date: Sun, 25 Sep 2016 18:19:55 +0530 Subject: [PATCH 02/17] Updated testSimilarities method Added new test cases in testSimilarities method which makes sure whether ZeroDivisionError is raised if atleast one empty list is passed to word2vec.n_similarities method Related to fix for issue #743 --- gensim/test/test_word2vec.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index 7e0368f13f..a417a31e5d 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -370,9 +370,12 @@ def testSimilarities(self): model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=0, negative=2) model.build_vocab(sentences) model.train(sentences) - + self.assertTrue(model.n_similarity(['graph', 'trees'], ['trees', 'graph'])) self.assertTrue(model.n_similarity(['graph'], ['trees']) == model.similarity('graph', 'trees')) + self.assertRaises(ZeroDivisionError,model.n_similarity(['graph', 'trees'], [])) + self.assertRaises(ZeroDivisionError,model.n_similarity([], [])) + self.assertRaises(ZeroDivisionError,model.n_similarity([], ['graph', 'trees'])) def testSimilarBy(self): """Test word2vec similar_by_word and similar_by_vector.""" From 14ce73f4c14a9c14003c34e55fdcb27d3d99450b Mon Sep 17 00:00:00 2001 From: Pranay Mathur Date: Sun, 25 Sep 2016 18:20:33 +0530 Subject: [PATCH 03/17] Update CHANGELOG.md --- CHANGELOG.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5fbee1495e..04dcddf6e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,10 @@ Changes ======= -0.13.2, 2016-08-19 -* export_phrases in Phrases model changed. Fixed issue #794 and added test cases in test/test_phrases.py(@AadityaJ, -[#879](https://github.com/RaRe-Technologies/gensim/pull/879)) - - bigram construction can now support multiple bigrams within one sentence -* wordtopics has changed to word_topics in ldamallet, and fixed issue #764. (@bhargavvader, [#771](https://github.com/RaRe-Technologies/gensim/pull/771)) +0.13.2, 2016-09-25 + +* Fixed issue #743, In word2vec's n_similarity method if atleast one empty list is passed ZeroDivisionError is raised. +* wordtopics has changed to word_topics in ldamallet, and fixed issue #764. (@bhargavvader, [#771](https://github.com/RaRe-Technologies/gensim/pull/771)) - assigning wordtopics value of word_topics to keep backward compatibility, for now * topics, topn parameters changed to num_topics and num_words in show_topics() and print_topics()(@droudy, [#755](https://github.com/RaRe-Technologies/gensim/pull/755)) - In hdpmodel and dtmmodel @@ -47,7 +46,7 @@ Changes * Control whether to use lowercase for computing word2vec accuracy. (@alantian, #607) * Easy import of GloVe vectors using Gensim (Manas Ranjan Kar, #625) - Allow easy port of GloVe vectors into Gensim - - Standalone script with command line arguments, compatible with Python>=2.6 + - Standalone script with command line arguments, compatible with Python>=2.6 - Usage: python -m gensim.scripts.glove2word2vec -i glove_vectors.txt -o output_word2vec_compatible.txt * Add `similar_by_word()` and `similar_by_vector()` to word2vec (@isohyt, #381) * Convenience method for similarity of two out of training sentences to doc2vec (@ellolo, #707) From 552cd5912201d21867c8476689392f92d648c508 Mon Sep 17 00:00:00 2001 From: Pranay Mathur Date: Sun, 25 Sep 2016 18:20:49 +0530 Subject: [PATCH 04/17] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 04dcddf6e9..d59511f79d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ Changes 0.13.2, 2016-09-25 -* Fixed issue #743, In word2vec's n_similarity method if atleast one empty list is passed ZeroDivisionError is raised. +* Fixed issue #743 , In word2vec's n_similarity method if atleast one empty list is passed ZeroDivisionError is raised. * wordtopics has changed to word_topics in ldamallet, and fixed issue #764. (@bhargavvader, [#771](https://github.com/RaRe-Technologies/gensim/pull/771)) - assigning wordtopics value of word_topics to keep backward compatibility, for now * topics, topn parameters changed to num_topics and num_words in show_topics() and print_topics()(@droudy, [#755](https://github.com/RaRe-Technologies/gensim/pull/755)) From a826db25b62de1a25774d746a77d39e309c7d371 Mon Sep 17 00:00:00 2001 From: Pranay Mathur Date: Sun, 25 Sep 2016 18:27:09 +0530 Subject: [PATCH 05/17] Rename .travis.yml to .temp.yml --- .travis.yml => .temp.yml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .travis.yml => .temp.yml (100%) diff --git a/.travis.yml b/.temp.yml similarity index 100% rename from .travis.yml rename to .temp.yml From b77ac477d72dbe0950ebaac1a80a36b2aa01510b Mon Sep 17 00:00:00 2001 From: Pranay Mathur Date: Sun, 25 Sep 2016 18:27:46 +0530 Subject: [PATCH 06/17] Create .travis.yml --- .travis.yml | 1 + 1 file changed, 1 insertion(+) create mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/.travis.yml @@ -0,0 +1 @@ + From ec51a48bed7ad9f5f364e846878eba0d718da3d6 Mon Sep 17 00:00:00 2001 From: Pranay Mathur Date: Sun, 25 Sep 2016 18:28:22 +0530 Subject: [PATCH 07/17] Delete .temp.yml --- .temp.yml | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 .temp.yml diff --git a/.temp.yml b/.temp.yml deleted file mode 100644 index 18f7502e71..0000000000 --- a/.temp.yml +++ /dev/null @@ -1,22 +0,0 @@ -sudo: false -language: python -python: - - "2.6" - - "2.7" - - "3.3" - - "3.4" - - "3.5" -before_install: - - wget 'http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh' -O miniconda.sh - - chmod +x miniconda.sh - - ./miniconda.sh -b - - export PATH=/home/travis/miniconda2/bin:$PATH - - conda update --yes conda -install: - - conda create --yes -n gensim-test python=$TRAVIS_PYTHON_VERSION pip atlas numpy scipy - - source activate gensim-test - - pip install pyemd - - pip install annoy - - pip install testfixtures - - python setup.py install -script: python setup.py test From 1288dcfea8191a8f5aa3e4020af06f878f8164dc Mon Sep 17 00:00:00 2001 From: Pranay Mathur Date: Sun, 25 Sep 2016 18:28:31 +0530 Subject: [PATCH 08/17] Update .travis.yml --- .travis.yml | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 8b13789179..18f7502e71 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1 +1,22 @@ - +sudo: false +language: python +python: + - "2.6" + - "2.7" + - "3.3" + - "3.4" + - "3.5" +before_install: + - wget 'http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh' -O miniconda.sh + - chmod +x miniconda.sh + - ./miniconda.sh -b + - export PATH=/home/travis/miniconda2/bin:$PATH + - conda update --yes conda +install: + - conda create --yes -n gensim-test python=$TRAVIS_PYTHON_VERSION pip atlas numpy scipy + - source activate gensim-test + - pip install pyemd + - pip install annoy + - pip install testfixtures + - python setup.py install +script: python setup.py test From 8bc778c1b0952135500747bfddac973a719658a4 Mon Sep 17 00:00:00 2001 From: Pranay Mathur Date: Sun, 25 Sep 2016 19:41:37 +0530 Subject: [PATCH 09/17] fixed assertRaises statements, to prevent error --- gensim/test/test_word2vec.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index a417a31e5d..f0f8ddf057 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -123,7 +123,8 @@ def testTooShortBinaryWord2VecFormat(self): f = open(tfile, 'r+b') f.write(b'13') # write wrong (too-long) vector count f.close() - self.assertRaises(EOFError, word2vec.Word2Vec.load_word2vec_format, tfile, binary=True) + self. + (EOFError, word2vec.Word2Vec.load_word2vec_format, tfile, binary=True) def testTooShortTextWord2VecFormat(self): tfile = testfile() @@ -373,9 +374,10 @@ def testSimilarities(self): self.assertTrue(model.n_similarity(['graph', 'trees'], ['trees', 'graph'])) self.assertTrue(model.n_similarity(['graph'], ['trees']) == model.similarity('graph', 'trees')) - self.assertRaises(ZeroDivisionError,model.n_similarity(['graph', 'trees'], [])) - self.assertRaises(ZeroDivisionError,model.n_similarity([], [])) - self.assertRaises(ZeroDivisionError,model.n_similarity([], ['graph', 'trees'])) + with self.assertRaises(ZeroDivisionError): + model.n_similarity(['graph', 'trees'], []) + model.n_similarity([], []) + model.n_similarity([], ['graph', 'trees']) def testSimilarBy(self): """Test word2vec similar_by_word and similar_by_vector.""" From 3398e5acec3e4e82cee1780227276a19061ad6bf Mon Sep 17 00:00:00 2001 From: Pranay Mathur Date: Sun, 25 Sep 2016 19:52:48 +0530 Subject: [PATCH 10/17] Update test_word2vec.py --- gensim/test/test_word2vec.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index f0f8ddf057..2fcd5834d3 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -123,8 +123,7 @@ def testTooShortBinaryWord2VecFormat(self): f = open(tfile, 'r+b') f.write(b'13') # write wrong (too-long) vector count f.close() - self. - (EOFError, word2vec.Word2Vec.load_word2vec_format, tfile, binary=True) + self.assertRaises(EOFError, word2vec.Word2Vec.load_word2vec_format, tfile, binary=True) def testTooShortTextWord2VecFormat(self): tfile = testfile() From eea3590ec1d195c04097c62539e88746e3a7f197 Mon Sep 17 00:00:00 2001 From: Pranay Mathur Date: Sun, 25 Sep 2016 20:34:31 +0530 Subject: [PATCH 11/17] fixing python 2.6 compatibility issues with assertRaises --- gensim/test/test_word2vec.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index 2fcd5834d3..723ed3ad59 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -373,10 +373,9 @@ def testSimilarities(self): self.assertTrue(model.n_similarity(['graph', 'trees'], ['trees', 'graph'])) self.assertTrue(model.n_similarity(['graph'], ['trees']) == model.similarity('graph', 'trees')) - with self.assertRaises(ZeroDivisionError): - model.n_similarity(['graph', 'trees'], []) - model.n_similarity([], []) - model.n_similarity([], ['graph', 'trees']) + self.assertRaises(ZeroDivisionError, model.n_similarity, ['graph', 'trees'], []) + self.assertRaises(ZeroDivisionError, model.n_similarity, [], ['graph', 'trees']) + self.assertRaises(ZeroDivisionError, model.n_similarity, [], []) def testSimilarBy(self): """Test word2vec similar_by_word and similar_by_vector.""" From 5235bb6c573f7469ecee8cd19343f19bd9e307aa Mon Sep 17 00:00:00 2001 From: Pranay Mathur Date: Sun, 25 Sep 2016 23:42:32 +0530 Subject: [PATCH 12/17] Added Requested Changes, removing return --- gensim/models/word2vec.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 469180d2ad..9ff1110d01 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1539,14 +1539,12 @@ def n_similarity(self, ws1, ws2): True """ - if len(ws1) > 0 and len(ws2) > 0: - v1 = [self[word] for word in ws1] - v2 = [self[word] for word in ws2] - return dot(matutils.unitvec(array(v1).mean(axis=0)), - matutils.unitvec(array(v2).mean(axis=0))) - else: + if not(len(ws1) > 0 and len(ws2) > 0): raise ZeroDivisionError('Atleast one of the passed list is empty.') - return + v1 = [self[word] for word in ws1] + v2 = [self[word] for word in ws2] + return dot(matutils.unitvec(array(v1).mean(axis=0)), + matutils.unitvec(array(v2).mean(axis=0))) def init_sims(self, replace=False): From 8f3e51cd03aebbaf84e3806698b4769d3979c4b5 Mon Sep 17 00:00:00 2001 From: Pranay Mathur Date: Sun, 25 Sep 2016 23:46:35 +0530 Subject: [PATCH 13/17] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d59511f79d..8d25f33e22 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ Changes 0.13.2, 2016-09-25 * Fixed issue #743 , In word2vec's n_similarity method if atleast one empty list is passed ZeroDivisionError is raised. +* export_phrases in Phrases model changed. Fixed issue #794 and added test cases in test/test_phrases.py(@AadityaJ, #879) bigram construction can now support multiple bigrams within one sentence * wordtopics has changed to word_topics in ldamallet, and fixed issue #764. (@bhargavvader, [#771](https://github.com/RaRe-Technologies/gensim/pull/771)) - assigning wordtopics value of word_topics to keep backward compatibility, for now * topics, topn parameters changed to num_topics and num_words in show_topics() and print_topics()(@droudy, [#755](https://github.com/RaRe-Technologies/gensim/pull/755)) From 306b2ec6872b55b0042fb157ad3c2b54fc7bfdf3 Mon Sep 17 00:00:00 2001 From: Pranay Mathur Date: Mon, 26 Sep 2016 09:20:26 +0530 Subject: [PATCH 14/17] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8d25f33e22..15f3c4fefd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ Changes ======= -0.13.2, 2016-09-25 +0.13.2, 2016-08-19 * Fixed issue #743 , In word2vec's n_similarity method if atleast one empty list is passed ZeroDivisionError is raised. * export_phrases in Phrases model changed. Fixed issue #794 and added test cases in test/test_phrases.py(@AadityaJ, #879) bigram construction can now support multiple bigrams within one sentence From ec49724498d43928cbb8a7d5cbe8e1bd93d8f244 Mon Sep 17 00:00:00 2001 From: Pranay Mathur Date: Mon, 26 Sep 2016 09:22:22 +0530 Subject: [PATCH 15/17] Update word2vec.py --- gensim/models/word2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 9ff1110d01..27a9829f42 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1539,7 +1539,7 @@ def n_similarity(self, ws1, ws2): True """ - if not(len(ws1) > 0 and len(ws2) > 0): + if not(len(ws1) and len(ws2)): raise ZeroDivisionError('Atleast one of the passed list is empty.') v1 = [self[word] for word in ws1] v2 = [self[word] for word in ws2] From dc4df1c296af34e381c6e515b022d55e56e2d98f Mon Sep 17 00:00:00 2001 From: Pranay Mathur Date: Mon, 26 Sep 2016 16:16:04 +0530 Subject: [PATCH 16/17] Update CHANGELOG.md --- CHANGELOG.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 15f3c4fefd..e3cbaa9727 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,12 @@ Changes ======= +0.13.3, 2016-09-26 + +* Fixed issue #743 , In word2vec's n_similarity method if atleast one empty list is passed ZeroDivisionError is raised, added test cases in test/test_word2vec.py(@pranay360, #883) + 0.13.2, 2016-08-19 -* Fixed issue #743 , In word2vec's n_similarity method if atleast one empty list is passed ZeroDivisionError is raised. * export_phrases in Phrases model changed. Fixed issue #794 and added test cases in test/test_phrases.py(@AadityaJ, #879) bigram construction can now support multiple bigrams within one sentence * wordtopics has changed to word_topics in ldamallet, and fixed issue #764. (@bhargavvader, [#771](https://github.com/RaRe-Technologies/gensim/pull/771)) - assigning wordtopics value of word_topics to keep backward compatibility, for now From 9285ef96c604ed435009b283d8748bd6f4f7f2f6 Mon Sep 17 00:00:00 2001 From: Pranay Mathur Date: Mon, 26 Sep 2016 18:08:59 +0530 Subject: [PATCH 17/17] Update CHANGELOG.md --- CHANGELOG.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e3cbaa9727..fdbc1e89f6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,13 +2,13 @@ Changes ======= 0.13.3, 2016-09-26 - * Fixed issue #743 , In word2vec's n_similarity method if atleast one empty list is passed ZeroDivisionError is raised, added test cases in test/test_word2vec.py(@pranay360, #883) 0.13.2, 2016-08-19 - -* export_phrases in Phrases model changed. Fixed issue #794 and added test cases in test/test_phrases.py(@AadityaJ, #879) bigram construction can now support multiple bigrams within one sentence -* wordtopics has changed to word_topics in ldamallet, and fixed issue #764. (@bhargavvader, [#771](https://github.com/RaRe-Technologies/gensim/pull/771)) +* export_phrases in Phrases model changed. Fixed issue #794 and added test cases in test/test_phrases.py(@AadityaJ, +[#879](https://github.com/RaRe-Technologies/gensim/pull/879)) + - bigram construction can now support multiple bigrams within one sentence +* wordtopics has changed to word_topics in ldamallet, and fixed issue #764. (@bhargavvader, [#771](https://github.com/RaRe-Technologies/gensim/pull/771)) - assigning wordtopics value of word_topics to keep backward compatibility, for now * topics, topn parameters changed to num_topics and num_words in show_topics() and print_topics()(@droudy, [#755](https://github.com/RaRe-Technologies/gensim/pull/755)) - In hdpmodel and dtmmodel @@ -50,7 +50,7 @@ Changes * Control whether to use lowercase for computing word2vec accuracy. (@alantian, #607) * Easy import of GloVe vectors using Gensim (Manas Ranjan Kar, #625) - Allow easy port of GloVe vectors into Gensim - - Standalone script with command line arguments, compatible with Python>=2.6 + - Standalone script with command line arguments, compatible with Python>=2.6 - Usage: python -m gensim.scripts.glove2word2vec -i glove_vectors.txt -o output_word2vec_compatible.txt * Add `similar_by_word()` and `similar_by_vector()` to word2vec (@isohyt, #381) * Convenience method for similarity of two out of training sentences to doc2vec (@ellolo, #707)