Merge pull request #1 from piskvorky/manas

code style fixes to CLI scripts
piskvorky · Mar 15, 2016 · 4f4f6b6 · 4f4f6b6
2 parents 4bdf6b0 + 014eaf2
commit 4f4f6b6
Show file tree

Hide file tree

Showing 3 changed files with 88 additions and 108 deletions.
diff --git a/gensim/scripts/glove2word2vec.py b/gensim/scripts/glove2word2vec.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2016 Manas Ranjan Kar <manasrkar91@gmail.com>
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+
+"""
+USAGE: %(program)s --input <GloVe vector file> --output <Word2vec vector file>
+
+Convert GloVe vectors in text format into the word2vec text format.
+
+The only difference between the two formats is an extra header line in word2vec,
+which contains the number of vectors and their dimensionality (two integers).
+"""
+
+import os
+import sys
+import logging
+import argparse
+
+import gensim
+from smart_open import smart_open
+
+logger = logging.getLogger(__name__)
+
+
+def get_glove_info(glove_file_name):
+    """
+    Return the number of vectors and dimensions in a file in GloVe format.
+
+    Every line is assumed to hold a word followed by its vector components, so the
+    dimensionality is the number of whitespace-separated tokens on the first line
+    minus one (the word itself). The file is read in a single pass and closed
+    afterwards; an empty file yields `(0, 0)`.
+    """
+    num_lines, num_dims = 0, 0
+    with smart_open(glove_file_name) as fin:
+        for num_lines, line in enumerate(fin, start=1):
+            if num_lines == 1:
+                # the leading token is the word, not a vector component
+                num_dims = max(len(line.split()) - 1, 0)
+    return num_lines, num_dims
+
+
+def glove2word2vec(glove_input_file, word2vec_output_file):
+    """
+    Convert `glove_input_file` in GloVe format into `word2vec_output_file` in word2vec format.
+
+    Writes a header line with the vector count and dimensionality reported by
+    `get_glove_info`, followed by the input lines unchanged. Returns the
+    `(num_lines, num_dims)` pair that went into the header.
+    """
+    num_lines, num_dims = get_glove_info(glove_input_file)
+    logger.info("converting %i vectors from %s to %s", num_lines, glove_input_file, word2vec_output_file)
+
+    # the output is opened in binary mode, so the header must be bytes, not text
+    header = "{} {}\n".format(num_lines, num_dims).encode('ascii')
+    with smart_open(word2vec_output_file, 'wb') as fout:
+        fout.write(header)
+        with smart_open(glove_input_file, 'rb') as fin:
+            for line in fin:
+                fout.write(line)
+    return num_lines, num_dims
+
+
+if __name__ == "__main__":
+    logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
+    # basicConfig() does nothing if the root logger already has handlers, so set the level explicitly too
+    logging.root.setLevel(level=logging.INFO)
+    logger.info("running %s", ' '.join(sys.argv))
+
+    # check and process cmdline input
+    program = os.path.basename(sys.argv[0])
+    if len(sys.argv) < 2:
+        # no arguments at all: print the module docstring (program name filled in) as the usage message
+        print(globals()['__doc__'] % locals())
+        sys.exit(1)
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-i", "--input", required=True,
+        help="Input file, in gloVe format (read-only).")
+    parser.add_argument(
+        "-o", "--output", required=True,
+        help="Output file, in word2vec text format (will be overwritten!).")
+    args = parser.parse_args()
+
+    # do the actual conversion
+    num_lines, num_dims = glove2word2vec(args.input, args.output)
+    logger.info('converted model with %i vectors and %i dimensions', num_lines, num_dims)
+
+    # test that the converted model loads successfully
+    model = gensim.models.Word2Vec.load_word2vec_format(args.output, binary=False)
+    logger.info('model %s successfully loaded', model)
+    logger.info('testing the model....')
+    try:
+        logger.info('top-10 most similar words to "king": %s', model.most_similar(positive=['king'], topn=10))
+        logger.info('similarity score between "woman" and "man": %s', model.similarity('woman', 'man'))
+    except KeyError as err:
+        # the probe words are English; their absence says nothing about the conversion itself
+        logger.warning('skipping similarity checks, probe word missing from the model: %s', err)
+
+    logger.info("finished running %s", program)
diff --git a/gensim/scripts/word2vec_standalone.py b/gensim/scripts/word2vec_standalone.py
@@ -10,8 +10,7 @@
 -min_count MIN-COUNT -alpha ALPHA -binary BINARY -accuracy FILE
 
 Trains a neural embedding model on text file CORPUS.
-Parameters essentially reproduce those used by the original C tool 
-(see https://code.google.com/archive/p/word2vec/).
+Parameters essentially reproduce those used by the original C tool (see https://code.google.com/archive/p/word2vec/).
 
 Parameters for training:
         -train <file>
@@ -61,21 +60,17 @@
 
 
 if __name__ == "__main__":
-
     logging.basicConfig(
         format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
         level=logging.INFO)
     logger.info("running %s", " ".join(sys.argv))
 
-
     # check and process cmdline input
     program = os.path.basename(sys.argv[0])
     if len(sys.argv) < 2:
         print(globals()['__doc__'] % locals())
         sys.exit(1)
 
-
-
     seterr(all='raise')  # don't ignore numpy errors
 
     parser = argparse.ArgumentParser()
@@ -108,8 +103,10 @@
 
     corpus = LineSentence(args.train)
 
-    model = Word2Vec(corpus, size=args.size, min_count=args.min_count, workers=args.threads, window=args.window,
-    sample=args.sample, alpha=args.alpha, sg=skipgram, hs=args.hs, negative=args.negative, cbow_mean=1, iter=args.iter)
+    model = Word2Vec(
+        corpus, size=args.size, min_count=args.min_count, workers=args.threads, window=args.window,
+        sample=args.sample, alpha=args.alpha, sg=skipgram, hs=args.hs, negative=args.negative,
+        cbow_mean=1, iter=args.iter)
 
     if args.output:
         outfile = args.output

diff --git a/glove2word2vec.py b/glove2word2vec.py