cbow_mean default changed from 0 to 1. #538

Closed · wants to merge 50 commits

Commits (50)
1c63c9a
Merge branch 'release-0.12.3rc1'
tmylk Nov 5, 2015
280a488
Merge branch 'release-0.12.3'
tmylk Nov 6, 2015
ddeb002
Merge branch 'release-0.12.3'
tmylk Nov 6, 2015
f2ac3a9
Update CHANGELOG.txt
tmylk Nov 6, 2015
cf09e8c
Update CHANGELOG.txt
tmylk Nov 6, 2015
b8b8f57
cbow_mean default changed from 0 to 1.
akutuzov Nov 23, 2015
6456cbc
Hyperparameters' default values are aligned with Mikolov's word2vec.
akutuzov Jan 13, 2016
966a4b0
Merge remote-tracking branch 'upstream/master' into develop
akutuzov Jan 13, 2016
d9ec7e4
Fix for #538: cbow_mean default changed from 0 to 1.
akutuzov Jan 13, 2016
76d2df7
Update changelog
akutuzov Jan 13, 2016
0b6f45b
(main) defaults aligned to Mikolov's word2vec.
akutuzov Jan 14, 2016
7fb5f18
Merge remote-tracking branch 'upstream/develop' into develop
akutuzov Jan 14, 2016
bc7a447
word2vec (main) now mimics command-line arguments for Mikolov's word2…
akutuzov Jan 14, 2016
e689b4f
Fix for #538
akutuzov Jan 14, 2016
a5274ab
Fix for #538 (tabs and spaces).
akutuzov Jan 14, 2016
5c32ca8
Fix for #538 (tests).
akutuzov Jan 15, 2016
ac889b3
For #538: slightly relaxed sanity check demands (because now default …
akutuzov Jan 15, 2016
92087c0
Fixes as per @gojomo comments.
akutuzov Jan 15, 2016
06785b5
Test fixes due to negative sampling becoming default behavior.
akutuzov Jan 15, 2016
3ac5fd4
Commented out tests which work for HS only.
akutuzov Jan 15, 2016
e0ac3d2
Fix for #538.
akutuzov Jan 16, 2016
0aad977
Yet another fix.
akutuzov Jan 16, 2016
1db616b
Merge remote-tracking branch 'upstream/develop' into develop
akutuzov Jan 16, 2016
e4eb8ba
Merging.
akutuzov Jan 16, 2016
ab25344
Fix for CBOW test.
akutuzov Jan 16, 2016
6b3f01d
Merge remote-tracking branch 'upstream/develop' into develop
akutuzov Jan 16, 2016
2bf45d3
Changelog mention of #538
akutuzov Jan 16, 2016
1a579ec
Fix for CBOW negative sampling tests.
akutuzov Jan 17, 2016
78372bf
Merge remote-tracking branch 'upstream/develop' into develop
akutuzov Jan 26, 2016
0c10fa6
Factoring out word2vec _main__ into gensim/scripts
akutuzov Jan 26, 2016
8a3d58b
Use logger instead of logging.
akutuzov Jan 27, 2016
c5249b9
Made Changelog less verbose about word2vec defaults changed.
akutuzov Jan 27, 2016
a40e624
Fixes to word2vec_standalone.py as per Radim's comments.
akutuzov Jan 27, 2016
dbd0eab
Alpha argument. with different defaults for CBOW ans skipgram.
akutuzov Jan 27, 2016
b61287a
resolve merge conflict in Changelog
tmylk Jan 29, 2016
3ade404
Merge branch 'release-0.12.4' with #596
tmylk Jan 31, 2016
9e6522e
Merge branch 'release-0.13.0'
tmylk Jun 10, 2016
87c4e9c
Merge branch 'release-0.13.0'
tmylk Jun 10, 2016
9c74b40
Release version typo fix
tmylk Jun 10, 2016
7b30025
Merge branch 'release-0.13.0rc1'
tmylk Jun 10, 2016
de79c8e
Merge branch 'release-0.13.0'
tmylk Jun 22, 2016
d4f9cc5
Merge branch 'release-0.13.1'
tmylk Jun 23, 2016
e0627c6
Merge remote-tracking branch 'upstream/master' into develop
akutuzov Jul 2, 2016
b8b30c2
Finalizing.
akutuzov Jul 2, 2016
f3f2a52
'fisrt_push'
Nowow Jul 2, 2016
873f184
Initial shippable release
Nowow Dec 8, 2016
68a3e86
Merge remote-tracking branch 'upstream/develop' into develop
akutuzov Dec 15, 2016
498474d
Evaluation function to measure model correlation with human similarit…
akutuzov Dec 15, 2016
ce64d5a
Updating semantic similarity evaluation.
akutuzov Dec 15, 2016
0936971
Scipy stats import
akutuzov Dec 15, 2016
81 changes: 81 additions & 0 deletions gensim/models/keyedvectors.py
@@ -24,6 +24,7 @@
from gensim.corpora.dictionary import Dictionary
from six import string_types
from six.moves import xrange
from scipy import stats


logger = logging.getLogger(__name__)
@@ -478,6 +479,86 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c
sections.append(total)
return sections

@staticmethod
def log_evaluation(pearson, spearman, oov, pairs):
logger.info('Pearson correlation coefficient against {0:s}: {1:.4f}'.format(pairs, pearson[0]))
logger.info('Spearman rank-order correlation coefficient against {0:s}: {1:.4f}'.format(pairs, spearman[0]))
logger.info('Pairs with unknown words ratio: {0:.1f}%'.format(oov))

def evaluation(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False):
"""
Compute correlation of the model with human similarity judgments. `pairs` is a filename of a dataset where
lines are 3-tuples, each consisting of a word pair and a similarity value, separated by `delimiter'.
Example datasets can be found at http://technion.ac.il/~ira.leviant/wordsim353.zip or at
https://www.cl.cam.ac.uk/~fh295/SimLex-999.zip.

The model is evaluated using Pearson correlation coefficient and Spearman rank-order correlation coefficient
between the similarities from the dataset and the similarities produced by the model itself. .
The results are printed to log and returned as a triple (pearson, spearman, ratio of pairs with unknown words).

Use `restrict_vocab` to ignore all word pairs containing a word not in the first `restrict_vocab`
words (default 300,000). This may be meaningful if you've sorted the vocabulary by descending frequency.
If `case_insensitive` is True, the first `restrict_vocab` words are taken, and then case normalization
is performed.

Use `case_insensitive` to convert all words in the pairs and vocab to their uppercase form before
evaluating the model (default True). Useful when you expect case-mismatch between training tokens
and words pairs in the dataset. If there are multiple case variants of a single word, the vector for the first
occurrence (also the most frequent if vocabulary is sorted) is taken.

Use `dummy4unknown=True' to produce zero-valued similarities for pairs with out-of-vocabulary words.
Otherwise (default False), these pairs are skipped entirely.
"""
ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
ok_vocab = dict((w.upper(), v) for w, v in reversed(ok_vocab)) if case_insensitive else dict(ok_vocab)

similarity_gold = []
similarity_model = []
oov = 0

original_vocab = self.vocab
self.vocab = ok_vocab

for line_no, line in enumerate(utils.smart_open(pairs)):
line = utils.to_unicode(line)
if line.startswith('#'):
# May be a comment
continue
else:
try:
if case_insensitive:
a, b, sim = [word.upper() for word in line.split(delimiter)]
else:
a, b, sim = [word for word in line.split(delimiter)]
sim = float(sim)
                except ValueError:  # wrong number of fields, or the similarity is not a float
logger.info('skipping invalid line #{0:d} in {1:s}'.format(line_no, pairs))
continue
if a not in ok_vocab or b not in ok_vocab:
oov += 1
if dummy4unknown:
similarity_model.append(0.0)
similarity_gold.append(sim)
continue
else:
logger.debug('skipping line #{0:d} with OOV words: {1:s}'.format(line_no, line.strip()))
continue
similarity_gold.append(sim) # Similarity from the dataset
similarity_model.append(self.similarity(a, b)) # Similarity from the model
self.vocab = original_vocab
spearman = stats.spearmanr(similarity_gold, similarity_model)
pearson = stats.pearsonr(similarity_gold, similarity_model)
        if dummy4unknown:
            # with dummy4unknown, OOV pairs are already counted in similarity_gold
            oov_ratio = float(oov) / len(similarity_gold) * 100
        else:
            oov_ratio = float(oov) / (len(similarity_gold) + oov) * 100

logger.debug('Pearson correlation coefficient against {0:s}: {1:f} with p-value {2:f}'
.format(pairs, pearson[0], pearson[1]))
logger.debug('Spearman rank-order correlation coefficient against {0:s}: {1:f} with p-value {2:f}'
.format(pairs, spearman[0], spearman[1]))
logger.debug('Pairs with unknown words: {0:d}'.format(oov))
self.log_evaluation(pearson, spearman, oov_ratio, pairs)
return pearson, spearman, oov_ratio


def init_sims(self, replace=False):
"""
Precompute L2-normalized vectors.
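To illustrate the new evaluation API, here is a minimal usage sketch (the model file name, the dataset file name and the two sample lines are placeholders, not part of this PR):

    from gensim.models import Word2Vec

    # wordsim353.tsv: tab-separated lines of word pair + similarity score, e.g.
    #   tiger   cat     7.35
    #   book    paper   7.46
    model = Word2Vec.load('my_word2vec.model')
    pearson, spearman, oov_ratio = model.wv.evaluation('wordsim353.tsv')
    print(pearson[0], spearman[0], oov_ratio)

The scipy results are (statistic, p-value) pairs, hence the [0] indexing; `oov_ratio` is the percentage of pairs that contained out-of-vocabulary words.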
8 changes: 8 additions & 0 deletions gensim/models/word2vec.py
@@ -99,6 +99,7 @@
from six import iteritems, itervalues, string_types
from six.moves import xrange
from types import GeneratorType
from scipy import stats

logger = logging.getLogger(__name__)

@@ -1392,6 +1393,13 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=None, case_inse
most_similar = most_similar or KeyedVectors.most_similar
return self.wv.accuracy(questions, restrict_vocab, most_similar, case_insensitive)

@staticmethod
def log_evaluation(pearson, spearman, oov, pairs):
return KeyedVectors.log_evaluation(pearson, spearman, oov, pairs)

    def evaluation(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False):
        # delegate to the bound KeyedVectors method; passing `self` here would shift all arguments by one
        return self.wv.evaluation(pairs, delimiter, restrict_vocab, case_insensitive, dummy4unknown)

def __str__(self):
return "%s(vocab=%s, size=%s, alpha=%s)" % (self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha)

32 changes: 32 additions & 0 deletions gensim/scripts/word2vec_standalone.py
@@ -10,7 +10,11 @@
-min_count MIN-COUNT -alpha ALPHA -binary BINARY -accuracy FILE

Trains a neural embedding model on text file CORPUS.
Parameters essentially reproduce those used by the original C tool
(see https://code.google.com/archive/p/word2vec/).

Parameters for training:
@@ -53,6 +57,18 @@
import os.path
import sys
import argparse

from numpy import seterr

logger = logging.getLogger(__name__)
@@ -61,10 +77,16 @@


if __name__ == "__main__":
    from gensim.models.word2vec import Word2Vec, LineSentence  # avoid referencing __main__ in pickle
    from gensim.models.word2vec_inner import FAST_VERSION
logging.basicConfig(
format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
level=logging.INFO)
logger.info("running %s", " ".join(sys.argv))
    logger.info("using optimization %s", FAST_VERSION)


# check and process cmdline input
program = os.path.basename(sys.argv[0])
@@ -79,8 +101,13 @@
parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors")
parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5)
parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100)
    parser.add_argument("-sample", help="Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; "
                        "default is 1e-3, useful range is (0, 1e-5)", type=float, default=1e-3)
parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, choices=[0, 1])
parser.add_argument("-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", type=int, default=5)
parser.add_argument("-threads", help="Use THREADS threads (default 3)", type=int, default=3)
@@ -104,10 +131,15 @@

corpus = LineSentence(args.train)

    model = Word2Vec(
        corpus, size=args.size, min_count=args.min_count, workers=args.threads,
        window=args.window, sample=args.sample, alpha=args.alpha, sg=skipgram,
        hs=args.hs, negative=args.negative, cbow_mean=1, iter=args.iter)

if args.output:
outfile = args.output
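For reference, a typical invocation of the standalone script with the flags defined above might look like the following (the corpus and output file names are placeholders):

    python word2vec_standalone.py -train corpus.txt -output vectors.txt -size 100 -window 5 -sample 1e-3 -negative 5 -threads 4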
1 change: 1 addition & 0 deletions syntax_model_files/README.md
@@ -0,0 +1 @@
# word2vec_syntax
181 changes: 181 additions & 0 deletions syntax_model_files/gen_iter.py
@@ -0,0 +1,181 @@
# -*- coding: utf-8 -*-
"""
Created on Tue May 24 22:43:11 2016

@author: robert
"""

#TWEAK TIME

import pickle
import os
import re
# use nltk.download() to download the stopwords corpus if it is not available yet


# Iterable to be passed to the word2vec class as sentences:
# reads sentences one by one from a pickle dump.


# stops = set(stopwords.words('russian'))

stops = ['чтоб', 'между', 'какой', 'без', 'но', 'чуть', 'для', 'не', 'куда',
'себя', 'всего', 'даже', 'был', 'кто', 'уж', 'только', 'с', 'быть',
'теперь', 'много', 'по', 'надо', 'когда', 'этого',
'три', 'и', 'опять', 'или', 'под', 'более', 'эти', 'бы', 'чем',
'совсем', 'сам', 'раз', 'хоть', 'нибудь', 'него', 'уже', 'сейчас',
'никогда', 'о', 'ни', 'можно', 'ли', 'потому', 'тем', 'будто',
'в', 'перед', 'так', 'два', 'ничего', 'а', 'почти', 'может',
'было', 'эту', 'их', 'нет', 'впрочем', 'им', 'во', 'лучше',
'до', 'про', 'вот', 'после', 'что', 'зачем', 'иногда',
'ее', 'другой', 'больше', 'тоже', 'еще', 'от', 'у', 'потом', 'всю',
'над', 'этой', 'за', 'если', 'ж', 'там', 'есть',
'через', 'из', 'как', 'на', 'чтобы', 'такой', 'том',
'да', 'этом', 'хорошо', 'к', 'при', 'были', 'себе',
'чего', 'ней', 'то', 'вам', 'один', 'вдруг', 'со',
'тогда', 'будет', 'разве', 'нельзя', 'наконец', 'ведь', 'здесь',
'тот', 'какая', 'этот', 'же', 'где', 'ну', 'конечно',
'того', 'тут', 'была', 'всегда', 'свою', 'об', 'всех']

# futureStops = ['кто', 'что']


# Loop over the corpus and generate a pickle dump file that yields
# context pairs one by one.
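# NOTE (assumed input format, for illustration only): the parser below expects
# CoNLL-style dependency parses, one token per line with numbered fields, where
# by this code's indexing line[0] is the token ID within its sentence, line[2]
# is the lemma, and line[6] is the ID of the token's syntactic head.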

def createContext(root_directory):

pickleDump = open('/run/media/robert/1TB-1/linuxfolder/pythonworks/contDumpFinal', 'ab')
dumpCounter = 0

    # walk the corpus directory; each file is processed line by line
for root, dirs, files in os.walk(root_directory):
        for fname in filter(lambda fname: fname.endswith('.conll'), files):
            document = open(os.path.join(root, fname), 'r')
            print('Opened document ' + fname)

            wordCounter = -1
sentDict = {}
sentCash = []
for line in document:

                if len(line) < 5:
                    continue
                line = line.lower()
                line = line.split()
                # Build a cache dictionary for the current sentence

                wordCounter += 1
                if wordCounter < int(line[0]):
                    if re.match('[A-Za-zА-Яа-я]+$', line[2]) is not None:
                        sentDict.update({line[0]: {'word': line[2], 'ref': line[6]}})
                    else:
                        sentDict.update({line[0]: {'word': None, 'ref': line[6]}})


                else:
                    wordCounter = 0
                    # Create a sentence (context pair) to be passed to word2vec later
                    for slot in sentDict:
                        if sentDict[slot]['word'] is None:
                            continue
                        if sentDict[slot]['word'] in stops:
                            continue
                        sentCash.append(sentDict[slot]['word'])  # append the target word if it is okay
                        # look at the word that is higher in the dependency hierarchy
                        if (sentDict[slot]['ref'] != 0 and sentDict[slot]['ref'] != '0'):
                            wordRef = sentDict[slot]['ref']
                            refCounter = 0
                            while refCounter < 10:
                                refCounter += 1
                                # cycle through the chain of head words until a good word is found, for at most 10 tries
                                try:
                                    if sentDict[wordRef]['word'] is None or sentDict[wordRef]['word'] in stops:
                                        wordRef = sentDict[wordRef]['ref']
                                    else:
                                        refCounter = 10
                                        # append the head word we walked up to (the immediate head may have been a stopword)
                                        sentCash.append(sentDict[wordRef]['word'])
                                except KeyError:
                                    pass
                        # look at dependent words:
                        # cycle through all words in the sentence again
                        for slot2 in sentDict:
                            if sentDict[slot2]['ref'] == slot:
                                if sentDict[slot2]['word'] is not None:
                                    if re.match('[A-Za-zА-Яа-я]+$', sentDict[slot2]['word']) is not None:
                                        if sentDict[slot2]['word'] not in stops:
                                            # a good dependent word: keep it and stop here
                                            sentCash.append(sentDict[slot2]['word'])
                                if (sentDict[slot2]['word'] is None) or (sentDict[slot2]['word'] in stops):
                                    checkedSlot = slot2
                                    slotCounter = 0
                                    while slotCounter < 10:
                                        slotCounter += 1
                                        for slot3 in sentDict:
                                            if sentDict[slot3]['ref'] == checkedSlot:
                                                if (sentDict[slot3]['word'] is None) or (sentDict[slot3]['word'] in stops):
                                                    # a bad word: descend further down the dependency chain
                                                    checkedSlot = slot3
                                                    slotCounter += 1
                                                else:
                                                    # a good word: keep it and stop searching
                                                    sentCash.append(sentDict[slot3]['word'])
                                                    slotCounter = 10
                    # verify that no stopwords slipped through
                    sentCash = [w for w in sentCash if w not in stops]
                    if len(sentCash) > 1:
                        pickle.dump(sentCash, pickleDump)  # pickle the context pair to the dump file
                        dumpCounter += 1
                    sentCash = []
                    sentDict = {}
                    if re.match('[A-Za-zА-Яа-я]+$', line[2]) is not None:
                        sentDict.update({line[0]: {'word': line[2], 'ref': line[6]}})
                    else:
                        sentDict.update({line[0]: {'word': None, 'ref': line[6]}})
            document.close()

pickleDump.close()
    return dumpCounter
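The header comment of this module promises an iterable that feeds the dumped context pairs to word2vec as sentences, but no such class appears in the file. A minimal sketch of what it could look like, assuming the dump format produced by createContext above (the class name is an invention for illustration):

    class ContextPairs(object):
        """Iterate over context pairs pickled one by one by createContext()."""

        def __init__(self, dump_path):
            self.dump_path = dump_path

        def __iter__(self):
            with open(self.dump_path, 'rb') as dump:
                while True:
                    try:
                        yield pickle.load(dump)  # one context-pair list per dump() call
                    except EOFError:
                        return

    # usage: model = gensim.models.Word2Vec(ContextPairs('contDumpFinal'), ...)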
13 changes: 13 additions & 0 deletions syntax_model_files/gensim/models/compile_cython.py
@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 12 21:01:49 2016

@author: robert
"""

from distutils.core import setup
from Cython.Build import cythonize
import numpy


setup(ext_modules=cythonize('/run/media/robert/1TB-1/linuxfolder/gitlair/word2vec_syntax/gensim/models/word2vec_inner_in_works.pyx'), include_dirs=[numpy.get_include()])
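As a usage note, a distutils script like this one is normally run from the command line to build the extension in place, e.g. python compile_cython.py build_ext --inplace; the hardcoded path to the .pyx file would need to be adjusted to match the local checkout.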