From 2ec6001b6127ef78fa582c688b7330cae2c2de74 Mon Sep 17 00:00:00 2001
From: Pierpaolo Pantone <24alsecondo@gmail.com>
Date: Fri, 19 Feb 2016 11:42:43 +0100
Subject: [PATCH 01/14] Fix imports in sentiment module

---
 nltk/sentiment/util.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/nltk/sentiment/util.py b/nltk/sentiment/util.py
index 009348a83d..37e1bc5b82 100644
--- a/nltk/sentiment/util.py
+++ b/nltk/sentiment/util.py
@@ -407,7 +407,7 @@ def demo_tweets(trainer, n_instances=None, output=None):
     :param output: the output file where results have to be reported.
     """
     from nltk.tokenize import TweetTokenizer
-    from sentiment_analyzer import SentimentAnalyzer
+    from nltk.sentiment import SentimentAnalyzer
     from nltk.corpus import twitter_samples, stopwords
 
     # Different customizations for the TweetTokenizer
@@ -484,7 +484,7 @@ def demo_movie_reviews(trainer, n_instances=None, output=None):
     :param output: the output file where results have to be reported.
     """
     from nltk.corpus import movie_reviews
-    from sentiment_analyzer import SentimentAnalyzer
+    from nltk.sentiment import SentimentAnalyzer
 
     if n_instances is not None:
         n_instances = int(n_instances/2)
@@ -536,7 +536,7 @@ def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=Non
         and negative.
     :param output: the output file where results have to be reported.
     """
-    from sentiment_analyzer import SentimentAnalyzer
+    from nltk.sentiment import SentimentAnalyzer
    from nltk.corpus import subjectivity
 
     if n_instances is not None:

From 46e924f213739b7ac9429e022798419709efd8c7 Mon Sep 17 00:00:00 2001
From: Pierpaolo Pantone <24alsecondo@gmail.com>
Date: Fri, 19 Feb 2016 11:50:40 +0100
Subject: [PATCH 02/14] Fix vader imports in sentiment module

---
 nltk/sentiment/util.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nltk/sentiment/util.py b/nltk/sentiment/util.py
index 37e1bc5b82..119e28245f 100644
--- a/nltk/sentiment/util.py
+++ b/nltk/sentiment/util.py
@@ -650,7 +650,7 @@ def demo_vader_instance(text):
 
     :param text: a text whose polarity has to be evaluated.
     """
-    from vader import SentimentIntensityAnalyzer
+    from nltk.sentiment import SentimentIntensityAnalyzer
     vader_analyzer = SentimentIntensityAnalyzer()
     print(vader_analyzer.polarity_scores(text))
 
@@ -663,7 +663,7 @@ def demo_vader_tweets(n_instances=None, output=None):
     """
     from collections import defaultdict
     from nltk.corpus import twitter_samples
-    from vader import SentimentIntensityAnalyzer
+    from nltk.sentiment import SentimentIntensityAnalyzer
     from nltk.metrics import (accuracy as eval_accuracy, precision as eval_precision,
         recall as eval_recall, f_measure as eval_f_measure)
 

From 2085bf8b4c834ad9456cf1debc70ce2e48306ea7 Mon Sep 17 00:00:00 2001
From: alvations
Date: Fri, 19 Feb 2016 16:41:34 +0100
Subject: [PATCH 03/14] Use os.path.split to handle file paths on all OSes

Currently, `jar.rpartition('/')[2]` assumes '/' as the path separator, so it
only works on Linux/Mac and breaks on Windows.
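For illustration only (not part of the patch), a quick check with a
hypothetical Windows-style jar path; `ntpath` is used to emulate what
`os.path.split` resolves to on Windows, so the snippet runs on any platform:

    >>> import ntpath
    >>> jar = r'C:\maltparser-1.8.1\lib\liblinear-1.8.jar'  # hypothetical path
    >>> jar.rpartition('/')[2] == jar  # no '/' in the path, nothing is stripped
    True
    >>> ntpath.split(jar)[1]  # what os.path.split does on Windows
    'liblinear-1.8.jar'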
---
 nltk/parse/malt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nltk/parse/malt.py b/nltk/parse/malt.py
index 41141fecf4..5cce984ae3 100644
--- a/nltk/parse/malt.py
+++ b/nltk/parse/malt.py
@@ -66,7 +66,7 @@ def find_maltparser(parser_dirname):
     # Checks that that the found directory contains all the necessary .jar
     malt_dependencies = ['','','']
     _malt_jars = set(find_jars_within_path(_malt_dir))
-    _jars = set(jar.rpartition('/')[2] for jar in _malt_jars)
+    _jars = set(os.path.split(jar)[1] for jar in _malt_jars)
     malt_dependencies = set(['log4j.jar', 'libsvm.jar', 'liblinear-1.8.jar'])
 
     assert malt_dependencies.issubset(_jars)

From 22585dfdc748dbcde74713dd3da9c93909acd792 Mon Sep 17 00:00:00 2001
From: alvations
Date: Mon, 22 Feb 2016 19:26:28 +0100
Subject: [PATCH 04/14] Use OS-specific classpath separator

See http://stackoverflow.com/questions/4528438/classpath-does-not-work-under-linux
---
 nltk/parse/malt.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/nltk/parse/malt.py b/nltk/parse/malt.py
index 5cce984ae3..5d440a203f 100644
--- a/nltk/parse/malt.py
+++ b/nltk/parse/malt.py
@@ -12,6 +12,7 @@
 from nltk.six import text_type
 
 import os
+import sys
 import tempfile
 import subprocess
 import inspect
@@ -215,8 +216,10 @@ def generate_malt_command(self, inputfilename, outputfilename=None, mode=None):
         """
 
         cmd = ['java']
-        cmd+= self.additional_java_args # Adds additional java arguments.
-        cmd+= ['-cp', ':'.join(self.malt_jars)] # Adds classpaths for jars
+        cmd+= self.additional_java_args # Adds additional java arguments
+        # Joins classpaths with ";" on Windows and with ":" on Linux/Mac
+        classpaths_separator = ';' if sys.platform.startswith('win') else ':'
+        cmd+= ['-cp', classpaths_separator.join(self.malt_jars)] # Adds classpaths for jars
         cmd+= ['org.maltparser.Malt'] # Adds the main function.
 
         # Adds the model file.

From 046bfb000a68d8348524095cf3d3689692c65f7b Mon Sep 17 00:00:00 2001
From: alvations
Date: Mon, 22 Feb 2016 19:36:49 +0100
Subject: [PATCH 05/14] Updated contributor info.

---
 nltk/parse/malt.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/nltk/parse/malt.py b/nltk/parse/malt.py
index 5d440a203f..8f3d1c971f 100644
--- a/nltk/parse/malt.py
+++ b/nltk/parse/malt.py
@@ -2,6 +2,7 @@
 # Natural Language Toolkit: Interface to MaltParser
 #
 # Author: Dan Garrette
+# Contributor: Liling Tan, Mustufain, osamamukhtar11
 #
 # Copyright (C) 2001-2015 NLTK Project
 # URL:

From 1bdab4acf682e4824d86cf136938bd4338c9ed6b Mon Sep 17 00:00:00 2001
From: alvations
Date: Wed, 24 Feb 2016 15:34:00 +0100
Subject: [PATCH 06/14] Fix implementation error in Method 3 smoothing

The smoothing should only replace the numerator of the precision (the match
count), not the n-gram precision as a whole, so `p_i.denominator` has to be
kept, i.e. `(1/2**k) / p_i.denominator`, which simplifies to
`1 / (2**k * p_i.denominator)`.
---
 nltk/translate/bleu_score.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nltk/translate/bleu_score.py b/nltk/translate/bleu_score.py
index 356ee82cdd..294774745b 100644
--- a/nltk/translate/bleu_score.py
+++ b/nltk/translate/bleu_score.py
@@ -491,7 +491,7 @@ def method3(self, p_n, *args, **kwargs):
         incvnt = 1 # From the mteval-v13a.pl, it's referred to as k.
         for i, p_i in enumerate(p_n):
             if p_i == 0:
-                p_n[i] = 1 / 2**incvnt
+                p_n[i] = 1 / (2**incvnt * p_i.denominator)
                 incvnt+=1
         return p_n
 

From ca52a97f171ce482658b7602bf6881f2e6a73342 Mon Sep 17 00:00:00 2001
From: alvations
Date: Thu, 25 Feb 2016 08:44:07 +0100
Subject: [PATCH 07/14] Fixing the ZipFile issue in Python 3.5

---
 nltk/data.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nltk/data.py b/nltk/data.py
index bf2497eb3e..d208a63293 100644
--- a/nltk/data.py
+++ b/nltk/data.py
@@ -978,8 +978,8 @@ def __init__(self, filename):
     def read(self, name):
         assert self.fp is None
         self.fp = open(self.filename, 'rb')
-        value = zipfile.ZipFile.read(self, name)
-        self.close()
+        with self.open(name) as zfin:
+            value = zfin.read()
         return value
 
     def write(self, *args, **kwargs):

From b73a8e3d889fe4a42b022161edb77c055dbe4655 Mon Sep 17 00:00:00 2001
From: alvations
Date: Thu, 25 Feb 2016 10:54:15 +0100
Subject: [PATCH 08/14] Enforce read-mode for ZipFile

---
 nltk/data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nltk/data.py b/nltk/data.py
index d208a63293..aab65d2511 100644
--- a/nltk/data.py
+++ b/nltk/data.py
@@ -978,7 +978,7 @@ def __init__(self, filename):
     def read(self, name):
         assert self.fp is None
         self.fp = open(self.filename, 'rb')
-        with self.open(name) as zfin:
+        with self.open(name, 'r') as zfin:
             value = zfin.read()
         return value
 

From 21532d5992366a8f0ce1e4eede71195b2529b004 Mon Sep 17 00:00:00 2001
From: alvations
Date: Thu, 25 Feb 2016 11:41:49 +0100
Subject: [PATCH 09/14] Reverting to old data.py

---
 nltk/data.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nltk/data.py b/nltk/data.py
index aab65d2511..bf2497eb3e 100644
--- a/nltk/data.py
+++ b/nltk/data.py
@@ -978,8 +978,8 @@ def __init__(self, filename):
     def read(self, name):
         assert self.fp is None
         self.fp = open(self.filename, 'rb')
-        with self.open(name, 'r') as zfin:
-            value = zfin.read()
+        value = zipfile.ZipFile.read(self, name)
+        self.close()
         return value
 
     def write(self, *args, **kwargs):

From b111ed721b6158a62f9f68b2a2d59b4ab00e02f9 Mon Sep 17 00:00:00 2001
From: alvations
Date: Thu, 25 Feb 2016 12:06:29 +0100
Subject: [PATCH 10/14] Added _fileRefCnt support to be Python 3.5+ compatible

---
 nltk/data.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/nltk/data.py b/nltk/data.py
index bf2497eb3e..2e4d7e6d36 100644
--- a/nltk/data.py
+++ b/nltk/data.py
@@ -974,11 +974,17 @@ def __init__(self, filename):
         zipfile.ZipFile.__init__(self, filename)
         assert self.filename == filename
         self.close()
+        # After closing a ZipFile object, _fileRefCnt needs to be cleared
+        # for the code to work on both Python 2 and 3.
+        self._fileRefCnt = 0
 
     def read(self, name):
         assert self.fp is None
         self.fp = open(self.filename, 'rb')
         value = zipfile.ZipFile.read(self, name)
+        # Ensure that _fileRefCnt is set, for compatibility with both Python 2 and 3.
+        # Since we only opened one file here, we add 1.
+        self._fileRefCnt += 1
         self.close()
         return value
 

From 6d7eeec50d9db67cb72ce9122cbfb1503cd135f1 Mon Sep 17 00:00:00 2001
From: alvations
Date: Thu, 25 Feb 2016 13:47:12 +0100
Subject: [PATCH 11/14] Resolves unclosed-file warning for .raw()

Resolves the unclosed-file warnings raised when PlaintextCorpusReader.raw()
is called.
---
 nltk/corpus/reader/plaintext.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/nltk/corpus/reader/plaintext.py b/nltk/corpus/reader/plaintext.py
index b12669f517..ad2b5328cc 100644
--- a/nltk/corpus/reader/plaintext.py
+++ b/nltk/corpus/reader/plaintext.py
@@ -71,7 +71,12 @@ def raw(self, fileids=None):
         """
         if fileids is None: fileids = self._fileids
         elif isinstance(fileids, string_types): fileids = [fileids]
-        return concat([self.open(f).read() for f in fileids])
+        raw_texts = []
+        for f in fileids:
+            _fin = self.open(f)
+            raw_texts.append(_fin.read())
+            _fin.close()
+        return concat(raw_texts)
 
     def words(self, fileids=None):
         """

From 8ff1c359a1114535442ce5367bbd04c5608efc71 Mon Sep 17 00:00:00 2001
From: alvations
Date: Thu, 25 Feb 2016 14:37:45 +0100
Subject: [PATCH 12/14] Enforce stream.close() after reaching EOF

Note that the stream will be reopened once it is accessed again.
---
 nltk/corpus/reader/util.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/nltk/corpus/reader/util.py b/nltk/corpus/reader/util.py
index bb31cb9bdd..d51ad04e7f 100644
--- a/nltk/corpus/reader/util.py
+++ b/nltk/corpus/reader/util.py
@@ -334,6 +334,9 @@ def iterate_from(self, start_tok):
 
         # If we reach this point, then we should know our length.
         assert self._len is not None
+        # Enforce closing of the stream once we reach end of file.
+        # We should have reached EOF once we're out of the while loop.
+        self.close()
 
 # Use concat for these, so we can use a ConcatenatedCorpusView
 # when possible.

From a488bb6c900c89ea4d2ca154627fa92093626332 Mon Sep 17 00:00:00 2001
From: alvations
Date: Fri, 26 Feb 2016 14:22:49 +0100
Subject: [PATCH 13/14] Fixes doctest for Python 3

Dictionary ordering is not deterministic in Python 3, so the __repr__ output
can differ from run to run. The doctest should check for value equality
instead.

Fixes issue on https://nltk.ci.cloudbees.com/job/nltk/TOXENV=py34-jenkins,jdk=jdk8latestOnlineInstall/lastCompletedBuild/testReport/nltk.tokenize.mwe/MWETokenizer/add_mwe/
---
 nltk/tokenize/mwe.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/nltk/tokenize/mwe.py b/nltk/tokenize/mwe.py
index 12b4797786..a09c5a6f9f 100644
--- a/nltk/tokenize/mwe.py
+++ b/nltk/tokenize/mwe.py
@@ -70,8 +70,9 @@ def add_mwe(self, mwe):
         >>> tokenizer.add_mwe(('a', 'b'))
         >>> tokenizer.add_mwe(('a', 'b', 'c'))
         >>> tokenizer.add_mwe(('a', 'x'))
-        >>> tokenizer._mwes.as_dict()
-        {'a': {'x': {True: None}, 'b': {True: None, 'c': {True: None}}}}
+        >>> expected = {'a': {'x': {True: None}, 'b': {True: None, 'c': {True: None}}}}
+        >>> tokenizer._mwes.as_dict() == expected
+        True
 
         """
         self._mwes.insert(mwe)

From 80b3b1dc6c146d3ee81a8bfc045005c9e308168c Mon Sep 17 00:00:00 2001
From: alvations
Date: Sat, 27 Feb 2016 00:27:50 +0100
Subject: [PATCH 14/14] Fixes doctest for Python 3

Dictionary ordering is not deterministic in Python 3, so the __repr__ output
can differ from run to run. The doctest should check for value equality
instead. Monotonic nested defaultdict/dictionary outputs should not be
affected, though.
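For illustration only (not part of the commit), the nested dict from the
MWETokenizer doctest above shows the difference: dict equality ignores key
order, while the repr depends on it, and key order can change between
interpreter runs on the Python 3.4 used by the CI because of hash
randomization.

    >>> d = {'a': {'x': {True: None}, 'b': {True: None, 'c': {True: None}}}}
    >>> d == {'a': {'b': {True: None, 'c': {True: None}}, 'x': {True: None}}}
    True

So `repr(d)` may legitimately print 'x' before 'b' in one run and after it in
another, but the equality check always passes.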
Fixes issue on https://nltk.ci.cloudbees.com/job/nltk/TOXENV=py34-jenkins,jdk=jdk8latestOnlineInstall/lastCompletedBuild/testReport/nltk.util/Trie/as_dict/
(Same issue as https://github.com/nltk/nltk/commit/a488bb6c900c89ea4d2ca154627fa92093626332)

@stevenbird Sorry, I selected the wrong radio button in the previous commit
when fixing
https://github.com/nltk/nltk/commit/a488bb6c900c89ea4d2ca154627fa92093626332
---
 nltk/util.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/nltk/util.py b/nltk/util.py
index 2af0fb05d7..d04544b8e4 100644
--- a/nltk/util.py
+++ b/nltk/util.py
@@ -1332,8 +1332,9 @@ def as_dict(self, d=None):
 
         >>> from nltk.util import Trie
        >>> trie = Trie(["abc", "def"])
-        >>> trie.as_dict()
-        {'a': {'b': {'c': {True: None}}}, 'd': {'e': {'f': {True: None}}}}
+        >>> expected = {'a': {'b': {'c': {True: None}}}, 'd': {'e': {'f': {True: None}}}}
+        >>> trie.as_dict() == expected
+        True
 
         """
         def _default_to_regular(d):