Merge pull request #67 from nltk/develop
Syncing with bleeding edge
alvations committed Apr 12, 2017
2 parents cb26e5f + 472382b commit dd28a0d
Showing 14 changed files with 53 additions and 61 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -42,3 +42,6 @@ web/api/*.rst
# iPython notebooks

.ipynb_checkpoints

# pyenv files
.python-version
8 changes: 4 additions & 4 deletions jenkins.sh
@@ -24,21 +24,21 @@ if [[ ! -d ${stanford_corenlp_package_name} ]]; then
ln -s ${stanford_corenlp_package_name} 'stanford-corenlp'
fi

stanford_parser_package_zip_name=$(curl -s 'http://nlp.stanford.edu/software/lex-parser.shtml' | grep -o 'stanford-parser-full-.*\.zip' | head -n1)
stanford_parser_package_zip_name=$(curl -s 'https://nlp.stanford.edu/software/lex-parser.shtml' | grep -o 'stanford-parser-full-.*\.zip' | head -n1)
[[ ${stanford_parser_package_zip_name} =~ (.+)\.zip ]]
stanford_parser_package_name=${BASH_REMATCH[1]}
if [[ ! -d ${stanford_parser_package_name} ]]; then
wget -nv "http://nlp.stanford.edu/software/$stanford_parser_package_zip_name"
wget -nv "https://nlp.stanford.edu/software/$stanford_parser_package_zip_name"
unzip ${stanford_parser_package_zip_name}
rm ${stanford_parser_package_zip_name}
ln -s ${stanford_parser_package_name} 'stanford-parser'
fi

stanford_tagger_package_zip_name=$(curl -s 'http://nlp.stanford.edu/software/tagger.shtml' | grep -o 'stanford-postagger-full-.*\.zip' | head -n1)
stanford_tagger_package_zip_name=$(curl -s 'https://nlp.stanford.edu/software/tagger.shtml' | grep -o 'stanford-postagger-full-.*\.zip' | head -n1)
[[ ${stanford_tagger_package_zip_name} =~ (.+)\.zip ]]
stanford_tagger_package_name=${BASH_REMATCH[1]}
if [[ ! -d ${stanford_tagger_package_name} ]]; then
wget -nv "http://nlp.stanford.edu/software/$stanford_tagger_package_zip_name"
wget -nv "https://nlp.stanford.edu/software/$stanford_tagger_package_zip_name"
unzip ${stanford_tagger_package_zip_name}
rm ${stanford_tagger_package_zip_name}
ln -s ${stanford_tagger_package_name} 'stanford-postagger'
15 changes: 15 additions & 0 deletions nltk/grammar.py
@@ -179,6 +179,21 @@ def __div__(self, rhs):
"""
return Nonterminal('%s/%s' % (self._symbol, rhs._symbol))


def __truediv__(self, rhs):
"""
Return a new nonterminal whose symbol is ``A/B``, where ``A`` is
the symbol for this nonterminal, and ``B`` is the symbol for rhs.
This function allows use of the slash ``/`` operator with
the future import of division.
:param rhs: The nonterminal used to form the right hand side
of the new nonterminal.
:type rhs: Nonterminal
:rtype: Nonterminal
"""
return self.__div__(rhs)

def nonterminals(symbols):
"""
Given a string containing a list of symbol names, return a list of
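For illustration, the new ``__truediv__`` simply delegates to ``__div__``, so the slash operator on nonterminals behaves the same on Python 3 as it does under ``from __future__ import division``. A minimal doctest-style sketch (assuming an installed NLTK):

>>> from nltk.grammar import Nonterminal, nonterminals
>>> S, NP = nonterminals('S, NP')
>>> (S / NP).symbol()
'S/NP'
>>> isinstance(S / NP, Nonterminal)
True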
2 changes: 1 addition & 1 deletion nltk/parse/stanford.py
@@ -23,7 +23,7 @@
from nltk.parse.dependencygraph import DependencyGraph
from nltk.tree import Tree

_stanford_url = 'http://nlp.stanford.edu/software/lex-parser.shtml'
_stanford_url = 'https://nlp.stanford.edu/software/lex-parser.shtml'

class GenericStanfordParser(ParserI):
"""Interface to the Stanford Parser"""
2 changes: 1 addition & 1 deletion nltk/stem/lancaster.py
@@ -6,7 +6,7 @@
# For license information, see LICENSE.TXT

"""
A word stemmer based on the Lancaster stemming algorithm.
A word stemmer based on the Lancaster (Paice/Husk) stemming algorithm.
Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
"""
from __future__ import unicode_literals
2 changes: 0 additions & 2 deletions nltk/tag/__init__.py
@@ -86,8 +86,6 @@ def _get_tagger(lang=None):
tagger = PerceptronTagger(False)
ap_russian_model_loc = 'file:' + str(find(RUS_PICKLE))
tagger.load(ap_russian_model_loc)
elif lang == 'eng':
tagger = PerceptronTagger()
else:
tagger = PerceptronTagger()
return tagger
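The deleted ``elif lang == 'eng'`` branch constructed the same default ``PerceptronTagger()`` as the ``else`` branch, so English tagging is unaffected. A typical call still looks like this (a sketch; it requires the averaged perceptron tagger data to be downloaded):

from nltk import pos_tag
# Uses the default English PerceptronTagger and returns a list of (token, tag) pairs.
tagged = pos_tag(['This', 'is', 'a', 'test'])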
4 changes: 2 additions & 2 deletions nltk/tag/perceptron.py
@@ -228,7 +228,7 @@ def normalize(self, word):
'''
Normalization used in pre-processing.
- All words are lower cased
- Digits in the range 1800-2100 are represented as !YEAR;
- Groups of digits of length 4 are represented as !YEAR;
- Other digits are represented as !DIGITS
:rtype: str
@@ -244,7 +244,7 @@ def normalize(self, word):

def _get_features(self, i, word, context, prev, prev2):
'''Map tokens into a feature representation, implemented as a
{hashable: float} dict. If the features change, a new model must be
{hashable: int} dict. If the features change, a new model must be
trained.
'''
def add(name, *args):
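The reworded docstring reflects what the rule actually does: only all-digit tokens of length 4 become !YEAR, and any other token starting with a digit becomes !DIGITS. A rough sketch of such a rule (an illustration of the documented behaviour, not a copy of the NLTK source):

def normalize_sketch(word):
    # All-digit tokens of length 4 ('1984', '2017', ...) map to the !YEAR tag.
    if word.isdigit() and len(word) == 4:
        return '!YEAR'
    # Any other token that starts with a digit maps to the generic !DIGITS tag.
    if word and word[0].isdigit():
        return '!DIGITS'
    # Everything else is simply lower-cased.
    return word.lower()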
13 changes: 11 additions & 2 deletions nltk/tag/stanford.py
@@ -10,7 +10,7 @@
"""
A module for interfacing with the Stanford taggers.
Tagger models need to be downloaded from http://nlp.stanford.edu/software
Tagger models need to be downloaded from https://nlp.stanford.edu/software
and the STANFORD_MODELS environment variable set (a colon-separated
list of paths).
@@ -26,7 +26,7 @@
from nltk.tag.api import TaggerI
from nltk import compat

_stanford_url = 'http://nlp.stanford.edu/software'
_stanford_url = 'https://nlp.stanford.edu/software'

class StanfordTagger(TaggerI):
"""
@@ -188,3 +188,12 @@ def parse_output(self, text, sentences):
return result

raise NotImplementedError

def setup_module(module):
from nose import SkipTest

try:
StanfordPOSTagger('english-bidirectional-distsim.tagger')
except LookupError:
raise SkipTest('Doctests from nltk.tag.stanford are skipped because one \
of the stanford jars cannot be found.')
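The new ``setup_module`` hook makes nose skip this module's doctests when the Stanford jars cannot be found. Those doctests exercise the tagger roughly as follows (a sketch: the model name is the standard English model, and the jar and model locations must be resolvable via CLASSPATH and STANFORD_MODELS):

>>> from nltk.tag import StanfordPOSTagger
>>> st = StanfordPOSTagger('english-bidirectional-distsim.tagger')  # doctest: +SKIP
>>> st.tag('What is the airspeed of an unladen swallow ?'.split())  # doctest: +SKIP

``tag`` returns one ``(token, tag)`` pair per input token.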
2 changes: 1 addition & 1 deletion nltk/test/metrics.doctest
@@ -239,7 +239,7 @@ For other associations, we ensure the ordering of the measures:
True
>>> bam.dice(20, (42, 20), N) > bam.dice(20, (41, 27), N)
True
>>> bam.fisher(20, (42, 20), N) > bam.fisher(20, (41, 27), N)
>>> bam.fisher(20, (42, 20), N) > bam.fisher(20, (41, 27), N) # doctest: +SKIP
False

For trigrams, we have to provide more count information:
4 changes: 2 additions & 2 deletions nltk/tokenize/moses.py
@@ -248,10 +248,10 @@ def restore_multidots(self, text):
return re.sub(r'DOTMULTI', r'.', text)

def islower(self, text):
return not set(text).difference(set(IsLower))
return not set(text).difference(set(self.IsLower))

def isalpha(self, text):
return not set(text).difference(set(IsAlpha))
return not set(text).difference(set(self.IsAlpha))

def has_numeric_only(self, text):
return bool(re.search(r'(.*)[\s]+(\#NUMERIC_ONLY\#)', text))
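The fix only qualifies the character-class constants with ``self.``; the membership test itself is unchanged: a string passes when its set difference against the allowed character set is empty. For example, using ASCII lowercase as a stand-in for the tokenizer's ``IsLower`` class:

>>> import string
>>> not set('abc').difference(set(string.ascii_lowercase))
True
>>> not set('abC').difference(set(string.ascii_lowercase))
False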
2 changes: 1 addition & 1 deletion nltk/tokenize/stanford.py
@@ -19,7 +19,7 @@

from nltk.tokenize.api import TokenizerI

_stanford_url = 'http://nlp.stanford.edu/software/tokenizer.shtml'
_stanford_url = 'https://nlp.stanford.edu/software/tokenizer.shtml'

class StanfordTokenizer(TokenizerI):
r"""
2 changes: 1 addition & 1 deletion nltk/tokenize/stanford_segmenter.py
@@ -21,7 +21,7 @@

from nltk.tokenize.api import TokenizerI

_stanford_url = 'http://nlp.stanford.edu/software'
_stanford_url = 'https://nlp.stanford.edu/software'

class StanfordSegmenter(TokenizerI):
r"""
9 changes: 5 additions & 4 deletions nltk/translate/gleu_score.py
@@ -46,7 +46,8 @@ def sentence_gleu(reference, hypothesis, min_len=1, max_len=4):
sentence reward objective."
Note: The GLEU score is designed for sentence based evaluation thus there is
no corpus based scores implemented in NLTK.
no corpus based scores implemented in NLTK. Also, unlike
multi-reference BLEU, GLEU only supports a single reference.
The infamous "the the the ... " example
@@ -68,15 +69,15 @@ def sentence_gleu(reference, hypothesis, min_len=1, max_len=4):
>>> sentence_gleu(ref1, hyp2) # doctest: +ELLIPSIS
0.1206...
:param references: reference sentence
:type references: list(str)
:param reference: a reference sentence
:type reference: list(str)
:param hypothesis: a hypothesis sentence
:type hypothesis: list(str)
:param min_len: The minimum order of n-gram this function should extract.
:type min_len: int
:param max_len: The maximum order of n-gram this function should extract.
:type max_len: int
:return: the sentence level CHRF score.
:return: the sentence level GLEU score.
:rtype: float
"""
# For each order of ngram, calculate the no. of ngram matches and
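As the corrected docstring notes, the scorer takes one tokenised reference and one tokenised hypothesis and returns a sentence-level score between 0.0 and 1.0. A minimal usage sketch:

from nltk.translate.gleu_score import sentence_gleu

reference = 'the cat is on the mat'.split()    # a single tokenised reference sentence
hypothesis = 'the cat sat on the mat'.split()  # the tokenised hypothesis
score = sentence_gleu(reference, hypothesis)   # sentence-level GLEU in [0.0, 1.0]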
46 changes: 6 additions & 40 deletions tox.ini
@@ -1,5 +1,9 @@
[tox]
envlist = py27,py34,py35,pypy,py27-nodeps,py34-nodeps,py35-nodeps,py27-jenkins,py34-jenkins,py35-jenkins
envlist =
py{27,34,35}
pypy
py{27,34,35}-nodeps
py{27,34,35}-jenkins

[testenv]
; simplify numpy installation
@@ -25,8 +29,8 @@ changedir = nltk/test
commands =
; scipy and scikit-learn requires numpy even to run setup.py so
; they can't be installed in one command

pip install scipy scikit-learn

; python runtests.py --with-coverage --cover-inclusive --cover-package=nltk --cover-html --cover-html-dir={envdir}/docs []
python runtests.py []

@@ -40,44 +44,6 @@ deps =
commands =
python runtests.py []

[testenv:py34]
deps =
numpy
nose >= 1.2.1
coverage
text-unidecode
twython
pyparsing
python-crfsuite
rednose

commands =
; scipy and scikit-learn requires numpy even to run setup.py so
; they can't be installed in one command
pip install scipy scikit-learn

; python runtests.py --with-coverage --cover-inclusive --cover-package=nltk --cover-html --cover-html-dir={envdir}/docs []
python runtests.py []

[testenv:py35]
deps =
numpy
nose >= 1.2.1
coverage
text-unidecode
twython
pyparsing
python-crfsuite
rednose

commands =
; scipy and scikit-learn requires numpy even to run setup.py so
; they can't be installed in one command
pip install scipy scikit-learn

; python runtests.py --with-coverage --cover-inclusive --cover-package=nltk --cover-html --cover-html-dir={envdir}/docs []
python runtests.py []

[testenv:py27-nodeps]
basepython = python2.7
deps =
