Merge pull request #67 from nltk/develop
Syncing with bleeding edge
alvations committed Apr 12, 2017
2 parents cb26e5f + 472382b commit dd28a0d
Showing 14 changed files with 53 additions and 61 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -42,3 +42,6 @@ web/api/*.rst
# iPython notebooks

.ipynb_checkpoints

# pyenv files
.python-version
8 changes: 4 additions & 4 deletions jenkins.sh
@@ -24,21 +24,21 @@ if [[ ! -d ${stanford_corenlp_package_name} ]]; then
ln -s ${stanford_corenlp_package_name} 'stanford-corenlp'
fi

stanford_parser_package_zip_name=$(curl -s 'http://nlp.stanford.edu/software/lex-parser.shtml' | grep -o 'stanford-parser-full-.*\.zip' | head -n1)
stanford_parser_package_zip_name=$(curl -s 'https://nlp.stanford.edu/software/lex-parser.shtml' | grep -o 'stanford-parser-full-.*\.zip' | head -n1)
[[ ${stanford_parser_package_zip_name} =~ (.+)\.zip ]]
stanford_parser_package_name=${BASH_REMATCH[1]}
if [[ ! -d ${stanford_parser_package_name} ]]; then
wget -nv "http://nlp.stanford.edu/software/$stanford_parser_package_zip_name"
wget -nv "https://nlp.stanford.edu/software/$stanford_parser_package_zip_name"
unzip ${stanford_parser_package_zip_name}
rm ${stanford_parser_package_zip_name}
ln -s ${stanford_parser_package_name} 'stanford-parser'
fi

stanford_tagger_package_zip_name=$(curl -s 'http://nlp.stanford.edu/software/tagger.shtml' | grep -o 'stanford-postagger-full-.*\.zip' | head -n1)
stanford_tagger_package_zip_name=$(curl -s 'https://nlp.stanford.edu/software/tagger.shtml' | grep -o 'stanford-postagger-full-.*\.zip' | head -n1)
[[ ${stanford_tagger_package_zip_name} =~ (.+)\.zip ]]
stanford_tagger_package_name=${BASH_REMATCH[1]}
if [[ ! -d ${stanford_tagger_package_name} ]]; then
wget -nv "http://nlp.stanford.edu/software/$stanford_tagger_package_zip_name"
wget -nv "https://nlp.stanford.edu/software/$stanford_tagger_package_zip_name"
unzip ${stanford_tagger_package_zip_name}
rm ${stanford_tagger_package_zip_name}
ln -s ${stanford_tagger_package_name} 'stanford-postagger'
15 changes: 15 additions & 0 deletions nltk/grammar.py
@@ -179,6 +179,21 @@ def __div__(self, rhs):
"""
return Nonterminal('%s/%s' % (self._symbol, rhs._symbol))


def __truediv__(self, rhs):
"""
Return a new nonterminal whose symbol is ``A/B``, where ``A`` is
the symbol for this nonterminal, and ``B`` is the symbol for rhs.
This function allows use of the slash ``/`` operator with
the future import of division.
:param rhs: The nonterminal used to form the right hand side
of the new nonterminal.
:type rhs: Nonterminal
:rtype: Nonterminal
"""
return self.__div__(rhs)

def nonterminals(symbols):
"""
Given a string containing a list of symbol names, return a list of
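For illustration, the new ``__truediv__`` simply delegates to ``__div__``, so the slash operator on nonterminals behaves the same on Python 3 as it does under ``from __future__ import division``. A minimal doctest-style sketch (assuming an installed NLTK):

>>> from nltk.grammar import Nonterminal, nonterminals
>>> S, NP = nonterminals('S, NP')
>>> (S / NP).symbol()
'S/NP'
>>> isinstance(S / NP, Nonterminal)
True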
2 changes: 1 addition & 1 deletion nltk/parse/stanford.py
@@ -23,7 +23,7 @@
from nltk.parse.dependencygraph import DependencyGraph
from nltk.tree import Tree

_stanford_url = 'http://nlp.stanford.edu/software/lex-parser.shtml'
_stanford_url = 'https://nlp.stanford.edu/software/lex-parser.shtml'

class GenericStanfordParser(ParserI):
"""Interface to the Stanford Parser"""
2 changes: 1 addition & 1 deletion nltk/stem/lancaster.py
@@ -6,7 +6,7 @@
# For license information, see LICENSE.TXT

"""
A word stemmer based on the Lancaster stemming algorithm.
A word stemmer based on the Lancaster (Paice/Husk) stemming algorithm.
Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
"""
from __future__ import unicode_literals
2 changes: 0 additions & 2 deletions nltk/tag/__init__.py
@@ -86,8 +86,6 @@ def _get_tagger(lang=None):
tagger = PerceptronTagger(False)
ap_russian_model_loc = 'file:' + str(find(RUS_PICKLE))
tagger.load(ap_russian_model_loc)
elif lang == 'eng':
tagger = PerceptronTagger()
else:
tagger = PerceptronTagger()
return tagger
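The deleted ``elif lang == 'eng'`` branch constructed the same default ``PerceptronTagger()`` as the ``else`` branch, so English tagging is unaffected. A typical call still looks like this (a sketch; it requires the averaged perceptron tagger data to be downloaded):

from nltk import pos_tag
# Uses the default English PerceptronTagger and returns a list of (token, tag) pairs.
tagged = pos_tag(['This', 'is', 'a', 'test'])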
4 changes: 2 additions & 2 deletions nltk/tag/perceptron.py
@@ -228,7 +228,7 @@ def normalize(self, word):
'''
Normalization used in pre-processing.
- All words are lower cased
- Digits in the range 1800-2100 are represented as !YEAR;
- Groups of digits of length 4 are represented as !YEAR;
- Other digits are represented as !DIGITS
:rtype: str
@@ -244,7 +244,7 @@ def normalize(self, word):

def _get_features(self, i, word, context, prev, prev2):
'''Map tokens into a feature representation, implemented as a
{hashable: float} dict. If the features change, a new model must be
{hashable: int} dict. If the features change, a new model must be
trained.
'''
def add(name, *args):
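The reworded docstring reflects what the rule actually does: only all-digit tokens of length 4 become !YEAR, and any other token starting with a digit becomes !DIGITS. A rough sketch of such a rule (an illustration of the documented behaviour, not a copy of the NLTK source):

def normalize_sketch(word):
    # All-digit tokens of length 4 ('1984', '2017', ...) map to the !YEAR tag.
    if word.isdigit() and len(word) == 4:
        return '!YEAR'
    # Any other token that starts with a digit maps to the generic !DIGITS tag.
    if word and word[0].isdigit():
        return '!DIGITS'
    # Everything else is simply lower-cased.
    return word.lower()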
13 changes: 11 additions & 2 deletions nltk/tag/stanford.py
@@ -10,7 +10,7 @@
"""
A module for interfacing with the Stanford taggers.
Tagger models need to be downloaded from http://nlp.stanford.edu/software
Tagger models need to be downloaded from https://nlp.stanford.edu/software
and the STANFORD_MODELS environment variable set (a colon-separated
list of paths).
@@ -26,7 +26,7 @@
from nltk.tag.api import TaggerI
from nltk import compat

_stanford_url = 'http://nlp.stanford.edu/software'
_stanford_url = 'https://nlp.stanford.edu/software'

class StanfordTagger(TaggerI):
"""
@@ -188,3 +188,12 @@ def parse_output(self, text, sentences):
return result

raise NotImplementedError

def setup_module(module):
from nose import SkipTest

try:
StanfordPOSTagger('english-bidirectional-distsim.tagger')
except LookupError:
raise SkipTest('Doctests from nltk.tag.stanford are skipped because one \
of the stanford jars cannot be found.')
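The new ``setup_module`` hook makes nose skip this module's doctests when the Stanford jars cannot be found. Those doctests exercise the tagger roughly as follows (a sketch: the model name is the standard English model, and the jar and model locations must be resolvable via CLASSPATH and STANFORD_MODELS):

>>> from nltk.tag import StanfordPOSTagger
>>> st = StanfordPOSTagger('english-bidirectional-distsim.tagger')  # doctest: +SKIP
>>> st.tag('What is the airspeed of an unladen swallow ?'.split())  # doctest: +SKIP

``tag`` returns one ``(token, tag)`` pair per input token.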
2 changes: 1 addition & 1 deletion nltk/test/metrics.doctest
@@ -239,7 +239,7 @@ For other associations, we ensure the ordering of the measures:
True
>>> bam.dice(20, (42, 20), N) > bam.dice(20, (41, 27), N)
True
>>> bam.fisher(20, (42, 20), N) > bam.fisher(20, (41, 27), N)
>>> bam.fisher(20, (42, 20), N) > bam.fisher(20, (41, 27), N) # doctest: +SKIP
False

For trigrams, we have to provide more count information:
4 changes: 2 additions & 2 deletions nltk/tokenize/moses.py
@@ -248,10 +248,10 @@ def restore_multidots(self, text):
return re.sub(r'DOTMULTI', r'.', text)

def islower(self, text):
return not set(text).difference(set(IsLower))
return not set(text).difference(set(self.IsLower))

def isalpha(self, text):
return not set(text).difference(set(IsAlpha))
return not set(text).difference(set(self.IsAlpha))

def has_numeric_only(self, text):
return bool(re.search(r'(.*)[\s]+(\#NUMERIC_ONLY\#)', text))
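The fix only qualifies the character-class constants with ``self.``; the membership test itself is unchanged: a string passes when its set difference against the allowed character set is empty. For example, using ASCII lowercase as a stand-in for the tokenizer's ``IsLower`` class:

>>> import string
>>> not set('abc').difference(set(string.ascii_lowercase))
True
>>> not set('abC').difference(set(string.ascii_lowercase))
False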
2 changes: 1 addition & 1 deletion nltk/tokenize/stanford.py
@@ -19,7 +19,7 @@

from nltk.tokenize.api import TokenizerI

_stanford_url = 'http://nlp.stanford.edu/software/tokenizer.shtml'
_stanford_url = 'https://nlp.stanford.edu/software/tokenizer.shtml'

class StanfordTokenizer(TokenizerI):
r"""
2 changes: 1 addition & 1 deletion nltk/tokenize/stanford_segmenter.py
@@ -21,7 +21,7 @@

from nltk.tokenize.api import TokenizerI

_stanford_url = 'http://nlp.stanford.edu/software'
_stanford_url = 'https://nlp.stanford.edu/software'

class StanfordSegmenter(TokenizerI):
r"""
9 changes: 5 additions & 4 deletions nltk/translate/gleu_score.py
@@ -46,7 +46,8 @@ def sentence_gleu(reference, hypothesis, min_len=1, max_len=4):
sentence reward objective."
Note: The GLEU score is designed for sentence based evaluation thus there is
no corpus based scores implemented in NLTK.
no corpus based scores implemented in NLTK. Also, unlike
multi-reference BLEU, GLEU only supports a single reference.
The infamous "the the the ... " example
@@ -68,15 +69,15 @@ def sentence_gleu(reference, hypothesis, min_len=1, max_len=4):
>>> sentence_gleu(ref1, hyp2) # doctest: +ELLIPSIS
0.1206...
:param references: reference sentence
:type references: list(str)
:param reference: a reference sentence
:type reference: list(str)
:param hypothesis: a hypothesis sentence
:type hypothesis: list(str)
:param min_len: The minimum order of n-gram this function should extract.
:type min_len: int
:param max_len: The maximum order of n-gram this function should extract.
:type max_len: int
:return: the sentence level CHRF score.
:return: the sentence level GLEU score.
:rtype: float
"""
# For each order of ngram, calculate the no. of ngram matches and
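As the corrected docstring notes, the scorer takes one tokenised reference and one tokenised hypothesis and returns a sentence-level score between 0.0 and 1.0. A minimal usage sketch:

from nltk.translate.gleu_score import sentence_gleu

reference = 'the cat is on the mat'.split()    # a single tokenised reference sentence
hypothesis = 'the cat sat on the mat'.split()  # the tokenised hypothesis
score = sentence_gleu(reference, hypothesis)   # sentence-level GLEU in [0.0, 1.0]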
46 changes: 6 additions & 40 deletions tox.ini
@@ -1,5 +1,9 @@
[tox]
envlist = py27,py34,py35,pypy,py27-nodeps,py34-nodeps,py35-nodeps,py27-jenkins,py34-jenkins,py35-jenkins
envlist =
py{27,34,35}
pypy
py{27,34,35}-nodeps
py{27,34,35}-jenkins

[testenv]
; simplify numpy installation
@@ -25,8 +29,8 @@ changedir = nltk/test
commands =
; scipy and scikit-learn requires numpy even to run setup.py so
; they can't be installed in one command

pip install scipy scikit-learn

; python runtests.py --with-coverage --cover-inclusive --cover-package=nltk --cover-html --cover-html-dir={envdir}/docs []
python runtests.py []

@@ -40,44 +44,6 @@ deps =
commands =
python runtests.py []

[testenv:py34]
deps =
numpy
nose >= 1.2.1
coverage
text-unidecode
twython
pyparsing
python-crfsuite
rednose

commands =
; scipy and scikit-learn requires numpy even to run setup.py so
; they can't be installed in one command
pip install scipy scikit-learn

; python runtests.py --with-coverage --cover-inclusive --cover-package=nltk --cover-html --cover-html-dir={envdir}/docs []
python runtests.py []

[testenv:py35]
deps =
numpy
nose >= 1.2.1
coverage
text-unidecode
twython
pyparsing
python-crfsuite
rednose

commands =
; scipy and scikit-learn requires numpy even to run setup.py so
; they can't be installed in one command
pip install scipy scikit-learn

; python runtests.py --with-coverage --cover-inclusive --cover-package=nltk --cover-html --cover-html-dir={envdir}/docs []
python runtests.py []

[testenv:py27-nodeps]
basepython = python2.7
deps =
