Skip to content

Commit

Permalink
Browse files (browse the repository at this point in the history)
Resolve ReDoS opportunity by fixing incorrectly specified regex (#2906)
  • Loading branch information
tomaarsen committed Dec 8, 2021
1 parent 8ed8b70 commit 2a50a3e
Show file tree
Hide file tree
Showing 6 changed files with 20 additions and 20 deletions.
2 changes: 1 addition & 1 deletion nltk/parse/malt.py
Expand Up @@ -32,7 +32,7 @@ def malt_regex_tagger():
(r"\)$", ")"), # round brackets
(r"\[$", "["),
(r"\]$", "]"), # square brackets
- (r"^-?[0-9]+(.[0-9]+)?$", "CD"), # cardinal numbers
+ (r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers
(r"(The|the|A|a|An|an)$", "DT"), # articles
(r"(He|he|She|she|It|it|I|me|Me|You|you)$", "PRP"), # pronouns
(r"(His|his|Her|her|Its|its)$", "PRP$"), # possessive
Expand Down
2 changes: 1 addition & 1 deletion nltk/sem/glue.py
Expand Up @@ -703,7 +703,7 @@ def get_pos_tagger(self):

regexp_tagger = RegexpTagger(
[
- (r"^-?[0-9]+(.[0-9]+)?$", "CD"), # cardinal numbers
+ (r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers
(r"(The|the|A|a|An|an)$", "AT"), # articles
(r".*able$", "JJ"), # adjectives
(r".*ness$", "NN"), # nouns formed from adjectives
Expand Down
2 changes: 1 addition & 1 deletion nltk/tag/brill.py
Expand Up @@ -329,7 +329,7 @@ def print_train_stats():
)
print(
"TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} "
- "final: {finalerrors:5d} {finalacc:.4f} ".format(**train_stats)
+ "final: {finalerrors:5d} {finalacc:.4f}".format(**train_stats)
)
head = "#ID | Score (train) | #Rules | Template"
print(head, "\n", "-" * len(head), sep="")
Expand Down
22 changes: 11 additions & 11 deletions nltk/tag/brill_trainer.py
Expand Up @@ -91,7 +91,7 @@ def __init__(
# Training

def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
- """
+ r"""
Trains the Brill tagger on the corpus *train_sents*,
producing at most *max_rules* transformations, each of which
reduces the net number of errors in the corpus by at least
Expand All @@ -111,7 +111,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
>>> testing_data = [untag(s) for s in gold_data]
>>> backoff = RegexpTagger([
- ... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
+ ... (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers
... (r'(The|the|A|a|An|an)$', 'AT'), # articles
... (r'.*able$', 'JJ'), # adjectives
... (r'.*ness$', 'NN'), # nouns formed from adjectives
Expand All @@ -125,7 +125,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
>>> baseline = backoff #see NOTE1
>>> baseline.evaluate(gold_data) #doctest: +ELLIPSIS
- 0.2450142...
+ 0.2433862...
>>> # Set up templates
>>> Template._cleartemplates() #clear any templates created in earlier tests
Expand All @@ -137,7 +137,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
>>> tagger1 = tt.train(training_data, max_rules=10)
TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None)
Finding initial useful rules...
- Found 845 useful rules.
+ Found 847 useful rules.
<BLANKLINE>
B |
S F r O | Score = Fixed - Broken
Expand All @@ -150,7 +150,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
85 85 0 0 | NN->, if Pos:NN@[-1] & Word:,@[0]
69 69 0 0 | NN->. if Pos:NN@[-1] & Word:.@[0]
51 51 0 0 | NN->IN if Pos:NN@[-1] & Word:of@[0]
- 47 63 16 161 | NN->IN if Pos:NNS@[-1]
+ 47 63 16 162 | NN->IN if Pos:NNS@[-1]
33 33 0 0 | NN->TO if Pos:NN@[-1] & Word:to@[0]
26 26 0 0 | IN->. if Pos:NNS@[-1] & Word:.@[0]
24 24 0 0 | IN->, if Pos:NNS@[-1] & Word:,@[0]
Expand All @@ -162,11 +162,11 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
>>> train_stats = tagger1.train_stats()
>>> [train_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
- [1775, 1269, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]]
+ [1776, 1270, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]]
>>> tagger1.print_template_statistics(printunused=False)
TEMPLATE STATISTICS (TRAIN) 2 templates, 10 rules)
- TRAIN ( 2417 tokens) initial 1775 0.2656 final: 1269 0.4750
+ TRAIN ( 2417 tokens) initial 1776 0.2652 final: 1270 0.4746
#ID | Score (train) | #Rules | Template
--------------------------------------------
001 | 305 0.603 | 7 0.700 | Template(Pos([-1]),Word([0]))
Expand All @@ -175,7 +175,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
<BLANKLINE>
>>> tagger1.evaluate(gold_data) # doctest: +ELLIPSIS
- 0.43996...
+ 0.43833...
>>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data)
Expand All @@ -185,13 +185,13 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
True
>>> [test_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
- [1855, 1376, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]]
+ [1859, 1380, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]]
>>> # A high-accuracy tagger
>>> tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99)
TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: 0.99)
Finding initial useful rules...
- Found 845 useful rules.
+ Found 847 useful rules.
<BLANKLINE>
B |
S F r O | Score = Fixed - Broken
Expand All @@ -212,7 +212,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
18 18 0 0 | NN->CC if Pos:NN@[-1] & Word:and@[0]
>>> tagger2.evaluate(gold_data) # doctest: +ELLIPSIS
- 0.44159544...
+ 0.43996743...
>>> tagger2.rules()[2:4]
(Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]), Rule('001', 'NN', 'IN', [(Pos([-1]),'NN'), (Word([0]),'of')]))
Expand Down
8 changes: 4 additions & 4 deletions nltk/tag/sequential.py
Expand Up @@ -337,7 +337,7 @@ class UnigramTagger(NgramTagger):
>>> test_sent = brown.sents(categories='news')[0]
>>> unigram_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
>>> for tok, tag in unigram_tagger.tag(test_sent):
- ... print("({}, {}), ".format(tok, tag))
+ ... print("({}, {}), ".format(tok, tag)) # doctest: +NORMALIZE_WHITESPACE
(The, AT), (Fulton, NP-TL), (County, NN-TL), (Grand, JJ-TL),
(Jury, NN-TL), (said, VBD), (Friday, NR), (an, AT),
(investigation, NN), (of, IN), (Atlanta's, NP$), (recent, JJ),
Expand Down Expand Up @@ -491,7 +491,7 @@ def context(self, tokens, index, history):

@jsontags.register_tag
class RegexpTagger(SequentialBackoffTagger):
- """
+ r"""
Regular Expression Tagger
The RegexpTagger assigns tags to tokens by comparing their
Expand All @@ -503,7 +503,7 @@ class RegexpTagger(SequentialBackoffTagger):
>>> from nltk.tag import RegexpTagger
>>> test_sent = brown.sents(categories='news')[0]
>>> regexp_tagger = RegexpTagger(
- ... [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
+ ... [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers
... (r'(The|the|A|a|An|an)$', 'AT'), # articles
... (r'.*able$', 'JJ'), # adjectives
... (r'.*ness$', 'NN'), # nouns formed from adjectives
Expand All @@ -515,7 +515,7 @@ class RegexpTagger(SequentialBackoffTagger):
... ])
>>> regexp_tagger
<Regexp Tagger: size=9>
- >>> regexp_tagger.tag(test_sent)
+ >>> regexp_tagger.tag(test_sent) # doctest: +NORMALIZE_WHITESPACE
[('The', 'AT'), ('Fulton', 'NN'), ('County', 'NN'), ('Grand', 'NN'), ('Jury', 'NN'),
('said', 'NN'), ('Friday', 'NN'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'NN'),
("Atlanta's", 'NNS'), ('recent', 'NN'), ('primary', 'NN'), ('election', 'NN'),
Expand Down
4 changes: 2 additions & 2 deletions nltk/tbl/demo.py
Expand Up @@ -393,11 +393,11 @@ def _demo_plot(learning_curve_output, teststats, trainstats=None, take=None):
plt.savefig(learning_curve_output)


- NN_CD_TAGGER = RegexpTagger([(r"^-?[0-9]+(.[0-9]+)?$", "CD"), (r".*", "NN")])
+ NN_CD_TAGGER = RegexpTagger([(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), (r".*", "NN")])

REGEXP_TAGGER = RegexpTagger(
[
- (r"^-?[0-9]+(.[0-9]+)?$", "CD"), # cardinal numbers
+ (r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers
(r"(The|the|A|a|An|an)$", "AT"), # articles
(r".*able$", "JJ"), # adjectives
(r".*ness$", "NN"), # nouns formed from adjectives
Expand Down

0 comments on commit 2a50a3e

Please sign in to comment.