Skip to content

Commit

Permalink
align parser methods with updated API, incorporate read_cfg into Cont…
Browse files Browse the repository at this point in the history
…extFreeGrammar.read, and same for read_pcfg and read_fcfg
  • Loading branch information
stevenbird committed Apr 22, 2014
1 parent a580803 commit 52f017f
Show file tree
Hide file tree
Showing 17 changed files with 146 additions and 137 deletions.
12 changes: 6 additions & 6 deletions nltk/data.py
Expand Up @@ -661,9 +661,9 @@ def retrieve(resource_url, filename=None, verbose=True):
'pickle': "A serialized python object, stored using the pickle module.",
'json': "A serialized python object, stored using the json module.",
'yaml': "A serialized python object, stored using the yaml module.",
'cfg': "A context free grammar, parsed by nltk.read_cfg().",
'pcfg': "A probabilistic CFG, parsed by nltk.read_pcfg().",
'fcfg': "A feature CFG, parsed by nltk.read_fcfg().",
'cfg': "A context free grammar.",
'pcfg': "A probabilistic CFG.",
'fcfg': "A feature CFG.",
'fol': "A list of first order logic expressions, parsed by "
"nltk.sem.parse_fol() using nltk.sem.logic.LogicParser.",
'logic': "A list of first order logic expressions, parsed by "
Expand Down Expand Up @@ -803,13 +803,13 @@ def load(resource_url, format='auto', cache=True, verbose=False,
if format == 'text':
resource_val = string_data
elif format == 'cfg':
resource_val = nltk.grammar.read_cfg(
resource_val = nltk.grammar.ContextFreeGrammar.read(
string_data, encoding=encoding)
elif format == 'pcfg':
resource_val = nltk.grammar.read_pcfg(
resource_val = nltk.grammar.WeightedGrammar.read(
string_data, encoding=encoding)
elif format == 'fcfg':
resource_val = nltk.grammar.read_fcfg(
resource_val = nltk.grammar.FeatureGrammar.read(
string_data, logic_parser=logic_parser,
fstruct_reader=fstruct_reader, encoding=encoding)
elif format == 'fol':
Expand Down
178 changes: 87 additions & 91 deletions nltk/grammar.py
Expand Up @@ -507,6 +507,16 @@ def _calculate_leftcorners(self):
for left in lefts:
lc.update(self._immediate_leftcorner_words.get(left, set()))

@classmethod
def read(cls, input, encoding=None):
    """
    Return the ``ContextFreeGrammar`` corresponding to the input string(s).

    :param input: a grammar, either in the form of a string or as a
        list of strings.
    :param encoding: the encoding of the input, if it is a byte string.
    :raise ValueError: if the input cannot be parsed as a grammar.
    """
    start, productions = read_grammar(input, standard_nonterm_parser,
                                      encoding=encoding)
    # Instantiate via ``cls`` (not a hardcoded class name) so that
    # subclasses inherit a working alternate constructor.
    return cls(start, productions)

def start(self):
"""
Expand Down Expand Up @@ -757,6 +767,35 @@ def _calculate_indexes(self):
if is_terminal(token):
self._lexical_index.setdefault(token, set()).add(prod)

@classmethod
def read(cls, input, features=None, logic_parser=None, fstruct_reader=None,
         encoding=None):
    """
    Return a feature structure based ``FeatureGrammar``.

    :param input: a grammar, either in the form of a string or else
        as a list of strings.
    :param features: a tuple of features (default: SLASH, TYPE)
    :param logic_parser: a parser for lambda-expressions,
        by default, ``LogicParser()``
    :param fstruct_reader: a feature structure parser
        (only if features and logic_parser is None)
    :param encoding: the encoding of the input, if it is a byte string.
    :raise Exception: if both ``logic_parser`` and ``fstruct_reader`` are given.
    """
    if features is None:
        features = (SLASH, TYPE)

    if fstruct_reader is None:
        fstruct_reader = FeatStructReader(features, FeatStructNonterminal,
                                          logic_parser=logic_parser)
    elif logic_parser is not None:
        # A supplied fstruct_reader already embeds its own logic parser;
        # accepting both would silently ignore one of them.
        raise Exception('\'logic_parser\' and \'fstruct_reader\' must '
                        'not both be set')

    start, productions = read_grammar(input, fstruct_reader.read_partial,
                                      encoding=encoding)
    # Instantiate via ``cls`` so subclasses inherit a working constructor.
    return cls(start, productions)


def productions(self, lhs=None, rhs=None, empty=False):
"""
Return the grammar productions, filtered by the left-hand side
Expand Down Expand Up @@ -865,6 +904,19 @@ def __init__(self, productions):
"""
self._productions = productions

@classmethod
def read(cls, input):
    """
    Return the ``DependencyGrammar`` corresponding to the input string.

    Blank lines and lines beginning with ``#`` are skipped.

    :param input: a grammar, as a string with one production per line.
    :raise ValueError: if a line cannot be parsed as a dependency
        production, or if no productions are found at all.
    """
    productions = []
    for linenum, line in enumerate(input.split('\n')):
        line = line.strip()
        if line.startswith('#') or line == '':
            continue
        try:
            productions += _read_dependency_production(line)
        except ValueError:
            # Report 1-based line numbers, consistent with read_grammar().
            raise ValueError('Unable to parse line %s: %s' % (linenum + 1, line))
    if len(productions) == 0:
        raise ValueError('No productions found!')
    # Instantiate via ``cls`` so subclasses inherit a working constructor.
    return cls(productions)

def contains(self, head, mod):
"""
:param head: A head word.
Expand Down Expand Up @@ -1031,6 +1083,20 @@ def __init__(self, start, productions, calculate_leftcorners=True):
raise ValueError("Productions for %r do not sum to 1" % lhs)


@classmethod
def read(cls, input, encoding=None):
    """
    Return a probabilistic ``WeightedGrammar`` corresponding to the
    input string(s).

    :param input: a grammar, either in the form of a string or else
        as a list of strings.
    :param encoding: the encoding of the input, if it is a byte string.
    :raise ValueError: if the input cannot be parsed as a grammar.
    """
    start, productions = read_grammar(input, standard_nonterm_parser,
                                      probabilistic=True, encoding=encoding)
    # Instantiate via ``cls`` so subclasses inherit a working constructor.
    return cls(start, productions)


#################################################################
# Inducing Grammars
#################################################################
Expand Down Expand Up @@ -1069,82 +1135,27 @@ def induce_pcfg(start, productions):


#################################################################
# Parsing Grammars
# Helper functions for reading productions
#################################################################

# Parsing CFGs

def read_cfg_production(input):
def _read_cfg_production(input):
"""
Return a list of context-free ``Productions``.
"""
return read_production(input, standard_nonterm_parser)

def read_cfg(input, encoding=None):
"""
Return the ``ContextFreeGrammar`` corresponding to the input string(s).
return _read_production(input, standard_nonterm_parser)

:param input: a grammar, either in the form of a string or
as a list of strings.
"""
start, productions = read_grammar(input, standard_nonterm_parser,
encoding=encoding)
return ContextFreeGrammar(start, productions)

# Parsing Probabilistic CFGs

def read_pcfg_production(input):
def _read_pcfg_production(input):
"""
Return a list of PCFG ``WeightedProductions``.
"""
return read_production(input, standard_nonterm_parser, probabilistic=True)
return _read_production(input, standard_nonterm_parser, probabilistic=True)

def read_pcfg(input, encoding=None):
"""
Return a probabilistic ``WeightedGrammar`` corresponding to the
input string(s).
:param input: a grammar, either in the form of a string or else
as a list of strings.
"""
start, productions = read_grammar(input, standard_nonterm_parser,
probabilistic=True, encoding=encoding)
return WeightedGrammar(start, productions)

# Parsing Feature-based CFGs

def read_fcfg_production(input, fstruct_reader):
def _read_fcfg_production(input, fstruct_reader):
"""
Return a list of feature-based ``Productions``.
"""
return read_production(input, fstruct_reader)
return _read_production(input, fstruct_reader)

def read_fcfg(input, features=None, logic_parser=None, fstruct_reader=None,
encoding=None):
"""
Return a feature structure based ``FeatureGrammar``.
:param input: a grammar, either in the form of a string or else
as a list of strings.
:param features: a tuple of features (default: SLASH, TYPE)
:param logic_parser: a parser for lambda-expressions,
by default, ``LogicParser()``
:param fstruct_reader: a feature structure parser
(only if features and logic_parser is None)
"""
if features is None:
features = (SLASH, TYPE)

if fstruct_reader is None:
fstruct_reader = FeatStructReader(features, FeatStructNonterminal,
logic_parser=logic_parser)
elif logic_parser is not None:
raise Exception('\'logic_parser\' and \'fstruct_reader\' must '
'not both be set')

start, productions = read_grammar(input, fstruct_reader.read_partial,
encoding=encoding)
return FeatureGrammar(start, productions)

# Parsing generic grammars

Expand All @@ -1153,7 +1164,7 @@ def read_fcfg(input, features=None, logic_parser=None, fstruct_reader=None,
_TERMINAL_RE = re.compile(r'( "[^"]+" | \'[^\']+\' ) \s*', re.VERBOSE)
_DISJUNCTION_RE = re.compile(r'\| \s*', re.VERBOSE)

def read_production(line, nonterm_parser, probabilistic=False):
def _read_production(line, nonterm_parser, probabilistic=False):
"""
Parse a grammar rule, given as a string, and return
a list of productions.
Expand Down Expand Up @@ -1208,6 +1219,10 @@ def read_production(line, nonterm_parser, probabilistic=False):
return [Production(lhs, rhs) for rhs in rhsides]


#################################################################
# Reading Phrase Structure Grammars
#################################################################

def read_grammar(input, nonterm_parser, probabilistic=False, encoding=None):
"""
Return a pair consisting of a starting category and a list of
Expand Down Expand Up @@ -1251,7 +1266,7 @@ def read_grammar(input, nonterm_parser, probabilistic=False, encoding=None):
raise ValueError('Bad directive')
else:
# expand out the disjunctions on the RHS
productions += read_production(line, nonterm_parser, probabilistic)
productions += _read_production(line, nonterm_parser, probabilistic)
except ValueError as e:
raise ValueError('Unable to parse line %s: %s\n%s' %
(linenum+1, line, e))
Expand All @@ -1272,7 +1287,7 @@ def standard_nonterm_parser(string, pos):


#################################################################
# Parsing Dependency Grammars
# Reading Dependency Grammars
#################################################################

_READ_DG_RE = re.compile(r'''^\s* # leading whitespace
Expand All @@ -1288,19 +1303,7 @@ def standard_nonterm_parser(string, pos):
re.VERBOSE)
_SPLIT_DG_RE = re.compile(r'''('[^']'|[-=]+>|"[^"]+"|'[^']+'|\|)''')

def read_dependency_grammar(s):
productions = []
for linenum, line in enumerate(s.split('\n')):
line = line.strip()
if line.startswith('#') or line=='': continue
try: productions += read_dependency_production(line)
except ValueError:
raise ValueError('Unable to parse line %s: %s' % (linenum, line))
if len(productions) == 0:
raise ValueError('No productions found!')
return DependencyGrammar(productions)

def read_dependency_production(s):
def _read_dependency_production(s):
if not _READ_DG_RE.match(s):
raise ValueError('Bad production string')
pieces = _SPLIT_DG_RE.split(s)
Expand All @@ -1324,7 +1327,7 @@ def cfg_demo():
A demonstration showing how ``ContextFreeGrammars`` can be created and used.
"""

from nltk import nonterminals, Production, read_cfg
from nltk import nonterminals, Production, ContextFreeGrammar

# Create some nonterminals
S, NP, VP, PP = nonterminals('S, NP, VP, PP')
Expand All @@ -1338,7 +1341,7 @@ def cfg_demo():
print(Production(S, [NP]))

# Create some Grammar Productions
grammar = read_cfg("""
grammar = ContextFreeGrammar.read("""
S -> NP VP
PP -> P NP
NP -> Det N | NP PP
Expand All @@ -1356,7 +1359,7 @@ def cfg_demo():
print(repr(grammar.productions()).replace(',', ',\n'+' '*25))
print()

toy_pcfg1 = read_pcfg("""
toy_pcfg1 = WeightedGrammar.read("""
S -> NP VP [1.0]
NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
Det -> 'the' [0.8] | 'my' [0.2]
Expand All @@ -1367,7 +1370,7 @@ def cfg_demo():
P -> 'with' [0.61] | 'under' [0.39]
""")

toy_pcfg2 = read_pcfg("""
toy_pcfg2 = WeightedGrammar.read("""
S -> NP VP [1.0]
VP -> V NP [.59]
VP -> V [.40]
Expand Down Expand Up @@ -1461,7 +1464,7 @@ def dg_demo():
A demonstration showing the creation and inspection of a
``DependencyGrammar``.
"""
grammar = read_dependency_grammar("""
grammar = DependencyGrammar.read("""
'scratch' -> 'cats' | 'walls'
'walls' -> 'the'
'cats' -> 'the'
Expand Down Expand Up @@ -1506,12 +1509,5 @@ def demo():
__all__ = ['Nonterminal', 'nonterminals',
'Production', 'DependencyProduction', 'WeightedProduction',
'ContextFreeGrammar', 'WeightedGrammar', 'DependencyGrammar',
'StatisticalDependencyGrammar',
'induce_pcfg', 'read_cfg', 'read_cfg_production',
'read_pcfg', 'read_pcfg_production',
'read_fcfg', 'read_fcfg_production',
'read_grammar', 'read_production',
'read_dependency_grammar', 'read_dependency_production',
'demo', 'cfg_demo', 'pcfg_demo', 'dg_demo', 'sdg_demo',
'toy_pcfg1', 'toy_pcfg2']
'StatisticalDependencyGrammar', 'induce_pcfg', 'read_grammar']

4 changes: 2 additions & 2 deletions nltk/parse/chart.py
Expand Up @@ -1547,8 +1547,8 @@ def parse(self, tokens, tree_class=Tree):
########################################################################

def demo_grammar():
from nltk.grammar import read_cfg
return read_cfg("""
from nltk.grammar import ContextFreeGrammar
return ContextFreeGrammar.read("""
S -> NP VP
PP -> "with" NP
NP -> NP PP
Expand Down
4 changes: 2 additions & 2 deletions nltk/parse/generate.py
Expand Up @@ -64,11 +64,11 @@ def _generate_one(grammar, item, depth):
"""

def demo(N=23):
from nltk.grammar import read_cfg
from nltk.grammar import ContextFreeGrammar

print('Generating the first %d sentences for demo grammar:' % (N,))
print(demo_grammar)
grammar = read_cfg(demo_grammar)
grammar = ContextFreeGrammar.read(demo_grammar)
for n, sent in enumerate(generate(grammar, n=N), 1):
print('%3d. %s' % (n, ' '.join(sent)))

Expand Down
5 changes: 3 additions & 2 deletions nltk/parse/nonprojectivedependencyparser.py
Expand Up @@ -11,7 +11,6 @@
import math

from nltk.compat import xrange
from nltk.grammar import read_dependency_grammar

from nltk.parse.dependencygraph import DependencyGraph, conll_data2

Expand Down Expand Up @@ -620,7 +619,9 @@ def nonprojective_conll_parse_demo():
print(parse_graph)

def rule_based_demo():
grammar = read_dependency_grammar("""
from nltk.grammar import DependencyGrammar

grammar = DependencyGrammar.read("""
'taught' -> 'play' | 'man'
'man' -> 'the' | 'in'
'in' -> 'corner'
Expand Down

0 comments on commit 52f017f

Please sign in to comment.