# <center>Book: Steven Bird, Ewan Klein, Edward Loper, 2009. **Natural Language Processing (NLP) with Python**, O'Reilly.</center> 

### Chapter 7 - Extracting Information from Text

In [1]:
# Toy relation table: one (organization, relation, location) triple per row.
locs = [('Omnicom', 'IN', 'New York'),
        ('DDB Needham', 'IN', 'New York'),
        ('Kaplan Thaler Group', 'IN', 'New York'),
        ('BBDO South', 'IN', 'Atlanta'),
        ('Georgia-Pacific', 'IN', 'Atlanta')]

# Which organizations are located in Atlanta? The relation field plays no part
# in this query, so it is bound to a throwaway name.
query = [org for (org, _rel, loc) in locs if loc == 'Atlanta']
print(query)

['BBDO South', 'Georgia-Pacific']


In [2]:
import nltk

def ie_preprocess(document):
    """
    Run the first three stages of the information-extraction pipeline.

    The raw text is split into sentences, each sentence is split into
    word tokens, and each token list is part-of-speech tagged.

    Arguments:

    document: raw text as a single string.

    Returns:

    A list with one entry per sentence; each entry is whatever
    nltk.pos_tag returns for that sentence's tokens.
    """
    raw_sentences = nltk.sent_tokenize(document)
    tokenized = [nltk.word_tokenize(sent) for sent in raw_sentences]
    tagged = [nltk.pos_tag(tokens) for tokens in tokenized]
    # Hand the result back to the caller; without this the work is discarded.
    return tagged

In [3]:
# A hand-tagged example sentence: one (word, POS tag) pair per token.
sentence = [
    ("the", "DT"),
    ("little", "JJ"),
    ("yellow", "JJ"),
    ("dog", "NN"),
    ("barked", "VBD"),
    ("at", "IN"),
    ("the", "DT"),
    ("cat", "NN"),
]

# NP rule: an optional determiner, then zero or more adjectives,
# then exactly one (required) noun.
grammar = "NP: {<DT>?<JJ>*<NN>}"

# Build the chunker from the rule, apply it, and show the resulting tree.
cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)
print(result)

(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))


In [4]:
import re

# Tagged input: a proper noun up front and a possessive noun phrase at the end.
sentence = [
    ("Rapunzel", "NNP"),
    ("let", "VBD"),
    ("down", "RP"),
    ("her", "PP$"),
    ("long", "JJ"),
    ("golden", "JJ"),
    ("hair", "NN"),
]

# Two rules share the NP label. The string is raw so the backslash in
# PP\$ reaches the chunk parser unchanged.
grammar = r"""
    NP: {<DT|PP\$>?<JJ>*<NN>} # chunk determiner/possessive, adjectives and noun
        {<NNP>+}              # chunk sequences of proper nouns
"""
cp = nltk.RegexpParser(grammar)
print(cp.parse(sentence))

(S
  (NP Rapunzel/NNP)
  let/VBD
  down/RP
  (NP her/PP$ long/JJ golden/JJ hair/NN))


In [5]:
# Three consecutive nouns, fed to a rule that chunks exactly two at a time.
nouns = [(word, "NN") for word in ("money", "market", "fund")]
grammar = "NP: {<NN><NN>}  # Chunk two consecutive nouns"
cp = nltk.RegexpParser(grammar)
print(cp.parse(nouns))

(S (NP money/NN market/NN) fund/NN)


In [6]:
# Pattern: any verb tag, then TO, then any verb tag.
cp = nltk.RegexpParser('CHUNK: {<V.*> <TO> <V.*>}')
brown = nltk.corpus.brown

# Parse every tagged Brown sentence and keep only the CHUNK subtrees.
chunked = [
    subtree
    for sent in brown.tagged_sents()
    for subtree in cp.parse(sent).subtrees()
    if subtree.label() == 'CHUNK'
]

In [7]:
# Preview only the first five matches rather than the whole list.
for chunk_tree in chunked[:5]:
    print(chunk_tree)

(CHUNK combined/VBN to/TO achieve/VB)
(CHUNK continue/VB to/TO place/VB)
(CHUNK serve/VB to/TO protect/VB)
(CHUNK wanted/VBD to/TO wait/VB)
(CHUNK allowed/VBN to/TO place/VB)


In [8]:
def find_chunks(corpus, chunk):
    """
    Returns every chunk that a chunk grammar finds in a corpus of
    tagged sentences.

    Arguments:

    corpus: iterable of POS-tagged sentences (for example
            brown.tagged_sents()); each sentence is passed as-is to
            nltk.RegexpParser.parse.
    chunk:  chunk grammar in nltk.RegexpParser format:
            'CHUNK_LABEL: {<TAG1><TAG2>...}'. N.B. a colon (:) must separate
            the label from the tag pattern. Whitespace around the label is
            ignored, so indented or multi-line grammars work. Only chunks
            carrying the first label in the grammar are collected.

    Returns:

    A list of the subtrees labelled CHUNK_LABEL, in corpus order.

    Raises:

    ValueError: if `chunk` has no non-empty label followed by a colon.
    """
    # Everything before the first colon is the label. Strip it so that
    # leading newlines/indentation or a space before the colon do not end up
    # inside the label and silently prevent every comparison from matching.
    label_part, colon, _pattern = chunk.partition(':')
    chunk_label = label_part.strip()
    if not colon or not chunk_label:
        raise ValueError(
            "chunk grammar must have the form 'LABEL: {<TAG>...}', got %r"
            % (chunk,)
        )

    cp = nltk.RegexpParser(chunk)
    chunked = []
    for sent in corpus:
        tree = cp.parse(sent)
        # subtrees() also yields the root and any other-labelled nodes, so
        # filter down to the label requested by the grammar.
        chunked.extend(
            subtree for subtree in tree.subtrees()
            if subtree.label() == chunk_label
        )

    return chunked

In [9]:
# Runs of four or more tokens whose tag starts with N, searched across the
# tagged Brown sentences; only the first five hits are printed.
nouns_4Xs = find_chunks(brown.tagged_sents(), "NOUNS: {<N.*>{4,}}")
for noun_run in nouns_4Xs[:5]:
    print(noun_run)

(NOUNS Court/NN-TL Judge/NN-TL Durwood/NP Pye/NP)
(NOUNS Mayor-nominate/NN-TL Ivan/NP Allen/NP Jr./NP)
(NOUNS Georgia's/NP$ automobile/NN title/NN law/NN)
(NOUNS State/NN-TL Welfare/NN-TL Department's/NN$-TL handling/NN)
(NOUNS Fulton/NP-TL Tax/NN-TL Commissioner's/NN$-TL Office/NN-TL)


In [10]:
# Same hand-tagged sentence as before, one (word, POS tag) pair per token.
sentence = [
    ("the", "DT"),
    ("little", "JJ"),
    ("yellow", "JJ"),
    ("dog", "NN"),
    ("barked", "VBD"),
    ("at", "IN"),
    ("the", "DT"),
    ("cat", "NN"),
]

# Chinking: first put the whole sentence in one NP chunk, then cut the
# VBD/IN tokens back out of it.
grammar = r"""
 NP:
  {<.*>+}      # Chunk everything
  }<VBD|IN>+{  # Chink sequences of VBD and IN
 """
cp = nltk.RegexpParser(grammar)
print(cp.parse(sentence))

(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))
