# Extracting Information from Text

*Chapter 07, NLTK: https://www.nltk.org/book/ch07.html*

In [None]:
%matplotlib inline

In [None]:
from pprint import pprint

import nltk
import os
import pandas as pd
import re
import textwrap

### NLTK Data

#### Installing MegaM

In addition to the downloaded data, [MegaM](http://legacydirs.umiacs.umd.edu/~hal/megam/index.html) is required for some classifier-based chunking.

1. Download the source from http://legacydirs.umiacs.umd.edu/~hal/megam/index.html.
2. Make the following changes to the Makefile (as needed):
    * Update `WITHCLIBS` to point to your local caml lib dir. Invoking `ocamlc -where` may help.
    * Change `WITHSTR` to use `-lcamlstr` instead of `-lstr`.
3. Build the optimized binary by invoking `make opt` (or `make` for the slow version).
4. Do one of:
    * Ensure that the directory containing the `megam.opt` binary is on your `PATH`.
    * Set the environment variable `MEGAM` to the location of `megam.opt`.

#### Downloading NLTK Data

Use the NLTK downloader to fetch any necessary datasets and corpora:

In [None]:
# Tokenizer models; presumably what nltk.sent_tokenize / nltk.word_tokenize load
# in the Preprocessing cells.
nltk.download('punkt')
# POS tagger model; presumably what nltk.pos_tag loads in ie_pos_tagging.
nltk.download('averaged_perceptron_tagger')
# Brown corpus: the default `corpus` argument of find_chunks.
nltk.download('brown')
# CoNLL-2000 chunking corpus: read into test_sents / train_sents for evaluation.
nltk.download('conll2000')

## Information Extraction

### Preprocessing

In [None]:
def ie_sentence_segmentation(document):
    """Split a raw text document into sentences.

    Delegates to ``nltk.sent_tokenize`` and hands back its result unchanged.
    """
    sentences = nltk.sent_tokenize(document)
    return sentences

In [None]:
def ie_tokenization(sentences):
    """Tokenize every sentence into words.

    Returns a list with one ``nltk.word_tokenize`` result per input sentence,
    in the same order as ``sentences``.
    """
    tokenized = []
    for sentence in sentences:
        tokenized.append(nltk.word_tokenize(sentence))
    return tokenized

In [None]:
def ie_pos_tagging(sentences):
    """Part-of-speech tag every tokenized sentence.

    Returns a list with one ``nltk.pos_tag`` result per input sentence, in the
    same order as ``sentences``.
    """
    return list(map(nltk.pos_tag, sentences))

In [None]:
def ie_preprocess(document):
    """Run the full preprocessing pipeline on a raw document.

    The document is segmented into sentences, each sentence is tokenized, and
    each token list is POS-tagged; the tagged sentences are returned.
    """
    return ie_pos_tagging(
        ie_tokenization(
            ie_sentence_segmentation(document)
        )
    )

## Chunking

### Noun Phrase Chunking

`NP: {<DT>?<JJ>*<NN>}`: an NP chunk should be formed whenever the chunker finds an optional determiner `DT` followed by any number of adjectives `JJ` and then a noun `NN`.

In [None]:
def chunk(grammar, documents):
    """Chunk every sentence of every document and print the resulting trees.

    For each document, the raw text is printed first; the document is then
    preprocessed with ``ie_preprocess`` and each tagged sentence is parsed
    with a ``nltk.RegexpParser`` built from ``grammar``. Each parse result is
    printed tab-indented beneath its document.

    Args:
        grammar: Chunk grammar string passed to ``nltk.RegexpParser``.
        documents: Iterable of raw text documents.
    """
    # The grammar is the same for every sentence, so build the parser once
    # instead of once per sentence.
    chunk_parser = nltk.RegexpParser(grammar)

    for document in documents:
        print(document, '\n')

        for sentence in ie_preprocess(document):
            result = chunk_parser.parse(sentence)

            print(textwrap.indent(str(result), '\t'), '\n')

In [None]:
# NP grammar: an optional determiner or possessive pronoun (DT, PRP$), then any
# number of adjectives (JJ*), comparative adverbs (RBR) or possessive endings
# (POS), then one or more cardinal numbers (CD) or nouns (NN*).
grammar = r'''
    NP: {<DT|PRP\$>?<JJ.*|RBR|POS>*<CD|NN.*>+}
'''

# Inputs: a list of short noun-phrase examples followed by two full sentences.
chunk(grammar, [
    'the little yellow dog barked at the cat',
    'another sharp dive',
    'trade figures',
    'any new policy measures',
    'earlier stages',
    'Panamanian dictator Manuel Noriega',
    'his Mansion House speech',
    'the price cutting',
    '3% to 4%',
    'more than 10%',
    'the fastest developing trends',
    "man's skill",
    
    'the patient arrived earlier than was needed',
    
    "The market for system-management software for Digital's hardware is fragmented enough that a giant such as Computer Associates should do well there.",
])

In [None]:
# NP grammar with two rules; each rule's intent is given by the comment that
# follows it inside the grammar string.
grammar = r'''
    NP: {<DT|PRP\$>?<JJ.*>*<NN>} # determiner/possessive, adjectives, and noun
        {<NNP>+}                 # sequences of proper nouns
'''

# This sentence should exercise both rules: a proper noun ("Rapunzel") and a
# possessive + adjectives + noun phrase ("her long golden hair").
chunk(grammar, [
    'Rapunzel let down her long golden hair',
])

### Exploring Text Corpora

#### `find_chunks(<grammar>, corpus=nltk.corpus.brown, limit=5)`

```python
>>> find_chunks('CHUNK: {<V.*> <TO> <V.*>}')
```

```
(CHUNK combined/VBN to/TO achieve/VB)
(CHUNK continue/VB to/TO place/VB)
...
(CHUNK wanted/VBD to/TO wait/VB)
```


In [None]:
def find_chunks(grammar, corpus=nltk.corpus.brown, limit=5):
    """Print subtrees labelled ``CHUNK`` found in a tagged corpus.

    Every tagged sentence of ``corpus`` is parsed with a ``nltk.RegexpParser``
    built from ``grammar``; each subtree whose label is ``'CHUNK'`` is printed.

    Args:
        grammar: Chunk grammar string; only rules labelled ``CHUNK`` produce
            output.
        corpus: Object exposing ``tagged_sents()``.
        limit: Maximum number of chunks to print. ``None`` means no limit;
            zero or a negative value prints nothing.
    """
    # Without this guard the countdown is only checked after a chunk has been
    # printed, so a limit of 0 (or below) would still print one chunk.
    if limit is not None and limit <= 0:
        return

    cp = nltk.RegexpParser(grammar)
    remaining = limit  # count down on a local copy of the argument

    for sent in corpus.tagged_sents():
        tree = cp.parse(sent)

        for subtree in tree.subtrees():
            if subtree.label() != 'CHUNK':
                continue

            print(subtree)

            if remaining is not None:
                remaining -= 1
                if remaining == 0:
                    return

In [None]:
# Pattern: a tag starting with V, then TO, then another tag starting with V
# (e.g. "wanted to wait").
find_chunks('CHUNK: {<V.*> <TO> <V.*>}')

In [None]:
# Pattern: four or more consecutive tokens whose tag starts with N, excluding
# tags that start with NIL (the negative lookahead `(?!IL)`).
find_chunks('CHUNK: {<N(?!IL).*>{4,}}')

### Chinking

In [None]:
# Chinking: the first rule chunks the whole sentence as a single NP; the second
# rule (written with inverted braces `}...{`) then removes runs of VBD/IN from
# that chunk.
grammar = r'''
    NP: {<.*>+}     # chunk everything
        }<VBD|IN>+{ # chink sequences of VBD and IN
'''

chunk(grammar, [
    'the little yellow dog barked at the cat',
])

## Developing and Evaluating Chunkers

In [None]:
# CoNLL-2000 chunked sentences restricted to NP chunks (chunk_types=['NP']).
# test_sents is the gold standard read by evaluate_chunker; train_sents is
# passed to the trainable chunkers in the cells below.
test_sents  = nltk.corpus.conll2000.chunked_sents('test.txt', chunk_types=['NP'])
train_sents = nltk.corpus.conll2000.chunked_sents('train.txt', chunk_types=['NP'])

def evaluate_chunker(cp, gold=None):
    """Print the chunk score of ``cp`` against gold-standard chunked sentences.

    Args:
        cp: Chunk parser exposing ``accuracy(gold)`` or, failing that,
            ``evaluate(gold)``.
        gold: Gold-standard chunked sentences. Defaults to the notebook-level
            ``test_sents``.
    """
    if gold is None:
        gold = test_sents

    # Prefer accuracy(): recent NLTK releases keep evaluate() only as a
    # deprecated alias of it. Fall back to evaluate() for parsers without it.
    score = cp.accuracy if hasattr(cp, 'accuracy') else cp.evaluate

    print(score(gold))

### Baseline

The nonzero IOB tag accuracy shows that more than a third of the words are tagged `O`, i.e. they lie outside any NP chunk. However, because the empty grammar finds no chunks at all, precision, recall, and F-measure are all zero.

In [None]:
# Baseline: a parser built from an empty grammar, i.e. with no chunk rules.
evaluate_chunker(
    nltk.RegexpParser(''),
)

### Naive Regexp

In [None]:
# Naive rule: any run of one or more tokens whose tag begins with C, D, J, N
# or P forms an NP chunk.
evaluate_chunker(
    nltk.RegexpParser(r'NP: {<[CDJNP].*>+}'),
)

### Unigram and Bigram

In [None]:
class TaggedChunker(nltk.ChunkParserI):
    """Chunk parser that predicts IOB chunk tags from POS tags with a tagger.

    The wrapped tagger treats POS tags as its "words" and chunk tags as its
    "tags".
    """

    def __init__(self, train_sents, tagger):
        """Train ``tagger`` on (POS tag, chunk tag) sequences.

        Args:
            train_sents: Chunk trees accepted by ``nltk.chunk.tree2conlltags``.
            tagger: Callable (e.g. a tagger class) that receives the training
                data and returns an object with a ``tag`` method.
        """
        train_data = []
        for tree in train_sents:
            pos_chunk_pairs = [
                (pos, chunk_tag)
                for _word, pos, chunk_tag in nltk.chunk.tree2conlltags(tree)
            ]
            train_data.append(pos_chunk_pairs)

        self.tagger = tagger(train_data)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence and return the chunk tree.

        Args:
            sentence: Sequence of (word, POS tag) pairs.
        """
        pos_sequence = [pos for _word, pos in sentence]
        chunk_sequence = [
            chunk_tag for _pos, chunk_tag in self.tagger.tag(pos_sequence)
        ]

        # Re-attach the predicted chunk tags to the original (word, pos) pairs.
        return nltk.chunk.conlltags2tree([
            (word, pos, chunk_tag)
            for (word, pos), chunk_tag in zip(sentence, chunk_sequence)
        ])

In [None]:
# TaggedChunker backed by nltk.UnigramTagger, trained on train_sents.
evaluate_chunker(
    TaggedChunker(train_sents, nltk.UnigramTagger),
)

In [None]:
# Same wrapper as the previous cell, but backed by nltk.BigramTagger.
evaluate_chunker(
    TaggedChunker(train_sents, nltk.BigramTagger),
)

### Classifier-Based

#### Tagger

In [None]:
class ConsecutiveNPChunkTagger(nltk.TaggerI):
    """Classifier-based tagger that assigns chunk tags left to right.

    Each token's feature set comes from ``npchunk_features``, which also
    receives the chunk tags already assigned earlier in the sentence.
    """

    def __init__(self, train_sentences):
        """Train a maximum-entropy classifier on tagged sentences.

        Args:
            train_sentences: Iterable of sentences, each a sequence of
                ``((word, pos), chunk_tag)`` pairs.
        """
        train_set = []

        for tagged_sentence in train_sentences:
            history           = []
            untagged_sentence = nltk.tag.untag(tagged_sentence)

            for i, (_, tag) in enumerate(tagged_sentence):
                featureset = npchunk_features(untagged_sentence, i, history)

                train_set.append((featureset, tag))
                # During training the history holds the gold tags seen so far.
                history.append(tag)

        # algorithm='megam' relies on the external MegaM binary (see the
        # "Installing MegaM" section of this notebook).
        self.classifier = nltk.MaxentClassifier.train(train_set, algorithm='megam', trace=0)

    def tag(self, sentence):
        """Tag a sentence and return a list of ``(token, chunk_tag)`` pairs.

        Args:
            sentence: Sequence of ``(word, pos)`` pairs.
        """
        history = []

        for i, _ in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            # At prediction time the history holds the tags predicted so far.
            history.append(self.classifier.classify(featureset))

        # Return a real list rather than a one-shot zip iterator, so the
        # result can be indexed, measured and iterated more than once.
        return list(zip(sentence, history))

#### Chunker

During training, `ConsecutiveNPChunker` maps the chunk trees in the training corpus into tag sequences; in the `parse()` method, it converts the tag sequence provided by the tagger back into a chunk tree:

In [None]:
class ConsecutiveNPChunker(nltk.ChunkParserI):
    """Chunk parser wrapping ``ConsecutiveNPChunkTagger``.

    Chunk trees are converted to tag sequences for training, and predicted tag
    sequences are converted back into chunk trees when parsing.
    """

    def __init__(self, train_sentences):
        """Train the underlying tagger.

        Args:
            train_sentences: Chunk trees accepted by
                ``nltk.chunk.tree2conlltags``.
        """
        training_data = []
        for tree in train_sentences:
            # Regroup (word, pos, chunk) triples as ((word, pos), chunk) pairs.
            training_data.append([
                ((word, pos), chunk_tag)
                for word, pos, chunk_tag in nltk.chunk.tree2conlltags(tree)
            ])

        self.tagger = ConsecutiveNPChunkTagger(training_data)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence and return the chunk tree.

        Args:
            sentence: Sequence of ``(word, pos)`` pairs.
        """
        conll_triples = [
            (word, pos, chunk_tag)
            for (word, pos), chunk_tag in self.tagger.tag(sentence)
        ]

        return nltk.chunk.conlltags2tree(conll_triples)

#### Feature Extractor

In [None]:
def npchunk_features(sentence, i, history):
    """Build the feature set for the token at position ``i``.

    Args:
        sentence: Sequence of ``(word, pos)`` pairs.
        i: Index of the token to describe.
        history: Chunk tags assigned to earlier tokens (currently unused).

    Returns:
        A dict with the single feature ``'pos'``, the token's POS tag.
    """
    _, pos = sentence[i]

    return dict(pos=pos)

#### Evaluation

In [None]:
# Trains the classifier-based chunker on train_sents and scores it. Training
# uses algorithm='megam', so the MegaM binary must be available (see
# "Installing MegaM").
evaluate_chunker(
    ConsecutiveNPChunker(train_sents),
)