In [1]:
import nltk, re, pprint

def ie_preprocess(document):
    sentences = nltk.sent_tokenize(document) 
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]

In [2]:
sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"),
 ("dog", "NN"), ("barked", "VBD"), ("at", "IN"),  ("the", "DT"), ("cat", "NN")]

In [3]:
grammar = "NP: {<DT>?<JJ>*<NN>}"

cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)
print(result)

(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))


In [4]:
result.draw()

In [None]:
nltk.app.chunkparser()

In [4]:
text = '''
he PRP B-NP
accepted VBD B-VP
the DT B-NP
position NN I-NP
of IN B-PP
vice NN B-NP
chairman NN I-NP
of IN B-PP
Carlyle NNP B-NP
Group NNP I-NP
, , O
a DT B-NP
merchant NN I-NP
banking NN I-NP
concern NN I-NP
. . O
'''

In [5]:
nltk.chunk.conllstr2tree(text, chunk_types=['NP']).draw()


In [6]:
from nltk.corpus import conll2000
cp = nltk.RegexpParser("")
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
print(cp.accuracy(test_sents))

ChunkParse score:
    IOB Accuracy:  43.4%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%


In [11]:
grammar = r"NP: {<[CDJNP].*>+}"
cp = nltk.RegexpParser(grammar)
chunkscore = cp.accuracy(test_sents)
print(chunkscore)
print(chunkscore.missed())
print(chunkscore.incorrect())

ChunkParse score:
    IOB Accuracy:  87.7%%
    Precision:     70.6%%
    Recall:        67.8%%
    F-Measure:     69.2%%
[ImmutableTree('NP', [('that', 'WDT')]), ImmutableTree('NP', [('the', 'DT'), ('Fed', 'NNP')]), ImmutableTree('NP', [('$', '$'), ('81', 'CD')]), ImmutableTree('NP', [('1.82', 'CD'), ('%', 'NN')]), ImmutableTree('NP', [("'s", 'POS'), ('name', 'NN')]), ImmutableTree('NP', [('the', 'DT'), ('deliberations', 'NNS')]), ImmutableTree('NP', [('there', 'EX')]), ImmutableTree('NP', [('most', 'RBS'), ('domestic', 'JJ'), ('airline', 'NN'), ('flights', 'NNS')]), ImmutableTree('NP', [('most', 'RBS'), ('junk', 'NN'), ('bonds', 'NNS')]), ImmutableTree('NP', [('operating', 'VBG'), ('profit', 'NN')]), ImmutableTree('NP', [('transfers', 'NNS')]), ImmutableTree('NP', [('who', 'WP')]), ImmutableTree('NP', [('A-2', 'NN'), ('preferred', 'JJ'), ('stock', 'NN')]), ImmutableTree('NP', [('wide', 'JJ'), ('fluctuations', 'NNS')]), ImmutableTree('NP', [('Afghans', 'NNPS')]), ImmutableTree('NP', [

In [12]:
class UnigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents): # [_code-unigram-chunker-constructor]
        train_data = [[(t,c) for w,t,c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.UnigramTagger(train_data) # [_code-unigram-chunker-buildit]

    def parse(self, sentence): # [_code-unigram-chunker-parse]
        pos_tags = [pos for (word,pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag) for ((word,pos),chunktag)
                     in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)


In [13]:
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
unigram_chunker = UnigramChunker(train_sents)
print(unigram_chunker.accuracy(test_sents))

ChunkParse score:
    IOB Accuracy:  92.9%%
    Precision:     79.9%%
    Recall:        86.8%%
    F-Measure:     83.2%%


In [14]:
sent = nltk.corpus.treebank.tagged_sents()
print(nltk.ne_chunk(sent, binary=True))

AttributeError: 'tuple' object has no attribute 'startswith'

In [15]:
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
     for rel in nltk.sem.extract_rels('ORG', 'LOC', doc,
                                      corpus='ieer', pattern = IN):
            print(nltk.sem.rtuple(rel))

[ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']
[ORG: 'McGlashan &AMP; Sarrail'] 'firm in' [LOC: 'San Mateo']
[ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington']
[ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington']
[ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles']
[ORG: 'Open Text'] ', based in' [LOC: 'Waterloo']
[ORG: 'WGBH'] 'in' [LOC: 'Boston']
[ORG: 'Bastille Opera'] 'in' [LOC: 'Paris']
[ORG: 'Omnicom'] 'in' [LOC: 'New York']
[ORG: 'DDB Needham'] 'in' [LOC: 'New York']
[ORG: 'Kaplan Thaler Group'] 'in' [LOC: 'New York']
[ORG: 'BBDO South'] 'in' [LOC: 'Atlanta']
[ORG: 'Georgia-Pacific'] 'in' [LOC: 'Atlanta']
