## 5.1 词性标注器

In [1]:
import nltk

In [3]:
# "refuse" and "permit" each occur twice in this sentence; in the tagger
# output "refuse" is tagged VBP at one position and NN at the other.
text = nltk.word_tokenize("They refuse to permit us to obtain the refuse permit")
nltk.pos_tag(text)

[('They', 'PRP'),
 ('refuse', 'VBP'),
 ('to', 'TO'),
 ('permit', 'VB'),
 ('us', 'PRP'),
 ('to', 'TO'),
 ('obtain', 'VB'),
 ('the', 'DT'),
 ('refuse', 'NN'),
 ('permit', 'NN')]

## 5.2 标注语料库

In [5]:
# One sentence written in word/TAG notation, one whitespace-separated token
# per word. The text lines carry no "... " continuation prompts, so
# sent.split() yields only word/TAG tokens in any Python environment.
sent = '''
The/AT grand/JJ jury/NN commented/VBD on/IN a/AT number/NN of/IN
other/AP topics/NNS ,/, AMONG/IN them/PPO the/AT Atlanta/NP and/CC
Fulton/NP-tl County/NN-tl purchasing/VBG departments/NNS which/WDT it/PPS
said/VBD ``/`` ARE/BER well/QL operated/VBN and/CC follow/VB generally/RB
accepted/VBN practices/NNS which/WDT inure/VB to/IN the/AT best/JJT
interest/NN of/IN both/ABX governments/NNS ''/'' ./.
'''

In [8]:
# Split the sentence into word/TAG tokens and convert each one to a
# (word, tag) tuple. print(...) with a single argument is valid as a
# Python 2 statement and as a Python 3 call, and prints the same list.
print([nltk.tag.str2tuple(t) for t in sent.split()])

[('The', 'AT'), ('grand', 'JJ'), ('jury', 'NN'), ('commented', 'VBD'), ('on', 'IN'), ('a', 'AT'), ('number', 'NN'), ('of', 'IN'), ('other', 'AP'), ('topics', 'NNS'), (',', ','), ('AMONG', 'IN'), ('them', 'PPO'), ('the', 'AT'), ('Atlanta', 'NP'), ('and', 'CC'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('purchasing', 'VBG'), ('departments', 'NNS'), ('which', 'WDT'), ('it', 'PPS'), ('said', 'VBD'), ('``', '``'), ('ARE', 'BER'), ('well', 'QL'), ('operated', 'VBN'), ('and', 'CC'), ('follow', 'VB'), ('generally', 'RB'), ('accepted', 'VBN'), ('practices', 'NNS'), ('which', 'WDT'), ('inure', 'VB'), ('to', 'IN'), ('the', 'AT'), ('best', 'JJT'), ('interest', 'NN'), ('of', 'IN'), ('both', 'ABX'), ('governments', 'NNS'), ("''", "''"), ('.', '.')]


并非所有语料库都采用同一套标记集，因此这里使用 universal（通用）标记集。

In [11]:
# Display the Brown corpus (word, tag) pairs requested with tagset='universal';
# the pairs shown carry tags such as DET and NOUN.
nltk.corpus.brown.tagged_words(tagset='universal')

[(u'The', u'DET'), (u'Fulton', u'NOUN'), ...]

### 名词
出现在名词前的词类

In [12]:
from nltk.corpus import brown

In [13]:
# Tagged words of the Brown 'news' category, requested with the universal tagset.
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')

In [17]:
# Pair every (word, tag) item with the item that follows it.
word_tag_pairs = nltk.bigrams(brown_news_tagged)
# For each pair whose second item is tagged NOUN, keep the tag of the first
# item, i.e. the tag of whatever immediately precedes a noun.
noun_preceders = [a[1] for (a, b) in word_tag_pairs if b[1] == 'NOUN']

In [19]:
# Count how often each tag precedes a noun.
fdist = nltk.FreqDist(noun_preceders)
# List the tags in most_common() order. print(...) with a single argument is
# valid as a Python 2 statement and as a Python 3 call.
print([tag for (tag, _) in fdist.most_common()])

[u'NOUN', u'DET', u'ADJ', u'ADP', u'.', u'VERB', u'CONJ', u'NUM', u'ADV', u'PRT', u'PRON', u'X']


### 动词
新闻文本常见动词

In [10]:
# (word, tag) pairs from the NLTK treebank corpus, requested with the universal tagset.
wsj = nltk.corpus.treebank.tagged_words(tagset='universal')
# Frequency of each distinct (word, tag) pair.
word_tag_fd = nltk.FreqDist(wsj)

In [15]:
# Walk the (word, tag) pairs in most_common() order, keep the words whose
# tag is VERB, and show the first ten of them.
[wt[0] for (wt, _) in word_tag_fd.most_common() if wt[1] == 'VERB'][:10]

[u'is',
 u'said',
 u'was',
 u'are',
 u'be',
 u'has',
 u'have',
 u'will',
 u'says',
 u'would']

In [16]:
# Conditional frequency distribution built directly from the (word, tag)
# pairs: the word is the condition and its tags are the counted events.
cfd1 = nltk.ConditionalFreqDist(wsj)

In [17]:
# Tags recorded for the word 'yield', together with their counts.
cfd1['yield'].most_common()

[(u'VERB', 28), (u'NOUN', 20)]

颠倒配对顺序：以标记为条件、以词为事件

In [22]:
# Swap each pair to (tag, word) so that the tag becomes the condition and
# the words carrying that tag are the counted events.
cfd2 = nltk.ConditionalFreqDist((tag,word) for (word, tag) in wsj)

In [32]:
# Show the conditions (tags) currently present in cfd2. list(...) keeps the
# printed form a plain list whether keys() returns a list or a view, and
# print(...) with one argument is valid on both Python 2 and Python 3.
print(list(cfd2.keys()))

[u'ADV', 'VB', u'NOUN', u'ADP', u'PRON', 'VBD', u'DET', 'VBN', 'VD', u'.', u'PRT', u'NUM', 'VN', u'X', u'CONJ', u'ADJ', u'VERB']


In [41]:
# 'VD' is not one of the universal tags requested for wsj, so no words are
# recorded under it. Test membership before indexing: looking up an unknown
# condition in a ConditionalFreqDist inserts an empty entry for it, which
# would then appear in cfd2.keys().
list(cfd2['VD']) if 'VD' in cfd2 else []

[]

### 最频繁的名词标记

In [42]:
def findtags(tag_prefix, tagged_text, n=5):
    """Return the most frequent words for every tag starting with a prefix.

    Args:
        tag_prefix: Only tags that begin with this string are considered.
        tagged_text: Iterable of (word, tag) pairs.
        n: How many of the most frequent words to keep per tag (default 5).

    Returns:
        A dict mapping each matching tag to the list of (word, count) pairs
        returned by ``most_common(n)`` for that tag.
    """
    cfd = nltk.ConditionalFreqDist(
        (tag, word) for (word, tag) in tagged_text if tag.startswith(tag_prefix)
    )
    return {tag: cfd[tag].most_common(n) for tag in cfd.conditions()}

In [43]:
# Most frequent words for every tag starting with 'NN' in the Brown 'news'
# category (no tagset argument is passed here), printed in sorted tag order.
tagdict = findtags('NN', nltk.corpus.brown.tagged_words(categories='news'))
for tag in sorted(tagdict):
     print(tag, tagdict[tag])

(u'NN', [(u'year', 137), (u'time', 97), (u'state', 88), (u'week', 85), (u'man', 72)])
(u'NN$', [(u"year's", 13), (u"world's", 8), (u"state's", 7), (u"nation's", 6), (u"company's", 6)])
(u'NN$-HL', [(u"Golf's", 1), (u"Navy's", 1)])
(u'NN$-TL', [(u"President's", 11), (u"University's", 3), (u"League's", 3), (u"Gallery's", 3), (u"Army's", 3)])
(u'NN-HL', [(u'cut', 2), (u'Salary', 2), (u'condition', 2), (u'Question', 2), (u'business', 2)])
(u'NN-NC', [(u'eva', 1), (u'ova', 1), (u'aya', 1)])
(u'NN-TL', [(u'President', 88), (u'House', 68), (u'State', 59), (u'University', 42), (u'City', 41)])
(u'NN-TL-HL', [(u'Fort', 2), (u'City', 1), (u'Commissioner', 1), (u'Grove', 1), (u'House', 1)])
(u'NNS', [(u'years', 101), (u'members', 69), (u'people', 52), (u'sales', 51), (u'men', 46)])
(u'NNS$', [(u"children's", 7), (u"women's", 5), (u"men's", 3), (u"janitors'", 3), (u"taxpayers'", 2)])
(u'NNS$-HL', [(u"Dealers'", 1), (u"Idols'", 1)])
(u'NNS$-TL', [(u"Women's", 4), (u"States'", 3), (u"Giants'", 2), (u

查看 often 后面出现的词汇

In [47]:
# Untagged words of the Brown 'learned' category.
brown_text = nltk.corpus.brown.words(categories='learned')
# Distinct words that directly follow 'often', in sorted order.
# print(...) with one argument is valid on both Python 2 and Python 3.
print(sorted(set(b for (a, b) in nltk.bigrams(brown_text) if a=='often')))

[u',', u'.', u'accomplished', u'analytically', u'appear', u'apt', u'associated', u'assuming', u'became', u'become', u'been', u'began', u'call', u'called', u'carefully', u'chose', u'classified', u'colorful', u'composed', u'contain', u'differed', u'difficult', u'encountered', u'enough', u'equate', u'extremely', u'found', u'happens', u'have', u'ignored', u'in', u'involved', u'more', u'needed', u'nightly', u'observed', u'of', u'on', u'out', u'quite', u'represent', u'responsible', u'revamped', u'seclude', u'set', u'shortened', u'sing', u'sounded', u'stated', u'still', u'sung', u'supported', u'than', u'to', u'when', u'work']


三词短语：查找“动词 + to + 动词”形式的短语

In [2]:
import nltk
from nltk.corpus import brown

In [7]:
def process(sentence):
    """Print each verb / "to" / verb word triple found in a tagged sentence.

    ``sentence`` is a sequence of (word, tag) pairs. Every run of three
    consecutive pairs is inspected; a triple is printed when the first and
    third tags start with 'V' and the middle tag is exactly 'TO'.
    """
    for first, middle, last in nltk.trigrams(sentence):
        (w1, t1), (w2, t2), (w3, t3) = first, middle, last
        verb_to_verb = t1.startswith('V') and t2 == 'TO' and t3.startswith('V')
        if not verb_to_verb:
            continue
        print(w1, w2, w3)

In [11]:
# Run process() over the first 50 tagged sentences of the Brown corpus.
for tagged_sent in brown.tagged_sents()[:50]:
    process(tagged_sent)

(u'combined', u'to', u'achieve')
(u'continue', u'to', u'place')
(u'serve', u'to', u'protect')
(u'wanted', u'to', u'wait')
