## 基于规则的词性标注器

In [17]:
import nltk
import ssl
# WARNING: this swaps the default HTTPS context factory for the unverified one,
# so HTTPS connections created through it skip certificate verification for the
# rest of the kernel session.
# NOTE(review): presumably a workaround so nltk.download() can reach its server
# on this machine; confirm, and prefer fixing the local certificate store.
ssl._create_default_https_context = ssl._create_unverified_context

In [18]:
# Download the NLTK data packages needed by this notebook.
# Presumably: "punkt" backs word_tokenize, "averaged_perceptron_tagger" backs
# pos_tag, and "tagsets" backs nltk.help.upenn_tagset (verify against the
# installed NLTK version).
# The last call's return value is what the cell displays.
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
nltk.download("tagsets")

[nltk_data] Downloading package punkt to /Users/wujiangu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/wujiangu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     /Users/wujiangu/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [19]:
# Sample sentence for tokenizing and tagging; written entirely in lower case,
# including the pronoun "i".
s = "i enjoy playing the piano"

In [20]:
# Split the sample sentence into word tokens, then display the resulting list.
tokens = nltk.word_tokenize(s)
tokens

['i', 'enjoy', 'playing', 'the', 'piano']

In [21]:
# Tag every token with a part-of-speech label; the result is a list of
# (token, tag) pairs, displayed as the cell output.
tags = nltk.pos_tag(tokens)
tags

[('i', 'NN'),
 ('enjoy', 'VBP'),
 ('playing', 'VBG'),
 ('the', 'DT'),
 ('piano', 'NN')]

In [22]:
# Print the help entry (definition plus example words) for the "NN" tag.
nltk.help.upenn_tagset("NN")

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...


In [23]:
# Second sample sentence; it contains the word "play" twice so the tagger's
# handling of the same word in two different positions can be compared.
sent = "and so i said i am going to play for the play tonight"

In [24]:
# Tokenize and tag the second sentence in one step; `tagset` holds the list of
# (token, tag) pairs and is displayed as the cell output.
tagset = nltk.pos_tag(nltk.word_tokenize(sent))
tagset

[('and', 'CC'),
 ('so', 'RB'),
 ('i', 'JJ'),
 ('said', 'VBD'),
 ('i', 'JJ'),
 ('am', 'VBP'),
 ('going', 'VBG'),
 ('to', 'TO'),
 ('play', 'VB'),
 ('for', 'IN'),
 ('the', 'DT'),
 ('play', 'NN'),
 ('tonight', 'NN')]

In [25]:
# Print the help entry (definition plus example words) for the "VBP" tag.
nltk.help.upenn_tagset("VBP")

VBP: verb, present tense, not 3rd person singular
    predominate wrap resort sue twist spill cure lengthen brush terminate
    appear tend stray glisten obtain comprise detest tease attract
    emphasize mold postpone sever return wag ...


## 随机的词性标注器

In [26]:
import spacy

In [58]:
# Load the spaCy pipeline named "zh_core_web_sm" and bind it to `nlp`.
# NOTE(review): `nlp` is rebound to a different pipeline in a later cell.
nlp = spacy.load("zh_core_web_sm")

In [59]:
# Run the loaded pipeline over a short Chinese sentence and display the
# resulting document.
doc = nlp("上海自来水来自海上")
doc

上海自来水来自海上

In [68]:
# Print every named entity found in `doc` together with its label string.
# The attribute is spelled `label_`; the previous spelling `lable_` would raise
# AttributeError as soon as the document contained at least one entity.
for ent in doc.ents:
    print(ent.text, ent.label_)

In [60]:
# One line per token: its text, `pos_` value and `tag_` value, separated by
# single spaces.
for token in doc:
    print(f"{token.text} {token.pos_} {token.tag_}")

上海 PROPN NR
自来水 NOUN NN
来自 VERB VV
海上 NOUN NN


In [61]:
# Look up the human-readable description of the "PART" label.
spacy.explain("PART")

'particle'

## 分块--NLTK

In [62]:
# Chunk grammar with a single rule labelled "Noun Phrase": an optional
# determiner (<DT>?), followed by any number of adjectives (<JJ>*), followed by
# one noun tagged NN.
rule = r"""Noun Phrase: {<DT>?<JJ>*<NN>}"""

In [63]:
# Build a chunk parser from the grammar string in `rule`.
# nltk.RegexpParser is the class that accepts a grammar string;
# nltk.RegexpChunkParser expects a list of already-built RegexpChunkRule
# objects, so passing the raw string to it fails once parse() is called.
# The variable name is kept as-is because other cells refer to it.
chunkParaser = nltk.RegexpParser(rule)

In [64]:
# Re-display the (token, tag) pairs computed for the second sample sentence.
tagset

[('and', 'CC'),
 ('so', 'RB'),
 ('i', 'JJ'),
 ('said', 'VBD'),
 ('i', 'JJ'),
 ('am', 'VBP'),
 ('going', 'VBG'),
 ('to', 'TO'),
 ('play', 'VB'),
 ('for', 'IN'),
 ('the', 'DT'),
 ('play', 'NN'),
 ('tonight', 'NN')]

In [65]:
# chunked = chunkParaser.parse(tags)
# chunked.draw()

## 分块--spacy

In [73]:
# Load the spaCy pipeline named "en_core_web_sm" and run it over an English
# marketing paragraph. This rebinds both `nlp` and `doc`, which earlier cells
# bound to the Chinese pipeline and its document.
# NOTE(review): the text ends with a full-width "。" right after the final
# period; confirm whether that trailing character is intentional.
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Discover the best of Apple this holiday season. Find the perfect gifts. Chat with Specialists. Get free, no-contact delivery or convenient pickup. And in most metros, get in-stock items by 2-hour courier delivery.。")
doc

Discover the best of Apple this holiday season. Find the perfect gifts. Chat with Specialists. Get free, no-contact delivery or convenient pickup. And in most metros, get in-stock items by 2-hour courier delivery.。

In [74]:
# One line per noun chunk: the chunk text, the text of its root token and that
# root token's dependency label, separated by single spaces.
for chunk in doc.noun_chunks:
    print(" ".join((chunk.text, chunk.root.text, chunk.root.dep_)))

Apple Apple pobj
the perfect gifts gifts dobj
Specialists Specialists pobj
free, no-contact delivery delivery dobj
convenient pickup pickup conj
most metros metros pobj
stock stock pobj
2-hour courier delivery delivery pobj


In [76]:
# One line per named entity: its text followed by its label string.
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

Apple ORG
this holiday season DATE
2-hour TIME
