# tagger

In [1]:
import nltk

In [2]:
# Sample sentence used by the tokenization and tagging cells that follow.
s = 'And now for something completely different.'

In [3]:
# Tokenize with NLTK's word tokenizer; in the output the trailing '.' is its own token.
text = nltk.word_tokenize(s)
text

['And', 'now', 'for', 'something', 'completely', 'different', '.']

In [4]:
# Tag the word_tokenize output; the final '.' receives its own punctuation tag.
print(nltk.pos_tag(text))

[('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'), ('completely', 'RB'), ('different', 'JJ'), ('.', '.')]


In [5]:
# Contrast: plain whitespace splitting leaves the period attached ('different.'),
# so the tagger never sees a separate punctuation token.
print(nltk.pos_tag(s.split()))

[('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'), ('completely', 'RB'), ('different.', 'JJ')]


# tagger를 이용한 중의성 해소

In [6]:
# "refuse" and "permit" each occur twice here, once as a verb and once as a noun.
# Note: this rebinds `s` from the earlier cells.
s = "They refuse to permit us to obtain the refuse permit."

In [7]:
# The tagger resolves each occurrence from context (VBP/VB vs. NN in the output).
print(nltk.pos_tag(nltk.word_tokenize(s)))

[('They', 'PRP'), ('refuse', 'VBP'), ('to', 'TO'), ('permit', 'VB'), ('us', 'PRP'), ('to', 'TO'), ('obtain', 'VB'), ('the', 'DT'), ('refuse', 'NN'), ('permit', 'NN'), ('.', '.')]


# 그러나 tagger도 완벽하지 않다: 대소문자와 문맥에 따라 결과가 달라진다

In [8]:
# Whitespace-split input; the tagger reads 'flies' as a plural noun (NNS) and 'like' as IN.
print(nltk.pos_tag("Time flies like an arrow".split()))

[('Time', 'NNP'), ('flies', 'NNS'), ('like', 'IN'), ('an', 'DT'), ('arrow', 'NN')]


In [9]:
# Same sentence lower-cased: 'time' changes from NNP to NN, while 'flies' is still NNS.
print(nltk.pos_tag("time flies like an arrow".split()))

[('time', 'NN'), ('flies', 'NNS'), ('like', 'IN'), ('an', 'DT'), ('arrow', 'NN')]


In [10]:
# Inserting the adverb 'sometimes' does not change the NNS tag on 'flies' here.
print(nltk.pos_tag("time sometimes flies like an arrow".split()))

[('time', 'NN'), ('sometimes', 'RB'), ('flies', 'NNS'), ('like', 'IN'), ('an', 'DT'), ('arrow', 'NN')]


In [11]:
# With a capitalized 'Time' plus the adverb, 'flies' is tagged as a verb (VBZ).
print(nltk.pos_tag("Time sometimes flies like an arrow".split()))

[('Time', 'NNP'), ('sometimes', 'RB'), ('flies', 'VBZ'), ('like', 'IN'), ('an', 'DT'), ('arrow', 'NN')]


# str2tuple

In [12]:
# Convert 'word/TAG' strings into (word, tag) tuples; '/' is what str2tuple
# splits on when no separator is given.
list(map(nltk.tag.str2tuple, 'The/AT grand/JJ jury/NN'.split()))

[('The', 'AT'), ('grand', 'JJ'), ('jury', 'NN')]

In [13]:
# With '_' as the delimiter and no sep argument, nothing is split off:
# each token comes back whole with a tag of None.
list(map(nltk.tag.str2tuple, 'The_AT grand_JJ jury_NN'.split()))

[('The_AT', None), ('grand_JJ', None), ('jury_NN', None)]

In [14]:
# Passing sep='_' makes str2tuple split underscore-delimited tokens.
[
    nltk.tag.str2tuple(token, sep='_')
    for token in 'The_AT grand_JJ jury_NN'.split()
]

[('The', 'AT'), ('grand', 'JJ'), ('jury', 'NN')]

# tagged corpora

In [15]:
# First five (word, tag) pairs of the Brown corpus, without any tagset mapping.
print(nltk.corpus.brown.tagged_words()[:5])

[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL')]


In [16]:
# Same words with tagset='universal', which yields coarse tags such as DET/NOUN/ADJ.
print(nltk.corpus.brown.tagged_words(tagset='universal')[:5])

[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN')]


In [17]:
# First five (word, tag) pairs of nltk.corpus.treebank, without any tagset mapping.
print(nltk.corpus.treebank.tagged_words()[:5])

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS')]


In [18]:
# Same treebank words mapped to the universal tagset (e.g. NNP -> NOUN, CD -> NUM).
print(nltk.corpus.treebank.tagged_words(tagset='universal')[:5])

[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN')]


In [19]:
# Count how often each universal POS tag occurs across the Brown corpus,
# then list the tags from most to least frequent.
tags = nltk.FreqDist(
    tag
    for _, tag in nltk.corpus.brown.tagged_words(tagset='universal')
)
print(tags.most_common())

[('NOUN', 275558), ('VERB', 182750), ('.', 147565), ('ADP', 144766), ('DET', 137019), ('ADJ', 83721), ('ADV', 56239), ('PRON', 49334), ('CONJ', 38151), ('PRT', 29829), ('NUM', 14874), ('X', 1386)]


# bi-gram

In [20]:
# Tagged words of the Brown 'news' category, paired into consecutive bigrams.
words = nltk.corpus.brown.tagged_words(categories='news',tagset='universal')
bi_grams = nltk.bigrams(words)
# nltk.bigrams returns a generator (see the repr below), so it can be consumed only once.
bi_grams

<generator object bigrams at 0x000001E94951D740>

In [21]:
# list() drains the whole generator even though only the first five pairs are displayed.
list(bi_grams)[:5]

[(('The', 'DET'), ('Fulton', 'NOUN')),
 (('Fulton', 'NOUN'), ('County', 'NOUN')),
 (('County', 'NOUN'), ('Grand', 'ADJ')),
 (('Grand', 'ADJ'), ('Jury', 'NOUN')),
 (('Jury', 'NOUN'), ('said', 'VERB'))]

In [22]:
# The generator was already exhausted by the previous list() call, hence the empty list.
list(bi_grams)

[]

In [23]:
# Build a fresh bigram generator and keep the result as a list so it can be reused.
z = list(nltk.bigrams(words))

In [24]:
# Slicing the stored list can be repeated, unlike re-reading the generator.
z[:5]

[(('The', 'DET'), ('Fulton', 'NOUN')),
 (('Fulton', 'NOUN'), ('County', 'NOUN')),
 (('County', 'NOUN'), ('Grand', 'ADJ')),
 (('Grand', 'ADJ'), ('Jury', 'NOUN')),
 (('Jury', 'NOUN'), ('said', 'VERB'))]

# n-gram 활용

In [25]:
# Tabulate the universal POS tag of the token that directly follows "often"
# (matched case-insensitively) in the Brown 'news' category.
words = nltk.corpus.brown.tagged_words(categories='news', tagset='universal')
nltk.FreqDist(
    next_tag
    for (prev_word, _), (_, next_tag) in nltk.bigrams(words)
    if prev_word.lower() == 'often'
).tabulate()

VERB  ADJ    .  ADP  DET  ADV 
   6    2    2    2    1    1 


In [26]:
# Same tally as for 'news', now over the Brown 'learned' category.
# `words` is rebound here and is read again by later cells.
words = nltk.corpus.brown.tagged_words(categories='learned', tagset='universal')
nltk.FreqDist(
    next_tag
    for (prev_word, _), (_, next_tag) in nltk.bigrams(words)
    if prev_word.lower() == 'often'
).tabulate()

VERB  ADV  ADP  ADJ    .  DET  PRT PRON NOUN 
  37    8    7    6    6    2    2    1    1 


In [27]:
from nltk.corpus import brown

In [28]:
# First Brown sentence as a list of (word, tag) pairs, without any tagset mapping.
print(brown.tagged_sents()[:1])

[[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')]]


In [29]:
def process(sentence):
    """Print each verb + "to" + verb sequence found in one tagged sentence.

    ``sentence`` is walked as (word, tag) pairs, three at a time, via
    ``nltk.trigrams``. A window is reported when the first and third tags
    start with 'V' and the middle tag is exactly 'TO'; the three words are
    printed space-separated on one line. Nothing is returned.
    """
    for (head, head_tag), (link, link_tag), (tail, tail_tag) in nltk.trigrams(sentence):
        # Checks run in the same left-to-right order as a chained `and`.
        if not head_tag.startswith('V'):
            continue
        if link_tag != 'TO':
            continue
        if tail_tag.startswith('V'):
            print(head, link, tail)

In [30]:
# Run process() over every tagged sentence in the Brown corpus (no category filter).
# NOTE(review): process() prints one line per match, so this floods the cell output;
# consider restricting to a category or collecting matches instead of printing.
for tagged_sent in brown.tagged_sents():
    process(tagged_sent)

combined to achieve
continue to place
serve to protect
wanted to wait
allowed to place
expected to become
expected to approve
expected to make
intends to make
seek to set
like to see
designed to provide
get to hear
expects to tell
expected to give
prefer to pay
required to obtain
permitted to teach
designed to reduce
Asked to elaborate
got to go
raised to pay
scheduled to go
cut to meet
needed to meet
hastened to add
found to prevent
continue to insist
compelled to make
made to remove
revamped to give
want to risk
appear to spark
fails to consider
plans to call
going to examine
plans to name
come to pass
voted to accept
happens to hold
authorized to adopt
hesitated to prosecute
try to make
decided to spend
taken to preserve
left to preserve
stand to bring
decided to seek
trying to induce
proposing to make
decided to run
directed to investigate
expected to pass
expected to make
expected to encounter
hopes to pass
came to pay
expected to receive
understood to follow
wanted to vote
decide

refusing to show
seems to match
decided to rake
entitled to stay
bringing to bear
beginning to get
threaten to use
begin to play
begin to hunt
induced to see
going to hang
continue to test
choose to agree
chosen to find
compelled to conduct
need to realize
used to justify
fail to respond
intends to economize
bound to say
trying to mollify
used to say
came to reassert
choose to call
intend to re-enter
conscripted to enact
continues to feed
tried to integrate
poised to strike
try to force
attempting to reach
afford to take
forbidden to sit
plans to import
likes to imagine
used to get
trying to make
ceased to suggest
going to work
wanting to cut
choose to persuade
trying to keep
like to embark
suited to defeat
hastened to put
like to add
want to preserve
required to participate
happened to save
doing to promote
tempted to quote
continuing to capture
need to communicate
like to see
interested to know
allowed to rust
chose to devote
left to choose
want to own
plan to become
persuaded to res

used to like
offered to ship
hopes to find
invented to hold
learn to like
labored to set
set to receive
entered to compete
seem to make
seemed to answer
decided to use
began to show
Wishing to show
learned to set
forced to fly
hope to break
came to recognize
turning to cup
seems to creep
going to live
got to learn
learn to live
going to live
like to sew
love to run
love to crack
yearn to make
tried to see
love to dust
like to become
decide to write
cause to exist
learn to portray
learn to portray
began to advise
taught to yield
prefer to cope
helps to explain
surprised to bump
seemed to brave
begins to regard
began to embezzle
appears to endorse
expected to like
begin to assert
began to challenge
going to become
helping to make
began to stress
began to describe
fails to gain
liked to play
love to audition
wanted to show
like to see
loved to dance
try to bid
wanted to go
asked to leave
continued to promote
wished to meet
hoping to see
got to know
paused to comfort
hesitate to quote
deci

tried to remedy
tends to express
seem to believe
permitted to return
attempted to make
prepared to demonstrate
calculated to suggest
seemed to disconcert
known to make
going to talk
learns to focus
chooses to subordinate
wish to preserve
cease to exist
seem to constitute
destined to fail
wants to get
began to understand
wanted to capture
liked to tell
decided to migrate
continued to trouble
labored to finish
decided to return
waiting to go
chosen to serve
came to know
helped to escape
opened to admit
happened to see
brought to bear
inclined to argue
seeming to say
prompted to write
come to dominate
used to illustrate
prepared to find
wish to argue
begin to read
plan to discuss
come to call
expect to find
come to believe
continue to pay
tend to thump
determined to prove
learn to control
used to frustrate
trying to assert
trying to expose
comes to regard
felt to indicate
came to speak
needed to explain
required to make
sought to make
accustomed to think
come to look
undertook to give
cam

required to cease
required to operate
designed to operate
permitted to operate
taken to minimize
permitted to operate
permitted to operate
permitted to operate
required to operate
required to afford
elect to use
required to file
required to file
elect to use
required to file
obligated to furnish
trained to read
made to assure
tend to create
rejoicing to remember
permitted to run
came to work
decided to bring
found to permit
helping to strengthen
began to ship
believed to provide
designed to provide
expect to make
developed to facilitate
set to hold
continuing to carry
designed to increase
improved to obtain
purchased to permit
extended to provide
sought to meet
designed to handle
invited to participate
planned to provide
inclined to advance
aims to give
wish to pursue
expected to increase
expected to exceed
begun to make
continues to expand
began to make
need to learn
learn to delegate
working to attain
begun to translate
besieged to serve
help to create
assumed to originate
used to de

decided to strip
made to satisfy
arranged to fit
needed to take
tried to restrict
led to speculate
intended to fill
restored to go
appear to push
began to wash
wish to address
want to meet
appeared to evoke
tend to blunt
try to build
tend to become
presume to lecture
attempting to acquaint
failed to state
like to make
presume to speak
mean to live
made to look
designed to discover
seems to use
used to describe
postulated to explain
used to support
seem to corroborate
wants to hear
comes to represent
used to accompany
seems to symbolize
begins to appear
begins to ramble
help to set
calculated to put
decided to write
seemed to open
combine to create
learned to use
began to take
wanted to tell
wanted to substitute
want to make
come to determine
begun to ebb
intended to incorporate
led to postulate
hope to discover
tended to emphasize
fails to explore
seeks to make
helping to define
trying to avoid
trying to get
made to symbolize
kneels to kiss
serve to travesty
used to equate
altered to s

tried to bring
reaching to release
waiting to report
used to paint
Begin to look
wanted to see
Failing to find
Try to forget
seen to leave
forced to give
inclined to admit
began to make
professed to know
asked to use
leaving to keep
fit to consult
asked to see
wanted to make
continued to discharge
seem to belong
began to flicker
trying to wreck
fit to touch
going to take
trying to clear
want to spend
paused to look
going to allow
like to talk
planning to set
bent to examine
turned to jump
started to retch
going to get
come to recognize
expected to report
failed to see
failed to notify
failed to co-operate
stopping to hear
want to talk
going to cost
wanted to ask
going to get
going to swear
tried to keep
think to look
tried to find
bear to hold
began to pace
tried to tell
intended to scare
began to think
hired to take
going to send
helped to create
wanted to give
led to believe
trying to escape
began to thrash
get to work
come to work
want to see
wanted to get
want to go
managed to swal

gone to purify
waiting to see
come to skirt
trying to make
go to sleep
tried to emulate
began to pulse
amazed to find
kneeling to tie
trying to get
trying to smile
seemed to float
tried to see
stop to analyze
supposed to joke
supposed to handle
want to ask
got to admit
tried to leave
began to walk
supposed to stay
going to tell
going to get
intended to make
began to zip
obliged to roll
want to stop
going to marry
liked to hear
tempted to tell
seemed to mark
tried to explain
managed to look
needed to get
answered to find
afford to get
started to look
takes to get
going to get
tried to quiet
trying to sound
came to meet
seemed to focus
want to talk
want to see
wants to get
went to turn
surprised to find
want to stay
going to make
hoped to dig
trying to make
going to lug
surprised to see
stop to read
intended to move
rising to sting
arranged to live
managed to find
inclined to wobble
supposed to care
shuddered to think
seemed to understand
aroused to go
hate to call
wish to leave
seemed t

In [31]:
# `words` still holds the 'learned'-category tagged words assigned in the earlier
# "often" cell; the cells below depend on that binding.
words[:3]

[('1', 'NUM'), ('.', '.'), ('Introduction', 'NOUN')]

# ConditionalFreqDist

In [32]:
# Built from (word, tag) pairs: the word is the condition and the tag is the counted sample.
cfd1 = nltk.ConditionalFreqDist(words)

In [33]:
# Tag distribution recorded for the word 'yield'.
cfd1['yield']

FreqDist({'VERB': 4, 'NOUN': 3})

In [34]:
# The same counts as (tag, count) pairs, highest count first.
cfd1['yield'].most_common()

[('VERB', 4), ('NOUN', 3)]

In [35]:
# Dict-style view of the counts; note the order differs from most_common().
cfd1['yield'].items()

dict_items([('NOUN', 3), ('VERB', 4)])

In [36]:
# Only the tags seen with 'yield'.
cfd1['yield'].keys()

dict_keys(['NOUN', 'VERB'])

In [37]:
# Only the counts, in the same order as keys().
cfd1['yield'].values()

dict_values([3, 4])

In [39]:
# Spell out the pairs explicitly: each one is (condition=word, sample=tag),
# which gives the same counts for 'yield' as building from `words` directly.
cfd2 = nltk.ConditionalFreqDist(
    (token, pos) for token, pos in words
)
cfd2['yield'].most_common()

[('VERB', 4), ('NOUN', 3)]

In [40]:
# Swap the pair order so the tag becomes the condition and the word the sample,
# then show the five most frequent words tagged VERB.
cfd3 = nltk.ConditionalFreqDist(
    (pos, token) for token, pos in words
)
cfd3['VERB'].most_common(5)

[('is', 2403), ('be', 1363), ('was', 1114), ('are', 991), ('were', 633)]

In [41]:
# Cross-tabulate how often 'yield' and 'cut' occur as VERB versus NOUN.
cfd3.tabulate(
    conditions=['VERB', 'NOUN'],
    samples=['yield', 'cut'],
)

     yield   cut 
VERB     4    17 
NOUN     3     2 
