# 4. 품사 태깅
## 3) n-gram 접근법과 관련된 통계 모델링

UnigramTagger 학습을 수행하는 코드

In [1]:
import nltk

In [2]:
from nltk.tag import UnigramTagger

In [3]:
from nltk.corpus import treebank

In [6]:
# Training data: the tagged treebank sentences up to index 7000.
training = treebank.tagged_sents()[:7000]

In [7]:
# Build (train) a unigram tagger from the tagged training sentences.
unitagger = UnigramTagger(training)

In [8]:
# Display the first corpus sentence as a list of untagged tokens.
treebank.sents()[0]

['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.']

In [9]:
# Tag the first sentence with the trained unigram tagger; yields (token, tag) pairs.
unitagger.tag(treebank.sents()[0])

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]

UnigramTagger를 평가하기 위해 정확도를 계산하는 코드

In [10]:
import nltk

In [11]:
from nltk.corpus import treebank

In [12]:
from nltk.tag import UnigramTagger

In [13]:
# Training data: the tagged treebank sentences up to index 7000.
training = treebank.tagged_sents()[:7000]

In [14]:
# Train the unigram tagger on the training slice.
unitagger = UnigramTagger(training)

In [15]:
# Test data: every tagged sentence from index 2000 onward.
# NOTE(review): this slice overlaps the training slice ([:7000]) from sentence
# 2000 onward, so the accuracy below is measured on sentences the tagger was
# trained on. Consider disjoint slices for a meaningful score.
testing = treebank.tagged_sents()[2000:]

In [16]:
# Score the tagger's predictions against the gold tags in `testing`.
unitagger.evaluate(testing)

0.9619024159944167

UnigramTagger를 사용한 태깅

In [17]:
import nltk

In [18]:
from nltk.corpus import treebank

In [19]:
from nltk.tag import UnigramTagger

In [20]:
# Build a unigram tagger from an explicit word -> tag mapping instead of training data.
unitag = UnigramTagger(model={'Vinken':'NN'})

In [21]:
# Only 'Vinken' is in the model, so every other token comes back tagged None.
unitag.tag(treebank.sents()[0])

[('Pierre', None),
 ('Vinken', 'NN'),
 (',', None),
 ('61', None),
 ('years', None),
 ('old', None),
 (',', None),
 ('will', None),
 ('join', None),
 ('the', None),
 ('board', None),
 ('as', None),
 ('a', None),
 ('nonexecutive', None),
 ('director', None),
 ('Nov.', None),
 ('29', None),
 ('.', None)]

UnigramTagger는 ContextTagger를 상속하며, ContextTagger는 주어진 문맥에서 각 태그가 나타난 빈도를 사용해 가장 가능성이 높은 태그를 결정함.<br>
최소 임계 빈도를 적용하려면 cutoff 인자에 원하는 값을 전달하면 됨

In [22]:
# Retrain the unigram tagger with a minimum frequency threshold (cutoff) of 5.
# Relies on `training` defined in an earlier cell.
unitagger = UnigramTagger(training, cutoff=5)

In [24]:
# Re-score with the cutoff applied; relies on `testing` from an earlier cell.
unitagger.evaluate(testing)

0.7972986842375351

* 백오프 태깅 : 어떤 태거가 토큰을 태그할 수 없는 경우, 그 토큰이 다음 태거로 전달될 수 있도록 태거들을 서로 연결(체인)하는 방식

아래의 코드에서는 UnigramTagger와 DefaultTagger가 토큰을 태그하는 데 사용됨. UnigramTagger가 단어를 태그할 수 없는 경우, 백오프로 지정된 DefaultTagger가 대신 태그('NN')를 지정함

In [26]:
import nltk

In [27]:
from nltk.tag import UnigramTagger, DefaultTagger

In [28]:
from nltk.corpus import treebank

In [29]:
# Test data: every tagged sentence from index 2000 onward.
testing = treebank.tagged_sents()[2000:]

In [30]:
# Training data: the tagged sentences up to index 7000 (overlaps `testing`).
training = treebank.tagged_sents()[:7000]

In [31]:
# Last-resort tagger: assigns 'NN' to every token.
tag1 = DefaultTagger('NN')

In [32]:
# Unigram tagger that hands tokens it cannot tag to tag1.
tag2 = UnigramTagger(training, backoff=tag1)

In [34]:
# Score the unigram -> default backoff chain.
tag2.evaluate(testing)

0.9619024159944167

BigramTagger의 구현

In [35]:
import nltk

In [36]:
from nltk.tag import BigramTagger

In [37]:
from nltk.corpus import treebank

In [38]:
# Training data: the tagged treebank sentences up to index 7000.
training_1 = treebank.tagged_sents()[:7000]

In [39]:
# Train a bigram tagger on the training sentences.
bigramtagger = BigramTagger(training_1)

In [40]:
# Display the first corpus sentence as a list of untagged tokens.
treebank.sents()[0]

['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.']

In [41]:
# Tag the first sentence with the trained bigram tagger.
bigramtagger.tag(treebank.sents()[0])

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]

In [42]:
# Test data: every tagged sentence from index 2000 onward (overlaps training_1).
testing_1 = treebank.tagged_sents()[2000:]

In [43]:
# Score the bigram tagger against the gold tags.
bigramtagger.evaluate(testing_1)

0.9171131227292321

BigramTagger와 TrigramTagger

In [44]:
import nltk

In [45]:
from nltk.tag import BigramTagger, TrigramTagger

In [46]:
from nltk.corpus import treebank

In [47]:
# Test data: every tagged sentence from index 2000 onward.
testing = treebank.tagged_sents()[2000:]

In [48]:
# Training data: the tagged sentences up to index 7000 (overlaps `testing`).
training = treebank.tagged_sents()[:7000]

In [49]:
# Train a bigram tagger (no backoff tagger supplied).
bigramtag = BigramTagger(training)

In [50]:
# Score the bigram tagger.
bigramtag.evaluate(testing)

0.9171131227292321

In [51]:
# Train a trigram tagger on the same training data (no backoff tagger supplied).
trigramtag = TrigramTagger(training)

In [52]:
# Score the trigram tagger on the same test data for comparison with the bigram result.
trigramtag.evaluate(testing)

0.9022107272615308

QuadgramTagger : NgramTagger에 n=4를 지정하여 만드는 4-gram 태거

In [53]:
import nltk

In [54]:
from nltk.corpus import treebank

In [55]:
from nltk import NgramTagger

In [56]:
# Test data: every tagged sentence from index 2000 onward.
testing = treebank.tagged_sents()[2000:]

In [57]:
# Training data: the tagged sentences up to index 7000 (overlaps `testing`).
training = treebank.tagged_sents()[:7000]

In [58]:
# Generic n-gram tagger with n=4, i.e. a quadgram tagger.
quadgramtag = NgramTagger(4, training)

In [59]:
# Score the quadgram tagger.
quadgramtag.evaluate(testing)

0.9304554878173943

AffixTagger도 ContextTagger의 일종으로, 단어의 접두사 혹은 접미사를 문맥 정보로 사용함 (affix_length가 양수이면 접두사, 음수이면 접미사)

In [60]:
import nltk

In [61]:
from nltk.corpus import treebank

In [62]:
from nltk.tag import AffixTagger

In [63]:
# Test data: every tagged sentence from index 2000 onward.
testing = treebank.tagged_sents()[2000:]

In [64]:
# Training data: the tagged sentences up to index 7000 (overlaps `testing`).
training = treebank.tagged_sents()[:7000]

In [65]:
# Affix tagger with library defaults; no affix_length is given here.
# NOTE(review): presumably the default is a 3-character suffix (affix_length=-3);
# verify against the NLTK documentation.
affixtag = AffixTagger(training)

In [66]:
# Score the default affix tagger.
affixtag.evaluate(testing)

0.2902682841718497

4개 문자 접두사의 사용을 학습하는 코드

In [67]:
import nltk

In [68]:
from nltk.corpus import treebank

In [69]:
from nltk.tag import AffixTagger

In [70]:
# Test data: every tagged sentence from index 2000 onward.
testing = treebank.tagged_sents()[2000:]

In [71]:
# Training data: the tagged sentences up to index 7000 (overlaps `testing`).
training = treebank.tagged_sents()[:7000]

In [72]:
# Positive affix_length: learn tags from the first 4 characters (prefix) of each word.
prefixtag = AffixTagger(training, affix_length=4)

In [73]:
# Score the 4-character prefix tagger.
prefixtag.evaluate(testing)

0.2094751318841472

3개 문자 접미사의 사용을 학습하는 코드

In [74]:
import nltk

In [75]:
from nltk.tag import AffixTagger

In [76]:
from nltk.corpus import treebank

In [77]:
testing = treebank.tagged_sents()[2000:]

In [78]:
training = treebank.tagged_sents()[:7000]

In [79]:
suffixtag = AffixTagger(training, affix_length=3)

In [80]:
suffixtag.evaluate(testing)

0.25699447831352507

백오프 체인에서 많은 접사 태거를 결합하는 코드

In [81]:
import nltk

In [82]:
from nltk.tag import AffixTagger

In [83]:
from nltk.corpus import treebank

In [84]:
# Test data: every tagged sentence from index 2000 onward.
testing = treebank.tagged_sents()[2000:]

In [85]:
# Training data: the tagged sentences up to index 7000 (overlaps `testing`).
training = treebank.tagged_sents()[:7000]

In [86]:
# End of the backoff chain: 4-character prefix tagger with no backoff of its own.
prefixtagger = AffixTagger(training, affix_length=4)

In [87]:
# Score the 4-character prefix tagger on its own.
prefixtagger.evaluate(testing)

0.2094751318841472

In [88]:
# 3-character prefix tagger that falls back to the 4-character prefix tagger.
prefixtagger3 = AffixTagger(training, affix_length=3, backoff=prefixtagger)

In [89]:
# Score the two-stage prefix chain.
prefixtagger3.evaluate(testing)

0.25841082168442225

In [90]:
suffixtagger3 = AffixTagger(training, affix_length=3, backoff=prefixtagger3)

In [91]:
suffixtagger3.evaluate(testing)

0.25841082168442225

In [92]:
suffixtagger4 = AffixTagger(training, affix_length=4, backoff=suffixtagger3)

In [93]:
suffixtagger4.evaluate(testing)

0.29386045938789335

TnT(Trigrams'n'Tags) : 2차 마르코프 모델을 기반으로 하는 통계 기반 태거

In [94]:
import nltk

In [95]:
from nltk.tag import tnt

In [96]:
from nltk.corpus import treebank

In [97]:
teseting = treebank.tagged_sents()[2000:]

In [98]:
training = treebank.tagged_sents()[:7000]

In [99]:
tnt_tagger = tnt.TnT()

In [100]:
tnt_tagger.train(training)

In [101]:
tnt_tagger.evaluate(testing)

0.9882176652913768

TnT는 트레이닝 텍스트에서 ConditionalFreqDist 및 내부 FreqDist 인스턴스를 계산함.
이 인스턴스들은 유니그램, 바이그램 및 트라이그램의 빈도를 계산하는 데 사용됨.

아래의 코드는 미등록 단어(unknown word)를 처리할 태거로 DefaultTagger를 명시적으로 제공하는 예. 이 태거는 별도의 학습이 필요 없으므로 Trained를 True로 설정함

In [102]:
import nltk

In [103]:
from nltk.tag import DefaultTagger

In [104]:
from nltk.tag import tnt

In [105]:
from nltk.corpus import treebank

In [106]:
# Test data: every tagged sentence from index 2000 onward.
testing = treebank.tagged_sents()[2000:]

In [107]:
# Training data: the tagged sentences up to index 7000 (overlaps `testing`).
training = treebank.tagged_sents()[:7000]

In [108]:
# Create an untrained TnT tagger with default settings (no unknown-word tagger).
tnt_tagger = tnt.TnT()

In [109]:
# Tagger that assigns 'NN' to every token; intended as TnT's unknown-word handler.
unknown = DefaultTagger('NN')

In [110]:
tagger_tnt = tnt.TnT(unk=unknown, Trained=True)

In [111]:
tnt_tagger.train(training)

In [112]:
tnt_tagger.evaluate(testing)

0.9882176652913768