# Unigram Tagger

In [1]:
import nltk
from nltk.corpus import brown

brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)

In [2]:
print(unigram_tagger.tag(brown_sents[2007]))

[('Various', 'JJ'), ('of', 'IN'), ('the', 'AT'), ('apartments', 'NNS'), ('are', 'BER'), ('of', 'IN'), ('the', 'AT'), ('terrace', 'NN'), ('type', 'NN'), (',', ','), ('being', 'BEG'), ('on', 'IN'), ('the', 'AT'), ('ground', 'NN'), ('floor', 'NN'), ('so', 'QL'), ('that', 'CS'), ('entrance', 'NN'), ('is', 'BEZ'), ('direct', 'JJ'), ('.', '.')]


In [3]:
unigram_tagger.evaluate(brown_tagged_sents)

0.9349006503968017

# training data vs test data

In [4]:
size = int(len(brown_tagged_sents) * 0.9)
size

4160

In [5]:
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]

In [6]:
unigram_tagger = nltk.UnigramTagger(train_sents)
unigram_tagger.evaluate(test_sents)

0.8121200039868434

# N-gram Tagger

In [7]:
unseen_sent = brown_sents[4203]
print(unigram_tagger.tag(unseen_sent))

[('The', 'AT'), ('population', 'NN'), ('of', 'IN'), ('the', 'AT'), ('Congo', 'NP'), ('is', 'BEZ'), ('13.5', None), ('million', 'CD'), (',', ','), ('divided', 'VBN'), ('into', 'IN'), ('at', 'IN'), ('least', 'AP'), ('seven', 'CD'), ('major', 'JJ'), ('``', '``'), ('culture', None), ('clusters', 'NNS'), ("''", "''"), ('and', 'CC'), ('innumerable', None), ('tribes', None), ('speaking', 'VBG'), ('400', 'CD'), ('separate', 'JJ'), ('dialects', None), ('.', '.')]


In [8]:
bigram_tagger = nltk.BigramTagger(train_sents)
print(bigram_tagger.tag(unseen_sent))

[('The', 'AT'), ('population', 'NN'), ('of', 'IN'), ('the', 'AT'), ('Congo', 'NP'), ('is', 'BEZ'), ('13.5', None), ('million', None), (',', None), ('divided', None), ('into', None), ('at', None), ('least', None), ('seven', None), ('major', None), ('``', None), ('culture', None), ('clusters', None), ("''", None), ('and', None), ('innumerable', None), ('tribes', None), ('speaking', None), ('400', None), ('separate', None), ('dialects', None), ('.', None)]


In [9]:
trigram_tagger = nltk.TrigramTagger(train_sents)
print(trigram_tagger.tag(unseen_sent))

[('The', 'AT'), ('population', None), ('of', None), ('the', None), ('Congo', None), ('is', None), ('13.5', None), ('million', None), (',', None), ('divided', None), ('into', None), ('at', None), ('least', None), ('seven', None), ('major', None), ('``', None), ('culture', None), ('clusters', None), ("''", None), ('and', None), ('innumerable', None), ('tribes', None), ('speaking', None), ('400', None), ('separate', None), ('dialects', None), ('.', None)]


In [10]:
unigram_tagger.evaluate(test_sents)

0.8121200039868434

In [11]:
bigram_tagger.evaluate(test_sents)

0.10206319146815508

In [12]:
trigram_tagger.evaluate(test_sents)

0.0626931127279976

# Combining Tagger

In [13]:
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents,backoff=t0)
t2 = nltk.BigramTagger(train_sents,backoff=t1)
t2.evaluate(test_sents)

0.8452108043456593

In [14]:
unigram_tagger.evaluate(test_sents)

0.8121200039868434

In [15]:
t3 = nltk.TrigramTagger(train_sents,backoff=t2)
t3.evaluate(test_sents)

0.843317053722715

In [16]:
t3 = nltk.TrigramTagger(train_sents,backoff=t2,cutoff=1)
t3.evaluate(test_sents)

0.8437157380643875

In [17]:
t3 = nltk.TrigramTagger(train_sents,backoff=t2,cutoff=2)
t3.evaluate(test_sents)

0.84411442240606

In [18]:
brown_tagged_sents = brown.tagged_sents()
size = int(len(brown_tagged_sents)*0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents,backoff=t0)
t2 = nltk.BigramTagger(train_sents,backoff=t1)
t3 = nltk.TrigramTagger(train_sents,backoff=t2)
t3.evaluate(test_sents)

0.9130466670857693

In [19]:
t2.evaluate(test_sents)

0.9125751765470128

In [20]:
t1.evaluate(test_sents)

0.8912742817627459

In [21]:
len(train_sents)

51606

In [22]:
t3 = nltk.TrigramTagger(train_sents,backoff=t2,cutoff=1)
t3.evaluate(test_sents)

0.9136648435699168

In [23]:
t3 = nltk.TrigramTagger(train_sents,backoff=t2,cutoff=2)
t3.evaluate(test_sents)

0.913780096812724

In [24]:
t3 = nltk.TrigramTagger(train_sents,backoff=t2,cutoff=3)
t3.evaluate(test_sents)

0.9137172314075565

In [25]:
t3 = nltk.TrigramTagger(train_sents,backoff=t2,cutoff=4)
t3.evaluate(test_sents)

0.9136543660023889

# 태거 저장하기 & 불러오기

In [26]:
import pickle

In [27]:
pickle.dump(t3,open('t3.pkl','wb'))
my_tagger = pickle.load(open('t3.pkl','rb'))
my_tagger.evaluate(test_sents)

0.9136543660023889

In [28]:
text = """The board's action shows what free enterprise
is up against in our complex maze of regulatory laws."""
tokens = nltk.word_tokenize(text)

In [29]:
print(my_tagger.tag(tokens))

[('The', 'AT'), ('board', 'NN'), ("'s", 'NN'), ('action', 'NN'), ('shows', 'VBZ'), ('what', 'WDT'), ('free', 'JJ'), ('enterprise', 'NN'), ('is', 'BEZ'), ('up', 'RP'), ('against', 'IN'), ('in', 'IN'), ('our', 'PP$'), ('complex', 'JJ'), ('maze', 'NN'), ('of', 'IN'), ('regulatory', 'JJ'), ('laws', 'NNS'), ('.', '.')]


# 사전 만들기

In [30]:
print(my_tagger.tag(tokens))

[('The', 'AT'), ('board', 'NN'), ("'s", 'NN'), ('action', 'NN'), ('shows', 'VBZ'), ('what', 'WDT'), ('free', 'JJ'), ('enterprise', 'NN'), ('is', 'BEZ'), ('up', 'RP'), ('against', 'IN'), ('in', 'IN'), ('our', 'PP$'), ('complex', 'JJ'), ('maze', 'NN'), ('of', 'IN'), ('regulatory', 'JJ'), ('laws', 'NNS'), ('.', '.')]


In [31]:
pos = my_tagger.tag(tokens)

In [32]:
print(pos)

[('The', 'AT'), ('board', 'NN'), ("'s", 'NN'), ('action', 'NN'), ('shows', 'VBZ'), ('what', 'WDT'), ('free', 'JJ'), ('enterprise', 'NN'), ('is', 'BEZ'), ('up', 'RP'), ('against', 'IN'), ('in', 'IN'), ('our', 'PP$'), ('complex', 'JJ'), ('maze', 'NN'), ('of', 'IN'), ('regulatory', 'JJ'), ('laws', 'NNS'), ('.', '.')]


In [33]:
print(dict(pos))

{'The': 'AT', 'board': 'NN', "'s": 'NN', 'action': 'NN', 'shows': 'VBZ', 'what': 'WDT', 'free': 'JJ', 'enterprise': 'NN', 'is': 'BEZ', 'up': 'RP', 'against': 'IN', 'in': 'IN', 'our': 'PP$', 'complex': 'JJ', 'maze': 'NN', 'of': 'IN', 'regulatory': 'JJ', 'laws': 'NNS', '.': '.'}


# default dictionary

In [34]:
from collections import defaultdict
a = defaultdict(int)
a

defaultdict(int, {})

In [35]:
a['idea']

0

In [36]:
a

defaultdict(int, {'idea': 0})

In [37]:
a['ideas']

0

In [38]:
a

defaultdict(int, {'idea': 0, 'ideas': 0})

In [39]:
a['colorless'] = 2
a

defaultdict(int, {'idea': 0, 'ideas': 0, 'colorless': 2})

In [40]:
b = defaultdict(list)
b

defaultdict(list, {})

In [41]:
b['ideas']

[]

In [42]:
b

defaultdict(list, {'ideas': []})

In [43]:
b['sleep'] = ['NOUN','VERB']
b

defaultdict(list, {'ideas': [], 'sleep': ['NOUN', 'VERB']})

In [44]:
b['ideas'].append('NOUN')

In [45]:
b

defaultdict(list, {'ideas': ['NOUN'], 'sleep': ['NOUN', 'VERB']})

In [46]:
b.items()

dict_items([('ideas', ['NOUN']), ('sleep', ['NOUN', 'VERB'])])

In [47]:
pos = defaultdict(lambda: 'NOUN')
pos

defaultdict(<function __main__.<lambda>()>, {})

In [48]:
pos['blog']

'NOUN'

In [49]:
pos

defaultdict(<function __main__.<lambda>()>, {'blog': 'NOUN'})

In [50]:
pos.update(b)
pos

defaultdict(<function __main__.<lambda>()>,
            {'blog': 'NOUN', 'ideas': ['NOUN'], 'sleep': ['NOUN', 'VERB']})

# POS dictionary 만들기

In [51]:
from collections import defaultdict
from nltk.corpus import brown
corpus = brown.tagged_words(categories='news',tagset='universal')
v1000 = [wt for (wt,_) in nltk.FreqDist(corpus).most_common(1000)]
v1000[:5]

[('the', 'DET'), (',', '.'), ('.', '.'), ('of', 'ADP'), ('and', 'CONJ')]

In [52]:
pos = defaultdict(lambda: 'NOUN')
pos.update(dict(v1000))
type(pos)

collections.defaultdict

In [53]:
pos

defaultdict(<function __main__.<lambda>()>,
            {'the': 'DET',
             ',': '.',
             '.': '.',
             'of': 'ADP',
             'and': 'CONJ',
             'a': 'DET',
             'in': 'PRT',
             'to': 'ADP',
             'for': 'ADP',
             'The': 'DET',
             '``': '.',
             'is': 'VERB',
             'was': 'VERB',
             "''": '.',
             'on': 'PRT',
             'at': 'ADP',
             'that': 'DET',
             'with': 'ADP',
             'be': 'VERB',
             'by': 'ADP',
             'he': 'PRON',
             'as': 'ADV',
             'said': 'VERB',
             'his': 'DET',
             'will': 'VERB',
             'it': 'PRON',
             'from': 'ADP',
             'are': 'VERB',
             ';': '.',
             'an': 'DET',
             'has': 'VERB',
             '--': '.',
             'had': 'VERB',
             'who': 'PRON',
             'have': 'VERB',
             'not': 'ADV',


# 간단한 태거 만들기

In [54]:
test = [(v,pos[v]) for v in brown.words(categories='news')]
print(test[:30])

[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'NOUN'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'NOUN'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'DET'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.'), ('The', 'DET'), ('jury', 'NOUN'), ('further', 'NOUN'), ('said', 'VERB'), ('in', 'PRT')]
