In [1]:
import nltk

In [2]:
from nltk.corpus import brown

# Tally how often every 1-, 2- and 3-character word ending occurs across the
# whole Brown corpus (case-insensitively), then keep the 100 most frequent
# endings as the suffix vocabulary used for feature extraction.
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    # A suffix of length k is only counted when the word has at least k
    # characters, so short words are not counted twice under the same key.
    for length in range(1, min(len(word), 3) + 1):
        suffix_fdist[word[-length:]] += 1
common_suffixes = [suffix for suffix, _ in suffix_fdist.most_common(100)]
common_suffixes[:10]

['e', 's', 'd', 't', 'n', 'he', 'the', 'y', ',', 'r']

In [3]:
def pos_features(word, suffixes=None):
    """Build suffix-membership features for a single word.

    Args:
        word: The token to describe.
        suffixes: Optional iterable of suffix strings to test against. When
            omitted, the notebook-level ``common_suffixes`` list is used.

    Returns:
        A dict mapping ``'endswith(<suffix>)'`` to a bool that is True when
        the lower-cased word ends with that suffix.
    """
    if suffixes is None:
        suffixes = common_suffixes
    # Lower-case once up front: the result does not depend on which suffix is
    # being tested, so there is no need to recompute it per suffix.
    lowered = word.lower()
    return {f'endswith({suffix})': lowered.endswith(suffix) for suffix in suffixes}

import random

# Load every (word, tag) pair from the 'news' category of the Brown corpus.
tagged_words = list(brown.tagged_words(categories='news'))
# Shuffle with a dedicated fixed-seed generator so the train/test split (and
# therefore the reported accuracies) is the same on every Restart & Run All,
# without touching the global `random` state used elsewhere.
random.Random(0).shuffle(tagged_words)
tagged_words[:3]

[('buss', 'NN'), ('and', 'CC'), ('Joseph', 'NP')]

In [4]:
# Turn every tagged word into a (feature dict, tag) pair.
featuresets = [(pos_features(word), tag) for word, tag in tagged_words]

# Hold out the first 10% of the pairs for testing; train on the rest.
size = int(len(featuresets) * 0.1)
test_set = featuresets[:size]
train_set = featuresets[size:]

classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.5262058677274988

In [5]:
# Print the 15 most informative features of the trained Naive Bayes classifier.
classifier.show_most_informative_features(15)

Most Informative Features
             endswith(.) = True                . : NN     =   7115.5 : 1.0
            endswith(of) = True            IN-TL : NN     =   6278.4 : 1.0
            endswith(es) = True              DOZ : IN     =   6214.6 : 1.0
            endswith(he) = True            AT-TL : NN     =   4263.8 : 1.0
           endswith(are) = True              BER : NP     =   4108.2 : 1.0
            endswith(ot) = True                * : NP     =   4106.5 : 1.0
           endswith(ere) = True              BED : NP     =   4105.9 : 1.0
           endswith(ave) = True               HV : NN     =   3377.1 : 1.0
             endswith(a) = True            NN-NC : IN     =   3335.1 : 1.0
            endswith(th) = True              ABX : NNS    =   3032.8 : 1.0
             endswith(h) = True              ABX : NNS    =   3032.8 : 1.0
           endswith(hat) = True               CS : NN     =   2775.8 : 1.0
           endswith(uld) = True            MD-HL : NP     =   2400.6 : 1.0

In [7]:
# Train a decision tree on the current training split and report its accuracy
# on the held-out test split.
classifier1 = nltk.DecisionTreeClassifier.train(train_set)
nltk.classify.accuracy(classifier1,test_set)

0.5952262555942317

In [8]:
# Print the learned decision tree as nested if-statements, limited to depth 1.
print(classifier1.pseudocode(depth=1))

if endswith(the) == False: return 'NN'
if endswith(the) == True: return 'AT'



In [9]:
# Print the learned decision tree as nested if-statements, limited to depth 2.
print(classifier1.pseudocode(depth=2))

if endswith(the) == False: 
  if endswith(,) == False: return 'NN'
  if endswith(,) == True: return ','
if endswith(the) == True: return 'AT'



In [10]:
# Print the learned decision tree as nested if-statements, limited to depth 3.
print(classifier1.pseudocode(depth=3))

if endswith(the) == False: 
  if endswith(,) == False: 
    if endswith(s) == False: return 'NN'
    if endswith(s) == True: return 'NNS'
  if endswith(,) == True: return ','
if endswith(the) == True: return 'AT'



In [11]:
# Print the learned decision tree as nested if-statements, limited to depth 6.
print(classifier1.pseudocode(depth=6))

if endswith(the) == False: 
  if endswith(,) == False: 
    if endswith(s) == False: 
      if endswith(.) == False: 
        if endswith(of) == False: 
          if endswith(and) == False: return 'NN'
          if endswith(and) == True: return 'CC'
        if endswith(of) == True: return 'IN'
      if endswith(.) == True: return '.'
    if endswith(s) == True: 
      if endswith(is) == False: 
        if endswith(was) == False: 
          if endswith(as) == False: return 'NNS'
          if endswith(as) == True: return 'CS'
        if endswith(was) == True: return 'BEDZ'
      if endswith(is) == True: 
        if endswith(his) == False: return 'BEZ'
        if endswith(his) == True: return 'PP$'
  if endswith(,) == True: return ','
if endswith(the) == True: return 'AT'



# 맥락을 이용한 POS tagging

In [12]:
def pos_features(sentence, i):
    """Describe the token at position ``i`` of ``sentence`` using its context.

    Note: this reuses the name ``pos_features`` and therefore replaces the
    single-word version defined earlier in the notebook.

    Args:
        sentence: A sequence of token strings.
        i: Index of the token to describe.

    Returns:
        A dict with the token's last one, two and three characters under
        ``'suffix(1)'``, ``'suffix(2)'`` and ``'suffix(3)'`` (case is kept
        as-is), plus ``'prev-word'``: the preceding token, or ``'<START>'``
        when ``i`` is 0.
    """
    token = sentence[i]
    features = {f'suffix({length})': token[-length:] for length in (1, 2, 3)}
    features['prev-word'] = '<START>' if i == 0 else sentence[i - 1]
    return features

In [13]:
# Show the first sentence of the Brown corpus as a list of tokens.
print(brown.sents()[0])

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']


In [14]:
# Features for the first token of the first Brown sentence; there is no
# previous word, so 'prev-word' is '<START>'.
print(pos_features(brown.sents()[0],0))

{'suffix(1)': 'e', 'suffix(2)': 'he', 'suffix(3)': 'The', 'prev-word': '<START>'}


In [15]:
# Features for the token at index 1 of the first Brown sentence.
print(pos_features(brown.sents()[0],1))

{'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ton', 'prev-word': 'The'}


In [16]:
# Features for the token at index 2 of the first Brown sentence.
print(pos_features(brown.sents()[0],2))

{'suffix(1)': 'y', 'suffix(2)': 'ty', 'suffix(3)': 'nty', 'prev-word': 'Fulton'}


In [17]:
# Features for the token at index 3 of the first Brown sentence.
print(pos_features(brown.sents()[0],3))

{'suffix(1)': 'd', 'suffix(2)': 'nd', 'suffix(3)': 'and', 'prev-word': 'County'}


In [18]:
import random

# Load every tagged sentence from the 'news' category of the Brown corpus.
tagged_sents = list(brown.tagged_sents(categories='news'))
# Shuffle with a dedicated fixed-seed generator so the train/test split (and
# therefore the reported accuracies) is the same on every Restart & Run All,
# without touching the global `random` state used elsewhere.
random.Random(0).shuffle(tagged_sents)
print(tagged_sents[:1])

[[('For', 'IN-HL'), ('crucial', 'JJ-HL'), ('encounter', 'NN-HL')]]


In [19]:
# Build one (context features, tag) pair per token, sentence by sentence.
featuresets = []
for tagged_sent in tagged_sents:
    # pos_features needs the bare tokens, so strip the tags first.
    untagged_sent = nltk.tag.untag(tagged_sent)
    featuresets.extend(
        (pos_features(untagged_sent, position), tag)
        for position, (_, tag) in enumerate(tagged_sent)
    )

In [21]:
# Inspect the first three (features, tag) pairs.
print(featuresets[:3])

[({'suffix(1)': 'r', 'suffix(2)': 'or', 'suffix(3)': 'For', 'prev-word': '<START>'}, 'IN-HL'), ({'suffix(1)': 'l', 'suffix(2)': 'al', 'suffix(3)': 'ial', 'prev-word': 'For'}, 'JJ-HL'), ({'suffix(1)': 'r', 'suffix(2)': 'er', 'suffix(3)': 'ter', 'prev-word': 'crucial'}, 'NN-HL')]


In [22]:
# Hold out the first 10% of the pairs for testing; train on the rest.
size = int(len(featuresets) * 0.1)
test_set = featuresets[:size]
train_set = featuresets[size:]

classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.7795126802585778

In [23]:
# Train a decision tree on the current training split and report its accuracy
# on the held-out test split.
classifier1 = nltk.DecisionTreeClassifier.train(train_set)
nltk.classify.accuracy(classifier1,test_set)

0.7694679264047738

# 함수 untag, 함수 enumerate

In [24]:
# Take the first tagged sentence of the Brown corpus and show what
# nltk.tag.untag returns for it: the tokens with their tags removed.
temp = brown.tagged_sents()[0]
print(nltk.tag.untag(temp))

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']


In [25]:
# enumerate pairs each (word, tag) item with its position in the sentence.
print(list(enumerate(temp)))

[(0, ('The', 'AT')), (1, ('Fulton', 'NP-TL')), (2, ('County', 'NN-TL')), (3, ('Grand', 'JJ-TL')), (4, ('Jury', 'NN-TL')), (5, ('said', 'VBD')), (6, ('Friday', 'NR')), (7, ('an', 'AT')), (8, ('investigation', 'NN')), (9, ('of', 'IN')), (10, ("Atlanta's", 'NP$')), (11, ('recent', 'JJ')), (12, ('primary', 'NN')), (13, ('election', 'NN')), (14, ('produced', 'VBD')), (15, ('``', '``')), (16, ('no', 'AT')), (17, ('evidence', 'NN')), (18, ("''", "''")), (19, ('that', 'CS')), (20, ('any', 'DTI')), (21, ('irregularities', 'NNS')), (22, ('took', 'VBD')), (23, ('place', 'NN')), (24, ('.', '.'))]
