In [2]:
from collections import defaultdict
import math

def _parse_tagged_tokens(text):
    """Turn one ``word/TAG word/TAG ...`` string into a list of tuples.

    The string is stripped and split on single spaces. Empty tokens (produced
    by repeated spaces or by a blank string) are dropped. Each remaining token
    is split on its LAST "/" so that a word containing "/" keeps it.
    A token with no "/" at all yields a 1-tuple ``(token,)``.
    """
    return [
        tuple(token.rsplit("/", 1))
        for token in text.strip().split(" ")
        if token
    ]


def sent_processing(lines):
    """Parse tagged text into ``(word, tag)`` tuples.

    Args:
        lines: either a list of strings (one sentence per element) or a single
            string (one sentence). Tokens are separated by spaces and written
            as ``word/TAG``.

    Returns:
        For a list: a list of sentences, each a list of ``(word, tag)`` tuples.
        Blank lines are skipped, so the result may be shorter than the input.
        For a str: a single list of ``(word, tag)`` tuples (empty for a blank
        string).

    Raises:
        TypeError: if ``lines`` is neither a list nor a str.
    """
    if isinstance(lines, list):
        parsed = (_parse_tagged_tokens(line) for line in lines)
        # Drop sentences that ended up with no tokens (blank lines).
        return [sent for sent in parsed if sent]

    if isinstance(lines, str):
        return _parse_tagged_tokens(lines)

    # Raise instead of print + exit(): an exception carries a traceback and
    # does not try to shut down the interpreter/kernel.
    raise TypeError(
        "wrong type of input sentence: expected list or str, got %s"
        % type(lines).__name__
    )

    
# Read the raw training text; every element of `lines` is one line of the
# file with its trailing newline still attached.
with open("corpus.txt", mode="r", encoding="utf-8") as f:
    lines = list(f)

# Parse the raw lines into the nested list structure built by sent_processing.
corpus = sent_processing(lines)

In [3]:
def train(corpus):
    """Estimate HMM emission and transition log-probabilities by counting.

    Args:
        corpus: iterable of sentences; each sentence is a sequence of
            ``(word, tag)`` pairs. Empty sentences are skipped because they
            have no first/last tag to attach the BOS/EOS transitions to.

    Returns:
        A pair ``(pos2words_prob, trans_prob)``:

        * ``pos2words_prob[tag][word]`` = log(count(word, tag) / count(tag)),
          a nested ``defaultdict``.
          NOTE: because it is a defaultdict, indexing a missing entry with
          ``[...]`` inserts it and returns 0.0 (log-probability 0); use
          ``.get`` when a missing entry must be detected.
        * ``trans_prob[(prev_tag, tag)]`` =
          log(count(prev_tag, tag) / count(prev_tag as left element)),
          a plain dict that also holds ``('BOS', first_tag)`` and
          ``(last_tag, 'EOS')`` transitions.
    """
    pos2words_freq = defaultdict(lambda: defaultdict(int))  # count of (word, tag)
    trans_freq = defaultdict(int)  # count of tag bigrams (prev_tag, tag)

    # Sentence format: [(word, tag), (word, tag), ..., (word, tag)]
    for sent in corpus:
        if not sent:
            # Nothing to count, and sent[0] / sent[-1] would raise IndexError.
            continue

        tags = [pos for _, pos in sent]  # e.g. [NN, VBD, DT, NN]

        for word, pos in sent:
            pos2words_freq[pos][word] += 1  # frequency of this word under this tag

        for bigram in zip(tags, tags[1:]):
            trans_freq[bigram] += 1

        trans_freq[('BOS', tags[0])] += 1   # sentence-initial transition
        trans_freq[(tags[-1], 'EOS')] += 1  # sentence-final transition

    # --- Emission: p(x|y) = count(x, y) / count(y) ---
    # Total occurrences of every tag, e.g. {'CMC': count(CMC), 'CMP': count(CMP), ...}
    tag_totals = {pos: sum(words.values()) for pos, words in pos2words_freq.items()}

    pos2words_prob = defaultdict(lambda: defaultdict(float))
    for pos, words in pos2words_freq.items():
        for word, count in words.items():
            pos2words_prob[pos][word] = math.log(count / tag_totals[pos])

    # --- Transition: p(y_t | y_(t-1)) = count(y_(t-1), y_t) / count(y_(t-1)) ---
    # The denominator is how often each tag appears as the LEFT element of a
    # bigram (this includes 'BOS').
    prev_tag_totals = defaultdict(int)
    for (pos0, _), count in trans_freq.items():
        prev_tag_totals[pos0] += count

    trans_prob = {
        (pos0, pos1): math.log(count / prev_tag_totals[pos0])
        for (pos0, pos1), count in trans_freq.items()
    }

    return pos2words_prob, trans_prob

In [4]:
# Fit the emission and transition log-probability tables on the loaded corpus.
pos2words, trans = train(corpus)

# Look up the same surface form '라면' under two different tags:
#   'CMC'  - noun reading (as in Shin Ramyun, Jin Ramyun, ...)
#   'fmoc' - connective-ending reading ("if it is ...")
noun_log_prob = pos2words['CMC']['라면']
ending_log_prob = pos2words['fmoc']['라면']

print('명사 라면의 확률:', noun_log_prob)
print('연결어미 라면의 확률:', ending_log_prob)
# The connective-ending reading comes out with the higher log-probability.

명사 라면의 확률: -9.427948631791715
연결어미 라면의 확률: -5.6937321388027


In [5]:
class HMM_tagger(object):
    """Scores an already-tagged sentence with HMM log-probability tables.

    Args:
        pos2words: mapping ``tag -> {word: log p(word | tag)}``.
        trans: mapping ``(prev_tag, tag) -> log p(tag | prev_tag)``; sentence
            boundaries are looked up with the ``'BOS'`` / ``'EOS'`` markers.
        unk: log-probability used for any emission or transition that is
            missing from the tables. Defaults to -15.
    """

    def __init__(self, pos2words, trans, unk=-15):
        self.pos2words = pos2words
        self.trans = trans
        self.unk = unk  # fallback score for events never seen in training
        self.eos = 'EOS'
        self.bos = 'BOS'

    def sent_log_prob(self, sent):
        """Return the length-normalised log-probability of a tagged sentence.

        The score is the sum of all emission log-probabilities, all tag-bigram
        transition log-probabilities, the BOS->first-tag and last-tag->EOS
        transitions, divided by the number of tokens. Missing entries
        contribute ``self.unk``.

        Args:
            sent: non-empty sequence of ``(word, tag)`` pairs.

        Returns:
            The normalised log-probability as a float.

        Raises:
            ValueError: if ``sent`` is empty (there is no first/last tag and
                the length normalisation would divide by zero).
        """
        if not sent:
            raise ValueError("cannot score an empty sentence")

        tags = [tag for _, tag in sent]

        # Emission terms: log p(word | tag), or unk when the pair is unseen.
        # .get() is used on purpose so a defaultdict table is not mutated.
        log_prob = sum(
            self.pos2words.get(tag, {}).get(word, self.unk) for word, tag in sent
        )

        # Transition terms for every adjacent tag pair inside the sentence.
        log_prob += sum(
            self.trans.get(bigram, self.unk) for bigram in zip(tags, tags[1:])
        )

        # Sentence-boundary transitions.
        log_prob += self.trans.get((self.bos, tags[0]), self.unk)
        log_prob += self.trans.get((tags[-1], self.eos), self.unk)

        # Length normalisation so sentences of different length are comparable.
        return log_prob / len(sent)

In [6]:
# Build a scorer from the trained tables and compare two competing analyses
# of the same surface sentence.
tagger = HMM_tagger(pos2words, trans)

test_sent1 = "감기/CMC 는/fjb 줄이/YBD 다/fmof ./g"
test_sent2 = "감기/fmotg 는/fjb 줄/CMC 이다/fjj ./g"

for test_sent in (test_sent1, test_sent2):
    score = tagger.sent_log_prob(sent_processing(test_sent))
    print("%s: %f" % (test_sent, score))

감기/CMC 는/fjb 줄이/YBD 다/fmof ./g: -5.489636
감기/fmotg 는/fjb 줄/CMC 이다/fjj ./g: -14.037157
