# 텍스트 분류하기3(한국어 버전)

In [1]:
# 필요한 모듈 불러오기 # 에러나면 pip install nltk 입력
from nltk.tokenize import word_tokenize
import nltk

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SH501\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [29]:
train = [('사과가 좋아','pos'),
        ('밤에 먹는 사과는 비추야','neg'),
        ('사과가 잘 익었어 맛있겠다','pos')]
train

[('사과가 좋아', 'pos'), ('밤에 먹는 사과는 비추야', 'neg'), ('사과가 잘 익었어 맛있겠다', 'pos')]

In [30]:
# train에 있는 문장들의 단어 set 만들기
# set()을 이용하여 집합으로 만들기
all_words = set()

for tup in train:
    sent, label = tup[0], tup[1]  #ex) sent = 'l like you', label = 'pos'
    words = word_tokenize(sent)
    for word in words:
        all_words.add(word)

print(all_words)

{'먹는', '비추야', '사과가', '잘', '밤에', '좋아', '사과는', '익었어', '맛있겠다'}


In [7]:
train_features = []

for tup in train:
    sent, label = tup[0], tup[1]
    words = word_tokenize(sent)
    tmp = {set_word: (set_word in words) for set_word in all_words} # set_word가 words에 있는지 여부 판별
    sent_tup = (tmp, label)
    train_features.append(sent_tup)
    
for i in range(len(train_features)):
    print(train_features[i])

({'먹는': False, '비추야': False, '사과가': True, '잘': False, '밤에': False, '좋아': True, '사과는': False, '익었어': False, '맛있겠다': False}, 'pos')
({'먹는': True, '비추야': True, '사과가': False, '잘': False, '밤에': True, '좋아': False, '사과는': True, '익었어': False, '맛있겠다': False}, 'neg')
({'먹는': False, '비추야': False, '사과가': True, '잘': True, '밤에': False, '좋아': False, '사과는': False, '익었어': True, '맛있겠다': True}, 'pos')


In [8]:
classifier = nltk.NaiveBayesClassifier.train(train_features)
classifier.show_most_informative_features()

Most Informative Features
                    맛있겠다 = False             neg : pos    =      1.5 : 1.0
                     익었어 = False             neg : pos    =      1.5 : 1.0
                       잘 = False             neg : pos    =      1.5 : 1.0
                      좋아 = False             neg : pos    =      1.5 : 1.0


In [9]:
test_sent = '사과는 맛있어'

words = word_tokenize(test_sent)
test_feature = {set_word: (set_word in words) for set_word in all_words}

print(test_feature)

{'먹는': False, '비추야': False, '사과가': False, '잘': False, '밤에': False, '좋아': False, '사과는': True, '익었어': False, '맛있겠다': False}


In [16]:
classifier.classify(test_feature)

'neg'

## 형태소 분석기 적용

In [20]:
from konlpy.tag import Okt
okt = Okt()

In [21]:
def pos_tokenize(raw_sent):
    pos_sent = []
    
    #raw_sent: 사과가 좋아
    sent = okt.pos(raw_sent, norm=True, stem=True)
    #sent: [('사과', 'Noun'),('가','Josa'),('좋다','Adjective')]
    
    for tup in sent: 
        word, tag = tup[0], tup[1]    # tup: ('사과', 'Noun')
        word_tag = word + '/' + tag  # word_tag: '사과/Noun'
        pos_sent.append(word_tag)
        
    return ' '.join(pos_sent)

In [22]:
train = [('사과가 좋아','pos'),
        ('밤에 먹는 사과는 비추야','neg'),
        ('사과가 잘 익었어 맛있겠다','pos')]
train

[('사과가 좋아', 'pos'), ('밤에 먹는 사과는 비추야', 'neg'), ('사과가 잘 익었어 맛있겠다', 'pos')]

In [24]:
# train에 있는 문장들의 단어 set 만들기
# set()을 이용하여 집합으로 만들기
all_words = set()

for tup in train:
    sent, label = tup[0], tup[1]  #ex) sent = 'l like you', label = 'pos'
    sent = pos_tokenize(sent)
    words = word_tokenize(sent)
    for word in words:
        all_words.add(word)

print(all_words)

{'비추다/Verb', '익다/Verb', '밤/Noun', '좋다/Adjective', '는/Josa', '사과/Noun', '자다/Verb', '가/Josa', '먹다/Verb', '에/Josa', '맛있다/Adjective'}


In [25]:
train_features = []

for tup in train:
    sent, label = tup[0], tup[1]
    sent = pos_tokenize(sent)
    words = word_tokenize(sent)
    tmp = {set_word: (set_word in words) for set_word in all_words} # set_word가 words에 있는지 여부 판별
    sent_tup = (tmp, label)
    train_features.append(sent_tup)
    
for i in range(len(train_features)):
    print(train_features[i])

({'비추다/Verb': False, '익다/Verb': False, '밤/Noun': False, '좋다/Adjective': True, '는/Josa': False, '사과/Noun': True, '자다/Verb': False, '가/Josa': True, '먹다/Verb': False, '에/Josa': False, '맛있다/Adjective': False}, 'pos')
({'비추다/Verb': True, '익다/Verb': False, '밤/Noun': True, '좋다/Adjective': False, '는/Josa': True, '사과/Noun': True, '자다/Verb': False, '가/Josa': False, '먹다/Verb': True, '에/Josa': True, '맛있다/Adjective': False}, 'neg')
({'비추다/Verb': False, '익다/Verb': True, '밤/Noun': False, '좋다/Adjective': False, '는/Josa': False, '사과/Noun': True, '자다/Verb': True, '가/Josa': True, '먹다/Verb': False, '에/Josa': False, '맛있다/Adjective': True}, 'pos')


In [26]:
classifier = nltk.NaiveBayesClassifier.train(train_features)
classifier.show_most_informative_features()

Most Informative Features
           맛있다/Adjective = False             neg : pos    =      1.5 : 1.0
                 익다/Verb = False             neg : pos    =      1.5 : 1.0
                 자다/Verb = False             neg : pos    =      1.5 : 1.0
            좋다/Adjective = False             neg : pos    =      1.5 : 1.0


In [27]:
test_sent = '사과는 맛있어' 

words = word_tokenize(test_sent)
test_feature = {set_word: (set_word in words) for set_word in all_words}

print(test_feature)

{'비추다/Verb': False, '익다/Verb': False, '밤/Noun': False, '좋다/Adjective': False, '는/Josa': False, '사과/Noun': False, '자다/Verb': False, '가/Josa': False, '먹다/Verb': False, '에/Josa': False, '맛있다/Adjective': False}


In [28]:
classifier.classify(test_feature) # 맛있다를 통해 맛있어를 긍정으로 판별

'pos'