In [None]:
## import urllib
## urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
## urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

In [41]:
from nltk.tokenize import word_tokenize
import nltk
def load_data(file_path, limit=500):
    """Read (document, label) pairs from a tab-separated ratings file.

    Every line is expected to hold exactly three tab-separated fields:
    id, document text and label. The label '1' is mapped to 'pos' and '0'
    to 'neg'; any other label text is kept unchanged. The file's first line
    is not treated specially, so a header line is returned (and counted
    towards ``limit``) like any other row.

    Args:
        file_path: Path of the UTF-8 encoded file to read.
        limit: Maximum number of lines to read from the top of the file.
            ``None`` reads the whole file.

    Returns:
        A list of ``(document, label)`` tuples in file order.

    Raises:
        ValueError: If a line does not split into exactly three fields.
    """
    label_names = {'1': 'pos', '0': 'neg'}
    rows = []

    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily so only the lines actually needed are read.
        for line_no, line in enumerate(f):
            if limit is not None and line_no >= limit:
                break
            _doc_id, doc, label = line.strip().split('\t')
            rows.append((doc, label_names.get(label, label)))

    return rows

In [42]:
# Load the first rows of the training file. load_data does not skip the
# header line, so the first tuple printed here is the header row.
train = load_data('ratings_train.txt')
print(train[:5])

[('document', 'label'), ('아 더빙.. 진짜 짜증나네요 목소리', 'neg'), ('흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', 'pos'), ('너무재밓었다그래서보는것을추천한다', 'neg'), ('교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정', 'neg')]


In [43]:
# Drop the header row that load_data returns as its first entry.
# Filtering by value instead of slicing with train[1:] keeps this cell
# idempotent: re-running it no longer discards a real sample each time.
train = [row for row in train if row != ('document', 'label')]
print(train[:5])

[('아 더빙.. 진짜 짜증나네요 목소리', 'neg'), ('흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', 'pos'), ('너무재밓었다그래서보는것을추천한다', 'neg'), ('교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정', 'neg'), ('사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다', 'pos')]


## 작성한 내용 함수로 만들기

In [44]:
from konlpy.tag import Okt
# Module-level tagger instance; pos_tokenize calls okt.pos on it.
okt = Okt()

def pos_tokenize(raw_sent):
    """Tag a raw sentence and return it as one 'word/Tag' string.

    The sentence is passed to ``okt.pos`` with ``norm=True`` and
    ``stem=True``. Each resulting (word, tag) pair is rendered as
    ``word/tag`` and the pieces are joined with single spaces.

    Example (from the original notes): '사과가 좋아' is tagged as
    [('사과', 'Noun'), ('가', 'Josa'), ('좋다', 'Adjective')], which
    becomes '사과/Noun 가/Josa 좋다/Adjective'.

    Args:
        raw_sent: The sentence text to tag.

    Returns:
        A space-separated string of ``word/tag`` tokens.
    """
    tagged_pairs = okt.pos(raw_sent, norm=True, stem=True)
    return ' '.join(pair[0] + '/' + pair[1] for pair in tagged_pairs)

def make_word_dict(train, use_morph=False):
    """Collect the set of distinct tokens found in the training sentences.

    Args:
        train: Iterable of entries whose first item is the sentence text and
            whose second item is its label, e.g. ('I like you', 'pos').
        use_morph: When truthy, each sentence is first converted with
            ``pos_tokenize`` before being split by ``word_tokenize``.

    Returns:
        A set containing every token produced by ``word_tokenize``.
    """
    vocabulary = set()

    for example in train:
        text, _label = example[0], example[1]
        tokens = word_tokenize(pos_tokenize(text) if use_morph else text)
        vocabulary.update(tokens)

    return vocabulary
def make_train_features(train, all_words, use_morph=False):
    """Turn labelled sentences into (feature dict, label) pairs.

    For every sentence a dict is built with one key per word in
    ``all_words``; the value is True when that word occurs among the
    sentence's tokens and False otherwise.

    Args:
        train: Iterable of entries whose first item is the sentence text and
            whose second item is its label.
        all_words: Iterable of vocabulary words used as feature keys.
        use_morph: When truthy, each sentence is first converted with
            ``pos_tokenize`` before being split by ``word_tokenize``.

    Returns:
        A list of ``(features, label)`` tuples in the same order as ``train``.
    """
    train_features = []

    for tup in train:
        sent, label = tup[0], tup[1]
        if use_morph:
            sent = pos_tokenize(sent)
        # Put the tokens in a set once so each vocabulary lookup is O(1)
        # instead of scanning the token list for every vocabulary word.
        present = set(word_tokenize(sent))
        features = {vocab_word: (vocab_word in present) for vocab_word in all_words}
        train_features.append((features, label))

    return train_features

## 테스트하기 1

In [45]:
# Experiment 1: build the vocabulary and the per-sentence feature dicts from
# POS-tagged tokens (use_morph=True), then print the vocabulary size.
use_morph = True
all_words = make_word_dict(train, use_morph)
print('단어 집합 개수: ', len(all_words))

train_features = make_train_features(train, all_words, use_morph)

단어 집합 개수:  2322


In [46]:
# Fit a Naive Bayes classifier on the feature dicts and show which features
# it considers most informative.
classifier = nltk.NaiveBayesClassifier.train(train_features)
classifier.show_most_informative_features()
## classifier.show_most_informative_features(n=5)

Most Informative Features
                       ; = True              neg : pos    =      8.2 : 1.0
          재미없다/Adjective = True              neg : pos    =      8.2 : 1.0
                주인공/Noun = True              neg : pos    =      7.5 : 1.0
                 최고/Noun = True              pos : neg    =      6.9 : 1.0
                  뭐/Noun = True              neg : pos    =      6.8 : 1.0
           재밌다/Adjective = True              pos : neg    =      6.5 : 1.0
                 내용/Noun = True              neg : pos    =      6.2 : 1.0
       ㅡㅡ/KoreanParticle = True              neg : pos    =      6.1 : 1.0
                스토리/Noun = True              neg : pos    =      6.1 : 1.0
                 다시/Noun = True              pos : neg    =      5.9 : 1.0


In [8]:
# Build the feature dict for one sample sentence using the same steps as
# make_train_features (optional POS tagging, tokenizing, vocabulary lookup).
test_sent = '보다가 중간에 나왔습니다.'
if use_morph: test_sent = pos_tokenize(test_sent)
words = word_tokenize(test_sent)
test_feature = {set_word: (set_word in words) for set_word in all_words}

In [9]:
# Predict the label of the sample sentence built in the previous cell.
classifier.classify(test_feature)

'pos'

## 테스트하기 2

In [10]:
# Experiment 2: rebuild the vocabulary and feature dicts without POS tagging
# (use_morph=False), then print the vocabulary size.
use_morph = False
all_words = make_word_dict(train, use_morph)
print('단어 집합 개수: ', len(all_words))

train_features = make_train_features(train, all_words, use_morph)

단어 집합 개수:  3046


In [11]:
# Refit the Naive Bayes classifier on the untagged features; this rebinds
# `classifier`, replacing the model from the first experiment.
classifier = nltk.NaiveBayesClassifier.train(train_features)
classifier.show_most_informative_features()

Most Informative Features
                       ; = True              neg : pos    =      8.2 : 1.0
                       그 = True              pos : neg    =      4.7 : 1.0
                       수 = True              pos : neg    =      4.7 : 1.0
                      ㅡㅡ = True              neg : pos    =      4.6 : 1.0
                       ! = True              pos : neg    =      4.0 : 1.0
                    스토리도 = True              neg : pos    =      3.9 : 1.0
                      이거 = True              neg : pos    =      3.9 : 1.0
                     하나도 = True              neg : pos    =      3.9 : 1.0
                      없다 = True              neg : pos    =      3.6 : 1.0
                       ? = True              neg : pos    =      3.6 : 1.0


In [12]:
# Build the feature dict for the same sample sentence under the current
# use_morph setting and classify it with the current classifier.
test_sent = '보다가 중간에 나왔습니다.'
if use_morph: test_sent = pos_tokenize(test_sent)
words = word_tokenize(test_sent)
test_feature = {set_word: (set_word in words) for set_word in all_words}

classifier.classify(test_feature)

'neg'

# 실습

In [35]:
def load_data(file_path, limit=500):
    """Read (document, label) pairs from a tab-separated ratings file.

    Every line is expected to hold exactly three tab-separated fields:
    id, document text and label. The label '1' is mapped to 'pos' and '0'
    to 'neg'; any other label text is kept unchanged. A header line is not
    skipped: it is returned and counted towards ``limit`` like any other row.

    Args:
        file_path: Path of the UTF-8 encoded file to read.
        limit: Maximum number of lines to read from the top of the file.
            ``None`` reads the whole file.

    Returns:
        A list of ``(document, label)`` tuples in file order.

    Raises:
        ValueError: If a line does not split into exactly three fields.
    """
    label_names = {'1': 'pos', '0': 'neg'}
    rows = []

    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily so only the lines actually needed are read.
        for line_no, line in enumerate(f):
            if limit is not None and line_no >= limit:
                break
            _doc_id, doc, label = line.strip().split('\t')
            rows.append((doc, label_names.get(label, label)))

    return rows

In [47]:
# Score the classifier on the first rows of the test file (header row dropped).
# NOTE(review): all_words, use_morph and classifier come from whichever earlier
# cell ran last. The execution counts suggest the use_morph=True cells were
# re-run just before this one — confirm, because a top-to-bottom run would
# reach this cell with the use_morph=False vocabulary and classifier instead.
test = load_data('ratings_test.txt')
test = test[1:]
test_features = make_train_features(test, all_words, use_morph)
nltk.classify.accuracy(classifier, test_features)

0.7154308617234469

In [50]:
def load_data(file_path, limit=1000):
    """Read (document, label) pairs from a tab-separated ratings file.

    Every line is expected to hold exactly three tab-separated fields:
    id, document text and label. The label '1' is mapped to 'pos' and '0'
    to 'neg'; any other label text is kept unchanged. A header line is not
    skipped: it is returned and counted towards ``limit`` like any other row.

    Args:
        file_path: Path of the UTF-8 encoded file to read.
        limit: Maximum number of lines to read from the top of the file
            (1000 by default in this cell). ``None`` reads the whole file.

    Returns:
        A list of ``(document, label)`` tuples in file order.

    Raises:
        ValueError: If a line does not split into exactly three fields.
    """
    label_names = {'1': 'pos', '0': 'neg'}
    rows = []

    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily so only the lines actually needed are read.
        for line_no, line in enumerate(f):
            if limit is not None and line_no >= limit:
                break
            _doc_id, doc, label = line.strip().split('\t')
            rows.append((doc, label_names.get(label, label)))

    return rows

In [52]:
# Reload the test set with the most recently defined load_data and rebuild its
# features before scoring. Without this step the cell would only re-score the
# stale `test_features` left over from the earlier, smaller load.
test = load_data('ratings_test.txt')[1:]  # drop the header row
test_features = make_train_features(test, all_words, use_morph)
nltk.classify.accuracy(classifier, test_features)

0.7097097097097097

In [37]:
def load_data(file_path, limit=2000):
    """Read (document, label) pairs from a tab-separated ratings file.

    Every line is expected to hold exactly three tab-separated fields:
    id, document text and label. The label '1' is mapped to 'pos' and '0'
    to 'neg'; any other label text is kept unchanged. A header line is not
    skipped: it is returned and counted towards ``limit`` like any other row.

    Args:
        file_path: Path of the UTF-8 encoded file to read.
        limit: Maximum number of lines to read from the top of the file
            (2000 by default in this cell). ``None`` reads the whole file.

    Returns:
        A list of ``(document, label)`` tuples in file order.

    Raises:
        ValueError: If a line does not split into exactly three fields.
    """
    label_names = {'1': 'pos', '0': 'neg'}
    rows = []

    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily so only the lines actually needed are read.
        for line_no, line in enumerate(f):
            if limit is not None and line_no >= limit:
                break
            _doc_id, doc, label = line.strip().split('\t')
            rows.append((doc, label_names.get(label, label)))

    return rows

In [38]:
# Reload the test set with the most recently defined load_data and rebuild its
# features before scoring. Without this step the cell would only re-score the
# stale `test_features` left over from the earlier, smaller load.
test = load_data('ratings_test.txt')[1:]  # drop the header row
test_features = make_train_features(test, all_words, use_morph)
nltk.classify.accuracy(classifier, test_features)

0.7288644322161081

In [39]:
def load_data(file_path, limit=3000):
    """Read (document, label) pairs from a tab-separated ratings file.

    Every line is expected to hold exactly three tab-separated fields:
    id, document text and label. The label '1' is mapped to 'pos' and '0'
    to 'neg'; any other label text is kept unchanged. A header line is not
    skipped: it is returned and counted towards ``limit`` like any other row.

    Args:
        file_path: Path of the UTF-8 encoded file to read.
        limit: Maximum number of lines to read from the top of the file
            (3000 by default in this cell). ``None`` reads the whole file.

    Returns:
        A list of ``(document, label)`` tuples in file order.

    Raises:
        ValueError: If a line does not split into exactly three fields.
    """
    label_names = {'1': 'pos', '0': 'neg'}
    rows = []

    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily so only the lines actually needed are read.
        for line_no, line in enumerate(f):
            if limit is not None and line_no >= limit:
                break
            _doc_id, doc, label = line.strip().split('\t')
            rows.append((doc, label_names.get(label, label)))

    return rows

In [40]:
# Score the current classifier on rows of the training file (header row dropped).
# NOTE(review): this rebinds `train` and `train_features`, and the rows scored
# here presumably include the ones the classifier was fitted on — confirm that
# evaluating on training data is what was intended.
train = load_data('ratings_train.txt')
train = train[1:]
train_features = make_train_features(train, all_words, use_morph)
nltk.classify.accuracy(classifier, train_features)

0.6982327442480827