In [None]:
# Apply mac475's standard IPython notebook style.
from IPython.core.display import HTML

# Read the stylesheet inside a context manager so the file handle is closed
# deterministically instead of being left open until garbage collection.
with open("../resources/styles/custom.css", "r") as css_file:
    styles = css_file.read()

# Last expression of the cell: the notebook renders the returned HTML object.
HTML(styles)

In [None]:
# Enable "pretty display of variables" so intermediate results can be checked:
# setting ast_node_interactivity to "all" makes IPython display the value of
# every expression statement in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## 1. 기본적인 작동의 확인

In [None]:
# Toy corpus: six already-tokenized sentences of five tokens each, used only
# to check the basic behaviour of the steps below.
documents = [
    ["왕자", "가", "공주", "를", "좋아한다"],
    ["공주", "가", "왕자", "를", "좋아한다"],
    ["시녀", "가", "왕자", "를", "싫어한다"],
    ["공주", "가", "시녀", "를", "싫어한다"],
    ["시녀", "가", "왕자", "를", "독살한다"],
    ["시녀", "가", "공주", "를", "독살한다"],
]

In [None]:
# Collect the unique tokens of all documents into one set.
# A set comprehension replaces set(sum(documents, [])): summing lists copies
# the accumulated list on every step (quadratic time), while this makes a
# single pass and yields the same set.
words = {word for document in documents for word in document}

In [None]:
# Show the unique words; a set has no defined order, so the printed order may vary.
print(words)

## 2. 영화리뷰 data import

In [None]:
# data reading function
def read_data(filename):
    """Read a UTF-8, tab-separated text file and return its data rows.

    The first line is treated as a header and skipped. Every remaining line
    is split on tab characters and its first field is dropped, so each row is
    the list of fields from the second column onward. A blank line yields an
    empty list, and an empty file yields an empty result.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of rows, each a list of strings.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        # Skip the header; the default keeps an empty file from raising.
        next(f, None)
        # Iterate the file object instead of calling str.splitlines():
        # splitlines() also breaks on \v, \f, \x1c-\x1e, \x85, \u2028 and
        # \u2029, which would cut a record in two whenever its text contains
        # one of those characters. File iteration splits on real newlines only.
        return [line.rstrip('\n').split('\t')[1:] for line in f]

In [None]:
# Load the training rows (header skipped, first column dropped by read_data).
train_data = read_data('./datasets/ratings_train.txt')

In [None]:
# Peek at the first five training rows.
train_data[:5]

In [None]:
# Load the test rows (header skipped, first column dropped by read_data).
test_data = read_data('./datasets/ratings_test.txt')

In [None]:
# Peek at the first five test rows.
test_data[:5]

## 3. tokenizing

In [None]:
# Define the POS (part-of-speech) tagger.
# KoNLPy renamed the Twitter tagger to Okt and deprecated the old name, so
# prefer Okt and fall back to Twitter only on releases that predate the rename.
try:
    from konlpy.tag import Okt
except ImportError:
    from konlpy.tag import Twitter as Okt
pos_tagger = Okt()

In [None]:
def tokenize(doc):
    """Tag *doc* with the module-level ``pos_tagger`` and flatten each result.

    Every item returned by ``pos_tagger.pos`` is joined with "/" into a single
    string (presumably a (word, tag) pair becoming "word/tag" - confirm
    against the tagger's documentation).

    Returns:
        A list of the joined strings, in the order the tagger produced them.
    """
    # norm and stem are optional arguments of the tagger call.
    tagged = pos_tagger.pos(doc, norm=True, stem=True)
    return list(map('/'.join, tagged))

In [None]:
from tqdm import tqdm
# tqdm reference: https://github.com/tqdm/tqdm#iterable-based

# The original source is the line below; tqdm is wrapped around train_data here to show progress.
# train_docs = [(tokenize(row[0]), row[1]) for row in train_data]

train_docs = [(tokenize(row[0]), row[1]) for row in tqdm(train_data)]

In [None]:
# Original form without a progress bar:
# test_docs = [(tokenize(row[0]), row[1]) for row in test_data]

# Pair each test row's tokenized first field with its second field; tqdm shows progress.
test_docs = [(tokenize(row[0]), row[1]) for row in tqdm(test_data)]

In [None]:
# Inspect the first (tokens, second-field) pair of each split.
train_docs[0]
test_docs[0]

In [None]:
# Collect the POS-tagged words of every training document into one flat list, tokens.
tokens = [t for d in train_docs for t in d[0]]
len(tokens)

In [None]:
# Look at a single token (index 3) to check its format.
tokens[3]

## 4. data exploration

In [None]:
import nltk
# Wrap the flat token list in an nltk.Text named 'NMSC' for the exploration below.
text = nltk.Text(tokens, name='NMSC')
print(text)
# => <Text: NMSC>

In [None]:
# Total number of tokens in the text.
len(text.tokens)

In [None]:
# Number of distinct tokens in the text.
len(set(text.tokens))

In [None]:
# The ten most frequent tokens of the text's vocabulary.
text.vocab().most_common(10)

In [None]:
# Make matplotlib use the font stored at font_fname as its default family.
# NOTE(review): the path is Windows-specific; point it at a locally installed
# font (presumably one with Hangul glyphs) on other systems.
from matplotlib import font_manager, rc
font_fname = 'c:/windows/fonts/malgun.ttf'     # A font of your choice
font_name = font_manager.FontProperties(fname=font_fname).get_name()
rc('font', family=font_name)

In [None]:
import matplotlib.pyplot as plt
# Create the figure first so the plot below is drawn at this size; binding the
# result to _ keeps the Figure object from being echoed as cell output.
_=plt.figure(figsize=(15, 6))  # the size you want

# Plot with an argument of 50 (presumably the 50 most frequent tokens - confirm in the NLTK docs).
text.plot(50)

In [None]:
# Called without arguments (presumably opens NLTK's interactive downloader).
# NOTE(review): this cell runs before the cell that extends nltk.data.path.
nltk.download()

In [None]:
# See https://stackoverflow.com/questions/3522372/how-to-config-nltk-data-directory-from-code

import nltk
# Add a machine-specific directory to NLTK's data search path; only one of
# the two locations below is meant to be active at a time.
# home
# nltk.data.path.append("D:/50.work/10.conda.repository/etc/nltk")

# not home
nltk.data.path.append("E:/88.analytics/50.data.resources/nltk")

In [None]:
# Print the text's collocations (presumably needs NLTK data found via nltk.data.path - confirm).
text.collocations()

In [None]:
# Here the 2000 most frequent words are used as features.
# WARNING: this code is written for ease of understanding and is not time/memory efficient.
selected_words = [f[0] for f in text.vocab().most_common(2000)]

def term_exists(doc):
    """Build the boolean presence-feature dict for one tokenized document.

    Args:
        doc: Iterable of hashable tokens that make up the document.

    Returns:
        A dict with one entry per word in the module-level ``selected_words``:
        the key is ``'exists(<word>)'`` and the value is True when the word
        occurs in *doc*, otherwise False.
    """
    # Build the lookup set once per document; constructing it inside the
    # comprehension rebuilt it for every selected word.
    doc_terms = set(doc)
    return {'exists({})'.format(word): (word in doc_terms) for word in selected_words}

# Shortcut to save time: only part of the training corpus is used.
# NOTE: this rebinds train_docs, while tokens and selected_words above were
# built from the full list when the cells are run in order.
train_docs = train_docs[:10000]

# Turn each (tokens, label) pair into a (feature dict, label) pair.
train_xy = [(term_exists(d), c) for d, c in train_docs]

In [None]:
# The single most frequent token.
selected_words[0]

In [None]:
# Inspect the first three (feature dict, label) training pairs.
train_xy[:3]

In [None]:
# train_data = 'x'
# test_data = 'x'
# tokens = 'x'
# train_docs = 'x'
# test_docs = 'x'
# train_xy = 'x'
# test_xy = 'x'
# classifier = 'x'

# import gc
# gc.collect()

In [None]:
# Build (feature dict, label) pairs for every test document (test_docs is not truncated).
test_xy = [(term_exists(d), c) for d, c in test_docs]

In [None]:
# Apply a naive Bayes classifier: train it on the (feature dict, label) pairs.

classifier = nltk.NaiveBayesClassifier.train(train_xy)

In [None]:
# Accuracy of the trained classifier on the test pairs.
nltk.classify.accuracy(classifier, test_xy)

In [None]:
# Print the ten most informative features of the trained classifier.
classifier.show_most_informative_features(10)

In [None]:
# See https://stackoverflow.com/questions/10017086/save-naive-bayes-trained-classifier-in-nltk

import pickle
# Persist the trained classifier. The with-block closes the file even when
# pickle.dump raises, which the explicit open()/close() pair did not.
with open('./model/17.10.15.01.naive.bayes.classifier.pickle', 'wb') as f:
    pickle.dump(classifier, f)

In [None]:
# import pickle
# f = open('my_classifier.pickle', 'rb')
# classifier = pickle.load(f)
# f.close()