## Pycon KR 2015 "한국어와 NLTK, Gensim의 만남" Tutorial  
#### Code Replication by Tae Young, Kang (Management Engineering, KAIST)

#### 1. WordNet

In [1]:
from nltk.corpus import wordnet as wn #한국어는 nltk wordnet 지원 안 됨
# Look up the WordNet synsets of three sample terms.
# As the output shows, a recent term such as 'LSTM' has no entry (empty list).
for query in ('computer', 'USA', 'LSTM'):
    print(wn.synsets(query))

[Synset('computer.n.01'), Synset('calculator.n.01')]
[Synset('united_states.n.01'), Synset('united_states_army.n.01')]
[]


#### 2. Co-occurrence Matrix

여기서 잠깐. 아래에 dot_product를 만들기 전에 map, lambda, zip에 대해서 연습해보자
lambda는 def의 간단한 버전이다.
map(함수,오브젝트)은 오브젝트에 들어있는 성분 하나하나에 함수를 적용한다.
zip은 장황하게 설명하는 것보다 눈으로 직접 보는 게 더 빠를 것이다.

In [2]:
# zip: groups the i-th elements of every iterable it receives into one tuple
print([*zip([1, 2, 3], [4, 5, 6], [7, 8, 9])])
print([*zip(('Socio','Adj'),('Boom','Noun'),('has','Verb'),('arrived','Verb'))])

[(1, 4, 7), (2, 5, 8), (3, 6, 9)]
[('Socio', 'Boom', 'has', 'arrived'), ('Adj', 'Noun', 'Verb', 'Verb')]


In [3]:
# lambda: an anonymous function can be called right where it is written ...
print((lambda x, y: pow(x, y))(2, 5))
# ... or bound to a name first and then called like any other function.
sample_function = lambda x, y: pow(x, y)
print(sample_function(2, 5))

32
32


In [4]:
# map & lambda: apply an anonymous squaring function to every element
input_list = list(range(1, 6))
output_list = [*map(lambda i: i ** 2, input_list)]
output_list

[1, 4, 9, 16, 25]

In [5]:
import math
def dot_product(v1, v2):
    """Return the dot product of two numeric sequences.

    Elements are paired with ``zip``, so when the lengths differ the surplus
    elements of the longer sequence are ignored. Two empty sequences give 0.
    """
    return sum(x * y for x, y in zip(v1, v2))


def cosine_measure(v1, v2):
    """Return the cosine similarity of two numeric sequences.

    The result is ``dot(v1, v2) / (|v1| * |v2|)``. When either vector has
    zero norm (all zeros, or empty) the cosine is undefined; 0.0 is returned
    in that case instead of raising ``ZeroDivisionError``.
    """
    prod = dot_product(v1, v2)
    len1 = math.sqrt(dot_product(v1, v1))
    len2 = math.sqrt(dot_product(v2, v2))
    norm_product = len1 * len2
    if norm_product == 0:
        # Guard against division by zero for zero-length vectors.
        return 0.0
    return prod / norm_product

#### 3. Korean Movie Review Toy Data

Anaconda Prompt를 하나 켜고 cd C:/XXXXX 로 현재 워킹 디렉토리로 이동한 후  
git clone https://github.com/e9t/nsmc.git 를 입력한다.  
cloning이 다 끝나고 나면 워킹 디렉토리 안에 'nsmc'라는 이름의 폴더가 생길 것이다.

In [6]:
def read_data(filename):
    """Read a UTF-8, tab-separated text file and return its rows without the header.

    Args:
        filename: path of the file to read.

    Returns:
        A list with one entry per line after the first (header) line; each
        entry is the list of tab-separated fields of that line. An empty file
        yields an empty list.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        next(f, None)  # skip the header line (no error if the file is empty)
        # Iterate over the file instead of using str.splitlines(): splitlines()
        # also breaks on characters such as \x0b, \x0c, \x85 and \u2028, which
        # would cut a row in two if one of them appears inside a field.
        return [line.rstrip('\n').split('\t') for line in f]

In [7]:
# Keep the toy experiment small: only the first 1500 training rows and the
# first 500 test rows are used.
train_data, test_data = (
    read_data('./nsmc/ratings_train.txt')[:1500],
    read_data('./nsmc/ratings_test.txt')[:500],
)

In [8]:
# Sanity-check that the data was read correctly.
# Expected: 1500 rows x 3 columns (train), then 500 rows x 3 columns (test).
for dataset in (train_data, test_data):
    print(len(dataset))     # number of rows
    print(len(dataset[0]))  # number of columns in the first row

1500
3
500
3


In [9]:
from konlpy.tag import Okt as Twitter
from pprint import pprint
# Shared POS tagger instance used by tokenize(); `Twitter` is the alias under
# which konlpy's Okt class is imported in this cell.
pos_tagger = Twitter()

def tokenize(doc):
    """Tag `doc` with the module-level `pos_tagger` and return 'token/Tag' strings.

    The tagger is called with ``norm=True, stem=True``; each item it returns
    is joined with '/' (e.g. '영화/Noun').
    """
    tagged = pos_tagger.pos(doc, norm=True, stem=True)
    return ['/'.join(pair) for pair in tagged]

In [10]:
# Turn every row into a (tokens, label) pair: column 1 is tokenized and
# column 2 is kept as-is (presumably the review text and its sentiment label).
train_docs = list(map(lambda row: (tokenize(row[1]), row[2]), train_data))
test_docs = list(map(lambda row: (tokenize(row[1]), row[2]), test_data))

In [11]:
# Inspect one tokenized training example: a (token list, label) pair.
pprint(train_docs[111])

(['왕/Noun',
  '짜증/Noun',
  '...../Punctuation',
  '아주/Noun',
  '전개/Noun',
  '를/Josa',
  '짬뽕/Noun',
  '으로/Josa',
  '믹스/Noun',
  '하다/Verb',
  '.../Punctuation',
  '음향/Noun',
  '만/Josa',
  '무섭다/Adjective',
  '하다/Verb',
  '../Punctuation',
  '하아/Exclamation'],
 '0')


In [12]:
# Collect the tokens of all training documents into one flat list
tokens = [t for d in train_docs for t in d[0]]
print(len(tokens))  # total number of tokens

21885


여기서 잠깐...!  
t for d in train_docs for t in d[0]  
이 문법이 헷갈리면 아래처럼 한 단계씩 쪼개서 확인해보자

In [13]:
# Step 1: a single element of train_docs is a (token list, label) tuple.
b1 = train_docs[0]
b1

(['아/Exclamation',
  '더빙/Noun',
  '../Punctuation',
  '진짜/Noun',
  '짜증나다/Adjective',
  '목소리/Noun'],
 '0')

In [14]:
# Step 2: index 0 of that tuple is the token list.
b2 = b1[0]
b2

['아/Exclamation',
 '더빙/Noun',
 '../Punctuation',
 '진짜/Noun',
 '짜증나다/Adjective',
 '목소리/Noun']

In [15]:
# Step 3: the first five tokens of that list.
b3 = b2[0:5]
b3

['아/Exclamation', '더빙/Noun', '../Punctuation', '진짜/Noun', '짜증나다/Adjective']

In [16]:
# The flattened `tokens` list begins with the same five tokens.
tokens[0:5]

['아/Exclamation', '더빙/Noun', '../Punctuation', '진짜/Noun', '짜증나다/Adjective']

#### 4. Data Exploration

In [17]:
import nltk
# Wrap the flat token list in an nltk.Text named 'NSMC'; the following cells
# use it for vocabulary counts, plotting and collocations.
text = nltk.Text(tokens, name='NSMC')
print(text)

<Text: NSMC>


In [18]:
# Quick aside: set() keeps only the distinct values of a list.
somewhatlist = [1,2,2,3,3,3,3,4,5,6,7,7,7,8,8,9,10,10]
set(somewhatlist)

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}

In [19]:
print(len(text.tokens))  # total number of tokens
print(len(set(text.tokens)))  # number of distinct tokens (vocabulary size)
pprint(text.vocab().most_common(10))  # the 10 most frequent tokens with their counts

21885
4661
[('./Punctuation', 735),
 ('영화/Noun', 502),
 ('하다/Verb', 427),
 ('이/Josa', 402),
 ('보다/Verb', 366),
 ('의/Josa', 309),
 ('../Punctuation', 266),
 ('가/Josa', 257),
 ('.../Punctuation', 241),
 ('에/Josa', 234)]


In [20]:
from matplotlib import font_manager, rc
# NOTE: hard-coded Windows font path; on other systems point this at any
# installed font file (presumably one with Hangul glyphs, so the Korean
# token labels on the plot can be drawn).
font_fname = 'c:/windows/fonts/malgun.ttf' # A font of your choice
font_name = font_manager.FontProperties(fname=font_fname).get_name()
rc('font', family=font_name)

# Frequency plot of the 20 most common tokens. plot() draws the figure itself,
# so it is called directly: wrapping it in print() only added a stray "None"
# line to the cell output. The trailing ';' hides any return-value repr.
text.plot(20);

<matplotlib.figure.Figure at 0x238dc6e8940>

None


In [21]:
text.collocations() # collocations: words that frequently appear together

적/Suffix 인/Josa; 이/Determiner 거/Noun; 이/Determiner 것/Noun; 배우/Noun
들/Suffix; 최고/Noun 의/Josa; 사람/Noun 들/Suffix; 것/Noun 은/Josa; 이/Noun
영화/Noun; 네/Suffix 요/Josa; 이/Noun 게/Josa; 10/Number 점/Noun;
못/VerbPrefix 하다/Verb; 수/Noun 있다/Adjective; 볼/Noun 만/Josa; 오랜/Modifier
만/Noun; 한국영/Noun 화/Suffix; 이다/Josa ./Punctuation; 도/Josa 없다/Adjective;
영화/Noun 를/Josa; 안/VerbPrefix 나오다/Verb


#### 5. Sentiment classification with term existence

In [105]:
# Use the 2000 most frequent tokens of the training text as the feature vocabulary.
selected_words = [word for word, _ in text.vocab().most_common(2000)]

def term_exists(doc):
    """Build the term-existence feature dict of one document.

    Args:
        doc: iterable of tokens (e.g. 'word/Tag' strings).

    Returns:
        A dict with one key 'exists(<word>)' per entry of the module-level
        `selected_words`; the value is True when that word occurs in `doc`.
    """
    # Build the lookup set once. The original rebuilt set(doc) for every
    # selected word, and gave wrong answers for one-shot iterables that are
    # exhausted after the first pass.
    doc_terms = set(doc)
    return {'exists({})'.format(word): (word in doc_terms) for word in selected_words}

# Pair each document's term-existence features with its label.
train_xy = [(term_exists(doc_tokens), label) for doc_tokens, label in train_docs]
test_xy = [(term_exists(doc_tokens), label) for doc_tokens, label in test_docs]

In [106]:
# Split the (tokens, label) pairs of the training set into two parallel lists.
d_lists = [doc_tokens for doc_tokens, _ in train_docs]
c_lists = [label for _, label in train_docs]

pprint(d_lists[:4])  # word tokens of the first four documents
print(c_lists[:4])  # their binary sentiment values

[['아/Exclamation',
  '더빙/Noun',
  '../Punctuation',
  '진짜/Noun',
  '짜증나다/Adjective',
  '목소리/Noun'],
 ['흠/Noun',
  '.../Punctuation',
  '포스터/Noun',
  '보고/Noun',
  '초딩/Noun',
  '영화/Noun',
  '줄/Noun',
  '..../Punctuation',
  '오버/Noun',
  '연기/Noun',
  '조차/Josa',
  '가볍다/Adjective',
  '않다/Verb'],
 ['너/Modifier',
  '무재/Noun',
  '밓었/Noun',
  '다그/Noun',
  '래서/Noun',
  '보다/Verb',
  '추천/Noun',
  '한/Josa',
  '다/Adverb'],
 ['교도소/Noun',
  '이야기/Noun',
  '구먼/Noun',
  '../Punctuation',
  '솔직하다/Adjective',
  '재미/Noun',
  '는/Josa',
  '없다/Adjective',
  '../Punctuation',
  '평점/Noun',
  '조정/Noun']]
['0', '1', '0', '0']


In [107]:
# Illustrate what term_exists checks, using the second most frequent word.
print(selected_words[1],'\n')
# Token sets of two training documents.
print(set(d_lists[1]))
print(set(d_lists[2]),'\n')

# Membership test of that word in each of the two token sets.
print(selected_words[1] in set(d_lists[1]))
print(selected_words[1] in set(d_lists[2]))

영화/Noun 

{'영화/Noun', '오버/Noun', '가볍다/Adjective', '줄/Noun', '연기/Noun', '않다/Verb', '..../Punctuation', '흠/Noun', '조차/Josa', '보고/Noun', '.../Punctuation', '초딩/Noun', '포스터/Noun'}
{'래서/Noun', '밓었/Noun', '추천/Noun', '다/Adverb', '한/Josa', '무재/Noun', '보다/Verb', '너/Modifier', '다그/Noun'} 

True
False


In [108]:
# Feature dict and label of the first training document.
train_xy[0]

({'exists(./Punctuation)': False,
  'exists(영화/Noun)': False,
  'exists(하다/Verb)': False,
  'exists(이/Josa)': False,
  'exists(보다/Verb)': False,
  'exists(의/Josa)': False,
  'exists(../Punctuation)': True,
  'exists(가/Josa)': False,
  'exists(.../Punctuation)': False,
  'exists(에/Josa)': False,
  'exists(을/Josa)': False,
  'exists(도/Josa)': False,
  'exists(들/Suffix)': False,
  'exists(은/Josa)': False,
  'exists(,/Punctuation)': False,
  'exists(는/Josa)': False,
  'exists(를/Josa)': False,
  'exists(없다/Adjective)': False,
  'exists(있다/Adjective)': False,
  'exists(?/Punctuation)': False,
  'exists(좋다/Adjective)': False,
  'exists(재밌다/Adjective)': False,
  'exists(정말/Noun)': False,
  'exists(이/Determiner)': False,
  'exists(진짜/Noun)': True,
  'exists(너무/Adverb)': False,
  'exists(적/Suffix)': False,
  'exists(점/Noun)': False,
  'exists(이/Noun)': False,
  'exists(아니다/Adjective)': False,
  'exists(같다/Adjective)': False,
  'exists(것/Noun)': False,
  'exists(되다/Verb)': False,
  'exists(나오다/Ve

In [110]:
# Naive Bayes classifier: train on the term-existence features of the training set
classifier = nltk.NaiveBayesClassifier.train(train_xy)
# Accuracy is measured on the held-out test features.
print('Accuracy of the model =', nltk.classify.accuracy(classifier, test_xy))
# List the 10 most informative features of the trained classifier.
classifier.show_most_informative_features(10)

Accuracy of the model = 0.774
Most Informative Features
exists(ㅡㅡ/KoreanParticle) = True                0 : 1      =     11.6 : 1.0
   exists(아깝다/Adjective) = True                0 : 1      =     11.5 : 1.0
         exists(최고/Noun) = True                1 : 0      =     11.1 : 1.0
  exists(재미없다/Adjective) = True                0 : 1      =     10.6 : 1.0
        exists(쓰레기/Noun) = True                0 : 1      =     10.4 : 1.0
   exists(재밌다/Adjective) = True                1 : 0      =     10.1 : 1.0
   exists(괜찮다/Adjective) = True                1 : 0      =      9.5 : 1.0
   exists(멋지다/Adjective) = True                1 : 0      =      9.5 : 1.0
         exists(최악/Noun) = True                0 : 1      =      7.7 : 1.0
  exists(아름답다/Adjective) = True                1 : 0      =      7.4 : 1.0
