In [2]:
def remove_stopwords(doc, stopwords):
    """
    Remove stopwords from doc.
    :param doc: Document to remove stopwords
    :param stopwords: stopwords corpus
    :return: documents with removed stopwords
    """
    stopwords_removed = []
    for token in doc:
        if not token in stopwords:
            stopwords_removed.append(token)

In [3]:
import numpy as np

def get_weights(matrix, x1, x2):
    """
    Get normalized weights matrix. Used for np.dot(TDM, weights).
    :param matrix: similarity matrix (euclidean or cosine)
    :param x1: rows
    :param x2: cols
    :return: normalized weights matrix
    """
    distance = matrix[x1, x2] ** 2
    variance = np.var(matrix)
    weights = np.exp(-(distance / (2 * variance ** 2)))
    return weights

In [None]:
tmp = ['귀신/Noun', '깜놀/Noun', '놀라다/Verb', '멀미/Noun', '놀램/Noun',
       '지르다/Verb', '갑툭튀/Noun', '지름/Noun', '놀람/Noun', '나쁘다/Adjective',
       '비명/Noun', '튀어나오다/Verb', '소리내다/Verb', '오지/Noun', '울다/Verb',
       '놀랬/Noun', '무섭다/Adjective', '깜짝/Noun', '놀란/Noun', '놀랬어/Noun', 
       '깜짝깜짝/Adverb', '섬뜩/Adverb', '소름/Noun', '돋다/Adjective']

li = remove_pos(tmp)

def get_similar_words(docs):
    li_new = []
    for word in li:
        try:
            token = model.wv.most_similar(tokenize(twitter, word)[0], topn=8)
            if token not in li_new:
                li_new.append(token)
        except:
            pass

    return li_new

li_new = get_similar_words(li)
for line in li_new:
    for w in line:
        if w[0] not in tmp:
            tmp.append(w[0])

In [1]:
from utils import *

In [15]:
train_data = read_data('../reviews/all_contents_03.txt')

In [2]:
from konlpy.tag import Twitter
twitter = Twitter()

In [6]:
%%time
train_docs = [tokenize(row[0]) for row in train_data]
train_docs_no_pos = [remove_pos(docs) for docs in train_docs]

CPU times: user 4min 33s, sys: 1.34 s, total: 4min 34s
Wall time: 4min 18s


In [7]:
from pprint import pprint

pprint(train_docs[-100])
print('')
pprint(train_docs_no_pos[-100])

['힘내다/Verb',
 '모두/Noun',
 '들/Suffix',
 '!!/Punctuation',
 '진실/Noun',
 '을/Josa',
 '밝히다/Verb',
 '!!/Punctuation']

['힘내다', '모두', '들', '!!', '진실', '을', '밝히다', '!!']


In [8]:
import nltk

tokens = [t for d in train_docs for t in d]
text = nltk.Text(tokens, name="NMSC")
print(text)

<Text: NMSC>


In [9]:
print(len(text.tokens))
print(len(set(text.tokens)))
pprint(text.vocab().most_common(10))

3931769
63779
[('./Punctuation', 115283),
 ('하다/Verb', 93744),
 ('영화/Noun', 90605),
 ('이/Josa', 71434),
 ('보다/Verb', 65114),
 ('의/Josa', 51345),
 ('에/Josa', 49016),
 ('../Punctuation', 47244),
 ('가/Josa', 46659),
 ('을/Josa', 42707)]


In [10]:
%%time
from gensim.models import word2vec
model = word2vec.Word2Vec(train_docs, size=300, workers=4, min_count=10, iter=100)

CPU times: user 16min 58s, sys: 3.74 s, total: 17min 2s
Wall time: 4min 41s


In [11]:
save_pickle('../train_docs_0426.pickle', train_docs)
save_pickle('../train_docs_no_pos_0426.pickle', train_docs_no_pos)
save_pickle('../nltk_text_0426.pickle', text)
model.save('../model/review_word2vec_20180426')

In [3]:
from konlpy.tag import Twitter; twitter = Twitter()
from gensim.models import word2vec
train_docs = load_pickle('../train_docs_0426.pickle')
train_docs_no_pos = load_pickle('../train_docs_no_pos_0426.pickle')
text = load_pickle('../nltk_text_0426.pickle')
model = word2vec.Word2Vec.load('../model/review_word2vec_20180426')

In [6]:
emotion_pair = {'joy': '기쁘다', 'anger': '화나다', 
                'disgust': '역겹다', 'sadness': '슬프다', 'fear': '무섭다'}
emotion_ko_list = list(emotion_pair.values())
emotion_ko_dic = {
    '기쁘다': 0, 
    '화나다': 1, 
    '역겹다': 2,
    '슬프다': 3,
    '무섭다': 4,
}
emotion_ko_list = [tokenize(row) for row in emotion_ko_list]
emotion_ko_list = [row[0] for row in emotion_ko_list]
emotion_ko_list

['기쁘다/Adjective',
 '화나다/Verb',
 '역겹다/Adjective',
 '슬프다/Adjective',
 '무섭다/Adjective']

In [7]:
model.wv.most_similar('무서움/Noun', topn=5)

[('무섭다/Adjective', 0.6415336728096008),
 ('애나벨/Noun', 0.36187833547592163),
 ('귀신/Noun', 0.3564368188381195),
 ('깜놀/Noun', 0.3522293269634247),
 ('놀램/Noun', 0.3495345115661621)]

In [8]:
def read_data_comma(filename):
    with open(filename, 'r') as f:
        data = [line.split(',') for line in f.read().splitlines()]
        data = data[1:]
    return data

# emotion_corpus = read_data_comma('../emotion_corpus_labeled.txt')

In [262]:
emotion_corpus = pd.read_csv('../emotion_corpus_labeled.txt', encoding='utf-8', sep=',')

joy_list = emotion_corpus[emotion_corpus['emotion']==0].values.tolist()
anger_list = emotion_corpus[emotion_corpus['emotion']==1].values.tolist()
disgust_list = emotion_corpus[emotion_corpus['emotion']==2].values.tolist()
sadness_list = emotion_corpus[emotion_corpus['emotion']==3].values.tolist()
fear_list = emotion_corpus[emotion_corpus['emotion']==4].values.tolist()

joy_list = [row[0] for row in joy_list]
anger_list = [row[0] for row in anger_list]
disgust_list = [row[0] for row in disgust_list]
sadness_list = [row[0] for row in sadness_list]
fear_list = [row[0] for row in fear_list]

len(joy_list), len(anger_list), len(disgust_list), len(sadness_list), len(fear_list)

(174, 196, 179, 154, 106)

In [263]:
%%time
train_docs_labeled = []

for row in train_docs:
    joy_score = 0
    anger_score = 0
    disgust_score = 0
    sadness_score = 0
    fear_score = 0
    all_scores = {}
    
    all_scores['기쁘다'] = joy_score
    all_scores['화나다'] = anger_score
    all_scores['역겹다'] = disgust_score
    all_scores['슬프다'] = sadness_score
    all_scores['무섭다'] = fear_score
    
    for _, token in enumerate(row):
            
        if token in joy_list:
            joy_score += 1
        elif token in anger_list:
            anger_score += 1
        elif token in disgust_list:
            disgust_score += 1
        elif token in sadness_list:
            sadness_score += 1
        elif token in fear_list:
            fear_score += 1
        
        all_scores['기쁘다'] += joy_score
        all_scores['화나다'] += anger_score
        all_scores['역겹다'] += disgust_score
        all_scores['슬프다'] += sadness_score
        all_scores['무섭다'] += fear_score   
        
        label = max(all_scores, key=lambda key: all_scores[key])
    
    if all_scores[label] == 0:
        label = '중립'
        train_docs_labeled.append((row, label))
    elif all_scores['기쁘다'] == all_scores['화나다'] == all_scores['역겹다'] == all_scores['슬프다'] == all_scores['무섭다']:
        label = '중립'
        train_docs_labeled.append((row, label))
    else:
        train_docs_labeled.append((row, label))   
#     print(all_scores)
#     print(label)

CPU times: user 40.8 s, sys: 614 ms, total: 41.4 s
Wall time: 41.4 s


In [264]:
from pprint import pprint
all_labeled = [row[1] for row in train_docs_labeled]

from collections import Counter
label_count = Counter(all_labeled)
pprint(label_count)
print()
sum_ = sum(label_count.values())
for label in label_count.keys():
    print(label ,":", round(label_count[label] / sum_, 2))

Counter({'기쁘다': 80251,
         '중립': 77900,
         '화나다': 41013,
         '슬프다': 20687,
         '역겹다': 20376,
         '무섭다': 12620})

무섭다 : 0.05
중립 : 0.31
역겹다 : 0.08
슬프다 : 0.08
기쁘다 : 0.32
화나다 : 0.16


**2018/04/28 15:18**
- 중립이 생각보다 많다. (38%)
- 긍정/부정/중립 비율은 31:31:38 (sentiment 상으로는 비율이 맞게 보임)

**2018/04/28 21:36**
- 긍정/부정/중립 비율은 32:37:31 (무섭다의 비율 감소)
- 무섭다의 비율이 적은 건 공표영화의 수가 적어서인 것으로 보임

In [265]:
train_docs_labeld_neutral = []
for row in train_docs_labeled:
    if row[1] == '중립':
        train_docs_labeld_neutral.append(row)

indicies = []

for idx, row in enumerate(train_docs_labeled):
    if row[1] == '중립':
        indicies.append(idx)
print(len(indicies))
train_docs_neutral = np.array(train_docs)[indicies].tolist()

77900


**2018/04/28 15:18**<br>
**어떤 리뷰가 중립으로 판별되었나?**
1. 다섯 가지 감정으로 분류하기 어려운 리뷰 (***회의*** --> ???)
2. 리뷰만으로는 긍정/부정을 알기 어려운 리뷰 (***더잘어울리는*** --> ??)
3. 감정을 나타내는 어휘가 없는 리뷰

**해결 방안**
- 1, 3: 제외
- 2: 긍정/부정 레이블 확인 or 평점 확인 --> 문제점: 레이블이 없는 리뷰도 존재

In [266]:
print(train_data[indicies[0]])
print(train_data[indicies[10]])
print(train_data[indicies[25]])
print(train_data[indicies[52]])

['"가이 리치의 영화, 1998년 록 스탁 앤 투 스모킹 배럴즈를 좋아하는 저는, 글쎄요, 극적 설정과 줄거리에 회의가 들어요."']
['법 이란걸 집행하는사람이 범죄자들에게 조종당하는 현실을 비판하는 영화']
['코난 시리즈중 침묵의15분다음으로 볼만했어요!우리꼬맹이왈']
['"스토리, 연출, 연기, 비주얼 등 영화의 기본 조차 안된 영화에 무슨 평을 해. 이런 영화 찍고도 김문옥 감독은 내가 영화 경력이 몇OO인데 조무래기들이 내 영화를 평론해? 같은 마인드에 빠져있겠지?"']


In [267]:
%%time
model_neutral = word2vec.Word2Vec(train_docs_neutral, size=300, workers=4, min_count=10, iter=100)

CPU times: user 3min 8s, sys: 901 ms, total: 3min 9s
Wall time: 57.8 s


In [268]:
save_pickle('../train_docs_neutral_0428_2136.pickle', train_docs_neutral)
model_neutral.save('../model/review_word2vec_neutral_20180428_2136')

In [269]:
df = pd.DataFrame(list(model_neutral.wv.vocab.keys()), columns=['token'])
df.to_csv('../neutral_corpus_0428.txt', index=False, encoding='utf-8')
train_data_neutral = pd.DataFrame(np.array(train_data)[indicies].tolist(), columns=['sentence'])
train_data_neutral.head()
train_data_neutral.to_csv('../neutral_data_0428.txt', index=False, encoding='utf-8')

In [270]:
import nltk
text_neutral = nltk.Text(train_docs_neutral, name="NEUTRAL")
save_pickle('../nltk_text_neutral.picke' , text_neutral)

In [289]:
save_pickle('../train_docs_lebeled.pickle', train_docs_labeled)
save_pickle('../train_data_lebeled.pickle', train_data_labeled)

In [284]:
emotion_label = [row[1] for row in train_docs_labeled]
train_data_labeled = [([train_data[i][0]], emotion_label[i]) for i in range(len(emotion_label))]

In [290]:
train_data_labeled[0]

(['"75분짜리, 결코  길다할 수없는 애니메이션 한편에 이정도 탄탄한 내용을 담을 수 있단 사실에 정말 놀랐다"'], '무섭다')

In [358]:
train_data_labeled_joy = [row[0] for row in train_data_labeled if row[1] == '기쁘다']
train_data_labeled_anger = [row[0] for row in train_data_labeled if row[1] == '화나다']
train_data_labeled_disgust = [row[0] for row in train_data_labeled if row[1] == '역겹다']
train_data_labeled_sadness = [row[0] for row in train_data_labeled if row[1] == '슬프다']
train_data_labeled_fear = [row[0] for row in train_data_labeled if row[1] == '무섭다']

In [361]:
train_data_labeled_no_neutral = [row for row in train_data_labeled if row[1] != '중립']
train_docs_labeled_no_neutral = [row for row in train_docs_labeled if row[1] != '중립']
print(len(train_data_labeled_no_neutral), len(train_docs_labeled_no_neutral))
save_pickle('../train_data_lebeled_no_neutral.pickle', train_data_labeled_no_neutral)
save_pickle('../train_docs_lebeled_no_neutral.pickle', train_docs_labeled_no_neutral)

174947 174947


In [301]:
def data_to_df(df):
    df = pd.DataFrame(columns=['tokens', 'emotion'])
    for idx, row in enumerate(df):
        df.loc[len(df)] = {
        'tokens': row[0],
        'emotion': row[1]
        }
    return df

In [362]:
df_data_no_neutral = pd.DataFrame(train_data_labeled_no_neutral, 
                                  columns=['sentence', 'emotion'])
df_data_no_neutral.head()

Unnamed: 0,sentence,emotion
0,"[""75분짜리, 결코 길다할 수없는 애니메이션 한편에 이정도 탄탄한 내용을 담을 ...",무섭다
1,"[""난무하는 욕설이나 더러운 말들, OO란 단어의 반복까진 참았다. 남녀가 첫만남부...",역겹다
2,"[""내가 실제 그장소에있다고 몰입 찐하게 하면 역대급으로 무섭고 ,영화가 이번씬은 ...",무섭다
3,"[""너무 가슴아픈 일이라 펑펑울면 어쩌나 걱정을 많이 했는데, 담담하게 볼수 있었습...",슬프다
4,"[""너에게 감동을 줄테니,너는 나에게 호주머니의 지갑을 열어달라고 울부짖고있음""]",기쁘다


In [318]:
df_docs_no_neutral = pd.DataFrame(train_docs_labeled_no_neutral, 
                                  columns=['tokens', 'emotion'])
df_docs_no_neutral.head()

Unnamed: 0,tokens,emotion
0,"[""/Punctuation, 75/Number, 분/Noun, 짜다/Verb, ,/...",무섭다
1,"[""/Punctuation, 난무/Noun, 하다/Verb, 욕설/Noun, 이나/...",역겹다
2,"[""/Punctuation, 내/Noun, 가/Josa, 실제/Noun, 그/Det...",무섭다
3,"[""/Punctuation, 너무/Noun, 가슴/Noun, 아프다/Adjectiv...",슬프다
4,"[""/Punctuation, 너/Noun, 에게/Josa, 감동/Noun, 을/Jo...",기쁘다


In [330]:
df_data_no_neutral.to_csv('../train_data_labeled_no_neutral.txt', encoding='utf-8', index=False)
df_docs_no_neutral.to_csv('../train_docs_labeled_no_neutral.txt', encoding='utf-8', index=False)

In [363]:
df_train_data_labeled_joy = pd.DataFrame(train_data_labeled_joy, columns=['sentence'])
df_train_data_labeled_anger = pd.DataFrame(train_data_labeled_anger, columns=['sentence'])
df_train_data_labeled_disgust = pd.DataFrame(train_data_labeled_disgust, columns=['sentence'])
df_train_data_labeled_sadness = pd.DataFrame(train_data_labeled_sadness, columns=['sentence'])
df_train_data_labeled_fear = pd.DataFrame(train_data_labeled_fear, columns=['sentence'])

In [333]:
train_docs_labeled_joy = [row for row in train_docs_labeled if row[1] == '기쁘다']
train_docs_labeled_anger = [row for row in train_docs_labeled if row[1] == '화나다']
train_docs_labeled_disgust = [row for row in train_docs_labeled if row[1] == '역겹다']
train_docs_labeled_sadness = [row for row in train_docs_labeled if row[1] == '슬프다']
train_docs_labeled_fear = [row for row in train_docs_labeled if row[1] == '무섭다']

In [335]:
df_train_docs_labeled_joy = pd.DataFrame(train_docs_labeled_joy, columns=['tokens', 'emotion'])
df_train_docs_labeled_anger = pd.DataFrame(train_docs_labeled_anger, columns=['tokens', 'emotion'])
df_train_docs_labeled_disgust = pd.DataFrame(train_docs_labeled_disgust, columns=['tokens', 'emotion'])
df_train_docs_labeled_sadness = pd.DataFrame(train_docs_labeled_sadness, columns=['tokens', 'emotion'])
df_train_docs_labeled_fear = pd.DataFrame(train_docs_labeled_fear, columns=['tokens', 'emotion'])

In [336]:
df_train_data_list = [df_train_data_labeled_joy, 
                      df_train_data_labeled_anger,
                      df_train_data_labeled_disgust,
                      df_train_data_labeled_sadness,
                      df_train_data_labeled_fear]

df_train_docs_list = [df_train_docs_labeled_joy, 
                      df_train_docs_labeled_anger,
                      df_train_docs_labeled_disgust,
                      df_train_docs_labeled_sadness,
                      df_train_docs_labeled_fear]

In [370]:
def to_csv(directory, obj):
    obj.to_csv(directory, encoding='utf-8', index=False)

In [371]:
for i in range(len(df_train_data_list)):
    to_csv('../train_data_labeled_{}.txt'.format(list(emotion_pair.keys())[i]), 
           df_train_data_list[i])

In [345]:
for i in range(len(df_train_data_list)):
    to_csv('../train_docs_labeled_{}.txt'.format(list(emotion_pair.keys())[i]), 
           df_train_docs_list[i])

In [434]:
train_data_joy_filtered = read_data('../train_data_labeled_joy.txt')
train_data_anger_filtered = read_data('../train_data_labeled_anger.txt')
train_data_disgust_filtered = read_data('../train_data_labeled_disgust.txt')
train_data_sadness_filtered = read_data('../train_data_labeled_sadness.txt')
train_data_fear_filtered = read_data('../train_data_labeled_fear.txt')

print("필터링 이후 데이터 개수:")
print(len(train_data_joy_filtered), len(train_data_anger_filtered), 
      len(train_data_disgust_filtered), len(train_data_sadness_filtered), 
      len(train_data_fear_filtered))

필터링 이후 데이터 개수:
77601 40201 20311 20691 10615


In [435]:
print(len(train_data_labeled_joy) - len(train_data_joy_filtered))
print(len(train_data_labeled_anger) - len(train_data_anger_filtered))
print(len(train_data_labeled_disgust) - len(train_data_disgust_filtered))
print(len(train_data_labeled_sadness) - len(train_data_sadness_filtered))
print(len(train_data_labeled_fear) - len(train_data_fear_filtered))

2650
812
65
-4
2005


**2018/04/29 22:05**
- 데이터 필터링 (레이블과 맞지 않는 데이터 제거)
- 필터링 이후 데이터 개수:
    - 기쁘다: 77601 (-2650)
    - 화나다: 40201 (-812)
    - 역겹다: 20311 (-65)
    - 슬프다: 20691 (+4)
    - 무섭다: 10615 (-2005)
    

- `tokenizer`의 한계로 부정문, 조사에 따른 변화를 (교착어의 특성을) 제대로 잡아내지 못함
- 특히 `norm`과 `stem` 모두 `True`로 설정해서
    - `tokenize('무서워')` --> '무섭다/Adjective'
    - `tokenize('무섭냐')` --> '무섭다/Adjective'

In [447]:
import itertools
def lst_of_lst_rm_duplicates(data):
    data = list(row for row, _ in itertools.groupby(data))
    return data

In [448]:
train_data_filtered_list = [train_data_joy_filtered, 
                            train_data_anger_filtered, 
                            train_data_disgust_filtered, 
                            train_data_sadness_filtered, 
                            train_data_fear_filtered]

for data in train_data_filtered_list:
    lst_of_lst_rm_duplicates(data)
    print(len(data))

77601
40201
20311
20691
10615


In [438]:
test_data_joy_filtered = random.sample(train_data_joy_filtered, 7600)
test_data_anger_filtered = random.sample(train_data_anger_filtered, 4020)
test_data_disgust_filtered = random.sample(train_data_disgust_filtered, 2030)
test_data_sadness_filtered = random.sample(train_data_sadness_filtered, 2070)
test_data_fear_filtered = random.sample(train_data_fear_filtered, 1060)

In [423]:
# 각 데이터 레이블 태깅

train_data_joy_filtered = [(row, 0) for row in train_data_joy_filtered]
train_data_anger_filtered = [(row, 1) for row in train_data_anger_filtered]
train_data_disgust_filtered = [(row, 2) for row in train_data_disgust_filtered]
train_data_sadness_filtered = [(row, 3) for row in train_data_sadness_filtered]
train_data_fear_filtered = [(row, 4) for row in train_data_fear_filtered]