In [2]:
def remove_stopwords(doc, stopwords):
    """
    Remove stopwords from doc.
    :param doc: Document to remove stopwords
    :param stopwords: stopwords corpus
    :return: documents with removed stopwords
    """
    stopwords_removed = []
    for token in doc:
        if not token in stopwords:
            stopwords_removed.append(token)

In [3]:
import numpy as np

def get_weights(matrix, x1, x2):
    """
    Get normalized weights matrix. Used for np.dot(TDM, weights).
    :param matrix: similarity matrix (euclidean or cosine)
    :param x1: rows
    :param x2: cols
    :return: normalized weights matrix
    """
    distance = matrix[x1, x2] ** 2
    variance = np.var(matrix)
    weights = np.exp(-(distance / (2 * variance ** 2)))
    return weights

In [None]:
tmp = ['귀신/Noun', '깜놀/Noun', '놀라다/Verb', '멀미/Noun', '놀램/Noun',
       '지르다/Verb', '갑툭튀/Noun', '지름/Noun', '놀람/Noun', '나쁘다/Adjective',
       '비명/Noun', '튀어나오다/Verb', '소리내다/Verb', '오지/Noun', '울다/Verb',
       '놀랬/Noun', '무섭다/Adjective', '깜짝/Noun', '놀란/Noun', '놀랬어/Noun', 
       '깜짝깜짝/Adverb', '섬뜩/Adverb', '소름/Noun', '돋다/Adjective']

li = remove_pos(tmp)

def get_similar_words(docs):
    li_new = []
    for word in li:
        try:
            token = model.wv.most_similar(tokenize(twitter, word)[0], topn=8)
            if token not in li_new:
                li_new.append(token)
        except:
            pass

    return li_new

li_new = get_similar_words(li)
for line in li_new:
    for w in line:
        if w[0] not in tmp:
            tmp.append(w[0])

In [1]:
from utils import *

In [15]:
train_data = read_data('../reviews/all_contents_03.txt')

In [2]:
from konlpy.tag import Twitter
twitter = Twitter()

In [6]:
%%time
train_docs = [tokenize(row[0]) for row in train_data]
train_docs_no_pos = [remove_pos(docs) for docs in train_docs]

CPU times: user 4min 33s, sys: 1.34 s, total: 4min 34s
Wall time: 4min 18s


In [7]:
from pprint import pprint

pprint(train_docs[-100])
print('')
pprint(train_docs_no_pos[-100])

['힘내다/Verb',
 '모두/Noun',
 '들/Suffix',
 '!!/Punctuation',
 '진실/Noun',
 '을/Josa',
 '밝히다/Verb',
 '!!/Punctuation']

['힘내다', '모두', '들', '!!', '진실', '을', '밝히다', '!!']


In [8]:
import nltk

tokens = [t for d in train_docs for t in d]
text = nltk.Text(tokens, name="NMSC")
print(text)

<Text: NMSC>


In [9]:
print(len(text.tokens))
print(len(set(text.tokens)))
pprint(text.vocab().most_common(10))

3931769
63779
[('./Punctuation', 115283),
 ('하다/Verb', 93744),
 ('영화/Noun', 90605),
 ('이/Josa', 71434),
 ('보다/Verb', 65114),
 ('의/Josa', 51345),
 ('에/Josa', 49016),
 ('../Punctuation', 47244),
 ('가/Josa', 46659),
 ('을/Josa', 42707)]


In [10]:
%%time
from gensim.models import word2vec
model = word2vec.Word2Vec(train_docs, size=300, workers=4, min_count=10, iter=100)

CPU times: user 16min 58s, sys: 3.74 s, total: 17min 2s
Wall time: 4min 41s


In [11]:
save_pickle('../train_docs_0426.pickle', train_docs)
save_pickle('../train_docs_no_pos_0426.pickle', train_docs_no_pos)
save_pickle('../nltk_text_0426.pickle', text)
model.save('../model/review_word2vec_20180426')

In [3]:
from konlpy.tag import Twitter; twitter = Twitter()
from gensim.models import word2vec
train_docs = load_pickle('../train_docs_0426.pickle')
train_docs_no_pos = load_pickle('../train_docs_no_pos_0426.pickle')
text = load_pickle('../nltk_text_0426.pickle')
model = word2vec.Word2Vec.load('../model/review_word2vec_20180426')

In [6]:
emotion_pair = {'joy': '기쁘다', 'anger': '화나다', 
                'disgust': '역겹다', 'sadness': '슬프다', 'fear': '무섭다'}
emotion_ko_list = list(emotion_pair.values())
emotion_ko_dic = {
    '기쁘다': 0, 
    '화나다': 1, 
    '역겹다': 2,
    '슬프다': 3,
    '무섭다': 4,
}
emotion_ko_list = [tokenize(row) for row in emotion_ko_list]
emotion_ko_list = [row[0] for row in emotion_ko_list]
emotion_ko_list

['기쁘다/Adjective',
 '화나다/Verb',
 '역겹다/Adjective',
 '슬프다/Adjective',
 '무섭다/Adjective']

In [7]:
model.wv.most_similar('무서움/Noun', topn=5)

[('무섭다/Adjective', 0.6415336728096008),
 ('애나벨/Noun', 0.36187833547592163),
 ('귀신/Noun', 0.3564368188381195),
 ('깜놀/Noun', 0.3522293269634247),
 ('놀램/Noun', 0.3495345115661621)]

In [8]:
def read_data_comma(filename):
    with open(filename, 'r') as f:
        data = [line.split(',') for line in f.read().splitlines()]
        data = data[1:]
    return data

# emotion_corpus = read_data_comma('../emotion_corpus_labeled.txt')

In [22]:
emotion_corpus = pd.read_csv('../emotion_corpus_labeled.txt', encoding='utf-8', sep=',')

joy_list = emotion_corpus[emotion_corpus['emotion']==0].values.tolist()
anger_list = emotion_corpus[emotion_corpus['emotion']==1].values.tolist()
disgust_list = emotion_corpus[emotion_corpus['emotion']==2].values.tolist()
sadness_list = emotion_corpus[emotion_corpus['emotion']==3].values.tolist()
fear_list = emotion_corpus[emotion_corpus['emotion']==4].values.tolist()

joy_list = [row[0] for row in joy_list]
anger_list = [row[0] for row in anger_list]
disgust_list = [row[0] for row in disgust_list]
sadness_list = [row[0] for row in sadness_list]
fear_list = [row[0] for row in fear_list]

len(joy_list), len(anger_list), len(disgust_list), len(sadness_list), len(fear_list)

(114, 154, 139, 95, 88)

In [23]:
%%time
train_docs_labeled = []

for row in train_docs:
    joy_score = 0
    anger_score = 0
    disgust_score = 0
    sadness_score = 0
    fear_score = 0
    all_scores = {}
    
    all_scores['기쁘다'] = joy_score
    all_scores['화나다'] = anger_score
    all_scores['역겹다'] = disgust_score
    all_scores['슬프다'] = sadness_score
    all_scores['무섭다'] = fear_score
    
    for _, token in enumerate(row):
            
        if token in joy_list:
            joy_score += 1
        elif token in anger_list:
            anger_score += 1
        elif token in disgust_list:
            disgust_score += 1
        elif token in sadness_list:
            sadness_score += 1
        elif token in fear_list:
            fear_score += 1
        
        all_scores['기쁘다'] += joy_score
        all_scores['화나다'] += anger_score
        all_scores['역겹다'] += disgust_score
        all_scores['슬프다'] += sadness_score
        all_scores['무섭다'] += fear_score   
        
        label = max(all_scores, key=lambda key: all_scores[key])
    
    if all_scores[label] == 0:
        label = '중립'
        train_docs_labeled.append((row, label))
#     elif all_scores['기쁘다'] == all_scores['화나다'] == all_scores['역겹다'] == all_scores['슬프다'] == all_scores['무섭다']:
#         label = '중립'
#         train_docs_labeled.append((row, label))
    else:
        train_docs_labeled.append((row, label))   
#     print(all_scores)
#     print(label)

CPU times: user 31.4 s, sys: 97.3 ms, total: 31.5 s
Wall time: 31.7 s


In [40]:
from pprint import pprint
all_labeled = [row[1] for row in train_docs_labeled]

from collections import Counter
label_count = Counter(all_labeled)
pprint(label_count)
print()
sum_ = sum(label_count.values())
for label in label_count.keys():
    print(label ,":", round(label_count[label] / sum_, 2))

Counter({'중립': 96261,
         '기쁘다': 78493,
         '화나다': 28464,
         '역겹다': 18696,
         '슬프다': 17086,
         '무섭다': 13847})

무섭다 : 0.05
중립 : 0.38
역겹다 : 0.07
슬프다 : 0.07
기쁘다 : 0.31
화나다 : 0.11


**2018/04/28 15:18**
- 중립이 생각보다 많다. (38%)
- 긍정/부정/중립 비율은 31:31:38 (sentiment 상으로는 비율이 맞게 보임)

In [33]:
train_docs_labeld_neutral = []
for row in train_docs_labeled:
    if row[1] == '중립':
        train_docs_labeld_neutral.append(row)

indicies = []

for idx, row in enumerate(train_docs_labeled):
    if row[1] == '중립':
        indicies.append(idx)
print(len(indicies))
train_docs_neutral = np.array(train_docs)[indicies].tolist()

96261


**어떤 리뷰가 중립으로 판별되었나?**
1. 다섯 가지 감정으로 분류하기 어려운 리뷰 (***회의*** --> ???)
2. 리뷰만으로는 긍정/부정을 알기 어려운 리뷰 (***더잘어울리는*** --> ??)
3. 감정을 나타내는 어휘가 없는 리뷰

**해결 방안**
- 1, 3: 제외
- 2: 긍정/부정 레이블 확인 or 평점 확인 --> 문제점: 레이블이 없는 리뷰도 존재

In [39]:
print(train_data[indicies[0]])
print(train_data[indicies[10]])
print(train_data[indicies[25]])
print(train_data[indicies[52]])

['"가이 리치의 영화, 1998년 록 스탁 앤 투 스모킹 배럴즈를 좋아하는 저는, 글쎄요, 극적 설정과 줄거리에 회의가 들어요."']
['무법자의 천상천하유아독쫑 이란 제목이 더 잘어울리는 영화!']
['조조 히데오감독은 나름 내용에 의미는 부여 하는 듯!']
['명대사: 앤딩부분 여선생曰 적당']


In [41]:
%%time
model_neutral = word2vec.Word2Vec(train_docs_neutral, size=300, workers=4, min_count=10, iter=100)

CPU times: user 4min 14s, sys: 1.27 s, total: 4min 15s
Wall time: 1min 17s


In [42]:
save_pickle('../train_docs_neutral_0427.pickle', train_docs_neutral)
model_neutral.save('../model/review_word2vec_neutral_20180427')

In [50]:
df = pd.DataFrame(list(model_neutral.wv.vocab.keys()), columns=['token'])
df.to_csv('../neutral_corpus.txt', index=False, encoding='utf-8')

In [43]:
import nltk
text_neutral = nltk.Text(train_docs_neutral, name="NEUTRAL")

In [84]:
tokenize('싸구려')

['싸구려/Noun']

In [85]:
model.wv.most_similar('싸구려/Noun', topn=50)

[('삼류/Noun', 0.33203670382499695),
 ('신파극/Noun', 0.3291999399662018),
 ('저급/Noun', 0.3059639632701874),
 ('에로영화/Noun', 0.291871041059494),
 ('에로티시즘/Noun', 0.28579774498939514),
 ('비급/Noun', 0.282213419675827),
 ('에로물/Noun', 0.2783181369304657),
 ('헐리우드/Noun', 0.27360162138938904),
 ('질/Noun', 0.2633456289768219),
 ('무협지/Noun', 0.2590341866016388),
 ('단순하다/Adjective', 0.25870680809020996),
 ('양산/Noun', 0.25715211033821106),
 ('억지/Noun', 0.25333061814308167),
 ('쓰레기/Noun', 0.2532225549221039),
 ('요즘/Noun', 0.2517792582511902),
 ('류/Noun', 0.25019702315330505),
 ('시답/Noun', 0.25014448165893555),
 ('어설픈/Noun', 0.2495957314968109),
 ('포르노/Noun', 0.24708467721939087),
 ('코미디/Noun', 0.2448120415210724),
 ('뭣/Noun', 0.24169448018074036),
 ('경박/Noun', 0.23776760697364807),
 ('설탕/Noun', 0.2366945594549179),
 ('애로/Noun', 0.23624417185783386),
 ('천박/Noun', 0.233965665102005),
 ('코메디/Noun', 0.23341765999794006),
 ('썰렁하다/Adjective', 0.2324945479631424),
 ('유아/Noun', 0.23086710274219513),
 ('옴니버스/Nou