In [1]:
from utils import *

In [2]:
def remove_stopwords(doc, stopwords):
    """
    Remove stopwords from doc.
    :param doc: Document to remove stopwords
    :param stopwords: stopwords corpus
    :return: documents with removed stopwords
    """
    stopwords_removed = []
    for token in doc:
        if not token in stopwords:
            stopwords_removed.append(token)

In [3]:
import numpy as np

def get_weights(matrix, x1, x2):
    """
    Get normalized weights matrix. Used for np.dot(TDM, weights).
    :param matrix: similarity matrix (euclidean or cosine)
    :param x1: rows
    :param x2: cols
    :return: normalized weights matrix
    """
    distance = matrix[x1, x2] ** 2
    variance = np.var(matrix)
    weights = np.exp(-(distance / (2 * variance ** 2)))
    return weights

In [4]:
train_data = read_data('../reviews/all_contents_03.txt')

In [2]:
from konlpy.tag import Twitter
twitter = Twitter()

In [6]:
%%time
train_docs = [tokenize(row[0]) for row in train_data]
train_docs_no_pos = [remove_pos(docs) for docs in train_docs]

CPU times: user 4min 33s, sys: 1.34 s, total: 4min 34s
Wall time: 4min 18s


In [7]:
from pprint import pprint

pprint(train_docs[-100])
print('')
pprint(train_docs_no_pos[-100])

['힘내다/Verb',
 '모두/Noun',
 '들/Suffix',
 '!!/Punctuation',
 '진실/Noun',
 '을/Josa',
 '밝히다/Verb',
 '!!/Punctuation']

['힘내다', '모두', '들', '!!', '진실', '을', '밝히다', '!!']


In [8]:
import nltk

tokens = [t for d in train_docs for t in d]
text = nltk.Text(tokens, name="NMSC")
print(text)

<Text: NMSC>


In [9]:
print(len(text.tokens))
print(len(set(text.tokens)))
pprint(text.vocab().most_common(10))

3931769
63779
[('./Punctuation', 115283),
 ('하다/Verb', 93744),
 ('영화/Noun', 90605),
 ('이/Josa', 71434),
 ('보다/Verb', 65114),
 ('의/Josa', 51345),
 ('에/Josa', 49016),
 ('../Punctuation', 47244),
 ('가/Josa', 46659),
 ('을/Josa', 42707)]


In [10]:
%%time
from gensim.models import word2vec
model = word2vec.Word2Vec(train_docs, size=300, workers=4, min_count=10, iter=100)

CPU times: user 16min 58s, sys: 3.74 s, total: 17min 2s
Wall time: 4min 41s


In [11]:
save_pickle('../train_docs_0426.pickle', train_docs)
save_pickle('../train_docs_no_pos_0426.pickle', train_docs_no_pos)
save_pickle('../nltk_text_0426.pickle', text)
model.save('../model/review_word2vec_20180426')

In [3]:
from konlpy.tag import Twitter; twitter = Twitter()
from gensim.models import word2vec
train_docs = load_pickle('../train_docs_0426.pickle')
train_docs_no_pos = load_pickle('../train_docs_no_pos_0426.pickle')
text = load_pickle('../nltk_text_0426.pickle')
model = word2vec.Word2Vec.load('../model/review_word2vec_20180426')

In [8]:
emotion_pair = {'joy': '기쁘다', 'anger': '화나다', 
                'disgust': '역겹다', 'sadness': '슬프다', 'fear': '무섭다'}
emotion_ko_list = list(emotion_pair.values())
emotion_ko_dic = {
    '기쁘다': 0, 
    '화나다': 1, 
    '역겹다': 2,
    '슬프다': 3,
    '무섭다': 4,
}
emotion_ko_list = [tokenize(row) for row in emotion_ko_list]
emotion_ko_list = [row[0] for row in emotion_ko_list]
emotion_ko_list

['기쁘다/Adjective',
 '화나다/Verb',
 '역겹다/Adjective',
 '슬프다/Adjective',
 '무섭다/Adjective']

In [270]:
tmp = ['귀신/Noun', '깜놀/Noun', '놀라다/Verb', '멀미/Noun', '놀램/Noun',
       '지르다/Verb', '갑툭튀/Noun', '지름/Noun', '놀람/Noun', '나쁘다/Adjective',
       '비명/Noun', '튀어나오다/Verb', '소리내다/Verb', '오지/Noun', '울다/Verb',
       '놀랬/Noun', '무섭다/Adjective', '깜짝/Noun', '놀란/Noun', '놀랬어/Noun', 
       '깜짝깜짝/Adverb', '섬뜩/Adverb', '소름/Noun', '돋다/Adjective']

li = remove_pos(tmp)

def get_similar_words(docs):
    li_new = []
    for word in li:
        try:
            token = model.wv.most_similar(tokenize(twitter, word)[0], topn=8)
            if token not in li_new:
                li_new.append(token)
        except:
            pass

    return li_new

li_new = get_similar_words(li)
for line in li_new:
    for w in line:
        if w[0] not in tmp:
            tmp.append(w[0])

In [286]:
pprint(list(fear_list)[:10])

['오름/Noun',
 '비명/Noun',
 '뿜었/Noun',
 '잼슴/Noun',
 '웃다/Verb',
 'Ost/Alpha',
 '드럼/Noun',
 '놀라다/Verb',
 '오지/Noun',
 '빙의/Noun']


In [30]:
model.wv.most_similar('무서움/Noun', topn=5)

[('무섭다/Adjective', 0.6415336728096008),
 ('애나벨/Noun', 0.36187833547592163),
 ('귀신/Noun', 0.3564368188381195),
 ('깜놀/Noun', 0.3522293269634247),
 ('놀램/Noun', 0.3495345115661621)]

In [31]:
def read_data_comma(filename):
    with open(filename, 'r') as f:
        data = [line.split(',') for line in f.read().splitlines()]
        data = data[1:]
    return data

# emotion_corpus = read_data_comma('../emotion_corpus_labeled.txt')

In [180]:
emotion_corpus = pd.read_csv('../emotion_corpus_labeled.txt', encoding='utf-8', sep=',')
emotion_corpus.head()

Unnamed: 0,token,emotion
0,기쁘다/Adjective,0
1,감동/Noun,0
2,감사/Noun,0
3,고맙다/Adjective,0
4,괜찮다/Adjective,0


In [181]:
joy_list = emotion_corpus[emotion_corpus['emotion']==0].values.tolist()
anger_list = emotion_corpus[emotion_corpus['emotion']==1].values.tolist()
disgust_list = emotion_corpus[emotion_corpus['emotion']==2].values.tolist()
sadness_list = emotion_corpus[emotion_corpus['emotion']==3].values.tolist()
fear_list = emotion_corpus[emotion_corpus['emotion']==4].values.tolist()

joy_list = [row[0] for row in joy_list]
anger_list = [row[0] for row in anger_list]
disgust_list = [row[0] for row in disgust_list]
sadness_list = [row[0] for row in sadness_list]
fear_list = [row[0] for row in fear_list]

len(joy_list), len(anger_list), len(disgust_list), len(sadness_list), len(fear_list)

(94, 152, 137, 93, 88)

In [182]:
%%time
train_docs_labeled = []

for row in train_docs:
    joy_score = 0
    anger_score = 0
    disgust_score = 0
    sadness_score = 0
    fear_score = 0
    all_scores = {}
    
    all_scores['기쁘다'] = joy_score
    all_scores['화나다'] = anger_score
    all_scores['역겹다'] = disgust_score
    all_scores['슬프다'] = sadness_score
    all_scores['무섭다'] = fear_score
    
    for _, token in enumerate(row):
            
        if token in joy_list:
            joy_score += 1
        elif token in anger_list:
            anger_score += 1
        elif token in disgust_list:
            disgust_score += 1
        elif token in sadness_list:
            sadness_score += 1
        elif token in fear_list:
            fear_score += 1
        
        all_scores['기쁘다'] += joy_score
        all_scores['화나다'] += anger_score
        all_scores['역겹다'] += disgust_score
        all_scores['슬프다'] += sadness_score
        all_scores['무섭다'] += fear_score   
        
        label = max(all_scores, key=lambda key: all_scores[key])
    
    if all_scores[label] == 0:
        label = '중립'
        train_docs_labeled.append((row, label))
#     elif all_scores['기쁘다'] == all_scores['화나다'] == all_scores['역겹다'] == all_scores['슬프다'] == all_scores['무섭다']:
#         label = '중립'
#         train_docs_labeled.append((row, label))
    else:
        train_docs_labeled.append((row, label))   
#     print(all_scores)
#     print(label)

CPU times: user 29.1 s, sys: 86.8 ms, total: 29.2 s
Wall time: 29.2 s


In [183]:
all_labeled = [row[1] for row in train_docs_labeled]

from collections import Counter
label_count = Counter(all_labeled)
label_count

Counter({'기쁘다': 69354,
         '무섭다': 14219,
         '슬프다': 17737,
         '역겹다': 18923,
         '중립': 104237,
         '화나다': 28377})

In [184]:
train_docs_labeld_neutral = []
for row in train_docs_labeled:
    if row[1] == '중립':
        train_docs_labeld_neutral.append(row)

indicies = []

for idx, row in enumerate(train_docs_labeled):
    if row[1] == '중립':
        indicies.append(idx)
print(len(indicies))
train_docs_neutral = np.array(train_docs)[indicies].tolist()

104237


In [185]:
%%time
model_neutral = word2vec.Word2Vec(train_docs_neutral, size=300, workers=4, min_count=10, iter=100)

CPU times: user 4min 37s, sys: 1.12 s, total: 4min 38s
Wall time: 1min 21s


In [186]:
save_pickle('../train_docs_neutral_0427.pickle', train_docs_neutral)
model_neutral.save('../model/review_word2vec_neutral_20180427')

In [187]:
df = pd.DataFrame(list(model_neutral.wv.vocab.keys()))
df.to_csv('../neutral_corpus.txt', index=False, encoding='utf-8')

In [188]:
import nltk
text_neutral = nltk.Text(train_docs_neutral, name="NEUTRAL")

In [202]:
model.wv.most_similar('가엾다/Adjective', topn=50)

[('불쌍하다/Adjective', 0.2824586033821106),
 ('비위/Noun', 0.2561807632446289),
 ('감수성/Noun', 0.23796811699867249),
 ('신앙/Noun', 0.23324984312057495),
 ('무미건조/Noun', 0.22985348105430603),
 ('연약/Noun', 0.22469256818294525),
 ('가출/Noun', 0.22458600997924805),
 ('한심하다/Adjective', 0.22446684539318085),
 ('멘탈/Noun', 0.2216242253780365),
 ('멀미/Noun', 0.22064091265201569),
 ('참담/Noun', 0.21878448128700256),
 ('나약/Noun', 0.2170790433883667),
 ('부끄러움/Noun', 0.21612493693828583),
 ('피비/Noun', 0.2144235074520111),
 ('노망/Noun', 0.2143237143754959),
 ('하앍/Noun', 0.2133876234292984),
 ('지못미/Noun', 0.21295066177845),
 ('짜증/Noun', 0.21245987713336945),
 ('전능/Noun', 0.21142469346523285),
 ('사랑스럽다/Adjective', 0.2111159861087799),
 ('이기심/Noun', 0.2098180055618286),
 ('화도/Noun', 0.20951983332633972),
 (',..../Punctuation', 0.20921377837657928),
 ('신격화/Noun', 0.20774462819099426),
 ('애처/Noun', 0.2071099728345871),
 ('화병/Noun', 0.20569823682308197),
 ('미개/Noun', 0.20228713750839233),
 ('투철/Noun', 0.2015980035066