In [1]:
import pandas as pd
import numpy as np
import re

reviews = pd.read_csv('data/dataset.csv', encoding='utf8')
neg_reviews = reviews[(reviews['rate'] == 1) | (reviews['rate'] == 2) | (reviews['rate'] == 3) | (reviews['rate'] == 4)]

docs = neg_reviews['review']

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
import xgboost
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from konlpy.tag import Twitter, Mecab, Komoran
from soynlp.tokenizer import RegexTokenizer, LTokenizer, MaxScoreTokenizer

mecab = Mecab()
twitter = Twitter()
komoran = Komoran()

def mecab_tokenizer(text):
    return [t for t in mecab.morphs(text)]

def twit_tokenizer(text):
    return [t for t in twitter.morphs(text, norm=True, stem=True)]

def komoran_tokenizer(text):
    return [t for t in komoran.morphs(text)]

def max_tokenizer(text):
    tokenizer=MaxScoreTokenizer()
    return tokenizer.tokenize(text)

In [3]:
%%time
train_docs = [row for row in docs]
sentences = [max_tokenizer(d) for d in train_docs]

CPU times: user 12.2 s, sys: 90.3 ms, total: 12.3 s
Wall time: 12.3 s


In [4]:
import gensim
from gensim import models
from gensim import corpora
from gensim.models import doc2vec
from gensim.models.doc2vec import TaggedDocument
from gensim.models import word2vec, Word2Vec

In [5]:
%%time
model = word2vec.Word2Vec(sentences, size=100, min_count=3, window=3, workers=8, iter=100, sg=1)
model.init_sims(replace=True)

CPU times: user 4min 34s, sys: 1.13 s, total: 4min 36s
Wall time: 57.4 s


In [6]:
model.most_similar(positive='사양', topn=20)

[('심해요...', 0.574049174785614),
 ('다른앱', 0.5618752241134644),
 ('고쳐야', 0.5403777360916138),
 ('2초에', 0.5397657155990601),
 ('너어무', 0.537325382232666),
 ('조작법', 0.5326470732688904),
 ('소비자들을', 0.5323509573936462),
 ('요새는', 0.5305172204971313),
 ('전체화면에', 0.5213994979858398),
 ('않구요.', 0.5181449055671692),
 ('개선되나요?', 0.5142256021499634),
 ('사용량이', 0.5110517740249634),
 ('자체도', 0.5085805654525757),
 ('느리기도', 0.5000627040863037),
 ('멈추면서', 0.49874767661094666),
 ('없지만,', 0.4986870586872101),
 ('무한로딩,', 0.49834316968917847),
 ('5분마다', 0.49690020084381104),
 ('느림...', 0.4968810975551605),
 ('그렇습니다..', 0.4962446391582489)]

In [8]:
a = model.syn1neg

In [14]:
word = list(model.wv.vocab.keys())
words = pd.Series(word)

In [15]:
words[0]

'오늘'

In [11]:
a.shape

(28232, 100)

In [None]:
%%time
ttm = pd.DataFrame()
for num, i in enumerate(range(len(a))):
    dists = list()
    for j in range(len(a)):
        dist = np.linalg.norm(a[i]-a[j])
        dists.append(dist)
    
    ttm[words[num]] = dists

In [None]:
ttm.set_index(words)

In [None]:
ttm.to_csv("distance_matrix.csv", index=False)