In [1]:
import pandas as pd
import numpy as np
import re

reviews = pd.read_csv('data/dataset.csv', encoding='utf8')
neg_reviews = reviews[(reviews['rate'] == 1) | (reviews['rate'] == 2) | (reviews['rate'] == 3) | (reviews['rate'] == 4)]

docs = neg_reviews['review']

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
import xgboost
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from konlpy.tag import Twitter, Mecab, Komoran
from soynlp.tokenizer import RegexTokenizer, LTokenizer, MaxScoreTokenizer

mecab = Mecab()
twitter = Twitter()
komoran = Komoran()

def mecab_tokenizer(text):
    return [t for t in mecab.morphs(text)]

def twit_tokenizer(text):
    return [t for t in twitter.morphs(text, norm=True, stem=True)]

def komoran_tokenizer(text):
    return [t for t in komoran.morphs(text)]

def max_tokenizer(text):
    tokenizer=MaxScoreTokenizer()
    return tokenizer.tokenize(text)

In [3]:
%%time
train_docs = [row for row in docs]
sentences = [max_tokenizer(d) for d in train_docs]

CPU times: user 12.2 s, sys: 90.3 ms, total: 12.3 s
Wall time: 12.3 s


In [4]:
import gensim
from gensim import models
from gensim import corpora
from gensim.models import doc2vec
from gensim.models.doc2vec import TaggedDocument
from gensim.models import word2vec, Word2Vec

In [5]:
%%time
model = word2vec.Word2Vec(sentences, size=100, min_count=3, window=3, workers=8, iter=100, sg=1)
model.init_sims(replace=True)

CPU times: user 4min 34s, sys: 1.13 s, total: 4min 36s
Wall time: 57.4 s


In [6]:
model.most_similar(positive='사양', topn=20)

[('심해요...', 0.574049174785614),
 ('다른앱', 0.5618752241134644),
 ('고쳐야', 0.5403777360916138),
 ('2초에', 0.5397657155990601),
 ('너어무', 0.537325382232666),
 ('조작법', 0.5326470732688904),
 ('소비자들을', 0.5323509573936462),
 ('요새는', 0.5305172204971313),
 ('전체화면에', 0.5213994979858398),
 ('않구요.', 0.5181449055671692),
 ('개선되나요?', 0.5142256021499634),
 ('사용량이', 0.5110517740249634),
 ('자체도', 0.5085805654525757),
 ('느리기도', 0.5000627040863037),
 ('멈추면서', 0.49874767661094666),
 ('없지만,', 0.4986870586872101),
 ('무한로딩,', 0.49834316968917847),
 ('5분마다', 0.49690020084381104),
 ('느림...', 0.4968810975551605),
 ('그렇습니다..', 0.4962446391582489)]

In [8]:
a = model.syn1neg

In [14]:
word = list(model.wv.vocab.keys())
words = pd.Series(word)

In [15]:
words[0]

'오늘'

In [11]:
a.shape

(28232, 100)

In [31]:
%%time
ttm = pd.DataFrame()
for num, i in enumerate(range(len(a))):
    dists = list()
    for j in range(len(a)):
        dist = np.linalg.norm(a[i]-a[j])
        dists.append(dist)
    
    ttm[words[num]] = dists

CPU times: user 1h 44min 42s, sys: 35min 56s, total: 2h 20min 38s
Wall time: 3h 22min 26s


In [32]:
ttm.set_index(words)

Unnamed: 0,오늘,아침,프레이,하면서,구매를,했는데,도감,받았다고,하는데,수치가,...,체온계를,시즌1~3,울음,이유식,임신,개월수,개월수가,미접종으로,베페,백업폴더
오늘,0.000000,3.010202,2.636879,2.620956,3.118804,2.973584,3.300972,3.339690,4.098722,2.977553,...,4.316735,4.839762,4.485609,4.823262,4.382636,4.212671,4.839598,4.947616,4.496786,4.646063
아침,3.010202,0.000000,2.692993,2.658778,2.422013,2.942491,3.124512,3.330594,3.998819,2.605075,...,4.797442,4.637306,4.951381,4.432413,5.077631,4.268838,4.063090,4.414053,4.727905,4.114304
프레이,2.636879,2.692993,0.000000,2.719648,2.711962,2.741400,3.119675,3.224352,3.869446,3.010875,...,4.228076,4.801086,4.417546,4.416020,4.911632,4.173082,4.368901,4.712857,4.376740,4.643119
하면서,2.620956,2.658778,2.719648,0.000000,3.057285,2.391138,3.198012,3.057244,4.030236,2.813936,...,4.591563,4.998090,4.600024,4.721667,4.663608,4.777465,4.358956,4.728779,4.330373,4.151120
구매를,3.118804,2.422013,2.711962,3.057285,0.000000,2.991350,3.325371,3.103309,3.719041,2.937642,...,4.482367,5.035891,4.706844,4.846210,5.177785,4.596841,4.500524,4.689230,4.588502,4.526751
했는데,2.973584,2.942491,2.741400,2.391138,2.991350,0.000000,3.348155,3.046381,3.971649,3.247437,...,4.625213,4.855656,4.461199,4.375663,4.656478,5.007792,4.330386,4.799002,4.380734,4.224074
도감,3.300972,3.124512,3.119675,3.198012,3.325371,3.348155,0.000000,3.402635,4.127107,3.424962,...,4.778552,4.522870,4.123897,4.567635,4.732057,4.959084,4.375244,4.853742,5.004393,4.402514
받았다고,3.339690,3.330594,3.224352,3.057244,3.103309,3.046381,3.402635,0.000000,4.112440,3.309194,...,4.755494,4.873313,4.730867,4.914104,4.631474,5.210726,4.497602,4.852703,4.949323,4.243362
하는데,4.098722,3.998819,3.869446,4.030236,3.719041,3.971649,4.127107,4.112440,0.000000,4.273146,...,5.264136,5.433907,5.082719,5.204056,5.266076,5.276933,4.967569,5.468447,5.138552,4.807314
수치가,2.977553,2.605075,3.010875,2.813936,2.937642,3.247437,3.424962,3.309194,4.273146,0.000000,...,4.498213,4.662057,4.825461,4.888714,4.904352,4.486492,4.603949,4.878207,5.097737,4.421197


In [33]:
ttm.shape

(28232, 28232)

### mongo DB로 보내야 함 (collection 새로)

In [34]:
ttm.to_csv("distance_matrix.csv", index=False)