In [1]:
import pandas as pd
import numpy as np
import random
import pickle
import itertools
import logging
import re
import time

import matplotlib.pyplot as plt
import seaborn as sns

from konlpy.tag import Okt
from gensim.models import Word2Vec, KeyedVectors
from collections import Counter
from tqdm import tqdm
from sklearn import preprocessing

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the train and validation spreadsheets into DataFrames.
train, val = (
    pd.read_excel(path)
    for path in ('./data/paper_train.xlsx', './data/paper_val.xlsx')
)

In [None]:
# Display the training DataFrame read from paper_train.xlsx.
train

In [None]:
# Display the validation DataFrame read from paper_val.xlsx.
val

In [None]:
# Pull the 'abstract' column out of each split.
train_abst, val_abst = (frame['abstract'] for frame in (train, val))

In [None]:
# Convert the abstract columns to plain Python lists of strings.
# Empty spreadsheet cells are read as NaN (a float); such a value would make the
# later `hangul.sub('', sent)` call raise a TypeError, so rows with a missing
# abstract are dropped and the remaining values are coerced to str.
train_sent = train_abst.dropna().astype(str).tolist()
val_sent = val_abst.dropna().astype(str).tolist()

In [None]:
# Number of training abstracts.
len(train_sent)

In [None]:
# Number of validation abstracts.
len(val_sent)

In [None]:
# Combine training and validation abstracts into one list (train first, then val).
sentences = [*train_sent, *val_sent]

In [None]:
#sentences

In [None]:
# Total number of abstracts (train + val).
len(sentences)

In [None]:
# Check the container type of the combined abstracts.
type(sentences)

# wiki_ko

In [None]:
# Read the preprocessed Korean Wikipedia dump, one list entry per line
# (each entry keeps its trailing newline, exactly as read from the file).
# The `with` block guarantees the handle is closed even if reading fails
# part-way (e.g. on a decoding error).
with open('./data/embedding/processed_wiki_ko.txt', encoding="utf8") as f:
    wiki_ko = [line for line in tqdm(f)]

In [None]:
#wiki_ko

In [None]:
# Number of lines read from the wiki file.
len(wiki_ko)

In [None]:
# Check the container type of the wiki lines.
type(wiki_ko)

# Preprocess

In [None]:
# Merge the paper abstracts with the Wikipedia lines into one training corpus.
total = sentences + wiki_ko
# Preview only the first few entries; echoing the whole list would dump every
# abstract and every wiki line into the cell output.
total[:5]

In [None]:
# Size of the merged corpus (abstracts + wiki lines).
len(total)

In [None]:
# Check the container type of the merged corpus.
type(total)

In [None]:
# Matches every run of characters that is NOT a space, Hangul (jamo or syllable)
# or an ASCII digit; each match is deleted from the text.
hangul = re.compile('[^ ㄱ-ㅣ가-힣0-9]+')
preprocessed_data = [hangul.sub('', sent) for sent in tqdm(total)]

In [None]:
# Number of cleaned entries (one per entry of `total`).
len(preprocessed_data)

In [None]:
# Check the container type of the cleaned corpus.
type(preprocessed_data)

# Tokenize

In [None]:
# Tokenize every cleaned entry with Okt and drop a small set of particles.
okt = Okt()
tokenized_data = []
stopwords = ['을', '를', '이', '가', '은', '는', '의']
# Membership is tested once per token over the whole corpus, so use a set
# for constant-time lookups; `stopwords` itself stays a list.
stopword_set = frozenset(stopwords)

for sentence in tqdm(preprocessed_data):
    token = okt.morphs(sentence)  # tokenize
    token = [t for t in token if t not in stopword_set]
    tokenized_data.append(token)

In [None]:
# Preview the first five tokenized entries.
tokenized_data[:5]

In [None]:
# Number of tokenized entries.
len(tokenized_data)

In [None]:
# Check the container type of the tokenized corpus.
type(tokenized_data)

In [None]:
# Token-count statistics over every tokenized entry, followed by a histogram.
sample_lengths = [len(tokens) for tokens in tokenized_data]
print('초록의 최대 길이 :', max(sample_lengths))
print('초록의 평균 길이 :', sum(sample_lengths)/len(sample_lengths))
plt.hist(sample_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
# save
with open('./tokenized_data.pickle', 'wb') as f:
    pickle.dump(tokenized_data, f, pickle.HIGHEST_PROTOCOL)

# Train

In [None]:
# load
with open('./tokenized_data.pickle', 'rb') as f:
    tokenized_data = pickle.load(f)

In [None]:
import multiprocessing

In [None]:
# Train Word2Vec (sg=1) on the tokenized corpus with one worker per CPU core.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names (4.x uses
# `vector_size` / `epochs`) — confirm the pinned gensim version before upgrading.
w2v_params = dict(
    size=100,
    window=5,
    min_count=1,
    iter=300,
    workers=multiprocessing.cpu_count(),
    sg=1,
)
model = Word2Vec(sentences=tokenized_data, **w2v_params)

In [None]:
# Take the trained model's word-vector container and echo its repr.
word_vectors = model.wv
word_vectors

In [None]:
# Look up the embedding vector for the word '딥러닝'.
word_vectors['딥러닝']

In [None]:
# All words known to the model, as a keys view over the vocabulary mapping.
vocabs = word_vectors.vocab.keys()
# Preview the first 20 words only; echoing the view itself prints the entire
# vocabulary into the cell output.
list(itertools.islice(vocabs, 20))

In [None]:
# One embedding vector per vocabulary word, in vocabulary order.
word_vectors_list = [word_vectors[v] for v in vocabs]
# Preview the first few vectors only; echoing the full list prints one array
# per vocabulary word.
word_vectors_list[:5]

In [None]:
# Query the words most similar to '인공지능' and print the result.
model_result1 = model.wv.most_similar("인공지능")
print(model_result1)

In [None]:
# Similarity score between '블록체인' and '산업혁명'.
word_vectors.similarity(w1 = '블록체인', w2 = '산업혁명')

In [None]:
# Save the trained model into the current working directory.
# NOTE(review): the Inference section loads './data/embedding/word2vec_okt.model',
# not this file — confirm which artifact is meant to be used downstream.
model.save('pretrained_word2vec_okt.model')

# Inference

In [3]:
# Load a previously saved Word2Vec model for inference.
# NOTE(review): this path differs from the 'pretrained_word2vec_okt.model' file
# written in the Train section, and the vocabulary shown below contains tokens
# such as 'Modbus', '.' and '는' that this notebook's regex / stop-word filter
# would have removed — presumably a model from a different pipeline; confirm.
model = Word2Vec.load('./data/embedding/word2vec_okt.model')

In [4]:
# Take the loaded model's word-vector container and echo its repr.
word_vectors = model.wv
word_vectors

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x1a1f9960188>

In [10]:
# Look up the embedding vector for the word '딥러닝' in the loaded model.
word_vectors['딥러닝']

array([ 0.19632578, -0.32316336,  0.4636331 ,  0.31049374,  0.02876209,
       -0.42791793,  0.10199765,  0.19288541, -0.21562782,  0.59461254,
        0.23702449, -0.27611616,  0.3299997 ,  0.00494902, -0.18033482,
        0.30356586,  0.38594848, -0.9446756 ,  0.15146336, -0.2139304 ,
       -0.24217233,  0.02080054,  0.17438036, -0.2336421 , -0.4393187 ,
        0.42865777, -0.05062845,  0.04456147, -0.6185588 ,  0.15555798,
       -0.14548448, -0.47857082,  0.03385461,  0.28788298,  0.29972106,
        0.01208432, -0.27864715,  0.25673065,  0.3428577 , -0.04635178,
       -0.40450376,  0.33554044,  0.5395992 , -0.57439953, -0.27314472,
        0.5627202 ,  0.4921471 ,  0.18457916, -0.5121402 ,  0.00651723,
        0.8806682 ,  0.18588154, -0.516977  , -0.17836343, -0.8257688 ,
        0.43736076, -0.562226  , -0.303891  ,  0.10456767,  0.29746675,
        1.0408556 , -0.02111146,  0.2889631 , -0.10263211, -0.8283508 ,
        0.01743611, -0.16500169,  0.6596909 , -0.02772344,  0.10

In [11]:
# All words known to the loaded model, as a keys view over the vocabulary mapping.
# NOTE(review): echoing the view prints the entire vocabulary; consider showing
# only a slice when this cell is next re-run.
vocabs = word_vectors.vocab.keys()
vocabs

dict_keys(['Modbus', '는', '각종', '자동화', '장비', '감시', '및', '제어에', '전', '세계', '적', '으로', '널리', '사용', '되고', '있는', '자발적', '산업', '표준', '통신', '프로토콜', '이다', '.', '그러므로', '선박', ',', '빌딩', '기차', '비행기', '등', '를', '이용하는', '모든', '들', '과', '연결', '이', '가능하다', '환경변수', '의', '측정', '원격', '제어가', '가능하게', '된다', '본', '논문에서', '퍼지', '제어', '시스템을', '이용하여', '외부', '환경요인을', '각각', '조합', '한', '불확실한', '내용을', '정량적', '인', '값으로', '변환하', '여', 'LED', '조명', '표현', '하다', '위해', '알고', '리즘을', '설계', '하고', '알고리즘에', '프로토콜을', '추가', '통합', '관리', '시스템에서', '환경요인', '확인', '가능', '감성', '조명용', '제어기', '회로', '구현', '하였다', '환경요소인', '온도', '습도', '조도', '값을', '센서를', '통해', '로', '받아들이다', '값들', '을', '알고리즘을', 'Serial', '통신으로', 'RS', '485', '다른', '기기', '와', '되어', '상태', '출력', '값', '가능하고', '또한', '사용자가', 'RGB', '변경', '할', '수', '있기', '때문에', '원하는', '색으로', '변경이', '제작', '제', '어기로', '에', '따라', '색상이', '변화', '되는', '것을', '고속철도의', '출현', '함께', '철도는', '국내외', '에서', '자주', '사용하는', '교통', '수단', '중', '하나', '환경적', '측면', '에서도', '수단에비해', '이산화탄소', '배출', '량도', '적은', '편이', '며', '에

In [12]:
# One embedding vector per vocabulary word, in vocabulary order.
# NOTE(review): echoing the full list prints one array per word; consider
# showing only a slice when this cell is next re-run.
word_vectors_list = [word_vectors[v] for v in vocabs]
word_vectors_list

[array([ 0.14214931, -0.3220672 ,  0.24234228,  0.10468625, -0.14468123,
        -0.04907226,  0.07077931,  0.0423888 ,  0.13967028,  0.20564361,
        -0.05458711,  0.05939947, -0.00372817,  0.21704051, -0.03194778,
         0.05724464,  0.20068945, -0.35387197,  0.20292191, -0.01534614,
         0.00254283,  0.1775285 ,  0.07020892, -0.18671028,  0.1284039 ,
         0.15610035, -0.07418267, -0.21684223, -0.20865192, -0.01491244,
        -0.18039793, -0.16617402, -0.21143971,  0.13529505,  0.21069562,
         0.11874828, -0.09852316, -0.04128211,  0.22073978,  0.12833853,
         0.01273375, -0.07576126, -0.04120583, -0.10763839,  0.25685945,
         0.14745522,  0.17790492,  0.1640289 , -0.19387282, -0.07792636,
         0.28729454, -0.11555526,  0.00686749, -0.28024128, -0.3668563 ,
         0.08145001, -0.18589357, -0.00853472,  0.05670733,  0.18556817,
         0.16105293, -0.02726256,  0.2508829 ,  0.17694707, -0.12602265,
         0.21429007,  0.0108842 ,  0.3601595 , -0.0

In [13]:
# Query the words most similar to '인공지능' and print the (word, score) pairs.
model_result1 = model.wv.most_similar("인공지능")
print(model_result1)

[('AI', 0.7897999286651611), ('로봇', 0.7499017715454102), ('인공지능과', 0.7401383519172668), ('인공지능의', 0.7367221117019653), ('지능형', 0.7310532331466675), ('인공지능을', 0.7136895656585693), ('인공지능이', 0.695063054561615), ('자동화', 0.6571853756904602), ('현재의', 0.6487205624580383), ('첨단', 0.6483511924743652)]


In [14]:
# Similarity score between '블록체인' and '산업혁명' in the loaded model.
word_vectors.similarity(w1 = '블록체인', w2 = '산업혁명')

0.63430595

In [5]:
# Spot-check: embedding vector for '본'.
word_vectors['본']

array([ 1.08830623e-01, -1.45116359e-01, -1.23280562e-01,  3.89518738e-01,
        1.40670270e-01, -1.75492957e-01, -6.73430227e-03,  5.04833460e-02,
        2.92364180e-01,  5.94709292e-02, -4.62253491e-04,  5.67198545e-02,
       -1.19919591e-02, -1.37349755e-01,  1.21780343e-01, -2.61755437e-01,
       -4.45973963e-01, -3.65692466e-01,  1.21166319e-01, -3.85837890e-02,
       -3.58566016e-01, -7.86251426e-02, -2.48292208e-01, -8.09373558e-02,
       -4.48122054e-01,  2.43987426e-01, -2.84249336e-01, -5.92648564e-03,
       -1.66517850e-02,  2.10204348e-01, -6.06507296e-03,  1.41294701e-02,
        2.38750249e-01,  3.93640518e-01,  9.99653935e-02, -2.34180853e-01,
       -1.92313880e-01, -4.16026324e-01,  2.59162784e-01, -7.83519745e-01,
        3.02687764e-01,  5.31156421e-01, -2.77566671e-01, -1.13524847e-01,
        1.02663204e-01, -1.30376741e-01,  9.49568227e-02, -5.85255437e-02,
       -2.27319956e-01, -5.24434984e-01, -1.84939235e-01, -9.36172962e-01,
        1.90319985e-01,  

In [6]:
# Spot-check: embedding vector for '논문에서는'.
word_vectors['논문에서는']

array([ 0.07422867, -0.13879712, -0.03217978, -0.04808616,  0.4596005 ,
       -0.25556898,  0.0310453 ,  0.08433139,  0.60146594,  0.34468463,
       -0.17936273,  0.19180553,  0.14329214, -0.11244883,  0.11655741,
       -0.23594926, -0.13909581, -0.4495152 ,  0.04849399, -0.16514063,
       -0.38869143, -0.00927205, -0.23231263, -0.00821557, -0.2357543 ,
        0.02211455, -0.48741034,  0.174364  , -0.00151126,  0.48202145,
        0.19495928, -0.02267373,  0.33588788,  0.35841265,  0.23940165,
       -0.4415219 , -0.3061249 , -0.16445075,  0.34506455, -0.8083846 ,
        0.5574465 ,  0.45644617, -0.29319182,  0.13501218,  0.19878356,
        0.03240171,  0.22177838, -0.00383219, -0.25379172, -0.69586796,
       -0.12004627, -0.68527174,  0.65095335,  0.5497908 , -0.32847956,
        0.18717787,  0.23225702, -0.15326619, -0.20478015, -0.38997287,
        0.32762843,  0.03922682, -0.63729763, -0.04640365,  0.17867899,
       -0.19725376,  0.09102291, -0.12021025, -0.6814168 , -0.17

In [10]:
# Spot-check: embedding vector for '최신'.
word_vectors['최신']

array([ 3.10226828e-02, -1.10580397e+00, -3.48504394e-01, -5.39194345e-01,
        6.81508780e-02, -3.93005908e-01, -5.76753914e-01, -3.25724371e-02,
        4.54148829e-01,  3.20468724e-01,  8.55936557e-02, -5.94241023e-01,
       -7.32191563e-01, -2.30316490e-01,  7.72797614e-02, -3.17843854e-01,
        4.72475469e-01, -3.28000605e-01,  1.31539583e-01, -5.83861709e-01,
       -3.74805599e-01, -6.28191173e-01, -4.58916575e-01, -1.65072367e-01,
       -1.84155941e-01,  5.53861380e-01, -6.63010255e-02,  1.07930535e-02,
        5.78881085e-01,  8.82861674e-01,  6.50319606e-02,  1.93364009e-01,
        5.22754737e-04,  4.51120324e-02,  3.61359507e-01, -2.73351371e-01,
        1.28986180e-01,  6.83012843e-01, -5.46527743e-01,  4.09289114e-02,
       -9.98662561e-02, -2.02101856e-01, -2.14026287e-01,  4.40353245e-01,
        2.88037866e-01, -3.06915879e-01, -3.38595430e-03,  2.02111267e-02,
       -1.98778525e-01,  2.09638793e-02,  7.37646222e-02, -6.92574084e-02,
       -3.73636603e-01,  

In [11]:
# Spot-check: embedding vector for '모델을'.
word_vectors['모델을']

array([ 0.25162616, -0.45837918, -0.13481429, -0.19774887,  0.19136557,
       -0.31361568,  0.01007773, -0.3117399 ,  0.6506478 , -0.01019445,
        0.09849616,  0.45976892, -0.38232088, -0.18228948,  0.5099745 ,
       -0.2615171 ,  0.29055467, -0.30199182,  0.11678364, -0.19542332,
        0.10665746, -0.10212007,  0.30181393,  0.05062505,  0.21169776,
       -0.05013287, -0.43599546, -0.17584081, -0.02696205, -0.06265536,
        0.20061992, -0.07501114, -0.19500239,  0.6062773 ,  0.04194705,
       -0.26857522,  0.2006044 ,  0.4462062 , -0.5289004 , -0.36124426,
       -0.39173806,  0.6945131 ,  0.07845373, -0.1802043 ,  0.5619445 ,
       -0.0549533 ,  0.34740302,  0.06036654, -0.1997327 , -0.6470941 ,
       -0.0947281 , -0.10146372, -0.03404899, -0.6544181 ,  0.06530306,
        0.11828923, -0.1950225 , -0.12309274, -0.3305263 , -0.33431014,
       -0.07502859,  0.04326251,  0.0291918 , -0.19731185,  0.40234157,
       -0.24861208,  0.18958907, -0.74235696, -0.5511594 ,  0.23

In [12]:
# Spot-check: embedding vector for '개발'.
word_vectors['개발']

array([-0.24765675, -0.13088024, -0.03584925,  0.10642435,  0.5175488 ,
       -0.43700847, -0.51209706, -0.04089105, -0.0667778 , -0.1910208 ,
       -0.28242657,  0.31369123, -0.4357974 , -0.1848793 , -0.06320531,
       -0.12058043, -0.3790647 , -0.03144405,  0.05795506, -0.26296046,
        0.22137712, -0.30927294,  0.34680712, -0.09909733, -0.1521457 ,
       -0.02345827,  0.2723481 ,  0.21512504,  0.08148118,  0.16124919,
       -0.17859153,  0.46442857, -0.09981363,  0.69716173, -0.13398154,
        0.20930187,  0.25279376,  0.08701743, -0.42749503, -0.5868373 ,
       -0.47030473,  0.25463432,  0.41738042, -0.04260132,  0.6327698 ,
       -0.07304155,  0.06442718, -0.0715663 , -0.07492986, -0.531915  ,
       -0.14950672, -0.05445634, -0.24335347,  0.15690625,  0.37090603,
       -0.33112037, -0.03869842, -0.12244172, -0.9947536 , -0.11247724,
       -0.26408258,  0.03753511,  0.13632588, -0.10749409, -0.14990996,
        0.5370197 ,  0.23576273, -0.08416114, -0.17116413,  0.15

In [13]:
# Spot-check: embedding vector for '하였다'.
word_vectors['하였다']

array([-0.18639208,  0.04594791, -0.650038  ,  0.48014787, -0.4896101 ,
       -0.12366228,  0.06880878, -0.18671888,  0.06253512, -0.04062589,
        0.16234317, -0.11122565,  0.09515085, -0.35314786,  0.03698705,
       -0.2482891 ,  0.04630958,  0.24009402,  0.1874587 , -0.25012976,
        0.3133603 , -0.2985825 ,  0.16312061,  0.02923655, -0.45419258,
        0.17903256, -0.0623306 , -0.5350584 , -0.08260557,  0.05494003,
       -0.13919957, -0.1323333 ,  0.38924235,  0.20142847,  0.0275053 ,
       -0.13849686,  0.13533552,  0.159568  , -0.2520797 , -0.05679316,
       -0.5820388 ,  0.07942318,  0.5483432 ,  0.32200775,  0.47757807,
       -0.2518102 , -0.27552518, -0.39933944, -0.6006485 , -0.1119136 ,
       -0.20034158,  0.12595126, -0.3667278 ,  0.17537035,  0.01975806,
        0.18408246,  0.04036335, -0.19213055, -0.16550805, -0.35119367,
        0.02167746,  0.3494826 , -0.00818388,  0.6637569 ,  0.39362255,
        0.09636202, -0.12584345, -0.30610004,  0.04726357, -0.11