In [1]:
import gensim # version 4.0.1

# Step 1: Prepare the corpus for training

In [2]:
# Step 1. Train a word2vec model on the given data using gensim.

# Data preparation: stream the training corpus from disk.
class TextIterator(object):
    """Re-iterable stream of whitespace-tokenised lines from a text file.

    Every call to ``__iter__`` opens the file afresh, so one instance can be
    iterated several times (for example, one pass to build a vocabulary and
    further passes to train).

    Args:
        fname: Path of the corpus file. Each line is yielded as one token list.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform's default encoding, which is what the
            previous implementation used implicitly.
    """

    def __init__(self, fname, encoding=None):
        self.fname = fname
        self.encoding = encoding

    def __iter__(self):
        # The context manager releases the file handle as soon as iteration
        # finishes (or the generator is closed) instead of leaving it open
        # until garbage collection.
        with open(self.fname, encoding=self.encoding) as corpus:
            for line in corpus:
                # split() with no argument drops the trailing newline and
                # collapses runs of whitespace; a blank line yields [].
                yield line.split()

# Training corpus, given as a path relative to the notebook's working directory.
filename = 'newskor.txt'
# Lazy view of the corpus: nothing is read here. The file is opened each time
# iteration starts, and every line is yielded as a list of whitespace-separated
# tokens, so the same object can be passed to several consumers in turn.
sentences = TextIterator(filename)

# Steps 2–3: Train or load the Word2Vec model

In [4]:
# Hyperparameters and run mode for the Word2Vec cell that follows.
train = False # run mode: True trains and saves a new model / False loads the saved model
SIZE = 300 # embedding dimensionality (passed as vector_size)
WINDOW = 5 # context window size (passed as window)
SG = 1 # training algorithm: 1 for skip-gram / otherwise CBOW
MIN_COUNT = 10 # words occurring fewer than this many times are ignored
WORKERS = 20 # number of worker threads (passed as workers); tune to the available CPU cores

In [5]:
# Either restore a previously saved model or fit a new one, depending on `train`.
if not train:
    # Reuse the model persisted by an earlier training run.
    model = gensim.models.Word2Vec.load('newskor.model')
else:
    w2v_options = dict(
        vector_size=SIZE,
        window=WINDOW,
        sg=SG,
        min_count=MIN_COUNT,
        workers=WORKERS,
    )
    model = gensim.models.Word2Vec(**w2v_options)
    # First pass over the corpus: collect the vocabulary.
    model.build_vocab(sentences)
    # Then train for five epochs over the same corpus.
    model.train(sentences, total_examples=model.corpus_count, epochs=5)
    # Persist the result so later runs can take the load branch.
    model.save('newskor.model')

In [6]:
# Preview the vocabulary: print entries 0 through 30 with their index.
vocab = model.wv.index_to_key
for i, v in zip(range(31), vocab):
    print(f"{i}: {v}")

0: 하
1: 이
2: .
3: 는
4: 을
5: ㄴ
6: 다
7: 의
8: 에
9: 를
10: 은
11: 어
12: 있
13: 고
14: 으로
15: 가
16: 였
17: ㄹ
18: 되
19: ,
20: 에서
21: 었
22: )
23: (
24: 로
25: 것
26: 도
27: 등
28: 과
29: 들
30: 지


In [9]:
## Inspect the learned embedding of one sample word
word = '버스'
embedding = model.wv[word]  # look the vector up once and reuse it below
print(embedding)
print('size of vector: ', len(embedding))

[ 0.2715847  -0.4110245   0.05292874  0.25203902 -0.43354133 -0.31395456
 -0.38652474  0.17409396  0.09732261  0.0857102   0.33092657  0.15264848
 -0.06908963  0.15235962  0.00971851 -0.07327552 -0.09597956  0.03339197
 -0.14834644 -0.2842372  -0.03190251 -0.4458092  -0.58018994 -0.43267536
 -0.10747258 -0.5685088   0.03274008 -0.12354413  0.41608647 -0.24299626
  0.22650497 -0.07594266  0.02808464 -0.43623492 -0.48877466  0.16534336
 -0.47772175 -0.3533794   0.0497962   0.38129923  0.24344234 -0.22787471
  0.3143169   0.19553974  0.02521827  0.02969735 -0.03262899  0.11954125
 -0.188389    0.16422062  0.51755726  0.05637418  0.08843178  0.31328052
  0.429697    0.4868151   0.14308344 -0.13715728 -0.274024    0.07898688
  0.15081364  0.05356184 -0.02355863  0.02049953 -0.26301914  0.31544527
  0.09945583 -0.39227945 -0.32577708 -0.05783262 -0.04052503  0.11624837
 -0.05318096  0.04025342  0.5278434   0.13758366  0.18236583  0.19009952
  0.18445383 -0.13050452  0.06278759  0.1667132   0

# Step 4: Get word similarity

In [8]:
# Example inputs: word1 = '한국', word2 = '북한'
print("Calculate the similarity between word 1 and word2")
# Strip surrounding whitespace: corpus tokens come from str.split(), so a
# padded input could never match a vocabulary entry.
word1 = input("word1: ").strip()
word2 = input("word2: ").strip()

# Check that both words are in the vocabulary before querying the model.
# key_to_index is a dict, so each membership test is O(1); testing against the
# index_to_key list would scan the whole vocabulary per word.
known_words = model.wv.key_to_index
no_problem = True

for candidate in (word1, word2):
    if candidate not in known_words:
        print('the word ' + candidate + ' is not in the vocabulary')
        no_problem = False

if no_problem:
    similarity = model.wv.similarity(word1, word2)
    print('the similarity between ' + word1 + ' and ' + word2 + ' : ', similarity)

Caculate the similarity between word 1 and word2
the similarity between 한국 and 북한 :  0.3075229


# Step 5: Find the mismatched word

In [23]:
# Example input: 소프트웨어 네트워크 프로그램 가방
print("Find mismatched word in the words")
text = input("text(words): ")
words = text.split()

# Check that every word is in the vocabulary before querying the model.
# key_to_index is a dict, so each membership test is O(1); testing against the
# index_to_key list would scan the whole vocabulary per word.
known_words = model.wv.key_to_index
no_problem = True

if not words:
    # Blank input leaves nothing to compare; gensim's doesnt_match() cannot
    # select from an empty list and raises, so report it and skip the query.
    print('no words were given')
    no_problem = False

# Report every unknown word, not just the first one, so the user can correct
# the whole input in a single retry.
for word in words:
    if word not in known_words:
        print('the word ' + word + ' is not in the vocabulary')
        no_problem = False

if no_problem:
    mismatched = model.wv.doesnt_match(words)
    print('the mismatch word between ' + text + ' is', mismatched)

Find mismatched word in the words
text(words): 소프트웨어 네트워크 프로그램 가방
the mismatch word between 소프트웨어 네트워크 프로그램 가방 is 가방


# Step 6. Find the top-N most similar words

In [24]:
print("Print the most similar words")
# Strip surrounding whitespace: corpus tokens come from str.split(), so a
# padded input could never match a vocabulary entry.
word = input("word: ").strip()

# Check that the word is in the vocabulary before querying the model.
# key_to_index is a dict, so the membership test is O(1); testing against the
# index_to_key list would scan the whole vocabulary.
known_words = model.wv.key_to_index
no_problem = True

if word not in known_words:
    print('the word ' + word + ' is not in the vocabulary')
    no_problem = False

if no_problem:
    # most_similar() is called without topn, so the library default applies.
    print(model.wv.most_similar(positive=[word]))

Print the most similar words
word: 인간
[('동물', 0.617953896522522), ('배아', 0.6104809641838074), ('생쥐', 0.5970973968505859), ('존엄성', 0.5852876901626587), ('본성', 0.5809779167175293), ('인류', 0.5760883092880249), ('생명체', 0.5717364549636841), ('유기체', 0.5703120827674866), ('핵이식', 0.5635316371917725), ('욕망', 0.561828076839447)]


# Step 7: Vector calculation

In [25]:
# Example inputs: a = '한국', b = '아시아', c = '유럽'
print('Find the most similar word with the result of [ a - b + c ]')
# Strip surrounding whitespace: corpus tokens come from str.split(), so a
# padded input could never match a vocabulary entry.
word_a = input("a: ").strip()
word_b = input("b: ").strip()
word_c = input("c: ").strip()

# Check that all three words are in the vocabulary before querying the model.
# key_to_index is a dict, so each membership test is O(1); testing against the
# index_to_key list would scan the whole vocabulary per word.
known_words = model.wv.key_to_index
no_problem = True

for candidate in (word_a, word_b, word_c):
    if candidate not in known_words:
        print('the word ' + candidate + ' is not in the vocabulary')
        no_problem = False

if no_problem:
    # a - b + c: a and c contribute positively, b negatively.
    mostsimilar = model.wv.most_similar(positive=[word_a, word_c], negative=[word_b], topn=5)
    # Show the three best matches. Slicing, rather than indexing [0], [1], [2],
    # avoids an IndexError when fewer than three neighbours are returned.
    top_words = [neighbour for neighbour, _score in mostsimilar[:3]]
    print('most similar word of ' + word_a + ' - ' + word_b + ' + ' + word_c + ' is', *top_words)

Find the most similar word with the result of [ a - b + c ]
a: 독도
b: 한국
c: 일본
most similar word of 독도 - 한국 + 일본 is 다케시마 영유권 울릉도
