In [5]:
import gensim

# Step 1: Prepare the corpus for training

In [14]:
# Step 1. Train a word2vec model on the given data using gensim

# Load the data used for training -- data preparation
class TextIterator(object):
    """Re-iterable corpus reader yielding one list of tokens per line of a text file.

    The file is re-opened on every call to ``__iter__``, so the same object can be
    iterated several times (e.g. once to build a vocabulary and again to train).

    Args:
        fname: Path of the text file; each line is treated as one sentence whose
            tokens are separated by whitespace.
        encoding: Text encoding passed to ``open``. ``None`` (the default) keeps
            the platform default encoding.
    """

    def __init__(self, fname, encoding=None):
        self.fname = fname
        self.encoding = encoding

    def __iter__(self):
        """Yield ``line.split()`` for each line; a blank line yields an empty list."""
        # The context manager closes the file once iteration ends (or the generator
        # is discarded), instead of leaving the handle open.
        with open(self.fname, encoding=self.encoding) as corpus:
            for line in corpus:
                yield line.split()

# Corpus file used for training. TextIterator opens the file inside __iter__,
# so `sentences` can be iterated more than once.
filename = 'newskor.txt'
sentences = TextIterator(filename)

# Steps 2 & 3: Train or load the Word2Vec model

In [7]:
# Hyperparameters and run configuration for the Word2Vec cell below.
train = True # train flag (True: train a new model / False: load the previously saved model)
SIZE = 300 # vector size (dimensionality of each word embedding)
WINDOW = 5 # context window
SG = 1 # 1 for skip-gram / otherwise cbow
MIN_COUNT = 10 # ignores all words appearing fewer than min_count times (words seen fewer than 10 times are not trained)
WORKERS = 20 # worker count passed to Word2Vec (author's note: cpu cores). NOTE(review): hard-coded; confirm it suits the machine

In [9]:
# Depending on the `train` flag, either reuse the model saved on disk
# or fit a new one and save it.
if not train:
    model = gensim.models.Word2Vec.load('newskor.model')
else:
    model = gensim.models.Word2Vec(
        sg=SG,
        size=SIZE,
        window=WINDOW,
        min_count=MIN_COUNT,
        workers=WORKERS,
    )
    # Prepare the model vocabulary from the corpus before training.
    model.build_vocab(sentences)
    # Train for 5 epochs over the same corpus, then persist the result.
    model.train(sentences, total_examples=model.corpus_count, epochs=5)
    model.save('newskor.model')

In [18]:
# Display the vocabulary list held by the model.
# NOTE(review): this echoes the entire list; consider slicing it (e.g. [:20]) to keep the cell output short.
model.wv.index2word # See vocabs

['하',
 '이',
 '.',
 '는',
 '을',
 'ㄴ',
 '다',
 '의',
 '에',
 '를',
 '은',
 '어',
 '있',
 '고',
 '으로',
 '가',
 '였',
 'ㄹ',
 '되',
 ',',
 '에서',
 '었',
 ')',
 '(',
 '로',
 '것',
 '도',
 '등',
 '과',
 '들',
 '지',
 '와',
 '여',
 '일',
 '기',
 '·',
 'ㄴ다',
 '적',
 '수',
 '아',
 '%',
 '게',
 '원',
 '년',
 '2',
 '았',
 '3',
 '1',
 '다고',
 '“',
 '”',
 '월',
 '위하',
 '대하',
 '말하',
 '시장',
 '면',
 '업체',
 '따르',
 '하고',
 '않',
 '만',
 '까지',
 '‘',
 '’',
 '밝히',
 '명',
 '및',
 '부터',
 '다가',
 '미국',
 '며',
 '이라고',
 '4',
 '개',
 '대',
 '나',
 '오',
 '대표',
 '국내',
 '한국',
 '5',
 '다는',
 '던',
 '서비스',
 '습니다',
 '개발',
 '계획',
 '인',
 '주',
 '통하',
 '제품',
 '없',
 '또',
 '화',
 '정부',
 '면서',
 '최근',
 '한',
 '으며',
 '6',
 '지난해',
 '중',
 '그',
 '사업',
 '올해',
 '중국',
 '보이',
 '크',
 '받',
 '일본',
 '이번',
 '10',
 '보다',
 '내',
 '기업',
 '경우',
 '보',
 '대통령',
 '같',
 '-',
 '관련',
 '기술',
 '에게',
 '지만',
 '때문',
 '전',
 '전망',
 'LG',
 '라고',
 '7',
 '달러',
 '현재',
 '예정',
 '지나',
 '어서',
 'ㄴ다는',
 '특히',
 '이상',
 '8',
 '북한',
 '사',
 '시스템',
 '문제',
 '김',
 '삼성전자',
 '겠',
 '늘',
 '간',
 '나서',
 '관계자',
 '잇',
 '9',
 '아니',


In [11]:
## Inspect the embedding learned for one sample word
word = '버스'
embedding = model.wv[word]  # single lookup, reused for both prints
print(embedding)
print('size of vector: ', len(embedding))

[-0.1685909  -0.06815929  0.03862754 -0.17013025 -0.0222107  -0.32134584
  0.32902363  0.02374835 -0.32800677 -0.5159291  -0.03338749  0.22155814
 -0.7748226   0.10288942  0.01950315  0.12780008  0.2330913   0.37870157
 -0.06138415 -0.42132545 -0.3672338   0.09775122  0.22589192 -0.17918798
  0.34277806 -0.3893849  -0.46086147 -0.17517492 -0.44161347 -0.08979039
 -0.60346144  0.21198706 -0.02217958 -0.0661274  -0.10122002 -0.05852121
 -0.18313754  0.07583261  0.14024098  0.7634658   0.2056343   0.15760174
  0.41918263  0.508057    0.6573746   0.01041878  0.2764879  -0.115475
 -0.62231994 -0.20625061 -0.03133462 -0.20136891 -0.0603966  -0.12205875
 -0.3280499   0.5147516  -0.30284822 -0.37873787  0.1165148  -0.11643907
  0.22712985  0.01339927  0.11130055  0.24697241  0.11539216  0.46687412
  0.15046936  0.2494117   0.3704534   0.11955588 -0.08151569 -0.02378023
  0.23902106 -0.36102486 -0.10081082 -0.03202547 -0.20903397  0.36197385
 -0.20855834 -0.37764156  0.2909458  -0.11058491 -0.1

# Step 4: Get word similarity

In [19]:
# Interactive demo: print the similarity score of two user-supplied words.
# Example inputs: word1 = '한국', word2 = '북한'
print ("Caculate the similarity between word 1 and word2")
# Strip surrounding whitespace so a stray space does not make a known word look unknown.
word1 = input("word1: ").strip()
word2 = input("word2: ").strip()

# Check that the words are in the vocabulary. Membership is tested on the
# KeyedVectors object itself (keyed lookup) rather than by scanning the
# index2word list once per word.
no_problem = True
for candidate in (word1, word2):
    if candidate not in model.wv:
        print ('the word ' + candidate + ' is not in the vocabulary')
        no_problem = False

if no_problem:
    similarity = model.wv.similarity(word1, word2)
    print ('the similarity between ' + word1 + ' and ' + word2 + ' : ', similarity)

Caculate the similarity between word 1 and word2
word1: 한국
word2: 중국
the similarity between 한국 and 중국 :  0.3785267


# Step 5: Find the mismatched word

In [20]:
# Interactive demo: pick the word that does not fit with the others.
# Example input: '소프트웨어 네트워크 프로그램 가방'
print("Find mismatched word in the words")
text = input("text(words): ")
words = text.split()

# An empty input would otherwise be passed straight to doesnt_match([]).
no_problem = bool(words)
if not words:
    print('no words were entered')

# Check that the words are in the vocabulary (keyed lookup on model.wv instead
# of scanning the index2word list). Every unknown word is reported, matching
# the other cells of this notebook.
for word in words:
    if word not in model.wv:
        print('the word ' + word + ' is not in the vocabulary')
        no_problem = False

if no_problem:
    mismatched = model.wv.doesnt_match(words)
    print ('the mismatch word between ' + text +' is', mismatched)

Find mismatched word in the words
text(words): 소프트웨어 하드웨어
the mismatch word between 소프트웨어 하드웨어 is 하드웨어


  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


# Step 6: Find the top-N most similar words

In [24]:
# Interactive demo: list the words most similar to a user-supplied word.
print("Print the most similar words")
# Strip surrounding whitespace so a stray space does not make a known word look unknown.
word = input("word: ").strip()

# Keyed lookup on model.wv replaces the linear scan of the index2word list.
no_problem = word in model.wv

if not no_problem:
    print ('the word ' + word + ' is not in the vocabulary')

if no_problem:
    print(model.wv.most_similar(positive=[word]))

Print the most similar words
word: 인간
[('동물', 0.617953896522522), ('배아', 0.6104809641838074), ('생쥐', 0.5970973968505859), ('존엄성', 0.5852876901626587), ('본성', 0.5809779167175293), ('인류', 0.5760883092880249), ('생명체', 0.5717364549636841), ('유기체', 0.5703120827674866), ('핵이식', 0.5635316371917725), ('욕망', 0.561828076839447)]


# Step 7: Vector calculation

In [25]:
# Interactive demo: word-vector arithmetic, a - b + c.
# Example inputs: word_a = '한국', word_b = '아시아', word_c = '유럽'
print('Find the most similar word with the result of [ a - b + c ]')
# Strip surrounding whitespace so a stray space does not make a known word look unknown.
word_a = input("a: ").strip()
word_b = input("b: ").strip()
word_c = input("c: ").strip()

# Check that the words are in the vocabulary: one loop with a keyed lookup on
# model.wv instead of three copy-pasted scans of the index2word list.
no_problem = True
for candidate in (word_a, word_b, word_c):
    if candidate not in model.wv:
        print ('the word ' + candidate + ' is not in the vocabulary')
        no_problem = False

if no_problem:
    mostsimilar = model.wv.most_similar(positive=[word_a, word_c], negative=[word_b], topn=5)
    # Show the first three hits; slicing (rather than indexing [0], [1], [2])
    # avoids an IndexError when fewer than three results are returned.
    top_words = [entry[0] for entry in mostsimilar[:3]]
    print ('most similar word of ' + word_a + ' - ' + word_b + ' + ' + word_c + ' is', *top_words)

Find the most similar word with the result of [ a - b + c ]
a: 독도
b: 한국
c: 일본
most similar word of 독도 - 한국 + 일본 is 다케시마 영유권 울릉도
