In [1]:
import re,pickle
from janome.tokenizer import Tokenizer

In [2]:
def textToWords(text, output_path="./data/newWords.pickle", tokenizer=None):
    """Clean raw Japanese text, tokenize it per sentence, and pickle the result.

    Processing steps:
      1. Remove every ``《...》`` and ``【...】`` span.
      2. Remove the characters ``「``, ``」``, full-width space, ``|`` and newline.
      3. Split on ``。`` and re-append the ``。`` to each non-empty sentence.
      4. Tokenize each sentence with ``tokenize(sentence, wakati=True)``.
      5. Pickle the resulting list of token lists to ``output_path``.

    Args:
        text: The raw text to process.
        output_path: Destination of the pickle file. Defaults to the path the
            rest of the notebook reads from.
        tokenizer: Object exposing ``tokenize(sentence, wakati=True)``. When
            omitted, a new janome ``Tokenizer`` is created.

    Returns:
        None. The tokenized sentences are only written to ``output_path``
        (kept that way so a bare call in a notebook cell prints nothing).
    """
    text = re.sub("《[^》]+》", "", text)
    # The closing bracket has to be excluded inside the character class.
    # The earlier pattern "【[^]+]】" parsed as "one character that is neither
    # ']' nor '+'", so only single-character 【x】 spans were ever removed.
    text = re.sub("【[^】]+】", "", text)
    text = re.sub("[「」　|\n]", "", text)

    separator = "。"
    # Skip empty fragments: text ending in "。" (or containing "。。") would
    # otherwise yield sentences consisting of nothing but the separator.
    sentences = [
        fragment + separator
        for fragment in text.split(separator)
        if fragment
    ]

    if tokenizer is None:
        tokenizer = Tokenizer()
    words = [
        list(tokenizer.tokenize(sentence, wakati=True))
        for sentence in sentences
    ]

    with open(output_path, "wb") as f:
        pickle.dump(words, f)


In [3]:
# Read the whole corpus file into memory as a single UTF-8 string.
with open("./data/wagahaiwa_nekodearu.txt", "r", encoding="utf-8") as f:
    wagahai_original = f.read()

In [4]:
# Clean and tokenize the corpus; the result is written to
# ./data/newWords.pickle by textToWords rather than returned.
textToWords(wagahai_original)

In [5]:
# Load the tokenized sentences back from the pickle that textToWords wrote.
with open("./data/newWords.pickle", mode="rb") as f:
    wagahai_words = pickle.load(f)

# Preview the first two tokenized sentences.
print(wagahai_words[:2])

[['吾輩', 'は', '猫', 'で', 'ある', '。'], ['名前', 'は', 'まだ', '無い', '。']]


In [6]:
from gensim.models import word2vec

In [7]:
# Train word vectors on the tokenized sentences (a list of token lists).
model = word2vec.Word2Vec(
    sentences=wagahai_words,
    vector_size=100,  # width of each word vector
    min_count=5,
    window=5,
    epochs=20,
    sg=0,  # NOTE(review): CBOW rather than skip-gram per the gensim docs — confirm
)

In [8]:
# Inspect the embedding matrix: its shape first, then the (numpy-abbreviated)
# values. The recorded run shows one row per vocabulary entry and 100 columns.
print(model.wv.vectors.shape)
print(model.wv.vectors)

(3350, 100)
[[ 0.7120657   0.9290544  -0.01387589 ...  0.3441443   0.07925468
   1.3528777 ]
 [ 0.8746206  -0.12597567  0.2200162  ... -0.01700269 -1.2211871
   0.4270016 ]
 [ 0.10323434  0.9955892  -2.1541984  ...  1.0210408   0.50952953
  -0.54835117]
 ...
 [ 0.03898301  0.20178224  0.21429719 ... -0.14357395  0.07893235
  -0.14259668]
 [ 0.04930685  0.19253528 -0.05725256 ... -0.12810715 -0.02253911
   0.00785525]
 [ 0.00791195  0.2127562  -0.06422442 ... -0.12108771 -0.07557514
   0.0074276 ]]


In [9]:
# Vocabulary size, followed by the first four entries of the index-to-word list.
print(len(model.wv.index_to_key))
print(model.wv.index_to_key[:4])

3350
['の', '。', 'て', '、']


In [10]:
# Compare row 0 of the embedding matrix with the vector looked up by word.
# "の" is index_to_key[0] in the recorded run, so both lines should print
# the same vector.
print(model.wv.vectors[0])
# Index the keyed vectors directly instead of calling __getitem__ by name.
print(model.wv["の"])

[ 0.7120657   0.9290544  -0.01387589  0.8431612  -0.28564057 -1.5449731
  1.1611814   0.7183047  -0.28267118  0.04394454  0.24095774 -0.4776052
 -0.39154112  0.3174479  -0.8414111   0.40956867 -0.08599903 -0.42737916
 -0.6340406  -0.12286481  0.10612803 -0.34313044  0.8747316   0.06452833
  0.53198206 -0.21549374  0.60858375 -0.05527045 -0.54901123 -0.49158138
 -0.25172132 -0.01307109  0.375001    0.5309404   0.07656481 -0.08895757
  0.3327615  -0.5438785  -0.11015219 -1.0861014  -0.8979848   0.39977914
 -0.2594511   0.2671707  -0.66395503 -0.69032395 -0.1086839   0.24756001
 -0.53718483  0.37004802  0.04115601  1.091614   -0.26160043  0.14846142
  0.41558516 -0.917943    0.50750005 -0.5062759  -0.36552373  0.15128836
 -0.503548    0.48603344 -0.5492006  -0.35823166  0.00953872 -0.01086549
 -0.20687822  0.19075246  0.01313731  0.40030032 -0.19919592  0.9205057
 -0.23445487 -0.57850343  0.02226331  0.7280019  -0.21577698  0.48628312
 -0.21845183 -0.11362222 -0.10200728  0.48045012  1.18

In [11]:
# Words most similar to "猫", printed as a list of (word, score) pairs.
print(model.wv.most_similar("猫"))

[('人間', 0.7354174852371216), ('教師', 0.7237818241119385), ('恋', 0.6710265874862671), ('芸術', 0.6575531959533691), ('戦争', 0.6448022127151489), ('傾向', 0.6438324451446533), ('彼等', 0.6406911015510559), ('水彩', 0.6373931169509888), ('君子', 0.6294218897819519), ('本人', 0.6265374422073364)]


In [12]:
import numpy as np

In [13]:
# Recompute the similarity between "猫" and "人間" by hand, to check that the
# score reported by most_similar is the cosine similarity of the two vectors.
# Vectors are fetched by indexing rather than by calling __getitem__ by name.
a = model.wv['猫']
b = model.wv['人間']
# Division order is kept as dot / |a| / |b| so the printed float is unchanged.
cos_sim = np.dot(a, b) / np.linalg.norm(a) / np.linalg.norm(b)
print(cos_sim)

0.7354175


In [14]:
# Query with two positive terms, "猫" and "人間"; the cell displays the
# resulting list of (word, score) pairs.
model.wv.most_similar(positive=['猫','人間'])

[('彼等', 0.7480913400650024),
 ('世間', 0.74677574634552),
 ('事実', 0.7283207178115845),
 ('君子', 0.7270220518112183),
 ('戦争', 0.723979115486145),
 ('者', 0.717056393623352),
 ('傾向', 0.7163284420967102),
 ('逆上', 0.7067831754684448),
 ('教師', 0.705712080001831),
 ('充分', 0.6999576091766357)]

In [15]:
# Same two positive terms, now with "夢" passed as a negative term.
model.wv.most_similar(positive=['人間','猫'],negative=['夢'])

[('教師', 0.7346369028091431),
 ('芸術', 0.6949732899665833),
 ('恋', 0.6397367119789124),
 ('充分', 0.6005122065544128),
 ('実業', 0.5831262469291687),
 ('出来る', 0.5781002640724182),
 ('的', 0.5722547769546509),
 ('智識', 0.5705852508544922),
 ('君子', 0.5703545808792114),
 ('世間', 0.5653780698776245)]

In [16]:
# Single positive term "教師" with "夢" as the negative term.
model.wv.most_similar(positive=['教師'],negative=['夢'])

[('えらい', 0.6565906405448914),
 ('嫌', 0.6372228860855103),
 ('いえ', 0.6282118558883667),
 ('あれ', 0.6207963228225708),
 ('月並', 0.588832676410675),
 ('生徒', 0.5778862833976746),
 ('実業', 0.5764115452766418),
 ('金田', 0.5702313184738159),
 ('芸術', 0.5681326389312744),
 ('しかし', 0.5670684576034546)]