In [1]:
import requests
import numpy as np
url = 'https://www.aozora.gr.jp/cards/000148/files/773_14560.html' #青空文庫
res = requests.get(url)
try:
 res.raise_for_status()
except Exception as exc:
 print('Error : {}'.format(exc))

from bs4 import BeautifulSoup as bs4
no_strach_soup = bs4(res.content, 'lxml')
print(type(no_strach_soup))
contents = no_strach_soup.find('div', class_="main_text")
text = contents.get_text()

#テキストの前処理
#不要な記号などを省く
import re
text = re.sub('（.+?）', '', text) 
text = re.sub('\n\n', '\n', text)
text = re.sub('\r', '', text)
text = re.sub(r'[\u3000 \t]','', text) 
text = re.sub(r'\（.+\）', '', text)
text = text.replace("「", "")
text = text.replace("」", "。")
text = text.replace("\n", "")
text = text.strip()

text = text.split("。")
print(text[:4])

<class 'bs4.BeautifulSoup'>
['上先生と私一私はその人を常に先生と呼んでいた', 'だからここでもただ先生と書くだけで本名は打ち明けない', 'これは世間を憚かる遠慮というよりも、その方が私にとって自然だからである', '私はその人の記憶を呼び起すごとに、すぐ先生']


In [2]:
from janome.tokenizer import Tokenizer
t = Tokenizer()
sentences = []

#形態素解析
#ストップワードとして、助詞など意味の薄い単語は省いた
for sentence in text:
    words = []
    for token in t.tokenize(sentence):
        hinshi = token.part_of_speech.split(",")
        if  hinshi[0] == "名詞" or hinshi[0] == "動詞" or hinshi[0] == "形容詞" or hinshi[0] == "副詞":
            words.append(token.surface)
    sentences.append(words)

In [3]:
print(sentences[:10])

[['上', '先生', '私', '一', '私', '人', '常に', '先生', '呼ん', 'い'], ['ここ', '先生', '書く', '本名', '打ち明け'], ['これ', '世間', '憚', 'かる', '遠慮', 'いう', '方', '私', '自然'], ['私', '人', '記憶', '呼び', '起す', 'ごと', 'すぐ', '先生'], ['いい', 'なる'], ['筆', '執っ', '心持', '事'], ['よそよそしい', '頭文字', 'とても', '使う', '気', 'なら'], ['私', '先生', '知り合い', 'なっ', 'の', '鎌倉', 'ある'], ['時', '私', 'まだ', '若々しい', '書生'], ['暑中', '休暇', '利用', 'し', '海水浴', '行っ', '友達', 'ぜひ', '来い', '端書', '受け取っ', '私', '多少', '金', '工面', 'し', '出掛ける', '事', 'し']]


In [4]:
words1 = sum(sentences, []) 
words1 = set(words1)

In [9]:
wordtoid = {}
idtoword = {}
for i, word in enumerate(words1):
    idtoword[i] = word
    wordtoid[word] = i

In [12]:
#文章をベクトル化するための辞書:corpusを作る
corpus = []
for sentence in sentences:
    sub = []
    for word in sentence:
        sub.append(wordtoid[word])
    corpus.append(sub)

In [13]:
print(corpus[:10])

[[6367, 4233, 4170, 386, 4170, 5663, 6034, 4233, 148, 4897], [5779, 4233, 6148, 1754, 3452], [5187, 5214, 3000, 5465, 4594, 85, 3716, 4170, 4313], [4170, 5663, 4145, 5069, 1152, 574, 4434, 4233], [843, 6093], [3577, 791, 2607, 2828], [5926, 5642, 993, 4174, 5420, 5956], [4170, 4233, 3220, 3314, 4241, 4193, 1953], [5637, 4170, 149, 1034, 1379], [4628, 1915, 2286, 3583, 5101, 4988, 3822, 4907, 2764, 2336, 956, 4170, 4731, 3227, 5740, 3583, 738, 2828, 3583]]


In [15]:
from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence

In [16]:
def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size*2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words = []
            label_word   = []            
            start = index - window_size
            end = index + window_size + 1
            
            context_words.append([words[i] 
                                 for i in range(start, end) 
                                 if 0 <= i < sentence_length 
                                 and i != index])
            label_word.append(word)

            x = sequence.pad_sequences(context_words, maxlen=context_length)
            y = np_utils.to_categorical(label_word, vocab_size)
            yield (x, y)

In [22]:
vocab_size = len(wordtoid)
embed_size = 100
window_size = 2 

#ウィンドウ（文脈の広さ）を２に設定
#共起行列を作る
i = 0
for x, y in generate_context_word_pairs(corpus=corpus, window_size=window_size, vocab_size=vocab_size):
    if 0 not in x[0]:
        print('Context (X):', [idtoword[w] for w in x[0]], '-> Target (Y):', idtoword[np.argwhere(y[0])[0][0]])
    
        if i == 10:
            break
        i += 1

Context (X): ['上', '先生', '一', '私'] -> Target (Y): 私
Context (X): ['先生', '私', '私', '人'] -> Target (Y): 一
Context (X): ['私', '一', '人', '常に'] -> Target (Y): 私
Context (X): ['一', '私', '常に', '先生'] -> Target (Y): 人
Context (X): ['私', '人', '先生', '呼ん'] -> Target (Y): 常に
Context (X): ['人', '常に', '呼ん', 'い'] -> Target (Y): 先生
Context (X): ['ここ', '先生', '本名', '打ち明け'] -> Target (Y): 書く
Context (X): ['これ', '世間', 'かる', '遠慮'] -> Target (Y): 憚
Context (X): ['世間', '憚', '遠慮', 'いう'] -> Target (Y): かる
Context (X): ['憚', 'かる', 'いう', '方'] -> Target (Y): 遠慮
Context (X): ['かる', '遠慮', '方', '私'] -> Target (Y): いう


In [24]:
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda

# build CBOW architecture
cbow = Sequential()
#embedding layer > onehot into vectorizering
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation='softmax'))
cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')

# view model summary
print(cbow.summary())

# visualize model structure
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

SVG(model_to_dot(cbow, show_shapes=True, show_layer_names=False, rankdir='TB').create(prog='dot', format='svg'))


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 100)            640300    
_________________________________________________________________
lambda_1 (Lambda)            (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 6403)              646703    
Total params: 1,287,003
Trainable params: 1,287,003
Non-trainable params: 0
_________________________________________________________________
None
('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')


AttributeError: 'NoneType' object has no attribute 'create'

In [36]:
for epoch in range(1, 3):
    loss = 0.
    i = 0
    for x, y in generate_context_word_pairs(corpus=corpus, window_size=window_size, vocab_size=vocab_size):
        i += 1
        loss += cbow.train_on_batch(x, y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:', epoch, '\tLoss:', loss)
    print()

Epoch: 1 	Loss: 596023.1958738164

Epoch: 2 	Loss: 604676.7702609171



In [37]:
weights = cbow.get_weights()[0]
weights = weights[1:]

from sklearn.metrics.pairwise import euclidean_distances

distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

similar_words = {search_term: [idtoword[idx] for idx in distance_matrix[wordtoid[search_term]-1].argsort()[1:9]+1] 
                   for search_term in ["先生", "私", "東京", "故郷"]}

similar_words

(6402, 6402)


{'先生': ['ため', 'ある', '行っ', '前', '言葉', 'すぐ', '一', 'なっ'],
 '私': ['し', 'の', 'もの', 'よう', '上', 'い', 'する', '中'],
 '東京': ['学校', '好い', '得', '後', '下', '身体', '場合', '我々'],
 '故郷': ['程度', '狭い', 'がら', '入り', '試み', '来い', '専門', '立派']}