# コード例：シソーラス、カウントと推論に基づいた設計（生実装、spacy編）
- 補足
    - 自然言語処理は利用するツールによって操作が大きく異なります。ここでは代表的な前処理（文分割、トークナイズ、ステミング等）を観察しやすくすることを優先しています。後日より使いやすいツールについても紹介する予定です。
- 全体の流れ
    - 事前準備
    - シソーラスの例
    - Bag-of-Words
    - sklearnのBoWとTF-IDFを使った例
    - 共起行列に基づいた単語のベクトル化
    - 相互情報量による分散表現の高度化
    - SVDによる次元削減

## 事前準備
Google Colab標準に追加で spacy を利用する。また英語の事前学習済みモデルとして en_core_web_sm のダウンロードが必要。

In [81]:
# To install spaCy and download the model, take the commands out of the string below and run them.
'''
!pip install -U spacy
!python -m spacy download en_core_web_sm
'''

'\n!pip install -U spacy\n!python -m spacy download en_core_web_sm\n'

In [82]:
import numpy as np
import pandas as pd
import sklearn.metrics.pairwise as pairwise
import spacy

In [83]:
# Shared spaCy pipeline; later cells and functions refer to this `nlp` object.
nlp = spacy.load("en_core_web_sm")

# Example documents (three documents)
docs = []
docs.append("You can get dis-counted price with trade-in.")
docs.append("iPhone 11 shoots beautifully sharp 4K video at 60 fps across all its cameras.")
docs.append("From $16.62/mo. or $399 with trade-in.")

# Morphological analysis
# token.text: the token text as segmented by the tokenizer.
# token.lemma_: the base form of the token.
# token.pos_: Universal POS tags: https://universaldependencies.org/u/pos/

doc = nlp(docs[1])
print(doc.text)
for token in doc:
  print(token.text, token.lemma_, token.pos_)

# Named-entity extraction example
doc = nlp(docs[1])
for entity in doc.ents:
  print(entity.text, entity.label_)

# Highlight the named entities inside the sentence
from spacy import displacy
displacy.render(doc, style="ent", jupyter=True)


iPhone 11 shoots beautifully sharp 4K video at 60 fps across all its cameras.
iPhone iPhone PROPN
11 11 NUM
shoots shoot NOUN
beautifully beautifully ADV
sharp sharp ADJ
4 4 NUM
K k NOUN
video video NOUN
at at ADP
60 60 NUM
fps fps NOUN
across across ADP
all all DET
its its PRON
cameras camera NOUN
. . PUNCT
11 CARDINAL
4 CARDINAL
60 CARDINAL


In [84]:
# Tokenize and lemmatize every document, printing each token's lemma and POS tag.
for doc in docs:
    processed = nlp(doc)
    for token in processed:
        print(token.lemma_, token.pos_)

you PRON
can AUX
get VERB
dis NOUN
- PUNCT
count VERB
price NOUN
with ADP
trade NOUN
- PUNCT
in NOUN
. PUNCT
iPhone PROPN
11 NUM
shoot NOUN
beautifully ADV
sharp ADJ
4 NUM
k NOUN
video NOUN
at ADP
60 NUM
fps NOUN
across ADP
all DET
its PRON
camera NOUN
. PUNCT
from ADP
$ SYM
16.62 NUM
/ SYM
mo PROPN
. PROPN
or CCONJ
$ SYM
399 NUM
with ADP
trade NOUN
- PUNCT
in NOUN
. PUNCT


## シソーラスの例
- preprocess_docs(): テキストに対する前処理の例。
- simple_matching(): ユーザクエリに対する単純な単語マッチングによるスコアを算出。
- relation_matching(): 単純マッチングに加え、シソーラスを使って加点する例。

### preprocess_docs()

In [85]:
def preprocess_docs(docs, stopwords=None):
    '''Tokenize and lemmatize English documents, dropping stopwords.

    Each document is parsed with the module-level spaCy pipeline ``nlp``.
    Every token is replaced by its lower-cased lemma; tokens whose lemma is
    listed in ``stopwords`` are removed.

    :param docs(list): documents, one string per document.
    :param stopwords: collection of lemmas to drop. ``None`` (default) keeps
        every token.
    :return (list): one list of lower-cased lemmas per document, in token order.
    '''
    # Avoid a mutable default argument.
    if stopwords is None:
        stopwords = ()

    result = []
    for doc in docs:
        temp = []
        for token in nlp(doc):
            lemma = token.lemma_
            lowered = lemma.lower()
            # Test the stored (lower-cased) form as well as the raw lemma, so a
            # stopword such as 'iphone' also filters the lemma 'iPhone'.
            if lemma in stopwords or lowered in stopwords:
                continue
            temp.append(lowered)
        result.append(temp)
    return result

# Preprocess the example documents and show each one before and after.
docs2 = preprocess_docs(docs)
for index in range(len(docs2)):
    print('before: ', docs[index])
    print('after: ', docs2[index])
    print('----')


before:  You can get dis-counted price with trade-in.
after:  ['you', 'can', 'get', 'dis', '-', 'count', 'price', 'with', 'trade', '-', 'in', '.']
----
before:  iPhone 11 shoots beautifully sharp 4K video at 60 fps across all its cameras.
after:  ['iphone', '11', 'shoot', 'beautifully', 'sharp', '4', 'k', 'video', 'at', '60', 'fps', 'across', 'all', 'its', 'camera', '.']
----
before:  From $16.62/mo. or $399 with trade-in.
after:  ['from', '$', '16.62', '/', 'mo', '.', 'or', '$', '399', 'with', 'trade', '-', 'in', '.']
----


### simple_matching()

In [86]:
# simple matching
def simple_matching(query, docs):
    '''Score each document by counting exact matches with the query terms.

    The query is split on single spaces. For every document, each
    (document element, query term) pair that compares equal adds 1.

    :param query(str): the query (search request), terms separated by spaces.
    :param docs(list): documents; each document is iterated element by element
        (e.g. a list of tokens).
    :return (list): one integer score per document.
    '''
    keys = query.split(" ")
    return [
        sum(1 for word in doc for key in keys if key == word)
        for doc in docs
    ]

# Score the preprocessed documents (docs2) against a sample user query.
user_query = "how much iphone"
scores = simple_matching(user_query, docs2)
print('simple_matching scores = ', scores)


simple_matching scores =  [0, 1, 0]


### relation_matching()

In [87]:
# relation matching
# Hand-made thesaurus: relation name -> list of related terms.
related_words = {}
related_words['buy'] = ['buy', '$', 'price', 'how much', 'trade-in']
related_words['UX'] = ['UX', 'stylish', 'seamless']

def relation_matching(query, docs, related_words):
    '''Score documents by simple matching plus bonus points from related terms.

    Starts from the ``simple_matching`` scores of the query. Then, for every
    query term and every relation, if the term is contained in any of that
    relation's related terms, the related terms are joined into a new query
    and its ``simple_matching`` scores are added. A relation is added once per
    query term that triggers it. Each addition is also printed.

    :param query(str): the query (search request), terms separated by spaces.
    :param docs(list): documents, passed through to ``simple_matching``.
    :param related_words(dict): relation name -> list of related terms.
    :return (list): one score per document.
    '''
    scores = simple_matching(query, docs)

    for q in query.split(" "):
        for relation, words in related_words.items():
            if not any(q in word for word in words):
                continue
            expanded_query = ' '.join(words)
            temp_scores = simple_matching(expanded_query, docs)
            print(f'# q = {q}, relation = {relation} => temp_scores = {temp_scores}')
            scores = list(np.array(scores) + np.array(temp_scores))
    return list(scores)

# Compare the plain matching scores (computed earlier into `scores`) with the thesaurus-boosted scores.
scores2 = relation_matching(user_query, docs2, related_words)
print('simple_matching scores = ', scores)
print('relation_matching scores = ', scores2)

# q = how, relation = buy => temp_scores = [1, 0, 2]
# q = much, relation = buy => temp_scores = [1, 0, 2]
simple_matching scores =  [0, 1, 0]
relation_matching scores =  [2, 1, 4]


## Bag-of-Words
- collect_words_eng(): 英文書集合から単語コードブック作成
- make_vectors_eng(): コードブックを素性とする文書ベクトルを作る
- euclidean_distance(): ユークリッド距離
- cosine_distance(): コサイン距離
- cosine_similarity(): コサイン類似度

### collect_words_eng()

In [88]:
import scipy.spatial.distance as distance

# BoW
# Example documents (three documents)
docs3 = []
docs3.append("This is test.")
docs3.append("That is test too.")
docs3.append("There are so many many tests.")


# Build the set of term features (the codebook) from a document collection
def collect_words_eng(docs, stopwords=set(), pos=[]):
    '''Build a word codebook from a collection of English documents.

    Each document is parsed with the module-level spaCy pipeline ``nlp``
    (the same one the other functions in this file use, so the model is not
    reloaded on every call). Every token is reduced to its lemma; a lemma is
    added to the codebook the first time it is seen, unless it is a stopword
    or its POS tag is excluded by ``pos``.

    :param docs(list): documents, one string per document.
    :param stopwords(set): lemmas to exclude. Never modified here.
    :param pos(list): POS tags to keep; empty (default) keeps every tag.
        Never modified here.

    :return (list): unique lemmas in order of first appearance.
    '''
    codebook = []
    seen = set()  # O(1) duplicate check; `codebook` keeps first-seen order
    for doc in docs:
        for sent in nlp(doc).sents:      # sentence segmentation
            for token in sent:           # tokens of the sentence
                this_word = token.lemma_  # lemmatization
                if this_word in seen or this_word in stopwords:
                    continue
                # Apply the POS filter only when tags were requested.
                if pos and token.pos_ not in pos:
                    continue
                seen.add(this_word)
                codebook.append(this_word)
    return codebook

# Codebook with no stopwords and no POS filter.
codebook = collect_words_eng(docs3)
print('codebook = ',codebook)

codebook =  ['this', 'be', 'test', '.', 'that', 'too', 'there', 'so', 'many']


### ストップワードの導入
codebookには「this」「.」といったトークンが含まれている。これらは不要なので、ストップワードとして設定して除外する。

In [89]:
# Rebuild the codebook with 'this' and '.' excluded as stopwords.
stopwords = {'this', '.'}
codebook = collect_words_eng(docs3, stopwords)
print('codebook = ',codebook)

codebook =  ['be', 'test', 'that', 'too', 'there', 'so', 'many']


### 特定品詞のみ抽出
目的によっては品詞を指定する方が対象語を抽出しやすいかもしれない。

In [90]:
# Rebuild the codebook keeping only nouns and adjectives (Universal POS tags).
pos = ['NOUN', 'ADJ']
codebook = collect_words_eng(docs3, stopwords, pos)
print('codebook = ',codebook)

codebook =  ['test', 'many']


### make_vectors_eng()

In [91]:
# Build document vectors whose features are the codebook entries (direct vector generation)
def make_vectors_eng(docs, codebook):
    '''Turn each document into a vector of lemma counts over the codebook.

    Documents are parsed with the module-level spaCy pipeline ``nlp``.

    :param docs(list): documents, one string per document.
    :param codebook(list): list of unique words (lemmas) used as features.
    :return (list): one vector per document; element k is the number of times
        codebook[k] occurs as a lemma in that document (0 if it never occurs).
    '''
    vectors = []
    for doc in docs:
        # Count lemma occurrences for this document.
        counts = {}
        for sent in nlp(doc).sents:
            for token in sent:
                lemma = token.lemma_
                counts[lemma] = counts.get(lemma, 0) + 1

        # Project the counts onto the codebook order; absent words count as 0.
        vectors.append([counts.get(term, 0) for term in codebook])
    return vectors

# Vectorize docs3 with the current codebook and show each document next to its vector.
vectors = make_vectors_eng(docs3, codebook)
for index in range(len(docs3)):
    print('docs[{}] = {}'.format(index,docs3[index]))
    print('vectors[{}] = {}'.format(index,vectors[index]))
    print('----')

docs[0] = This is test.
vectors[0] = [1, 0]
----
docs[1] = That is test too.
vectors[1] = [1, 0]
----
docs[2] = There are so many many tests.
vectors[2] = [1, 2]
----


### ユークリッド距離、コサイン類似度によるベクトル比較

In [92]:
def euclidean_distance(vectors):
    '''Compute all pairwise Euclidean distances between the given vectors.

    :param vectors: sequence of equal-length numeric vectors.
    :return (list): list of lists; element [i][j] is the norm of
        vectors[i] - vectors[j].
    '''
    points = np.array(vectors)
    return [
        [np.linalg.norm(row - other) for other in points]
        for row in points
    ]

# Pairwise Euclidean distances between the document vectors, one row per document.
distances = euclidean_distance(vectors)
print('# euclidean_distance')
for index in range(len(distances)):
    print(distances[index])

# NOTE: `distances` is reused below but now holds cosine similarities
# (from sklearn.metrics.pairwise), not distances.
distances = pairwise.cosine_similarity(vectors)
print('# cosine_similarity')
for index in range(len(distances)):
    print(distances[index])

# euclidean_distance
[0.0, 0.0, 2.0]
[0.0, 0.0, 2.0]
[2.0, 2.0, 0.0]
# cosine_similarity
[1.        1.        0.4472136]
[1.        1.        0.4472136]
[0.4472136 0.4472136 1.       ]


## sklearnのBoWとTF-IDFを使った例
ステミング、ストップワード等の指定もできるが、細かな制御はしにくいかも。（主観）

### BoW

In [93]:
import sklearn.feature_extraction.text as fe_text

def bow(docs):
    '''Build Bag-of-Words vectors with sklearn's CountVectorizer.

    The vectorizer is created with ``stop_words='english'`` and fitted on
    ``docs``.

    :param docs(list): documents, one string per document.
    :return: tuple ``(vectors, vectorizer)``: the document-term counts
        converted with ``toarray()``, and the fitted vectorizer.
    '''
    count_vectorizer = fe_text.CountVectorizer(stop_words='english')
    doc_term_matrix = count_vectorizer.fit_transform(docs)
    return doc_term_matrix.toarray(), count_vectorizer

# Vectorize the raw example documents and show the counts as a table
# whose columns are the vectorizer's feature names.
vectors, vectorizer = bow(docs)
print('# normal BoW')
column_names = vectorizer.get_feature_names_out()
print(vectors)
df = pd.DataFrame(vectors, columns=column_names.tolist())
df

# normal BoW
[[0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1 0]
 [1 0 0 1 1 0 1 1 0 0 1 1 0 0 1 1 0 1]
 [0 1 1 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0]]


Unnamed: 0,11,16,399,4k,60,62,beautifully,cameras,counted,dis,fps,iphone,mo,price,sharp,shoots,trade,video
0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,1,0
1,1,0,0,1,1,0,1,1,0,0,1,1,0,0,1,1,0,1
2,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0


### TF-IDF

In [94]:
def bow_tfidf(docs):
    '''Build TF-IDF weighted Bag-of-Words vectors with sklearn's TfidfVectorizer.

    The vectorizer is created with ``norm=None`` and ``stop_words='english'``
    and fitted on ``docs``.

    :param docs(list): documents, one string per document.
    :return: tuple ``(vectors, vectorizer)``: the weighted document-term
        matrix converted with ``toarray()``, and the fitted vectorizer.
    '''
    tfidf_vectorizer = fe_text.TfidfVectorizer(norm=None, stop_words='english')
    weighted_matrix = tfidf_vectorizer.fit_transform(docs)
    return weighted_matrix.toarray(), tfidf_vectorizer

# Same documents as the BoW cell, now with TF-IDF weights, shown as a table.
vectors, vectorizer = bow_tfidf(docs)
print('# BoW + tfidf')
column_names = vectorizer.get_feature_names_out()
df = pd.DataFrame(vectors, columns=column_names.tolist())
df

# BoW + tfidf


Unnamed: 0,11,16,399,4k,60,62,beautifully,cameras,counted,dis,fps,iphone,mo,price,sharp,shoots,trade,video
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.693,1.693,0.0,0.0,0.0,1.693,0.0,0.0,1.288,0.0
1,1.693,0.0,0.0,1.693,1.693,0.0,1.693,1.693,0.0,0.0,1.693,1.693,0.0,0.0,1.693,1.693,0.0,1.693
2,0.0,1.693,1.693,0.0,0.0,1.693,0.0,0.0,0.0,0.0,0.0,0.0,1.693,0.0,0.0,0.0,1.288,0.0


## 共起行列に基づいた単語のベクトル化
- preprocess(): テキストに対する前処理。
- create_co_matrix(): 共起行列を作成。
- most_similar(): コサイン類似度Top5を出力。

### preprocess()

In [95]:
import pandas as pd

# Sample text for the co-occurrence examples; len() below counts characters, not words.
sentence = 'pandas is an open source programming tools. The best way to get pandas is via conda. "conda install pandas"'
print(sentence)
print('len(sentence) = ', len(sentence))


def preprocess(text):
    """Convert a text into a sequence of word ids plus lookup tables.

    Based on "Deep Learning from Scratch 2" (ゼロから作るDeep Learning 2), p.66.

    The text is lower-cased, every '.' becomes its own token, double quotes
    are removed, and the result is split on whitespace. Ids are assigned in
    order of first appearance, starting at 0.

    :param text(str): input text.
    :return:
      corpus(np.ndarray): the text as a 1-D integer array of word ids.
      word_to_id(dict): word -> id.
      id_to_word(dict): id -> word.
    """
    text = text.lower().replace('.', ' .').replace('"', '')
    # split() without an argument collapses runs of whitespace, so consecutive
    # spaces, newlines or an empty text never produce '' as a vocabulary word.
    words = text.split()

    word_to_id = {}
    id_to_word = {}
    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word
    # An explicit integer dtype keeps an empty corpus integer-typed as well.
    corpus = np.array([word_to_id[w] for w in words], dtype=int)
    return corpus, word_to_id, id_to_word

# Convert the sample sentence to word ids; vocab_size is the number of distinct words.
corpus, word_to_id, id_to_word = preprocess(sentence)
vocab_size = len(word_to_id)
print(corpus)
print(word_to_id)
print(id_to_word)

pandas is an open source programming tools. The best way to get pandas is via conda. "conda install pandas"
len(sentence) =  107
[ 0  1  2  3  4  5  6  7  8  9 10 11 12  0  1 13 14  7 14 15  0]
{'pandas': 0, 'is': 1, 'an': 2, 'open': 3, 'source': 4, 'programming': 5, 'tools': 6, '.': 7, 'the': 8, 'best': 9, 'way': 10, 'to': 11, 'get': 12, 'via': 13, 'conda': 14, 'install': 15}
{0: 'pandas', 1: 'is', 2: 'an', 3: 'open', 4: 'source', 5: 'programming', 6: 'tools', 7: '.', 8: 'the', 9: 'best', 10: 'way', 11: 'to', 12: 'get', 13: 'via', 14: 'conda', 15: 'install'}


### create_co_matrix()

In [96]:
def create_co_matrix(corpus, vocab_size, window_size=1):
    """Build a co-occurrence matrix from a sequence of word ids.

    Based on "Deep Learning from Scratch 2" (ゼロから作るDeep Learning 2), p.72.

    For every position, each word found within ``window_size`` positions to
    the left or right adds 1 to ``co_matrix[word_id, neighbour_id]``.

    :param corpus: sequence of word ids (indexable, supports ``len``).
    :param vocab_size: vocabulary size; the result is vocab_size x vocab_size.
    :param window_size: how many positions on each side count as co-occurring.
    :return: int32 NumPy array of co-occurrence counts.
    """
    n_tokens = len(corpus)
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)

    for center, word_id in enumerate(corpus):
        for offset in range(1, window_size + 1):
            # Left neighbour first, then right; skip positions outside the corpus.
            for neighbor in (center - offset, center + offset):
                if 0 <= neighbor < n_tokens:
                    co_matrix[word_id, corpus[neighbor]] += 1
    return co_matrix

# Co-occurrence counts with a window of 2 on each side, shown as a word-by-word table.
co_matrix = create_co_matrix(corpus, vocab_size, window_size=2)
df = pd.DataFrame(co_matrix, index=word_to_id.keys(), columns=word_to_id.keys())
df

Unnamed: 0,pandas,is,an,open,source,programming,tools,.,the,best,way,to,get,via,conda,install
pandas,0,2,1,0,0,0,0,0,0,0,0,1,1,1,1,1
is,2,0,1,1,0,0,0,0,0,0,0,0,1,1,1,0
an,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0
open,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0
source,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0
programming,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0
tools,0,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0
.,0,0,0,0,0,1,1,0,1,1,0,0,0,1,2,1
the,0,0,0,0,0,0,1,1,0,1,1,0,0,0,0,0
best,0,0,0,0,0,0,0,1,1,0,1,1,0,0,0,0


### most_similar()

In [97]:
def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    """Print the words most similar to ``query`` by cosine similarity.

    The query's row of ``word_matrix`` is compared with the row of every word
    id, and the ``top`` most similar words other than the query itself are
    printed with their similarity. Prints a message and returns early if the
    query is not in the vocabulary.

    :param query(str): the query word.
    :param word_to_id(dict): word -> id.
    :param id_to_word(dict): id -> word.
    :param word_matrix: matrix whose row i is the vector of word id i
        (e.g. a co-occurrence matrix).
    :param top(int): how many words to print.
    :return: None.
    """
    if query not in word_to_id:
        print('%s is not found' % query)
        return

    print('[query] ' + query)
    query_id = word_to_id[query]
    query_vec = word_matrix[query_id]

    vocab_size = len(word_to_id)
    similarity = np.zeros(vocab_size)
    for i in range(vocab_size):
        # cosine_similarity returns a (1, 1) array. Take the single element
        # explicitly: storing an ndim > 0 array into a scalar slot is
        # deprecated since NumPy 1.25.
        similarity[i] = pairwise.cosine_similarity([word_matrix[i]], [query_vec])[0, 0]

    count = 0
    # Negate so that argsort's ascending order yields the highest similarity first.
    for i in (-1 * similarity).argsort():
        if id_to_word[i] == query:
            continue
        print(' %s: %s' % (id_to_word[i], similarity[i]))
        count += 1
        if count >= top:
            return

# Rank words by similarity to "pandas" using the raw co-occurrence rows as word vectors.
print('\n# most_similar() with co_matrix')
user_query = "pandas"
most_similar(user_query, word_to_id, id_to_word, co_matrix)


# most_similar() with co_matrix
[query] pandas
 conda: 0.5477225575051663
 open: 0.4743416490252569
 get: 0.4743416490252569
 via: 0.4743416490252569
 is: 0.4216370213557839


## 相互情報量による分散表現の高度化
- ppmi(): Positive PMI（正の相互情報量）。

### ppmi()

In [98]:
def ppmi(C, verbose=False, eps=1e-8):
    """Compute the Positive PMI (positive pointwise mutual information) matrix.

    Based on "Deep Learning from Scratch 2" (ゼロから作るDeep Learning 2), p.79.

    Each cell is ``max(0, log2(C[i, j] * N / (S[j] * S[i]) + eps))`` where
    ``N`` is the sum of all entries of ``C`` and ``S`` holds its column sums.
    NOTE: the column sums are used for both words, which equals the row sums
    only when ``C`` is symmetric.

    :param C: co-occurrence matrix (2-D NumPy array).
    :param verbose(boolean): print progress while processing.
    :param eps(float): small value added before np.log2 so the result is not -inf.
    :return: float32 array with the same shape as ``C`` holding the PPMI values.
    """
    M = np.zeros_like(C, dtype=np.float32)
    N = np.sum(C)
    S = np.sum(C, axis=0)
    total = C.shape[0] * C.shape[1]
    # Report about every 1% of the cells. Clamp to 1 so matrices with fewer
    # than 100 cells do not trigger a modulo-by-zero.
    step = max(1, total // 100)
    cnt = 0

    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            pmi = np.log2(C[i, j] * N / (S[j]*S[i]) + eps)
            M[i, j] = max(0, pmi)

            if verbose:
                cnt += 1
                if cnt % step == 0:
                    # Percentage of processed cells (ratio times 100).
                    print('%.1f%% done' % (100 * cnt / total))
    return M

# Re-weight the co-occurrence counts with PPMI and show the result as a word-by-word table.
M = ppmi(co_matrix)
print('\n# PPMI')
df2 = pd.DataFrame(M, index=word_to_id.keys(), columns=word_to_id.keys())
df2


# PPMI


Unnamed: 0,pandas,is,an,open,source,programming,tools,.,the,best,way,to,get,via,conda,install
pandas,0.0,1.478,1.285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.285,1.285,1.285,0.285,1.7
is,1.478,0.0,1.478,1.478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.478,1.478,0.478,0.0
an,1.285,1.478,0.0,2.285,2.285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
open,0.0,1.478,2.285,0.0,2.285,2.285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
source,0.0,0.0,2.285,2.285,0.0,2.285,2.285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
programming,0.0,0.0,0.0,2.285,2.285,0.0,2.285,1.285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tools,0.0,0.0,0.0,0.0,2.285,2.285,0.0,1.285,2.285,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.,0.0,0.0,0.0,0.0,0.0,1.285,1.285,0.0,1.285,1.285,0.0,0.0,0.0,1.285,1.285,1.7
the,0.0,0.0,0.0,0.0,0.0,0.0,2.285,1.285,0.0,2.285,2.285,0.0,0.0,0.0,0.0,0.0
best,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.285,2.285,0.0,2.285,2.285,0.0,0.0,0.0,0.0


### 表示桁数調整

In [99]:
#np.set_printoptions(precision=3) # display precision 3 (only the display is shortened; the data is kept)
pd.options.display.precision = 3 # same as above, for pandas display
print('\n# PPMI with precision=3')
df2 = pd.DataFrame(M, index=word_to_id.keys(), columns=word_to_id.keys())
df2


# PPMI with precision=3


Unnamed: 0,pandas,is,an,open,source,programming,tools,.,the,best,way,to,get,via,conda,install
pandas,0.0,1.478,1.285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.285,1.285,1.285,0.285,1.7
is,1.478,0.0,1.478,1.478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.478,1.478,0.478,0.0
an,1.285,1.478,0.0,2.285,2.285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
open,0.0,1.478,2.285,0.0,2.285,2.285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
source,0.0,0.0,2.285,2.285,0.0,2.285,2.285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
programming,0.0,0.0,0.0,2.285,2.285,0.0,2.285,1.285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tools,0.0,0.0,0.0,0.0,2.285,2.285,0.0,1.285,2.285,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.,0.0,0.0,0.0,0.0,0.0,1.285,1.285,0.0,1.285,1.285,0.0,0.0,0.0,1.285,1.285,1.7
the,0.0,0.0,0.0,0.0,0.0,0.0,2.285,1.285,0.0,2.285,2.285,0.0,0.0,0.0,0.0,0.0
best,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.285,2.285,0.0,2.285,2.285,0.0,0.0,0.0,0.0


## SVDによる次元削減
- np.linalg.svd(): 線形代数ライブラリを利用。

### svd()

In [100]:
# svd
# Decompose the PPMI matrix. NOTE(review): per the NumPy docs, the third value
# returned by np.linalg.svd is already transposed (Vh); it is not used below.
U, S, V = np.linalg.svd(M)
print('\n# SVD: dense vectors with all singular values')
print(U)

# Keep only the first `use_s_values` columns of U as low-dimensional word vectors.
use_s_values = 2
U2 = U[:,0:use_s_values]
print('\n# SVD: dense vectors with singular values = {}'.format(use_s_values))
print(U2)

# Repeat the similarity ranking with the reduced vectors.
print('\n# most_similar() with SVD-2')
most_similar(user_query, word_to_id, id_to_word, U2)


# SVD: dense vectors with all singular values
[[-0.20909968 -0.05742509 -0.4260506  -0.17504837  0.2712971   0.04672289
   0.18823615 -0.18591174  0.36170715 -0.02090307  0.33414906  0.12803775
   0.20772134 -0.17504339  0.12016748 -0.49748433]
 [-0.20854177  0.06963065 -0.3888203   0.1402168   0.20635465  0.15388878
  -0.2102995  -0.13104966 -0.38273078  0.39125082  0.11626503 -0.32180378
  -0.35602686 -0.3037927  -0.09017423  0.09666564]
 [-0.23961712  0.26655334 -0.18719402  0.05390574 -0.35268205  0.2592225
  -0.32773367 -0.09349788  0.2730225   0.0377553  -0.40234843 -0.31079715
   0.17229767  0.28257638 -0.12586372 -0.25696287]
 [-0.2782892   0.35838643 -0.04367146 -0.32175225  0.05215078  0.23192607
   0.21995124  0.49517277  0.11888672 -0.1294459   0.23939379 -0.12363093
  -0.32118776  0.27299288  0.13509066  0.19793455]
 [-0.3138919   0.39330027  0.14138082  0.21887659  0.3510343   0.1303136
   0.33645037 -0.16895744 -0.4147285  -0.14546476 -0.16639212  0.12915234
   0.364361