# Chap. 2

[WordNet](https://wordnet.princeton.edu/)

## カウントベースの手法

分布仮説（Distributional Hypothesis）: The Distributional Hypothesis is that words that occur in the same contexts tend to have similar meanings (Harris, 1954)

In [1]:
import numpy as np

def preprocess(text):
    """Create a word list and corresponding index list
    """
    text = text.lower()
    text = text.replace('.', ' .')
    words = text.split(' ')

    word_to_id = {}
    id_to_word = {}

    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word
        
        #print(word_to_id)
        #print(id_to_word)

    # create a corpus
    corpus = [word_to_id[w] for w in words]
    corpus = np.array(corpus)
    
    return corpus, word_to_id, id_to_word

def create_co_matrix(corpus, vocab_size, window_size=1):
    corpus_size = len(corpus)
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)
    
    for idx, word_id in enumerate(corpus):
        for i in range(1, window_size + 1):
            left_idx = idx-1
            right_idx = idx+1
            
            if left_idx >= 0:
                left_word_id = corpus[left_idx]
                co_matrix[word_id, left_word_id] += 1
                
            if right_idx < corpus_size:
                right_word_id = corpus[right_idx]
                co_matrix[word_id, right_word_id] += 1
                
    return co_matrix

def cos_similarity(x, y, eps=1e-8):
    nx = x / (np.sqrt(np.sum(x**2)) + eps)
    ny = y / (np.sqrt(np.sum(y**2)) + eps)
    return np.dot(nx, ny)

def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    if query not in word_to_id:
        print("%s is not found" % query)
        return
    
    print('\n[query] '+query)
    query_id = word_to_id[query]
    query_vec = word_matrix[query_id]
    
    vocab_size = len(id_to_word)
    similarity = np.zeros(vocab_size)
    for i in range(vocab_size):
        similarity[i] = cos_similarity(word_matrix[i], query_vec)
        
    count = 0
    for i in (-1 * similarity).argsort():
        if id_to_word[i] == query:
            continue
        print('%s: %s' % (id_to_word[i], similarity[i]))
        count += 1
        if count >= top:
            return
        
text = "You say goodbye and I say hello."
corpus, word_to_id, id_to_word = preprocess(text)
C = create_co_matrix(corpus, len(word_to_id))

c0 = C[word_to_id['you']]
c1 = C[word_to_id['i']]
#print(cos_similarity(c0, c1))
most_similar('you', word_to_id, id_to_word, C)
print(C)


[query] you
goodbye: 0.7071067691154799
i: 0.7071067691154799
hello: 0.7071067691154799
say: 0.0
and: 0.0
[[0 1 0 0 0 0 0]
 [1 0 1 0 1 1 0]
 [0 1 0 1 0 0 0]
 [0 0 1 0 1 0 0]
 [0 1 0 1 0 0 0]
 [0 1 0 0 0 0 1]
 [0 0 0 0 0 1 0]]


#### Improve count base method

* [(Positive) Pointwise mutual information](https://en.wikipedia.org/wiki/Pointwise_mutual_information)
* [SVD](https://en.wikipedia.org/wiki/Singular_value_decomposition)

In [2]:
def ppmi(C, verbose=False, eps=1e-8):
    M = np.zeros_like(C, dtype=np.float32)
    N = np.sum(C)
    S = np.sum(C, axis=0)
    total = C.shape[0] * C.shape[1]
    cnt = 0
    
    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            pmi = np.log2(C[i, j] * N / (S[j]*S[i]) + eps)
            M[i, j] = max(0, pmi)
            
            if verbose:
                cnt += 1
                if cnt % (total//100) == 0:
                    print('%.1f%% done' % (100*cnt/total))
                    
    return M

In [3]:
import sys
import numpy as np
import matplotlib.pyplot as plt

text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)
W = ppmi(C)

# SVD
U, S, V = np.linalg.svd(W)

np.set_printoptions(precision=3)
print('covariance matrix')
print(C)
print('-'*50)
print('PPMI')
print(W)
print('-'*50)
print('U (SVD)')
print(U)

for word, word_id in word_to_id.items():
    plt.annotate(word, (U[word_id, 0], U[word_id, 1]))
    
plt.scatter(U[:,0], U[:,1], alpha=0.5)
plt.show()

covariance matrix
[[0 1 0 0 0 0 0]
 [1 0 1 0 1 1 0]
 [0 1 0 1 0 0 0]
 [0 0 1 0 1 0 0]
 [0 1 0 1 0 0 0]
 [0 1 0 0 0 0 1]
 [0 0 0 0 0 1 0]]
--------------------------------------------------
PPMI
[[0.    1.807 0.    0.    0.    0.    0.   ]
 [1.807 0.    0.807 0.    0.807 0.807 0.   ]
 [0.    0.807 0.    1.807 0.    0.    0.   ]
 [0.    0.    1.807 0.    1.807 0.    0.   ]
 [0.    0.807 0.    1.807 0.    0.    0.   ]
 [0.    0.807 0.    0.    0.    0.    2.807]
 [0.    0.    0.    0.    0.    2.807 0.   ]]
--------------------------------------------------
U (SVD)
[[-3.409e-01 -1.110e-16 -3.886e-16 -1.205e-01  0.000e+00  9.323e-01
   2.226e-16]
 [ 0.000e+00 -5.976e-01  1.802e-01  0.000e+00 -7.812e-01  0.000e+00
   0.000e+00]
 [-4.363e-01 -4.241e-17 -2.172e-16 -5.088e-01 -1.767e-17 -2.253e-01
  -7.071e-01]
 [-2.614e-16 -4.978e-01  6.804e-01 -6.574e-17  5.378e-01  9.951e-17
   1.201e-17]
 [-4.363e-01 -3.229e-17 -1.654e-16 -5.088e-01 -1.345e-17 -2.253e-01
   7.071e-01]
 [-7.092e-01 -3.229e-

<Figure size 640x480 with 1 Axes>

# Chap. 3

##### カウントベースの手法の問題点

コーパスで扱う語彙数が増えた際に、共起行列が巨大になることに加え、SVDは計算量が $O(n^3)$ なので現実的でなくなる。

## word2vec

* CBOW (Continuous bag-of-words)
    - コンテキストからターゲットを推測することを目的としたニューラルネットワーク
    - 「ターゲット」は中央の単語、その周囲の単語が「コンテキスト」
* skip-gram

In [51]:
import numpy as np

class MatMul:
    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.x = None
        
    def forward(self, x):
        W, = self.params
        out = np.dot(x, W)
        self.x = x
        return out
    
    def backward(self, dout):
        W, = self.params
        dx = np.dot(dout, W.T)
        dW = np.dot(self.x.T, dout)
        self.grads[0][...] = dW
        return dx
    
def convert_one_hot(target, vocab_size):
    new_shape = (*target.shape, vocab_size)
    one_hot = np.zeros(new_shape, dtype=int)
    return one_hot

def create_contexts_target(corpus, window_size=1):
    target = corpus[window_size:-window_size]
    contexts = []
    
    for idx in range(window_size, len(corpus)-window_size):
        cs = []
        for t in range(-window_size, window_size + 1):
            if t == 0:
                continue
            cs.append(corpus[idx + t])
        contexts.append(cs)
        
    return np.array(contexts), np.array(target)

contexts, target = create_contexts_target(corpus, window_size=1)
one_hot = convert_one_hot(target, 7)
print(one_hot)

[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]
