In [1]:
import numpy as np

In [2]:
# 말뭉치 전처리

def preprocess(text) :
    text = text.lower()
    text = text.replace('.', ' .')
    words = text.split(' ')

    word_to_id = {}
    id_to_word = {}
    for word in words :
        new_id = len(word_to_id)
        word_to_id[word] = new_id
        id_to_word[new_id] = word

    corpus = np.array([word_to_id[w] for w in words])

    return corpus, word_to_id, id_to_word

In [3]:
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)

print(corpus)
print(word_to_id)
print(word_to_id)

[0 5 2 3 4 5 5 6]
{'you': 0, 'say': 5, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6}
{'you': 0, 'say': 5, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6}


## 동시발생 행렬

### 통계기반 기법 : 어떤 단어에 주목했을 때, 그 주변에 어떤 단어가 몇번이나 등장하는지를 세어 집계

In [5]:
import sys
sys.path.append('..')
import numpy as np
from common.util import preprocess

In [6]:
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)

print(corpus)
print(id_to_word)

[0 1 2 3 4 1 5 6]
{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}


In [9]:
# 모든 단어에 대해 동시발생하는 단어을 표로 정리 = 동시발생 행렬
# 위 text를 window size = 1로 했을 때 다음의 동시발생 행렬이 만들어짐

C = np.array([
    [0,1,0,0,0,0,0],
    [1,0,1,0,1,1,0],
    [0,1,0,1,0,0,0],
    [0,0,1,0,1,0,0],
    [0,1,0,1,0,0,0],
    [0,1,0,0,0,0,1],
    [0,0,0,0,0,1,0],
])


## id = 0인 단어의 벡터 표현 : you의 경우 동시발생은 say 뿐
print(C[0])

## id = 4인 단어의 벡터 표현 : you의 경우 동시발생은 say, and
print(C[4])

[0 1 0 0 0 0 0]
[0 1 0 1 0 0 0]


In [10]:
# 동시발생 행렬 만드는 함수

def create_co_matrix(corpus, vocab_size, window_size = 1) :
    corpus_size = len(corpus)
    co_matrix = np.zeros((vocab_size, vocab_size))

    for idx, word_id in enumerate(corpus) :
        for i in range(1, window_size+1) :
            left_idx = idx - 1
            right_idx = idx + 1

            if left_idx >= 0 :
                left_word_id = corpus[left_idx]
                co_matrix[word_id, left_word_id] += 1

            if right_idx < corpus_size :
                right_word_id = corpus[right_idx]
                co_matrix[word_id, right_word_id] += 1
    return co_matrix


In [14]:
result = create_co_matrix(corpus, 7, window_size=1)
print(result)

result == C



[[0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 1. 0. 1. 1. 0.]
 [0. 1. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 1. 0. 0.]
 [0. 1. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1. 0.]]


array([[ True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True]])

## 벡터 간 유사도

In [15]:
# 코사인 유사도

def cos_similarity(x, y, eps = 1e-8) :
    nx = x / (np.sqrt(np.sum(x**2)) + eps)
    ny = y / (np.sqrt(np.sum(y**2)) + eps)

    return np.dot(nx, ny)

In [16]:
corpus, word_to_id, id_to_word = preprocess(text)

vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)

c0 = C[word_to_id['you']]
c1 = C[word_to_id['i']]
print(cos_similarity(c0, c1))


0.7071067691154799


In [17]:
# 코사인 유사도 기반 유사도 랭킹

def most_similar(query, word_to_id, id_to_word, word_matrix, top = 5) :
    if query not in word_to_id :
        print('%s 를 찾을 수 없습니다.' %query)
        return

    print('\n[query]' + query)
    query_id = word_to_id[query]
    query_vec = word_matrix[query_id]

    ## 유사도 계산
    vocab_size = len(id_to_word)
    similarity = np.zeros(vocab_size)
    for i in range(vocab_size) :
        similarity[i] = cos_similarity(word_matrix[i], query_vec)

    ## 내림차순 출력
    count = 0
    for i in (-1*similarity).argsort() :
        if id_to_word[i] == query :
            continue
        print('%s : %s' %(id_to_word[i], similarity[i]))

        count += 1
        if count >= top :
            return

In [18]:
from common.util import preprocess, create_co_matrix, most_similar

text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)

most_similar('you', word_to_id, id_to_word, C, top = 5)

ImportError: cannot import name 'create_co_matrix' from 'common.util' (/Users/kimtaeyoung/Documents/GitHub/NLP_Scratch2/Ch02/../common/util.py)