In [None]:
import platform
from collections import Counter

import numpy as np

# tokenizer import
from konlpy.tag import Okt, Komoran, Hannanum, Kkma

# The Mecab binding is installed differently depending on the operating system.
if platform.system() == "Windows":
    # On Windows, Mecab is imported from the `eunjeon` package.
    try:
        from eunjeon import Mecab
    except ImportError:
        # Only import failures are handled here, so unrelated errors (and
        # Ctrl-C) still surface. Mecab stays undefined in this case.
        print("please install eunjeon module")
else:  # non-Windows, e.g. Ubuntu
    from konlpy.tag import Mecab

from typing import List, Callable, Union, Any, TypeVar, Tuple, Dict


In [None]:
# Load the corpus: every line of ./sents.txt is treated as one sentence.
# The encoding is pinned because open() otherwise falls back to the platform's
# locale encoding, which is not UTF-8 on a default Windows setup.
# NOTE(review): assumes sents.txt is stored as UTF-8 — confirm.
with open("./sents.txt", "r", encoding="utf-8") as f:
    sents = f.read().split("\n")
print(sents)

In [None]:
def get_tokenizer(tokenizer_name):
    """Build a new morphological analyzer for the given name.

    Recognised names are "komoran", "okt", "mecab", "hannanum" and "kkma".
    Any other value falls back to Mecab.

    Args:
        tokenizer_name: Name of the analyzer to construct.

    Returns:
        A freshly constructed instance of the selected analyzer class.
    """
    if tokenizer_name == "komoran":
        return Komoran()
    if tokenizer_name == "okt":
        return Okt()
    if tokenizer_name == "hannanum":
        return Hannanum()
    if tokenizer_name == "kkma":
        return Kkma()
    # "mecab" itself and every unrecognised name end up here.
    return Mecab()

In [None]:
# Analyzer instances keyed by tokenizer name, so each backend is constructed
# once instead of on every call.
_TOKENIZER_CACHE = {}


def _cached_tokenizer(name):
    """Return the analyzer for `name`, building it via get_tokenizer on first use."""
    if name not in _TOKENIZER_CACHE:
        _TOKENIZER_CACHE[name] = get_tokenizer(name)
    return _TOKENIZER_CACHE[name]


def get_tokens(sent: str, noun=True, tokenizer="mecab") -> List[str]:
    """Tokenize a single sentence.

    Args:
        sent: The sentence to tokenize; it is passed as-is to the analyzer's
            `nouns` / `morphs` method.
        noun: If true, return the analyzer's nouns, keeping only tokens that
            are longer than one character. Otherwise return the analyzer's
            morphs unfiltered.
        tokenizer: Analyzer name, forwarded to `get_tokenizer`.

    Returns:
        The list of tokens extracted from `sent`.
    """
    tagger = _cached_tokenizer(tokenizer)

    if noun:
        # Single-character nouns are discarded.
        return [word for word in tagger.nouns(sent) if len(word) > 1]

    return tagger.morphs(sent)


In [None]:
from functools import partial

from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Tokens passed to CountVectorizer as stop words.
stopwords = ["연합뉴스", "가방"]

vectorizer = CountVectorizer(
    stop_words=stopwords,
    tokenizer=partial(get_tokens, noun=True, tokenizer="mecab"),
    min_df=2,
)


# Record the frequency of each word in the corpus (one row per sentence).
x = vectorizer.fit_transform(sents)
# Show the matrix dimensions; the sparse matrix exposes its shape directly,
# so no dense copy is needed for this.
print(x.shape)
print(x.toarray())


# Show which column index was assigned to each word.
vocab_idx = vectorizer.vocabulary_
print(vocab_idx)
# Reverse mapping: column index -> word.
idx_vocab = {idx: vocab for vocab, idx in vocab_idx.items()}
print(idx_vocab)

In [None]:
def vectorize_sents(
    sents: List[str], stopwords=None, min_count=2, tokenizer="mecab", noun=True
) -> Tuple[Any, Dict[str, int], Dict[int, str]]:
    """Turn sentences into a term-count matrix using `get_tokens`.

    Args:
        sents: Sentences to vectorize, one string per sentence.
        stopwords: Passed to CountVectorizer as `stop_words`.
        min_count: Passed to CountVectorizer as `min_df`.
        tokenizer: Analyzer name forwarded to `get_tokens`.
        noun: Forwarded to `get_tokens` as its `noun` flag.

    Returns:
        A tuple `(vec, vocab_idx, idx_vocab)`: the result of
        `fit_transform(sents)`, the fitted `vocabulary_` mapping
        (word -> index) and its inverse (index -> word).
    """
    vectorizer = CountVectorizer(
        stop_words=stopwords,
        # Forward the caller's choice of analyzer rather than a fixed one.
        tokenizer=partial(get_tokens, noun=noun, tokenizer=tokenizer),
        min_df=min_count,
    )

    vec = vectorizer.fit_transform(sents)
    vocab_idx = vectorizer.vocabulary_
    idx_vocab = {idx: vocab for vocab, idx in vocab_idx.items()}

    return vec, vocab_idx, idx_vocab

In [None]:
# Binary term-presence matrix: 1 where a sentence contains a term, else 0.
numerators = (x > 0) * 1

# Per-sentence normaliser: log of the sentence's total term count.
min_length = 1
denominators = np.asarray(x.sum(axis=1))
# Sentences with at most `min_length` counted terms get a very large length,
# which pushes their similarity scores towards zero.
denominators[np.where(denominators <= min_length)] = 10000
denominators = np.log(denominators)

# Expand the column of log-lengths into two square matrices:
# denom_log1[i, j] = log-length of sentence i, denom_log2[i, j] = that of j.
denom_log1 = np.matmul(denominators, np.ones(denominators.shape).T)
denom_log2 = np.matmul(np.ones(denominators.shape), denominators.T)

# Number of distinct terms shared by each pair of sentences. The sparse
# matrix's own product is used because np.dot is not sparse-aware.
sim_mat = numerators.dot(numerators.T)

# Shared-term count divided by the sum of the two sentences' log-lengths.
sim_mat = sim_mat / (denom_log1 + denom_log2)

# Drop weak links: similarities at or below the threshold are set to zero.
min_sim = 0.3
sim_mat[np.where(sim_mat <= min_sim)] = 0