In [1]:
!pip install sentence_transformers
!pip install konlpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 1.8 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.22.1-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 4.4 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 33.9 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 33.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 33.9 MB/s 
Building wheels for collected pa

In [2]:
import numpy as np
import itertools

from konlpy.tag import Okt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [3]:
doc = '서비스로 날치알밥도 맛있게 잘 먹었어요'

In [4]:
okt = Okt()

tokenized_doc = okt.pos(doc)
tag = ['Noun','Adjective']
tokenized_nouns = ' '.join([word[0] for word in tokenized_doc if word[1] in tag])

print('품사 태깅 10개만 출력 :',tokenized_doc[:10])
print('명사 추출 :',tokenized_nouns)

품사 태깅 10개만 출력 : [('서비스', 'Noun'), ('로', 'Josa'), ('날치', 'Noun'), ('알밥', 'Noun'), ('도', 'Josa'), ('맛있게', 'Adjective'), ('잘', 'Verb'), ('먹었어요', 'Verb')]
명사 추출 : 서비스 날치 알밥 맛있게


In [5]:
n_gram_range = (1,2)

count = CountVectorizer(ngram_range=n_gram_range).fit([tokenized_nouns])
candidates = count.get_feature_names_out()

print('trigram 개수 :',len(candidates))
print('trigram 다섯개만 출력 :',candidates[:5])

trigram 개수 : 7
trigram 다섯개만 출력 : ['날치' '날치 알밥' '맛있게' '서비스' '서비스 날치']


In [None]:
model = SentenceTransformer('sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens')


In [None]:
doc_embedding = model.encode([doc])
candidate_embeddings = model.encode(candidates)

In [None]:
top_n = 5
distances = cosine_similarity(doc_embedding, candidate_embeddings)
keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]
print(keywords)

['알밥', '서비스', '날치 알밥', '서비스 날치', '날치']


In [None]:
def max_sum_sim(doc_embedding, candidate_embeddings, words, top_n, nr_candidates):
    # 문서와 각 키워드들 간의 유사도
    distances = cosine_similarity(doc_embedding, candidate_embeddings)

    # 각 키워드들 간의 유사도
    distances_candidates = cosine_similarity(candidate_embeddings, 
                                            candidate_embeddings)

    # 코사인 유사도에 기반하여 키워드들 중 상위 top_n개의 단어를 pick.
    words_idx = list(distances.argsort()[0][-nr_candidates:])
    words_vals = [candidates[index] for index in words_idx]
    distances_candidates = distances_candidates[np.ix_(words_idx, words_idx)]

    # 각 키워드들 중에서 가장 덜 유사한 키워드들간의 조합을 계산
    min_sim = np.inf
    candidate = None
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum([distances_candidates[i][j] for i in combination for j in combination if i != j])
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]

In [None]:
max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n=5, nr_candidates=5)

['알밥', '서비스', '날치 알밥', '서비스 날치', '날치']

In [None]:
def summary(doc):
  okt = Okt()
  
  a = []

  tokenized_doc = okt.pos(doc)
  tokenized_nouns = ' '.join([word[0] for word in tokenized_doc if word[1] == 'Noun'])

  n_gram_range = (1,2)

  count = CountVectorizer(ngram_range=n_gram_range).fit([tokenized_nouns])
  candidates = count.get_feature_names_out()

  # doc_embedding = model.encode([doc])
  # candidate_embeddings = model.encode(candidates)

  # top_n = 5
  # distances = cosine_similarity(doc_embedding, candidate_embeddings)
  # keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]

  for i in range(len(candidates)):
    a.append(candidates[i])
  b = candidates[:2].tolist()
  for i in range(len(b)):
    a.append(b[i])
  for i in a:
    i.split()
  # print(max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n=5, nr_candidates=3))
  # print(a)
  print(set(a))

In [None]:
summary('솔직히 예전에 비해 소스양이 너무 줄었네요')

{'예전 소스', '예전', '소스'}


In [None]:
okt = Okt()

a = []

tokenized_doc = okt.pos(doc)
tokenized_nouns = ' '.join([word[0] for word in tokenized_doc if word[1] == 'Noun'])

n_gram_range = (1,2)

count = CountVectorizer(ngram_range=n_gram_range).fit([tokenized_nouns])
candidates = count.get_feature_names_out()

# doc_embedding = model.encode([doc])
# candidate_embeddings = model.encode(candidates)

# top_n = 5
# distances = cosine_similarity(doc_embedding, candidate_embeddings)
# keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]

for i in range(len(candidates)):
  a.append(candidates[i])
b = candidates[:2].tolist()
for i in range(len(b)):
  a.append(b[i])
for i in range(len(a)):

In [None]:
c = []
for i in a:
  c.append(i.split())


In [None]:
c

[['가능'],
 ['가능', '기준'],
 ['기준'],
 ['기준', '요금'],
 ['다른'],
 ['다른', '배달'],
 ['떡볶이'],
 ['떡볶이', '혼자'],
 ['먹기'],
 ['먹기', '배달'],
 ['배달'],
 ['배달', '가능'],
 ['배달', '떡볶이'],
 ['부담'],
 ['요금'],
 ['요금', '부담'],
 ['혼자'],
 ['혼자', '먹기'],
 ['가능'],
 ['가능', '기준']]

In [None]:
doc = '솔직히 예전에 비해 소스양이 너무 줄었네요'
tokenized_doc = okt.pos(doc)
tokenized_nouns = ' '.join([word[0] for word in tokenized_doc if word[1] == 'Noun'])

n_gram_range = (1,2)

count = CountVectorizer(ngram_range=n_gram_range).fit([tokenized_nouns])
candidates = count.get_feature_names_out()

In [None]:
doc = '솔직히 예전에 비해 소스양이 너무 줄었네요'
doc_embedding = model.encode([doc])
candidate_embeddings = model.encode(candidates)

In [None]:
max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n=5, nr_candidates=1)

IndexError: ignored