In [46]:
from sentence_transformers import SentenceTransformer, util
import os
import csv

# Sentence-embedding model used to encode the corpus here; later cells reuse it.
model = SentenceTransformer('all-MiniLM-L6-v2')


url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
dataset_path = "quora_duplicate_questions.tsv"
max_corpus_size = 500000  # upper bound on the number of unique questions kept


# Download the sample dataset only when it is not already on disk.
if not os.path.exists(dataset_path):
    util.http_get(url, dataset_path)

# Collect the unique questions.
# A dict (insertion-ordered) is used instead of a set so that sentence IDs --
# and therefore the cluster contents computed below -- stay the same across
# kernel restarts instead of depending on string-hash randomisation.
unique_questions = {}
with open(dataset_path, encoding='utf8') as file:
    reader = csv.DictReader(file, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
    for row in reader:
        for column in ('question1', 'question2'):
            question = row[column]
            # Skip missing/empty fields, and check the cap per question so the
            # corpus never grows past max_corpus_size.
            if question and len(unique_questions) < max_corpus_size:
                unique_questions[question] = None
        if len(unique_questions) >= max_corpus_size:
            break

corpus_sentences = list(unique_questions)
corpus_embeddings = model.encode(corpus_sentences, batch_size=256, show_progress_bar=True, convert_to_tensor=True)

# Cluster the corpus: minimum community size is 5 and the similarity
# threshold is 0.85.
clusters = util.community_detection(corpus_embeddings, min_community_size=5, threshold=0.85)

# `clusters` is a two-level list:
#   outer level -> one entry per cluster
#   inner level -> sentence IDs, i.e. indices into `corpus_sentences`
#
# Typical usage:
#   for i, cluster in enumerate(clusters): ...
#   corpus_sentences[sentence_id]




Batches: 100%|██████████| 1954/1954 [00:17<00:00, 113.06it/s]


In [54]:
import random as rd
import torch
test = ['What is a suitable inpatient drug and alcohol rehab center near Scott County AR?']

# Number of sentences sampled from each cluster to estimate how similar the
# cluster is to the query. Clamped below so smaller clusters do not raise.
sample_size = 5
# Dedicated, seeded generator so the same sentences are sampled on every run.
sampler = rd.Random(42)

if not clusters:
    raise ValueError("no clusters available to compare the query against")

# Start from +/- infinity so the first cluster always initialises both
# extremes. (Starting both at 0.5 silently reported cluster 0 whenever no
# cluster scored below / above 0.5.)
low_val = float("inf")
high_val = float("-inf")
low_cluster = 0
high_cluster = 0
test_embedding = model.encode(test, convert_to_tensor=True)
for i, cluster in enumerate(clusters):
    rnd_sentence_id = sampler.sample(cluster, min(sample_size, len(cluster)))
    # Sentence IDs index into corpus_sentences / corpus_embeddings, so the
    # already-computed embeddings are reused instead of re-encoding the text.
    cluster_embeddings = corpus_embeddings[rnd_sentence_id]
    cosine_scores = util.cos_sim(test_embedding, cluster_embeddings)
    # Mean similarity between the query and the sampled sentences.
    cluster_cos_score = torch.mean(cosine_scores).item()

    # Two independent checks: one cluster may be both the current minimum and
    # the current maximum (e.g. the very first one).
    if cluster_cos_score < low_val:
        low_val = cluster_cos_score
        low_cluster = i
    if cluster_cos_score > high_val:
        high_val = cluster_cos_score
        high_cluster = i

# Next step: instead of random picks, extract the 3 most similar questions of
# the best cluster and the 3 least similar questions of the worst cluster.
high_question_cluster = [corpus_sentences[sentence_id] for sentence_id in clusters[high_cluster]]
low_question_cluster = [corpus_sentences[sentence_id] for sentence_id in clusters[low_cluster]]
            
        

In [56]:

# Embed every question of the two selected clusters.
high_question_cluster_embedding = model.encode(high_question_cluster, convert_to_tensor=True)
low_question_cluster_embedding = model.encode(low_question_cluster, convert_to_tensor=True)

# Similarity of the query embedding to each member of each cluster.
high_similarities = util.cos_sim(test_embedding, high_question_cluster_embedding)
low_similarities = util.cos_sim(test_embedding, low_question_cluster_embedding)

# Three highest similarities within the most similar cluster.
top3_val, top3_index = high_similarities.topk(k=3)
# Three lowest similarities within the least similar cluster, found as the
# three largest negated scores; note that bot3_val therefore holds the
# negated similarity values.
bot3_val, bot3_index = torch.neg(low_similarities).topk(k=3)


In [69]:
# Row 0 of each index tensor holds the selected positions within the
# corresponding cluster's question list.
top_positions = top3_index[0].tolist()
bottom_positions = bot3_index[0].tolist()

# Translate positions back into question text and show both selections.
output_high_question = [high_question_cluster[position] for position in top_positions]
output_low_question = [low_question_cluster[position] for position in bottom_positions]
print(output_high_question)
print(output_low_question)

['What is a suitable inpatient drug and alcohol rehab center near Scott County AR?', 'What is a suitable inpatient drug and alcohol rehab center near Miller County AR?', 'Which is a suitable inpatient drug and alcohol rehab center near Clark County AR?']
['Do the Chinese and Japanese actually hate each other?', 'Do the Japanese and Chinese hate each other?', 'Do Japanese really hate Chinese people?']
