In [1]:
import csv
import os
import time

from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model used to encode the corpus further down.
model = SentenceTransformer('all-MiniLM-L6-v2')


# Remote location of the Quora duplicate-questions TSV and the local file it is saved to.
url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
dataset_path = "quora_duplicate_questions.tsv"
# Reading stops once this many unique sentences have been collected.
max_corpus_size = 500000


# Download the sample dataset only when it is not already present locally.
if not os.path.exists(dataset_path):
    util.http_get(url, dataset_path)

# Collect the unique questions from both columns of the TSV.
# A dict (insertion-ordered) is used instead of a set so that the corpus order,
# and therefore the sentence ids printed per cluster, is identical on every run.
unique_sentences = {}
# newline='' is what the csv module asks for, so quoted fields that contain
# line breaks are parsed by the reader rather than by the file object.
with open(dataset_path, encoding='utf8', newline='') as file:
    reader = csv.DictReader(file, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
    for row in reader:
        unique_sentences[row['question1']] = None
        unique_sentences[row['question2']] = None
        # Checked once per row, so the cap may be exceeded by one sentence.
        if len(unique_sentences) >= max_corpus_size:
            break

corpus_sentences = list(unique_sentences)
# Report the corpus size explicitly; a bare `len(...)` in the middle of a cell displays nothing.
print("Corpus size: {} unique sentences".format(len(corpus_sentences)))

print("Encode the corpus. This might take a while")
# Embeddings are returned as a single tensor (convert_to_tensor=True) for the clustering step.
corpus_embeddings = model.encode(corpus_sentences, batch_size=64, show_progress_bar=True, convert_to_tensor=True)


print("Start clustering")
# Wall-clock start, used to report how long community detection took.
start_time = time.time()

# Parameters: minimum number of sentences per cluster == 10, minimum similarity == 0.75.
clusters = util.community_detection(corpus_embeddings, min_community_size=10, threshold=0.75)

# Report the elapsed time; previously start_time was recorded but never used.
print("Clustering done after {:.2f} sec".format(time.time() - start_time))


# For every cluster, print a header followed by its first three and last three sentences.
for cluster_no, members in enumerate(clusters, start=1):
    print(f"\nCluster {cluster_no}, #{len(members)} Elements ")
    leading, trailing = members[:3], members[-3:]
    for sentence_id in leading:
        print("\t", corpus_sentences[sentence_id])
    # Marker for the sentences skipped between the two previews.
    print("\t", "...")
    for sentence_id in trailing:
        print("\t", corpus_sentences[sentence_id])

  from .autonotebook import tqdm as notebook_tqdm


Encode the corpus. This might take a while


Batches: 100%|██████████| 7813/7813 [00:35<00:00, 221.25it/s]


Start clustering
Clustering done after 198.52 sec

Cluster 1, #681 Elements 
	 What are some things new employees should know going into their first day at Align Technologies?
	 What are some things new employees should know going into their first day at Agile Therapeutics?
	 What are some things new employees should know going into their first day at Cepheid?
	 ...
	 What are some things new employees should know going into their first day at Chase?
	 What are some things new employees should know going into their first day at Hanmi Financial?
	 What are some things new employees should know going into their first day at Hallmark Financial?

Cluster 2, #458 Elements 
	 What is a good inpatient drug and alcohol rehab center near Kossuth County IA?
	 Which is a suitable inpatient drug and alcohol rehab center near Bertie County NC?
	 Which is a suitable inpatient drug and alcohol rehab center in Henderson County IL?
	 ...
	 What is a suitable inpatient drug and alcohol rehab center near

In [4]:
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model handed to the paraphrase-mining helper.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Small example corpus; every sentence is compared against every other one.
sentences = [
    "The cat sits outside",
    "A man is playing guitar",
    "I love pasta",
    "The new movie is awesome",
    "The cat plays in the garden",
    "A woman watches TV",
    "The new movie is so great",
    "Do you like pizza?",
]

# Each result unpacks into (score, i, j), where i and j index into `sentences`.
paraphrases = util.paraphrase_mining(model, sentences)

# Show the first ten results with the score rounded to four decimals.
for score, i, j in paraphrases[:10]:
    print(f"{sentences[i]} | {sentences[j]} -> Score: {score:.4f}")

The new movie is awesome | The new movie is so great -> Score: 0.8939
The cat sits outside | The cat plays in the garden -> Score: 0.6788
I love pasta | Do you like pizza? -> Score: 0.5096
I love pasta | The new movie is so great -> Score: 0.2560
I love pasta | The new movie is awesome -> Score: 0.2440
A man is playing guitar | The cat plays in the garden -> Score: 0.2105
The new movie is awesome | Do you like pizza? -> Score: 0.1969
The new movie is so great | Do you like pizza? -> Score: 0.1692
The cat sits outside | A woman watches TV -> Score: 0.1310
The cat plays in the garden | Do you like pizza? -> Score: 0.0900


In [11]:
# Semantic textual similarity

from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model used for both sentence lists.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Two parallel lists: the k-th entry of one is reported against the k-th entry of the other.
sentences1 = [
    "The cat sits outside",
    "A man is playing guitar",
    "The new movie is awesome",
]

sentences2 = [
    "The dog plays in the garden",
    "A woman watches TV",
    "The new movie is so great",
]

# Compute the embeddings of each list.
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

# Cosine similarity between the two lists (a 3x3 matrix here):
# rows follow sentences1, columns follow sentences2.
cosine_scores = util.cos_sim(embeddings1, embeddings2)

# Only the diagonal is printed, i.e. the score of each same-position pair.
for idx, (left, right) in enumerate(zip(sentences1, sentences2)):
    print(f"{left} \t\t {right} \t\t Score: {cosine_scores[idx][idx]:.4f}")

tensor([[ 0.2838,  0.1310, -0.0029],
        [ 0.2277, -0.0327, -0.0136]], device='cuda:0')
The cat sits outside 		 The dog plays in the garden 		 Score: 0.2838
A man is playing guitar 		 A woman watches TV 		 Score: -0.0327


In [6]:
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model for the example corpus below.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Single list of sentences; each is compared with every other one.
sentences = [
    "The cat sits outside",
    "A man is playing guitar",
    "I love pasta",
    "The new movie is awesome",
    "The cat plays in the garden",
    "A woman watches TV",
    "The new movie is so great",
    "Do you like pizza?",
]

# Embed the sentences, then score every sentence against every sentence.
embeddings = model.encode(sentences, convert_to_tensor=True)
cosine_scores = util.cos_sim(embeddings, embeddings)

# Keep one entry per unordered pair (i < j), which skips self-comparisons
# and mirrored duplicates.
n_sentences = len(cosine_scores)
pairs = [
    {'index': [i, j], 'score': cosine_scores[i][j]}
    for i in range(n_sentences - 1)
    for j in range(i + 1, n_sentences)
]

# Highest similarity first.
pairs.sort(key=lambda entry: entry['score'], reverse=True)

# Print the ten best-scoring pairs.
for pair in pairs[:10]:
    i, j = pair['index']
    print(f"{sentences[i]} \t\t {sentences[j]} \t\t Score: {pair['score']:.4f}")

The new movie is awesome 		 The new movie is so great 		 Score: 0.8939
The cat sits outside 		 The cat plays in the garden 		 Score: 0.6788
I love pasta 		 Do you like pizza? 		 Score: 0.5096
I love pasta 		 The new movie is so great 		 Score: 0.2560
I love pasta 		 The new movie is awesome 		 Score: 0.2440
A man is playing guitar 		 The cat plays in the garden 		 Score: 0.2105
The new movie is awesome 		 Do you like pizza? 		 Score: 0.1969
The new movie is so great 		 Do you like pizza? 		 Score: 0.1692
The cat sits outside 		 A woman watches TV 		 Score: 0.1310
The cat plays in the garden 		 Do you like pizza? 		 Score: 0.0900
