In [1]:
# !pip install sentence-transformers datasets
# !pip install scikit-learn
# !pip install --upgrade pip
# !pip install SpeechRecognition
# !pip install pyautogui
# !pip install clipboard
# !pip install keyboard
# !pip install pyaudio

In [2]:
import math
import logging
from datetime import datetime

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from sentence_transformers import SentenceTransformer,  LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample

In [3]:
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)

In [4]:
model_name = "klue/roberta-base"

train_batch_size = 32
num_epochs = 1
model_save_path = "output/training_klue_sts_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [5]:
embedding_model = models.Transformer(model_name)

pooler = models.Pooling(
    embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for

In [6]:
model = SentenceTransformer(modules=[embedding_model, pooler])

2022-12-22 16:00:32 - Use pytorch device: cuda


In [7]:
datasets = load_dataset("klue", "sts")
testsets = load_dataset("kor_nlu", "sts")

2022-12-22 16:00:36 - Found cached dataset klue (C:/Users/maili/.cache/huggingface/datasets/klue/sts/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e)


  0%|          | 0/2 [00:00<?, ?it/s]

2022-12-22 16:00:40 - Found cached dataset kor_nlu (C:/Users/maili/.cache/huggingface/datasets/kor_nlu/sts/1.0.0/4facbba77df60b0658056ced2052633e681a50187b9428bd5752ebd59d332ba8)


  0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
train_samples = []
dev_samples = []
test_samples = []

# KLUE STS 내 훈련, 검증 데이터 예제 변환
for phase in ["train", "validation"]:
    examples = datasets[phase]

    for example in examples:
        score = float(example["labels"]["label"]) / 5.0  # 0.0 ~ 1.0 스케일로 유사도 정규화

        inp_example = InputExample(
            texts=[example["sentence1"], example["sentence2"]], 
            label=score,
        )

        if phase == "validation":
            dev_samples.append(inp_example)
        else:
            train_samples.append(inp_example)

# KorSTS 내 테스트 데이터 예제 변환
for example in testsets["test"]:
    score = float(example["score"]) / 5.0

    if example["sentence1"] and example["sentence2"]:
        inp_example = InputExample(
            texts=[example["sentence1"], example["sentence2"]],
            label=score,
        )

    test_samples.append(inp_example)

In [9]:
train_samples[0].texts, train_samples[0].label
test_samples[0].texts, test_samples[0].label

train_dataloader = DataLoader(
    train_samples,
    shuffle=True,
    batch_size=train_batch_size,
)
train_loss = losses.CosineSimilarityLoss(model=model)

In [10]:
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    dev_samples,
    name="sts-dev",
)

In [11]:
warmup_steps = math.ceil(len(train_dataloader) * num_epochs  * 0.1)  # 10% of train data for warm-up
logging.info(f"Warmup-steps: {warmup_steps}")

2022-12-22 16:00:41 - Warmup-steps: 37


# 모델 학습

In [12]:
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/365 [00:00<?, ?it/s]

2022-12-22 16:02:12 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset after epoch 0:
2022-12-22 16:02:13 - Cosine-Similarity :	Pearson: 0.8649	Spearman: 0.8618
2022-12-22 16:02:13 - Manhattan-Distance:	Pearson: 0.8669	Spearman: 0.8602
2022-12-22 16:02:13 - Euclidean-Distance:	Pearson: 0.8672	Spearman: 0.8602
2022-12-22 16:02:13 - Dot-Product-Similarity:	Pearson: 0.8555	Spearman: 0.8483
2022-12-22 16:02:13 - Save model to output/training_klue_sts_klue-roberta-base-2022-12-22_16-00-28


In [14]:
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')

2022-12-22 16:02:35 - Load pretrained SentenceTransformer: output/training_klue_sts_klue-roberta-base-2022-12-22_16-00-28
2022-12-22 16:02:35 - Use pytorch device: cuda


In [15]:
test_evaluator(model, output_path=model_save_path)

2022-12-22 16:02:35 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-test dataset:
2022-12-22 16:02:38 - Cosine-Similarity :	Pearson: 0.7642	Spearman: 0.7526
2022-12-22 16:02:38 - Manhattan-Distance:	Pearson: 0.7632	Spearman: 0.7576
2022-12-22 16:02:38 - Euclidean-Distance:	Pearson: 0.7629	Spearman: 0.7572
2022-12-22 16:02:38 - Dot-Product-Similarity:	Pearson: 0.7439	Spearman: 0.7300


0.7575724107071362

In [16]:
docs = [
         "친구가 갑자기 다리밑에서 배가 아파요",
        "동료가 길을 가다가 쓰러졌습니다"
]
document_embeddings = model.encode(docs)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [17]:
query = "친구가 갑자기 다리밑에서 쓰러졌습니다"
query_embedding = model.encode(query)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [18]:
top_k = min(5, len(docs))

# 입력 문장 - 문장 후보군 간 코사인 유사도 계산 후,
cos_scores = util.pytorch_cos_sim(query_embedding, document_embeddings)[0]

# 코사인 유사도 순으로 `top_k` 개 문장 추출
top_results = torch.topk(cos_scores, k=top_k)

print(f"입력 문장: {query}")
print(f"\n<입력 문장과 유사한 {top_k} 개의 문장>\n")

for i, (score, idx) in enumerate(zip(top_results[0], top_results[1])):
    print(f"{i+1}: {docs[idx]} {'(유사도: {:.4f})'.format(score)}\n")

입력 문장: 친구가 갑자기 다리밑에서 쓰러졌습니다

<입력 문장과 유사한 2 개의 문장>

1: 친구가 갑자기 다리밑에서 배가 아파요 (유사도: 0.7487)

2: 동료가 길을 가다가 쓰러졌습니다 (유사도: 0.5807)



# 모델 테스트

In [19]:
def model(model):
    model_save_path = model
    model = SentenceTransformer(model_save_path)
    test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
    test_evaluator(model, output_path=model_save_path)
    print(test_evaluator(model, output_path=model_save_path))
    
ModelList =  ["training_klue_sts_klue-roberta-base-2022-12-18_19-36-15" , # 1번
              "training_klue_sts_klue-roberta-base-2022-12-18_19-51-30" , # 100번
              "training_klue_sts_klue-roberta-base-2022-12-18_22-46-00" , # 200번
              "training_klue_sts_klue-roberta-base-2022-12-19_04-33-15"   # 300번
             ]   

for i in range(len(ModelList)):
    model(ModelList[i])

2022-12-22 16:02:38 - Load pretrained SentenceTransformer: training_klue_sts_klue-roberta-base-2022-12-18_19-36-15
2022-12-22 16:02:39 - Use pytorch device: cuda
2022-12-22 16:02:39 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-test dataset:
2022-12-22 16:02:41 - Cosine-Similarity :	Pearson: 0.7590	Spearman: 0.7489
2022-12-22 16:02:41 - Manhattan-Distance:	Pearson: 0.7568	Spearman: 0.7541
2022-12-22 16:02:41 - Euclidean-Distance:	Pearson: 0.7564	Spearman: 0.7537
2022-12-22 16:02:41 - Dot-Product-Similarity:	Pearson: 0.7375	Spearman: 0.7273
2022-12-22 16:02:41 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-test dataset:
2022-12-22 16:02:43 - Cosine-Similarity :	Pearson: 0.7590	Spearman: 0.7489
2022-12-22 16:02:43 - Manhattan-Distance:	Pearson: 0.7568	Spearman: 0.7541
2022-12-22 16:02:43 - Euclidean-Distance:	Pearson: 0.7564	Spearman: 0.7537
2022-12-22 16:02:43 - Dot-Product-Similarity:	Pearson: 0.7375	Spearman: 0.7273
0.7540743241026938
2022-12-22 16:02:43 - L

# 크게 차이는 없지만 모델 중 그나마 300번 정도 학습한 모델이 가장 정확도가 높다.

In [20]:
model_save_path = "training_klue_sts_klue-roberta-base-2022-12-19_04-33-15"
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
test_evaluator(model, output_path=model_save_path)
print(test_evaluator(model, output_path=model_save_path))

docs = [
        "친구가 갑자기 다리밑에서 배가 아파요",
        "동료가 길을 가다가 쓰러졌습니다"
]
document_embeddings = model.encode(docs)

query = "친구가 갑자기 다리밑에서 쓰러졌습니다"
query_embedding = model.encode(query)


top_k = min(5, len(docs))

# 입력 문장 - 문장 후보군 간 코사인 유사도 계산 후,
cos_scores = util.pytorch_cos_sim(query_embedding, document_embeddings)[0]

# 코사인 유사도 순으로 `top_k` 개 문장 추출
top_results = torch.topk(cos_scores, k=top_k)

print(f"입력 문장: {query}")
print(f"\n<입력 문장과 유사한 {top_k} 개의 문장>\n")

for i, (score, idx) in enumerate(zip(top_results[0], top_results[1])):
    if score > 0.2:
        print(f"{i+1}: {docs[idx]} {'(유사도: {:.4f})'.format(score)}\n")

2022-12-22 16:02:58 - Load pretrained SentenceTransformer: training_klue_sts_klue-roberta-base-2022-12-19_04-33-15
2022-12-22 16:02:59 - Use pytorch device: cuda
2022-12-22 16:02:59 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-test dataset:
2022-12-22 16:03:01 - Cosine-Similarity :	Pearson: 0.7715	Spearman: 0.7616
2022-12-22 16:03:01 - Manhattan-Distance:	Pearson: 0.7691	Spearman: 0.7671
2022-12-22 16:03:01 - Euclidean-Distance:	Pearson: 0.7690	Spearman: 0.7670
2022-12-22 16:03:01 - Dot-Product-Similarity:	Pearson: 0.7392	Spearman: 0.7263
2022-12-22 16:03:01 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-test dataset:
2022-12-22 16:03:03 - Cosine-Similarity :	Pearson: 0.7715	Spearman: 0.7616
2022-12-22 16:03:03 - Manhattan-Distance:	Pearson: 0.7691	Spearman: 0.7671
2022-12-22 16:03:03 - Euclidean-Distance:	Pearson: 0.7690	Spearman: 0.7670
2022-12-22 16:03:03 - Dot-Product-Similarity:	Pearson: 0.7392	Spearman: 0.7263
0.7670928479133706


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

입력 문장: 친구가 갑자기 다리밑에서 쓰러졌습니다

<입력 문장과 유사한 2 개의 문장>

1: 친구가 갑자기 다리밑에서 배가 아파요 (유사도: 0.7658)

2: 동료가 길을 가다가 쓰러졌습니다 (유사도: 0.5795)

