<a href="https://colab.research.google.com/github/taehyeonk/pre-onboarding_project/blob/jina/220314_2_sub3_roberta_large_devX.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 3.3 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 23.1 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 65.0 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 6.7 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 36.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (

In [2]:
import argparse
import logging
import math
import os
import random
from datetime import datetime
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, LoggingHandler, models, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader
import json
from sentence_transformers.readers import InputExample

In [5]:
# Run configuration, declared as argparse options that all carry a default.
parser = argparse.ArgumentParser()
for _flag, _type, _default in (
    ("--model_name_or_path", str, "klue/roberta-large"),
    ("--max_seq_length", int, 128),
    ("--batch_size", int, 8),
    ("--num_epochs", int, 5),
    ("--output_dir", str, "output"),
    ("--output_prefix", str, "kor_sts_"),
    ("--seed", int, 777),
):
    parser.add_argument(_flag, type=_type, default=_default)
del _flag, _type, _default  # keep the loop temporaries out of the notebook namespace

# Parse an explicitly empty argument list instead of the kernel's sys.argv,
# so every option resolves to its default.
args = parser.parse_args([])

In [6]:
# Seed the Python, NumPy and PyTorch (CPU + CUDA) random number generators
# with the configured seed.
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)

# Emit INFO-level records, timestamped, through sentence-transformers' LoggingHandler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)

# Directory the fine-tuned model is written to:
#   <output_dir>/<output_prefix><model name with '/' replaced by '-'>-<timestamp>
model_tag = args.model_name_or_path.replace("/", "-")
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = os.path.join(args.output_dir, args.output_prefix + model_tag + '-' + run_timestamp)

In [7]:
# Build the sentence encoder from two modules: the pretrained transformer named
# in args.model_name_or_path (inputs capped at args.max_seq_length), followed by
# a pooling layer configured for mean-over-tokens pooling only.
word_embedding_model = models.Transformer(args.model_name_or_path, max_seq_length=args.max_seq_length)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)

# Use the GPU when one is available, but fall back to the CPU instead of
# failing outright on a runtime without CUDA.
model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model],
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

Downloading:   0%|          | 0.00/547 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it f

Downloading:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/734k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/173 [00:00<?, ?B/s]

In [9]:
def load_kor_sts_samples(filename):
    """Load labelled sentence pairs from a JSON file.

    The file must contain a JSON array of objects, each providing the keys
    ``sentence1``, ``sentence2`` and ``labels`` (itself an object with a
    numeric ``label`` entry).

    Args:
        filename: Path of the JSON file to read.

    Returns:
        A list with one ``InputExample`` per row, in file order. Each example
        carries the two sentences as ``texts`` and ``label / 5.0`` as ``label``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If a row lacks one of the expected keys.
    """
    # Decode explicitly as UTF-8: the sentences are Korean text, and the
    # platform's default encoding is not guaranteed to be UTF-8.
    with open(filename, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    samples = []
    for row in json_data:
        # Scale the label by 1/5 (maps a 0-5 similarity scale onto 0 ... 1).
        score = float(row['labels']['label']) / 5.0
        samples.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=score))
    return samples

In [10]:
# Mount Google Drive inside the Colab VM at /content/drive; the dataset files
# read by later cells are addressed through this mount point.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [11]:
# Switch the working directory to the dataset folder on Drive so that relative
# paths used later in the notebook (e.g. './klue-sts-v1.1_dev.json') resolve there.
%cd /content/drive/MyDrive/Colab Notebooks/klue-sts-v1.1

/content/drive/MyDrive/Colab Notebooks/klue-sts-v1.1


In [13]:
# Load the labelled training file and carve a validation split out of it.
logging.info("Read KorSTS train/dev dataset")

train_file = '/content/drive/MyDrive/Colab Notebooks/klue-sts-v1.1/klue-sts-v1.1_train.json'
train_samples = load_kor_sts_samples(train_file)

# The first 90% of the rows (in file order, no shuffling before the split) stay
# in the training set; the remaining 10% become the validation set.
n_train = int(0.9 * len(train_samples))
train_samples, valid_samples = train_samples[:n_train], train_samples[n_train:]

# Training batches are drawn in shuffled order; validation pairs are wrapped in
# an evaluator that is registered under the name 'sts-valid'.
train_dataloader = DataLoader(train_samples, batch_size=args.batch_size, shuffle=True)
valid_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    valid_samples,
    name='sts-valid',
    batch_size=args.batch_size,
)

2022-03-14 13:46:31 - Read KorSTS train/dev dataset


In [14]:
# Training objective: sentence-transformers' CosineSimilarityLoss on the model.
train_loss = losses.CosineSimilarityLoss(model=model)

# Warm-up length: 10% of all training steps (batches per epoch x number of
# epochs), rounded up to a whole step.
total_train_steps = len(train_dataloader) * args.num_epochs
warmup_steps = math.ceil(total_train_steps * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

2022-03-14 13:46:51 - Warmup-steps: 657


In [16]:
# Fine-tune the model on the training objective for args.num_epochs epochs,
# running the validation evaluator every 1000 steps and writing results under
# model_save_path.
# NOTE(review): the recorded logs show a save after most but not all
# evaluations, presumably only when the validation score improves; confirm
# against fit()'s save_best_model default.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=args.num_epochs,
    warmup_steps=warmup_steps,
    evaluator=valid_evaluator,
    evaluation_steps=1000,
    output_path=model_save_path,
    show_progress_bar=True,
)



Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1313 [00:00<?, ?it/s]

2022-03-14 13:52:25 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-valid dataset in epoch 0 after 1000 steps:
2022-03-14 13:52:33 - Cosine-Similarity :	Pearson: 0.9488	Spearman: 0.9044
2022-03-14 13:52:33 - Manhattan-Distance:	Pearson: 0.9404	Spearman: 0.9004
2022-03-14 13:52:33 - Euclidean-Distance:	Pearson: 0.9405	Spearman: 0.9003
2022-03-14 13:52:33 - Dot-Product-Similarity:	Pearson: 0.9271	Spearman: 0.8785
2022-03-14 13:52:33 - Save model to output/kor_sts_klue-roberta-large-2022-03-14_13-43-19
2022-03-14 13:54:14 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-valid dataset after epoch 0:
2022-03-14 13:54:22 - Cosine-Similarity :	Pearson: 0.9553	Spearman: 0.9111
2022-03-14 13:54:22 - Manhattan-Distance:	Pearson: 0.9501	Spearman: 0.9098
2022-03-14 13:54:22 - Euclidean-Distance:	Pearson: 0.9499	Spearman: 0.9102
2022-03-14 13:54:22 - Dot-Product-Similarity:	Pearson: 0.9396	Spearman: 0.8869
2022-03-14 13:54:22 - Save model to output/kor_sts_klue-roberta-large-

Iteration:   0%|          | 0/1313 [00:00<?, ?it/s]

2022-03-14 13:59:29 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-valid dataset in epoch 1 after 1000 steps:
2022-03-14 13:59:38 - Cosine-Similarity :	Pearson: 0.9606	Spearman: 0.9161
2022-03-14 13:59:38 - Manhattan-Distance:	Pearson: 0.9511	Spearman: 0.9106
2022-03-14 13:59:38 - Euclidean-Distance:	Pearson: 0.9517	Spearman: 0.9117
2022-03-14 13:59:38 - Dot-Product-Similarity:	Pearson: 0.9365	Spearman: 0.8821
2022-03-14 13:59:38 - Save model to output/kor_sts_klue-roberta-large-2022-03-14_13-43-19
2022-03-14 14:01:16 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-valid dataset after epoch 1:
2022-03-14 14:01:25 - Cosine-Similarity :	Pearson: 0.9614	Spearman: 0.9210
2022-03-14 14:01:25 - Manhattan-Distance:	Pearson: 0.9509	Spearman: 0.9131
2022-03-14 14:01:25 - Euclidean-Distance:	Pearson: 0.9512	Spearman: 0.9131
2022-03-14 14:01:25 - Dot-Product-Similarity:	Pearson: 0.9344	Spearman: 0.8854
2022-03-14 14:01:25 - Save model to output/kor_sts_klue-roberta-large-

Iteration:   0%|          | 0/1313 [00:00<?, ?it/s]

2022-03-14 14:06:30 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-valid dataset in epoch 2 after 1000 steps:
2022-03-14 14:06:38 - Cosine-Similarity :	Pearson: 0.9625	Spearman: 0.9229
2022-03-14 14:06:38 - Manhattan-Distance:	Pearson: 0.9501	Spearman: 0.9104
2022-03-14 14:06:38 - Euclidean-Distance:	Pearson: 0.9501	Spearman: 0.9101
2022-03-14 14:06:38 - Dot-Product-Similarity:	Pearson: 0.9382	Spearman: 0.8947
2022-03-14 14:06:38 - Save model to output/kor_sts_klue-roberta-large-2022-03-14_13-43-19
2022-03-14 14:08:18 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-valid dataset after epoch 2:
2022-03-14 14:08:26 - Cosine-Similarity :	Pearson: 0.9626	Spearman: 0.9216
2022-03-14 14:08:26 - Manhattan-Distance:	Pearson: 0.9521	Spearman: 0.9116
2022-03-14 14:08:26 - Euclidean-Distance:	Pearson: 0.9520	Spearman: 0.9113
2022-03-14 14:08:26 - Dot-Product-Similarity:	Pearson: 0.9378	Spearman: 0.8897


Iteration:   0%|          | 0/1313 [00:00<?, ?it/s]

2022-03-14 14:13:27 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-valid dataset in epoch 3 after 1000 steps:
2022-03-14 14:13:35 - Cosine-Similarity :	Pearson: 0.9649	Spearman: 0.9266
2022-03-14 14:13:35 - Manhattan-Distance:	Pearson: 0.9546	Spearman: 0.9179
2022-03-14 14:13:35 - Euclidean-Distance:	Pearson: 0.9545	Spearman: 0.9179
2022-03-14 14:13:35 - Dot-Product-Similarity:	Pearson: 0.9442	Spearman: 0.8942
2022-03-14 14:13:35 - Save model to output/kor_sts_klue-roberta-large-2022-03-14_13-43-19
2022-03-14 14:15:16 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-valid dataset after epoch 3:
2022-03-14 14:15:24 - Cosine-Similarity :	Pearson: 0.9650	Spearman: 0.9261
2022-03-14 14:15:24 - Manhattan-Distance:	Pearson: 0.9535	Spearman: 0.9180
2022-03-14 14:15:24 - Euclidean-Distance:	Pearson: 0.9534	Spearman: 0.9177
2022-03-14 14:15:24 - Dot-Product-Similarity:	Pearson: 0.9433	Spearman: 0.8951


Iteration:   0%|          | 0/1313 [00:00<?, ?it/s]

2022-03-14 14:20:26 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-valid dataset in epoch 4 after 1000 steps:
2022-03-14 14:20:34 - Cosine-Similarity :	Pearson: 0.9652	Spearman: 0.9270
2022-03-14 14:20:34 - Manhattan-Distance:	Pearson: 0.9547	Spearman: 0.9188
2022-03-14 14:20:34 - Euclidean-Distance:	Pearson: 0.9545	Spearman: 0.9185
2022-03-14 14:20:34 - Dot-Product-Similarity:	Pearson: 0.9443	Spearman: 0.8961
2022-03-14 14:20:34 - Save model to output/kor_sts_klue-roberta-large-2022-03-14_13-43-19
2022-03-14 14:22:14 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-valid dataset after epoch 4:
2022-03-14 14:22:22 - Cosine-Similarity :	Pearson: 0.9652	Spearman: 0.9273
2022-03-14 14:22:22 - Manhattan-Distance:	Pearson: 0.9545	Spearman: 0.9189
2022-03-14 14:22:22 - Euclidean-Distance:	Pearson: 0.9543	Spearman: 0.9186
2022-03-14 14:22:22 - Dot-Product-Similarity:	Pearson: 0.9445	Spearman: 0.8965
2022-03-14 14:22:22 - Save model to output/kor_sts_klue-roberta-large-

In [17]:
# Reload the model that training wrote to model_save_path and score it on the
# KLUE-STS dev file, which this notebook uses as its held-out test set.
model = SentenceTransformer(model_save_path)
logging.info("Read KorSTS benchmark test dataset")
# Relative path: resolves against the dataset folder the notebook changed into earlier.
test_file_1 = './klue-sts-v1.1_dev.json'
test_samples_1 = load_kor_sts_samples(test_file_1)
test_evaluator_1 = EmbeddingSimilarityEvaluator.from_input_examples(test_samples_1, name='sts-test')
# Last expression of the cell, so the score returned by the evaluator is shown as the cell output.
test_evaluator_1(model, output_path=model_save_path)

2022-03-14 14:24:20 - Load pretrained SentenceTransformer: output/kor_sts_klue-roberta-large-2022-03-14_13-43-19
2022-03-14 14:24:24 - Use pytorch device: cuda
2022-03-14 14:24:24 - Read KorSTS benchmark test dataset
2022-03-14 14:24:24 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-test dataset:
2022-03-14 14:24:28 - Cosine-Similarity :	Pearson: 0.8906	Spearman: 0.8909
2022-03-14 14:24:28 - Manhattan-Distance:	Pearson: 0.8843	Spearman: 0.8886
2022-03-14 14:24:28 - Euclidean-Distance:	Pearson: 0.8841	Spearman: 0.8888
2022-03-14 14:24:28 - Dot-Product-Similarity:	Pearson: 0.8533	Spearman: 0.8492


0.8908901088169193

In [20]:
!pip install datasets

Collecting datasets
  Downloading datasets-1.18.4-py3-none-any.whl (312 kB)
[?25l[K     |█                               | 10 kB 36.3 MB/s eta 0:00:01[K     |██                              | 20 kB 20.1 MB/s eta 0:00:01[K     |███▏                            | 30 kB 10.9 MB/s eta 0:00:01[K     |████▏                           | 40 kB 9.0 MB/s eta 0:00:01[K     |█████▎                          | 51 kB 3.4 MB/s eta 0:00:01[K     |██████▎                         | 61 kB 4.0 MB/s eta 0:00:01[K     |███████▍                        | 71 kB 4.2 MB/s eta 0:00:01[K     |████████▍                       | 81 kB 4.1 MB/s eta 0:00:01[K     |█████████▌                      | 92 kB 4.6 MB/s eta 0:00:01[K     |██████████▌                     | 102 kB 3.9 MB/s eta 0:00:01[K     |███████████▌                    | 112 kB 3.9 MB/s eta 0:00:01[K     |████████████▋                   | 122 kB 3.9 MB/s eta 0:00:01[K     |█████████████▋                  | 133 kB 3.9 MB/s eta 0:00:01

In [21]:
from torch.utils.data import DataLoader
from datasets import load_dataset

In [23]:
# Score the fine-tuned model on the test split of the KorNLU STS dataset.
logging.info("Read KorNLU benchmark test dataset")
test_file_2 = load_dataset("kor_nlu", "sts")

test_samples_2 = []
for example in test_file_2["test"]:
    # Skip rows where either sentence is missing or empty. The append must live
    # inside this guard: outside it, a skipped row would re-append the previous
    # example (or raise NameError if the very first row is the incomplete one).
    if example["sentence1"] and example["sentence2"]:
        score = float(example["score"]) / 5.0  # scale the score by 1/5, as done for the training labels
        inp_example = InputExample(
            texts=[example["sentence1"], example["sentence2"]],
            label=score,
        )
        test_samples_2.append(inp_example)

# NOTE(review): this evaluator reuses the name 'sts-test' and the same
# output_path as the KLUE dev evaluation above; if the evaluator derives its
# results file name from `name`, both runs land in one file. Confirm, and
# rename if the results need to stay separate.
test_evaluator_2 = EmbeddingSimilarityEvaluator.from_input_examples(test_samples_2, name='sts-test')
# Last expression of the cell, so the returned score is shown as the cell output.
test_evaluator_2(model, output_path=model_save_path)

2022-03-14 14:25:55 - Read KorNLU benchmark test dataset
2022-03-14 14:25:56 - Reusing dataset kor_nlu (/root/.cache/huggingface/datasets/kor_nlu/sts/1.0.0/4facbba77df60b0658056ced2052633e681a50187b9428bd5752ebd59d332ba8)


  0%|          | 0/3 [00:00<?, ?it/s]

2022-03-14 14:25:57 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-test dataset:
2022-03-14 14:26:04 - Cosine-Similarity :	Pearson: 0.7708	Spearman: 0.7617
2022-03-14 14:26:04 - Manhattan-Distance:	Pearson: 0.7425	Spearman: 0.7493
2022-03-14 14:26:04 - Euclidean-Distance:	Pearson: 0.7422	Spearman: 0.7488
2022-03-14 14:26:04 - Dot-Product-Similarity:	Pearson: 0.7437	Spearman: 0.7358


0.7616980050439768