<a href="https://colab.research.google.com/github/taehyeonk/pre-onboarding_project/blob/jina/220314_3_sub3_roberta_large_devO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 3.3 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 23.1 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 65.0 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 6.7 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 36.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (

In [2]:
import argparse
import logging
import math
import os
import random
from datetime import datetime
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, LoggingHandler, models, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader
import json
from sentence_transformers.readers import InputExample

In [5]:
# Parse command line arguments
# Each entry is (flag, value type, default); the defaults act as this notebook's configuration.
parser = argparse.ArgumentParser()
for _flag, _type, _default in (
    ("--model_name_or_path", str, "klue/roberta-large"),
    ("--max_seq_length", int, 128),
    ("--batch_size", int, 8),
    ("--num_epochs", int, 5),
    ("--output_dir", str, "output"),
    ("--output_prefix", str, "kor_sts_"),
    ("--seed", int, 777),
):
    parser.add_argument(_flag, type=_type, default=_default)
# An explicit empty argument list: every option takes its default and sys.argv is never consulted.
args = parser.parse_args([])

In [6]:
# Fix random seed: Python, NumPy, PyTorch (CPU) and PyTorch (CUDA) all receive the configured value
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(args.seed)

# Configure logger: INFO and above, timestamped, emitted through sentence_transformers' LoggingHandler
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)

# Build the output directory for this run:
#   <output_dir>/<output_prefix><model name with "/" replaced by "-">-<timestamp of this cell's execution>
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
run_name = args.output_prefix + args.model_name_or_path.replace("/", "-") + '-' + run_timestamp
model_save_path = os.path.join(args.output_dir, run_name)

In [7]:
# Define SentenceTransformer model: a transformer encoder followed by mean pooling over its token embeddings
word_embedding_model = models.Transformer(args.model_name_or_path, max_seq_length=args.max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)
# Use the GPU when one is available; otherwise fall back to CPU instead of failing on a runtime without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer(modules=[word_embedding_model, pooling_model], device=device)

Downloading:   0%|          | 0.00/547 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it f

Downloading:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/734k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/173 [00:00<?, ?B/s]

In [9]:
def load_kor_sts_samples(filename):
    """Load sentence pairs from an STS JSON file as ``InputExample`` objects.

    The file must hold a JSON array of objects, each providing the keys
    ``sentence1``, ``sentence2`` and ``labels`` -> ``label``.

    Args:
        filename: Path to the JSON file.

    Returns:
        A list with one ``InputExample`` per row, where ``texts`` is
        ``[sentence1, sentence2]`` and ``label`` is the row's label divided by 5.0.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a row lacks one of the expected keys.
    """
    # Decode explicitly as UTF-8 so the Korean text is read the same way on every platform,
    # independent of the locale's default encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    samples = []
    for row in json_data:
        # Normalize score to range 0 ... 1 (assumes the raw labels are on a 0-5 scale)
        score = float(row['labels']['label']) / 5.0
        samples.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=score))
    return samples

In [10]:
# Colab-only step: mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [11]:
# Switch the working directory to the klue-sts-v1.1 dataset folder on the mounted Drive.
%cd /content/drive/MyDrive/Colab Notebooks/klue-sts-v1.1

/content/drive/MyDrive/Colab Notebooks/klue-sts-v1.1


In [25]:
# Read the dataset
logging.info("Read KorSTS train/dev dataset")

# Resolve both splits from one directory so that loading does not depend on the
# current working directory (previously the dev path was relative, the train path absolute).
data_dir = '/content/drive/MyDrive/Colab Notebooks/klue-sts-v1.1'

train_file = os.path.join(data_dir, 'klue-sts-v1.1_train.json')
train_samples = load_kor_sts_samples(train_file)

valid_file = os.path.join(data_dir, 'klue-sts-v1.1_dev.json')
valid_samples = load_kor_sts_samples(valid_file)

# Training batches are reshuffled by the DataLoader; the dev split is wrapped in an
# evaluator named 'sts-valid' that uses the same batch size.
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=args.batch_size)
valid_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(valid_samples, batch_size=args.batch_size,
                                                                 name='sts-valid')

2022-03-14 14:29:48 - Read KorSTS train/dev dataset


In [26]:
# Training objective: cosine-similarity loss attached to the sentence-embedding model
train_loss = losses.CosineSimilarityLoss(model=model)

# Configure the training.
# Warm-up covers 10% of all training steps (batches per epoch x number of epochs), rounded up.
total_train_steps = len(train_dataloader) * args.num_epochs
warmup_steps = math.ceil(total_train_steps * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

2022-03-14 14:29:54 - Warmup-steps: 730


In [27]:
# Train the model
# The validation evaluator runs every 1000 steps inside an epoch (the captured log also shows a run
# after each epoch); output is written under model_save_path.
# NOTE(review): the captured log shows "Save model" after only some evaluations, so the in-memory
# `model` after fit() may differ from the checkpoint on disk — confirm which one later cells should use.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=args.num_epochs,
    warmup_steps=warmup_steps,
    evaluator=valid_evaluator,
    evaluation_steps=1000,
    output_path=model_save_path,
    show_progress_bar=True,
)



Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1459 [00:00<?, ?it/s]

2022-03-14 14:35:01 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-valid dataset in epoch 0 after 1000 steps:
2022-03-14 14:35:04 - Cosine-Similarity :	Pearson: 0.8731	Spearman: 0.8706
2022-03-14 14:35:04 - Manhattan-Distance:	Pearson: 0.8642	Spearman: 0.8668
2022-03-14 14:35:04 - Euclidean-Distance:	Pearson: 0.8646	Spearman: 0.8672
2022-03-14 14:35:04 - Dot-Product-Similarity:	Pearson: 0.8179	Spearman: 0.8133
2022-03-14 14:35:04 - Save model to output/kor_sts_klue-roberta-large-2022-03-14_13-43-19
2022-03-14 14:37:28 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-valid dataset after epoch 0:
2022-03-14 14:37:32 - Cosine-Similarity :	Pearson: 0.8771	Spearman: 0.8806
2022-03-14 14:37:32 - Manhattan-Distance:	Pearson: 0.8723	Spearman: 0.8803
2022-03-14 14:37:32 - Euclidean-Distance:	Pearson: 0.8726	Spearman: 0.8799
2022-03-14 14:37:32 - Dot-Product-Similarity:	Pearson: 0.8269	Spearman: 0.8281
2022-03-14 14:37:32 - Save model to output/kor_sts_klue-roberta-large-

Iteration:   0%|          | 0/1459 [00:00<?, ?it/s]

2022-03-14 14:42:40 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-valid dataset in epoch 1 after 1000 steps:
2022-03-14 14:42:43 - Cosine-Similarity :	Pearson: 0.8846	Spearman: 0.8827
2022-03-14 14:42:43 - Manhattan-Distance:	Pearson: 0.8782	Spearman: 0.8817
2022-03-14 14:42:43 - Euclidean-Distance:	Pearson: 0.8787	Spearman: 0.8821
2022-03-14 14:42:43 - Dot-Product-Similarity:	Pearson: 0.8109	Spearman: 0.8051
2022-03-14 14:42:43 - Save model to output/kor_sts_klue-roberta-large-2022-03-14_13-43-19
2022-03-14 14:45:08 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-valid dataset after epoch 1:
2022-03-14 14:45:12 - Cosine-Similarity :	Pearson: 0.8880	Spearman: 0.8896
2022-03-14 14:45:12 - Manhattan-Distance:	Pearson: 0.8810	Spearman: 0.8842
2022-03-14 14:45:12 - Euclidean-Distance:	Pearson: 0.8806	Spearman: 0.8844
2022-03-14 14:45:12 - Dot-Product-Similarity:	Pearson: 0.8157	Spearman: 0.8111
2022-03-14 14:45:12 - Save model to output/kor_sts_klue-roberta-large-

Iteration:   0%|          | 0/1459 [00:00<?, ?it/s]

2022-03-14 14:50:18 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-valid dataset in epoch 2 after 1000 steps:
2022-03-14 14:50:22 - Cosine-Similarity :	Pearson: 0.8916	Spearman: 0.8910
2022-03-14 14:50:22 - Manhattan-Distance:	Pearson: 0.8708	Spearman: 0.8742
2022-03-14 14:50:22 - Euclidean-Distance:	Pearson: 0.8718	Spearman: 0.8759
2022-03-14 14:50:22 - Dot-Product-Similarity:	Pearson: 0.8191	Spearman: 0.8166
2022-03-14 14:50:22 - Save model to output/kor_sts_klue-roberta-large-2022-03-14_13-43-19
2022-03-14 14:52:45 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-valid dataset after epoch 2:
2022-03-14 14:52:49 - Cosine-Similarity :	Pearson: 0.8977	Spearman: 0.8979
2022-03-14 14:52:49 - Manhattan-Distance:	Pearson: 0.8849	Spearman: 0.8871
2022-03-14 14:52:49 - Euclidean-Distance:	Pearson: 0.8854	Spearman: 0.8878
2022-03-14 14:52:49 - Dot-Product-Similarity:	Pearson: 0.8372	Spearman: 0.8344
2022-03-14 14:52:49 - Save model to output/kor_sts_klue-roberta-large-

Iteration:   0%|          | 0/1459 [00:00<?, ?it/s]

2022-03-14 14:57:55 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-valid dataset in epoch 3 after 1000 steps:
2022-03-14 14:57:59 - Cosine-Similarity :	Pearson: 0.8899	Spearman: 0.8902
2022-03-14 14:57:59 - Manhattan-Distance:	Pearson: 0.8756	Spearman: 0.8800
2022-03-14 14:57:59 - Euclidean-Distance:	Pearson: 0.8756	Spearman: 0.8805
2022-03-14 14:57:59 - Dot-Product-Similarity:	Pearson: 0.8161	Spearman: 0.8156
2022-03-14 15:00:16 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-valid dataset after epoch 3:
2022-03-14 15:00:19 - Cosine-Similarity :	Pearson: 0.8914	Spearman: 0.8910
2022-03-14 15:00:19 - Manhattan-Distance:	Pearson: 0.8782	Spearman: 0.8831
2022-03-14 15:00:19 - Euclidean-Distance:	Pearson: 0.8780	Spearman: 0.8830
2022-03-14 15:00:19 - Dot-Product-Similarity:	Pearson: 0.8246	Spearman: 0.8235


Iteration:   0%|          | 0/1459 [00:00<?, ?it/s]

2022-03-14 15:05:20 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-valid dataset in epoch 4 after 1000 steps:
2022-03-14 15:05:23 - Cosine-Similarity :	Pearson: 0.8934	Spearman: 0.8934
2022-03-14 15:05:23 - Manhattan-Distance:	Pearson: 0.8790	Spearman: 0.8845
2022-03-14 15:05:23 - Euclidean-Distance:	Pearson: 0.8788	Spearman: 0.8844
2022-03-14 15:05:23 - Dot-Product-Similarity:	Pearson: 0.8250	Spearman: 0.8232
2022-03-14 15:07:42 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-valid dataset after epoch 4:
2022-03-14 15:07:46 - Cosine-Similarity :	Pearson: 0.8932	Spearman: 0.8933
2022-03-14 15:07:46 - Manhattan-Distance:	Pearson: 0.8793	Spearman: 0.8846
2022-03-14 15:07:46 - Euclidean-Distance:	Pearson: 0.8790	Spearman: 0.8845
2022-03-14 15:07:46 - Dot-Product-Similarity:	Pearson: 0.8260	Spearman: 0.8243


In [20]:
!pip install datasets

Collecting datasets
  Downloading datasets-1.18.4-py3-none-any.whl (312 kB)
[?25l[K     |█                               | 10 kB 36.3 MB/s eta 0:00:01[K     |██                              | 20 kB 20.1 MB/s eta 0:00:01[K     |███▏                            | 30 kB 10.9 MB/s eta 0:00:01[K     |████▏                           | 40 kB 9.0 MB/s eta 0:00:01[K     |█████▎                          | 51 kB 3.4 MB/s eta 0:00:01[K     |██████▎                         | 61 kB 4.0 MB/s eta 0:00:01[K     |███████▍                        | 71 kB 4.2 MB/s eta 0:00:01[K     |████████▍                       | 81 kB 4.1 MB/s eta 0:00:01[K     |█████████▌                      | 92 kB 4.6 MB/s eta 0:00:01[K     |██████████▌                     | 102 kB 3.9 MB/s eta 0:00:01[K     |███████████▌                    | 112 kB 3.9 MB/s eta 0:00:01[K     |████████████▋                   | 122 kB 3.9 MB/s eta 0:00:01[K     |█████████████▋                  | 133 kB 3.9 MB/s eta 0:00:01

In [21]:
from torch.utils.data import DataLoader
from datasets import load_dataset

In [28]:
# Evaluate the trained model on the test split of the KorNLU STS benchmark
# (a different dataset from the KLUE-STS files used for training and validation above).
logging.info("Read KorNLU benchmark test dataset")
test_file_2 = load_dataset("kor_nlu", "sts")
test_samples_2 = []
for example in test_file_2["test"]:
    # Skip pairs with a missing or empty sentence. The append must stay inside this check:
    # outside it, a skipped row would re-append the previous example (or raise NameError
    # if the very first row were skipped).
    if example["sentence1"] and example["sentence2"]:
        score = float(example["score"]) / 5.0  # same 1/5 scaling as the training labels
        inp_example = InputExample(
            texts=[example["sentence1"], example["sentence2"]],
            label=score,
        )
        test_samples_2.append(inp_example)

# Calling the evaluator returns its score, which the notebook shows as the cell's output;
# output_path tells the evaluator where to write its results.
test_evaluator_2 = EmbeddingSimilarityEvaluator.from_input_examples(test_samples_2, name='sts-test')
test_evaluator_2(model, output_path=model_save_path)

2022-03-14 15:07:54 - Read KorNLU benchmark test dataset
2022-03-14 15:07:56 - Reusing dataset kor_nlu (/root/.cache/huggingface/datasets/kor_nlu/sts/1.0.0/4facbba77df60b0658056ced2052633e681a50187b9428bd5752ebd59d332ba8)


  0%|          | 0/3 [00:00<?, ?it/s]

2022-03-14 15:07:56 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-test dataset:
2022-03-14 15:08:03 - Cosine-Similarity :	Pearson: 0.7521	Spearman: 0.7441
2022-03-14 15:08:03 - Manhattan-Distance:	Pearson: 0.7085	Spearman: 0.7151
2022-03-14 15:08:03 - Euclidean-Distance:	Pearson: 0.7078	Spearman: 0.7142
2022-03-14 15:08:03 - Dot-Product-Similarity:	Pearson: 0.7276	Spearman: 0.7235


0.7440933403628609