In [1]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 3.0 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 31.1 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 53.9 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 6.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 65.1 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (

In [2]:
import argparse
import logging
import math
import os
import random
from datetime import datetime
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, LoggingHandler, models, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader
import json
from sentence_transformers.readers import InputExample
from torch.nn.functional import cosine_similarity

In [3]:
# Parse command line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--model_name_or_path", type=str, default = "klue/roberta-base")
parser.add_argument("--max_seq_length", type=int, default=64) # test defalut : 128
parser.add_argument("--batch_size", type=int, default=8) # test defalut : 8
parser.add_argument("--num_epochs", type=int, default=5) # test defalut : 5
parser.add_argument("--output_dir", type=str, default="output") 
parser.add_argument("--output_prefix", type=str, default="kor_sts_") 
parser.add_argument("--seed", type=int, default=777)
args = parser.parse_args("")

In [4]:
# Fix random seed
seed=777
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)

# Configure logger
logging.basicConfig(
    format='%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO, handlers=[LoggingHandler()]
)

# Read the dataset
model_save_path = os.path.join(
    args.output_dir,
    args.output_prefix + args.model_name_or_path.replace("/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)

In [5]:
# Define SentenceTransformer model
word_embedding_model = models.Transformer(args.model_name_or_path, max_seq_length=args.max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model], device = 'cuda')

Downloading:   0%|          | 0.00/546 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for

Downloading:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/734k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/173 [00:00<?, ?B/s]

## Text Preprocessing

In [6]:
import re
def clean_text(text):
    review = re.sub(r'\s+', ' ', str(text))  #과다한 띄어쓰기를 한번의 띄어쓰기로 변경
    review = review.lower() #영어 일단 소문자화
    review = re.sub(r'<[^>]+>','',review) #Html tags 제거
    review = re.sub(r'[^ A-Za-z0-9가-힣+]','',review) #특수문자 제거
    review = re.sub(r'\s+', ' ', review) #과다한 띄어쓰기를 한번의 띄어쓰기로 변경22
    review = re.sub(r"^\s+", '', review) #띄어쓰기로 시작하는 것 제거
    review = re.sub(r'\s+$', '', review) #띄어쓰기로 끝나는 것 제거
    return review

In [7]:
def load_kor_sts_samples(filename):
    samples = []
    with open(filename,'r') as f:
        json_data = json.load(f)
    for row in json_data:
        score = float(row['labels']['real-label']) / 5.0  # Normalize score to range 0 ... 1
        samples.append(InputExample(texts=[clean_text(row['sentence1']), clean_text(row['sentence2'])], label=score))
    return samples

In [9]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

Mounted at /gdrive
/gdrive


In [10]:
%cd /gdrive/MyDrive/github/기업과제/STS

/gdrive/MyDrive/github/기업과제/STS


In [11]:
# Read the dataset
logging.info("Read KorSTS train/dev dataset")

train_file = '/gdrive/MyDrive/github/기업과제/STS/klue-sts-v1.1/klue-sts-v1.1_train.json'
train_samples = load_kor_sts_samples(train_file)
random.shuffle(train_samples)

n_train = int(0.9 * len(train_samples))
valid_samples = train_samples[n_train:]
train_samples = train_samples[:n_train]

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=args.batch_size)
valid_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(valid_samples, batch_size=args.batch_size,
                                                                 name='sts-valid')

2022-03-23 08:17:41 - Read KorSTS train/dev dataset


In [12]:
train_loss = losses.CosineSimilarityLoss(model=model)

# Configure the training.
warmup_steps = math.ceil(len(train_dataloader) * args.num_epochs * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

2022-03-23 08:17:42 - Warmup-steps: 657


## Train the model

In [13]:
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=valid_evaluator,
          epochs=args.num_epochs,
          show_progress_bar = False,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)



2022-03-23 08:19:34 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-valid dataset in epoch 0 after 1000 steps:
2022-03-23 08:19:37 - Cosine-Similarity :	Pearson: 0.9477	Spearman: 0.9048
2022-03-23 08:19:37 - Manhattan-Distance:	Pearson: 0.9431	Spearman: 0.9067
2022-03-23 08:19:37 - Euclidean-Distance:	Pearson: 0.9436	Spearman: 0.9073
2022-03-23 08:19:37 - Dot-Product-Similarity:	Pearson: 0.9363	Spearman: 0.8856
2022-03-23 08:19:37 - Save model to output/kor_sts_klue-roberta-base-2022-03-23_08-16-57
2022-03-23 08:20:09 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-valid dataset after epoch 0:
2022-03-23 08:20:12 - Cosine-Similarity :	Pearson: 0.9537	Spearman: 0.9086
2022-03-23 08:20:12 - Manhattan-Distance:	Pearson: 0.9497	Spearman: 0.9112
2022-03-23 08:20:12 - Euclidean-Distance:	Pearson: 0.9498	Spearman: 0.9115
2022-03-23 08:20:12 - Dot-Product-Similarity:	Pearson: 0.9391	Spearman: 0.8849
2022-03-23 08:20:12 - Save model to output/kor_sts_klue-roberta-base-20

## Evaluate the model

In [15]:
model = SentenceTransformer(model_save_path)
logging.info("Read KorSTS benchmark test dataset")
test_file = '/gdrive/MyDrive/github/기업과제/STS/klue-sts-v1.1/klue-sts-v1.1_dev.json'
test_samples = load_kor_sts_samples(test_file)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
test_evaluator(model, output_path=model_save_path)

2022-03-23 08:31:07 - Load pretrained SentenceTransformer: output/kor_sts_klue-roberta-base-2022-03-23_08-16-57
2022-03-23 08:31:08 - Use pytorch device: cuda
2022-03-23 08:31:08 - Read KorSTS benchmark test dataset
2022-03-23 08:31:08 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-test dataset:
2022-03-23 08:31:10 - Cosine-Similarity :	Pearson: 0.8911	Spearman: 0.8899
2022-03-23 08:31:10 - Manhattan-Distance:	Pearson: 0.8919	Spearman: 0.8922
2022-03-23 08:31:10 - Euclidean-Distance:	Pearson: 0.8927	Spearman: 0.8931
2022-03-23 08:31:10 - Dot-Product-Similarity:	Pearson: 0.8626	Spearman: 0.8566


0.8931025170161075

## model을 dev set 에 적용시켜서 예측값 추출

In [16]:
test_file = '/gdrive/MyDrive/github/기업과제/STS/klue-sts-v1.1/klue-sts-v1.1_dev.json'
guid = []
sentence1 = []
sentence2 = []
real_label = []
binary_label = []

with open(test_file,'r') as f:
    json_data = json.load(f)
for row in json_data:
    sentence1.append(row['sentence1'])
    sentence2.append(row['sentence2'])
    real_label.append(row['labels']['real-label'])
    binary_label.append(row['labels']['binary-label'])
    guid.append(row['guid'])

In [17]:
s1_features = model.encode(sentence1)

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

In [18]:
s2_features = model.encode(sentence2)

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

In [19]:
# 코사인 유사도 (범위:-1 ~ 1)
sim = cosine_similarity(torch.tensor(s1_features), torch.tensor(s2_features))

In [20]:
# 구해진 코사인 유사도를 0 ~ 5의 범위로 변환
label_pred = torch.clip(sim, min=0) * 5  

In [21]:
label_pred

tensor([4.7394, 2.5282, 3.3430, 3.6475, 2.7304, 1.3242, 3.7131, 1.6617, 0.0000,
        1.4399, 4.1134, 3.3471, 2.4219, 2.7468, 4.7588, 3.6436, 4.4245, 1.6065,
        0.2548, 2.1871, 3.4664, 4.1057, 3.7551, 3.4387, 0.4983, 3.5706, 3.7941,
        4.0196, 0.7012, 3.9511, 3.1444, 0.5326, 0.9925, 3.5826, 4.7864, 2.5246,
        3.5521, 3.9364, 4.8948, 3.7187, 3.9863, 4.4703, 1.7539, 3.2993, 2.0320,
        4.2531, 4.4762, 4.7563, 1.5741, 4.2718, 0.4650, 4.5926, 3.4245, 1.2957,
        3.1619, 1.1850, 1.5734, 2.6390, 3.6672, 3.2896, 3.2360, 1.9923, 3.6838,
        2.2362, 3.6819, 4.2459, 1.4836, 1.1237, 4.9139, 4.4944, 1.3865, 1.4903,
        3.7975, 4.2184, 2.8237, 4.5440, 0.2208, 4.4359, 4.0518, 3.9639, 3.2459,
        2.3073, 4.3550, 1.8643, 3.3559, 3.9937, 4.6215, 3.6816, 0.5096, 4.1085,
        0.4595, 4.1725, 4.7838, 2.7104, 4.5960, 3.9474, 2.8850, 3.1688, 3.6551,
        0.8736, 0.2113, 4.2136, 4.0253, 2.5278, 3.5729, 2.5509, 1.2338, 3.4571,
        3.3747, 2.8201, 1.7209, 1.8663, 

In [22]:
# 0~5 범위 중 3 이상이면 1, 아니면 0 으로 변환
bin_pred = torch.where(label_pred >= 3, 1, 0) 

In [23]:
bin_pred

tensor([1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1,
        0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
        0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
        1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1,
        0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1,
        1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0,
        0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
        0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
        0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
        0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1,
        1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
        0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0,

## 예측값을 csv로 추출

In [24]:
import pandas as pd
predict_label = pd.DataFrame({'guid' : guid,
                              'true_real_label' : real_label,
                              'true_binary_label' : binary_label,
                              'predict_real_label' : label_pred,
                              'predict_binary_label' : bin_pred})

In [25]:
csv_output_filename = '기업과제3_5팀_dev_set_score.csv'
predict_label.to_csv(csv_output_filename, index = False)