## 시작

In [1]:
!pip install torch_optimizer

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting torch_optimizer
  Downloading torch_optimizer-0.3.0-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 7.3 MB/s  eta 0:00:01
[?25hCollecting pytorch-ranger>=0.1.1
  Downloading pytorch_ranger-0.1.1-py3-none-any.whl (14 kB)
Installing collected packages: pytorch-ranger, torch-optimizer
Successfully installed pytorch-ranger-0.1.1 torch-optimizer-0.3.0
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
# ------ LIBRARY -------#
import numpy as np
import os
import pickle
import sys
import pandas as pd
import re
import cv2
# torch
import torch
import torch.cuda.amp as amp
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
from torch.utils.data.sampler import *

import torch.nn as nn
import torch.nn.functional as F

from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau, MultiStepLR, OneCycleLR
#

import math
import torch
from torch.optim.optimizer import Optimizer, required
import torch_optimizer as optim
from collections import defaultdict
import itertools as it

import tqdm
import random
#import time
import matplotlib.pyplot as plt
from timeit import default_timer as timer
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import torch
import transformers

# transformer
from transformers import XLMPreTrainedModel, XLMRobertaModel, XLMRobertaConfig, XLMRobertaTokenizer
from transformers import XLMRobertaForSequenceClassification, BertForSequenceClassification
from transformers import AutoTokenizer
from transformers import BertForSequenceClassification, DistilBertForSequenceClassification, XLNetForSequenceClassification,\
XLMRobertaForSequenceClassification, XLMForSequenceClassification, RobertaForSequenceClassification
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

In [46]:
# class args
class args:
    """Static experiment configuration, accessed as plain class attributes (``args.<name>``)."""

    # ---- run switches ---- #
    debug = False
    amp = True
    gpu = '0'

    # ---- training ---- #
    epochs = 5
    batch_size = 32
    weight_decay = 1e-6
    n_fold = 5
    fold = 3  # one of [0, 1, 2, 3, 4]; originally 3

    # ---- naming / model ---- #
    exp_name = 'model_f'
    dir_ = './saved_models/'
    pt = 'mz_model'
    max_len = 128

    # ---- learning rate ---- #
    start_lr = 1e-5  # other values noted by the author: 1e-3, 5e-5
    min_lr = 1e-6

    # ---- everything else ---- #
    num_workers = 8
    seed = 2021
    scheduler = None  # e.g. 'get_linear_schedule_with_warmup'


# Data directory path.
data_dir = './data'

# Expose only the configured GPU index to CUDA, then select the device
# (falls back to CPU when CUDA is not available).
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

##----------------
def set_seeds(seed=42):
    """Seed every random number generator used here and make cuDNN deterministic.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy and PyTorch
            (CPU, current CUDA device and all CUDA devices); it is also
            exported as the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Deterministic cuDNN kernels on; benchmark mode (which would trade
    # determinism for speed) stays off.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed all RNGs with the configured seed as soon as the configuration cell runs.
set_seeds(seed=args.seed)


In [47]:
# - util - #
def get_learning_rate(optimizer):
    """Return the learning rate of an optimizer that has exactly one param group.

    Args:
        optimizer: Object exposing ``param_groups``, an iterable of dicts
            that each carry an ``'lr'`` entry.

    Returns:
        The ``'lr'`` value of the single param group.

    Raises:
        AssertionError: If the optimizer does not have exactly one param group.
    """
    rates = [group['lr'] for group in optimizer.param_groups]
    assert len(rates) == 1  # only a single param group is supported
    return rates[0]

def load_data():
    """Load the train/test CSVs and attach stratified fold ids to the training rows.

    Reads ``./data/train.csv`` and ``./data/test.csv``.

    Returns:
        tuple: ``(train, test)`` where

            - ``train`` has the columns ``'문장'``, ``'극성'`` (label-encoded to
              integers), ``'fold'`` (index 0-4 of the stratified split in which
              the row is a validation row) and ``'id'`` (row position);
            - ``test`` has the single column ``'문장'``.
    """
    # Local imports keep the function self-contained (the notebook's import
    # cell brings in KFold but not StratifiedKFold).
    from sklearn.model_selection import StratifiedKFold
    from sklearn.preprocessing import LabelEncoder

    train = pd.read_csv('./data/train.csv')
    test = pd.read_csv('./data/test.csv')

    # Explicit copies: the column assignments below must not target a slice
    # of the raw frame (that is what triggers SettingWithCopyWarning).
    train = train[['문장', '극성']].copy()
    test = test[['문장']].copy()

    # Only the column that is kept needs encoding.
    train['극성'] = LabelEncoder().fit_transform(train['극성'].values)

    # skf.split yields positional indices, so folds are written by position
    # instead of through the label-based ``.loc``.
    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    fold_of_row = np.full(len(train), -1, dtype=np.int64)
    for n_fold, (_, v_idx) in enumerate(skf.split(train, train['극성'])):
        fold_of_row[v_idx] = n_fold
    train['fold'] = fold_of_row
    train['id'] = np.arange(len(train), dtype=np.int64)

    return train, test


# 전처리

In [5]:
# make KoBertTokenizer
import logging
import os
import unicodedata
from shutil import copyfile
 
from transformers import PreTrainedTokenizer
 
logger = logging.getLogger(__name__)

# Checkpoint names covered by the lookup tables below.
_KOBERT_CHECKPOINTS = ("monologg/kobert", "monologg/kobert-lm", "monologg/distilkobert")
_KOBERT_BASE_URL = "https://s3.amazonaws.com/models.huggingface.co/bert"

# File names of the SentencePiece model and of the plain-text vocabulary.
VOCAB_FILES_NAMES = dict(vocab_file="tokenizer_78b3253a26.model", vocab_txt="vocab.txt")

# For each vocabulary file kind: checkpoint name -> download URL.
PRETRAINED_VOCAB_FILES_MAP = {
    file_key: {name: f"{_KOBERT_BASE_URL}/{name}/{file_name}" for name in _KOBERT_CHECKPOINTS}
    for file_key, file_name in VOCAB_FILES_NAMES.items()
}

# Checkpoint name -> positional embedding size.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = dict.fromkeys(_KOBERT_CHECKPOINTS, 512)

# Checkpoint name -> default init kwargs (a separate dict per checkpoint).
PRETRAINED_INIT_CONFIGURATION = {name: {"do_lower_case": False} for name in _KOBERT_CHECKPOINTS}

# Marker character that KoBertTokenizer.convert_tokens_to_string replaces with a space.
SPIECE_UNDERLINE = '▁'
 
class KoBertTokenizer(PreTrainedTokenizer):
    """
    SentencePiece-based tokenizer for the KoBERT checkpoints.

    Text is split into pieces by a SentencePiece model (``vocab_file``); the
    pieces are mapped to ids through a separate plain-text vocabulary
    (``vocab_txt``: one token per line, the line position is the id).

    Requires `SentencePiece <https://github.com/google/sentencepiece>`_.
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
            self,
            vocab_file,
            vocab_txt,
            do_lower_case=False,
            remove_space=True,
            keep_accents=False,
            unk_token="[UNK]",
            sep_token="[SEP]",
            pad_token="[PAD]",
            cls_token="[CLS]",
            mask_token="[MASK]",
            **kwargs):
        """
        Args:
            vocab_file: Path to the SentencePiece model file.
            vocab_txt: Path to the text vocabulary (one token per line).
            do_lower_case: Lower-case text in :meth:`preprocess_text`.
            remove_space: Strip and collapse whitespace in :meth:`preprocess_text`.
            keep_accents: When False, combining characters are removed after NFKD normalization.
            unk_token, sep_token, pad_token, cls_token, mask_token: Special tokens
                forwarded to ``PreTrainedTokenizer``.
            **kwargs: Forwarded to ``PreTrainedTokenizer.__init__``.

        Raises:
            ImportError: If the ``sentencepiece`` package is not installed.
        """
        # NOTE(review): some transformers releases may access the vocabulary
        # inside PreTrainedTokenizer.__init__; if so the vocab has to be built
        # before this call -- confirm against the installed version.
        super().__init__(
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs
        )

        # Build vocab: line position in vocab_txt is the token id.
        self.token2idx = dict()
        self.idx2token = []
        with open(vocab_txt, 'r', encoding='utf-8') as f:
            for idx, token in enumerate(f):
                token = token.strip()
                self.token2idx[token] = idx
                self.idx2token.append(token)

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file
        self.vocab_txt = vocab_txt

        self.sp_model = self._load_sp_model(vocab_file)

    @staticmethod
    def _load_sp_model(vocab_file):
        """Import ``sentencepiece`` and return a processor loaded from ``vocab_file``.

        Raises:
            ImportError: If ``sentencepiece`` is missing. The installation hint is
                logged first and the error is re-raised, rather than continuing
                with an unbound module name.
        """
        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning(
                "You need to install SentencePiece to use KoBertTokenizer: "
                "https://github.com/google/sentencepiece (pip install sentencepiece)"
            )
            raise

        sp_model = spm.SentencePieceProcessor()
        sp_model.Load(vocab_file)
        return sp_model

    @property
    def vocab_size(self):
        """Number of entries read from ``vocab_txt``."""
        return len(self.idx2token)

    def __getstate__(self):
        """Return the instance state with ``sp_model`` cleared (it is reloaded in ``__setstate__``)."""
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        """Restore the instance state and reload the SentencePiece model from ``self.vocab_file``."""
        self.__dict__ = d
        self.sp_model = self._load_sp_model(self.vocab_file)

    def preprocess_text(self, inputs):
        """Normalize raw text according to the ``remove_space`` / ``keep_accents`` / ``do_lower_case`` flags."""
        if self.remove_space:
            outputs = " ".join(inputs.strip().split())
        else:
            outputs = inputs
        outputs = outputs.replace("``", '"').replace("''", '"')

        if not self.keep_accents:
            # Decompose, then drop the combining marks.
            outputs = unicodedata.normalize('NFKD', outputs)
            outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
        if self.do_lower_case:
            outputs = outputs.lower()

        return outputs

    def _tokenize(self, text, return_unicode=True, sample=False):
        """ Tokenize a string into SentencePiece pieces.

        Pieces that end in ``<digit>,`` are re-encoded without the comma and the
        comma is appended as its own piece.
        """
        text = self.preprocess_text(text)

        if not sample:
            pieces = self.sp_model.EncodeAsPieces(text)
        else:
            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
        new_pieces = []
        for piece in pieces:
            if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit():
                cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
                # Re-encoding may prepend the word-start marker; remove it when
                # the original piece did not start with one.
                if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
                    if len(cur_pieces[0]) == 1:
                        cur_pieces = cur_pieces[1:]
                    else:
                        cur_pieces[0] = cur_pieces[0][1:]
                cur_pieces.append(piece[-1])
                new_pieces.extend(cur_pieces)
            else:
                new_pieces.append(piece)

        return new_pieces

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab; unknown tokens map to the ``unk_token`` id. """
        return self.token2idx.get(token, self.token2idx[self.unk_token])

    def _convert_id_to_token(self, index, return_unicode=True):
        """Converts an index (integer) in a token (string/unicode) using the vocab."""
        return self.idx2token[index]

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.
        The resulting sequence has the following format:
            single sequence: [CLS] X [SEP]
            pair of sequences: [CLS] A [SEP] B [SEP]
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
                for sequence pairs
            already_has_special_tokens: (default False) Set to True if the token list is already formatted with
                special tokens for the model
        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        Raises:
            ValueError: If ``token_ids_1`` is given together with ``already_has_special_tokens=True``.
        """

        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formated with special tokens for the model."
                )
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        A BERT sequence pair mask has the following format:
        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence
        if token_ids_1 is None, only returns the first portion of the mask (0's).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory):
        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
            to a directory.

        Returns:
            ``(out_vocab_model, out_vocab_txt)`` paths, or ``None`` (after logging an
            error) when ``save_directory`` is not a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return

        # 1. Save sentencepiece model
        out_vocab_model = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_model):
            copyfile(self.vocab_file, out_vocab_model)

        # 2. Save vocab.txt
        index = 0
        out_vocab_txt = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_txt"])
        with open(out_vocab_txt, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.token2idx.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        "Saving vocabulary to {}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!".format(out_vocab_txt)
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1

        return out_vocab_model, out_vocab_txt

In [15]:
def bert_tokenizer(sent, MAX_LEN, tokenizer):
    """Encode one sentence into fixed-length input ids and an attention mask.

    Args:
        sent: Sentence text to encode.
        MAX_LEN: Length that every encoded sequence is truncated or padded to.
        tokenizer: Tokenizer object exposing ``encode_plus``.

    Returns:
        tuple: ``(input_id, attention_mask, token_type_id)`` where the first two
        are the ``'input_ids'`` and ``'attention_mask'`` entries of the encoding
        and ``token_type_id`` is always the constant ``0``.
    """
    encoded_dict = tokenizer.encode_plus(
        text=sent,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',  # pad every sequence up to max_length
        return_attention_mask=True,
        truncation=True,
    )

    input_id = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']
    # 'token_type_ids' is not read from the encoding (not every tokenizer
    # returns that key); a constant placeholder is returned instead.
    token_type_id = 0

    return input_id, attention_mask, token_type_id

def preprocessing_train():
    """Tokenize ``./data/train.csv`` with the ``args.pt`` tokenizer and pickle the arrays.

    Reads ``args.pt`` (tokenizer name) and ``args.max_len`` (sequence length).
    Writes ``./data/{args.pt}/train_data_{args.max_len}.pickle`` holding a dict with:

        - ``'input_ids'`` / ``'attention_mask'``: one row per training sentence;
        - ``'token_type_ids'``: one ``0`` per training sentence;
        - ``'targets'``: label-encoded ``'유형'`` column as int32.

    Raises:
        RuntimeError: If a sentence cannot be tokenized. Skipping the row would
            silently shift the position of every later row in the saved arrays.
    """
    from sklearn.preprocessing import LabelEncoder

    pt = args.pt  # e.g. 'monologg/kobert'
    MAX_LEN = args.max_len

    if 'kobert' in pt:
        tokenizer = KoBertTokenizer.from_pretrained(pt, cache_dir='bert_ckpt', do_lower_case=False)
        print('load kobert')
    else:
        tokenizer = AutoTokenizer.from_pretrained(pt)

    # Only the sentence and the label column are used; copy so the encoded
    # labels are not written into a slice of the raw frame.
    # NOTE(review): load_data() stratifies its folds on '극성' while the targets
    # here are '유형' -- confirm which label this notebook is meant to train on.
    train = pd.read_csv('./data/train.csv')[['문장', '유형']].copy()
    train['유형'] = LabelEncoder().fit_transform(train['유형'].values)

    input_ids = []
    attention_masks = []
    for row_idx, train_sent in enumerate(tqdm.tqdm(train['문장'], total=len(train))):
        try:
            input_id, attention_mask, _ = bert_tokenizer(train_sent, MAX_LEN=MAX_LEN, tokenizer=tokenizer)
        except Exception as e:
            # Fail fast with the offending row rather than dropping it.
            raise RuntimeError(f'tokenization failed for train row {row_idx}') from e
        input_ids.append(input_id)
        attention_masks.append(attention_mask)

    train_data = {
        'input_ids': np.array(input_ids, dtype=int),
        'attention_mask': np.array(attention_masks, dtype=int),
        'token_type_ids': np.zeros(len(input_ids), dtype=int),
        'targets': np.asarray(train['유형'].values, dtype=np.int32),
    }

    # save
    os.makedirs(f'./data/{pt}/', exist_ok=True)
    with open(f'./data/{pt}/train_data_{MAX_LEN}.pickle', 'wb') as f:
        pickle.dump(train_data, f, pickle.HIGHEST_PROTOCOL)

def preprocessing_test():
    """Tokenize ``./data/test.csv`` with the ``args.pt`` tokenizer and pickle the arrays.

    Reads ``args.pt`` (tokenizer name) and ``args.max_len`` (sequence length).
    Writes ``./data/{args.pt}/test_data_{args.max_len}.pickle`` holding a dict with:

        - ``'input_ids'`` / ``'attention_mask'``: one row per test sentence;
        - ``'token_type_ids'``: one ``0`` per test sentence.

    Raises:
        RuntimeError: If a sentence cannot be tokenized. Skipping the row would
            silently shift the position of every later row in the saved arrays,
            so they would no longer line up with the rows of ``test.csv``.
    """
    pt = args.pt
    MAX_LEN = args.max_len

    if 'kobert' in pt:
        tokenizer = KoBertTokenizer.from_pretrained(pt, cache_dir='bert_ckpt', do_lower_case=False)
        print('load kobert')
    else:
        tokenizer = AutoTokenizer.from_pretrained(pt)

    test = pd.read_csv('./data/test.csv')
    test = test[['문장']]

    input_ids = []
    attention_masks = []
    for row_idx, test_sent in enumerate(tqdm.tqdm(test['문장'])):
        try:
            input_id, attention_mask, _ = bert_tokenizer(test_sent, MAX_LEN=MAX_LEN, tokenizer=tokenizer)
        except Exception as e:
            # Fail fast with the offending row rather than dropping it.
            raise RuntimeError(f'tokenization failed for test row {row_idx}') from e
        input_ids.append(input_id)
        attention_masks.append(attention_mask)

    test_data = {
        'input_ids': np.array(input_ids, dtype=int),
        'attention_mask': np.array(attention_masks, dtype=int),
        'token_type_ids': np.zeros(len(input_ids), dtype=int),
    }

    # save
    os.makedirs(f'./data/{pt}/', exist_ok=True)
    with open(f'./data/{pt}/test_data_{MAX_LEN}.pickle', 'wb') as f:
        pickle.dump(test_data, f, pickle.HIGHEST_PROTOCOL)
           

In [16]:
# Preprocess the train and test data for each (checkpoint, max length) pair.
for pt, max_len in [('xlm-roberta-large', 128)]:
    # The preprocessing functions read both settings from `args`.
    args.pt, args.max_len = pt, max_len
    preprocessing_train()
    preprocessing_test()

    print(f'{args.pt} 모델 전처리 완료')

16541it [00:02, 6007.95it/s]
100%|██████████| 7090/7090 [00:01<00:00, 6609.16it/s]


xlm-roberta-large 모델 전처리 완료


In [10]:
# Preprocess the train and test data for each (checkpoint, max length) pair;
# 'klue/roberta-large' is listed twice, once per sequence length.
for pt, max_len in [
    ('monologg/kobert', 128),
    ('klue/roberta-base', 128),
    ('klue/roberta-small', 128),
    ('klue/roberta-large', 128),
    ('xlm-roberta-large', 128),
    ('bert-base-multilingual-uncased', 128),
    ('klue/roberta-large', 96),
]:
    # The preprocessing functions read both settings from `args`.
    args.pt, args.max_len = pt, max_len
    preprocessing_train()
    preprocessing_test()

    print(f'{args.pt} 모델 전처리 완료')

Downloading:   0%|          | 0.00/371k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/77.8k [00:00<?, ?B/s]

0it [00:00, ?it/s]

load kobert


16541it [00:03, 5445.98it/s]
  8%|▊         | 552/7090 [00:00<00:01, 5512.38it/s]

load kobert


100%|██████████| 7090/7090 [00:01<00:00, 5519.93it/s]


monologg/kobert 모델 전처리 완료


Downloading:   0%|          | 0.00/546 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/752k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/173 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/375 [00:00<?, ?B/s]

16541it [00:02, 6991.74it/s]
100%|██████████| 7090/7090 [00:00<00:00, 7629.06it/s]


klue/roberta-base 모델 전처리 완료


Downloading:   0%|          | 0.00/545 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/752k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/173 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/375 [00:00<?, ?B/s]

16541it [00:02, 7574.15it/s]
100%|██████████| 7090/7090 [00:00<00:00, 7573.99it/s]


klue/roberta-small 모델 전처리 완료


Downloading:   0%|          | 0.00/547 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/752k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/173 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/375 [00:00<?, ?B/s]

16541it [00:02, 7613.73it/s]
100%|██████████| 7090/7090 [00:00<00:00, 7684.98it/s]


klue/roberta-large 모델 전처리 완료


Downloading:   0%|          | 0.00/616 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

447it [00:00, 4461.73it/s]

'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_id

1327it [00:00, 4315.37it/s]

'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_id

2294it [00:00, 4656.71it/s]

'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_id

3225it [00:00, 4628.96it/s]

'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_id

4159it [00:00, 4613.84it/s]

'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_id

5071it [00:01, 4484.34it/s]

'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_id

5960it [00:01, 4337.61it/s]

'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_id

6893it [00:01, 4499.40it/s]

'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_id

7858it [00:01, 4678.59it/s]

'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_id

8795it [00:01, 4568.38it/s]

'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_id

9743it [00:02, 4634.63it/s]

'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_id

10686it [00:02, 4669.78it/s]

'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_id

11609it [00:02, 4488.16it/s]

'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_id

12573it [00:02, 4659.19it/s]

'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_id

13552it [00:02, 4753.49it/s]

'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_id

14525it [00:03, 4674.05it/s]

'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_id

15463it [00:03, 4589.23it/s]

'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_id

16541it [00:03, 4579.60it/s]

'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_id


 13%|█▎        | 891/7090 [00:00<00:01, 4452.95it/s]

'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_id

 26%|██▌       | 1820/7090 [00:00<00:01, 4580.84it/s]

'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_id

 33%|███▎      | 2328/7090 [00:00<00:01, 4759.33it/s]

'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_id

 47%|████▋     | 3308/7090 [00:00<00:00, 4784.55it/s]

'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_id

 61%|██████    | 4302/7090 [00:00<00:00, 4812.62it/s]

'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_id

 74%|███████▍  | 5276/7090 [00:01<00:00, 4809.57it/s]

'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_id

 88%|████████▊ | 6253/7090 [00:01<00:00, 4692.59it/s]

'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_id

100%|██████████| 7090/7090 [00:01<00:00, 4666.96it/s]


'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_ids'
'token_type_id

xlm-roberta-large 모델 전처리 완료


Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

16541it [00:03, 4795.51it/s]
100%|██████████| 7090/7090 [00:01<00:00, 5108.94it/s]


bert-base-multilingual-uncased 모델 전처리 완료


16541it [00:02, 7637.35it/s]
100%|██████████| 7090/7090 [00:00<00:00, 7710.11it/s]


klue/roberta-large 모델 전처리 완료


# models

In [7]:
# ------------------------
#  dataset
# ------------------------
class KobertDataSet(Dataset):
    """Map-style dataset over a dict of pre-tokenised model inputs.

    Args:
        data: mapping with the keys ``'input_ids'``, ``'attention_mask'`` and
            ``'token_type_ids'`` (plus ``'targets'`` unless ``test`` is true).
            ``data['input_ids']`` must expose ``.shape``; every entry is
            indexed with the sample index.
        test: when true, samples are returned without a ``'targets'`` entry.
    """

    def __init__(self, data, test=False):
        self.data = data
        self.test = test

    def __len__(self):
        # The sample count is the leading dimension of ``input_ids``.
        return self.data['input_ids'].shape[0]

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of ``torch.long`` tensors."""

        def as_long(key):
            # Every field is converted the same way, so build it in one place.
            return torch.tensor(self.data[key][idx], dtype=torch.long)

        sample = {
            'ids': as_long('input_ids'),
            'mask': as_long('attention_mask'),
            'token_type_ids': as_long('token_type_ids'),
        }
        if not self.test:
            # Labels only exist for the training / validation splits.
            sample['targets'] = as_long('targets')
        return sample

# training

In [8]:
# ------------------------
#  validation / prediction / training routines
# ------------------------

def do_valid(net, valid_loader):
    """Evaluate ``net`` on a labelled loader and collect loss, metrics and logits.

    Uses the notebook-level ``device`` and ``args``; ``args.amp`` selects
    whether the forward pass runs under ``amp.autocast()``.

    Args:
        net: model invoked as ``net(ids, mask)``. Its result is either the
            logits tensor itself or an indexable container whose first item
            is the logits.
        valid_loader: sized iterable of batches; each batch is a mapping with
            the keys ``'ids'``, ``'mask'`` and ``'targets'``.

    Returns:
        ``(val_mean_loss, validation_score, validation_acc, logit)`` where
        ``val_mean_loss`` is the summed batch loss divided by
        ``len(valid_loader)``, ``validation_score`` is the weighted F1,
        ``validation_acc`` is the accuracy and ``logit`` is the list of
        per-sample logits (nested Python lists).

    Raises:
        ValueError: if ``valid_loader`` yields no batches.
    """
    val_loss = 0
    target_lst = []
    pred_lst = []
    logit = []
    loss_fn = nn.CrossEntropyLoss()

    net.eval()
    for data in tqdm.tqdm(valid_loader):
        ids = data['ids'].to(device)
        mask = data['mask'].to(device)
        # token_type_ids are never forwarded to the model, so they are not
        # copied to the device.
        target = data['targets'].to(device)

        with torch.no_grad():
            if args.amp:
                with amp.autocast():
                    # output
                    output = net(ids, mask)
                    output = output[0]

                    # loss
                    loss = loss_fn(output, target)

            else:
                output = net(ids, mask)
                # Same logits extraction as the AMP branch: the model result
                # is a container, not the logits tensor, so unwrap it before
                # the loss / argmax.
                if not torch.is_tensor(output):
                    output = output[0]
                loss = loss_fn(output, target)

            val_loss += loss
            target_lst.extend(target.detach().cpu().numpy())
            pred_lst.extend(output.argmax(dim=1).tolist())
            logit.extend(output.tolist())

    if not target_lst:
        raise ValueError('valid_loader yielded no batches; nothing to evaluate')

    # Metrics are computed once over the full prediction list rather than
    # being recomputed after every batch.
    val_mean_loss = val_loss / len(valid_loader)
    validation_score = f1_score(y_true=target_lst, y_pred=pred_lst, average='weighted')
    validation_acc = accuracy_score(y_true=target_lst, y_pred=pred_lst)

    return val_mean_loss, validation_score, validation_acc, logit

def do_predict(net, valid_loader):
    """Run inference over an unlabelled loader.

    Uses the notebook-level ``device`` and ``args``; ``args.amp`` selects
    whether the forward pass runs under ``amp.autocast()``.

    Args:
        net: model invoked as ``net(ids, mask)``. Its result is either the
            logits tensor itself or an indexable container whose first item
            is the logits.
        valid_loader: iterable of batches; each batch is a mapping with the
            keys ``'ids'`` and ``'mask'``.

    Returns:
        ``(pred_lst, logit)``: the argmax class index per sample and the
        per-sample logits, both as Python lists.
    """
    pred_lst = []
    logit = []
    net.eval()
    for data in tqdm.tqdm(valid_loader):
        ids = data['ids'].to(device)
        mask = data['mask'].to(device)
        # token_type_ids are never forwarded to the model, so they are not
        # copied to the device.

        with torch.no_grad():
            if args.amp:
                with amp.autocast():
                    # output
                    output = net(ids, mask)[0]

            else:
                output = net(ids, mask)
                # Same logits extraction as the AMP branch: without it the
                # argmax below is called on the model's output container
                # instead of the logits tensor.
                if not torch.is_tensor(output):
                    output = output[0]

            pred_lst.extend(output.argmax(dim=1).tolist())
            logit.extend(output.tolist())

    return pred_lst, logit

def run_train(folds=3):
    """Fine-tune the classifier named by ``args.pt`` on one cross-validation fold.

    Rows of the frame returned by ``load_data()`` whose ``'fold'`` column
    equals ``folds`` form the validation split; all other rows are used for
    training. The ``'id'`` column is used to index the pickled, pre-tokenised
    arrays. After every epoch the model is validated with ``do_valid`` and a
    checkpoint is written whenever validation accuracy beats the best so far.

    Relies on the notebook-level ``args`` and ``device``.

    Args:
        folds: index (0-4) of the fold to hold out for validation.
    """
    # NOTE(review): the directory is keyed on ``args.fold`` while the held-out
    # fold is the ``folds`` argument; only the checkpoint file name carries
    # ``folds``. Confirm this is intended.
    out_dir = args.dir_+ f'/fold{args.fold}/{args.exp_name}/'
    os.makedirs(out_dir, exist_ok=True)

    # load dataset (only the training split is needed here)
    train, _ = load_data()
    with open(f'./data/{args.pt}/train_data_{args.max_len}.pickle', 'rb') as f:
        train_data = pickle.load(f)

    feature_keys = ('input_ids', 'attention_mask', 'token_type_ids', 'targets')

    # split fold
    for n_fold in range(5):
        if n_fold != folds:
            print(f'{n_fold} fold pass'+'\n')
            continue

        if args.debug:
            train = train.sample(1000).copy()

        trn_idx = train[train['fold']!=n_fold]['id'].values
        val_idx = train[train['fold']==n_fold]['id'].values

        train_dict = {key: train_data[key][trn_idx] for key in feature_keys}
        val_dict = {key: train_data[key][val_idx] for key in feature_keys}

        ## dataset ------------------------------------
        train_dataset = KobertDataSet(data = train_dict)
        valid_dataset = KobertDataSet(data = val_dict)
        trainloader = DataLoader(dataset=train_dataset, batch_size=args.batch_size,
                                 num_workers=8, shuffle=True, pin_memory=True)
        validloader = DataLoader(dataset=valid_dataset, batch_size=args.batch_size,
                                 num_workers=8, shuffle=False, pin_memory=True)

        ## net ----------------------------------------
        scaler = amp.GradScaler()
        if 'xlm-roberta' in args.pt:
            net = XLMRobertaForSequenceClassification.from_pretrained(args.pt, num_labels = 4)

        elif 'klue/roberta' in args.pt:
            net = RobertaForSequenceClassification.from_pretrained(args.pt, num_labels = 4)
        else:
            net = BertForSequenceClassification.from_pretrained(args.pt, num_labels = 4)

        net.to(device)
        if len(args.gpu)>1:
            net = nn.DataParallel(net)

        # ------------------------
        # loss
        # ------------------------
        loss_fn = nn.CrossEntropyLoss()

        # ------------------------
        #  Optimizer
        # ------------------------
        optimizer = optim.Lookahead(optim.RAdam(filter(lambda p: p.requires_grad,net.parameters()), lr=args.start_lr), alpha=0.5, k=5)

        # The scheduler is stepped once per batch, hence the total step count.
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = len(trainloader)*args.epochs)

        # ----
        best_score = 0
        # Stay None until an epoch actually improves on ``best_score`` so the
        # summary below never touches unbound names.
        best_epoch = None
        best_loss = None

        for epoch in range(1, args.epochs+1):
            train_loss = 0.0

            target_lst = []
            pred_lst = []
            lr = get_learning_rate(optimizer)
            print(f'-------------------')
            print(f'{epoch}epoch start')
            print(f'-------------------'+'\n')
            print(f'learning rate : {lr : .6f}')

            # do_valid() leaves the model in eval mode at the end of each
            # epoch, so switch back before the next pass over the data.
            net.train()
            for data in tqdm.tqdm(trainloader):

                # one iteration update  -------------
                ids  = data['ids'].to(device)
                mask  = data['mask'].to(device)
                target = data['targets'].to(device)

                # ------------
                optimizer.zero_grad()

                if args.amp:
                    with amp.autocast():
                        # output
                        output = net(ids, mask)
                        output = output[0]

                        # loss
                        loss = loss_fn(output, target)

                    scaler.scale(loss).backward()
                    scaler.step(optimizer)
                    scaler.update()

                else:
                    # output
                    output = net(ids, mask)
                    # Same logits extraction as the AMP branch: the model
                    # result is a container, not the logits tensor.
                    if not torch.is_tensor(output):
                        output = output[0]

                    # loss
                    loss = loss_fn(output, target)

                    # update
                    loss.backward()
                    optimizer.step()

                # Accumulate a plain float: adding the loss tensor itself
                # would carry its autograd history into the running total.
                train_loss += loss.item()

                # for calculate f1 score
                target_lst.extend(target.detach().cpu().numpy())
                pred_lst.extend(output.argmax(dim=1).tolist())

                if scheduler is not None:
                    scheduler.step()

            train_loss = train_loss / len(trainloader)
            train_score = f1_score(y_true=target_lst, y_pred=pred_lst, average='weighted')
            train_acc = accuracy_score(y_true=target_lst, y_pred=pred_lst)

            # validation
            valid_loss, valid_score, valid_acc, _ = do_valid(net, validloader)

            if valid_acc > best_score:
                best_score = valid_acc
                best_epoch = epoch
                best_loss = valid_loss

                torch.save(net.state_dict(), out_dir + f'/{folds}f_{epoch}e_{best_score:.4f}_s.pth')
                print('best model saved'+'\n')

            print(f'train loss : {train_loss:.4f}, train f1 score : {train_score : .4f}, train acc : {train_acc : .4f}'+'\n')
            print(f'valid loss : {valid_loss:.4f}, valid f1 score : {valid_score : .4f}, valid acc : {valid_acc : .4f}'+'\n')

        if best_epoch is None:
            # No epoch ran or none beat the initial score: nothing was saved.
            print('no epoch improved validation accuracy; no model saved'+'\n')
        else:
            print(f'best valid loss : {best_loss : .4f}'+'\n')
            print(f'best epoch : {best_epoch }'+'\n')
            print(f'best accuracy : {best_score : .4f}'+'\n')

        
def run_predict(model_path):
    """Run inference on the pickled test set with a saved checkpoint.

    Reads the test data from ``./data/{args.pt}/test_data_{args.max_len}.pickle``,
    builds the classifier that matches ``args.pt`` (4 output labels), loads the
    weights stored at ``model_path`` and delegates the forward passes to
    ``do_predict``.

    Args:
        model_path: Path to a state-dict checkpoint readable by ``torch.load``.

    Returns:
        A ``(preds, logits)`` tuple: ``preds`` is returned exactly as produced by
        ``do_predict`` and ``logits`` is its second output converted with
        ``np.array``.  Shapes depend on ``do_predict`` -- TODO confirm.
    """
    ## dataset ------------------------------------
    # NOTE(review): pickle.load executes arbitrary code embedded in the file;
    # only point this at data produced by this project's own preprocessing.
    with open(f'./data/{args.pt}/test_data_{args.max_len}.pickle', 'rb') as f:
        test_dict = pickle.load(f)

    print('test load')
    test_dataset = KobertDataSet(data = test_dict, test=True)
    # shuffle=False keeps predictions in the same order as the test data.
    testloader = DataLoader(dataset=test_dataset, batch_size=args.batch_size,
                             num_workers=8, shuffle=False, pin_memory=True)
    print('set testloader')

    ## net ----------------------------------------
    # Pick the model class from the pretrained name; anything that is neither
    # XLM-RoBERTa nor KLUE RoBERTa falls back to the BERT classifier.
    if 'xlm-roberta' in args.pt:
        net = XLMRobertaForSequenceClassification.from_pretrained(args.pt, num_labels = 4)
    elif 'klue/roberta' in args.pt:
        net = RobertaForSequenceClassification.from_pretrained(args.pt, num_labels = 4)
    else:
        net = BertForSequenceClassification.from_pretrained(args.pt, num_labels = 4)

    net.to(device)

    # The model is wrapped BEFORE the weights are loaded, so with strict=True
    # the checkpoint keys must match the (possibly DataParallel-wrapped) model.
    if len(args.gpu)>1:
        net = nn.DataParallel(net)

    # map_location places the checkpoint tensors on the configured device even
    # if they were saved from a different (or no longer available) GPU.
    state_dict = torch.load(model_path, map_location=device)
    net.load_state_dict(state_dict, strict=True)  # True
    print('load saved models')
    # ------------------------
    # prediction
    preds, logit = do_predict(net, testloader) #outputs

    print('complete predict')

    return preds, np.array(logit)
     

In [17]:
"""5fold 전용"""
if __name__ == '__main__':
    for pt, max_len in zip([ 
           'xlm-roberta-large'],[128]):
        
        args.max_len = max_len
        args.pt = pt
        args.exp_name = str(args.pt) + '_' + str(args.max_len)
        
        for i in [0,1,2,3,4]: # 5fold
            run_train(folds=i)

Downloading:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.den

-------------------
1epoch start
-------------------

learning rate :  0.000010


100%|██████████| 414/414 [02:11<00:00,  3.16it/s]
100%|██████████| 104/104 [00:10<00:00,  9.55it/s]


best model saved

train loss : 0.6587, train f1 score :  0.7402, train acc :  0.8195

valid loss : 0.5756, valid f1 score :  0.7383, valid acc :  0.8196



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
2epoch start
-------------------

learning rate :  0.000008


100%|██████████| 414/414 [02:12<00:00,  3.12it/s]
100%|██████████| 104/104 [00:10<00:00,  9.69it/s]


best model saved

train loss : 0.5468, train f1 score :  0.7571, train acc :  0.8231

valid loss : 0.4162, valid f1 score :  0.8154, valid acc :  0.8477



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
3epoch start
-------------------

learning rate :  0.000006


 72%|███████▏  | 298/414 [01:36<00:37,  3.10it/s]


KeyboardInterrupt: 

In [13]:
"""5fold 전용"""
if __name__ == '__main__':
    for pt, max_len in zip(['monologg/kobert','klue/roberta-base','klue/roberta-small','klue/roberta-large','xlm-roberta-large', 
           'bert-base-multilingual-uncased', 'klue/roberta-large'],[128,128,128,128,128,128,128]):
        
        args.max_len = max_len
        args.pt = pt
        args.exp_name = str(args.pt) + '_' + str(args.max_len)
        
        for i in [0,1,2,3,4]: # 5fold
            run_train(folds=i)

Downloading:   0%|          | 0.00/426 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/369M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
1epoch start
-------------------

learning rate :  0.000010


100%|██████████| 414/414 [00:46<00:00,  8.98it/s]
100%|██████████| 104/104 [00:04<00:00, 24.62it/s]


best model saved

train loss : 0.8755, train f1 score :  0.6997, train acc :  0.7073

valid loss : 0.6117, valid f1 score :  0.7383, valid acc :  0.8196



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
2epoch start
-------------------

learning rate :  0.000008


100%|██████████| 414/414 [00:45<00:00,  9.17it/s]
100%|██████████| 104/104 [00:04<00:00, 24.27it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.5196, train f1 score :  0.7743, train acc :  0.8392

valid loss : 0.4151, valid f1 score :  0.8363, valid acc :  0.8697

-------------------
3epoch start
-------------------

learning rate :  0.000006


100%|██████████| 414/414 [00:45<00:00,  9.07it/s]
100%|██████████| 104/104 [00:04<00:00, 24.14it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.3864, train f1 score :  0.8616, train acc :  0.8811

valid loss : 0.3463, valid f1 score :  0.8771, valid acc :  0.8903

-------------------
4epoch start
-------------------

learning rate :  0.000004


100%|██████████| 414/414 [00:45<00:00,  9.03it/s]
100%|██████████| 104/104 [00:04<00:00, 24.11it/s]


train loss : 0.3355, train f1 score :  0.8792, train acc :  0.8923

valid loss : 0.3265, valid f1 score :  0.8775, valid acc :  0.8876



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
5epoch start
-------------------

learning rate :  0.000002


100%|██████████| 414/414 [00:45<00:00,  9.11it/s]
100%|██████████| 104/104 [00:04<00:00, 24.13it/s]


train loss : 0.3159, train f1 score :  0.8852, train acc :  0.8978

valid loss : 0.3231, valid f1 score :  0.8780, valid acc :  0.8864

best valid loss :  0.3463

best epoch : 3

best accuracy :  0.8903

1 fold pass

2 fold pass

3 fold pass

4 fold pass

0 fold pass



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
1epoch start
-------------------

learning rate :  0.000010


100%|██████████| 414/414 [00:45<00:00,  9.08it/s]
100%|██████████| 104/104 [00:04<00:00, 24.31it/s]


best model saved

train loss : 0.9137, train f1 score :  0.6991, train acc :  0.7251

valid loss : 0.6428, valid f1 score :  0.7387, valid acc :  0.8198

-------------------
2epoch start


  0%|          | 0/414 [00:00<?, ?it/s]

-------------------

learning rate :  0.000008


100%|██████████| 414/414 [00:45<00:00,  9.16it/s]
100%|██████████| 104/104 [00:04<00:00, 23.88it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.5791, train f1 score :  0.7389, train acc :  0.8198

valid loss : 0.5231, valid f1 score :  0.7570, valid acc :  0.8271

-------------------
3epoch start
-------------------

learning rate :  0.000006


100%|██████████| 414/414 [00:45<00:00,  9.16it/s]
100%|██████████| 104/104 [00:04<00:00, 24.11it/s]


best model saved

train loss : 0.4462, train f1 score :  0.8105, train acc :  0.8487

valid loss : 0.3988, valid f1 score :  0.8704, valid acc :  0.8824



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
4epoch start
-------------------

learning rate :  0.000004


100%|██████████| 414/414 [00:45<00:00,  9.12it/s]
100%|██████████| 104/104 [00:04<00:00, 24.23it/s]


best model saved

train loss : 0.3562, train f1 score :  0.8672, train acc :  0.8848

valid loss : 0.3595, valid f1 score :  0.8785, valid acc :  0.8854



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
5epoch start
-------------------

learning rate :  0.000002


100%|██████████| 414/414 [00:45<00:00,  9.08it/s]
100%|██████████| 104/104 [00:04<00:00, 24.14it/s]


best model saved

train loss : 0.3267, train f1 score :  0.8759, train acc :  0.8912

valid loss : 0.3415, valid f1 score :  0.8830, valid acc :  0.8933

best valid loss :  0.3415

best epoch : 5

best accuracy :  0.8933

2 fold pass

3 fold pass

4 fold pass

0 fold pass

1 fold pass



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
1epoch start
-------------------

learning rate :  0.000010


100%|██████████| 414/414 [00:45<00:00,  9.09it/s]
100%|██████████| 104/104 [00:04<00:00, 24.23it/s]


best model saved

train loss : 0.8802, train f1 score :  0.7107, train acc :  0.7449

valid loss : 0.5935, valid f1 score :  0.7387, valid acc :  0.8198

-------------------
2epoch start
-------------------



  0%|          | 0/414 [00:00<?, ?it/s]

learning rate :  0.000008


100%|██████████| 414/414 [00:45<00:00,  9.12it/s]
100%|██████████| 104/104 [00:04<00:00, 24.21it/s]


best model saved

train loss : 0.5147, train f1 score :  0.7747, train acc :  0.8309

valid loss : 0.4447, valid f1 score :  0.8171, valid acc :  0.8440

-------------------

  0%|          | 0/414 [00:00<?, ?it/s]


3epoch start
-------------------

learning rate :  0.000006


100%|██████████| 414/414 [00:45<00:00,  9.07it/s]
100%|██████████| 104/104 [00:04<00:00, 24.29it/s]


best model saved

train loss : 0.4071, train f1 score :  0.8456, train acc :  0.8652

valid loss : 0.3897, valid f1 score :  0.8636, valid acc :  0.8727



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
4epoch start
-------------------

learning rate :  0.000004


100%|██████████| 414/414 [00:45<00:00,  9.11it/s]
100%|██████████| 104/104 [00:04<00:00, 24.09it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.3511, train f1 score :  0.8784, train acc :  0.8921

valid loss : 0.3544, valid f1 score :  0.8680, valid acc :  0.8791

-------------------
5epoch start
-------------------

learning rate :  0.000002


100%|██████████| 414/414 [00:45<00:00,  9.06it/s]
100%|██████████| 104/104 [00:04<00:00, 24.13it/s]


train loss : 0.3272, train f1 score :  0.8845, train acc :  0.8981

valid loss : 0.3472, valid f1 score :  0.8681, valid acc :  0.8785

best valid loss :  0.3544

best epoch : 4

best accuracy :  0.8791

3 fold pass

4 fold pass

0 fold pass

1 fold pass

2 fold pass



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
1epoch start
-------------------

learning rate :  0.000010


100%|██████████| 414/414 [00:45<00:00,  9.14it/s]
100%|██████████| 104/104 [00:04<00:00, 23.85it/s]


best model saved

train loss : 0.8599, train f1 score :  0.7261, train acc :  0.7833

valid loss : 0.6130, valid f1 score :  0.7382, valid acc :  0.8195



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
2epoch start
-------------------

learning rate :  0.000008


100%|██████████| 414/414 [00:45<00:00,  9.13it/s]
100%|██████████| 104/104 [00:04<00:00, 23.43it/s]


best model saved

train loss : 0.5270, train f1 score :  0.7588, train acc :  0.8262

valid loss : 0.4171, valid f1 score :  0.8379, valid acc :  0.8634



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
3epoch start
-------------------

learning rate :  0.000006


100%|██████████| 414/414 [00:45<00:00,  9.09it/s]
100%|██████████| 104/104 [00:04<00:00, 23.68it/s]


best model saved

train loss : 0.3840, train f1 score :  0.8576, train acc :  0.8754

valid loss : 0.3437, valid f1 score :  0.8706, valid acc :  0.8845



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
4epoch start
-------------------

learning rate :  0.000004


100%|██████████| 414/414 [00:45<00:00,  9.03it/s]
100%|██████████| 104/104 [00:04<00:00, 23.65it/s]


best model saved

train loss : 0.3229, train f1 score :  0.8796, train acc :  0.8936

valid loss : 0.3179, valid f1 score :  0.8719, valid acc :  0.8860



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
5epoch start
-------------------

learning rate :  0.000002


100%|██████████| 414/414 [00:45<00:00,  9.06it/s]
100%|██████████| 104/104 [00:04<00:00, 23.93it/s]


train loss : 0.2974, train f1 score :  0.8879, train acc :  0.8997

valid loss : 0.3121, valid f1 score :  0.8748, valid acc :  0.8857

best valid loss :  0.3179

best epoch : 4

best accuracy :  0.8860

4 fold pass

0 fold pass

1 fold pass

2 fold pass

3 fold pass



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
1epoch start
-------------------

learning rate :  0.000010


100%|██████████| 414/414 [00:45<00:00,  9.13it/s]
100%|██████████| 104/104 [00:04<00:00, 23.72it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.8580, train f1 score :  0.7261, train acc :  0.7835

valid loss : 0.6068, valid f1 score :  0.7382, valid acc :  0.8195

-------------------
2epoch start
-------------------

learning rate :  0.000008


100%|██████████| 414/414 [00:45<00:00,  9.10it/s]
100%|██████████| 104/104 [00:04<00:00, 23.61it/s]


best model saved

train loss : 0.5220, train f1 score :  0.7571, train acc :  0.8261

valid loss : 0.4468, valid f1 score :  0.8357, valid acc :  0.8637



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
3epoch start
-------------------

learning rate :  0.000006


100%|██████████| 414/414 [00:45<00:00,  9.14it/s]
100%|██████████| 104/104 [00:04<00:00, 23.30it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.4086, train f1 score :  0.8473, train acc :  0.8702

valid loss : 0.3749, valid f1 score :  0.8701, valid acc :  0.8839

-------------------
4epoch start
-------------------

learning rate :  0.000004


100%|██████████| 414/414 [00:45<00:00,  9.04it/s]
100%|██████████| 104/104 [00:04<00:00, 23.68it/s]


best model saved

train loss : 0.3519, train f1 score :  0.8714, train acc :  0.8870

valid loss : 0.3416, valid f1 score :  0.8784, valid acc :  0.8930



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
5epoch start
-------------------

learning rate :  0.000002


100%|██████████| 414/414 [00:45<00:00,  9.15it/s]
100%|██████████| 104/104 [00:04<00:00, 23.50it/s]


train loss : 0.3251, train f1 score :  0.8792, train acc :  0.8937

valid loss : 0.3346, valid f1 score :  0.8805, valid acc :  0.8930

best valid loss :  0.3416

best epoch : 4

best accuracy :  0.8930



Downloading:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier

-------------------
1epoch start
-------------------

learning rate :  0.000010


100%|██████████| 414/414 [00:45<00:00,  9.09it/s]
100%|██████████| 104/104 [00:04<00:00, 23.86it/s]


best model saved

train loss : 0.7256, train f1 score :  0.7251, train acc :  0.7702

valid loss : 0.4482, valid f1 score :  0.7859, valid acc :  0.8380



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
2epoch start
-------------------

learning rate :  0.000008


100%|██████████| 414/414 [00:45<00:00,  9.07it/s]
100%|██████████| 104/104 [00:04<00:00, 23.89it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.3940, train f1 score :  0.8497, train acc :  0.8687

valid loss : 0.3231, valid f1 score :  0.8758, valid acc :  0.8837

-------------------
3epoch start
-------------------

learning rate :  0.000006


100%|██████████| 414/414 [00:45<00:00,  9.06it/s]
100%|██████████| 104/104 [00:04<00:00, 23.62it/s]


best model saved

train loss : 0.3217, train f1 score :  0.8793, train acc :  0.8919

valid loss : 0.2967, valid f1 score :  0.8813, valid acc :  0.8942



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
4epoch start
-------------------

learning rate :  0.000004


100%|██████████| 414/414 [00:46<00:00,  8.96it/s]
100%|██████████| 104/104 [00:04<00:00, 23.91it/s]


train loss : 0.2957, train f1 score :  0.8840, train acc :  0.8957

valid loss : 0.2823, valid f1 score :  0.8851, valid acc :  0.8942



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
5epoch start
-------------------

learning rate :  0.000002


100%|██████████| 414/414 [00:46<00:00,  8.99it/s]
100%|██████████| 104/104 [00:04<00:00, 23.83it/s]


train loss : 0.2794, train f1 score :  0.8891, train acc :  0.9002

valid loss : 0.2787, valid f1 score :  0.8850, valid acc :  0.8933

best valid loss :  0.2967

best epoch : 3

best accuracy :  0.8942

1 fold pass

2 fold pass

3 fold pass

4 fold pass

0 fold pass



Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier

-------------------
1epoch start
-------------------

learning rate :  0.000010


100%|██████████| 414/414 [00:45<00:00,  9.01it/s]
100%|██████████| 104/104 [00:04<00:00, 23.52it/s]


best model saved

train loss : 0.7865, train f1 score :  0.6932, train acc :  0.7151

valid loss : 0.4719, valid f1 score :  0.7829, valid acc :  0.8331

-------------------
2epoch start


  0%|          | 0/414 [00:00<?, ?it/s]

-------------------

learning rate :  0.000008


100%|██████████| 414/414 [00:45<00:00,  9.11it/s]
100%|██████████| 104/104 [00:04<00:00, 23.52it/s]


best model saved

train loss : 0.4095, train f1 score :  0.8404, train acc :  0.8619

valid loss : 0.3349, valid f1 score :  0.8813, valid acc :  0.8951

-------------------
3epoch start
-------------------

learning rate :  0.000006


100%|██████████| 414/414 [00:45<00:00,  9.06it/s]
100%|██████████| 104/104 [00:04<00:00, 23.65it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

train loss : 0.3205, train f1 score :  0.8819, train acc :  0.8948

valid loss : 0.3013, valid f1 score :  0.8824, valid acc :  0.8875

-------------------
4epoch start
-------------------

learning rate :  0.000004


100%|██████████| 414/414 [00:45<00:00,  9.06it/s]
100%|██████████| 104/104 [00:04<00:00, 23.68it/s]


best model saved

train loss : 0.2795, train f1 score :  0.8939, train acc :  0.9025

valid loss : 0.2830, valid f1 score :  0.8902, valid acc :  0.8972

-------------------
5epoch start
-------------------



  0%|          | 0/414 [00:00<?, ?it/s]

learning rate :  0.000002


100%|██████████| 414/414 [00:45<00:00,  9.08it/s]
100%|██████████| 104/104 [00:04<00:00, 23.63it/s]


train loss : 0.2633, train f1 score :  0.8962, train acc :  0.9033

valid loss : 0.2833, valid f1 score :  0.8933, valid acc :  0.8957

best valid loss :  0.2830

best epoch : 4

best accuracy :  0.8972

2 fold pass

3 fold pass

4 fold pass

0 fold pass

1 fold pass



Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier

-------------------
1epoch start
-------------------

learning rate :  0.000010


100%|██████████| 414/414 [00:45<00:00,  9.02it/s]
100%|██████████| 104/104 [00:04<00:00, 23.52it/s]


best model saved

train loss : 0.7545, train f1 score :  0.7299, train acc :  0.7775

valid loss : 0.5097, valid f1 score :  0.7950, valid acc :  0.8156

-------------------
2epoch start


  0%|          | 0/414 [00:00<?, ?it/s]

-------------------

learning rate :  0.000008


100%|██████████| 414/414 [00:45<00:00,  9.02it/s]
100%|██████████| 104/104 [00:04<00:00, 23.42it/s]


best model saved

train loss : 0.3969, train f1 score :  0.8507, train acc :  0.8683

valid loss : 0.3273, valid f1 score :  0.8659, valid acc :  0.8824

-------------------
3epoch start


  0%|          | 0/414 [00:00<?, ?it/s]

-------------------

learning rate :  0.000006


100%|██████████| 414/414 [00:45<00:00,  9.06it/s]
100%|██████████| 104/104 [00:04<00:00, 23.58it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.2931, train f1 score :  0.8887, train acc :  0.8980

valid loss : 0.2896, valid f1 score :  0.8797, valid acc :  0.8866

-------------------
4epoch start
-------------------

learning rate :  0.000004


100%|██████████| 414/414 [00:45<00:00,  9.01it/s]
100%|██████████| 104/104 [00:04<00:00, 23.50it/s]


best model saved

train loss : 0.2566, train f1 score :  0.9009, train acc :  0.9067

valid loss : 0.2789, valid f1 score :  0.8826, valid acc :  0.8891



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
5epoch start
-------------------

learning rate :  0.000002


100%|██████████| 414/414 [00:45<00:00,  9.02it/s]
100%|██████████| 104/104 [00:04<00:00, 23.59it/s]


train loss : 0.2428, train f1 score :  0.9064, train acc :  0.9117

valid loss : 0.2754, valid f1 score :  0.8827, valid acc :  0.8878

best valid loss :  0.2789

best epoch : 4

best accuracy :  0.8891

3 fold pass

4 fold pass

0 fold pass

1 fold pass

2 fold pass



Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier

-------------------
1epoch start
-------------------

learning rate :  0.000010


100%|██████████| 414/414 [00:45<00:00,  9.03it/s]
100%|██████████| 104/104 [00:04<00:00, 23.44it/s]


best model saved

train loss : 0.7407, train f1 score :  0.7180, train acc :  0.7695

valid loss : 0.4467, valid f1 score :  0.7475, valid acc :  0.8226



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
2epoch start
-------------------

learning rate :  0.000008


100%|██████████| 414/414 [00:45<00:00,  9.04it/s]
100%|██████████| 104/104 [00:04<00:00, 23.53it/s]


best model saved

train loss : 0.3917, train f1 score :  0.8481, train acc :  0.8658

valid loss : 0.3262, valid f1 score :  0.8731, valid acc :  0.8900



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
3epoch start
-------------------

learning rate :  0.000006


100%|██████████| 414/414 [00:45<00:00,  9.06it/s]
100%|██████████| 104/104 [00:04<00:00, 23.35it/s]


best model saved

train loss : 0.3128, train f1 score :  0.8791, train acc :  0.8923

valid loss : 0.2943, valid f1 score :  0.8817, valid acc :  0.8906



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
4epoch start
-------------------

learning rate :  0.000004


100%|██████████| 414/414 [00:45<00:00,  9.07it/s]
100%|██████████| 104/104 [00:04<00:00, 23.39it/s]


best model saved

train loss : 0.2743, train f1 score :  0.8912, train acc :  0.9006

valid loss : 0.2820, valid f1 score :  0.8909, valid acc :  0.8990



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
5epoch start
-------------------

learning rate :  0.000002


100%|██████████| 414/414 [00:46<00:00,  8.98it/s]
100%|██████████| 104/104 [00:04<00:00, 23.47it/s]


train loss : 0.2572, train f1 score :  0.8984, train acc :  0.9050

valid loss : 0.2768, valid f1 score :  0.8935, valid acc :  0.8978

best valid loss :  0.2820

best epoch : 4

best accuracy :  0.8990

4 fold pass

0 fold pass

1 fold pass

2 fold pass

3 fold pass



Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier

-------------------
1epoch start
-------------------

learning rate :  0.000010


100%|██████████| 414/414 [00:46<00:00,  8.97it/s]
100%|██████████| 104/104 [00:04<00:00, 23.63it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.8089, train f1 score :  0.6936, train acc :  0.7140

valid loss : 0.4996, valid f1 score :  0.7382, valid acc :  0.8195

-------------------
2epoch start
-------------------

learning rate :  0.000008


100%|██████████| 414/414 [00:45<00:00,  9.08it/s]
100%|██████████| 104/104 [00:04<00:00, 23.51it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.3942, train f1 score :  0.8467, train acc :  0.8685

valid loss : 0.3331, valid f1 score :  0.8739, valid acc :  0.8782

-------------------
3epoch start
-------------------

learning rate :  0.000006


100%|██████████| 414/414 [00:45<00:00,  9.03it/s]
100%|██████████| 104/104 [00:04<00:00, 23.37it/s]


best model saved

train loss : 0.3058, train f1 score :  0.8798, train acc :  0.8919

valid loss : 0.2806, valid f1 score :  0.8932, valid acc :  0.8987



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
4epoch start
-------------------

learning rate :  0.000004


100%|██████████| 414/414 [00:45<00:00,  9.11it/s]
100%|██████████| 104/104 [00:04<00:00, 23.49it/s]


train loss : 0.2664, train f1 score :  0.8947, train acc :  0.9019

valid loss : 0.2710, valid f1 score :  0.8916, valid acc :  0.8939



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
5epoch start
-------------------

learning rate :  0.000002


100%|██████████| 414/414 [00:45<00:00,  9.09it/s]
100%|██████████| 104/104 [00:04<00:00, 23.58it/s]


train loss : 0.2502, train f1 score :  0.9006, train acc :  0.9064

valid loss : 0.2696, valid f1 score :  0.8918, valid acc :  0.8930

best valid loss :  0.2806

best epoch : 3

best accuracy :  0.8987



Downloading:   0%|          | 0.00/273M [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

-------------------
1epoch start
-------------------

learning rate :  0.000010


100%|██████████| 414/414 [00:24<00:00, 16.97it/s]
100%|██████████| 104/104 [00:02<00:00, 36.68it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.7566, train f1 score :  0.7298, train acc :  0.7990

valid loss : 0.5545, valid f1 score :  0.7383, valid acc :  0.8196

-------------------
2epoch start
-------------------

learning rate :  0.000008


100%|██████████| 414/414 [00:24<00:00, 16.88it/s]
100%|██████████| 104/104 [00:02<00:00, 36.64it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.4270, train f1 score :  0.8256, train acc :  0.8552

valid loss : 0.3164, valid f1 score :  0.8760, valid acc :  0.8879

-------------------
3epoch start
-------------------

learning rate :  0.000006


100%|██████████| 414/414 [00:24<00:00, 16.96it/s]
100%|██████████| 104/104 [00:02<00:00, 36.08it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.3122, train f1 score :  0.8792, train acc :  0.8910

valid loss : 0.2710, valid f1 score :  0.8937, valid acc :  0.9015

-------------------
4epoch start
-------------------

learning rate :  0.000004


100%|██████████| 414/414 [00:24<00:00, 17.03it/s]
100%|██████████| 104/104 [00:02<00:00, 36.33it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.2752, train f1 score :  0.8932, train acc :  0.9005

valid loss : 0.2597, valid f1 score :  0.8979, valid acc :  0.9042

-------------------
5epoch start
-------------------

learning rate :  0.000002


100%|██████████| 414/414 [00:24<00:00, 16.90it/s]
100%|██████████| 104/104 [00:02<00:00, 36.16it/s]


best model saved

train loss : 0.2633, train f1 score :  0.8964, train acc :  0.9035

valid loss : 0.2564, valid f1 score :  0.9018, valid acc :  0.9075

best valid loss :  0.2564

best epoch : 5

best accuracy :  0.9075

1 fold pass

2 fold pass

3 fold pass

4 fold pass

0 fold pass



Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

-------------------
1epoch start
-------------------

learning rate :  0.000010


100%|██████████| 414/414 [00:24<00:00, 16.92it/s]
100%|██████████| 104/104 [00:02<00:00, 36.66it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.7519, train f1 score :  0.7308, train acc :  0.7937

valid loss : 0.5549, valid f1 score :  0.7387, valid acc :  0.8198

-------------------
2epoch start
-------------------

learning rate :  0.000008


100%|██████████| 414/414 [00:24<00:00, 16.84it/s]
100%|██████████| 104/104 [00:02<00:00, 36.62it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.4584, train f1 score :  0.8058, train acc :  0.8421

valid loss : 0.3479, valid f1 score :  0.8660, valid acc :  0.8869

-------------------
3epoch start
-------------------

learning rate :  0.000006


100%|██████████| 414/414 [00:24<00:00, 16.90it/s]
100%|██████████| 104/104 [00:02<00:00, 36.96it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.3238, train f1 score :  0.8731, train acc :  0.8869

valid loss : 0.2933, valid f1 score :  0.8897, valid acc :  0.8981

-------------------
4epoch start
-------------------

learning rate :  0.000004


100%|██████████| 414/414 [00:24<00:00, 16.84it/s]
100%|██████████| 104/104 [00:02<00:00, 36.81it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

train loss : 0.2793, train f1 score :  0.8920, train acc :  0.9003

valid loss : 0.2826, valid f1 score :  0.8894, valid acc :  0.8978

-------------------
5epoch start
-------------------

learning rate :  0.000002


100%|██████████| 414/414 [00:24<00:00, 16.89it/s]
100%|██████████| 104/104 [00:02<00:00, 36.79it/s]


train loss : 0.2649, train f1 score :  0.8948, train acc :  0.9017

valid loss : 0.2799, valid f1 score :  0.8912, valid acc :  0.8981

best valid loss :  0.2933

best epoch : 3

best accuracy :  0.8981

2 fold pass

3 fold pass

4 fold pass

0 fold pass

1 fold pass



Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

-------------------
1epoch start
-------------------

learning rate :  0.000010


100%|██████████| 414/414 [00:24<00:00, 16.92it/s]
100%|██████████| 104/104 [00:02<00:00, 36.72it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.7493, train f1 score :  0.7251, train acc :  0.7840

valid loss : 0.5152, valid f1 score :  0.7410, valid acc :  0.8207

-------------------
2epoch start
-------------------

learning rate :  0.000008


100%|██████████| 414/414 [00:24<00:00, 16.84it/s]
100%|██████████| 104/104 [00:02<00:00, 36.84it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.4160, train f1 score :  0.8357, train acc :  0.8597

valid loss : 0.3323, valid f1 score :  0.8614, valid acc :  0.8782

-------------------
3epoch start
-------------------

learning rate :  0.000006


100%|██████████| 414/414 [00:24<00:00, 16.91it/s]
100%|██████████| 104/104 [00:02<00:00, 36.29it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.3037, train f1 score :  0.8855, train acc :  0.8952

valid loss : 0.2915, valid f1 score :  0.8798, valid acc :  0.8894

-------------------
4epoch start
-------------------

learning rate :  0.000004


100%|██████████| 414/414 [00:24<00:00, 16.89it/s]
100%|██████████| 104/104 [00:02<00:00, 36.57it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

train loss : 0.2711, train f1 score :  0.8980, train acc :  0.9052

valid loss : 0.2808, valid f1 score :  0.8806, valid acc :  0.8894

-------------------
5epoch start
-------------------

learning rate :  0.000002


100%|██████████| 414/414 [00:24<00:00, 16.90it/s]
100%|██████████| 104/104 [00:02<00:00, 37.08it/s]


best model saved

train loss : 0.2547, train f1 score :  0.9041, train acc :  0.9101

valid loss : 0.2800, valid f1 score :  0.8822, valid acc :  0.8900

best valid loss :  0.2800

best epoch : 5

best accuracy :  0.8900

3 fold pass

4 fold pass

0 fold pass

1 fold pass

2 fold pass



Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

-------------------
1epoch start
-------------------

learning rate :  0.000010


100%|██████████| 414/414 [00:24<00:00, 17.20it/s]
100%|██████████| 104/104 [00:03<00:00, 34.42it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.7373, train f1 score :  0.7291, train acc :  0.7960

valid loss : 0.5647, valid f1 score :  0.7382, valid acc :  0.8195

-------------------
2epoch start
-------------------

learning rate :  0.000008


100%|██████████| 414/414 [00:24<00:00, 16.99it/s]
100%|██████████| 104/104 [00:02<00:00, 36.70it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.4370, train f1 score :  0.8192, train acc :  0.8511

valid loss : 0.3138, valid f1 score :  0.8726, valid acc :  0.8878

-------------------
3epoch start
-------------------

learning rate :  0.000006


100%|██████████| 414/414 [00:24<00:00, 16.91it/s]
100%|██████████| 104/104 [00:02<00:00, 36.49it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.3047, train f1 score :  0.8831, train acc :  0.8939

valid loss : 0.2771, valid f1 score :  0.8896, valid acc :  0.8951

-------------------
4epoch start
-------------------

learning rate :  0.000004


100%|██████████| 414/414 [00:24<00:00, 16.80it/s]
100%|██████████| 104/104 [00:02<00:00, 36.81it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.2695, train f1 score :  0.8963, train acc :  0.9033

valid loss : 0.2701, valid f1 score :  0.8904, valid acc :  0.8963

-------------------
5epoch start
-------------------

learning rate :  0.000002


100%|██████████| 414/414 [00:24<00:00, 16.86it/s]
100%|██████████| 104/104 [00:02<00:00, 36.27it/s]


train loss : 0.2549, train f1 score :  0.9002, train acc :  0.9063

valid loss : 0.2685, valid f1 score :  0.8889, valid acc :  0.8951

best valid loss :  0.2701

best epoch : 4

best accuracy :  0.8963

4 fold pass

0 fold pass

1 fold pass

2 fold pass

3 fold pass



Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

-------------------
1epoch start
-------------------

learning rate :  0.000010


100%|██████████| 414/414 [00:24<00:00, 16.94it/s]
100%|██████████| 104/104 [00:02<00:00, 37.21it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.7388, train f1 score :  0.7335, train acc :  0.8000

valid loss : 0.5227, valid f1 score :  0.7382, valid acc :  0.8195

-------------------
2epoch start
-------------------

learning rate :  0.000008


100%|██████████| 414/414 [00:24<00:00, 16.94it/s]
100%|██████████| 104/104 [00:02<00:00, 36.79it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.4034, train f1 score :  0.8387, train acc :  0.8619

valid loss : 0.3279, valid f1 score :  0.8705, valid acc :  0.8897

-------------------
3epoch start
-------------------

learning rate :  0.000006


100%|██████████| 414/414 [00:24<00:00, 16.96it/s]
100%|██████████| 104/104 [00:02<00:00, 36.40it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.3004, train f1 score :  0.8833, train acc :  0.8939

valid loss : 0.2849, valid f1 score :  0.8853, valid acc :  0.8954

-------------------
4epoch start
-------------------

learning rate :  0.000004


100%|██████████| 414/414 [00:24<00:00, 16.81it/s]
100%|██████████| 104/104 [00:02<00:00, 36.44it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

train loss : 0.2693, train f1 score :  0.8944, train acc :  0.9018

valid loss : 0.2760, valid f1 score :  0.8914, valid acc :  0.8942

-------------------
5epoch start
-------------------

learning rate :  0.000002


100%|██████████| 414/414 [00:24<00:00, 16.96it/s]
100%|██████████| 104/104 [00:02<00:00, 36.42it/s]


train loss : 0.2580, train f1 score :  0.9000, train acc :  0.9059

valid loss : 0.2720, valid f1 score :  0.8883, valid acc :  0.8954

best valid loss :  0.2849

best epoch : 3

best accuracy :  0.8954



Downloading:   0%|          | 0.00/1.35G [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

-------------------
1epoch start
-------------------

learning rate :  0.000010


100%|██████████| 414/414 [02:01<00:00,  3.42it/s]
100%|██████████| 104/104 [00:10<00:00,  9.66it/s]


best model saved

train loss : 0.6275, train f1 score :  0.7599, train acc :  0.7996

valid loss : 0.3616, valid f1 score :  0.8539, valid acc :  0.8809



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
2epoch start
-------------------

learning rate :  0.000008


100%|██████████| 414/414 [02:00<00:00,  3.43it/s]
100%|██████████| 104/104 [00:10<00:00,  9.63it/s]


best model saved

train loss : 0.3004, train f1 score :  0.8807, train acc :  0.8910

valid loss : 0.2540, valid f1 score :  0.8984, valid acc :  0.9033



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
3epoch start
-------------------

learning rate :  0.000006


100%|██████████| 414/414 [02:00<00:00,  3.43it/s]
100%|██████████| 104/104 [00:10<00:00,  9.61it/s]


best model saved

train loss : 0.2481, train f1 score :  0.9002, train acc :  0.9052

valid loss : 0.2458, valid f1 score :  0.9012, valid acc :  0.9048

-------------------
4epoch start
-------------------

learning rate :  0.000004


100%|██████████| 414/414 [02:00<00:00,  3.43it/s]
100%|██████████| 104/104 [00:10<00:00,  9.66it/s]


train loss : 0.2186, train f1 score :  0.9144, train acc :  0.9179

valid loss : 0.2537, valid f1 score :  0.9009, valid acc :  0.9045

-------------------
5epoch start

  0%|          | 0/414 [00:00<?, ?it/s]


-------------------

learning rate :  0.000002


100%|██████████| 414/414 [02:00<00:00,  3.44it/s]
100%|██████████| 104/104 [00:10<00:00,  9.67it/s]


train loss : 0.1987, train f1 score :  0.9202, train acc :  0.9230

valid loss : 0.2523, valid f1 score :  0.9027, valid acc :  0.9042

best valid loss :  0.2458

best epoch : 3

best accuracy :  0.9048

1 fold pass

2 fold pass

3 fold pass

4 fold pass

0 fold pass



Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

-------------------
1epoch start
-------------------

learning rate :  0.000010


100%|██████████| 414/414 [02:00<00:00,  3.44it/s]
100%|██████████| 104/104 [00:10<00:00,  9.58it/s]


best model saved

train loss : 0.6449, train f1 score :  0.7535, train acc :  0.7670

valid loss : 0.3313, valid f1 score :  0.8765, valid acc :  0.8927



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
2epoch start
-------------------

learning rate :  0.000008


100%|██████████| 414/414 [02:00<00:00,  3.43it/s]
100%|██████████| 104/104 [00:10<00:00,  9.60it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.2887, train f1 score :  0.8856, train acc :  0.8946

valid loss : 0.2756, valid f1 score :  0.8942, valid acc :  0.8960

-------------------
3epoch start
-------------------

learning rate :  0.000006


100%|██████████| 414/414 [02:00<00:00,  3.43it/s]
100%|██████████| 104/104 [00:10<00:00,  9.66it/s]


best model saved

train loss : 0.2386, train f1 score :  0.9033, train acc :  0.9082

valid loss : 0.2661, valid f1 score :  0.8980, valid acc :  0.9018



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
4epoch start
-------------------

learning rate :  0.000004


100%|██████████| 414/414 [02:00<00:00,  3.43it/s]
100%|██████████| 104/104 [00:10<00:00,  9.65it/s]


best model saved

train loss : 0.2090, train f1 score :  0.9159, train acc :  0.9191

valid loss : 0.2713, valid f1 score :  0.8979, valid acc :  0.9024



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
5epoch start
-------------------

learning rate :  0.000002


100%|██████████| 414/414 [02:00<00:00,  3.43it/s]
100%|██████████| 104/104 [00:10<00:00,  9.59it/s]


best model saved

train loss : 0.1842, train f1 score :  0.9259, train acc :  0.9280

valid loss : 0.2779, valid f1 score :  0.9002, valid acc :  0.9027

best valid loss :  0.2779

best epoch : 5

best accuracy :  0.9027

2 fold pass

3 fold pass

4 fold pass

0 fold pass

1 fold pass



Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

-------------------
1epoch start
-------------------

learning rate :  0.000010


100%|██████████| 414/414 [02:00<00:00,  3.44it/s]
100%|██████████| 104/104 [00:10<00:00,  9.58it/s]


best model saved

train loss : 0.6356, train f1 score :  0.7555, train acc :  0.7737

valid loss : 0.3447, valid f1 score :  0.8596, valid acc :  0.8782



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
2epoch start
-------------------

learning rate :  0.000008


100%|██████████| 414/414 [02:00<00:00,  3.43it/s]
100%|██████████| 104/104 [00:10<00:00,  9.46it/s]


best model saved

train loss : 0.2852, train f1 score :  0.8898, train acc :  0.8975

valid loss : 0.2863, valid f1 score :  0.8771, valid acc :  0.8900



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
3epoch start
-------------------

learning rate :  0.000006


100%|██████████| 414/414 [02:00<00:00,  3.43it/s]
100%|██████████| 104/104 [00:10<00:00,  9.55it/s]


best model saved

train loss : 0.2372, train f1 score :  0.9036, train acc :  0.9088

valid loss : 0.2683, valid f1 score :  0.8881, valid acc :  0.8948

-------------------
4epoch start
-------------------

learning rate :  0.000004


100%|██████████| 414/414 [02:00<00:00,  3.43it/s]
100%|██████████| 104/104 [00:10<00:00,  9.61it/s]


train loss : 0.2075, train f1 score :  0.9201, train acc :  0.9234

valid loss : 0.2698, valid f1 score :  0.8854, valid acc :  0.8909



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
5epoch start
-------------------

learning rate :  0.000002


100%|██████████| 414/414 [02:00<00:00,  3.43it/s]
100%|██████████| 104/104 [00:10<00:00,  9.54it/s]


train loss : 0.1829, train f1 score :  0.9299, train acc :  0.9323

valid loss : 0.2813, valid f1 score :  0.8868, valid acc :  0.8909

best valid loss :  0.2683

best epoch : 3

best accuracy :  0.8948

3 fold pass

4 fold pass

0 fold pass

1 fold pass

2 fold pass



Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

-------------------
1epoch start
-------------------

learning rate :  0.000010


100%|██████████| 414/414 [02:00<00:00,  3.43it/s]
100%|██████████| 104/104 [00:11<00:00,  9.36it/s]


best model saved

train loss : 0.6016, train f1 score :  0.7801, train acc :  0.8265

valid loss : 0.3131, valid f1 score :  0.8671, valid acc :  0.8812



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
2epoch start
-------------------

learning rate :  0.000008


100%|██████████| 414/414 [02:01<00:00,  3.42it/s]
100%|██████████| 104/104 [00:11<00:00,  9.40it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.2919, train f1 score :  0.8856, train acc :  0.8946

valid loss : 0.2625, valid f1 score :  0.8967, valid acc :  0.8990

-------------------
3epoch start
-------------------

learning rate :  0.000006


100%|██████████| 414/414 [02:01<00:00,  3.42it/s]
100%|██████████| 104/104 [00:11<00:00,  9.35it/s]


best model saved

train loss : 0.2413, train f1 score :  0.9024, train acc :  0.9077

valid loss : 0.2582, valid f1 score :  0.8948, valid acc :  0.9002



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
4epoch start
-------------------

learning rate :  0.000004


100%|██████████| 414/414 [02:01<00:00,  3.42it/s]
100%|██████████| 104/104 [00:11<00:00,  9.37it/s]


best model saved

train loss : 0.2115, train f1 score :  0.9148, train acc :  0.9186

valid loss : 0.2531, valid f1 score :  0.8949, valid acc :  0.9005



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
5epoch start
-------------------

learning rate :  0.000002


100%|██████████| 414/414 [02:01<00:00,  3.42it/s]
100%|██████████| 104/104 [00:11<00:00,  9.34it/s]


train loss : 0.1899, train f1 score :  0.9246, train acc :  0.9273

valid loss : 0.2590, valid f1 score :  0.8962, valid acc :  0.8999

best valid loss :  0.2531

best epoch : 4

best accuracy :  0.9005

4 fold pass

0 fold pass

1 fold pass

2 fold pass

3 fold pass



Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

-------------------
1epoch start
-------------------

learning rate :  0.000010


100%|██████████| 414/414 [02:00<00:00,  3.43it/s]
100%|██████████| 104/104 [00:11<00:00,  9.18it/s]


best model saved

train loss : 0.6358, train f1 score :  0.7467, train acc :  0.7538

valid loss : 0.3206, valid f1 score :  0.8742, valid acc :  0.8869



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
2epoch start
-------------------

learning rate :  0.000008


100%|██████████| 414/414 [02:01<00:00,  3.42it/s]
100%|██████████| 104/104 [00:11<00:00,  9.22it/s]
  0%|          | 0/414 [00:00<?, ?it/s]

best model saved

train loss : 0.2849, train f1 score :  0.8888, train acc :  0.8967

valid loss : 0.2614, valid f1 score :  0.8950, valid acc :  0.9024

-------------------
3epoch start
-------------------

learning rate :  0.000006


100%|██████████| 414/414 [02:01<00:00,  3.41it/s]
100%|██████████| 104/104 [00:11<00:00,  9.18it/s]


train loss : 0.2361, train f1 score :  0.9055, train acc :  0.9105

valid loss : 0.2511, valid f1 score :  0.8944, valid acc :  0.8993



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
4epoch start
-------------------

learning rate :  0.000004


100%|██████████| 414/414 [02:01<00:00,  3.41it/s]
100%|██████████| 104/104 [00:11<00:00,  9.24it/s]


train loss : 0.2037, train f1 score :  0.9206, train acc :  0.9239

valid loss : 0.2598, valid f1 score :  0.8951, valid acc :  0.8996



  0%|          | 0/414 [00:00<?, ?it/s]

-------------------
5epoch start
-------------------

learning rate :  0.000002


100%|██████████| 414/414 [02:01<00:00,  3.42it/s]
100%|██████████| 104/104 [00:11<00:00,  9.23it/s]


train loss : 0.1822, train f1 score :  0.9302, train acc :  0.9327

valid loss : 0.2662, valid f1 score :  0.8929, valid acc :  0.8954

best valid loss :  0.2614

best epoch : 2

best accuracy :  0.9024



IndexError: index 0 is out of bounds for axis 0 with size 0

# ensemble

In [30]:
# One entry per ensemble member, in evaluation order:
# (value assigned to args.pt, value assigned to args.max_len, fold checkpoint paths).
# Paths are kept exactly as they were written in the original cell.
_ENSEMBLE_GROUPS = (
    ('monologg/kobert', 128, (
        "./saved_models/fold3/monologg/kobert_128/0f_3e_0.8903_s.pth",
        "./saved_models/fold3/monologg/kobert_128/1f_5e_0.8933_s.pth",
        "./saved_models/fold3/monologg/kobert_128/2f_4e_0.8791_s.pth",
        "./saved_models/fold3/monologg/kobert_128/3f_4e_0.8860_s.pth",
        "./saved_models/fold3/monologg/kobert_128/4f_4e_0.8930_s.pth",
    )),
    ('klue/roberta-base', 128, (
        "./saved_models/fold3/klue/roberta-base_128/0f_3e_0.8942_s.pth",
        "./saved_models/fold3/klue/roberta-base_128/1f_4e_0.8972_s.pth",
        "./saved_models/fold3/klue/roberta-base_128/2f_4e_0.8891_s.pth",
        "./saved_models/fold3/klue/roberta-base_128/3f_4e_0.8990_s.pth",
        "./saved_models/fold3/klue/roberta-base_128/4f_3e_0.8987_s.pth",
    )),
    ('klue/roberta-small', 128, (
        "./saved_models/fold3/klue/roberta-small_128/0f_5e_0.9075_s.pth",
        "./saved_models/fold3/klue/roberta-small_128/1f_3e_0.8981_s.pth",
        "./saved_models/fold3/klue/roberta-small_128/2f_5e_0.8900_s.pth",
        "./saved_models/fold3/klue/roberta-small_128/3f_4e_0.8963_s.pth",
        # Original author's note on the next entry: "8884 possible"
        # (presumably an alternative checkpoint scoring 0.8884 -- TODO confirm).
        "./saved_models/fold3/klue/roberta-small_128/4f_3e_0.8954_s.pth",
    )),
    ('bert-base-multilingual-uncased', 128, (
        "./saved_models/fold3/bert-base-multilingual-uncased_128/0f_5e_0.8939_s.pth",
        "./saved_models/fold3/bert-base-multilingual-uncased_128/1f_5e_0.8903_s.pth",
        "./saved_models/fold3/bert-base-multilingual-uncased_128/2f_4e_0.8785_s.pth",
        "./saved_models/fold3/bert-base-multilingual-uncased_128/3f_5e_0.8848_s.pth",
        "./saved_models/fold3/bert-base-multilingual-uncased_128/4f_3e_0.8869_s.pth",
    )),
    ('klue/roberta-large', 128, (
        "./saved_models/fold3/klue/roberta-large_128/0f_3e_0.9048_s.pth",
        "./saved_models/fold3/klue/roberta-large_128/1f_5e_0.9027_s.pth",
        "./saved_models/fold3/klue/roberta-large_128/2f_3e_0.8948_s.pth",
        "./saved_models/fold3/klue/roberta-large_128/3f_4e_0.9005_s.pth",
        "./saved_models/fold3/klue/roberta-large_128/4f_2e_0.9024_s.pth",
    )),
    ('xlm-roberta-large', 128, (
        "./saved_models/fold3/xlm-roberta-large_128/0f_5e_0.9054_s.pth",
        "./saved_models/fold3/xlm-roberta-large_128/1f_4e_0.9011_s.pth",
        "./saved_models/fold3/xlm-roberta-large_128/2f_4e_0.8881_s.pth",
        "./saved_models/fold3/xlm-roberta-large_128/3f_4e_0.8900_s.pth",
        "./saved_models/fold3/xlm-roberta-large_128/4f_3e_0.8903_s.pth",
    )),
    ('klue/roberta-large', 96, (
        "./saved_models/fold3/klue/roberta-large_96/0f_5e_0.9039_s.pth",
        "./saved_models/fold3//klue/roberta-large_96/1f_5e_0.8930_s.pth",
        "./saved_models/fold3//klue/roberta-large_96/2f_5e_0.8948_s.pth",
        "./saved_models/fold3//klue/roberta-large_96/3f_4e_0.9030_s.pth",
        "./saved_models/fold3//klue/roberta-large_96/4f_5e_0.8942_s.pth",
    )),
)


def ensemble():
    """Combine the logits of every checkpoint listed in ``_ENSEMBLE_GROUPS``.

    For each group, the global ``args.max_len`` and ``args.pt`` are set, then
    ``run_predict`` is called once per checkpoint path. The second element of
    each return value (the logits) is averaged over the group's checkpoints,
    and the per-group averages are summed.

    Side effects:
        Mutates the global ``args``. On return, ``args.max_len`` and
        ``args.pt`` hold the values of the last group (96 and
        'klue/roberta-large'); they are not restored.

    Returns:
        The sum over groups of the mean logits of that group's checkpoints.
        The type is whatever ``run_predict`` returns for its logits after
        ``+`` and ``/`` are applied (presumably an array -- TODO confirm).
    """
    final_logit = 0
    for pretrained_name, max_len, checkpoint_paths in _ENSEMBLE_GROUPS:
        # run_predict only receives a path; it presumably reads the model
        # name and sequence length from the global args -- TODO confirm.
        args.max_len = max_len
        args.pt = pretrained_name

        # Accumulate left to right so the addition order matches
        # logit1 + logit2 + ... exactly.
        group_sum = None
        for checkpoint_path in checkpoint_paths:
            _, logit = run_predict(checkpoint_path)
            group_sum = logit if group_sum is None else group_sum + logit

        final_logit += group_sum / len(checkpoint_paths)

    return final_logit


In [31]:
# Run every checkpoint listed in ensemble() and keep the combined logits
# (the sum, over model groups, of each group's mean logits).
final_logit = ensemble()

test load
set testloader


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/222 [00:00<?, ?it/s]

load saved models


100%|██████████| 222/222 [00:08<00:00, 25.41it/s]


complete predict
test load
set testloader


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/222 [00:00<?, ?it/s]

load saved models


100%|██████████| 222/222 [00:08<00:00, 26.87it/s]


complete predict
test load
set testloader


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/222 [00:00<?, ?it/s]

load saved models


100%|██████████| 222/222 [00:08<00:00, 26.70it/s]


complete predict
test load
set testloader


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/222 [00:00<?, ?it/s]

load saved models


100%|██████████| 222/222 [00:08<00:00, 26.69it/s]


complete predict
test load
set testloader


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/222 [00:00<?, ?it/s]

load saved models


100%|██████████| 222/222 [00:08<00:00, 26.93it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier

load saved models


100%|██████████| 222/222 [00:08<00:00, 26.96it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier

load saved models


100%|██████████| 222/222 [00:08<00:00, 25.30it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier

load saved models


100%|██████████| 222/222 [00:08<00:00, 25.28it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier

load saved models


100%|██████████| 222/222 [00:08<00:00, 26.39it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier

load saved models


100%|██████████| 222/222 [00:08<00:00, 25.73it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

load saved models


100%|██████████| 222/222 [00:05<00:00, 42.04it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

load saved models


100%|██████████| 222/222 [00:04<00:00, 45.06it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

load saved models


100%|██████████| 222/222 [00:04<00:00, 45.81it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

load saved models


100%|██████████| 222/222 [00:05<00:00, 41.65it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

load saved models


100%|██████████| 222/222 [00:04<00:00, 44.86it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

load saved models


100%|██████████| 222/222 [00:08<00:00, 25.53it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

load saved models


100%|██████████| 222/222 [00:08<00:00, 26.47it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

load saved models


100%|██████████| 222/222 [00:08<00:00, 26.64it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

load saved models


100%|██████████| 222/222 [00:08<00:00, 25.57it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

load saved models


100%|██████████| 222/222 [00:08<00:00, 26.88it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

load saved models


100%|██████████| 222/222 [00:21<00:00, 10.45it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

load saved models


100%|██████████| 222/222 [00:21<00:00, 10.39it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

load saved models


100%|██████████| 222/222 [00:21<00:00, 10.14it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

load saved models


100%|██████████| 222/222 [00:21<00:00, 10.29it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

load saved models


100%|██████████| 222/222 [00:21<00:00, 10.36it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.den

load saved models


100%|██████████| 222/222 [00:21<00:00, 10.42it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.den

load saved models


100%|██████████| 222/222 [00:21<00:00, 10.44it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.den

load saved models


100%|██████████| 222/222 [00:21<00:00, 10.34it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.den

load saved models


100%|██████████| 222/222 [00:21<00:00, 10.36it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.den

load saved models


100%|██████████| 222/222 [00:21<00:00, 10.29it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

load saved models


100%|██████████| 222/222 [00:15<00:00, 13.97it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

load saved models


100%|██████████| 222/222 [00:16<00:00, 13.76it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

load saved models


100%|██████████| 222/222 [00:16<00:00, 13.81it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

load saved models


100%|██████████| 222/222 [00:16<00:00, 13.47it/s]


complete predict
test load
set testloader


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

load saved models


100%|██████████| 222/222 [00:15<00:00, 14.13it/s]

complete predict





In [42]:
# Bind the prediction logits to the name used when decoding the '유형' (type)
# label. This is an alias, not a copy: both names refer to the same object.
final_logit_type = final_logit

In [39]:
# Load the submission template and the labelled training data.
submission = pd.read_csv('./data/sample_submission.csv')
train = pd.read_csv('./data/train.csv')

# Label-encode the four target columns: 유형 (type), 극성 (polarity),
# 시제 (tense) and 확실성 (certainty). Every column gets its own encoder so
# that integer class ids can be decoded again later (as is done with
# `type_le.inverse_transform` further down in this notebook).
from sklearn.preprocessing import LabelEncoder

type_le, polarity_le, tense_le, certainty_le = (LabelEncoder() for _ in range(4))

for target_column, target_encoder in (
    ("유형", type_le),
    ("극성", polarity_le),
    ("시제", tense_le),
    ("확실성", certainty_le),
):
    # Fit on the raw labels and overwrite the column with the integer codes.
    train[target_column] = target_encoder.fit_transform(train[target_column].values)

# Keep the notebook namespace free of the loop helpers.
del target_column, target_encoder


In [43]:
# Turn each row of `final_logit_type` into its arg-max class index, decode the
# indices back to the original '유형' (type) labels with the fitted encoder,
# and write the labels into the submission frame.
#
# The indices are stored under their own name instead of overwriting
# `final_logit_type`: the previous self-assignment made the cell fail on a
# second run, because argmax(axis=1) on the already-reduced 1-D array raises
# an AxisError.
type_pred_idx = np.argmax(final_logit_type, axis=1)
type_pred_ang = type_le.inverse_transform(type_pred_idx)
submission['유형'] = type_pred_ang

# submission

In [None]:
# Load the sample submission and store the arg-max class index of every
# logit row in the 'topic_idx' column.
# Path fix: this previously read ".data/sample_submission.csv" (missing "/"),
# which does not match the "./data/" directory the same file is loaded from
# earlier in this notebook.
# NOTE(review): 'topic_idx' differs from the '유형' column filled in above;
# confirm which column the final submission is expected to carry.
sub = pd.read_csv("./data/sample_submission.csv")
sub['topic_idx'] = final_logit.argmax(1)
# preds
#sub.to_csv('./submission/final_submission.csv', index=False)

