<a href="https://colab.research.google.com/github/mk0653/untitled/blob/master/iryou_bert_FT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import sys

#from google.colab import drive
#drive.mount('/gdrive')


In [2]:
!pip install -q transformers

[K     |████████████████████████████████| 2.8 MB 15.7 MB/s 
[K     |████████████████████████████████| 636 kB 82.0 MB/s 
[K     |████████████████████████████████| 3.3 MB 48.5 MB/s 
[K     |████████████████████████████████| 895 kB 52.6 MB/s 
[K     |████████████████████████████████| 52 kB 2.0 MB/s 
[?25h

In [3]:
import math
import random
import time
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers as T
import torch.optim as optim
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,AutoConfig,
                          get_cosine_schedule_with_warmup)
from sklearn.metrics import fbeta_score
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

In [4]:
# Folder the competition CSVs (train.csv, test.csv, sample_submit.csv) are read from.
DATA_DIR = "/content/drive/MyDrive/[NLP]医療論文仕分けコンペ/"
# Folder that receives the per-fold checkpoints, the out-of-fold predictions and the submission.
OUTPUT_DIR = "/content/drive/MyDrive/output/models/"

In [5]:
# Suppress everything issued through the `warnings` module for the rest of the session.
warnings.filterwarnings("ignore")

In [6]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [7]:
def init_logger(log_file= "./train.log"):
    """Return this module's logger, echoing bare messages to the console and to a file.

    The function is safe to call repeatedly (e.g. when a notebook cell is re-run):
    handlers left over from a previous call are removed first, so every message is
    emitted exactly once per destination.

    Args:
        log_file: Path of the file the messages are written to.

    Returns:
        The ``logging.Logger`` registered under ``__name__`` at level ``INFO``.
    """
    from logging import INFO, FileHandler, Formatter, StreamHandler, getLogger

    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # getLogger() returns the same object on every call; without this cleanup each
    # call would stack another handler pair and duplicate every log line.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    message_only = Formatter("%(message)s")
    for handler in (StreamHandler(), FileHandler(filename=log_file)):
        handler.setFormatter(message_only)
        logger.addHandler(handler)
    return logger

# Notebook-wide logger used by the training and inference code; writes to ./train.log by default.
LOGGER = init_logger()

In [8]:
def seed_torch(seed=42):
    """Seed every random number generator the notebook relies on.

    Covers Python's ``random``, the ``PYTHONHASHSEED`` environment variable, NumPy
    and PyTorch (CPU and all CUDA devices), and configures cuDNN for repeatable
    results.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # With benchmarking enabled cuDNN may select different kernels from run to run,
    # which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False

# Global seed: applied to all RNGs here and reused as the StratifiedKFold random_state.
seed = 471
seed_torch(seed)

In [9]:
class Config:
    """Model locations and hyper-parameters shared across the notebook."""

    # Directory the pretrained tokenizer and model weights are loaded from.
    pretrained_model_path = "/content/drive/MyDrive/RoBERTa-base-PM-M3-Voc-train-longer/RoBERTa-base-PM-M3-Voc-train-longer-hf"
    # Name handed to BaseDataset as its `model_name` argument.
    model_name = "roberta-base"
    # Mini-batch size of the training and validation loaders.
    batch_size = 16
    # Base learning rate read by create_optimizer.
    lr = 2e-5

In [10]:
# Load the competition files. The sample submission is read without a header row,
# so its two columns are named by hand.
train = pd.read_csv(DATA_DIR + "train.csv")
test = pd.read_csv(DATA_DIR + "test.csv")
sub = pd.read_csv(DATA_DIR + "sample_submit.csv", header=None)
sub.columns = ["id", "judgement"]
# Strip trailing periods from the titles (presumably so the "." separator used
# below does not produce a doubled period).
train['title'] = train['title'].str.rstrip(".")
test['title'] = test['title'].str.rstrip(".")
# Overwrite `abstract` with "<title>.<abstract>"; missing values become empty strings.
train['abstract'] = train['title'].str.cat(train['abstract'],sep='.', na_rep='')
test['abstract'] = test['title'].str.cat(test['abstract'],sep='.', na_rep='')


In [11]:
# Write the test frame (with its rewritten `abstract` column) to test_re.csv in the working directory.
test.to_csv("test_re.csv")

In [12]:
# Decision threshold: model outputs below this value are mapped to 0, all others to 1.
# Alternative kept for reference: the share of positive labels in the training set.
#border = len(train[train["judgement"] == 1]) / len(train["judgement"])
border = 0.020
print(border)

0.02


In [13]:
def get_train_data(train, n_splits=5):
    """Add a ``fold`` column with stratified cross-validation fold ids.

    Rows are split with ``StratifiedKFold`` (shuffled, seeded with the module-level
    ``seed``) on the ``judgement`` column; each row receives the number of the fold
    in which it belongs to the validation part.

    Args:
        train: Training frame with a ``judgement`` column. The ``fold`` column is
            added to this frame in place.
        n_splits: Number of folds.

    Returns:
        The same ``train`` frame, now carrying a ``uint8`` ``fold`` column.
    """
    # Assign fold numbers for cross-validation.
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    # split() yields *positional* indices. Collecting the ids in a positional array
    # (rather than writing through label-based .loc) keeps the assignment correct
    # even when the frame's index is not 0..n-1.
    fold_ids = np.zeros(len(train), dtype=np.uint8)
    for n, (_, val_index) in enumerate(splitter.split(train, train["judgement"])):
        fold_ids[val_index] = n
    train["fold"] = fold_ids

    return train

In [14]:
def get_test_data(test):
    """Return ``test`` unchanged; no preprocessing is applied to the test frame here."""
    return test

In [15]:
# Attach a cross-validation fold id to every training row.
train = get_train_data(train)

In [16]:
train.head()

Unnamed: 0,id,title,abstract,judgement,fold
0,0,One-year age changes in MRI brain volumes in o...,One-year age changes in MRI brain volumes in o...,0,0
1,1,Supportive CSF biomarker evidence to enhance t...,Supportive CSF biomarker evidence to enhance t...,0,1
2,2,Occurrence of basal ganglia germ cell tumors w...,Occurrence of basal ganglia germ cell tumors w...,0,4
3,3,New developments in diagnosis and therapy of C...,New developments in diagnosis and therapy of C...,0,3
4,4,Prolonged shedding of SARS-CoV-2 in an elderly...,Prolonged shedding of SARS-CoV-2 in an elderly...,0,1


In [17]:
test.head()

Unnamed: 0,id,title,abstract
0,27145,Estimating the potential effects of COVID-19 p...,Estimating the potential effects of COVID-19 p...
1,27146,Leukoerythroblastic reaction in a patient with...,Leukoerythroblastic reaction in a patient with...
2,27147,[15O]-water PET and intraoperative brain mappi...,[15O]-water PET and intraoperative brain mappi...
3,27148,Adaptive image segmentation for robust measure...,Adaptive image segmentation for robust measure...
4,27149,Comparison of Epidemiological Variations in CO...,Comparison of Epidemiological Variations in CO...


In [18]:
class BaseDataset(Dataset):
    """Tokenised texts from the ``abstract`` column, optionally paired with labels.

    All texts are tokenised once in the constructor and padded/truncated to a fixed
    length, so ``__getitem__`` only converts the stored ids to tensors.

    Args:
        df: Frame with an ``abstract`` text column and, when ``include_labels`` is
            true, a ``judgement`` column.
        model_name: Accepted for backward compatibility only. The tokenizer is
            always loaded from ``Config.pretrained_model_path`` so that it matches
            the checkpoint the model is built from.
        include_labels: Whether items also carry the label.
        max_length: Number of tokens every text is padded or truncated to.
    """

    def __init__(self, df, model_name, include_labels=True, max_length=256):
        tokenizer = AutoTokenizer.from_pretrained(Config.pretrained_model_path)

        self.df = df
        self.include_labels = include_labels

        # Historical attribute name: these are the texts of the `abstract` column.
        self.title = df["abstract"].tolist()
        # Calling the tokenizer directly is the supported replacement for the
        # deprecated batch_encode_plus() and takes the same keyword arguments.
        self.encoded = tokenizer(
            self.title,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_attention_mask=True,
        )

        if self.include_labels:
            self.labels = df["judgement"].values

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` plus a float label when labels are included."""
        input_ids = torch.tensor(self.encoded["input_ids"][idx])
        attention_mask = torch.tensor(self.encoded["attention_mask"][idx])

        if self.include_labels:
            # Float labels, as the model output they are compared with is a probability.
            label = torch.tensor(self.labels[idx]).float()
            return input_ids, attention_mask, label

        return input_ids, attention_mask

In [19]:
import torch
import torch.nn as nn

class AttentionHead(nn.Module):
    """Additive attention pooling: collapses dimension 1 of the input into a weighted sum.

    Args:
        h_size: Size of the last dimension of the incoming features.
        hidden_dim: Width of the intermediate projection used to score positions.
    """

    def __init__(self, h_size, hidden_dim=512):
        super().__init__()
        self.W = nn.Linear(h_size, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        """Score each position, normalise the scores over dim 1 and return the weighted sum."""
        scores = self.V(torch.tanh(self.W(features)))
        weights = torch.softmax(scores, dim=1)
        return torch.sum(weights * features, dim=1)

class BaseModel(nn.Module):
    """Sequence classifier with a single output unit squashed to a probability.

    Args:
        model_name: Name or path passed to ``AutoConfig`` and
            ``AutoModelForSequenceClassification`` (loaded with ``num_labels=1``).
    """

    def __init__(self, model_name):
        super().__init__()

        self.config = AutoConfig.from_pretrained(model_name, num_labels=1)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, config=self.config)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return one probability per sample as a 1-D tensor of length ``batch``."""
        out = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Drop only the trailing logit dimension. A bare squeeze() would also drop
        # the batch dimension for a 1-sample batch, yielding a 0-d tensor that no
        # longer matches the label shape and cannot be concatenated downstream.
        return self.sigmoid(out.logits).squeeze(-1)

In [20]:
class AverageMeter(object):
    """Tracks the latest value of a scalar together with its running weighted mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set the latest value, mean, weighted sum and count back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as seen ``n`` times and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration given in seconds as ``"<minutes>m <seconds>s"``."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return "%dm %ds" % (minutes, seconds)


def timeSince(since, percent):
    """Describe elapsed and estimated remaining time.

    Args:
        since: Start timestamp as returned by ``time.time()``.
        percent: Fraction of the work completed so far; must be non-zero.

    Returns:
        A string of the form ``"<elapsed> (remain <remaining>)"``.
    """
    elapsed = time.time() - since
    # Extrapolate the total duration from the share of work already done.
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return "%s (remain %s)" % (asMinutes(elapsed), asMinutes(remaining))

In [21]:
def train_fn(train_loader, model, criterion, optimizer, epoch, device):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        train_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        model: Module called as ``model(input_ids, attention_mask)``.
        criterion: Loss called as ``criterion(predictions, labels)``.
        optimizer: Optimizer stepped once per batch.
        epoch: Zero-based epoch number, used only in the progress output.
        device: Device the batch tensors are moved to.

    Returns:
        The mean loss of the epoch, weighted by batch size.
    """
    began = time.time()
    loss_meter = AverageMeter()

    # Enable training-time behaviour of the model.
    model.train()

    for step, batch in enumerate(train_loader):
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()

        input_ids, attention_mask, labels = (
            input_ids.to(device),
            attention_mask.to(device),
            labels.to(device),
        )

        y_preds = model(input_ids, attention_mask)
        loss = criterion(y_preds, labels)

        # Weight the batch loss by the number of samples it covers.
        loss_meter.update(loss.item(), labels.size(0))

        loss.backward()
        optimizer.step()

        # Report every 100th step and the final one.
        is_last_step = step == (len(train_loader) - 1)
        if step % 100 == 0 or is_last_step:
            print(
                f"Epoch: [{epoch + 1}][{step}/{len(train_loader)}] "
                f"Elapsed {timeSince(began, float(step + 1) / len(train_loader)):s} "
                f"Loss: {loss_meter.avg:.4f} "
            )

    return loss_meter.avg

In [22]:
def valid_fn(valid_loader, model, criterion, device):
    """Evaluate ``model`` on ``valid_loader`` without updating it.

    Args:
        valid_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        model: Module called as ``model(input_ids, attention_mask)``.
        criterion: Loss called as ``criterion(predictions, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        A tuple ``(mean_loss, predictions)``: the batch-size-weighted mean loss and
        a 1-D NumPy array with the model outputs of all batches, in loader order.
    """
    start = time.time()
    losses = AverageMeter()

    # switch to evaluation mode
    model.eval()
    preds = []

    for step, (input_ids, attention_mask, labels) in enumerate(valid_loader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)

        # Neither the forward pass nor the loss needs gradients here.
        with torch.no_grad():
            # reshape(-1) keeps a 1-sample batch as shape (1,): a 0-d output would
            # mismatch the label shape and could not be concatenated below.
            y_preds = model(input_ids, attention_mask).reshape(-1)
            loss = criterion(y_preds, labels)
        losses.update(loss.item(), batch_size)

        # record score
        preds.append(y_preds.to("cpu").numpy())

        if step % 100 == 0 or step == (len(valid_loader) - 1):
            print(
                f"EVAL: [{step}/{len(valid_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(valid_loader)):s} "
                f"Loss: {losses.avg:.4f} "
            )

    predictions = np.concatenate(preds)
    return losses.avg, predictions

In [23]:
def inference():
    """Predict the test set with each of the five fold checkpoints and average the results.

    Reads the module-level ``test`` frame and the ``robertaPMBase_fold{n}_best.pth``
    checkpoints stored under ``OUTPUT_DIR``.

    Returns:
        A 1-D NumPy array holding, per test row, the mean of the five fold outputs.
    """
    predictions = []

    test_dataset = BaseDataset(test, Config.model_name, include_labels=False)
    test_loader = DataLoader(
        test_dataset, batch_size=Config.batch_size, shuffle=False, num_workers=4, pin_memory=True
    )

    for fold in range(5):
        LOGGER.info(f"========== model:robertaPMBase fold: {fold} inference ==========")
        model = BaseModel(Config.pretrained_model_path)
        model.to(device)
        # map_location lets checkpoints saved on a GPU be restored on whatever
        # device is available now (including CPU-only sessions).
        checkpoint = torch.load(
            OUTPUT_DIR + f"robertaPMBase_fold{fold}_best.pth", map_location=device
        )
        model.load_state_dict(checkpoint["model"])
        model.eval()
        preds = []
        for i, (input_ids, attention_mask) in tqdm(enumerate(test_loader), total=len(test_loader)):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            with torch.no_grad():
                y_preds = model(input_ids, attention_mask)
            # reshape(-1) guards against a 0-d output for a 1-sample final batch,
            # which np.concatenate would reject.
            preds.append(y_preds.reshape(-1).to("cpu").numpy())
        preds = np.concatenate(preds)
        predictions.append(preds)
    # Average the per-fold outputs row by row.
    predictions = np.mean(predictions, axis=0)

    return predictions

In [24]:
def create_optimizer(model, base_lr=None):
    """Build an AdamW optimizer with a stepped learning rate over the backbone.

    The first 197 named parameters are treated as the backbone (presumably the
    parameter count of a RoBERTa-base encoder without pooler -- confirm before using
    another architecture). Each of them gets its own group:

    * weight decay ``0.0`` when the name contains ``"bias"``, else ``0.01``;
    * learning rate ``base_lr`` for positions 0-68, ``base_lr * 2.5`` for 69-132
      and ``base_lr * 5`` from 133 on.

    Every remaining parameter (the head) goes into a single group that uses AdamW's
    default hyper-parameters. Earlier versions sliced the head as ``[199:203]`` and
    ``[203:]``, which left parameters 197 and 198 out of the optimizer entirely.

    Args:
        model: Module whose ``named_parameters()`` are optimised.
        base_lr: Learning rate of the lowest backbone tier; defaults to ``Config.lr``.

    Returns:
        A ``torch.optim.AdamW`` instance covering every parameter of ``model``.
    """
    if base_lr is None:
        base_lr = Config.lr

    named_parameters = list(model.named_parameters())

    backbone_parameters = named_parameters[:197]
    head_parameters = named_parameters[197:]

    parameters = []

    head_group = [params for (_, params) in head_parameters]
    if head_group:
        parameters.append({"params": head_group})

    for layer_num, (name, params) in enumerate(backbone_parameters):
        weight_decay = 0.0 if "bias" in name else 0.01

        # Later parameters are updated with progressively larger learning rates.
        if layer_num >= 133:
            lr = base_lr * 5
        elif layer_num >= 69:
            lr = base_lr * 2.5
        else:
            lr = base_lr

        parameters.append({"params": params,
                           "weight_decay": weight_decay,
                           "lr": lr})

    return optim.AdamW(parameters)

In [25]:
def train_loop(train, fold):
    """Train and validate one cross-validation fold.

    Rows whose ``fold`` value equals ``fold`` form the validation set; all other
    rows are used for training. After every epoch the validation outputs are
    thresholded at the module-level ``border`` and scored with F-beta (beta=7).
    Whenever the score improves, the model weights and the validation outputs are
    saved to ``OUTPUT_DIR/robertaPMBase_fold{fold}_best.pth``.

    Args:
        train: Training frame with ``abstract``, ``judgement`` and ``fold`` columns.
        fold: Number of the fold held out for validation.

    Returns:
        The validation rows (index reset) with a ``preds`` column holding the model
        outputs of the best-scoring epoch.
    """

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # Data Loader
    # ====================================================
    trn_idx = train[train["fold"] != fold].index
    val_idx = train[train["fold"] == fold].index

    train_folds = train.loc[trn_idx].reset_index(drop=True)
    valid_folds = train.loc[val_idx].reset_index(drop=True)

    train_dataset = BaseDataset(train_folds, Config.model_name)
    valid_dataset = BaseDataset(valid_folds, Config.model_name)

    train_loader = DataLoader(
        train_dataset,
        batch_size=Config.batch_size,
        shuffle=True,
        num_workers=0,
        pin_memory=True,
        drop_last=True,
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=Config.batch_size,
        shuffle=False,
        num_workers=0,
        pin_memory=True,
        drop_last=False,
    )

    # ====================================================
    # Model
    # ====================================================
    model = BaseModel(Config.pretrained_model_path)
    model.to(device)

    # Take the learning rate from the shared configuration instead of repeating
    # the literal here.
    optimizer = optim.AdamW(model.parameters(), lr=Config.lr)

    criterion = nn.BCELoss()

    # ====================================================
    # Loop
    # ====================================================
    # torch.save does not create missing directories.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    checkpoint_path = OUTPUT_DIR + f"robertaPMBase_fold{fold}_best.pth"

    best_score = -1
    best_preds = None
    valid_labels = valid_folds["judgement"].values

    for epoch in range(3):
        start_time = time.time()

        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)

        # scoring
        score = fbeta_score(valid_labels, np.where(preds < border, 0, 1), beta=7.0)

        elapsed = time.time() - start_time
        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s"
        )
        LOGGER.info(f"Epoch {epoch+1} - Score: {score}")

        if score > best_score:
            best_score = score
            # Keep the best outputs in memory so they need not be read back from
            # the checkpoint (which could be a stale file from an earlier run).
            best_preds = preds
            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
            torch.save({"model": model.state_dict(), "preds": preds}, checkpoint_path)

    valid_folds["preds"] = best_preds

    return valid_folds

In [26]:
def get_result(result_df):
    """Log the F-beta (beta=7) score of thresholded predictions.

    Args:
        result_df: Frame with a ``preds`` column of model outputs and a ``judgement``
            column of true labels. Outputs below the module-level ``border`` count
            as 0, all others as 1.
    """
    hard_preds = np.where(result_df["preds"].values < border, 0, 1)
    score = fbeta_score(result_df["judgement"].values, hard_preds, beta=7.0)
    LOGGER.info(f"Score: {score:<.5f}")

In [27]:
def main():
    """Run the whole pipeline: 5-fold training, CV scoring, inference and submission.

    Side effects: writes ``oof_df.csv`` and ``submission.csv`` to ``OUTPUT_DIR`` and
    overwrites the ``judgement`` column of the module-level ``sub`` frame.
    """
    # Training
    # Collect the per-fold frames and concatenate once; growing a DataFrame inside
    # the loop re-copies all earlier rows on every iteration.
    fold_frames = []
    for fold in range(5):
        _oof_df = train_loop(train, fold)
        fold_frames.append(_oof_df)
        LOGGER.info(f"========== fold: {fold} result ==========")
        get_result(_oof_df)
    oof_df = pd.concat(fold_frames)

    # CV result
    LOGGER.info(f"========== CV ==========")
    get_result(oof_df)

    # Save OOF result
    oof_df.to_csv(OUTPUT_DIR + "oof_df.csv", index=False)

    # Inference
    predictions = inference()
    # Binarise the averaged outputs with the same threshold used for validation.
    predictions = np.where(predictions < border, 0, 1)

    # submission
    sub["judgement"] = predictions
    sub.to_csv(OUTPUT_DIR + "submission.csv", index=False, header=False)
      

In [28]:
# Entry point: run the full train / evaluate / predict pipeline.
if __name__ == "__main__":
    main()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at /content/drive/MyDrive/RoBERTa-base-PM-M3-Voc-train-longer/RoBERTa-base-PM-M3-Voc-train-longer-hf were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint

Epoch: [1][0/1357] Elapsed 0m 0s (remain 9m 4s) Loss: 0.6284 
Epoch: [1][100/1357] Elapsed 0m 23s (remain 4m 48s) Loss: 0.1507 
Epoch: [1][200/1357] Elapsed 0m 46s (remain 4m 24s) Loss: 0.1370 
Epoch: [1][300/1357] Elapsed 1m 8s (remain 4m 2s) Loss: 0.1225 
Epoch: [1][400/1357] Elapsed 1m 31s (remain 3m 39s) Loss: 0.1128 
Epoch: [1][500/1357] Elapsed 1m 54s (remain 3m 16s) Loss: 0.1048 
Epoch: [1][600/1357] Elapsed 2m 17s (remain 2m 53s) Loss: 0.0977 
Epoch: [1][700/1357] Elapsed 2m 40s (remain 2m 30s) Loss: 0.0941 
Epoch: [1][800/1357] Elapsed 3m 3s (remain 2m 7s) Loss: 0.0893 
Epoch: [1][900/1357] Elapsed 3m 26s (remain 1m 44s) Loss: 0.0850 
Epoch: [1][1000/1357] Elapsed 3m 49s (remain 1m 21s) Loss: 0.0817 
Epoch: [1][1100/1357] Elapsed 4m 12s (remain 0m 58s) Loss: 0.0788 
Epoch: [1][1200/1357] Elapsed 4m 35s (remain 0m 35s) Loss: 0.0756 
Epoch: [1][1300/1357] Elapsed 4m 58s (remain 0m 12s) Loss: 0.0735 
Epoch: [1][1356/1357] Elapsed 5m 11s (remain 0m 0s) Loss: 0.0723 
EVAL: [0/340] 

Epoch 1 - avg_train_loss: 0.0723  avg_val_loss: 0.0397  time: 336s
Epoch 1 - Score: 0.9142685851318945
Epoch 1 - Save Best Score: 0.9143 Model


EVAL: [339/340] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0397 
Epoch: [2][0/1357] Elapsed 0m 0s (remain 5m 33s) Loss: 0.0030 
Epoch: [2][100/1357] Elapsed 0m 23s (remain 4m 48s) Loss: 0.0302 
Epoch: [2][200/1357] Elapsed 0m 46s (remain 4m 25s) Loss: 0.0293 
Epoch: [2][300/1357] Elapsed 1m 9s (remain 4m 2s) Loss: 0.0375 
Epoch: [2][400/1357] Elapsed 1m 32s (remain 3m 39s) Loss: 0.0396 
Epoch: [2][500/1357] Elapsed 1m 55s (remain 3m 16s) Loss: 0.0400 
Epoch: [2][600/1357] Elapsed 2m 17s (remain 2m 53s) Loss: 0.0377 
Epoch: [2][700/1357] Elapsed 2m 40s (remain 2m 30s) Loss: 0.0364 
Epoch: [2][800/1357] Elapsed 3m 3s (remain 2m 7s) Loss: 0.0354 
Epoch: [2][900/1357] Elapsed 3m 26s (remain 1m 44s) Loss: 0.0348 
Epoch: [2][1000/1357] Elapsed 3m 49s (remain 1m 21s) Loss: 0.0341 
Epoch: [2][1100/1357] Elapsed 4m 12s (remain 0m 58s) Loss: 0.0343 
Epoch: [2][1200/1357] Elapsed 4m 35s (remain 0m 35s) Loss: 0.0359 
Epoch: [2][1300/1357] Elapsed 4m 58s (remain 0m 12s) Loss: 0.0366 
Epoch: [2][1356/135

Epoch 2 - avg_train_loss: 0.0364  avg_val_loss: 0.0410  time: 336s
Epoch 2 - Score: 0.9151030452168565
Epoch 2 - Save Best Score: 0.9151 Model


EVAL: [339/340] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0410 
Epoch: [3][0/1357] Elapsed 0m 0s (remain 5m 15s) Loss: 0.0122 
Epoch: [3][100/1357] Elapsed 0m 23s (remain 4m 48s) Loss: 0.0242 
Epoch: [3][200/1357] Elapsed 0m 46s (remain 4m 25s) Loss: 0.0224 
Epoch: [3][300/1357] Elapsed 1m 9s (remain 4m 2s) Loss: 0.0231 
Epoch: [3][400/1357] Elapsed 1m 32s (remain 3m 39s) Loss: 0.0224 
Epoch: [3][500/1357] Elapsed 1m 55s (remain 3m 16s) Loss: 0.0233 
Epoch: [3][600/1357] Elapsed 2m 18s (remain 2m 53s) Loss: 0.0234 
Epoch: [3][700/1357] Elapsed 2m 40s (remain 2m 30s) Loss: 0.0242 
Epoch: [3][800/1357] Elapsed 3m 3s (remain 2m 7s) Loss: 0.0241 
Epoch: [3][900/1357] Elapsed 3m 26s (remain 1m 44s) Loss: 0.0249 
Epoch: [3][1000/1357] Elapsed 3m 49s (remain 1m 21s) Loss: 0.0258 
Epoch: [3][1100/1357] Elapsed 4m 12s (remain 0m 58s) Loss: 0.0263 
Epoch: [3][1200/1357] Elapsed 4m 35s (remain 0m 35s) Loss: 0.0259 
Epoch: [3][1300/1357] Elapsed 4m 58s (remain 0m 12s) Loss: 0.0251 
Epoch: [3][1356/135

Epoch 3 - avg_train_loss: 0.0248  avg_val_loss: 0.0444  time: 336s
Epoch 3 - Score: 0.8522283033620016


EVAL: [339/340] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0444 


Score: 0.91510
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at /content/drive/MyDrive/RoBERTa-base-PM-M3-Voc-train-longer/RoBERTa-base-PM-M3-Voc-train-longer-hf were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from

Epoch: [1][0/1357] Elapsed 0m 0s (remain 5m 36s) Loss: 0.7075 
Epoch: [1][100/1357] Elapsed 0m 23s (remain 4m 47s) Loss: 0.1501 
Epoch: [1][200/1357] Elapsed 0m 46s (remain 4m 25s) Loss: 0.1269 
Epoch: [1][300/1357] Elapsed 1m 9s (remain 4m 2s) Loss: 0.1098 
Epoch: [1][400/1357] Elapsed 1m 32s (remain 3m 39s) Loss: 0.1023 
Epoch: [1][500/1357] Elapsed 1m 55s (remain 3m 16s) Loss: 0.0936 
Epoch: [1][600/1357] Elapsed 2m 17s (remain 2m 53s) Loss: 0.0895 
Epoch: [1][700/1357] Elapsed 2m 40s (remain 2m 30s) Loss: 0.0843 
Epoch: [1][800/1357] Elapsed 3m 3s (remain 2m 7s) Loss: 0.0792 
Epoch: [1][900/1357] Elapsed 3m 26s (remain 1m 44s) Loss: 0.0770 
Epoch: [1][1000/1357] Elapsed 3m 49s (remain 1m 21s) Loss: 0.0758 
Epoch: [1][1100/1357] Elapsed 4m 12s (remain 0m 58s) Loss: 0.0733 
Epoch: [1][1200/1357] Elapsed 4m 35s (remain 0m 35s) Loss: 0.0721 
Epoch: [1][1300/1357] Elapsed 4m 58s (remain 0m 12s) Loss: 0.0695 
Epoch: [1][1356/1357] Elapsed 5m 11s (remain 0m 0s) Loss: 0.0681 
EVAL: [0/340]

Epoch 1 - avg_train_loss: 0.0681  avg_val_loss: 0.0463  time: 336s
Epoch 1 - Score: 0.8728564807662601
Epoch 1 - Save Best Score: 0.8729 Model


EVAL: [339/340] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0463 
Epoch: [2][0/1357] Elapsed 0m 0s (remain 5m 39s) Loss: 0.0061 
Epoch: [2][100/1357] Elapsed 0m 23s (remain 4m 49s) Loss: 0.0363 
Epoch: [2][200/1357] Elapsed 0m 46s (remain 4m 25s) Loss: 0.0379 
Epoch: [2][300/1357] Elapsed 1m 9s (remain 4m 2s) Loss: 0.0374 
Epoch: [2][400/1357] Elapsed 1m 32s (remain 3m 39s) Loss: 0.0391 
Epoch: [2][500/1357] Elapsed 1m 55s (remain 3m 16s) Loss: 0.0372 
Epoch: [2][600/1357] Elapsed 2m 18s (remain 2m 53s) Loss: 0.0380 
Epoch: [2][700/1357] Elapsed 2m 41s (remain 2m 30s) Loss: 0.0373 
Epoch: [2][800/1357] Elapsed 3m 4s (remain 2m 7s) Loss: 0.0373 
Epoch: [2][900/1357] Elapsed 3m 27s (remain 1m 44s) Loss: 0.0373 
Epoch: [2][1000/1357] Elapsed 3m 50s (remain 1m 21s) Loss: 0.0380 
Epoch: [2][1100/1357] Elapsed 4m 13s (remain 0m 58s) Loss: 0.0379 
Epoch: [2][1200/1357] Elapsed 4m 35s (remain 0m 35s) Loss: 0.0379 
Epoch: [2][1300/1357] Elapsed 4m 58s (remain 0m 12s) Loss: 0.0372 
Epoch: [2][1356/135

Epoch 2 - avg_train_loss: 0.0371  avg_val_loss: 0.0383  time: 336s
Epoch 2 - Score: 0.891621829362029
Epoch 2 - Save Best Score: 0.8916 Model


EVAL: [339/340] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0383 
Epoch: [3][0/1357] Elapsed 0m 0s (remain 5m 14s) Loss: 0.0059 
Epoch: [3][100/1357] Elapsed 0m 23s (remain 4m 48s) Loss: 0.0183 
Epoch: [3][200/1357] Elapsed 0m 46s (remain 4m 25s) Loss: 0.0188 
Epoch: [3][300/1357] Elapsed 1m 9s (remain 4m 2s) Loss: 0.0289 
Epoch: [3][400/1357] Elapsed 1m 32s (remain 3m 39s) Loss: 0.0266 
Epoch: [3][500/1357] Elapsed 1m 55s (remain 3m 16s) Loss: 0.0262 
Epoch: [3][600/1357] Elapsed 2m 18s (remain 2m 53s) Loss: 0.0251 
Epoch: [3][700/1357] Elapsed 2m 40s (remain 2m 30s) Loss: 0.0268 
Epoch: [3][800/1357] Elapsed 3m 3s (remain 2m 7s) Loss: 0.0275 
Epoch: [3][900/1357] Elapsed 3m 26s (remain 1m 44s) Loss: 0.0272 
Epoch: [3][1000/1357] Elapsed 3m 49s (remain 1m 21s) Loss: 0.0277 
Epoch: [3][1100/1357] Elapsed 4m 12s (remain 0m 58s) Loss: 0.0267 
Epoch: [3][1200/1357] Elapsed 4m 35s (remain 0m 35s) Loss: 0.0262 
Epoch: [3][1300/1357] Elapsed 4m 58s (remain 0m 12s) Loss: 0.0263 
Epoch: [3][1356/135

Epoch 3 - avg_train_loss: 0.0275  avg_val_loss: 0.0441  time: 336s
Epoch 3 - Score: 0.9195402298850575
Epoch 3 - Save Best Score: 0.9195 Model


EVAL: [339/340] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0441 


Score: 0.91954
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at /content/drive/MyDrive/RoBERTa-base-PM-M3-Voc-train-longer/RoBERTa-base-PM-M3-Voc-train-longer-hf were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from

Epoch: [1][0/1357] Elapsed 0m 0s (remain 5m 40s) Loss: 0.6346 
Epoch: [1][100/1357] Elapsed 0m 23s (remain 4m 48s) Loss: 0.1487 
Epoch: [1][200/1357] Elapsed 0m 46s (remain 4m 25s) Loss: 0.1326 
Epoch: [1][300/1357] Elapsed 1m 9s (remain 4m 2s) Loss: 0.1202 
Epoch: [1][400/1357] Elapsed 1m 32s (remain 3m 39s) Loss: 0.1074 
Epoch: [1][500/1357] Elapsed 1m 55s (remain 3m 16s) Loss: 0.1003 
Epoch: [1][600/1357] Elapsed 2m 18s (remain 2m 53s) Loss: 0.0928 
Epoch: [1][700/1357] Elapsed 2m 41s (remain 2m 30s) Loss: 0.0851 
Epoch: [1][800/1357] Elapsed 3m 4s (remain 2m 7s) Loss: 0.0827 
Epoch: [1][900/1357] Elapsed 3m 27s (remain 1m 44s) Loss: 0.0824 
Epoch: [1][1000/1357] Elapsed 3m 50s (remain 1m 21s) Loss: 0.0791 
Epoch: [1][1100/1357] Elapsed 4m 13s (remain 0m 58s) Loss: 0.0764 
Epoch: [1][1200/1357] Elapsed 4m 36s (remain 0m 35s) Loss: 0.0758 
Epoch: [1][1300/1357] Elapsed 4m 59s (remain 0m 12s) Loss: 0.0741 
Epoch: [1][1356/1357] Elapsed 5m 12s (remain 0m 0s) Loss: 0.0727 
EVAL: [0/340]

Epoch 1 - avg_train_loss: 0.0727  avg_val_loss: 0.0521  time: 337s
Epoch 1 - Score: 0.7772423441629098
Epoch 1 - Save Best Score: 0.7772 Model


EVAL: [339/340] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0521 
Epoch: [2][0/1357] Elapsed 0m 0s (remain 5m 48s) Loss: 0.0033 
Epoch: [2][100/1357] Elapsed 0m 23s (remain 4m 49s) Loss: 0.0377 
Epoch: [2][200/1357] Elapsed 0m 46s (remain 4m 26s) Loss: 0.0345 
Epoch: [2][300/1357] Elapsed 1m 9s (remain 4m 3s) Loss: 0.0328 
Epoch: [2][400/1357] Elapsed 1m 32s (remain 3m 39s) Loss: 0.0356 
Epoch: [2][500/1357] Elapsed 1m 55s (remain 3m 16s) Loss: 0.0365 
Epoch: [2][600/1357] Elapsed 2m 18s (remain 2m 53s) Loss: 0.0390 
Epoch: [2][700/1357] Elapsed 2m 41s (remain 2m 30s) Loss: 0.0392 
Epoch: [2][800/1357] Elapsed 3m 4s (remain 2m 7s) Loss: 0.0387 
Epoch: [2][900/1357] Elapsed 3m 27s (remain 1m 44s) Loss: 0.0379 
Epoch: [2][1000/1357] Elapsed 3m 50s (remain 1m 21s) Loss: 0.0379 
Epoch: [2][1100/1357] Elapsed 4m 13s (remain 0m 58s) Loss: 0.0384 
Epoch: [2][1200/1357] Elapsed 4m 36s (remain 0m 35s) Loss: 0.0385 
Epoch: [2][1300/1357] Elapsed 4m 59s (remain 0m 12s) Loss: 0.0385 
Epoch: [2][1356/135

Epoch 2 - avg_train_loss: 0.0382  avg_val_loss: 0.0422  time: 337s
Epoch 2 - Score: 0.9140353527723221
Epoch 2 - Save Best Score: 0.9140 Model


EVAL: [339/340] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0422 
Epoch: [3][0/1357] Elapsed 0m 0s (remain 5m 16s) Loss: 0.1022 
Epoch: [3][100/1357] Elapsed 0m 23s (remain 4m 49s) Loss: 0.0197 
Epoch: [3][200/1357] Elapsed 0m 46s (remain 4m 25s) Loss: 0.0239 
Epoch: [3][300/1357] Elapsed 1m 9s (remain 4m 2s) Loss: 0.0238 
Epoch: [3][400/1357] Elapsed 1m 32s (remain 3m 39s) Loss: 0.0261 
Epoch: [3][500/1357] Elapsed 1m 55s (remain 3m 16s) Loss: 0.0246 
Epoch: [3][600/1357] Elapsed 2m 18s (remain 2m 53s) Loss: 0.0243 
Epoch: [3][700/1357] Elapsed 2m 41s (remain 2m 30s) Loss: 0.0249 
Epoch: [3][800/1357] Elapsed 3m 3s (remain 2m 7s) Loss: 0.0268 
Epoch: [3][900/1357] Elapsed 3m 26s (remain 1m 44s) Loss: 0.0265 
Epoch: [3][1000/1357] Elapsed 3m 49s (remain 1m 21s) Loss: 0.0272 
Epoch: [3][1100/1357] Elapsed 4m 12s (remain 0m 58s) Loss: 0.0265 
Epoch: [3][1200/1357] Elapsed 4m 35s (remain 0m 35s) Loss: 0.0260 
Epoch: [3][1300/1357] Elapsed 4m 58s (remain 0m 12s) Loss: 0.0260 
Epoch: [3][1356/135

Epoch 3 - avg_train_loss: 0.0264  avg_val_loss: 0.0371  time: 336s
Epoch 3 - Score: 0.906183368869936


EVAL: [339/340] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0371 


Score: 0.91404
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at /content/drive/MyDrive/RoBERTa-base-PM-M3-Voc-train-longer/RoBERTa-base-PM-M3-Voc-train-longer-hf were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from

Epoch: [1][0/1357] Elapsed 0m 0s (remain 5m 40s) Loss: 0.6095 
Epoch: [1][100/1357] Elapsed 0m 23s (remain 4m 47s) Loss: 0.1448 
Epoch: [1][200/1357] Elapsed 0m 46s (remain 4m 25s) Loss: 0.1240 
Epoch: [1][300/1357] Elapsed 1m 9s (remain 4m 2s) Loss: 0.1094 
Epoch: [1][400/1357] Elapsed 1m 32s (remain 3m 39s) Loss: 0.1004 
Epoch: [1][500/1357] Elapsed 1m 55s (remain 3m 16s) Loss: 0.0945 
Epoch: [1][600/1357] Elapsed 2m 17s (remain 2m 53s) Loss: 0.0899 
Epoch: [1][700/1357] Elapsed 2m 40s (remain 2m 30s) Loss: 0.0864 
Epoch: [1][800/1357] Elapsed 3m 3s (remain 2m 7s) Loss: 0.0816 
Epoch: [1][900/1357] Elapsed 3m 26s (remain 1m 44s) Loss: 0.0790 
Epoch: [1][1000/1357] Elapsed 3m 49s (remain 1m 21s) Loss: 0.0774 
Epoch: [1][1100/1357] Elapsed 4m 12s (remain 0m 58s) Loss: 0.0747 
Epoch: [1][1200/1357] Elapsed 4m 35s (remain 0m 35s) Loss: 0.0722 
Epoch: [1][1300/1357] Elapsed 4m 58s (remain 0m 12s) Loss: 0.0712 
Epoch: [1][1356/1357] Elapsed 5m 11s (remain 0m 0s) Loss: 0.0701 
EVAL: [0/340]

Epoch 1 - avg_train_loss: 0.0701  avg_val_loss: 0.0496  time: 336s
Epoch 1 - Score: 0.8219178082191781
Epoch 1 - Save Best Score: 0.8219 Model


EVAL: [339/340] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0496 
Epoch: [2][0/1357] Elapsed 0m 0s (remain 5m 40s) Loss: 0.0354 
Epoch: [2][100/1357] Elapsed 0m 23s (remain 4m 49s) Loss: 0.0367 
Epoch: [2][200/1357] Elapsed 0m 46s (remain 4m 25s) Loss: 0.0367 
Epoch: [2][300/1357] Elapsed 1m 9s (remain 4m 2s) Loss: 0.0333 
Epoch: [2][400/1357] Elapsed 1m 32s (remain 3m 39s) Loss: 0.0387 
Epoch: [2][500/1357] Elapsed 1m 55s (remain 3m 16s) Loss: 0.0383 
Epoch: [2][600/1357] Elapsed 2m 18s (remain 2m 53s) Loss: 0.0378 
Epoch: [2][700/1357] Elapsed 2m 41s (remain 2m 30s) Loss: 0.0369 
Epoch: [2][800/1357] Elapsed 3m 4s (remain 2m 7s) Loss: 0.0369 
Epoch: [2][900/1357] Elapsed 3m 27s (remain 1m 44s) Loss: 0.0378 
Epoch: [2][1000/1357] Elapsed 3m 49s (remain 1m 21s) Loss: 0.0390 
Epoch: [2][1100/1357] Elapsed 4m 12s (remain 0m 58s) Loss: 0.0399 
Epoch: [2][1200/1357] Elapsed 4m 35s (remain 0m 35s) Loss: 0.0389 
Epoch: [2][1300/1357] Elapsed 4m 58s (remain 0m 12s) Loss: 0.0390 
Epoch: [2][1356/135

Epoch 2 - avg_train_loss: 0.0385  avg_val_loss: 0.0484  time: 337s
Epoch 2 - Score: 0.8487536320538308
Epoch 2 - Save Best Score: 0.8488 Model


EVAL: [339/340] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0484 
Epoch: [3][0/1357] Elapsed 0m 0s (remain 5m 20s) Loss: 0.0030 
Epoch: [3][100/1357] Elapsed 0m 23s (remain 4m 49s) Loss: 0.0250 
Epoch: [3][200/1357] Elapsed 0m 46s (remain 4m 25s) Loss: 0.0251 
Epoch: [3][300/1357] Elapsed 1m 9s (remain 4m 2s) Loss: 0.0238 
Epoch: [3][400/1357] Elapsed 1m 32s (remain 3m 39s) Loss: 0.0239 
Epoch: [3][500/1357] Elapsed 1m 55s (remain 3m 16s) Loss: 0.0246 
Epoch: [3][600/1357] Elapsed 2m 18s (remain 2m 53s) Loss: 0.0249 
Epoch: [3][700/1357] Elapsed 2m 40s (remain 2m 30s) Loss: 0.0253 
Epoch: [3][800/1357] Elapsed 3m 3s (remain 2m 7s) Loss: 0.0265 
Epoch: [3][900/1357] Elapsed 3m 26s (remain 1m 44s) Loss: 0.0273 
Epoch: [3][1000/1357] Elapsed 3m 49s (remain 1m 21s) Loss: 0.0275 
Epoch: [3][1100/1357] Elapsed 4m 12s (remain 0m 58s) Loss: 0.0280 
Epoch: [3][1200/1357] Elapsed 4m 35s (remain 0m 35s) Loss: 0.0276 
Epoch: [3][1300/1357] Elapsed 4m 58s (remain 0m 12s) Loss: 0.0279 
Epoch: [3][1356/135

Epoch 3 - avg_train_loss: 0.0279  avg_val_loss: 0.0469  time: 336s
Epoch 3 - Score: 0.8680555555555555
Epoch 3 - Save Best Score: 0.8681 Model


EVAL: [339/340] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0469 


Score: 0.86806
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at /content/drive/MyDrive/RoBERTa-base-PM-M3-Voc-train-longer/RoBERTa-base-PM-M3-Voc-train-longer-hf were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from

Epoch: [1][0/1357] Elapsed 0m 0s (remain 5m 25s) Loss: 0.6261 
Epoch: [1][100/1357] Elapsed 0m 23s (remain 4m 48s) Loss: 0.1430 
Epoch: [1][200/1357] Elapsed 0m 46s (remain 4m 25s) Loss: 0.1300 
Epoch: [1][300/1357] Elapsed 1m 9s (remain 4m 2s) Loss: 0.1147 
Epoch: [1][400/1357] Elapsed 1m 32s (remain 3m 39s) Loss: 0.1065 
Epoch: [1][500/1357] Elapsed 1m 54s (remain 3m 16s) Loss: 0.0953 
Epoch: [1][600/1357] Elapsed 2m 17s (remain 2m 53s) Loss: 0.0922 
Epoch: [1][700/1357] Elapsed 2m 40s (remain 2m 30s) Loss: 0.0849 
Epoch: [1][800/1357] Elapsed 3m 3s (remain 2m 7s) Loss: 0.0802 
Epoch: [1][900/1357] Elapsed 3m 26s (remain 1m 44s) Loss: 0.0767 
Epoch: [1][1000/1357] Elapsed 3m 49s (remain 1m 21s) Loss: 0.0747 
Epoch: [1][1100/1357] Elapsed 4m 12s (remain 0m 58s) Loss: 0.0719 
Epoch: [1][1200/1357] Elapsed 4m 35s (remain 0m 35s) Loss: 0.0698 
Epoch: [1][1300/1357] Elapsed 4m 58s (remain 0m 12s) Loss: 0.0697 
Epoch: [1][1356/1357] Elapsed 5m 11s (remain 0m 0s) Loss: 0.0686 
EVAL: [0/340]

Epoch 1 - avg_train_loss: 0.0686  avg_val_loss: 0.0489  time: 336s
Epoch 1 - Score: 0.8806986382474836
Epoch 1 - Save Best Score: 0.8807 Model


EVAL: [339/340] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0489 
Epoch: [2][0/1357] Elapsed 0m 0s (remain 5m 39s) Loss: 0.0896 
Epoch: [2][100/1357] Elapsed 0m 23s (remain 4m 48s) Loss: 0.0291 
Epoch: [2][200/1357] Elapsed 0m 46s (remain 4m 25s) Loss: 0.0388 
Epoch: [2][300/1357] Elapsed 1m 9s (remain 4m 2s) Loss: 0.0367 
Epoch: [2][400/1357] Elapsed 1m 31s (remain 3m 39s) Loss: 0.0379 
Epoch: [2][500/1357] Elapsed 1m 54s (remain 3m 16s) Loss: 0.0400 
Epoch: [2][600/1357] Elapsed 2m 17s (remain 2m 53s) Loss: 0.0381 
Epoch: [2][700/1357] Elapsed 2m 40s (remain 2m 30s) Loss: 0.0388 
Epoch: [2][800/1357] Elapsed 3m 3s (remain 2m 7s) Loss: 0.0378 
Epoch: [2][900/1357] Elapsed 3m 26s (remain 1m 44s) Loss: 0.0365 
Epoch: [2][1000/1357] Elapsed 3m 49s (remain 1m 21s) Loss: 0.0365 
Epoch: [2][1100/1357] Elapsed 4m 12s (remain 0m 58s) Loss: 0.0365 
Epoch: [2][1200/1357] Elapsed 4m 35s (remain 0m 35s) Loss: 0.0366 
Epoch: [2][1300/1357] Elapsed 4m 58s (remain 0m 12s) Loss: 0.0371 
Epoch: [2][1356/135

Epoch 2 - avg_train_loss: 0.0375  avg_val_loss: 0.0418  time: 336s
Epoch 2 - Score: 0.8724428399518652


EVAL: [339/340] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0418 
Epoch: [3][0/1357] Elapsed 0m 0s (remain 5m 18s) Loss: 0.0015 
Epoch: [3][100/1357] Elapsed 0m 23s (remain 4m 48s) Loss: 0.0196 
Epoch: [3][200/1357] Elapsed 0m 46s (remain 4m 25s) Loss: 0.0211 
Epoch: [3][300/1357] Elapsed 1m 9s (remain 4m 2s) Loss: 0.0253 
Epoch: [3][400/1357] Elapsed 1m 31s (remain 3m 39s) Loss: 0.0263 
Epoch: [3][500/1357] Elapsed 1m 54s (remain 3m 16s) Loss: 0.0262 
Epoch: [3][600/1357] Elapsed 2m 17s (remain 2m 53s) Loss: 0.0264 
Epoch: [3][700/1357] Elapsed 2m 40s (remain 2m 30s) Loss: 0.0267 
Epoch: [3][800/1357] Elapsed 3m 3s (remain 2m 7s) Loss: 0.0281 
Epoch: [3][900/1357] Elapsed 3m 26s (remain 1m 44s) Loss: 0.0275 
Epoch: [3][1000/1357] Elapsed 3m 49s (remain 1m 21s) Loss: 0.0266 
Epoch: [3][1100/1357] Elapsed 4m 12s (remain 0m 58s) Loss: 0.0274 
Epoch: [3][1200/1357] Elapsed 4m 35s (remain 0m 35s) Loss: 0.0268 
Epoch: [3][1300/1357] Elapsed 4m 58s (remain 0m 12s) Loss: 0.0276 
Epoch: [3][1356/135

Epoch 3 - avg_train_loss: 0.0275  avg_val_loss: 0.0413  time: 336s
Epoch 3 - Score: 0.8968266135213859
Epoch 3 - Save Best Score: 0.8968 Model


EVAL: [339/340] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0413 


Score: 0.89683
Score: 0.90263
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at /content/drive/MyDrive/RoBERTa-base-PM-M3-Voc-train-longer/RoBERTa-base-PM-M3-Voc-train-longer-hf were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Rob

  0%|          | 0/2553 [00:00<?, ?it/s]

Some weights of the model checkpoint at /content/drive/MyDrive/RoBERTa-base-PM-M3-Voc-train-longer/RoBERTa-base-PM-M3-Voc-train-longer-hf were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialize

  0%|          | 0/2553 [00:00<?, ?it/s]

Some weights of the model checkpoint at /content/drive/MyDrive/RoBERTa-base-PM-M3-Voc-train-longer/RoBERTa-base-PM-M3-Voc-train-longer-hf were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialize

  0%|          | 0/2553 [00:00<?, ?it/s]

Some weights of the model checkpoint at /content/drive/MyDrive/RoBERTa-base-PM-M3-Voc-train-longer/RoBERTa-base-PM-M3-Voc-train-longer-hf were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialize

  0%|          | 0/2553 [00:00<?, ?it/s]

Some weights of the model checkpoint at /content/drive/MyDrive/RoBERTa-base-PM-M3-Voc-train-longer/RoBERTa-base-PM-M3-Voc-train-longer-hf were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialize

  0%|          | 0/2553 [00:00<?, ?it/s]

In [29]:
# NOTE(review): the recorded output of this cell is a NameError, so `b` was not defined
# when it ran and the cell will stop a Restart & Run All. It looks like a leftover scratch
# expression (possibly an unfinished `border`) — confirm and delete the cell.
b

NameError: ignored

In [None]:
# Decision threshold for the first submission: the inference cell below turns every
# prediction strictly below `border` into 0 and everything else into 1 (np.where).
# NOTE(review): how 0.018 was chosen is not shown in this part of the notebook.
border = 0.018

In [None]:
    # Read the OOF results CSV into `oof_df`.
    oof_df = pd.read_csv(
        "/content/oof_df (1).csv"
    )
    # Log the CV banner, then pass the frame to get_result.
    LOGGER.info("========== CV ==========")
    get_result(oof_df)
    

In [None]:
# OOF export is switched off for this submission; the call is kept for reference only.
#oof_df.to_csv("/content/preds.csv", index=False)

# Run inference and binarize in a single step: values strictly below `border`
# become 0, all remaining values become 1.
predictions = np.where(inference() < border, 0, 1)

# Store the labels in the submission frame and write it without index or header row.
sub["judgement"] = predictions
sub.to_csv("/content/submission.csv", header=False, index=False)



In [None]:
# Decision threshold for the second submission: the inference cell below turns every
# prediction strictly below `border` into 0 and everything else into 1 (np.where).
# NOTE(review): how 0.015 was chosen is not shown in this part of the notebook.
border = 0.015

In [None]:
# Write the current `oof_df` frame to disk (row index omitted).
oof_df.to_csv("/content/preds2.csv", index=False)

# Run inference and binarize in a single step: values strictly below `border`
# become 0, all remaining values become 1.
predictions = np.where(inference() < border, 0, 1)

# Store the labels in the submission frame and write it without index or header row.
sub["judgement"] = predictions
sub.to_csv("/content/submission2.csv", header=False, index=False)