<a href="https://colab.research.google.com/github/mk0653/untitled/blob/master/iryou_bert_FT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [180]:
import os
import sys

#from google.colab import drive
#drive.mount('/gdrive')


In [181]:
!pip install -q transformers

In [182]:
import math
import random
import time
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers as T
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,AutoConfig,
                          get_cosine_schedule_with_warmup)
from sklearn.metrics import fbeta_score
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

In [183]:
DATA_DIR = "/content/drive/MyDrive/[NLP]医療論文仕分けコンペ/"
OUTPUT_DIR = "/content/drive/MyDrive/output/models/"

In [184]:
warnings.filterwarnings("ignore")

In [185]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [186]:
def init_logger(log_file= "./train.log"):
    from logging import INFO, FileHandler, Formatter, StreamHandler, getLogger

    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = init_logger()

In [187]:
def seed_torch(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

seed = 471
seed_torch(seed)

In [188]:
class Config:
  #pretrained_model_path = '/content/drive/MyDrive/roberta_large'
  model_name = 'roberta-base'

In [189]:
train = pd.read_csv(DATA_DIR + "train.csv")
test = pd.read_csv(DATA_DIR + "test.csv")
sub = pd.read_csv(DATA_DIR + "sample_submit.csv", header=None)
sub.columns = ["id", "judgement"]
train['abstract'] = train['title'].str.cat(train['abstract'], na_rep='')
test['abstract'] = test['title'].str.cat(test['abstract'], na_rep='')

In [190]:
train.head()

Unnamed: 0,id,title,abstract,judgement
0,0,One-year age changes in MRI brain volumes in o...,One-year age changes in MRI brain volumes in o...,0
1,1,Supportive CSF biomarker evidence to enhance t...,Supportive CSF biomarker evidence to enhance t...,0
2,2,Occurrence of basal ganglia germ cell tumors w...,Occurrence of basal ganglia germ cell tumors w...,0
3,3,New developments in diagnosis and therapy of C...,New developments in diagnosis and therapy of C...,0
4,4,Prolonged shedding of SARS-CoV-2 in an elderly...,Prolonged shedding of SARS-CoV-2 in an elderly...,0


In [191]:
test.head()

Unnamed: 0,id,title,abstract
0,27145,Estimating the potential effects of COVID-19 p...,Estimating the potential effects of COVID-19 p...
1,27146,Leukoerythroblastic reaction in a patient with...,Leukoerythroblastic reaction in a patient with...
2,27147,[15O]-water PET and intraoperative brain mappi...,[15O]-water PET and intraoperative brain mappi...
3,27148,Adaptive image segmentation for robust measure...,Adaptive image segmentation for robust measure...
4,27149,Comparison of Epidemiological Variations in CO...,Comparison of Epidemiological Variations in CO...


In [192]:
# この値を境に、モデルの出力を 0 と 1 にします。
border = len(train[train["judgement"] == 1]) / len(train["judgement"])
print(border)

0.023282372444280715


In [193]:
def get_train_data(train):

    # 交差検証 用の番号を振ります。
    Fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    for n, (train_index, val_index) in enumerate(Fold.split(train, train["judgement"])):
        train.loc[val_index, "fold"] = int(n)
    train["fold"] = train["fold"].astype(np.uint8)

    return train

In [194]:
def get_test_data(test):
    return test

In [195]:
train = get_train_data(train)

In [196]:
class BaseDataset(Dataset):
    def __init__(self, df, model_name, include_labels=True):
        #tokenizer = T.BertTokenizer.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        self.df = df
        self.include_labels = include_labels

        self.title = df["abstract"].tolist()
        self.encoded = tokenizer.batch_encode_plus(
            self.title,
            padding = 'max_length',            
            max_length = 256,
            truncation = True,
            return_attention_mask=True
        )
        
        if self.include_labels:
            self.labels = df["judgement"].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.encoded['input_ids'][idx])
        attention_mask = torch.tensor(self.encoded['attention_mask'][idx])

        if self.include_labels:
            label = torch.tensor(self.labels[idx]).float()
            return input_ids, attention_mask, label

        return input_ids, attention_mask

In [197]:
class BaseModel(nn.Module):
    def __init__(self, model_name):
        super().__init__()

        #self.model = T.BertForSequenceClassification.from_pretrained(model_name, num_labels=1)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
        self.config = AutoConfig.from_pretrained(model_name)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        out = self.model(input_ids=input_ids, attention_mask=attention_mask)
        out = self.sigmoid(out.logits).squeeze()

        return out

In [198]:
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (remain %s)" % (asMinutes(s), asMinutes(rs))

In [199]:
def train_fn(train_loader, model, criterion, optimizer, epoch, device):
    start = end = time.time()
    losses = AverageMeter()

    # switch to train mode
    model.train()

    for step, (input_ids, attention_mask, labels) in enumerate(train_loader):
        optimizer.zero_grad()

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)

        y_preds = model(input_ids, attention_mask)

        loss = criterion(y_preds, labels)

        # record loss
        losses.update(loss.item(), batch_size)
        loss.backward()

        optimizer.step()

        if step % 100 == 0 or step == (len(train_loader) - 1):
            print(
                f"Epoch: [{epoch + 1}][{step}/{len(train_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(train_loader)):s} "
                f"Loss: {losses.avg:.4f} "
            )

    return losses.avg

In [200]:
def valid_fn(valid_loader, model, criterion, device):
    start = end = time.time()
    losses = AverageMeter()

    # switch to evaluation mode
    model.eval()
    preds = []

    for step, (input_ids, attention_mask, labels) in enumerate(valid_loader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)

        # compute loss
        with torch.no_grad():
            y_preds = model(input_ids, attention_mask)

        loss = criterion(y_preds, labels)
        losses.update(loss.item(), batch_size)

        # record score
        preds.append(y_preds.to("cpu").numpy())

        if step % 100 == 0 or step == (len(valid_loader) - 1):
            print(
                f"EVAL: [{step}/{len(valid_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(valid_loader)):s} "
                f"Loss: {losses.avg:.4f} "
            )

    predictions = np.concatenate(preds)
    return losses.avg, predictions

In [201]:
def inference():
    predictions = []

    test_dataset = BaseDataset(test, Config.model_name, include_labels=False)
    test_loader = DataLoader(
        test_dataset, batch_size=16, shuffle=False, num_workers=4, pin_memory=True
    )

    for fold in range(5):
        LOGGER.info(f"========== model: roberta-large fold: {fold} inference ==========")
        model = BaseModel(Config.model_name)
        model.to(device)
        model.load_state_dict(torch.load(OUTPUT_DIR + f"roberta_fold{fold}_best.pth")["model"])
        model.eval()
        preds = []
        for i, (input_ids, attention_mask) in tqdm(enumerate(test_loader), total=len(test_loader)):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            with torch.no_grad():
                y_preds = model(input_ids, attention_mask)
            preds.append(y_preds.to("cpu").numpy())
        preds = np.concatenate(preds)
        predictions.append(preds)
    predictions = np.mean(predictions, axis=0)

    return predictions

In [202]:
def train_loop(train, fold):

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # Data Loader
    # ====================================================
    trn_idx = train[train["fold"] != fold].index
    val_idx = train[train["fold"] == fold].index

    train_folds = train.loc[trn_idx].reset_index(drop=True)
    valid_folds = train.loc[val_idx].reset_index(drop=True)

    train_dataset = BaseDataset(train_folds, "roberta-base")
    valid_dataset = BaseDataset(valid_folds, "roberta-base")

    train_loader = DataLoader(
        train_dataset,
        batch_size=16,
        shuffle=True,
        num_workers=4,
        pin_memory=True,
        drop_last=True,
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=16,
        shuffle=False,
        num_workers=4,
        pin_memory=True,
        drop_last=False,
    )

    # ====================================================
    # Model
    # ====================================================
    model = BaseModel(Config.model_name)
    model.to(device)

    optimizer =T.AdamW(model.parameters(), lr=2e-5)

    criterion = nn.BCELoss()

    # ====================================================
    # Loop
    # ====================================================
    best_score = -1
    best_loss = np.inf

    for epoch in range(3):
        start_time = time.time()
        
        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)
        valid_labels = valid_folds["judgement"].values

        # scoring
        score = fbeta_score(valid_labels, np.where(preds < border, 0, 1), beta=7.0)

        elapsed = time.time() - start_time
        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s"
        )
        LOGGER.info(f"Epoch {epoch+1} - Score: {score}")

        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
            torch.save(
                {"model": model.state_dict(), "preds": preds}, OUTPUT_DIR + f"roberta_fold{fold}_best.pth"
            )

    check_point = torch.load(OUTPUT_DIR + f"roberta_fold{fold}_best.pth")

    valid_folds["preds"] = check_point["preds"]

    return valid_folds

In [203]:
def get_result(result_df):
    preds = result_df["preds"].values
    labels = result_df["judgement"].values
    score = fbeta_score(labels, np.where(preds < border, 0, 1), beta=7.0)
    LOGGER.info(f"Score: {score:<.5f}")

In [204]:
def main():
    # Training
    oof_df = pd.DataFrame()
    for fold in range(4,5):  #for fold in range(5)
        _oof_df = train_loop(train, fold)
        oof_df = pd.concat([oof_df, _oof_df])
        LOGGER.info(f"========== fold: {fold} result ==========")
        get_result(_oof_df)
    
    # CV result
    LOGGER.info(f"========== CV ==========")
    get_result(oof_df)
    
    # Save OOF result
    oof_df.to_csv(OUTPUT_DIR + "oof_df.csv", index=False)

    # Inference
    predictions = inference()
    predictions = np.where(predictions < border, 0, 1)

    # submission
    sub["judgement"] = predictions
    sub.to_csv(OUTPUT_DIR + "submission.csv", index=False, header=False)
      

In [205]:
 config = AutoConfig.from_pretrained(Config.model_name)
 config

RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.10.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [206]:
if __name__ == "__main__":
    main()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch: [1][0/1357] Elapsed 0m 0s (remain 12m 36s) Loss: 0.6446 
Epoch: [1][100/1357] Elapsed 0m 23s (remain 4m 47s) Loss: 0.1611 
Epoch: [1][200/1357] Elapsed 0m 45s (remain 4m 22s) Loss: 0.1400 
Epoch: [1][300/1357] Elapsed 1m 8s (remain 3m 59s) Loss: 0.1274 
Epoch: [1][400/1357] Elapsed 1m 30s (remain 3m 36s) Loss: 0.1248 
Epoch: [1][500/1357] Elapsed 1m 53s (remain 3m 13s) Loss: 0.1195 
Epoch: [1][600/1357] Elapsed 2m 15s (remain 2m 50s) Loss: 0.1119 
Epoch: [1][700/1357] Elapsed 2m 38s (remain 2m 28s) Loss: 0.1032 
Epoch: [1][800/1357] Elapsed 3m 1s (remain 2m 5s) Loss: 0.0994 
Epoch: [1][900/1357] Elapsed 3m 23s (remain 1m 43s) Loss: 0.0989 
Epoch: [1][1000/1357] Elapsed 3m 46s (remain 1m 20s) Loss: 0.0950 
Epoch: [1][1100/1357] Elapsed 4m 8s (remain 0m 57s) Loss: 0.0939 
Epoch: [1][1200/1357] Elapsed 4m 31s (remain 0m 35s) Loss: 0.0907 
Epoch: [1][1300/1357] Elapsed 4m 54s (remain 0m 12s) Loss: 0.0907 
Epoch: [1][1356/1357] Elapsed 5m 6s (remain 0m 0s) Loss: 0.0889 
EVAL: [0/340]

Epoch 1 - avg_train_loss: 0.0889  avg_val_loss: 0.0661  time: 331s
Epoch 1 - avg_train_loss: 0.0889  avg_val_loss: 0.0661  time: 331s
Epoch 1 - avg_train_loss: 0.0889  avg_val_loss: 0.0661  time: 331s
Epoch 1 - avg_train_loss: 0.0889  avg_val_loss: 0.0661  time: 331s
Epoch 1 - avg_train_loss: 0.0889  avg_val_loss: 0.0661  time: 331s
Epoch 1 - avg_train_loss: 0.0889  avg_val_loss: 0.0661  time: 331s
Epoch 1 - avg_train_loss: 0.0889  avg_val_loss: 0.0661  time: 331s
Epoch 1 - Score: 0.7217444717444716
Epoch 1 - Score: 0.7217444717444716
Epoch 1 - Score: 0.7217444717444716
Epoch 1 - Score: 0.7217444717444716
Epoch 1 - Score: 0.7217444717444716
Epoch 1 - Score: 0.7217444717444716
Epoch 1 - Score: 0.7217444717444716
Epoch 1 - Save Best Score: 0.7217 Model
Epoch 1 - Save Best Score: 0.7217 Model
Epoch 1 - Save Best Score: 0.7217 Model
Epoch 1 - Save Best Score: 0.7217 Model
Epoch 1 - Save Best Score: 0.7217 Model
Epoch 1 - Save Best Score: 0.7217 Model
Epoch 1 - Save Best Score: 0.7217 Model

EVAL: [339/340] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0661 
Epoch: [2][0/1357] Elapsed 0m 0s (remain 11m 53s) Loss: 0.0037 
Epoch: [2][100/1357] Elapsed 0m 23s (remain 4m 47s) Loss: 0.0684 
Epoch: [2][200/1357] Elapsed 0m 45s (remain 4m 22s) Loss: 0.0575 
Epoch: [2][300/1357] Elapsed 1m 8s (remain 3m 59s) Loss: 0.0531 
Epoch: [2][400/1357] Elapsed 1m 30s (remain 3m 36s) Loss: 0.0557 
Epoch: [2][500/1357] Elapsed 1m 53s (remain 3m 13s) Loss: 0.0543 
Epoch: [2][600/1357] Elapsed 2m 15s (remain 2m 50s) Loss: 0.0535 
Epoch: [2][700/1357] Elapsed 2m 38s (remain 2m 28s) Loss: 0.0519 
Epoch: [2][800/1357] Elapsed 3m 1s (remain 2m 5s) Loss: 0.0537 
Epoch: [2][900/1357] Elapsed 3m 23s (remain 1m 43s) Loss: 0.0533 
Epoch: [2][1000/1357] Elapsed 3m 46s (remain 1m 20s) Loss: 0.0533 
Epoch: [2][1100/1357] Elapsed 4m 8s (remain 0m 57s) Loss: 0.0526 
Epoch: [2][1200/1357] Elapsed 4m 31s (remain 0m 35s) Loss: 0.0530 
Epoch: [2][1300/1357] Elapsed 4m 54s (remain 0m 12s) Loss: 0.0527 
Epoch: [2][1356/13

Epoch 2 - avg_train_loss: 0.0525  avg_val_loss: 0.0478  time: 331s
Epoch 2 - avg_train_loss: 0.0525  avg_val_loss: 0.0478  time: 331s
Epoch 2 - avg_train_loss: 0.0525  avg_val_loss: 0.0478  time: 331s
Epoch 2 - avg_train_loss: 0.0525  avg_val_loss: 0.0478  time: 331s
Epoch 2 - avg_train_loss: 0.0525  avg_val_loss: 0.0478  time: 331s
Epoch 2 - avg_train_loss: 0.0525  avg_val_loss: 0.0478  time: 331s
Epoch 2 - avg_train_loss: 0.0525  avg_val_loss: 0.0478  time: 331s
Epoch 2 - Score: 0.8880866425992778
Epoch 2 - Score: 0.8880866425992778
Epoch 2 - Score: 0.8880866425992778
Epoch 2 - Score: 0.8880866425992778
Epoch 2 - Score: 0.8880866425992778
Epoch 2 - Score: 0.8880866425992778
Epoch 2 - Score: 0.8880866425992778
Epoch 2 - Save Best Score: 0.8881 Model
Epoch 2 - Save Best Score: 0.8881 Model
Epoch 2 - Save Best Score: 0.8881 Model
Epoch 2 - Save Best Score: 0.8881 Model
Epoch 2 - Save Best Score: 0.8881 Model
Epoch 2 - Save Best Score: 0.8881 Model
Epoch 2 - Save Best Score: 0.8881 Model

EVAL: [339/340] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0478 
Epoch: [3][0/1357] Elapsed 0m 0s (remain 11m 52s) Loss: 0.1588 
Epoch: [3][100/1357] Elapsed 0m 23s (remain 4m 47s) Loss: 0.0355 
Epoch: [3][200/1357] Elapsed 0m 45s (remain 4m 22s) Loss: 0.0330 
Epoch: [3][300/1357] Elapsed 1m 8s (remain 3m 59s) Loss: 0.0373 
Epoch: [3][400/1357] Elapsed 1m 30s (remain 3m 36s) Loss: 0.0344 
Epoch: [3][500/1357] Elapsed 1m 53s (remain 3m 13s) Loss: 0.0358 
Epoch: [3][600/1357] Elapsed 2m 15s (remain 2m 50s) Loss: 0.0380 
Epoch: [3][700/1357] Elapsed 2m 38s (remain 2m 28s) Loss: 0.0380 
Epoch: [3][800/1357] Elapsed 3m 0s (remain 2m 5s) Loss: 0.0381 
Epoch: [3][900/1357] Elapsed 3m 23s (remain 1m 42s) Loss: 0.0381 
Epoch: [3][1000/1357] Elapsed 3m 45s (remain 1m 20s) Loss: 0.0391 
Epoch: [3][1100/1357] Elapsed 4m 8s (remain 0m 57s) Loss: 0.0396 
Epoch: [3][1200/1357] Elapsed 4m 31s (remain 0m 35s) Loss: 0.0398 
Epoch: [3][1300/1357] Elapsed 4m 53s (remain 0m 12s) Loss: 0.0392 
Epoch: [3][1356/13

Epoch 3 - avg_train_loss: 0.0387  avg_val_loss: 0.0523  time: 331s
Epoch 3 - avg_train_loss: 0.0387  avg_val_loss: 0.0523  time: 331s
Epoch 3 - avg_train_loss: 0.0387  avg_val_loss: 0.0523  time: 331s
Epoch 3 - avg_train_loss: 0.0387  avg_val_loss: 0.0523  time: 331s
Epoch 3 - avg_train_loss: 0.0387  avg_val_loss: 0.0523  time: 331s
Epoch 3 - avg_train_loss: 0.0387  avg_val_loss: 0.0523  time: 331s
Epoch 3 - avg_train_loss: 0.0387  avg_val_loss: 0.0523  time: 331s
Epoch 3 - Score: 0.8770603357024045
Epoch 3 - Score: 0.8770603357024045
Epoch 3 - Score: 0.8770603357024045
Epoch 3 - Score: 0.8770603357024045
Epoch 3 - Score: 0.8770603357024045
Epoch 3 - Score: 0.8770603357024045
Epoch 3 - Score: 0.8770603357024045


EVAL: [339/340] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0523 


Score: 0.88809
Score: 0.88809
Score: 0.88809
Score: 0.88809
Score: 0.88809
Score: 0.88809
Score: 0.88809
Score: 0.88809
Score: 0.88809
Score: 0.88809
Score: 0.88809
Score: 0.88809
Score: 0.88809
Score: 0.88809
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertF

  0%|          | 0/2553 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

  0%|          | 0/2553 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

  0%|          | 0/2553 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

  0%|          | 0/2553 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

  0%|          | 0/2553 [00:00<?, ?it/s]