In [1]:
import os
import random
import math
import gc
import time
import copy

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset


import transformers

from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

# Use the GPU when one is visible to PyTorch, otherwise fall back to CPU.
# The chosen device is printed so the run log records where the notebook executed.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [2]:
class CFG:
    """Central configuration for this notebook; values are read as class attributes."""

    # --- Run mode ---------------------------------------------------------
    run_cv = False            # main() trains with K-fold CV when True
    only_infer = True         # otherwise main() loads saved fold weights and predicts
    debug_one_epoch = False   # when True, num_epochs is forced to 1 in the next cell
    debug_one_fold = False    # when True, run_train_cv stops after the first fold

    # --- Reproducibility / hardware --------------------------------------
    random_seed = 42
    device = device           # module-level torch.device selected at import time
    num_workers = 8           # DataLoader worker processes

    # --- Model and weights -----------------------------------------------
    model_path = "/kaggle/input/huggingfacedebertav3variants/deberta-v3-base"
    pretrained_path = "/kaggle/input/commmonlit-deberta"  # directory with model_{fold}.pth files

    # --- Tokenization / batching -----------------------------------------
    max_len = 512
    batch_size = 8

    # --- Optimisation -----------------------------------------------------
    num_epochs = 10
    lr = 5e-5
    warmup_prop = 0.1         # fraction of total training steps used for LR warmup
    early_stopping_rounds = 5
    optimizer = torch.optim.AdamW
    criterion = torch.nn.MSELoss()

    # --- Cross-validation -------------------------------------------------
    fold = 5                  # number of KFold splits / saved fold checkpoints
    
def seed_torch(seed):
    """Seed the random number generators used in this notebook.

    Sets ``PYTHONHASHSEED``, seeds Python's ``random`` module, NumPy and
    PyTorch (``torch.manual_seed`` and ``torch.cuda.manual_seed``), and turns
    on cuDNN's deterministic mode.

    Args:
        seed: Seed value applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python and NumPy generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators.
    for seeder in (torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    torch.backends.cudnn.deterministic = True
    
# Seed all generators once, before any data loading or model construction.
seed_torch(CFG.random_seed)

In [3]:
# Debug switch: shrink training to a single epoch for a quick smoke run.
# Note this mutates CFG in place, so it only takes effect for code run after this cell.
if CFG.debug_one_epoch:
    CFG.num_epochs = 1

In [4]:
# Load the competition CSVs. Later cells read train["excerpt"], train["target"]
# and test["excerpt"], and write predictions into submission["target"].
train = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
test = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
submission = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")

In [5]:
# Summary statistics of the regression target (count, mean, spread, quantiles).
train["target"].describe()

count    2834.000000
mean       -0.959319
std         1.033579
min        -3.676268
25%        -1.690320
50%        -0.912190
75%        -0.202540
max         1.711390
Name: target, dtype: float64

In [6]:
# Load the bare backbone from CFG.model_path.
# NOTE(review): `model` is rebound to a sequence-classification model in a later
# cell before anything reads this object, so this load looks redundant — confirm
# and consider removing it to save time and memory.
model = transformers.AutoModel.from_pretrained(CFG.model_path)

In [7]:
class CommonlitDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its target.

    Args:
        texts: Indexable collection of raw text strings.
        targets: Indexable collection of numeric targets aligned with
            ``texts``. Ignored (may be ``None``) when ``test`` is true.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_len: Length every sequence is padded/truncated to.
        test: When true, no targets are available and each item carries a
            placeholder target of ``-100``.
    """

    # Placeholder target emitted in test mode, where no labels exist.
    _NO_TARGET = -100

    def __init__(self, texts, targets, tokenizer, max_len=512, test=False):
        self.texts = texts
        # Always define the attribute so it exists in test mode as well.
        self.targets = None if test else targets
        self.test = test
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text.

        Returns:
            Dict with ``"input_ids"`` and ``"attention_mask"`` (long tensors)
            and ``"targets"`` (float scalar tensor).
        """
        encoded = self.tokenizer.encode_plus(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # The tokenizer is expected to return a leading batch axis of size 1 for
        # a single text. Drop only that axis: a bare squeeze() would also
        # collapse the sequence axis when max_len == 1.
        # .to(torch.long) converts without re-wrapping an existing tensor in
        # torch.tensor(), which PyTorch warns against.
        input_ids = encoded["input_ids"].squeeze(0).to(torch.long)
        attention_mask = encoded["attention_mask"].squeeze(0).to(torch.long)

        if self.test:
            targets = torch.tensor(self._NO_TARGET, dtype=torch.float)
        else:
            targets = torch.tensor(self.targets[idx], dtype=torch.float)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "targets": targets,
        }

In [8]:
# Tokenizer matching the checkpoint at CFG.model_path; shared by every dataset below.
tokenizer = transformers.AutoTokenizer.from_pretrained(CFG.model_path)

# Notebook-level test loader (no targets, original row order preserved).
test_dataset = CommonlitDataset(texts=test["excerpt"].values, targets=None, tokenizer=tokenizer, max_len=CFG.max_len, test=True)
test_dataloader = DataLoader(test_dataset,batch_size=CFG.batch_size, shuffle=False, num_workers=CFG.num_workers, pin_memory=True)

In [9]:
# Single-output (num_labels=1) sequence-classification head, used here as a regressor.
# NOTE(review): run_train_cv and only_infer each build their own model per fold, and
# no later cell shown here reads this global — it may only be occupying device
# memory. Confirm before removing.
model = transformers.AutoModelForSequenceClassification.from_pretrained(CFG.model_path, num_labels=1)
model.to(CFG.device)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/huggingfacedebertav3variants/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine

In [10]:
def train_one_epoch(model, dataloader, optimizer, scheduler, criterion):
    """Run one training pass over ``dataloader``.

    The optimizer and the scheduler are both stepped once per batch.

    Args:
        model: Model whose output exposes ``.logits``.
        dataloader: Yields dicts with "input_ids", "attention_mask", "targets".
        optimizer: Optimizer over the model's parameters.
        scheduler: Learning-rate scheduler stepped after every optimizer step.
        criterion: Loss called as ``criterion(predictions, targets)``.

    Returns:
        Tuple ``(mean batch loss, concatenated predictions as a NumPy array)``.
    """
    model.to(CFG.device)
    model.train()

    batch_losses, batch_preds = [], []

    for batch in tqdm(dataloader):
        input_ids, attention_mask, targets = (
            batch[key].to(CFG.device)
            for key in ("input_ids", "attention_mask", "targets")
        )

        optimizer.zero_grad()
        # Drop the trailing label axis so predictions line up with targets.
        logits = model(input_ids, attention_mask).logits.squeeze(-1)
        step_loss = criterion(logits, targets)
        step_loss.backward()
        # Snapshot predictions off-graph before the weights are updated.
        step_pred = logits.detach().cpu().numpy()
        optimizer.step()
        scheduler.step()

        batch_losses.append(step_loss.item())
        batch_preds.append(step_pred)

    return np.mean(batch_losses), np.concatenate(batch_preds)

In [11]:
def eval_one_epoch(model, dataloader, criterion):
    """Run one gradient-free evaluation pass over ``dataloader``.

    Args:
        model: Model whose output exposes ``.logits``.
        dataloader: Yields dicts with "input_ids", "attention_mask", "targets".
        criterion: Loss called as ``criterion(predictions, targets)``.

    Returns:
        Tuple ``(mean batch loss, concatenated predictions as a NumPy array)``.
        The loss is computed against whatever "targets" the loader yields, so
        it is only meaningful when those targets are real labels.
    """
    model.to(CFG.device)
    model.eval()

    batch_losses, batch_preds = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids, attention_mask, targets = (
                batch[key].to(CFG.device)
                for key in ("input_ids", "attention_mask", "targets")
            )
            # Drop the trailing label axis so predictions line up with targets.
            logits = model(input_ids, attention_mask).logits.squeeze(-1)
            batch_losses.append(criterion(logits, targets).item())
            batch_preds.append(logits.cpu().numpy())

    return np.mean(batch_losses), np.concatenate(batch_preds)

In [12]:
def run_train_cv(train, test, tokenizer):
    """Fine-tune one model per K-fold split and predict the test set with each.

    For every fold a fresh model is loaded from ``CFG.model_path`` and trained
    for up to ``CFG.num_epochs`` epochs. The model from the epoch with the
    lowest validation RMSE is saved to ``model_{fold}.pth`` and used to predict
    the test set.

    Args:
        train: DataFrame with "excerpt" and "target" columns.
        test: DataFrame with an "excerpt" column.
        tokenizer: Tokenizer handed to ``CommonlitDataset``.

    Returns:
        List with one array of test predictions per trained fold.
    """
    # KFold.split yields positional indices, so index plain arrays rather than
    # using label-based ``.loc`` (which only lines up on a default RangeIndex).
    texts = train["excerpt"].values
    targets = train["target"].values

    # Build the test loader from the ``test`` argument instead of relying on a
    # notebook-level global defined in another cell.
    test_dataset = CommonlitDataset(texts=test["excerpt"].values, targets=None, tokenizer=tokenizer, max_len=CFG.max_len, test=True)
    test_dataloader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False, num_workers=CFG.num_workers, pin_memory=True)

    kf = KFold(n_splits=CFG.fold, shuffle=True, random_state=CFG.random_seed)

    # Accumulated across folds. Must be created outside the loop, otherwise
    # only the last fold's predictions survive.
    test_preds = []

    for fold, (train_idx, valid_idx) in enumerate(kf.split(train)):
        print(f"=====================fold {fold}=====================")
        train_dataset = CommonlitDataset(texts=texts[train_idx], targets=targets[train_idx], tokenizer=tokenizer, max_len=CFG.max_len, test=False)
        valid_dataset = CommonlitDataset(texts=texts[valid_idx], targets=targets[valid_idx], tokenizer=tokenizer, max_len=CFG.max_len, test=False)
        train_dataloader = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True, num_workers=CFG.num_workers, pin_memory=True)
        valid_dataloader = DataLoader(valid_dataset, batch_size=CFG.batch_size, shuffle=False, num_workers=CFG.num_workers, pin_memory=True)

        model = transformers.AutoModelForSequenceClassification.from_pretrained(CFG.model_path, num_labels=1)

        # The scheduler is stepped once per batch, so its horizon is in batches.
        total_steps = len(train_dataloader) * CFG.num_epochs
        optimizer = CFG.optimizer(model.parameters(), lr=CFG.lr)
        scheduler = transformers.get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(total_steps * CFG.warmup_prop),
            num_training_steps=total_steps,
        )
        criterion = CFG.criterion.to(CFG.device)

        best_rmse = np.inf
        best_model = None
        best_epoch = 0

        for epoch in range(CFG.num_epochs):
            train_loss, _ = train_one_epoch(model, train_dataloader, optimizer, scheduler, criterion)
            valid_loss, valid_preds = eval_one_epoch(model, valid_dataloader, criterion)
            valid_rmse = math.sqrt(mean_squared_error(targets[valid_idx], valid_preds))
            print(f"epoch {epoch} train_loss : {train_loss}, valid_loss : {valid_loss}, valid_rmse :{valid_rmse}")

            if valid_rmse < best_rmse:
                print("best rmse is updated, save model")
                best_rmse = valid_rmse
                best_model = copy.deepcopy(model)
                best_epoch = epoch
            # Patience counts epochs elapsed since the best one; a negative
            # early_stopping_rounds disables early stopping.
            elif CFG.early_stopping_rounds >= 0 and epoch - best_epoch >= CFG.early_stopping_rounds:
                print("early stopping")
                break

        torch.save(best_model.state_dict(), f"model_{fold}.pth")

        test_preds.append(eval_one_epoch(best_model, test_dataloader, criterion)[1])

        # Drop every per-fold object that holds parameter tensors (the optimizer
        # references them too) before asking CUDA to release cached memory.
        del model, best_model, optimizer, scheduler
        gc.collect()
        torch.cuda.empty_cache()

        if CFG.debug_one_fold:
            break

    return test_preds

In [13]:
def only_infer(test, model):
    """Predict the test set with each saved fold checkpoint.

    For every fold in ``range(CFG.fold)`` a model is built from
    ``CFG.model_path``, the weights at
    ``{CFG.pretrained_path}/model_{fold}.pth`` are loaded into it, and the test
    set is predicted.

    Args:
        test: DataFrame with an "excerpt" column.
        model: Not used. Kept so existing positional calls keep working; the
            models are built per fold inside this function, and tokenization
            uses the notebook-level ``tokenizer``.

    Returns:
        List with one array of test predictions per fold.
    """
    test_dataset = CommonlitDataset(texts=test["excerpt"].values, targets=None, tokenizer=tokenizer, max_len=CFG.max_len, test=True)
    test_dataloader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False, num_workers=CFG.num_workers, pin_memory=True)
    criterion = CFG.criterion.to(CFG.device)

    test_preds = []
    for fold in range(CFG.fold):
        # A distinct local name avoids silently rebinding the ``model`` argument.
        fold_model = transformers.AutoModelForSequenceClassification.from_pretrained(CFG.model_path, num_labels=1)
        # map_location lets checkpoints saved from GPU tensors load on a
        # CPU-only machine as well.
        # NOTE(review): torch.load unpickles the file; only point
        # CFG.pretrained_path at checkpoints you trust.
        state_dict = torch.load(f"{CFG.pretrained_path}/model_{fold}.pth", map_location=CFG.device)
        fold_model.load_state_dict(state_dict)
        test_preds.append(eval_one_epoch(fold_model, test_dataloader, criterion)[1])
        del fold_model, state_dict
        torch.cuda.empty_cache()

    return test_preds

In [14]:
def main():
    """Produce ``submission.csv`` by averaging per-fold test predictions.

    Trains with cross-validation when ``CFG.run_cv`` is set; otherwise loads
    saved fold weights when ``CFG.only_infer`` is set.

    Raises:
        ValueError: If neither ``CFG.run_cv`` nor ``CFG.only_infer`` is set,
            since there would be no predictions to write.
    """
    if CFG.run_cv:
        test_preds = run_train_cv(train, test, tokenizer)
    elif CFG.only_infer:
        test_preds = only_infer(test, tokenizer)
    else:
        # Previously this fell through and failed later on an unbound local.
        raise ValueError("Nothing to do: set CFG.run_cv or CFG.only_infer to True.")

    # Simple ensemble: element-wise mean over the per-fold prediction arrays.
    submission["target"] = np.mean(test_preds, axis=0)
    submission.to_csv("submission.csv", index=False)

if __name__ == "__main__":
    main()

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/huggingfacedebertav3variants/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1 [00:00<?, ?it/s]

Exception ignored in: <function _ConnectionBase.__del__ at 0x7c5ab343e7a0>
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 132, in __del__
    self._close()
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/huggingfacedebertav3variants/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1 [00:00<?, ?it/s]

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/queues.py", line 239, in _feed
    reader_close()Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/queues.py", line 239, in _feed

  File "/usr/lib/python3.10/multiprocessing/connection.py", line 177, in close
Exception ignored in: <function _ConnectionBase.__del__ at 0x7c5ab343e7a0>
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 132, in __del__
    self._close()    
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 361, in _close
    self._close()
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 361, in _close
    reader_close()
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 177, in close
_close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another except

  0%|          | 0/1 [00:00<?, ?it/s]

Exception in thread QueueFeederThread:
Exception ignored in: <function _ConnectionBase.__del__ at 0x7c5ab343e7a0>
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 132, in __del__
    Traceback (most recent call last):
Exception in thread QueueFeederThread:
Traceback (most recent call last):
self._close()
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 361, in _close
  File "/usr/lib/python3.10/multiprocessing/queues.py", line 239, in _feed
      File "/usr/lib/python3.10/multiprocessing/queues.py", line 239, in _feed
    reader_close()
_close(self._handle)
OSError: [Errno 9] Bad file descriptor
      File "/usr/lib/python3.10/multiprocessing/connection.py", line 177, in close
reader_close()
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 177, in close
    self._close()    self._close()
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 361, in _close

  File "/usr/lib/python3.10/multiproc

  0%|          | 0/1 [00:00<?, ?it/s]

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/queues.py", line 239, in _feed
    Exception ignored in: reader_close()
Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/queues.py", line 239, in _feed
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 177, in close
<function _ConnectionBase.__del__ at 0x7c5ab343e7a0>
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 132, in __del__
    reader_close()
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor
        self._close()
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 361, in _close
self._close()
  File "/usr/lib/python3.10/multiproc

  0%|          | 0/1 [00:00<?, ?it/s]

# 8. 工夫のしどころ、キーワード
- BERT の fine-tuning （全員向け） : このコンペのネタバレを含みますが、[BERT の fine-tuning のみを取り扱った良記事](https://www.ai-shift.co.jp/techblog/2138) があります。まずは CFG の値を変えてみるところからでいいので、やってみましょう！
- アンサンブルをしよう（全員向け） : IOAI 2024 だとあまり試す機会が無かったものの、 Kaggle に近い問題になればなるほど強力な手法です。身につけておきましょう。モデルを変える、埋込表現だけ使う、など色々な方法が考えられると思います！
- 使用モデルの検討（初中級者↑向け） : BERT にも様々な発展形があります。調べて使ってみましょう。また、本当に BERT がベストなのでしょうか......?
- アンサンブル方法の検討（中級者↑向け） : スタッキングを知っていますか？私は知っています。
- AMP 対応（中上級者↑向け） : 名前に large が付いていたりする大きめのモデルだと学習と推論に時間がかかります。実験効率も大事なので対応させましょう。思想が許せば PyTorch Lightning が楽です。
- full train 戦略や random seed ensemble などの細かいテク（上級者向け） : ここらへんはコンペの振り返り記事を読み漁ると時々出てきます。類似コンペを調べてみましょう。
- 実験効率化（上級者向け） : 実験パイプラインの管理と高速化を大事にしましょう。wandb は特に at-home task での実験管理でとても便利です。また、先述の AMP を含めた高速化も実験数で勝負する時に重要になります。代表を強く意識しているのならば出来て損はないはずです。
- スコアアタック（上級者向け） : 数年前のコンペなので、最新の model や手法を使えばかなり面白いのではないでしょうか！