In [1]:
!pip install bitsandbytes accelerate peft
!pip install -U lightning

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting huggingface-hub>=0.21.0 (from accelerate)
  Downloading huggingface_hub-0.27.1-py3-none-any.whl.metadata (13 kB)
Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading peft-0.14.0-py3-none-any.whl (374 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.8/374.8 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.27.1-py3-none-any.whl (450 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m450.7/450.7 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface-hub, bitsandbytes, peft
  Attempting uninstall: huggingface-hub
 

In [2]:
import os
import random
import math
import gc
import time
import copy

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.distributed import DistributedSampler


import transformers
from transformers import(
    BitsAndBytesConfig,
    Gemma2ForSequenceClassification,
    GemmaTokenizerFast,
    Gemma2Config,
    EvalPrediction,
    Trainer, 
    TrainingArguments,
    DataCollatorWithPadding
)
from peft import LoraConfig, TaskType, prepare_model_for_kbit_training, get_peft_model
import bitsandbytes as bnb

from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

import pytorch_lightning as pl
from lightning.pytorch import seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [3]:
class CFG :
    debug_one_epoch=True
    debug_one_fold=False
    run_cv=True
    only_infer=False
    num_workers=2
    num_epochs=3
    warmup_prop=0.1
    lr=1e-4
    max_len=512
    batch_size=2
    early_stopping_rounds=5
    random_seed=42
    model_path="unsloth/gemma-2-9b-it-bnb-4bit" 
    pretrained_path=""
    optimizer="adamw_8bit"
    criterion=torch.nn.MSELoss()
    device=device
    fold=5
    use_amp = True
    per_device_train_batch_size = 2
    gradient_accumulation_steps = 2
    per_device_eval_batch_size = 8

    
    num_warmup_steps = 10
    num_training_steps = -1
    
    # lora config
    freeze_layer = 16
    tasktype = TaskType.SEQ_CLS
    lora_r = 16
    lora_alpha = lora_r * 2
    lora_dropout = 0.05
    lora_bias = "none"
    
    
def seed_torch(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    transformers.set_seed(seed)

    
seed_torch(CFG.random_seed)

In [4]:
lora_config = LoraConfig(
    task_type = CFG.tasktype,
    inference_mode = CFG.only_infer,
    r = CFG.lora_r,
    lora_alpha = CFG.lora_alpha,
    target_modules = ["q_proj", "k_proj", "v_proj"], #, "o_proj","gate_proj"
    layers_to_transform =  [i for i in range(42) if i >= CFG.freeze_layer], 
    lora_dropout = CFG.lora_dropout,
    bias = CFG.lora_bias,
)

In [5]:
if CFG.debug_one_epoch:
    CFG.num_epochs = 1

In [6]:
train = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
test = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
submission = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")

In [7]:
train["target"].describe()

count    2834.000000
mean       -0.959319
std         1.033579
min        -3.676268
25%        -1.690320
50%        -0.912190
75%        -0.202540
max         1.711390
Name: target, dtype: float64

In [8]:
for i in range(len(train)):
    before_txt = train.loc[i,"excerpt"]
    train.loc[i,"excerpt"] = f"You are a highschool teacher. Please grade the following essay carefully. \n {before_txt}"

In [9]:
train.loc[0,"excerpt"]

'You are a highschool teacher. Please grade the following essay carefully. \n When the young people returned to the ballroom, it presented a decidedly changed appearance. Instead of an interior scene, it was a winter landscape.\nThe floor was covered with snow-white canvas, not laid on smoothly, but rumpled over bumps and hillocks, like a real snow field. The numerous palms and evergreens that had decorated the room, were powdered with flour and strewn with tufts of cotton, like snow. Also diamond dust had been lightly sprinkled on them, and glittering crystal icicles hung from the branches.\nAt each end of the room, on the wall, hung a beautiful bear-skin rug.\nThese rugs were for prizes, one for the girls and one for the boys. And this was the game.\nThe girls were gathered at one end of the room and the boys at the other, and one end was called the North Pole, and the other the South Pole. Each player was given a small flag which they were to plant on reaching the Pole.\nThis would 

In [10]:
class CommonlitDataset(Dataset):
    def __init__(self, texts, targets, tokenizer, max_len=512, test=False):
        self.texts = texts
        if test == False:
            self.targets = targets
        self.test = test
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        
        ids = inputs["input_ids"].squeeze()
        mask = inputs["attention_mask"].squeeze()
        
        targets = torch.tensor(-100, dtype=torch.float)
        if self.test == False:
            targets = torch.tensor(self.targets[idx], dtype=torch.float)
            
        return {
            "input_ids": torch.tensor(ids, dtype=torch.long),
            "attention_mask": torch.tensor(mask, dtype=torch.long),
            "labels":targets
        }

In [11]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred  # 出力 (予測値, ラベル)
    predictions = np.squeeze(logits)  # 必要に応じて次元を調整
    mse = mean_squared_error(labels, predictions)
    return {"mse": mse}

LoRA の multi GPU 対応が面倒だったので苦渋の決断ですが Trainer を使うことにします（悲しい）

In [12]:
def main():
    if CFG.run_cv == True:
        kf = KFold(n_splits=CFG.fold, shuffle=True, random_state=CFG.random_seed) 
        for fold, (train_idx, valid_idx) in enumerate(kf.split(train)):
            tokenizer = GemmaTokenizerFast.from_pretrained(CFG.model_path)
            tokenizer.add_eos_token = True
            tokenizer.padding_side = "right"
            base_model = transformers.Gemma2ForSequenceClassification.from_pretrained(CFG.model_path, num_labels=1, torch_dtype=torch.float16, device_map = "auto")
            base_model_kbit = prepare_model_for_kbit_training(base_model)
            model = get_peft_model(base_model, lora_config)
            model.print_trainable_parameters()
            train_dataset = CommonlitDataset(texts=train.loc[train_idx, "excerpt"].values, targets=train.loc[train_idx, "target"].values, tokenizer = tokenizer, max_len=CFG.max_len, test=False)
            valid_dataset = CommonlitDataset(texts=train.loc[valid_idx, "excerpt"].values, targets=train.loc[valid_idx, "target"].values, tokenizer = tokenizer, max_len=CFG.max_len, test=False)
            outputdir_str = f"results_{fold}"
            train_args = TrainingArguments(
                output_dir = outputdir_str,
                overwrite_output_dir = True,
                report_to = "none",
                num_train_epochs = CFG.num_epochs,
                gradient_accumulation_steps=CFG.gradient_accumulation_steps,
                per_device_eval_batch_size=CFG.per_device_eval_batch_size,
                logging_strategy="steps",
                logging_steps=50,
                eval_strategy="epoch",
                save_strategy="steps",
                save_steps=200,
                optim=CFG.optimizer,
                fp16=True,
                learning_rate=CFG.lr,
                warmup_steps=CFG.num_warmup_steps,
            )
            trainer = Trainer(
                args = train_args,
                model = model, 
                tokenizer = tokenizer,
                train_dataset = train_dataset,
                eval_dataset = valid_dataset,
                compute_metrics = compute_metrics,
            )
            print("train started")
            trainer.train()
            if CFG.debug_one_fold == True: 
                break

    

    return

if __name__ == "__main__":
    main()

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


model.safetensors:   0%|          | 0.00/6.13G [00:00<?, ?B/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at unsloth/gemma-2-9b-it-bnb-4bit and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 7,884,288 || all params: 9,249,593,856 || trainable%: 0.0852
train started


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss,Mse
1,0.5105,0.316927,0.316927


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at unsloth/gemma-2-9b-it-bnb-4bit and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 7,884,288 || all params: 9,249,593,856 || trainable%: 0.0852
train started


Epoch,Training Loss,Validation Loss,Mse
1,0.4815,0.362672,0.362672


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at unsloth/gemma-2-9b-it-bnb-4bit and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 7,884,288 || all params: 9,249,593,856 || trainable%: 0.0852
train started


Epoch,Training Loss,Validation Loss,Mse
1,0.5329,0.347817,0.347817


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at unsloth/gemma-2-9b-it-bnb-4bit and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 7,884,288 || all params: 9,249,593,856 || trainable%: 0.0852
train started


Epoch,Training Loss,Validation Loss,Mse
1,0.5096,0.378167,0.378167


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at unsloth/gemma-2-9b-it-bnb-4bit and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 7,884,288 || all params: 9,249,593,856 || trainable%: 0.0852
train started


Epoch,Training Loss,Validation Loss,Mse
1,0.5564,0.359961,0.359961


# 工夫のしどころ、キーワード
- BERT の fine-tuning （全員向け） : このコンペのネタバレを含みますが、[BERT の fine-tuning のみを取り扱った良記事](https://www.ai-shift.co.jp/techblog/2138) があります。まずは CFG の値を変えてみるところからでいいので、やってみましょう！
- アンサンブルをしよう（全員向け） : IOAI 2024 だとあまり試す機会が無かったものの、 kaggle に近い問題になればなるほど強力な手法です。身につけておきましょう。モデルを変える、埋込表現だけ使う、など色々な方法が考えられると思います！
- 使用モデルの検討（初中級者↑向け） : BERT にも様々な発展形があります。調べて使ってみましょう。また、本当に BERT がベストなのでしょうか......?
- アンサンブル方法の検討（中級者↑向け） : スタッキングを知っていますか？私は知っています。
- AMP 対応（中上級者↑向け） : 名前に large が付いていたりする大きめのモデルだと学習と推論に時間がかかります。実験効率も大事なので対応させましょう。思想が許せば pytorch lightning が楽です。
- full train 戦略や random seed ensemble などの細かいテク（上級者向け） : ここらへんはコンペの振り返り記事を読み漁ると時々出てきます。類似コンペを調べてみましょう。
- 実験効率化（上級者向け） : 実験パイプラインの管理と高速化を大事にしましょう。wandb は特に at-home task での実験管理でとても便利です。また、先述の AMP を含めた高速化も実験数で勝負する時に重要になります。代表を強く意識しているのならば出来て損はないはずです。
- スコアアタック（上級者向け） : 数年前のコンペなので、最新の model や手法を使えばかなり面白いのではないでしょうか！