# Library

In [3]:
%%writefile ../config/exp_118_train.yaml
exp: "118"
first_exp: "087"
run_type: "train"
task_type: "classify"
device: "cuda"
seed: 10

# data preprocess
first_negative_th: 0.400  # [TODO] tune later
n_fold: 3
use_fold: 3

# dataset, dataloader
add_newline_token: true
max_length: 2048
train_stride: 1024
eval_stride: 1024
train_batch: 2
eval_batch: 4

# model
model_path: "microsoft/deberta-v3-large"
class_num: 1
lstm_type: "none"
use_hidden_states: 2
dropout: 0.10
hidden_dropout: 0.10
attention_dropout: 0.10
reinit_layer_num: 0
freeze_layer_num: 0

# loss
positive_class_weight: 10

# optimizer
optimizer_type: "AdamW"
pretrained_lr: 1e-6
head_lr: 1e-4
weight_decay: 0.01
betas: [0.9, 0.999]

# scheduler
scheduler_type: "cosine_custom"
first_cycle_epochs: 4
cycle_factor: 1
num_warmup_steps: 0
min_lr: 1e-9
gamma: 1.0

# training
epochs: 4
accumulation_steps: 8
eval_steps: 1000
# negative_th: 0.660
# negative_th_method: "overall"
amp: true
ema: true
ema_decay: 0.999
ema_update_after_step: 8000

# full training
full_train: true

Overwriting ../config/exp_118_train.yaml


In [4]:
import gc
import pickle
import sys
import warnings
from collections import Counter, defaultdict
from pathlib import Path

import numpy as np

warnings.filterwarnings("ignore")

import polars as pl
from tqdm.auto import tqdm

sys.path.append("..")

import torch
from torch.utils.data import DataLoader

from src.postprocess import PostProcessor
from src.preprocess import ClassifyDataReader
from src.train import Trainer, get_full_train_loader, get_train_loaders
from src.train.dataloader_utils import CollateFn, get_sampler, get_tokenizer
from src.utils import TimeUtil, get_config, get_logger, seed_everything
from src.utils.metric import get_best_negative_threshold

# Setup

In [5]:
# The goal of the 2nd stage is to reduce the NAME-STUDENT false positives produced by the 1st stage.
# Post-processing is applied after the 2nd stage, not before it.

In [6]:
# [TODO] Take these from command-line arguments
# Name of the YAML file (without extension) under ../config, and the debug switch copied onto the config below.
config_name = "exp_118_train"
debug = False

In [7]:
# Load the experiment config, start the logger and fix the random seed.
# NOTE(review): config.output_path is read here but is only assigned in the next cell —
# confirm that get_config already provides it, otherwise move the path setup above this cell.
config = get_config(config_name, config_dir=Path("../config"))
config.debug = debug
logger = get_logger(config.output_path)
logger.info(f"exp:{config.exp} start")

seed_everything(config.seed)

[ [32m2024-10-22 05:49:45[0m | [1mINFO ] exp:118 start[0m


In [8]:
# Resolve the data directories relative to the notebook; outputs go to ../data/output/<exp>,
# which is created (including parents) if it does not exist yet.
config.input_path = Path("../data/input")
config.exter_path = Path("../data/input/external")
config.output_path = Path("../data/output") / config.exp
config.output_path.mkdir(exist_ok=True, parents=True)

# DataLoad

In [9]:
# Load the training data for the classification stage, passing the 1st-stage experiment id (config.first_exp).
reader = ClassifyDataReader(config, "train")
data = reader.load_data(first_exp=config.first_exp)

In [10]:
# Build the loaders; later cells unpack each entry of `dataloaders` as (train_loader, valid_loader).
dataloaders = get_train_loaders(config, data)

# Trainer

In [11]:
# pred_df : [document, token_idx, binary_pred, name_pred]

In [37]:
import loguru
import torch.nn.functional as F
from omegaconf import DictConfig
from torch import nn
from torch.cuda import amp

from src.train.component_factory import ComponentFactory
from src.train.ema import ModelEmaV3
from src.utils.competition_utils import (
    get_char2org_df,
    get_char_pred_df,
    get_first_pred_df,
    get_pred_df,
    get_truth_df,
    restore_prefix,
)
from src.utils.utils import AverageMeter, clean_message

ImportError: cannot import name 'get_first_pred_df' from 'src.utils.competition_utils' (/root/kaggle-pii-5th-place-solution/exp/../src/utils/competition_utils.py)

In [15]:
class Trainer:
    """Notebook-local training/evaluation driver for the "detect" and "classify" tasks.

    Bundles the model built by ``ComponentFactory`` with its loss, optimizer, LR scheduler, AMP gradient scaler
    and an optional EMA copy of the weights. Defining it here shadows the ``Trainer`` imported from ``src.train``
    for the rest of the notebook.
    """

    def __init__(self, config: DictConfig, logger: loguru._Logger, save_suffix: str = ""):
        """Build the model, loss and bookkeeping objects.

        Args:
            config: Experiment configuration (device, task_type, ema, amp, ... are read from it).
            logger: Logger used for the per-epoch and per-evaluation messages.
            save_suffix: Inserted into checkpoint file names (``model{save_suffix}_best.pth``).
        """
        self.config = config
        self.logger = logger
        self.save_suffix = save_suffix
        self.detail_pbar = True

        self.model = ComponentFactory.get_model(config)
        self.model = self.model.to(config.device)

        if self.config.ema:
            self.model_ema = ModelEmaV3(
                self.model,
                decay=config.ema_decay,
                update_after_step=config.ema_update_after_step,
                device=config.device,
            )

        self.loss_fn = ComponentFactory.get_loss(config)
        self.train_loss = AverageMeter()
        self.valid_loss = AverageMeter()

        # The optimizer is created in train(); the scheduler lazily on its first step.
        self.optimizer = None
        self.scheduler = None
        self.grad_scaler = amp.GradScaler(enabled=config.amp)

        # Cached on the first call to valid_evaluate().
        self.truth_df = None
        self.first_pred_df = None

    def train(
        self,
        train_loader: DataLoader,
        valid_loader: DataLoader | None,
        retrain: bool = False,
        retrain_weight_name: str = "",
        retrain_best_score: float = -np.inf,
        full_train: bool = False,
        full_steps: int = 0,
        eval_only: bool = False,
    ):
        """Run the training loop (or a single evaluation when ``eval_only`` is set).

        Args:
            train_loader: Loader yielding training batches.
            valid_loader: Loader used for evaluation; only read on the ``eval_only`` path while the in-loop
                evaluation below is commented out.
            retrain: Continue from ``{retrain_weight_name}.pth`` and use ``config.add_epochs`` /
                ``config.add_first_cycle_epochs`` instead of the first-stage values.
            retrain_weight_name: Checkpoint stem (inside ``config.output_path``) loaded when ``retrain`` is set.
            retrain_best_score: Initial best score when ``retrain`` is set.
            full_train: Must be False together with ``eval_only``; otherwise only referenced by the disabled
                checkpointing code.
            full_steps: Only referenced by the disabled checkpointing code.
            eval_only: Skip training and evaluate the saved best weights.

        Returns:
            ``(best_score, best_steps, best_oof)``. With in-loop evaluation disabled these keep their initial
            values (``-inf`` or ``retrain_best_score``, ``0``, ``None``). On the ``eval_only`` path the tuple is
            ``(score, -1, oof_df)``.
        """
        if eval_only:
            assert not full_train, "eval_only and full_train cannot be True at the same time"
            score, loss, oof_df = self.valid_evaluate(valid_loader, epoch=-1, load_best_weight=True)
            return score, -1, oof_df

        self.optimizer = ComponentFactory.get_optimizer(self.config, self.model)

        global_steps = 0
        update_steps = 0
        best_score = -np.inf
        best_steps = 0
        best_oof = None
        full_train_complete = False

        if retrain:
            self.model.load_state_dict(torch.load(self.config.output_path / f"{retrain_weight_name}.pth"))
            # model_ema only exists when EMA is enabled (see __init__), so guard the access.
            if self.config.ema:
                self.model_ema.update_after_step = 0
            best_score = retrain_best_score

        # Start of the training loop
        epochs = self.config.epochs if not retrain else self.config.add_epochs
        for epoch in tqdm(range(epochs)):
            if full_train_complete:
                break

            self.model.train()
            self.train_loss.reset()

            # The backbone is frozen during the first epoch
            if epoch == 0 and not retrain:
                self.model.freeze_backbone(self.config.reinit_layer_num)
            elif epoch == 1 and not retrain:
                self.model.unfreeze_backbone(self.config.freeze_layer_num)

            iterations = tqdm(train_loader, total=len(train_loader)) if self.detail_pbar else train_loader
            for data in iterations:
                if full_train_complete:  # the break has to happen at the top of the loop
                    break

                _, loss = self.forward_step(self.model, data)
                self.train_loss.update(loss.item(), n=data[0].size(0))
                # Scale the loss so the accumulated gradient matches one large batch.
                loss = loss / self.config.accumulation_steps
                self.grad_scaler.scale(loss).backward()
                global_steps += 1

                if global_steps % self.config.accumulation_steps == 0:
                    self.grad_scaler.step(self.optimizer)
                    self.grad_scaler.update()
                    self.optimizer.zero_grad()
                    update_steps += 1

                    if self.config.ema:
                        self.model_ema.update(self.model, update_steps)

                    # The scheduler is applied only once the backbone has started training
                    if epoch >= 1 or retrain:
                        if self.scheduler is None:
                            first_cycle_epochs = (
                                self.config.first_cycle_epochs if not retrain else self.config.add_first_cycle_epochs
                            )
                            total_steps = first_cycle_epochs * len(train_loader)
                            if not retrain:
                                total_steps -= len(train_loader)  # no scheduler steps are taken in the first epoch
                            self.scheduler = ComponentFactory.get_scheduler(
                                self.config, self.optimizer, total_steps=total_steps
                            )
                        self.scheduler.step()

                # Periodic evaluation / checkpointing is currently disabled:
                # if global_steps % self.config.eval_steps == 0 and not full_train:
                #     score, loss, oof_df = self.valid_evaluate(valid_loader, epoch, load_best_weight=False)
                #     if score > best_score:
                #         best_score = score
                #         best_steps = global_steps
                #         best_oof = oof_df
                #         parameters = self.model_ema.module.state_dict() if self.config.ema else self.model.state_dict()
                #         torch.save(
                #             parameters,
                #             self.config.output_path / f"model{self.save_suffix}_best.pth",
                #         )
                #     self.model.train()

                # if full_train and global_steps == full_steps:
                #     parameters = self.model_ema.module.state_dict() if self.config.ema else self.model.state_dict()
                #     torch.save(
                #         parameters,
                #         self.config.output_path / f"model{self.save_suffix}_full.pth",
                #     )
                #     full_train_complete = True  # breaking here makes the notebook kernel crash for some reason

            message = f"""
                [Train] :
                    Epoch={epoch},
                    Loss={self.train_loss.avg:.5f},
                    LR={self.optimizer.param_groups[0]["lr"]:.5e}
            """
            self.logger.info(clean_message(message))

            # if self.config.smooth_type == "online":
            #     self.loss_fn.update_soft_matrix()

        return best_score, best_steps, best_oof

    def valid_evaluate(self, valid_loader: DataLoader, epoch: int, load_best_weight: bool = False):
        """Predict on ``valid_loader``, build the token-level OOF frame and log the result.

        Args:
            valid_loader: Validation loader; its dataset is also read by ``get_oof_df``.
            epoch: Epoch number used in the log message only.
            load_best_weight: Load ``model{save_suffix}_best.pth`` into ``self.model`` and evaluate that model
                instead of the EMA copy.

        Returns:
            ``(score, loss, oof_df)``. For the "classify" task the score is still a placeholder (NaN).
        """
        if load_best_weight:
            self.model.load_state_dict(torch.load(self.config.output_path / f"model{self.save_suffix}_best.pth"))

        self.model.eval()
        # Start from a clean meter so the reported loss covers this evaluation only.
        self.valid_loss.reset()
        preds = []
        with torch.no_grad():
            iterations = tqdm(valid_loader, total=len(valid_loader)) if self.detail_pbar else valid_loader
            for data in iterations:
                if load_best_weight or not self.config.ema:
                    out, loss = self.forward_step(self.model, data)
                else:
                    out, loss = self.forward_step(self.model_ema, data)

                self.valid_loss.update(loss.item(), n=data[0].size(0))
                if self.config.task_type == "detect":
                    preds.extend(F.softmax(out, dim=-1).cpu().numpy().tolist())
                elif self.config.task_type == "classify":
                    preds.extend(F.sigmoid(out).cpu().numpy().tolist())

        oof_df = self.get_oof_df(preds, valid_loader)

        if self.truth_df is None:
            self.truth_df = get_truth_df(self.config, oof_df["document"].unique().to_list(), convert_idx=True)

        if self.first_pred_df is None:
            # NOTE(review): the notebook's import cell reports that get_first_pred_df cannot be imported from
            # src.utils.competition_utils — confirm it exists before relying on this path.
            self.first_pred_df = get_first_pred_df(
                self.config,
                oof_file_path=self.config.output_path.parent / self.config.first_exp / "oof.parquet",
                document_ids=oof_df["document"].unique().to_list(),
                negative_th=self.config.first_negative_th,
            )

        # task_type lives on the config; the trainer itself has no such attribute.
        if self.config.task_type == "detect":
            score, best_th = get_best_negative_threshold(self.config, oof_df, self.truth_df)
        elif self.config.task_type == "classify":
            # [TODO] Scoring for the 2nd stage is not implemented yet: the intent is to join oof_df onto
            # self.first_pred_df on (document, token_idx) and search the name_pred threshold.
            # NaN keeps the log line below printable and never compares as an improvement.
            score, best_th = float("nan"), None

        loss = self.valid_loss.avg
        message = f"""
            Valid :
                Epoch={epoch},
                Loss={loss:.5f},
                Score={score:.5f}
                Threshold={best_th}
        """
        self.logger.info(clean_message(message))
        return score, loss, oof_df

    def forward_step(self, model: nn.Module, data: torch.Tensor):
        """Move one batch to the configured device and run model + loss under autocast.

        Args:
            model: Module to call (the trained model or its EMA wrapper).
            data: Batch as produced by the loader: ``(input_ids, attention_mask, positions_feats, labels)`` for
                "detect", ``(input_ids, attention_mask, labels)`` for "classify".

        Returns:
            ``(out, loss)`` — the raw model output and the loss from ``self.loss_fn``.
        """
        if self.config.task_type == "detect":
            input_ids, attention_mask, positions_feats, labels = data
            positions_feats = positions_feats.to(self.config.device)
        elif self.config.task_type == "classify":
            input_ids, attention_mask, labels = data

        input_ids = input_ids.to(self.config.device)
        attention_mask = attention_mask.to(self.config.device)
        labels = labels.to(self.config.device)

        with amp.autocast(enabled=self.config.amp):
            if self.config.task_type == "detect":
                out = model(input_ids, attention_mask, positions_feats)
            elif self.config.task_type == "classify":
                out = model(input_ids, attention_mask)
            loss = self.loss_fn(out, labels)
        return out, loss

    def get_oof_df(self, preds: list[list[float]], valid_loader: DataLoader):
        """Convert raw predictions into one row per (document, token_idx).

        Character-level predictions are joined onto the character-to-original-token mapping, rows with
        ``token_idx == -1`` are dropped, and every ``pred_{i}`` column is averaged per original token.

        Args:
            preds: Predictions collected in ``valid_evaluate``.
            valid_loader: Loader whose dataset supplies the document ids, offsets, texts, tokens and whitespaces.

        Returns:
            A polars frame with ``document``, ``token_idx`` and ``pred_0 .. pred_{class_num - 1}``.
        """
        char_pred_df = get_char_pred_df(
            preds,
            valid_loader.dataset.overlap_doc_ids,
            valid_loader.dataset.offset_mapping,
            class_num=self.config.class_num,
        )
        char2org_df = get_char2org_df(
            valid_loader.dataset.doc_ids,
            valid_loader.dataset.full_texts,
            valid_loader.dataset.org_tokens,
            valid_loader.dataset.whitespaces,
        )
        oof_df = char2org_df.join(char_pred_df, on=["document", "char_idx"], how="left", coalesce=True)
        oof_df = (
            oof_df.filter(pl.col("token_idx") != -1)
            .group_by("document", "token_idx")
            .agg([pl.col(f"pred_{i}").mean() for i in range(self.config.class_num)])
        )
        return oof_df

In [17]:
# Scratch inference over the first fold's validation loader with a freshly built model.
valid_loader = dataloaders[0][1]
model = ComponentFactory.get_model(config)
loss_fn = ComponentFactory.get_loss(config)
model = model.to(config.device)
# Switch to inference mode: without this, dropout stays active and the predictions are noisy.
model.eval()


preds = []
with torch.no_grad():
    iterations = tqdm(valid_loader, total=len(valid_loader))
    for data in iterations:
        input_ids, attention_mask, labels = data
        input_ids = input_ids.to(config.device)
        attention_mask = attention_mask.to(config.device)
        labels = labels.to(config.device)

        with amp.autocast(enabled=config.amp):
            out = model(input_ids, attention_mask)
            loss = loss_fn(out, labels)
        # Sigmoid because the classify head emits one logit per token.
        preds.extend(F.sigmoid(out).cpu().numpy().tolist())

  0%|          | 0/9 [00:00<?, ?it/s]

In [18]:
# Cell-level copy of Trainer.get_oof_df: turn the raw predictions into one averaged row per
# (document, token_idx). class_num is forced to 1 here, matching the single pred_0 column of the classify task.
config.class_num = 1

char_pred_df = get_char_pred_df(
    preds,
    valid_loader.dataset.overlap_doc_ids,
    valid_loader.dataset.offset_mapping,
    class_num=config.class_num,
)
char2org_df = get_char2org_df(
    valid_loader.dataset.doc_ids,
    valid_loader.dataset.full_texts,
    valid_loader.dataset.org_tokens,
    valid_loader.dataset.whitespaces,
)
oof_df = char2org_df.join(char_pred_df, on=["document", "char_idx"], how="left", coalesce=True)
# Rows with token_idx == -1 are dropped before averaging per original token.
oof_df = (
    oof_df.filter(pl.col("token_idx") != -1)
    .group_by("document", "token_idx")
    .agg([pl.col(f"pred_{i}").mean() for i in range(config.class_num)])
)

In [19]:
# Ground truth restricted to the documents that appear in the OOF predictions.
truth_df = get_truth_df(config, oof_df["document"].unique().to_list(), convert_idx=True)

In [22]:
# NOTE(review): first_pred_df is assigned in the following cell; on a fresh kernel this preview must run after it.
first_pred_df

document,token_idx,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7
i32,i32,f64,f64,f64,f64,f64,f64,f64,f64
1210,153,0.646484,0.082275,0.04895,0.042053,0.046341,0.037064,0.046234,0.050644
2058,857,0.711182,0.056061,0.037735,0.035416,0.039917,0.031471,0.042725,0.045547
1325,110,0.692627,0.041061,0.049194,0.038605,0.047852,0.03714,0.048416,0.045334
2058,75,0.698975,0.050385,0.046539,0.029518,0.036362,0.039703,0.042831,0.055679
166,92,0.715332,0.052261,0.043762,0.031235,0.038345,0.03476,0.048981,0.035248
…,…,…,…,…,…,…,…,…,…
2745,548,0.689697,0.035629,0.033188,0.037094,0.035004,0.035995,0.091553,0.041748
1185,398,0.736084,0.0383,0.029083,0.03653,0.036774,0.034592,0.045471,0.043182
2722,859,0.722656,0.040604,0.034805,0.036758,0.036713,0.035248,0.056259,0.036911
1814,228,0.71582,0.038513,0.029724,0.03952,0.034882,0.036697,0.054214,0.05069


In [23]:
# Rebuild the 1st-stage predictions for exactly the documents present in the 2nd-stage OOF frame.
first_stage_oof_path = config.output_path.parent / config.first_exp / "oof.parquet"
valid_document_ids = oof_df["document"].unique()
first_pred_df = pl.read_parquet(first_stage_oof_path).filter(pl.col("document").is_in(valid_document_ids))

# The number of classes is inferred from how many column names contain "pred".
class_num = sum(1 for column in first_pred_df.columns if "pred" in column)
first_pred_df = get_pred_df(first_pred_df, class_num=class_num, negative_th=config.first_negative_th)

# Only the 8-class output goes through restore_prefix.
if class_num == 8:
    first_pred_df = restore_prefix(config, first_pred_df)

In [28]:
# Attach the 2nd-stage probability (renamed to name_pred) to every 1st-stage prediction row.
# It is a left join, so tokens without a 2nd-stage prediction get a null name_pred.
pred_df = first_pred_df.join(
    oof_df.rename({"pred_0": "name_pred"}), on=["document", "token_idx"], how="left", coalesce=True
)

In [34]:
from src.utils.metric import evaluate_metric


def get_best_name_pred_threshold(
    config: DictConfig, pred_df: pl.DataFrame, truth_df: pl.DataFrame, stride: float = 0.025
):
    """Search the 2nd-stage (classification) threshold that maximises the metric.

    For each candidate threshold, every row whose 1st-stage label is 1 or 8 and whose ``name_pred`` is below the
    threshold is relabelled to 0; the relabelled frame is scored with ``evaluate_metric``. Each candidate is
    derived from the untouched 1st-stage labels, so the result does not depend on the order in which thresholds
    are tried.

    Args:
        config: Unused here; kept so the signature matches the other threshold-search helpers.
        pred_df: Frame with at least ``pred`` (1st-stage label) and ``name_pred`` (2nd-stage probability), plus
            whatever ``evaluate_metric`` needs (presumably ``document`` and ``token_idx`` — confirm).
        truth_df: Ground truth passed through to ``evaluate_metric``.
        stride: Step between candidate thresholds in the half-open range [0.10, 0.90).

    Returns:
        ``(best_score, best_th)``. ``best_th`` is ``None`` when no candidate beats the score of the unmodified
        1st-stage predictions; otherwise it is the numpy float taken from ``np.arange``.
    """
    # Baseline: the 1st-stage predictions without any 2nd-stage filtering.
    best_score = evaluate_metric(pred_df, truth_df)
    best_th = None

    pred_df = pred_df.with_columns(org_pred=pl.col("pred"))
    # NOTE(review): labels 1 and 8 are presumably B-/I-NAME_STUDENT and 0 the negative class — confirm.
    is_name_label = pl.col("org_pred").is_in([1, 8])
    min_th, max_th = 0.10, 0.90
    for th in np.arange(min_th, max_th, stride):
        # A null name_pred makes the condition null, so such rows keep their original label.
        candidate_df = pred_df.with_columns(
            pred=pl.when(is_name_label & (pl.col("name_pred") < th)).then(0).otherwise(pl.col("org_pred"))
        )
        score = evaluate_metric(candidate_df, truth_df)
        if score > best_score:
            best_score = score
            best_th = th
    return best_score, best_th

In [35]:
# Search the name_pred threshold on the joined 1st/2nd-stage predictions.
best_score, best_th = get_best_name_pred_threshold(config, pred_df, truth_df)

In [36]:
# best_th is None when no candidate threshold scored higher than the unfiltered 1st-stage predictions.
best_score, best_th

(0.0911214953271028, None)

In [9]:
oof_dfs = []
best_steps, best_add_steps = [], []
collate_fn = CollateFn(get_tokenizer(config), is_train=True)

# The goal of this run is to obtain the best step count and the OOF predictions for every fold.
# Unlike the 1st-stage notebook there is no additional-training phase here, so the leftovers of that phase
# (best_add_steps_, train_dataset, a second `del trainer`, the soft-matrix hand-over) have been removed —
# each of the first three raised NameError.
for fold, (train_loader, valid_loader) in enumerate(dataloaders):
    logger.info(f"FOLD{fold} : Training Start...")

    # First Training
    trainer = Trainer(config, logger, save_suffix=f"_fold{fold}")
    best_score, best_steps_, oof_df = trainer.train(train_loader, valid_loader)
    best_steps.append(best_steps_)
    logger.info(f"FOLD{fold} : First Training Done! -->> Best Score: {best_score}, Best Steps: {best_steps_}")

    # Free the model before the next fold allocates a new one.
    del trainer
    gc.collect()
    torch.cuda.empty_cache()

    # Trainer.train returns None for the OOF frame while its in-loop evaluation is disabled.
    if oof_df is not None:
        oof_df.write_parquet(config.output_path / f"oof_fold{fold}.parquet")
        oof_dfs.append(oof_df)
    else:
        logger.info(f"FOLD{fold} : No OOF predictions were produced; skipping the OOF export")

    del train_loader, valid_loader, oof_df
    gc.collect()
    torch.cuda.empty_cache()

del dataloaders
gc.collect()

if oof_dfs:
    # Save OOF
    oof_df = pl.concat(oof_dfs)
    oof_df.write_parquet(config.output_path / "oof.parquet")

    # Get Best Negative Threshold
    # NOTE(review): Trainer.valid_evaluate calls get_best_negative_threshold with a third truth_df argument —
    # confirm which signature is current.
    best_score, best_th = get_best_negative_threshold(config, oof_df)
    message = f"Overall OOF Best Score: {best_score}, Best Negative Threshold: {best_th}"
    logger.info(message)
    config.negative_th = best_th.item()
else:
    logger.info("No OOF predictions were collected; skipping the threshold search")
del oof_dfs
gc.collect()

In [10]:
# Pull a single batch out of train_loader for inspection; the loop stops after the first iteration.
for data in train_loader:
    input_ids, attention_mask, labels = data
    break

In [11]:
# Shapes of the three tensors in the batch taken above.
input_ids.size(), attention_mask.size(), labels.size()

(torch.Size([2, 914]), torch.Size([2, 914]), torch.Size([2, 914]))

In [None]:
class BaseModel(nn.Module):
    """Empty ``nn.Module`` subclass; it only runs the parent initialiser."""

    def __init__(self):
        super().__init__()


# Moved out of the class body: indented there, the import bound a `BaseModel.torch` class attribute
# instead of the module-level `torch` name that the models below use.
import torch


from omegaconf import DictConfig
from torch import nn
from transformers import AutoConfig, AutoModel


class DetectModel(nn.Module):
    """Token-level detection network.

    A pretrained transformer backbone whose last ``use_hidden_states`` hidden states are concatenated, summed
    with a learned projection of two position features, layer-normalised, optionally passed through a
    bidirectional LSTM/GRU, and finally mapped to ``class_num`` logits per token by a small MLP head.
    """

    def __init__(self, config: DictConfig):
        super().__init__()
        self.config = config
        self.use_hidden_states = config.use_hidden_states

        # Backbone configuration: dropout overrides plus exposure of all hidden states.
        backbone_overrides = {
            "hidden_dropout_prob": config.hidden_dropout,
            "attention_probs_dropout_prob": config.attention_dropout,
            "output_hidden_states": True,
        }
        self.model_config = AutoConfig.from_pretrained(config.model_path)
        self.model_config.update(backbone_overrides)
        hidden_size = self.model_config.hidden_size
        self.backbone = AutoModel.from_pretrained(config.model_path, config=self.model_config)

        # Width of the concatenated hidden states.
        feature_size = hidden_size * self.use_hidden_states

        # Optional recurrent layer on top of the backbone features.
        self.lstm_type = config.lstm_type
        recurrent_cls = {"lstm": nn.LSTM, "gru": nn.GRU}.get(config.lstm_type)
        if recurrent_cls is not None:
            self.lstm = recurrent_cls(feature_size, hidden_size, num_layers=1, batch_first=True, bidirectional=True)

        self.pos_emb = nn.Sequential(
            nn.Linear(2, feature_size),
            nn.Dropout(config.dropout),
        )

        # A bidirectional recurrent layer emits 2 * hidden_size features per token.
        uses_recurrent = config.lstm_type != "none"
        head_input_size = hidden_size * 2 if uses_recurrent else feature_size
        self.head = nn.Sequential(
            nn.Linear(head_input_size, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Dropout(config.dropout),
            nn.Linear(128, config.class_num),
        )
        self.layer_norm = nn.LayerNorm(feature_size)

        self.head.apply(self._init_weights)
        if uses_recurrent:
            self._lstm_init_weights(self.lstm)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, initializer_range) and reset LayerNorm to identity."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=self.model_config.initializer_range)
            if isinstance(module, nn.Linear):
                if module.bias is not None:
                    module.bias.data.zero_()
            elif module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def _lstm_init_weights(self, module):
        """Tensorflow/Keras-like initialisation: Xavier input weights, orthogonal recurrent weights, zero biases."""
        initialisers = (
            ("weight_ih", nn.init.xavier_uniform_),
            ("weight_hh", nn.init.orthogonal_),
            ("bias", lambda tensor: tensor.fill_(0)),
        )
        for param_name, param in module.named_parameters():
            # Only the first matching rule applies, mirroring an if/elif chain.
            for marker, initialise in initialisers:
                if marker in param_name:
                    initialise(param.data)
                    break

    def forward(self, input_ids, attention_mask, positions):
        """Return per-token logits for a batch.

        ``positions`` is projected by ``pos_emb`` (a ``Linear(2, ...)``), so its last dimension must be 2.
        """
        # The position projection runs before the backbone call.
        position_features = self.pos_emb(positions)
        backbone_out = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        token_features = torch.cat(backbone_out.hidden_states[-self.use_hidden_states :], dim=-1)

        x = self.layer_norm(token_features + position_features)
        if self.lstm_type != "none":
            x, _ = self.lstm(x)
        return self.head(x)

In [6]:
# [TODO] Take these from command-line arguments
# From here on the notebook switches to the 1st-stage experiment config (exp_087).
config_name = "exp_087_train"
debug = False

In [8]:
# Reload config and logger for the experiment selected above, then re-seed.
config = get_config(config_name, config_dir=Path("../config"))
logger = get_logger(config.output_path)
logger.info(f"exp:{config.exp} start")

seed_everything(config.seed)

[ [32m2024-10-21 08:05:14[0m | [1mINFO ] exp:087 start[0m


In [9]:
# [TODO]
# Notebook-only overrides applied on top of the loaded config.
config.debug = debug
config.use_fold = 3
config.eval_steps = 500  # 100
config.ema_update_after_step = 100

# Epoch counts for the first training and for the additional ("add_") training phase.
config.epochs = 2
config.first_cycle_epochs = 2
config.add_epochs = 2
config.add_first_cycle_epochs = 2

# Data

In [10]:
# Load the 1st-stage (detection) training data.
# NOTE(review): DetectDataProvider is not imported in any visible cell — confirm where it comes from.
dpr = DetectDataProvider(config, "train")
data = dpr.load_data()
logger.info(f"Data Size: {len(data)}")

[ [32m2024-10-21 08:05:24[0m | [1mINFO ] Data Size: 13854[0m


In [11]:
# [TODO] Adjust the data size
# Keep only the first 100 records of each fold value (-1, 0, 1, 2) so the run finishes quickly.

data_ = []
for fold in [-1, 0, 1, 2]:
    fold_data = [d for d in data if d["fold"] == fold]
    fold_data = fold_data[:100]
    data_.extend(fold_data)

data = data_
len(data)

400

In [12]:
# Rebuild the loaders from the subsampled data; the run cell unpacks each entry as (train_loader, valid_loader).
dataloaders = get_train_loaders(config, data)

# Model

In [None]:
class ClassifyModel(nn.Module):
    """Token-level binary classifier used for the 2nd stage.

    A pretrained transformer backbone whose last ``use_hidden_states`` hidden states are concatenated, optionally
    passed through a bidirectional LSTM/GRU followed by dropout, and mapped to a single logit per token by a
    small MLP head. Also provides helpers to re-initialise, freeze and unfreeze backbone layers.
    """

    def __init__(self, config: DictConfig):
        super().__init__()
        self.config = config
        self.use_hidden_states = config.use_hidden_states
        self.model_config = AutoConfig.from_pretrained(config.model_path)
        self.model_config.update(
            {
                "hidden_dropout_prob": config.hidden_dropout,
                "attention_probs_dropout_prob": config.attention_dropout,
                "output_hidden_states": True,
            }
        )
        hidden_size = self.model_config.hidden_size
        self.backbone = AutoModel.from_pretrained(config.model_path, config=self.model_config)

        # forward() reads self.lstm_type; the attribute used to be misspelled ("lstlm_type"), which made
        # every forward pass raise AttributeError.
        self.lstm_type = config.lstm_type
        if config.lstm_type == "lstm":
            self.lstm = nn.LSTM(
                hidden_size * self.use_hidden_states, hidden_size, num_layers=1, batch_first=True, bidirectional=True
            )
        elif config.lstm_type == "gru":
            self.lstm = nn.GRU(
                hidden_size * self.use_hidden_states, hidden_size, num_layers=1, batch_first=True, bidirectional=True
            )

        # A bidirectional recurrent layer emits 2 * hidden_size features per token, so the head input width
        # depends on whether one is used (same rule as DetectModel).
        head_input_size = hidden_size * self.use_hidden_states if config.lstm_type == "none" else hidden_size * 2
        self.head = nn.Sequential(
            nn.Linear(head_input_size, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Dropout(config.dropout),
            nn.Linear(128, 1),
        )
        # Not applied in forward(); kept so the module's parameter set (state_dict keys) is unchanged.
        self.layer_norm = nn.LayerNorm(hidden_size * self.use_hidden_states)
        self.dropout = nn.Dropout(config.dropout)

        self.head.apply(self._init_weights)

    # DeBERTa-style weight initialisation
    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, initializer_range) and reset LayerNorm to identity."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, attention_mask):
        """Return one logit per token (last dimension of size 1)."""
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        x = torch.cat(outputs.hidden_states[-self.use_hidden_states :], dim=-1)  # concatenate the last N hidden states
        if self.lstm_type != "none":
            x, _ = self.lstm(x)
            x = self.dropout(x)
        x = self.head(x)
        return x

    # Re-initialise the given number of layers (counted from the last encoder layer)
    def reinit_layers(self, reinit_layer_num: int):
        """Apply ``_init_weights`` to the last ``reinit_layer_num`` encoder layers."""
        for i in range(1, reinit_layer_num + 1):
            self.backbone.encoder.layer[-i].apply(self._init_weights)

    # Freeze the given number of layers (counted from the first encoder layer)
    def freeze_layers(self, freeze_layer_num: int):
        """Freeze the embeddings (index 0) and then the first ``freeze_layer_num - 1`` encoder layers."""
        for i in range(freeze_layer_num):
            if i == 0:
                for params in self.backbone.embeddings.parameters():
                    params.requires_grad = False
            else:
                for params in self.backbone.encoder.layer[i - 1].parameters():
                    params.requires_grad = False

    # Freeze every layer except the re-initialised ones
    def freeze_backbone(self, reinit_layer_num: int):
        """Freeze the whole backbone, then re-enable gradients for the last ``reinit_layer_num`` encoder layers."""
        for param in self.backbone.parameters():
            param.requires_grad = False

        for i in range(1, reinit_layer_num + 1):
            for params in self.backbone.encoder.layer[-i].parameters():
                params.requires_grad = True

    # Unfreeze the backbone; layers that were configured as frozen stay frozen
    def unfreeze_backbone(self, freeze_layer_num: int):
        """Enable gradients for the whole backbone, then re-apply ``freeze_layers(freeze_layer_num)``."""
        for param in self.backbone.parameters():
            param.requires_grad = True

        self.freeze_layers(freeze_layer_num)

# Run

In [13]:
# Per-fold two-phase training: a first training run, then an additional run on a reduced dataset
# that starts from the first run's best checkpoint. The OOF frame of the second run is saved per fold.
oof_dfs = []
best_steps, best_add_steps = [], []
collate_fn = CollateFn(get_tokenizer(config), is_train=True)

# The goal of this run is to obtain the best step count and the OOF predictions
for fold, (train_loader, valid_loader) in enumerate(dataloaders):
    logger.info(f"FOLD{fold} : Training Start...")

    # First Training
    trainer = Trainer(config, logger, save_suffix=f"_fold{fold}")
    best_score, best_steps_, _ = trainer.train(train_loader, valid_loader)
    # Keep a copy of the loss's soft matrix so the second trainer can continue from it.
    if config.smooth_type == "online":
        loss_soft_matrix = trainer.loss_fn.soft_matrix.clone()
    best_steps.append(best_steps_)
    logger.info(f"FOLD{fold} : First Training Done! -->> Best Score: {best_score}, Best Steps: {best_steps_}")

    # Free the first trainer before the second one is built.
    del trainer
    gc.collect()
    torch.cuda.empty_cache()

    # Create High-Quality Dataloader
    # drop_first_only_data() mutates the dataset in place; the loader is then rebuilt around it.
    train_dataset = train_loader.dataset
    train_dataset.drop_first_only_data()
    train_loader = DataLoader(
        train_dataset,
        sampler=get_sampler(train_dataset),
        batch_size=config.train_batch,
        collate_fn=collate_fn,
        pin_memory=True,
        drop_last=True,
    )

    # Additional Training
    trainer = Trainer(config, logger, save_suffix=f"_fold{fold}")
    if config.smooth_type == "online":
        trainer.loss_fn.soft_matrix = loss_soft_matrix
    # Resume from this fold's best checkpoint and carry its score over as the starting best score.
    best_score, best_add_steps_, oof_df = trainer.train(
        train_loader,
        valid_loader,
        retrain=True,
        retrain_weight_name=f"model_fold{fold}_best",
        retrain_best_score=best_score,
    )
    best_add_steps.append(best_add_steps_)
    oof_df.write_parquet(config.output_path / f"oof_fold{fold}.parquet")
    oof_dfs.append(oof_df)
    logger.info(
        f"FOLD{fold} : Additional Training Done! -->> Best Score: {best_score}, Best Add Steps: {best_add_steps_}"
    )

    del train_loader, valid_loader, train_dataset, trainer, oof_df
    gc.collect()
    torch.cuda.empty_cache()

del dataloaders
gc.collect()

# Save OOF
oof_df = pl.concat(oof_dfs)
oof_df.write_parquet(config.output_path / "oof.parquet")
del oof_dfs
gc.collect()

# Get Best Negative Threshold
# NOTE(review): best_th.item() assumes a numpy scalar is returned — confirm against get_best_negative_threshold.
best_score, best_th = get_best_negative_threshold(config, oof_df)
message = f"Overall OOF Best Score: {best_score}, Best Negative Threshold: {best_th}"
logger.info(message)
config.negative_th = best_th.item()

[ [32m2024-10-21 08:05:41[0m | [1mINFO ] FOLD0 : Training Start...[0m


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/505 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

[ [32m2024-10-21 08:08:44[0m | [1mINFO ] Valid : Epoch=0, Loss=1.83803, Score=0.80858 Threshold=0.32499999999999996[0m
[ [32m2024-10-21 08:08:46[0m | [1mINFO ] [Train] : Epoch=0, Loss=2.55542, LR=1.00000e-06[0m


  0%|          | 0/505 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

[ [32m2024-10-21 08:12:41[0m | [1mINFO ] Valid : Epoch=1, Loss=4.20555, Score=0.84081 Threshold=0.27499999999999997[0m
[ [32m2024-10-21 08:12:46[0m | [1mINFO ] [Train] : Epoch=1, Loss=6.40125, LR=4.98946e-07[0m
[ [32m2024-10-21 08:12:50[0m | [1mINFO ] FOLD0 : First Training Done! -->> Best Score: 0.8408107492598499, Best Steps: 1000[0m


NameError: name 'torch' is not defined

In [13]:
# Train on the full dataset (no validation loader), reusing the largest best-step counts found per fold.
if config.full_train:
    full_steps = np.max(best_steps)
    full_add_steps = np.max(best_add_steps)
    logger.info("Full Train : Training Start...")
    train_loader = get_full_train_loader(config, data)

    # First Training
    trainer = Trainer(config, logger, save_suffix="")
    trainer.train(train_loader, valid_loader=None, full_train=True, full_steps=full_steps)
    if config.smooth_type == "online":
        loss_soft_matrix = trainer.loss_fn.soft_matrix.clone()
    logger.info("Full Train : First Training Done!")

    # Release the first trainer before the second one is built, exactly as the per-fold loop does.
    # Keeping both alive at once ran the GPU out of memory during the additional training.
    del trainer
    gc.collect()
    torch.cuda.empty_cache()

    # Create High-Quality Dataloader
    # drop_first_only_data() mutates the dataset in place; the loader is then rebuilt around it.
    train_dataset = train_loader.dataset
    train_dataset.drop_first_only_data()
    train_loader = DataLoader(
        train_dataset,
        sampler=get_sampler(train_dataset),
        batch_size=config.train_batch,
        collate_fn=collate_fn,
        pin_memory=True,
        drop_last=True,
    )

    # Additional Training
    trainer = Trainer(config, logger, save_suffix="")
    if config.smooth_type == "online":
        trainer.loss_fn.soft_matrix = loss_soft_matrix
    trainer.train(
        train_loader,
        valid_loader=None,
        retrain=True,
        retrain_weight_name="model_full",
        full_train=True,
        full_steps=full_add_steps,
    )
    logger.info("Full Train : Additional Training Done!")

    del train_loader, trainer
    gc.collect()
    torch.cuda.empty_cache()

[ [32m2024-10-21 07:50:35[0m | [1mINFO ] Full Train : Training Start...[0m


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/654 [00:00<?, ?it/s]

[ [32m2024-10-21 07:51:08[0m | [1mINFO ] [Train] : Epoch=0, Loss=3.26228, LR=1.00000e-06[0m
[ [32m2024-10-21 07:51:08[0m | [1mINFO ] Full Train : First Training Done![0m


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/654 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 23.58 GiB total capacity; 23.20 GiB already allocated; 19.19 MiB free; 23.23 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

# PostProcess

In [24]:
# [TODO] Revise later
# Score the saved OOF predictions before and after post-processing.
# NOTE(review): negative_th=0.35 is hard-coded here rather than taken from config.negative_th — confirm intent.
oof_df = pl.read_parquet(config.output_path / "oof.parquet")
pred_df = get_pred_df(oof_df, config.class_num, negative_th=0.35)
truth_df = get_truth_df(config, pred_df["document"].unique().to_list(), convert_idx=True)
score = evaluate_metric(pred_df, truth_df)
print(score)

# Same metric after PostProcessor.post_process has rewritten the predictions.
pper = PostProcessor(config)
pred_df = pper.post_process(pred_df)
score = evaluate_metric(pred_df, truth_df)
print(score)

# logger.info(f"Post Processed Score: {score:.4f}")

0.04906380169061845


Check Prefix Validity: 100%|██████████| 300/300 [00:00<00:00, 843.14it/s]
Check PII Validity: 100%|██████████| 155221/155221 [14:01<00:00, 184.44it/s]


0.572628474337191


NameError: name 'pred_df' is not defined