# Library

In [1]:
%%writefile ../config/exp_087.yaml
exp: "087"
seed: 10
task_type: "detect"

# data preprocess
remove_prefix: true
exter_dataset:
  - ["nicholas", true]
  - ["mpware", false]
  - ["pjma", false]

n_fold: 3
use_fold: 3

# dataset, dataloader
add_newline_token: true
max_length: 128
train_stride: 96
eval_stride: 64
train_batch: 16
eval_batch: 64

# model
model_path: "microsoft/deberta-v3-large"
class_num: 8 # with prefix -> 13, without prefix -> 8
lstm_type: "none"
use_hidden_states: 2
dropout: 0.10
hidden_dropout: 0.10
attention_dropout: 0.10
reinit_layer_num: 0
freeze_layer_num: 0

# loss
smooth_type: "none"
smooth_ratio: 0.05
smooth_pair: 0.05
positive_class_weight: 10

# optimizer
optimizer_type: "AdamW"
pretrained_lr: 1e-6
head_lr: 1e-4
weight_decay: 0.01
betas: [0.9, 0.999]

# scheduler
scheduler_type: "cosine_custom"
first_cycle_epochs: 1
cycle_factor: 1
num_warmup_steps: 100
min_lr: 1e-9
gamma: 1.0

# training
epochs: 4
accumulation_steps: 2
eval_steps: 1000
negative_th: 0.660
device: "cuda"
amp: true
ema: true
ema_decay: 0.999
ema_update_after_step: 8000

# additional training
add_train: true
add_epochs: 2

# full training
full_train: true


Overwriting ../config/exp_087.yaml


In [2]:
import gc
import pickle
import sys
from collections import Counter, defaultdict
from pathlib import Path

import polars as pl
from tqdm.auto import tqdm

sys.path.append("..")

from src.preprocess import DetectDataProvider
from src.train import get_train_loaders
from src.utils import TimeUtil, get_config, get_logger, seed_everything

# Setup

In [3]:
# Command-line argument (hard-coded here for notebook use)
exp = "087"

In [4]:
# Load the config for this experiment from ../config and create the logger from config.output_path.
config = get_config(exp, config_dir=Path("../config"))
logger = get_logger(config.output_path)
logger.info(f"exp:{exp} start")

# Presumably seeds every RNG used by the pipeline with the configured seed -- confirm in src.utils.
seed_everything(config.seed)

[ [32m2024-10-17 09:03:16[0m | [1mINFO ] exp:087 start[0m


In [5]:
# Notebook-only overrides applied in place on the loaded config object.
config.debug = True
config.use_fold = 1  # the YAML written by the first cell sets use_fold: 3
config.exter_dataset  # bare expression: displayed as the cell output

[['nicholas', True], ['mpware', False], ['pjma', False]]

# Data

In [6]:
# Build the project data provider in "train" mode and load the records; the cell displays len(data).
dpr = DetectDataProvider(config, "train")
data = dpr.load_data()
len(data)

400

In [7]:
# Flatten the per-record "labels" sequences into one list (fed to Counter in the next cell).
labels = [label for record in data for label in record["labels"]]

In [8]:
Counter(labels)

Counter({0: 299461, 7: 2821, 1: 2664, 5: 1188, 4: 438, 6: 348, 3: 316, 2: 295})

In [9]:
# Iterable of (train_loader, valid_loader) pairs, one per fold; consumed by the loops in the Run section.
dataloaders = get_train_loaders(config, data)



# Model

In [10]:
from src.train.component_factory import ComponentFactory

In [11]:
import torch
import torch.nn.functional as F
from omegaconf import DictConfig
from torch import nn

# Class index -> index of its paired class (None when the class has no pair).
# Covers the 13-class label set (the config notes "with prefix -> 13"); the mapping is
# symmetric (1<->8, 4<->9, 5<->10, 6<->11, 7<->12). Used by the "weighted" smoothing mode of
# SmoothingCELoss, which assigns `smooth_pair` probability mass to the paired class.
# NOTE(review): the pairs presumably link the B-/I- variants of the same entity type -- confirm.
TARGET_PAIR_DICT = {
    0: None,
    1: 8,
    2: None,
    3: None,
    4: 9,
    5: 10,
    6: 11,
    7: 12,
    8: 1,
    9: 4,
    10: 5,
    11: 6,
    12: 7,
}


class SmoothingCELoss(nn.Module):
    def __init__(self, config: DictConfig, class_weight: list[float] | None = None):
        super().__init__()
        if class_weight is not None:
            class_weight = torch.tensor(class_weight, dtype=torch.float, device=config.device)
        self.loss = nn.CrossEntropyLoss(weight=class_weight)
        self.device = config.device

        self.class_num = config.class_num
        if config.remove_prefix and config.smooth_type == "weighted":
            config.smooth_type = "normal"
        self.smooth_type = config.smooth_type
        self.smooth_ratio = config.smooth_ratio
        self.smooth_pair = config.smooth_pair
        self.soft_matrix = self.get_soft_matrix()

    def forward(self, y_pred: torch.Tensor, y_true: torch.Tensor):
        y_pred = y_pred.view(-1, y_pred.size(-1))
        y_true = y_true.view(-1)

        valid_idx = y_true != -1
        y_pred = y_pred[valid_idx]
        y_true = y_true[valid_idx]

        y_true = self.get_soft_label(y_true)
        return self.loss(y_pred, y_true)

    def get_soft_label(self, y_true: torch.Tensor):
        if self.smooth_type in ["normal", "weighted"]:
            return self.soft_matrix[y_true]
        else:
            return y_true

    def get_soft_matrix(self):
        soft_matrix = torch.eye(self.class_num)

        if self.smooth_type == "normal":
            soft_matrix = soft_matrix * (1 - self.smooth_ratio) + self.smooth_ratio / self.class_num
            return soft_matrix.to(self.device)

        elif self.smooth_type == "weighted":
            for c, c_p in TARGET_PAIR_DICT.items():
                soft_label = soft_matrix[c]
                if c_p is not None:
                    soft_label[c_p] = self.smooth_pair

                soft_label = torch.where(soft_label == 0, self.smooth_ratio / self.class_num, soft_label)
                soft_label[c] = 1 - torch.sum(soft_label[soft_label != 1])
                soft_matrix[c] = soft_label
            return soft_matrix.to(self.device)
        else:
            return None

In [12]:
from omegaconf import DictConfig
from torch import nn
from torch.optim.optimizer import Optimizer

from src.model.detect_model import DetectModel

# from src.model.classify_model import ClassifyModel
# from src.train.loss import OnlineSmoothingCELoss, SmoothingCELoss
from src.train.optimizer import get_optimizer
from src.train.scheduler import get_scheduler


class ComponentFactory:
    """Static factories that build the model, loss, optimizer and scheduler from the config.

    Note: this notebook-local definition shadows the ``ComponentFactory`` imported earlier
    from ``src.train.component_factory``.
    """

    # [TODO] needs editing
    @staticmethod
    def get_model(config: DictConfig):
        """Instantiate the model for ``config.task_type`` and apply layer re-init / freezing.

        Raises:
            NotImplementedError: For ``task_type == "classify"`` (model not wired up here).
            ValueError: For any other unknown ``task_type``.
        """
        if config.task_type == "detect":
            model = DetectModel(config)
        elif config.task_type == "classify":
            # model = ClassifyModel(config)
            raise NotImplementedError("task_type 'classify' is not implemented in this notebook")
        else:
            raise ValueError(f"Invalid task_type: {config.task_type}")

        if config.reinit_layer_num > 0:
            model.reinit_layers(config.reinit_layer_num)
        if config.freeze_layer_num > 0:
            model.freeze_layers(config.freeze_layer_num)
        return model

    # [TODO] needs editing
    @staticmethod
    def get_loss(config: DictConfig):
        """Build the loss for ``config.task_type``.

        For "detect", class 0 gets weight 1 and every other class gets
        ``config.positive_class_weight``.

        Raises:
            NotImplementedError: For ``task_type == "classify"``.
            ValueError: For any other unknown ``task_type``.
        """
        if config.task_type == "detect":
            class_weight = [1] + [config.positive_class_weight] * (config.class_num - 1)
            if config.smooth_type == "online":
                # Imported lazily because the module-level import is commented out in this
                # notebook. NOTE(review): assumes src.train.loss still exports this class.
                from src.train.loss import OnlineSmoothingCELoss

                loss_fn = OnlineSmoothingCELoss(config, class_weight=class_weight)
            else:
                loss_fn = SmoothingCELoss(config, class_weight=class_weight)
        elif config.task_type == "classify":
            # loss_fn = WeightedBCELoss()
            raise NotImplementedError("task_type 'classify' is not implemented in this notebook")
        else:
            raise ValueError(f"Invalid task_type: {config.task_type}")
        return loss_fn

    @staticmethod
    def get_optimizer(config: DictConfig, model):
        """Build the optimizer via the project helper, with separate backbone and head learning rates."""
        optimizer = get_optimizer(
            model,
            optimizer_type=config.optimizer_type,
            pretrained_lr=config.pretrained_lr,
            head_lr=config.head_lr,
            weight_decay=config.weight_decay,
            betas=config.betas,
        )
        return optimizer

    @staticmethod
    def get_scheduler(config: DictConfig, optimizer: Optimizer, steps_per_epoch: int):
        """Build the LR scheduler selected by ``config.scheduler_type``.

        Args:
            config: Experiment config (scheduler-specific keys are read per type).
            optimizer: Optimizer whose learning rate is scheduled.
            steps_per_epoch: Number of steps counted as one epoch.

        Raises:
            ValueError: If ``config.scheduler_type`` is not one of the supported names.

        NOTE(review): ``Trainer.train`` passes ``len(train_loader)`` here but calls
        ``scheduler.step()`` only once per ``accumulation_steps`` batches -- confirm whether
        the step counts below should be divided by ``config.accumulation_steps``.
        """
        total_steps = (config.epochs - 1) * steps_per_epoch  # no LR decay during the first epoch
        if config.scheduler_type == "linear":
            scheduler_args = {
                "num_warmup_steps": config.num_warmup_steps,
                "num_training_steps": total_steps,
            }
        elif config.scheduler_type == "cosine":
            scheduler_args = {
                "num_warmup_steps": config.num_warmup_steps,
                "num_training_steps": total_steps,
                "num_cycles": config.num_cycles,
            }
        elif config.scheduler_type == "cosine_custom":
            first_cycle_steps = config.first_cycle_epochs * steps_per_epoch
            scheduler_args = {
                "first_cycle_steps": first_cycle_steps,
                "cycle_factor": config.cycle_factor,
                "num_warmup_steps": config.num_warmup_steps,
                "min_lr": config.min_lr,
                "gamma": config.gamma,
            }
        elif config.scheduler_type == "reduce_on_plateau":
            scheduler_args = {
                "mode": config.mode,
                "factor": config.factor,
                "patience": config.patience,
                "min_lr": config.min_lr,
            }
        else:
            raise ValueError(f"Invalid scheduler_type: {config.scheduler_type}")

        scheduler = get_scheduler(optimizer, scheduler_type=config.scheduler_type, scheduler_args=scheduler_args)
        return scheduler

In [21]:
def calculate_fbeta(tp: int, fp: int, fn: int, beta: float = 5.0):
    """Return the F-beta score for the given confusion counts.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives.
        beta: Weight of recall relative to precision (default 5.0).

    Returns:
        The F-beta score, or 0.0 when both precision and recall are zero.
    """
    predicted_positive = tp + fp
    actual_positive = tp + fn

    # An empty denominator counts as a score of zero for that component.
    precision = 0
    if predicted_positive > 0:
        precision = tp / predicted_positive

    recall = 0
    if actual_positive > 0:
        recall = tp / actual_positive

    if precision == 0 and recall == 0:
        return 0.0

    beta_sq = beta**2
    return (1 + beta_sq) * precision * recall / (beta_sq * precision + recall)


def evaluate_metric(
    pred_df: pl.DataFrame,
    truth_df: pl.DataFrame,
) -> float:
    """Score token-level predictions against the ground truth with F-beta.

    ``pred_df`` is left-joined onto ``truth_df`` by ("document", "token_idx"); truth rows
    without a prediction get ``pred = 0``. A token whose non-zero label was predicted as a
    different non-zero class is counted as both a false positive and a false negative.

    Args:
        pred_df: Frame with "document", "token_idx" and "pred" columns.
        truth_df: Frame with "document", "token_idx" and "label" columns.

    Returns:
        The score computed by ``calculate_fbeta`` with its default beta.
    """
    merged = truth_df.join(pred_df, on=["document", "token_idx"], how="left").with_columns(
        pred=pl.col("pred").fill_null(0)
    )

    label = pl.col("label")
    pred = pl.col("pred")

    def count_rows(mask: pl.Expr) -> int:
        # Number of joined rows satisfying the mask.
        return len(merged.filter(mask))

    true_pos = count_rows((label != 0) & (label == pred))
    false_pos = count_rows((label == 0) & (pred != 0))
    false_neg = count_rows((label != 0) & (pred == 0))
    wrong_class = count_rows((label != 0) & (pred != 0) & (label != pred))

    return calculate_fbeta(true_pos, false_pos + wrong_class, false_neg + wrong_class)

In [22]:
import loguru
import numpy as np
import polars as pl
import torch
from omegaconf import DictConfig
from torch import nn
from torch.cuda import amp
from torch.utils.data import DataLoader

from src.train.ema import ModelEmaV3
from src.utils.competition_utils import (
    get_char2org_df,
    get_char_pred_df,
    get_original_token_df,
    get_pred_df,
    get_truth_df,
    restore_prefix,
)
from src.utils.utils import AverageMeter, clean_message

# from src.utils.metric import evaluate_metric


class Trainer:
    """Training / validation driver for one fold of the detection model.

    Builds the model (wrapped in ``nn.DataParallel`` when several CUDA devices are visible,
    optionally tracked by an EMA copy), the loss and the AMP gradient scaler from ``config``.
    ``train`` runs the epoch loop with periodic validation and best-checkpoint saving.
    """

    def __init__(self, config: DictConfig, logger: loguru._Logger, save_suffix: str = ""):
        """
        Args:
            config: Experiment config.
            logger: Logger used for the per-epoch and per-validation messages.
            save_suffix: Inserted into the checkpoint file name ``model{save_suffix}_best.pth``.
        """
        self.config = config
        self.logger = logger
        self.save_suffix = save_suffix
        self.detail_pbar = True  # show a progress bar over batches, not only over epochs

        self.model = ComponentFactory.get_model(config)
        self.model = self.model.to(config.device)
        n_device = torch.cuda.device_count()
        if n_device > 1:
            self.model = nn.DataParallel(self.model)

        if self.config.ema:
            self.model_ema = ModelEmaV3(
                self.model,
                decay=config.ema_decay,
                update_after_step=config.ema_update_after_step,
                device=config.device,
            )

        self.loss_fn = ComponentFactory.get_loss(config)
        self.train_loss = AverageMeter()
        self.valid_loss = AverageMeter()
        self.grad_scaler = amp.GradScaler(enabled=config.amp)

    def _unwrapped_model(self) -> nn.Module:
        """Return the underlying model, looking through an ``nn.DataParallel`` wrapper."""
        if isinstance(self.model, nn.DataParallel):
            return self.model.module
        return self.model

    def train(
        self,
        train_loader: DataLoader,
        valid_loader: DataLoader,
        retrain: bool = False,
        retrain_weight_name: str = "",
        retrain_global_steps: int = 0,
        retrain_best_score: float = -np.inf,
        eval_only: bool = False,
    ):
        """Run the training loop (or only an evaluation of the saved best weights).

        Args:
            train_loader: Loader yielding training batches.
            valid_loader: Loader used for the periodic validation.
            retrain: If true, start from ``config.output_path / f"{retrain_weight_name}.pth"``
                and continue from ``retrain_global_steps`` / ``retrain_best_score``.
            retrain_weight_name: Checkpoint file stem used when ``retrain`` is true.
            retrain_global_steps: Initial global step counter when ``retrain`` is true.
            retrain_best_score: Score a new checkpoint has to beat when ``retrain`` is true.
            eval_only: If true, skip training and evaluate the saved best checkpoint.

        Returns:
            ``(best_score, best_steps, best_oof)``. With ``eval_only`` the step is ``-1``.
            If no validation improved on the starting score, ``best_oof`` is ``None`` and
            ``best_steps`` is ``-1`` (or ``retrain_global_steps`` when retraining).
        """
        if eval_only:
            score, _, oof_df = self.valid_evaluate(valid_loader, epoch=-1, load_best_weight=True)
            return score, -1, oof_df

        self.optimizer = ComponentFactory.get_optimizer(self.config, self.model)
        self.scheduler = ComponentFactory.get_scheduler(self.config, self.optimizer, steps_per_epoch=len(train_loader))

        global_steps = 0
        best_score = -np.inf
        # Defined up front so the final return works even if no evaluation ever improves the score.
        best_steps = -1
        best_oof = None

        if retrain:
            self.model.load_state_dict(torch.load(self.config.output_path / f"{retrain_weight_name}.pth"))
            global_steps = retrain_global_steps
            best_score = retrain_best_score
            best_steps = retrain_global_steps

        # Start of the training loop
        for epoch in tqdm(range(self.config.epochs)):
            self.model.train()
            self.train_loss.reset()

            # Freeze the backbone during the first epoch.
            # NOTE(review): freeze_backbone receives reinit_layer_num and unfreeze_backbone
            # receives freeze_layer_num, as in the original code -- confirm the intended arguments.
            if epoch == 0:
                self._unwrapped_model().freeze_backbone(self.config.reinit_layer_num)
            elif epoch == 1:
                self._unwrapped_model().unfreeze_backbone(self.config.freeze_layer_num)

            iterations = tqdm(train_loader, total=len(train_loader)) if self.detail_pbar else train_loader
            for data in iterations:
                _, loss = self.forward_step(self.model, data)
                self.train_loss.update(loss.item(), n=data[0].size(0))
                # Scale down so the accumulated gradient matches one large batch.
                loss = loss / self.config.accumulation_steps
                self.grad_scaler.scale(loss).backward()
                global_steps += 1

                if global_steps % self.config.accumulation_steps == 0:
                    self.grad_scaler.step(self.optimizer)
                    self.grad_scaler.update()
                    self.optimizer.zero_grad()

                    if self.config.ema:
                        self.model_ema.update(self.model, global_steps)

                    # Apply the scheduler only once backbone training has started.
                    if epoch > 0:
                        self.scheduler.step()

                if global_steps % self.config.eval_steps == 0:
                    score, _, oof_df = self.valid_evaluate(valid_loader, epoch, load_best_weight=False)
                    if score > best_score:
                        best_score = score
                        best_steps = global_steps
                        best_oof = oof_df
                        parameters = self.model_ema.module.state_dict() if self.config.ema else self.model.state_dict()
                        torch.save(
                            parameters,
                            self.config.output_path / f"model{self.save_suffix}_best.pth",
                        )
                    # valid_evaluate switched the model to eval mode.
                    self.model.train()

            message = f"""
                [Train] :
                    Epoch={epoch},
                    Loss={self.train_loss.avg:.5f},
                    LR={self.optimizer.param_groups[0]["lr"]:.5e}
            """
            self.logger.info(clean_message(message))

            if self.config.smooth_type == "online":
                self.loss_fn.update_soft_matrix()

        return best_score, best_steps, best_oof

    def valid_evaluate(self, valid_loader: DataLoader, epoch: int, load_best_weight: bool = False):
        """Evaluate on ``valid_loader`` and log loss and score.

        Args:
            valid_loader: Validation loader; its dataset supplies the offset / token metadata
                read by ``get_oof_df``.
            epoch: Epoch number, used only in the log message.
            load_best_weight: If true, load ``model{save_suffix}_best.pth`` into ``self.model``
                first and evaluate that model; otherwise the EMA model is evaluated when EMA
                is enabled.

        Returns:
            ``(score, loss, oof_df)`` where ``loss`` is the mean validation loss of this
            evaluation pass.
        """
        if load_best_weight:
            self.model.load_state_dict(torch.load(self.config.output_path / f"model{self.save_suffix}_best.pth"))

        self.model.eval()
        # Start from a clean meter so the reported loss covers this pass only.
        self.valid_loss.reset()
        eval_model = self.model if (load_best_weight or not self.config.ema) else self.model_ema

        preds = []
        with torch.no_grad():
            iterations = tqdm(valid_loader, total=len(valid_loader)) if self.detail_pbar else valid_loader
            for data in iterations:
                out, loss = self.forward_step(eval_model, data)

                self.valid_loss.update(loss.item(), n=data[0].size(0))
                preds.extend(F.softmax(out, dim=-1).cpu().numpy().tolist())

        oof_df = self.get_oof_df(preds, valid_loader)
        pred_df = get_pred_df(oof_df, self.config.class_num, self.config.negative_th)
        if self.config.remove_prefix:
            pred_df = restore_prefix(self.config, pred_df)

        truth_df = get_truth_df(self.config, pred_df["document"].unique().to_list(), convert_idx=True)
        score = evaluate_metric(pred_df, truth_df)

        loss = self.valid_loss.avg
        message = f"""
            Valid :
                Epoch={epoch},
                Loss={loss:.5f},
                Score={score:.5f}
        """
        self.logger.info(clean_message(message))
        return score, loss, oof_df

    def forward_step(self, model: nn.Module, data: tuple[torch.Tensor, ...]):
        """Move one batch to the configured device and run model + loss under autocast.

        Args:
            model: Module called as ``model(input_ids, attention_mask, positions_feats)``.
            data: Batch unpacked as ``(input_ids, attention_mask, positions_feats, labels)``.

        Returns:
            ``(out, loss)`` -- the model output and the loss tensor.
        """
        input_ids, attention_mask, positions_feats, labels = data
        input_ids = input_ids.to(self.config.device)
        attention_mask = attention_mask.to(self.config.device)
        positions_feats = positions_feats.to(self.config.device)
        labels = labels.to(self.config.device)
        with amp.autocast(enabled=self.config.amp):
            out = model(input_ids, attention_mask, positions_feats)
            loss = self.loss_fn(out, labels)
        return out, loss

    def get_oof_df(self, preds: list[list[float]], valid_loader: DataLoader):
        """Turn raw predictions into per-(document, token_idx) mean class probabilities.

        Character-level predictions are joined with the char-to-original-token mapping,
        rows with ``token_idx == -1`` are dropped, and the ``pred_{i}`` columns are averaged
        per original token.
        """
        char_pred_df = get_char_pred_df(
            preds,
            valid_loader.dataset.overlap_doc_ids,
            valid_loader.dataset.offset_mapping,
            class_num=self.config.class_num,
        )
        char2org_df = get_char2org_df(
            valid_loader.dataset.doc_ids,
            valid_loader.dataset.full_texts,
            valid_loader.dataset.org_tokens,
            valid_loader.dataset.whitespaces,
        )
        oof_df = char_pred_df.join(char2org_df, on=["document", "char_idx"], how="left")
        oof_df = (
            oof_df.filter(pl.col("token_idx") != -1)
            .group_by("document", "token_idx")
            .agg([pl.col(f"pred_{i}").mean() for i in range(self.config.class_num)])
        )
        return oof_df

# Run

In [24]:
# Debug run: train the first fold only with the notebook-local Trainer.
from src.train.train_utils import CollateFn, get_sampler, get_tokenizer

# NOTE(review): oof_dfs, tokenizer and collate_fn are only needed by the commented-out
# "High Quality Data Training" stage below; they are unused while that stage is disabled.
oof_dfs = []
tokenizer = get_tokenizer(config)
collate_fn = CollateFn(tokenizer, is_train=True)

for fold, (train_loader, valid_loader) in enumerate(dataloaders):
    logger.info(f"\n FOLD{fold} : Training Start \n")

    # First Training
    trainer = Trainer(config, logger, save_suffix=f"_fold{fold}")
    best_score, best_steps, _ = trainer.train(train_loader, valid_loader)
    break  # stop after the first fold

    # Planned second stage (disabled).
    # NOTE(review): it reads config.batch_size, while the YAML in the first cell defines
    # train_batch / eval_batch -- confirm the key before enabling.
    # High Quality Data Training
    # train_dataset = train_loader.dataset
    # train_dataset.drop_first_only_data()
    # train_loader = DataLoader(
    #     train_dataset,
    #     sampler=get_sampler(train_dataset),
    #     batch_size=config.batch_size,
    #     collate_fn=collate_fn,
    #     pin_memory=True,
    #     drop_last=True,
    # )

    # trainer = Trainer(config, logger, save_suffix=f"_fold{fold}")
    # best_score, best_steps, oof_df = trainer.train(
    #     train_loader,
    #     valid_loader,
    #     retrain=True,
    #     retrain_weight_name=f"model_fold{fold}_best",
    #     retrain_best_score=best_score,
    # )



[ [32m2024-10-17 09:33:44[0m | [1mINFO ] 
 FOLD0 : Training Start 
[0m




  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/561 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  oof_df = char_pred_df.join(char2org_df, on=["document", "char_idx"], how="left")
  pred_df = pred_df.with_columns(


[ [32m2024-10-17 09:33:49[0m | [1mINFO ] Valid : Epoch=0, Loss=2.19669, Score=0.02382[0m


  truth_df = truth_df.join(pred_df, on=["document", "token_idx"], how="left")


  0%|          | 0/7 [00:00<?, ?it/s]

  oof_df = char_pred_df.join(char2org_df, on=["document", "char_idx"], how="left")
  pred_df = pred_df.with_columns(


[ [32m2024-10-17 09:33:54[0m | [1mINFO ] Valid : Epoch=0, Loss=2.19668, Score=0.02382[0m


  truth_df = truth_df.join(pred_df, on=["document", "token_idx"], how="left")


  0%|          | 0/7 [00:00<?, ?it/s]

  oof_df = char_pred_df.join(char2org_df, on=["document", "char_idx"], how="left")
  pred_df = pred_df.with_columns(


[ [32m2024-10-17 09:33:57[0m | [1mINFO ] Valid : Epoch=0, Loss=2.19668, Score=0.02382[0m


  truth_df = truth_df.join(pred_df, on=["document", "token_idx"], how="left")


  0%|          | 0/7 [00:00<?, ?it/s]

  oof_df = char_pred_df.join(char2org_df, on=["document", "char_idx"], how="left")
  pred_df = pred_df.with_columns(


[ [32m2024-10-17 09:34:01[0m | [1mINFO ] Valid : Epoch=0, Loss=2.19668, Score=0.02382[0m


  truth_df = truth_df.join(pred_df, on=["document", "token_idx"], how="left")


  0%|          | 0/7 [00:00<?, ?it/s]

  oof_df = char_pred_df.join(char2org_df, on=["document", "char_idx"], how="left")
  pred_df = pred_df.with_columns(


[ [32m2024-10-17 09:34:05[0m | [1mINFO ] Valid : Epoch=0, Loss=2.19668, Score=0.02382[0m


  truth_df = truth_df.join(pred_df, on=["document", "token_idx"], how="left")


  0%|          | 0/7 [00:00<?, ?it/s]

  oof_df = char_pred_df.join(char2org_df, on=["document", "char_idx"], how="left")
  pred_df = pred_df.with_columns(


[ [32m2024-10-17 09:34:08[0m | [1mINFO ] Valid : Epoch=0, Loss=2.19667, Score=0.02382[0m


  truth_df = truth_df.join(pred_df, on=["document", "token_idx"], how="left")


  0%|          | 0/7 [00:00<?, ?it/s]

  oof_df = char_pred_df.join(char2org_df, on=["document", "char_idx"], how="left")
  pred_df = pred_df.with_columns(


[ [32m2024-10-17 09:34:12[0m | [1mINFO ] Valid : Epoch=0, Loss=2.19667, Score=0.02382[0m


  truth_df = truth_df.join(pred_df, on=["document", "token_idx"], how="left")


  0%|          | 0/7 [00:00<?, ?it/s]

  oof_df = char_pred_df.join(char2org_df, on=["document", "char_idx"], how="left")
  pred_df = pred_df.with_columns(


[ [32m2024-10-17 09:34:16[0m | [1mINFO ] Valid : Epoch=0, Loss=2.19667, Score=0.02382[0m


  truth_df = truth_df.join(pred_df, on=["document", "token_idx"], how="left")


  0%|          | 0/7 [00:00<?, ?it/s]

  oof_df = char_pred_df.join(char2org_df, on=["document", "char_idx"], how="left")
  pred_df = pred_df.with_columns(


[ [32m2024-10-17 09:34:19[0m | [1mINFO ] Valid : Epoch=0, Loss=2.19667, Score=0.02382[0m


  truth_df = truth_df.join(pred_df, on=["document", "token_idx"], how="left")


KeyboardInterrupt: 

In [None]:
# Train every fold, write its OOF predictions, and tune the negative threshold per fold.
# NOTE(review): this cell has not been executed and does not match the Trainer-based code above:
#   - get_model, train_model, get_best_negative_threshold and suffix are not defined or
#     imported in the visible part of this notebook;
#   - get_optimizer (imported from src.train.optimizer) is called elsewhere in this notebook
#     as get_optimizer(model, optimizer_type=..., ...), not get_optimizer(config, model).
# It will raise on a fresh kernel until it is ported or the missing names are imported.
oof_dfs = []
best_steps_list, best_add_steps_list = [], []
for fold, (train_loader, valid_loader) in enumerate(dataloaders):
    logger.info(f"\n FOLD{fold} : Training Start \n")
    model = get_model(config)
    optimizer = get_optimizer(config, model)
    oof_df, score, best_steps, best_add_steps = train_model(
        config,
        model,
        train_loader,
        valid_loader,
        optimizer,
        logger,
        fold,
        suffix=suffix,
    )
    oof_df.write_parquet(config.oof_path / f"oof_fold{fold}{suffix}.parquet")
    oof_dfs.append(oof_df)
    # The tuned threshold is written back to the shared config, so later folds start from it.
    best_score, best_th = get_best_negative_threshold(config, oof_df)
    config.negative_th = best_th
    message = f"FOLD: {fold}, Steps: {best_steps} + {best_add_steps}, Best Score: {best_score}, Best Negative Threshold: {best_th}"
    logger.info(message)
    best_steps_list.append(best_steps)
    best_add_steps_list.append(best_add_steps)

    # Release the per-fold objects and cached GPU memory before the next fold.
    del train_loader, valid_loader, model
    gc.collect()
    torch.cuda.empty_cache()
del dataloaders
gc.collect()

In [None]:
# NOTE(review): unexecuted cell -- suffix, get_best_negative_threshold, get_full_train_loader,
# train_data, get_model and full_train_model are not defined or imported in the visible part
# of this notebook, and oof_dfs / best_steps_list come from the preceding fold-loop cell.

# save oof (all folds concatenated)
oof_df = pl.concat(oof_dfs)
oof_df.write_parquet(config.oof_path / f"oof_{config.exp}{suffix}.parquet")

# get best threshold over the combined OOF predictions and store it on the config
best_score, best_th = get_best_negative_threshold(config, oof_df)
message = f"Overall OOF Best Score: {best_score}, Best Negative Threshold: {best_th}"
logger.info(message)
config.negative_th = best_th

# full train: step budgets are the maxima of the per-fold best step counts
if config.full_train:
    full_train_steps, full_train_add_steps = np.max(best_steps_list), np.max(best_add_steps_list)
    logger.info(f"\n Full Train : Training Start, Num of Steps : {full_train_steps} + {full_train_add_steps}\n")
    train_loader = get_full_train_loader(config, train_data)
    model = get_model(config)
    optimizer = get_optimizer(config, model)
    full_train_model(config, model, train_loader, optimizer, full_train_steps, full_train_add_steps, logger, suffix)
    message = "Full Train Completed"
    logger.info(message)

torch.Size([16, 128, 2])