# Library

In [5]:
%%writefile ../config/exp_118_train.yaml
exp: "118"
first_exp: "087"
run_type: "train"
task_type: "classify"
device: "cuda"
seed: 10

# data preprocess
first_negative_th: 0.400  # [TODO] tune later
n_fold: 3
use_fold: 3

# dataset, dataloader
add_newline_token: true
# max_length: 128
# train_stride: 96
# eval_stride: 64
# train_batch: 16
# eval_batch: 64

# model
model_path: "microsoft/deberta-v3-large"
lstm_type: "none"
use_hidden_states: 2
dropout: 0.10
hidden_dropout: 0.10
attention_dropout: 0.10
reinit_layer_num: 0
freeze_layer_num: 0

# loss
positive_class_weight: 10

# optimizer
optimizer_type: "AdamW"
pretrained_lr: 1e-6
head_lr: 1e-4
weight_decay: 0.01
betas: [0.9, 0.999]

# scheduler
scheduler_type: "cosine_custom"
first_cycle_epochs: 4
cycle_factor: 1
num_warmup_steps: 0
min_lr: 1e-9
gamma: 1.0

# # training
# epochs: 4
# accumulation_steps: 2
# eval_steps: 1000
# negative_th: 0.660
# negative_th_method: "overall"
# amp: true
# ema: true
# ema_decay: 0.999
# ema_update_after_step: 8000

# full training
full_train: true

Overwriting ../config/exp_118_train.yaml


In [6]:
import gc
import pickle
import sys
import warnings
from collections import Counter, defaultdict
from pathlib import Path
from typing import List

import numpy as np

warnings.filterwarnings("ignore")

import polars as pl
from tqdm.auto import tqdm

sys.path.append("..")

import torch
from torch import nn
from torch.utils.data import DataLoader

from src.postprocess import PostProcessor
from src.preprocess import DetectDataProvider
from src.train import Trainer, get_full_train_loader, get_train_loaders
from src.train.dataloader_utils import CollateFn, get_sampler, get_tokenizer
from src.utils import TimeUtil, get_config, get_logger, seed_everything
from src.utils.metric import get_best_negative_threshold

# Setup

In [7]:
# The goal of the 2nd stage is to reduce the NAME_STUDENT false positives produced by the 1st stage.
# Post-processing is not applied before the 2nd stage; it is applied after it.

In [8]:
# [TODO] Take these from command-line arguments.
# Name of the YAML config to load, and whether to run in debug mode.
config_name = "exp_118_train"
debug = False

In [14]:
# Load the experiment config by name from ../config, create the logger from the
# config's output path, and log the start of the experiment.
config = get_config(config_name, config_dir=Path("../config"))
logger = get_logger(config.output_path)
logger.info(f"exp:{config.exp} start")

# Seed with the value from the config so runs are repeatable.
seed_everything(config.seed)

[ [32m2024-10-21 11:26:23[0m | [1mINFO ] exp:118 start[0m


In [16]:
# [TODO] When running as a notebook, the paths are overridden to be relative to this notebook.
config.input_path = Path("../data/input")
config.exter_path = Path("../data/input/external")
# Outputs go to a per-experiment directory, created here if it does not exist yet.
config.output_path = Path("../data/output") / config.exp
config.output_path.mkdir(exist_ok=True, parents=True)

In [19]:
# Load the competition data.
# NOTE(review): load_json_data, convert_label_str2index, get_pred_df and restore_prefix are not
# imported in the visible cells -- confirm which src module provides them and import it at the top.
# `debug` is the notebook-level flag; `config.debug` is never assigned for this config in the
# visible cells.
train_data = load_json_data(config.input_path / "train.json", debug=debug)
train_data = convert_label_str2index(train_data, remove_prefix=False)

# Turn the OOF frame produced by the 1st stage into hard predictions.
oof_df = pl.read_parquet(config.output_path.parent / config.first_exp / "oof.parquet")
# Count the columns whose name contains "pred".
class_num = sum("pred" in column for column in oof_df.columns)
# The threshold comes from the config (`first_negative_th`); the previous `th` was never defined.
pred_df = get_pred_df(oof_df, class_num=class_num, negative_th=config.first_negative_th)
if class_num == 8:
    pred_df = restore_prefix(config, pred_df)

8

In [None]:
# # Code to use when actually running the 2nd stage (currently disabled)


# # load 2nd stage data
# train_data = load_2nd_stage_data(train_data, pred_df, is_train=True)
# if config.debug:
#     train_data = train_data[:300]

# # get cv fold
# fold_array = get_cv_fold_2nd(train_data, n_splits=config.n_fold, seed=config.seed)

# # get dataloader
# dataloaders = get_train_loaders_2nd(config, train_data, fold_array, use_fold=config.use_fold)

In [None]:
class ClassifyDataLoader:
    # Placeholder: no 2nd-stage (classification) dataloader logic is implemented here yet.
    pass

In [None]:
def load_2nd_stage_data(
    org_data: List[dict], pred_df: pl.DataFrame, is_train: bool = True, use_fn_label: bool = False
) -> List[dict]:
    """Attach 1st-stage NAME_STUDENT predictions to each document and build 2nd-stage labels.

    Args:
        org_data: Documents as dicts with at least "document" and "tokens" (and "labels"
            when ``is_train`` is true). The dicts are updated in place.
        pred_df: 1st-stage predictions with "document", "token_index" and "pred" columns.
            Tokens without a prediction row are treated as not predicted.
        is_train: When true, also build "second_labels" and overwrite "name_preds" with
            the mask of tokens that take part in 2nd-stage training.
        use_fn_label: When true, tokens the 1st stage missed (prediction 0, label 1 or 8)
            are also included as positive training targets.

    Returns:
        The documents that appear in ``pred_df`` and have at least one token flagged in
        "name_preds". "second_labels" uses 1 (true positive), 0 (false positive) and
        -1 (not a training target).
    """
    # pl.concat raises on an empty list, so return early for empty input.
    if not org_data:
        return []

    # One row per (document, token_index) so predictions can be aligned with the tokens.
    token_index_df = pl.concat(
        [
            pl.DataFrame(
                {
                    "document": [data["document"]] * len(data["tokens"]),
                    "token_index": list(range(len(data["tokens"]))),
                }
            )
            for data in org_data
        ]
    )

    # Tokens without a prediction get 0. Labels 1 and 8 both map to 1
    # (1st stage -> with prefix, 2nd stage -> without prefix).
    pred_ids = pred_df["document"].unique().to_list()
    token_index_df = token_index_df.filter(pl.col("document").is_in(pred_ids))
    pred_df = token_index_df.join(pred_df, on=["document", "token_index"], how="left")
    # when/then/otherwise avoids the deprecated `replace(..., default=...)`; a null "pred"
    # (no prediction row) falls into `otherwise`, as it did with the default.
    pred_df = pred_df.with_columns(
        name_pred=pl.when(pl.col("pred").is_in([1, 8])).then(pl.lit(1)).otherwise(pl.lit(0))
    )
    pred_df = pred_df.sort("document", "token_index")

    # Recent polars versions yield the group key as a 1-tuple, older ones as a bare value.
    # Unwrap it so the lookup by document id works on both.
    name_pred_dict = {}
    for group_key, tmp_df in pred_df.group_by("document"):
        doc_id = group_key[0] if isinstance(group_key, tuple) else group_key
        name_pred_dict[doc_id] = tmp_df["name_pred"].to_list()

    # Merge the NAME_STUDENT predictions into the original documents.
    merged_data = []
    for data in org_data:
        doc_id = data["document"]
        # Documents that are not in pred_df are left out.
        if doc_id not in name_pred_dict:
            continue
        name_preds = name_pred_dict[doc_id]
        data["name_preds"] = name_preds

        # Build the labels used by the 2nd stage.
        if is_train:
            second_labels, name_preds_ow = [], []
            for org_label, name_pred in zip(data["labels"], name_preds):
                is_name_label = org_label in (1, 8)
                if name_pred == 1:
                    # Predicted as a name: 1 for a true positive, 0 for a false positive.
                    second_labels.append(1 if is_name_label else 0)
                    name_preds_ow.append(1)
                elif use_fn_label and name_pred == 0 and is_name_label:
                    # False negatives are also training targets when requested.
                    second_labels.append(1)
                    name_preds_ow.append(1)
                else:
                    second_labels.append(-1)
                    name_preds_ow.append(0)

            data["second_labels"] = second_labels
            data["name_preds"] = name_preds_ow

        # Drop documents that have no token to train or predict on.
        if sum(data["name_preds"]) == 0:
            continue
        merged_data.append(data)
    return merged_data

In [None]:
def get_cv_fold_2nd(data: List[dict], n_splits: int, seed: int = 10) -> np.ndarray:
    """Assign a stratified cross-validation fold to every 2nd-stage document.

    Each document is put in a stratum based on its "second_labels", ignoring -1 entries:
    2 when exactly two distinct labels occur, otherwise the first remaining label.

    Args:
        data: Documents carrying a "second_labels" list.
        n_splits: Number of folds.
        seed: Random state passed to the shuffled stratified splitter.

    Returns:
        A float array with one entry per document holding the fold in which that
        document is used for validation.
    """
    # NOTE(review): StratifiedKFold is not imported in the visible cells -- presumably
    # sklearn.model_selection.StratifiedKFold; confirm and import it.
    strata = []
    for doc in data:
        target_labels = [label for label in doc["second_labels"] if label != -1]
        strata.append(2 if len(set(target_labels)) == 2 else target_labels[0])

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    fold_of_doc = np.full(len(data), -1.0)
    for fold_id, (_, valid_idx) in enumerate(splitter.split(data, y=strata)):
        fold_of_doc[valid_idx] = fold_id
    return fold_of_doc

In [None]:
class BaseModel(nn.Module):
    """Minimal ``nn.Module`` subclass that adds no parameters or behaviour of its own."""

    def __init__(self):
        super().__init__()


from omegaconf import DictConfig
from torch import nn
from transformers import AutoConfig, AutoModel


class DetectModel(nn.Module):
    """Transformer backbone with a position projection, an optional BiLSTM/BiGRU and an MLP head.

    The last ``config.use_hidden_states`` hidden states of the backbone are concatenated,
    summed with a linear projection of ``positions``, layer-normalised, optionally passed
    through a bidirectional recurrent layer, and mapped to ``config.class_num`` outputs.
    """

    def __init__(self, config: DictConfig):
        """Build the backbone and heads from ``config``.

        Reads ``use_hidden_states``, ``model_path``, ``hidden_dropout``, ``attention_dropout``,
        ``lstm_type`` ("none", "lstm" or "gru"), ``dropout`` and ``class_num`` from ``config``.
        """
        super().__init__()
        self.config = config
        self.use_hidden_states = config.use_hidden_states
        self.model_config = AutoConfig.from_pretrained(config.model_path)
        # Override the dropout rates and make the backbone return all hidden states.
        self.model_config.update(
            {
                "hidden_dropout_prob": config.hidden_dropout,
                "attention_probs_dropout_prob": config.attention_dropout,
                "output_hidden_states": True,
            }
        )
        hidden_size = self.model_config.hidden_size
        self.backbone = AutoModel.from_pretrained(config.model_path, config=self.model_config)

        # Optional bidirectional recurrent layer; its output width is hidden_size * 2.
        self.lstm_type = config.lstm_type
        if config.lstm_type == "lstm":
            self.lstm = nn.LSTM(
                hidden_size * self.use_hidden_states, hidden_size, num_layers=1, batch_first=True, bidirectional=True
            )
        elif config.lstm_type == "gru":
            self.lstm = nn.GRU(
                hidden_size * self.use_hidden_states, hidden_size, num_layers=1, batch_first=True, bidirectional=True
            )

        # Projects the 2-feature `positions` input to the width of the concatenated hidden states.
        self.pos_emb = nn.Sequential(
            nn.Linear(2, hidden_size * self.use_hidden_states),
            nn.Dropout(config.dropout),
        )
        # The head consumes either the concatenated hidden states or the bidirectional RNN output.
        head_input_size = hidden_size * self.use_hidden_states if config.lstm_type == "none" else hidden_size * 2
        self.head = nn.Sequential(
            nn.Linear(head_input_size, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Dropout(config.dropout),
            nn.Linear(128, config.class_num),
        )
        self.layer_norm = nn.LayerNorm(hidden_size * self.use_hidden_states)

        self.head.apply(self._init_weights)
        if config.lstm_type != "none":
            self._lstm_init_weights(self.lstm)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, initializer_range) and reset LayerNorm."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    # Tensorflow/Keras-like initialization for the recurrent layer (LSTM or GRU):
    # Xavier-uniform input weights, orthogonal recurrent weights, zero biases.
    def _lstm_init_weights(self, module):
        for name, p in module.named_parameters():
            if "weight_ih" in name:
                nn.init.xavier_uniform_(p.data)
            elif "weight_hh" in name:
                nn.init.orthogonal_(p.data)
            elif "bias" in name:
                p.data.fill_(0)

    def forward(self, input_ids, attention_mask, positions):
        """Return the head output for every input position.

        ``positions`` must have a last dimension of 2 (it feeds ``nn.Linear(2, ...)``).
        """
        x_pos = self.pos_emb(positions)
        x_bb = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        # Concatenate the last `use_hidden_states` hidden states along the feature axis.
        x_bb = torch.cat(x_bb.hidden_states[-self.use_hidden_states :], dim=-1)
        x = x_bb + x_pos
        x = self.layer_norm(x)
        if self.lstm_type != "none":
            x, _ = self.lstm(x)
        x = self.head(x)
        return x

In [6]:
# [TODO] Take these from command-line arguments.
# Name of the YAML config to load, and whether to run in debug mode.
config_name = "exp_087_train"
debug = False

In [8]:
# Load the experiment config by name from ../config, create the logger from the
# config's output path, and log the start of the experiment.
config = get_config(config_name, config_dir=Path("../config"))
logger = get_logger(config.output_path)
logger.info(f"exp:{config.exp} start")

# Seed with the value from the config so runs are repeatable.
seed_everything(config.seed)

[ [32m2024-10-21 08:05:14[0m | [1mINFO ] exp:087 start[0m


In [9]:
# [TODO] Temporary overrides of the loaded config for this notebook run.
config.debug = debug
config.use_fold = 3
config.eval_steps = 500  # 100
config.ema_update_after_step = 100

# Epoch settings for the first training pass and for the additional ("add_") pass.
config.epochs = 2
config.first_cycle_epochs = 2
config.add_epochs = 2
config.add_first_cycle_epochs = 2

# Data

In [10]:
# Load the training data through the project's DetectDataProvider and log how many items it returned.
dpr = DetectDataProvider(config, "train")
data = dpr.load_data()
logger.info(f"Data Size: {len(data)}")

[ [32m2024-10-21 08:05:24[0m | [1mINFO ] Data Size: 13854[0m


In [11]:
# [TODO] Tune the data size: keep at most the first 100 items of each fold (-1, 0, 1, 2).

data_ = []
for fold in (-1, 0, 1, 2):
    fold_data = [item for item in data if item["fold"] == fold][:100]
    data_ += fold_data

data = data_
len(data)

400

In [12]:
# Build the train dataloaders from the config and the (subsampled) data.
dataloaders = get_train_loaders(config, data)

# Model

In [None]:
class ClassifyModel(nn.Module):
    """Transformer backbone with an optional BiLSTM/BiGRU and an MLP head giving one logit per position.

    The last ``config.use_hidden_states`` hidden states of the backbone are concatenated,
    optionally passed through a bidirectional recurrent layer followed by dropout, and
    mapped to a single output per input position.
    """

    def __init__(self, config: DictConfig):
        """Build the backbone and head from ``config``.

        Reads ``use_hidden_states``, ``model_path``, ``hidden_dropout``, ``attention_dropout``,
        ``lstm_type`` ("none", "lstm" or "gru") and ``dropout`` from ``config``.
        """
        super().__init__()
        self.config = config
        self.use_hidden_states = config.use_hidden_states
        self.model_config = AutoConfig.from_pretrained(config.model_path)
        # Override the dropout rates and make the backbone return all hidden states.
        self.model_config.update(
            {
                "hidden_dropout_prob": config.hidden_dropout,
                "attention_probs_dropout_prob": config.attention_dropout,
                "output_hidden_states": True,
            }
        )
        hidden_size = self.model_config.hidden_size
        self.backbone = AutoModel.from_pretrained(config.model_path, config=self.model_config)

        # forward() reads `self.lstm_type`. The attribute used to be misspelled here, which
        # made every forward pass fail with AttributeError.
        self.lstm_type = config.lstm_type
        if config.lstm_type == "lstm":
            self.lstm = nn.LSTM(
                hidden_size * self.use_hidden_states, hidden_size, num_layers=1, batch_first=True, bidirectional=True
            )
        elif config.lstm_type == "gru":
            self.lstm = nn.GRU(
                hidden_size * self.use_hidden_states, hidden_size, num_layers=1, batch_first=True, bidirectional=True
            )

        # The bidirectional recurrent layer outputs hidden_size * 2 features, so the head width
        # must follow it (same rule as DetectModel). Without the recurrent layer the head
        # consumes the concatenated hidden states.
        head_input_size = hidden_size * self.use_hidden_states if config.lstm_type == "none" else hidden_size * 2
        self.head = nn.Sequential(
            nn.Linear(head_input_size, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Dropout(config.dropout),
            nn.Linear(128, 1),
        )
        # NOTE(review): layer_norm is not used in forward(); it is kept so the module's
        # parameters stay the same -- confirm whether it should be applied.
        self.layer_norm = nn.LayerNorm(hidden_size * self.use_hidden_states)
        self.dropout = nn.Dropout(config.dropout)

        self.head.apply(self._init_weights)

    # DeBERTa-style weight initialisation.
    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, initializer_range) and reset LayerNorm."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, attention_mask):
        """Return one logit per input position."""
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        # Concatenate the last N hidden states along the feature axis.
        x = torch.cat(outputs.hidden_states[-self.use_hidden_states :], dim=-1)
        if self.lstm_type != "none":
            x, _ = self.lstm(x)
            x = self.dropout(x)
        x = self.head(x)
        return x

    def reinit_layers(self, reinit_layer_num: int):
        """Re-initialise ``reinit_layer_num`` encoder layers, counting from the last one."""
        for i in range(1, reinit_layer_num + 1):
            self.backbone.encoder.layer[-i].apply(self._init_weights)

    def freeze_layers(self, freeze_layer_num: int):
        """Freeze ``freeze_layer_num`` blocks counting from the input side.

        The embeddings count as the first block, so a value of N freezes the embeddings
        plus the first N - 1 encoder layers.
        """
        for i in range(freeze_layer_num):
            if i == 0:
                for params in self.backbone.embeddings.parameters():
                    params.requires_grad = False
            else:
                for params in self.backbone.encoder.layer[i - 1].parameters():
                    params.requires_grad = False

    def freeze_backbone(self, reinit_layer_num: int):
        """Freeze the whole backbone except the last ``reinit_layer_num`` encoder layers."""
        for param in self.backbone.parameters():
            param.requires_grad = False

        for i in range(1, reinit_layer_num + 1):
            for params in self.backbone.encoder.layer[-i].parameters():
                params.requires_grad = True

    def unfreeze_backbone(self, freeze_layer_num: int):
        """Unfreeze the backbone, then re-freeze the blocks selected by ``freeze_layer_num``."""
        for param in self.backbone.parameters():
            param.requires_grad = True

        self.freeze_layers(freeze_layer_num)

# Run

In [13]:
# Per-fold results: OOF prediction frames and the best step counts of both training passes.
oof_dfs = []
best_steps, best_add_steps = [], []
collate_fn = CollateFn(get_tokenizer(config), is_train=True)

# The goal of this training is to obtain the best step counts and the OOF predictions.
for fold, (train_loader, valid_loader) in enumerate(dataloaders):
    logger.info(f"FOLD{fold} : Training Start...")

    # First Training
    trainer = Trainer(config, logger, save_suffix=f"_fold{fold}")
    best_score, best_steps_, _ = trainer.train(train_loader, valid_loader)
    # With online smoothing, keep a copy of the loss' soft matrix to carry into the second pass.
    if config.smooth_type == "online":
        loss_soft_matrix = trainer.loss_fn.soft_matrix.clone()
    best_steps.append(best_steps_)
    logger.info(f"FOLD{fold} : First Training Done! -->> Best Score: {best_score}, Best Steps: {best_steps_}")

    # Release the first trainer before the second one is created.
    del trainer
    gc.collect()
    torch.cuda.empty_cache()

    # Create High-Quality Dataloader
    # NOTE(review): drop_first_only_data presumably removes samples meant only for the first
    # pass -- confirm in the dataset class.
    train_dataset = train_loader.dataset
    train_dataset.drop_first_only_data()
    train_loader = DataLoader(
        train_dataset,
        sampler=get_sampler(train_dataset),
        batch_size=config.train_batch,
        collate_fn=collate_fn,
        pin_memory=True,
        drop_last=True,
    )

    # Additional Training: continues from the best weights saved by the first pass for this fold.
    trainer = Trainer(config, logger, save_suffix=f"_fold{fold}")
    if config.smooth_type == "online":
        trainer.loss_fn.soft_matrix = loss_soft_matrix
    best_score, best_add_steps_, oof_df = trainer.train(
        train_loader,
        valid_loader,
        retrain=True,
        retrain_weight_name=f"model_fold{fold}_best",
        retrain_best_score=best_score,
    )
    best_add_steps.append(best_add_steps_)
    oof_df.write_parquet(config.output_path / f"oof_fold{fold}.parquet")
    oof_dfs.append(oof_df)
    logger.info(
        f"FOLD{fold} : Additional Training Done! -->> Best Score: {best_score}, Best Add Steps: {best_add_steps_}"
    )

    del train_loader, valid_loader, train_dataset, trainer, oof_df
    gc.collect()
    torch.cuda.empty_cache()

del dataloaders
gc.collect()

# Save OOF: concatenate the per-fold frames into a single parquet file.
oof_df = pl.concat(oof_dfs)
oof_df.write_parquet(config.output_path / "oof.parquet")
del oof_dfs
gc.collect()

# Get Best Negative Threshold and store it on the config.
best_score, best_th = get_best_negative_threshold(config, oof_df)
message = f"Overall OOF Best Score: {best_score}, Best Negative Threshold: {best_th}"
logger.info(message)
config.negative_th = best_th.item()

[ [32m2024-10-21 08:05:41[0m | [1mINFO ] FOLD0 : Training Start...[0m


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/505 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

[ [32m2024-10-21 08:08:44[0m | [1mINFO ] Valid : Epoch=0, Loss=1.83803, Score=0.80858 Threshold=0.32499999999999996[0m
[ [32m2024-10-21 08:08:46[0m | [1mINFO ] [Train] : Epoch=0, Loss=2.55542, LR=1.00000e-06[0m


  0%|          | 0/505 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

[ [32m2024-10-21 08:12:41[0m | [1mINFO ] Valid : Epoch=1, Loss=4.20555, Score=0.84081 Threshold=0.27499999999999997[0m
[ [32m2024-10-21 08:12:46[0m | [1mINFO ] [Train] : Epoch=1, Loss=6.40125, LR=4.98946e-07[0m
[ [32m2024-10-21 08:12:50[0m | [1mINFO ] FOLD0 : First Training Done! -->> Best Score: 0.8408107492598499, Best Steps: 1000[0m


NameError: name 'torch' is not defined

In [13]:
# Train on the full dataset, using the largest best step counts found during cross-validation.
if config.full_train:
    full_steps = np.max(best_steps)
    full_add_steps = np.max(best_add_steps)
    logger.info("Full Train : Training Start...")
    train_loader = get_full_train_loader(config, data)

    # First Training
    trainer = Trainer(config, logger, save_suffix="")
    trainer.train(train_loader, valid_loader=None, full_train=True, full_steps=full_steps)
    if config.smooth_type == "online":
        loss_soft_matrix = trainer.loss_fn.soft_matrix.clone()
    logger.info("Full Train : First Training Done!")

    # Release the first trainer before building the second one, as the per-fold loop does.
    # Without this the previous run hit CUDA out-of-memory during the additional training.
    del trainer
    gc.collect()
    torch.cuda.empty_cache()

    # Create High-Quality Dataloader
    # `collate_fn` is the one created in the cross-validation cell.
    train_dataset = train_loader.dataset
    train_dataset.drop_first_only_data()
    train_loader = DataLoader(
        train_dataset,
        sampler=get_sampler(train_dataset),
        batch_size=config.train_batch,
        collate_fn=collate_fn,
        pin_memory=True,
        drop_last=True,
    )

    # Additional Training: continues from the weights saved by the first full-data pass.
    trainer = Trainer(config, logger, save_suffix="")
    if config.smooth_type == "online":
        trainer.loss_fn.soft_matrix = loss_soft_matrix
    trainer.train(
        train_loader,
        valid_loader=None,
        retrain=True,
        retrain_weight_name="model_full",
        full_train=True,
        full_steps=full_add_steps,
    )
    logger.info("Full Train : Additional Training Done!")

    del train_loader, train_dataset, trainer
    gc.collect()
    torch.cuda.empty_cache()

[ [32m2024-10-21 07:50:35[0m | [1mINFO ] Full Train : Training Start...[0m


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/654 [00:00<?, ?it/s]

[ [32m2024-10-21 07:51:08[0m | [1mINFO ] [Train] : Epoch=0, Loss=3.26228, LR=1.00000e-06[0m
[ [32m2024-10-21 07:51:08[0m | [1mINFO ] Full Train : First Training Done![0m


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/654 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 23.58 GiB total capacity; 23.20 GiB already allocated; 19.19 MiB free; 23.23 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

# PostProcess

In [24]:
# [TODO] Edit later.
# NOTE(review): get_pred_df, get_truth_df and evaluate_metric are not imported in the visible
# cells, and the 0.35 threshold is hard-coded -- confirm both.
# Score the saved OOF predictions before post-processing.
oof_df = pl.read_parquet(config.output_path / "oof.parquet")
pred_df = get_pred_df(oof_df, config.class_num, negative_th=0.35)
truth_df = get_truth_df(config, pred_df["document"].unique().to_list(), convert_idx=True)
score = evaluate_metric(pred_df, truth_df)
print(score)

# Score again after running the project's PostProcessor on the predictions.
pper = PostProcessor(config)
pred_df = pper.post_process(pred_df)
score = evaluate_metric(pred_df, truth_df)
print(score)

# logger.info(f"Post Processed Score: {score:.4f}")

0.04906380169061845


Check Prefix Validity: 100%|██████████| 300/300 [00:00<00:00, 843.14it/s]
Check PII Validity: 100%|██████████| 155221/155221 [14:01<00:00, 184.44it/s]


0.572628474337191


NameError: name 'pred_df' is not defined