# Library

In [13]:
%%writefile ../config/exp_087_train.yaml
# Training configuration for experiment 087. The %%writefile magic on this cell's
# first line saves everything below to ../config/exp_087_train.yaml.
# NOTE(review): the notebook's Setup cell loads "exp_082_train", not this file —
# confirm which config the run is actually meant to use.
exp: "087"
run_type: "train"
task_type: "detect"
device: "cuda"
seed: 10

# data preprocess
remove_prefix: true
# Each entry is a [dataset name, boolean] pair; the meaning of the boolean is not
# visible in this notebook — TODO confirm in the data reader.
exter_dataset:
  - ["nicholas", true]
  - ["mpware", false]
  - ["pjma", false]
n_fold: 3
use_fold: 3

# dataset, dataloader
add_newline_token: true
max_length: 128
train_stride: 96
eval_stride: 64
# train_batch is the value passed as DataLoader batch_size when the notebook rebuilds
# its training loaders.
train_batch: 16
eval_batch: 64

# model
model_path: "microsoft/deberta-v3-large"
class_num: 8 # with prefix -> 13, without prefix -> 8
lstm_type: "none"
use_hidden_states: 2
dropout: 0.10
hidden_dropout: 0.10
attention_dropout: 0.10
reinit_layer_num: 0
freeze_layer_num: 0

# loss
# The notebook checks smooth_type == "online" to decide whether to carry the loss
# function's soft_matrix from the first training stage into the additional stage.
smooth_type: "online"
smooth_ratio: 0.05
smooth_pair: 0.05
positive_class_weight: 10

# optimizer
optimizer_type: "AdamW"
pretrained_lr: 1e-6
head_lr: 1e-4
weight_decay: 0.01
betas: [0.9, 0.999]

# scheduler
scheduler_type: "cosine_custom"
first_cycle_epochs: 4
cycle_factor: 1
num_warmup_steps: 0
min_lr: 1e-9
gamma: 1.0

# training
epochs: 4
accumulation_steps: 2
eval_steps: 1000
# Initial threshold only: the notebook's Run cell overwrites config.negative_th in
# memory with the best threshold found on the out-of-fold predictions.
negative_th: 0.660
negative_th_method: "overall"
amp: true
ema: true
ema_decay: 0.999
ema_update_after_step: 8000

# additional training
# NOTE(review): the notebook runs the additional training stage unconditionally and
# never reads add_train itself — confirm whether Trainer consumes this flag.
add_train: true
add_epochs: 4
add_first_cycle_epochs: 4

# full training
# Checked by the notebook's full-train cell (`if config.full_train:`).
full_train: true

In [14]:
import gc
import pickle
import sys
import warnings
from collections import Counter, defaultdict
from pathlib import Path

import numpy as np

warnings.filterwarnings("ignore")

import polars as pl
from tqdm.auto import tqdm

sys.path.append("..")

import torch
from torch.utils.data import DataLoader

from src.postprocess import PostProcessor
from src.preprocess import DetectDataReader
from src.train import Trainer, get_full_train_loader, get_train_loaders
from src.train.dataloader_utils import CollateFn, get_sampler, get_tokenizer
from src.utils import TimeUtil, get_config, get_logger, seed_everything
from src.utils.metric import get_best_negative_threshold

# Setup

In [44]:
# [TODO] Take these values from command-line arguments.
# NOTE(review): the first cell writes ../config/exp_087_train.yaml, but this loads
# "exp_082_train" — confirm which experiment config is intended.
config_name = "exp_082_train"
debug = False

In [45]:
# Load the experiment config named `config_name` from ../config.
config = get_config(config_name, config_dir=Path("../config"))
# NOTE(review): the logger is built from config.output_path here, *before* the next
# cell reassigns config.output_path and creates that directory — confirm the log
# file ends up where it is expected.
logger = get_logger(config.output_path)
logger.info(f"exp:{config.exp} start")

# Apply the configured seed through the project helper.
seed_everything(config.seed)

[ [32m2024-10-22 01:41:56[0m | [1mINFO ] exp:082 start[0m


In [46]:
# [TODO] Hard-coded notebook overrides applied on top of the loaded config.

# Filesystem locations, all rooted at ../data.
_data_root = Path("../data")
_path_overrides = {
    "input_path": _data_root / "input",
    "exter_path": _data_root / "input" / "external",
    "output_path": _data_root / "output" / config.exp,
}
for _key, _value in _path_overrides.items():
    setattr(config, _key, _value)

# Make sure the experiment's output directory exists before anything writes to it.
config.output_path.mkdir(parents=True, exist_ok=True)

# Run-scale overrides: fold usage, evaluation/EMA cadence and epoch budgets
# for both the first and the additional training stage.
_run_overrides = {
    "debug": debug,
    "use_fold": 3,
    "eval_steps": 500,  # 100
    "ema_update_after_step": 100,
    "epochs": 2,
    "first_cycle_epochs": 2,
    "add_epochs": 2,
    "add_first_cycle_epochs": 2,
}
for _key, _value in _run_overrides.items():
    setattr(config, _key, _value)

# Data

In [47]:
# Build the project's data reader (second argument "train") and load the records
# used by every later cell.
dpr = DetectDataReader(config, "train")
data = dpr.load_data()
# Log how many records were loaded.
logger.info(f"Data Size: {len(data)}")

[ [32m2024-10-22 01:42:02[0m | [1mINFO ] Data Size: 13854[0m


In [52]:
# config.use_fold = 1

In [53]:
# # [TODO] Adjust the data size

# data_ = []
# for fold in [-1, 0, 1, 2]:
#     fold_data = [d for d in data if d["fold"] == fold]
#     fold_data = fold_data[:100]
#     data_.extend(fold_data)

# data = data_
# len(data)

In [51]:
# Build the per-fold loaders; the Run cell iterates this as
# (train_loader, valid_loader) pairs, one pair per fold.
dataloaders = get_train_loaders(config, data)

In [54]:
# Sanity check: length of the first fold's train loader
# (presumably the number of batches, if this is a torch DataLoader — confirm).
len(dataloaders[0][0])

21110

# Run

In [13]:
# Per-fold results: OOF prediction frames and the best step counts of each stage.
# best_steps / best_add_steps are read again by the full-train cell.
oof_dfs = []
best_steps, best_add_steps = [], []
# Built once and reused whenever a training DataLoader is rebuilt
# (here and in the full-train cell).
collate_fn = CollateFn(get_tokenizer(config), is_train=True)

# The goal of this training is to obtain the best step counts and the OOF predictions.
for fold, (train_loader, valid_loader) in enumerate(dataloaders):
    logger.info(f"FOLD{fold} : Training Start...")

    # First Training
    trainer = Trainer(config, logger, save_suffix=f"_fold{fold}")
    # train() returns a 3-tuple; the third element is ignored for this stage.
    best_score, best_steps_, _ = trainer.train(train_loader, valid_loader)
    if config.smooth_type == "online":
        # Clone so the matrix outlives the trainer deleted just below.
        loss_soft_matrix = trainer.loss_fn.soft_matrix.clone()
    best_steps.append(best_steps_)
    logger.info(f"FOLD{fold} : First Training Done! -->> Best Score: {best_score}, Best Steps: {best_steps_}")

    # Drop the first-stage trainer and release cached GPU memory before the
    # second Trainer is constructed.
    del trainer
    gc.collect()
    torch.cuda.empty_cache()

    # Create High-Quality Dataloader
    # drop_first_only_data() is called for its side effect on the dataset object
    # (presumably removes samples meant only for the first stage — confirm in the
    # dataset implementation). The loader is then rebuilt around the same dataset.
    train_dataset = train_loader.dataset
    train_dataset.drop_first_only_data()
    train_loader = DataLoader(
        train_dataset,
        sampler=get_sampler(train_dataset),
        batch_size=config.train_batch,
        collate_fn=collate_fn,
        pin_memory=True,
        drop_last=True,
    )

    # Additional Training
    trainer = Trainer(config, logger, save_suffix=f"_fold{fold}")
    if config.smooth_type == "online":
        # Carry over the soft matrix captured from the first stage.
        trainer.loss_fn.soft_matrix = loss_soft_matrix
    # retrain=True with the fold's "best" weight name and the first-stage score
    # (presumably resumes from the first stage's best checkpoint — confirm in Trainer).
    best_score, best_add_steps_, oof_df = trainer.train(
        train_loader,
        valid_loader,
        retrain=True,
        retrain_weight_name=f"model_fold{fold}_best",
        retrain_best_score=best_score,
    )
    best_add_steps.append(best_add_steps_)
    # Persist this fold's OOF predictions immediately, then keep them for the merge below.
    oof_df.write_parquet(config.output_path / f"oof_fold{fold}.parquet")
    oof_dfs.append(oof_df)
    logger.info(
        f"FOLD{fold} : Additional Training Done! -->> Best Score: {best_score}, Best Add Steps: {best_add_steps_}"
    )

    # Free per-fold objects before moving on to the next fold.
    del train_loader, valid_loader, train_dataset, trainer, oof_df
    gc.collect()
    torch.cuda.empty_cache()

# NOTE: after this, the cell cannot be re-run without rebuilding `dataloaders`.
del dataloaders
gc.collect()

# Save OOF
oof_df = pl.concat(oof_dfs)
oof_df.write_parquet(config.output_path / "oof.parquet")
del oof_dfs
gc.collect()

# Get Best Negative Threshold
best_score, best_th = get_best_negative_threshold(config, oof_df)
message = f"Overall OOF Best Score: {best_score}, Best Negative Threshold: {best_th}"
logger.info(message)
# .item() assumes best_th is a NumPy/torch scalar rather than a plain Python float — TODO confirm.
config.negative_th = best_th.item()

[ [32m2024-10-21 08:05:41[0m | [1mINFO ] FOLD0 : Training Start...[0m


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/505 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

[ [32m2024-10-21 08:08:44[0m | [1mINFO ] Valid : Epoch=0, Loss=1.83803, Score=0.80858 Threshold=0.32499999999999996[0m
[ [32m2024-10-21 08:08:46[0m | [1mINFO ] [Train] : Epoch=0, Loss=2.55542, LR=1.00000e-06[0m


  0%|          | 0/505 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

[ [32m2024-10-21 08:12:41[0m | [1mINFO ] Valid : Epoch=1, Loss=4.20555, Score=0.84081 Threshold=0.27499999999999997[0m
[ [32m2024-10-21 08:12:46[0m | [1mINFO ] [Train] : Epoch=1, Loss=6.40125, LR=4.98946e-07[0m
[ [32m2024-10-21 08:12:50[0m | [1mINFO ] FOLD0 : First Training Done! -->> Best Score: 0.8408107492598499, Best Steps: 1000[0m


NameError: name 'torch' is not defined

In [13]:
# Train on the full dataset (no validation split), using the step budgets found
# during the per-fold runs. Depends on `best_steps`, `best_add_steps` and
# `collate_fn` defined in the per-fold Run cell.
if config.full_train:
    # Use the largest best-step count observed across folds for each stage.
    full_steps = np.max(best_steps)
    full_add_steps = np.max(best_add_steps)
    logger.info("Full Train : Training Start...")
    train_loader = get_full_train_loader(config, data)

    # First Training
    trainer = Trainer(config, logger, save_suffix="")
    trainer.train(train_loader, valid_loader=None, full_train=True, full_steps=full_steps)
    if config.smooth_type == "online":
        # Clone so the matrix outlives the trainer released just below.
        loss_soft_matrix = trainer.loss_fn.soft_matrix.clone()
    logger.info("Full Train : First Training Done!")

    # Release the first-stage trainer before building the second one, exactly as the
    # per-fold loop does. Without this, both trainers are alive at the same time and
    # the additional stage can run out of CUDA memory.
    del trainer
    gc.collect()
    torch.cuda.empty_cache()

    # Create High-Quality Dataloader
    # drop_first_only_data() is called for its side effect on the dataset object;
    # the loader is then rebuilt around the same dataset.
    train_dataset = train_loader.dataset
    train_dataset.drop_first_only_data()
    train_loader = DataLoader(
        train_dataset,
        sampler=get_sampler(train_dataset),
        batch_size=config.train_batch,
        collate_fn=collate_fn,
        pin_memory=True,
        drop_last=True,
    )

    # Additional Training
    trainer = Trainer(config, logger, save_suffix="")
    if config.smooth_type == "online":
        # Carry over the soft matrix captured from the first stage.
        trainer.loss_fn.soft_matrix = loss_soft_matrix
    trainer.train(
        train_loader,
        valid_loader=None,
        retrain=True,
        retrain_weight_name="model_full",
        full_train=True,
        full_steps=full_add_steps,
    )
    logger.info("Full Train : Additional Training Done!")

    # Final cleanup of the second-stage objects.
    del train_loader, trainer
    gc.collect()
    torch.cuda.empty_cache()

[ [32m2024-10-21 07:50:35[0m | [1mINFO ] Full Train : Training Start...[0m


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/654 [00:00<?, ?it/s]

[ [32m2024-10-21 07:51:08[0m | [1mINFO ] [Train] : Epoch=0, Loss=3.26228, LR=1.00000e-06[0m
[ [32m2024-10-21 07:51:08[0m | [1mINFO ] Full Train : First Training Done![0m


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/654 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 23.58 GiB total capacity; 23.20 GiB already allocated; 19.19 MiB free; 23.23 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

# PostProcess

In [21]:
# oof_df = pl.read_parquet(Path("../data/output/058") / "oof.parquet")
# oof_df

In [20]:
# config.class_num = 13
# config.negative_th = 0.42499999999999993
# config.input_path = Path("../data/input")
# config.debug = True

# # Post-Process
# pred_df = get_pred_df(oof_df, config.class_num, negative_th=config.negative_th)
# truth_df = get_truth_df(config, pred_df["document"].unique().to_list(), convert_idx=True)

# pper = PostProcessor(config)
# pred_df = pper.post_process(pred_df)
# score = evaluate_metric(pred_df, truth_df)
# logger.info(f"OOF Score after Post-Process: {score:.5f}")

Check Prefix Validity: 100%|██████████| 100/100 [00:00<00:00, 1373.59it/s]
Check PII Validity: 100%|██████████| 413/413 [00:00<00:00, 618.52it/s]

[ [32m2024-10-21 11:58:02[0m | [1mINFO ] OOF Score after Post-Process: 0.88480[0m



