In [None]:
import numpy as np
import torch
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import logging

import src
from src.bert import module
from src.bert import training
from src.bert.dataset import PBertDataset
from src.bert.dataset import strategies

In [None]:
# Only surface error-level messages from the transformers library.
logging.set_verbosity_error()

# model hyper-parameters
LR = 4e-6
N_EPOCHS = 13
BATCH_SIZE = 16

# Pretrained checkpoint names for the tokenizer and the encoder; both
# currently refer to the same checkpoint.
TOKENIZER = "deepset/gbert-large"
BASE_MODEL = "deepset/gbert-large"

# Label strategy that is handed to the dataset loader.
STRATEGY = strategies.MLMin1PopIdeol(output_fmt="single_task")

# Prefer the GPU, but fall back to the CPU so the notebook still runs
# (slowly) on machines without CUDA instead of failing at `model.to(DEVICE)`.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Load the zipped CSV located under src.PATH; STRATEGY is passed as the
# label strategy.
dataset = PBertDataset.from_disk(
    src.PATH / "data/bert/dataset.csv.zip",
    label_strategy=STRATEGY,
)

In [None]:
# Sanity check: display the length of the loaded dataset as the cell output.
len(dataset)

In [None]:
# Instantiate the pretrained tokenizer identified by TOKENIZER.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)

In [None]:
# The dataset supplies its own batch-collation callable, built around the
# tokenizer.
collate_fn = dataset.create_collate_fn(tokenizer)

# Shuffled mini-batches of BATCH_SIZE items for training.
dataset_loader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)

In [None]:
# Seed shared by weight initialisation and the training run.
SEED = 10

# Seed PyTorch *before* the model is built: any parameters that are randomly
# initialised during construction would otherwise depend on whatever state
# the global RNG happens to be in, making runs non-reproducible.
torch.manual_seed(SEED)

model = module.BertSingleTaskMultiLabel(num_labels=dataset.num_labels, name=BASE_MODEL)
model.train()
model = model.to(DEVICE)
# Project-level seeding hook, kept at its original position.
# NOTE(review): presumably seeds the RNGs used during training -- confirm.
model.set_seed(seed=SEED)

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LR,
    amsgrad=False,
    weight_decay=1e-2,
)

# Cosine decay of the learning rate from LR down to eta_min over T_max
# scheduler steps.
# NOTE(review): T_max is expressed in epochs here. If `training.train_epoch`
# calls `lr_scheduler.step()` once per batch rather than once per epoch, T_max
# should be N_EPOCHS * len(dataset_loader) -- confirm against train_epoch.
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=N_EPOCHS,
    eta_min=1e-9,
)

# One pass over the loader per epoch; report the loss returned by train_epoch.
for epoch in range(1, N_EPOCHS + 1):
    train_loss = training.train_epoch(model, dataset_loader, optimizer, lr_scheduler)
    print(f"{epoch=} {train_loss=:.4f}")

In [None]:
# Destination of the serialized model.
model_path = src.PATH / "tmp/full_model_v9.4.model"

# Create the target directory up front so a missing `tmp/` folder cannot
# discard the result of the whole training run.
# Assumes src.PATH is a pathlib.Path (it is combined with `/` and a str) --
# TODO confirm.
model_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE: this pickles the entire module object, so loading it later requires
# the same `src.bert.module` code to be importable, and the file must only be
# loaded from a trusted source. The format is kept as-is for compatibility
# with existing consumers of this file.
torch.save(model, model_path)