# Setup

In [1]:
from dataclasses import dataclass, field

import numpy as np
import torch
import wandb
from datasets import Dataset
from tqdm.autonotebook import tqdm
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModel,
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments
)

from src.utils.other import set_seeds

# Fix the random seed before anything stochastic runs so repeated runs are comparable.
# NOTE(review): `set_seeds` is a project helper; presumably it seeds python/numpy/torch — confirm.
set_seeds(seed=42)
# Register tqdm's pandas integration (enables `progress_apply` on pandas objects).
tqdm.pandas()

  from tqdm.autonotebook import tqdm


# Config

In [2]:
@dataclass
class Config:
    """Experiment settings shared by the cells of this notebook.

    Attributes:
        dataset_name: Dataset identifier handed to the dataset loader.
        pretrained: Checkpoint name used for the tokenizer, the model config
            and the backbone weights.
        merge_subwords: Whether subword representations should be merged into
            word-level ones (intended for the word-level model wrapper).
        wandb_init_args: Keyword arguments unpacked into ``wandb.init``. The
            ``'name'`` entry is also used to build the checkpoint and log
            directory paths.
    """

    dataset_name: str = "Goader/ner-uk-2.0"

    pretrained: str = "microsoft/mdeberta-v3-base"
    # max_length: int = 1024
    merge_subwords: bool = True

    # Annotated and created through `default_factory` so that this is a real
    # dataclass field: it shows up in __init__/__repr__/__eq__, can be
    # overridden per instance, and every Config gets its own dict instead of
    # all instances sharing one mutable class-level object.
    wandb_init_args: dict = field(
        default_factory=lambda: {
            'project': "sl-ner-uk-2.0",
            'entity': "havlytskyi-thesis",
            'name': "mdeberta-v3-base--word-level"
        }
    )

config = Config()

# Training Arguments

In [3]:
training_args = TrainingArguments(
    # --- where artefacts go: both paths are keyed by the W&B run name ---
    output_dir=f'./checkpoints/{config.wandb_init_args["name"]}',
    logging_dir=f'./logs/{config.wandb_init_args["name"]}',
    report_to="wandb",
    logging_steps=10,

    # --- optimisation ---
    optim='adamw_torch',
    learning_rate=2e-5,
    weight_decay=0.01,
    lr_scheduler_type='cosine',
    warmup_ratio=0.0,
    num_train_epochs=5,
    # mixed precision is left disabled (a `bf16=True` switch was tried and turned off)

    # --- batching: 16 samples x 2 accumulation steps per optimizer update, per device ---
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=8,

    # --- evaluation / checkpointing: same cadence for both, so every
    #     evaluated step also has a checkpoint that can be restored ---
    eval_strategy='steps',
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=10,

    # --- model selection: keep the checkpoint with the highest validation F1 ---
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
)

# Data & Tokenizer

In [4]:
# Tokenizer matching the pretrained backbone named in the config.
tokenizer = AutoTokenizer.from_pretrained(config.pretrained)



In [5]:
from word_utils.data import NerUKDataset

# Build the train / validation / test splits in one go; the cell output
# shows each split being tokenized as part of construction.
# The resulting object is later read for its splits (`train`, `val`, `test`)
# and its label metadata (`label_list`, `id2label`, `label2id`).
dataset = NerUKDataset(
    dataset_name=config.dataset_name,
    splits=('train', 'validation', 'test'),
    tokenizer=tokenizer,
)

Tokenizing train split:   0%|          | 0/10980 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Tokenizing validation split:   0%|          | 0/1206 [00:00<?, ? examples/s]

Tokenizing test split:   0%|          | 0/5593 [00:00<?, ? examples/s]

# Model

In [6]:
# Bare pretrained backbone; the word-level head is added by the wrapper
# constructed in the next cell.
base_model = AutoModel.from_pretrained(config.pretrained)

# Backbone configuration extended with the dataset's tag inventory, so the
# label <-> id mappings travel with the model.
model_config = AutoConfig.from_pretrained(
    config.pretrained,
    label2id=dataset.label2id,
    id2label=dataset.id2label,
    num_labels=len(dataset.label_list),
)

In [7]:
from word_utils.word_model import ModelForWordTask

# Wrap the pretrained backbone for the word-level task.
# The subword-merging switch is read from the notebook config rather than
# hard-coded, so changing `Config.merge_subwords` actually changes the model
# that gets built (the default there is True, matching the previous value).
model = ModelForWordTask(
    model=base_model,
    merge_subwords=config.merge_subwords,
    config=model_config
)

# Train

In [8]:
from transformers import DataCollatorForTokenClassification
from word_utils.metric import NerMetrics


# Collator for token classification: batches examples and pads them (inputs
# and labels) to a common length using the tokenizer's padding settings.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Trainer wiring: word-level model, train/validation splits, and the NER
# metric callable that produces the `f1` value used for best-model selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset.train,
    eval_dataset=dataset.val,
    data_collator=data_collator,
    compute_metrics=NerMetrics(label2id=dataset.label2id).compute_metrics,
    # `tokenizer=` is deprecated on Trainer (it triggers a FutureWarning and is
    # scheduled for removal); `processing_class=` is the replacement and still
    # makes the tokenizer get saved next to every checkpoint.
    processing_class=tokenizer,
)

  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [9]:
# Open the W&B run explicitly so it uses the project / entity / name from `config`.
wandb.init(**config.wandb_init_args)

# Last expression of the cell, so the returned TrainOutput is displayed below.
trainer.train()

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mivan-havlytskyi[0m ([33mivan-havlytskyiz[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
200,0.1945,0.186618,0.67426,0.692668,0.68334,0.953512
400,0.1194,0.131532,0.695291,0.783151,0.73661,0.960871
600,0.0599,0.117681,0.738636,0.811232,0.773234,0.965489
800,0.0512,0.130645,0.743281,0.819813,0.779674,0.962546
1000,0.0498,0.11047,0.798808,0.836193,0.817073,0.969702
1200,0.0526,0.112097,0.8,0.842434,0.820669,0.969905
1400,0.0485,0.111615,0.789818,0.847114,0.817463,0.969549
1600,0.0365,0.113741,0.787945,0.846334,0.816096,0.969702


  _warn_prf(average, modifier, msg_start, len(result))
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/mdeberta-v3-base--word-level/checkpoint-200)... Done. 3.7s
  _warn_prf(average, modifier, msg_start, len(result))
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/mdeberta-v3-base--word-level/checkpoint-400)... Done. 3.6s
  _warn_prf(average, modifier, msg_start, len(result))
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/mdeberta-v3-base--word-level/checkpoint-600)... Done. 3.7s
  _warn_prf(average, modifier, msg_start, len(result))
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/mdeberta-v3-base--word-level/checkpoint-800)... Done. 3.7s
  _warn_prf(average, modifier, msg_start, len(result))
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/mdeberta-v3-base--word-level/checkpoint-1000)... Done. 3.7s
  _warn_prf(average, modifier, msg_start, len(result))
[34m[1mwandb[0m: Adding directory to artifact (./ch

TrainOutput(global_step=1715, training_loss=0.11426291197277715, metrics={'train_runtime': 1576.0014, 'train_samples_per_second': 34.835, 'train_steps_per_second': 1.088, 'total_flos': 2753882444671200.0, 'train_loss': 0.11426291197277715, 'epoch': 4.986899563318778})

# Inference

## Checkpoint

In [13]:
# Checkpoint directory to evaluate; lives under the same run-named folder as `output_dir`.
# Step 1200 had the highest validation F1 in the training log recorded in this notebook.
# NOTE(review): the step is hard-coded — update it if training is re-run and the best step changes.
FINETUNED_MODEL = f'checkpoints/{config.wandb_init_args["name"]}/checkpoint-1200'

In [14]:
# Restore the selected checkpoint before running test-set inference.
# NOTE(review): `_load_from_checkpoint` is a private Trainer method (leading underscore),
# so its behaviour may change between transformers releases — confirm after upgrades.
trainer._load_from_checkpoint(FINETUNED_MODEL)

## Test

In [17]:
# Run the restored model over the held-out test split.
test_preds = trainer.predict(dataset.test)

# Score with the same metric callable the Trainer was given, invoked directly
# on a (predictions, labels) pair; the resulting keys carry no split prefix.
test_metrics = trainer.compute_metrics(
    (test_preds.predictions, test_preds.label_ids)
)

test_metrics

{'precision': 0.8410812672176309,
 'recall': 0.839058742700103,
 'f1': 0.8400687876182287,
 'accuracy': 0.9780628085712831}