# Setup

In [1]:
from dataclasses import dataclass
from tqdm.autonotebook import tqdm
from src.utils.other import set_seeds
import wandb

import numpy as np
import torch
from datasets import Dataset
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModel,
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments
)

set_seeds(seed=42)
tqdm.pandas()

  from tqdm.autonotebook import tqdm


# Config

In [2]:
@dataclass
class Config:
    dataset_name: str = "Goader/ner-uk-2.0"
    
    pretrained: str = "benjamin/roberta-large-wechsel-ukrainian"
    # pretrained: str = "microsoft/mdeberta-v3-base"
    max_length: int = 1024
    merge_subwords: bool = True
    
    # wandb_init_args = {
    #     'project': "sl-ner-uk-2.0",
    #     'entity': "havlytskyi-thesis",
    #     'name': "benjamin--word-level"
    # }
    wandb_init_args = {
        'project': "thesis-sequence-labeling",
        'entity': "ivan-havlytskyiz",
        'name': "benjamin--word-level"
    }

config = Config()

# Training Arguments

In [3]:
training_args = TrainingArguments(
    output_dir=f'./checkpoints/{config.wandb_init_args["name"]}',
    logging_dir=f'./logs/{config.wandb_init_args["name"]}',
    learning_rate=2e-5,
    weight_decay=0.01,
    lr_scheduler_type='cosine',
    warmup_ratio=0.0,
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    #bf16=True,
    report_to="wandb",
    optim='adamw_torch',
    eval_strategy='epoch',
    save_strategy="epoch",
    # eval_steps=100,
    logging_steps=10,
    # save_steps=100,
    save_total_limit=10,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    load_best_model_at_end=True,
)

# Data & Tokenizer

In [4]:
tokenizer = AutoTokenizer.from_pretrained(config.pretrained)
tokenizer.add_prefix_space=True

In [5]:
from utils.data import NerUKDataset

dataset = NerUKDataset(
    tokenizer=tokenizer,
    splits=('train', 'validation', 'test'),
    dataset_name=config.dataset_name,
)

ModuleNotFoundError: No module named 'utils'

# Model

In [None]:
model_config = AutoConfig.from_pretrained(
    config.pretrained,
    num_labels=len(dataset.label_list),
    id2label=dataset.id2label,
    label2id=dataset.label2id,
    )

base_model = AutoModel.from_pretrained(config.pretrained)

In [None]:
from utils.word_model import ModelForWordTask

model = ModelForWordTask(
    model=base_model,
    merge_subwords=True,
    config=model_config
)

# Train

In [None]:
from transformers import DataCollatorForTokenClassification
from utils.metric import NerMetrics


data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer = Trainer(
    model=model, 
    args=training_args, 
    train_dataset=dataset.train,
    eval_dataset=dataset.val,
    data_collator=data_collator,
    compute_metrics=NerMetrics(label2id=dataset.label2id).compute_metrics,
    tokenizer=tokenizer,
)

In [None]:
wandb.init(**config.wandb_init_args)

trainer.train()

# Inference

## Checkpoint

In [None]:
from utils.metric import score as char_f1
from utils.utils import inference_aggregation

FINETUNED_MODEL = 'checkpoints/mdeberta-v3-base/checkpoint-300'

In [None]:
trainer._load_from_checkpoint(FINETUNED_MODEL)

## Threshold Selection

In [13]:
valid_preds = trainer.predict(ds_valid)
valid_metrics = trainer.compute_metrics((valid_preds.predictions, valid_preds.label_ids))

valid_metrics

  0%|          | 0/41 [00:00<?, ?it/s]

  0%|          | 0/41 [00:00<?, ?it/s]

{'precision': 0.5634806362192857,
 'recall': 0.7106647658808954,
 'f1': 0.6285716430120762,
 'thold': 0.16}

In [16]:
from utils.utils import find_class_balance_threshold

test_preds = trainer.predict(ds_test)
test_probabilities = torch.softmax(torch.tensor(test_preds.predictions), dim=-1).cpu().numpy()

test_distr_th = find_class_balance_threshold(
    desired_positive_ratio=positive_class_balance,
    probabilities=test_probabilities,
    labels=test_preds.label_ids
    )

print(test_distr_th)

  0%|          | 0/41 [00:00<?, ?it/s]

0.23767676767676768

In [20]:
final_th = valid_metrics['thold']

## CV-Score

In [23]:
valid_probabilities = torch.softmax(torch.tensor(valid_preds.predictions), dim=-1).cpu().numpy()
valid_results = inference_aggregation(
    probabilities=valid_probabilities,
    labels=valid_preds.label_ids,
    offset_mappings=ds_valid['offset_mapping'],
    thold=final_th
)

In [24]:
from copy import deepcopy

df_valid_gt = df[df.fold==4][['id', 'trigger_words']].reset_index(drop=True)
df_valid = deepcopy(df_valid_gt)
df_valid['trigger_words'] = valid_results

cv_score = char_f1(df_valid_gt, df_valid, row_id_column_name='id')
cv_score

0.6285716430120762

## Predict Test

In [37]:
test_results = inference_aggregation(
    probabilities=test_probabilities,
    labels=test_preds.label_ids,
    offset_mappings=ds_test['offset_mapping'],
    thold=final_th
)

In [38]:
df_test_gt = pd.read_csv(config.data_path + 'solution.csv')[['id', 'trigger_words']]
df_test = deepcopy(df_test_gt)
df_test['trigger_words'] = test_results

test_score = char_f1(df_test_gt, df_test, row_id_column_name='id')
test_score

0.6047970079507895