# Application of PhoBERT (Nguyen and Nguyen, 2020) on Vietnamese Dataset

source: https://github.com/VinAIResearch/PhoBERT

Install necessary libraries for script

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
pip install transformers



In [2]:
pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m41.0/43.6 kB[0m [31m1.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m933.1 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=f9922f2490abd2471bea345429a21f12db68dbcd79fb8fdb113b5e2a720d339d
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


Running it is necessarcy to use FastTokenizer

In [3]:
!git clone --single-branch --branch fast_tokenizers_BARTpho_PhoBERT_BERTweet https://github.com/datquocnguyen/transformers.git
%cd transformers
!pip3 install -e .

Cloning into 'transformers'...
remote: Enumerating objects: 138580, done.[K
remote: Counting objects: 100% (47/47), done.[K
remote: Compressing objects: 100% (46/46), done.[K
remote: Total 138580 (delta 1), reused 47 (delta 1), pack-reused 138533[K
Receiving objects: 100% (138580/138580), 145.02 MiB | 14.87 MiB/s, done.
Resolving deltas: 100% (104140/104140), done.
/content/transformers
Obtaining file:///content/transformers
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.32.0.dev0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
Building wheel

In [4]:
pip install pytorch-crf

Collecting pytorch-crf
  Downloading pytorch_crf-0.7.2-py3-none-any.whl (9.5 kB)
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2


**Setting for Training, Fine-tuning and Evaluation.**


In [2]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, RobertaForTokenClassification, RobertaConfig, AdamW, get_linear_schedule_with_warmup
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from seqeval.metrics import f1_score, classification_report
from tqdm import tqdm
import os

# Define label map
label_map = {'O': 0, 'B-Skill': 1, 'I-Skill': 2}

# Save model for further use
def save_model(model_state_dict, path):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    torch.save(model_state_dict, path)

# Load the saved model
def load_model(model_class, path, pretrained_model_name, num_labels):
    model = model_class.from_pretrained(pretrained_model_name, num_labels=num_labels)
    model.load_state_dict(torch.load(path, map_location=torch.device('cpu')))

    return model

# Flatten labels for class weight computation
def flatten_labels(data):
    flat_labels = []
    for _, _, labels in data:
        flat_labels.extend([label for label in labels.numpy() if label != -100])
    return flat_labels

# Compute class weights
def compute_class_weights(train_data, device):
    flat_train_labels = flatten_labels(train_data)
    class_weights = compute_class_weight('balanced', classes=np.unique(flat_train_labels), y=flat_train_labels)
    return torch.tensor(class_weights, dtype=torch.float).to(device)

# Encode tokens and map labels to token ids

def encode_labels(tokens, text_labels,tokenizer):
    if not all(isinstance(token, str) for token in tokens):
        tokens = [str(token) for token in tokens]

    # max_length is determined by the same way on BERT models
    encoded_inputs = tokenizer(tokens, is_split_into_words=True, add_special_tokens=True,
                               max_length=32, truncation=True, padding='max_length',
                               return_attention_mask=True, return_tensors='pt')

    labels = []
    attention_masks = []
    previous_word_idx = None
    is_first_token = True

    for i, word_id in enumerate(encoded_inputs.word_ids()):
        if word_id is None:
            labels.append(label_map['O'])
            if is_first_token:
                attention_masks.append(1)
            else:
                attention_masks.append(0)
        else:
            labels.append(text_labels[word_id])
            attention_masks.append(1)
            is_first_token = False

    labels = labels[:32] + [label_map['O']] * (32- len(labels))
    attention_masks = torch.tensor(attention_masks)

    return encoded_inputs['input_ids'][0], attention_masks, labels

# processes the data into a format suitable for training, including tokenization and label encoding.
def process_dataframe(df, tokenizer, label_map):
    grouped_data = df.groupby('sentence_id').agg({
        'word': list,
        'tag': list
    }).reset_index()

    new_rows = []

    # Adding "_" underscore to help the model better understand
    # because the tokenizer employed in PhoBERT use underscore character as the connection between syllables within a compound word
    for _, row in grouped_data.iterrows():
        new_tokens, new_labels = [], []
        for token, label in zip(row['word'], row['tag']):
            combined_token = str(token).replace(' ', '_')
            new_tokens.append(combined_token)
            new_labels.append(label)

        new_rows.append({'word': new_tokens, 'tag': new_labels})

    new_grouped_data = pd.DataFrame(new_rows)

    new_grouped_data['label_ids'] = new_grouped_data['tag'].apply(lambda tags: [label_map[tag] for tag in tags])

    encoded_data = [encode_labels(sentence_tokens, sentence_labels, tokenizer)
                    for sentence_tokens, sentence_labels in zip(new_grouped_data['word'], new_grouped_data['label_ids'])]

    input_ids, attention_masks, labels = zip(*encoded_data)

    return torch.stack(input_ids), torch.stack(attention_masks), torch.tensor(labels)

# Train a model
def train_model(model, tokenizer, train_dataloader, dev_dataloader, optimizer, scheduler, device, num_epochs, class_weights):
    train_losses = []

    for epoch_i in range(num_epochs):
        model.train()
        total_loss = 0
        for step, batch in tqdm(enumerate(train_dataloader), total=len(train_dataloader), desc="Training"):
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch

            model.zero_grad()
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            # Employing CrossEntropyLoss to calculate thee class weight
            loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
            loss = loss_fct(outputs.logits.view(-1, model.num_labels), b_labels.view(-1))
            total_loss += loss.item()
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_loss / len(train_dataloader)
        train_losses.append(avg_train_loss)
        print(f"Epoch {epoch_i + 1} - Average training loss: {avg_train_loss}")

        dev_f1 = calculate_f1_score(model, dev_dataloader, tokenizer, device)
        print(f"Epoch {epoch_i + 1} - Dev F1 Score: {dev_f1}")

    return model

# Evaluate a model
def evaluate_model(model, dataloader, tokenizer, device):
    model.eval()
    model.to(device)
    all_predictions, all_true_labels, all_words = [], [], []

    for batch in tqdm(dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask)
            logits = outputs.logits
            batch_predictions = torch.argmax(logits, dim=-1)
            batch_predictions = batch_predictions.detach().cpu().numpy()

        label_ids = b_labels.to('cpu').numpy()
        words = [tokenizer.convert_ids_to_tokens(input_id) for input_id in b_input_ids.to('cpu').numpy()]

        for i in range(label_ids.shape[0]):
            input_len = sum(b_input_mask[i])
            sentence_predictions = [list(label_map.keys())[list(label_map.values()).index(label_idx)] for label_idx in batch_predictions[i][1:input_len-1]]
            sentence_true_labels = [list(label_map.keys())[list(label_map.values()).index(l)] for l in label_ids[i][1:input_len-1]]
            sentence_words = words[i][1:input_len-1]

            all_predictions.append(sentence_predictions)
            all_true_labels.append(sentence_true_labels)
            all_words.extend(sentence_words)

    return all_predictions, all_true_labels, all_words


# Calcute the F1 score
def calculate_f1_score(model, dataloader, tokenizer, device):
    predictions, true_labels,_ = evaluate_model(model, dataloader, tokenizer, device)
    return f1_score(true_labels, predictions)

# Save data with predicted labels for further use
def save_predictions_to_csv(sentence_id, words, true_labels, predictions, file_path):
    df = pd.DataFrame({
        'Sentence_id': sentence_id,
        'Word': words,
        'True_Label': true_labels,
        'Prediction': predictions
    })
    df.to_csv(file_path, index=False)


def main():

    # Load and preprocess data
    train_file_path = 'processed_kaggle_dataset.csv'
    validation_file_path = 'processed_website_dataset.csv'

    train_set = pd.read_csv(train_file_path)
    validation_data_df = pd.read_csv(validation_file_path)

    unique_sentence_ids = validation_data_df['sentence_id'].unique()
    split_index = len(unique_sentence_ids) // 2
    test_ids, dev_ids = unique_sentence_ids[:split_index], unique_sentence_ids[split_index:]
    test_set = validation_data_df[validation_data_df['sentence_id'].isin(test_ids)]
    dev_set = validation_data_df[validation_data_df['sentence_id'].isin(dev_ids)]

    pretrained_model_name = 'vinai/phobert-base-v2'
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_inputs, train_masks, train_labels = process_dataframe(train_set, tokenizer, label_map)
    dev_inputs, dev_masks, dev_labels = process_dataframe(dev_set, tokenizer, label_map)
    test_inputs, test_masks, test_labels = process_dataframe(test_set, tokenizer, label_map)


    # Create DataLoaders
    train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
    train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=8)

    dev_dataset = TensorDataset(dev_inputs, dev_masks, dev_labels)
    dev_dataloader = DataLoader(dev_dataset, sampler=SequentialSampler(dev_dataset), batch_size=8)

    test_dataset = TensorDataset(test_inputs, test_masks, test_labels)
    test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=8)

    # Compute class weight
    flat_train_labels = flatten_labels(train_dataset)
    class_weights = compute_class_weight('balanced', classes=np.unique(flat_train_labels), y=flat_train_labels)
    class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)


    # Training, Fine-tuning and evaluating process
    learning_rates = [1e-3,1e-4, 1e-5]
    num_epochs = 10
    epsilon=1e-8
    best_lr = None
    best_f1_score = 0
    best_model_info = {
          "f1_score": 0,
          "learning_rate": None,
          "epoch": None,
          "weight_decay": 0.01,
          "model_state_dict": None
      }

    for lr in learning_rates:
        print(f"\nTesting with learning rate: {lr}")
        model = RobertaForTokenClassification.from_pretrained(
          pretrained_model_name,
          num_labels=len(label_map)
        )
        model.to(device)
        # Initialize the model and optimizer with the current learning rate
        optimizer = AdamW(model.parameters(), lr=lr, eps=epsilon, weight_decay=0.01)
        total_steps = len(train_dataloader) * num_epochs
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

        # Train the model and capture training performance
        model = train_model(model, tokenizer, train_dataloader, dev_dataloader, optimizer, scheduler, device, num_epochs, class_weights)
        # Calculate F1-Score of the model with current learning rate on the development set
        current_f1_score = calculate_f1_score(model, dev_dataloader, tokenizer, device)
        print(f"Learning rate: {lr} - F1 Score: {current_f1_score}")

        # Best learning rate is chosen by F1-Score
        if current_f1_score > best_model_info["f1_score"]:
              best_model_info = {
                  "f1_score": current_f1_score,
                  "learning_rate": lr,
                  "model_state_dict": model.state_dict()
            }

    # Save the best model
    best_model_save_path = 'phobert_best_model.pth'
    save_model(best_model_info["model_state_dict"], best_model_save_path)
    print(f"\nBest model saved at: {best_model_save_path}")

    # Load the best model for evaluation
    best_model = load_model(RobertaForTokenClassification, best_model_save_path, pretrained_model_name, len(label_map))
    best_model.to(device)

    # Evaluate the best model on the dev set
    dev_predictions, dev_true_labels,_ = evaluate_model(best_model, dev_dataloader, tokenizer, device)
    dev_report = classification_report(dev_true_labels, dev_predictions)
    print(f"\nDevelopment Set Classification Report:\n{dev_report}")

    # Evaluate the best model on the test set
    test_predictions, test_true_labels, test_words = evaluate_model(best_model, test_dataloader, tokenizer, device)
    test_report = classification_report(test_true_labels, test_predictions)
    print(f"\nTest Set Classification Report:\n{test_report}")

    # Save output
    gold_labels, predicted_labels, sentence_ids = [], [], []
    for sentence_id, sentence_labels in enumerate(test_true_labels):
        sentence_length = len(sentence_labels)
        gold_labels.extend(sentence_labels)
        predicted_labels.extend(test_predictions[sentence_id])
        sentence_ids.extend([sentence_id] * sentence_length)
    output_csv_path = f'phobert_best_model_output.csv'
    save_predictions_to_csv(sentence_ids, test_words, gold_labels, predicted_labels, output_csv_path)
    print(f"Test predictions saved to {output_csv_path}")

if __name__ == "__main__":
    main()




Testing with learning rate: 0.001


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training: 100%|██████████| 125/125 [00:11<00:00, 10.63it/s]


Epoch 1 - Average training loss: 1.1271787385940553


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 57.15it/s]


Epoch 1 - Dev F1 Score: 0.041312272174969626


Training: 100%|██████████| 125/125 [00:11<00:00, 10.79it/s]


Epoch 2 - Average training loss: 1.106575803756714


Evaluating: 100%|██████████| 50/50 [00:01<00:00, 39.87it/s]


Epoch 2 - Dev F1 Score: 0.0


Training: 100%|██████████| 125/125 [00:11<00:00, 10.81it/s]


Epoch 3 - Average training loss: 1.1037739925384522


Evaluating: 100%|██████████| 50/50 [00:01<00:00, 41.02it/s]


Epoch 3 - Dev F1 Score: 0.053450164293537786


Training: 100%|██████████| 125/125 [00:12<00:00, 10.17it/s]


Epoch 4 - Average training loss: 1.1028475313186645


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 54.88it/s]


Epoch 4 - Dev F1 Score: 0.0


Training: 100%|██████████| 125/125 [00:11<00:00, 10.57it/s]


Epoch 5 - Average training loss: 1.1020944147109986


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 55.93it/s]


Epoch 5 - Dev F1 Score: 0.0


Training: 100%|██████████| 125/125 [00:11<00:00, 10.59it/s]


Epoch 6 - Average training loss: 1.1034166316986085


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 55.38it/s]


Epoch 6 - Dev F1 Score: 0.0


Training: 100%|██████████| 125/125 [00:11<00:00, 10.58it/s]


Epoch 7 - Average training loss: 1.100015296936035


Evaluating: 100%|██████████| 50/50 [00:01<00:00, 41.37it/s]


Epoch 7 - Dev F1 Score: 0.0


Training: 100%|██████████| 125/125 [00:11<00:00, 10.78it/s]


Epoch 8 - Average training loss: 1.0997870063781738


Evaluating: 100%|██████████| 50/50 [00:01<00:00, 41.84it/s]


Epoch 8 - Dev F1 Score: 0.0


Training: 100%|██████████| 125/125 [00:11<00:00, 10.64it/s]


Epoch 9 - Average training loss: 1.0992054748535156


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 55.93it/s]


Epoch 9 - Dev F1 Score: 0.0


Training: 100%|██████████| 125/125 [00:11<00:00, 10.49it/s]


Epoch 10 - Average training loss: 1.0978698205947877


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 56.95it/s]


Epoch 10 - Dev F1 Score: 0.0


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 57.14it/s]


Learning rate: 0.001 - F1 Score: 0.0

Testing with learning rate: 0.0001


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training: 100%|██████████| 125/125 [00:11<00:00, 10.53it/s]


Epoch 1 - Average training loss: 0.4337220377922058


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 54.88it/s]


Epoch 1 - Dev F1 Score: 0.35463546354635467


Training: 100%|██████████| 125/125 [00:11<00:00, 10.51it/s]


Epoch 2 - Average training loss: 0.2956003125011921


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 55.49it/s]


Epoch 2 - Dev F1 Score: 0.2518628912071535


Training: 100%|██████████| 125/125 [00:11<00:00, 10.60it/s]


Epoch 3 - Average training loss: 0.23372454723715783


Evaluating: 100%|██████████| 50/50 [00:01<00:00, 41.64it/s]


Epoch 3 - Dev F1 Score: 0.40379146919431286


Training: 100%|██████████| 125/125 [00:11<00:00, 10.66it/s]


Epoch 4 - Average training loss: 0.1514118373245001


Evaluating: 100%|██████████| 50/50 [00:01<00:00, 41.37it/s]


Epoch 4 - Dev F1 Score: 0.47597254004576656


Training: 100%|██████████| 125/125 [00:11<00:00, 10.63it/s]


Epoch 5 - Average training loss: 0.12002944368869066


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 55.16it/s]


Epoch 5 - Dev F1 Score: 0.5182863113897597


Training: 100%|██████████| 125/125 [00:11<00:00, 10.45it/s]


Epoch 6 - Average training loss: 0.08266250353306531


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 54.15it/s]


Epoch 6 - Dev F1 Score: 0.5135135135135135


Training: 100%|██████████| 125/125 [00:11<00:00, 10.45it/s]


Epoch 7 - Average training loss: 0.05587905881926417


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 55.47it/s]


Epoch 7 - Dev F1 Score: 0.490134994807892


Training: 100%|██████████| 125/125 [00:11<00:00, 10.42it/s]


Epoch 8 - Average training loss: 0.03470095931366086


Evaluating: 100%|██████████| 50/50 [00:01<00:00, 47.66it/s]


Epoch 8 - Dev F1 Score: 0.5432098765432098


Training: 100%|██████████| 125/125 [00:11<00:00, 10.49it/s]


Epoch 9 - Average training loss: 0.03207064516469836


Evaluating: 100%|██████████| 50/50 [00:01<00:00, 40.92it/s]


Epoch 9 - Dev F1 Score: 0.5384615384615384


Training: 100%|██████████| 125/125 [00:11<00:00, 10.58it/s]


Epoch 10 - Average training loss: 0.02457548227161169


Evaluating: 100%|██████████| 50/50 [00:01<00:00, 41.80it/s]


Epoch 10 - Dev F1 Score: 0.5352422907488987


Evaluating: 100%|██████████| 50/50 [00:01<00:00, 40.64it/s]


Learning rate: 0.0001 - F1 Score: 0.5352422907488987

Testing with learning rate: 1e-05


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training: 100%|██████████| 125/125 [00:11<00:00, 10.48it/s]


Epoch 1 - Average training loss: 0.6667037689685822


Evaluating: 100%|██████████| 50/50 [00:01<00:00, 41.20it/s]


Epoch 1 - Dev F1 Score: 0.2788244159758855


Training: 100%|██████████| 125/125 [00:12<00:00, 10.41it/s]


Epoch 2 - Average training loss: 0.31588127100467683


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 53.05it/s]


Epoch 2 - Dev F1 Score: 0.3290267011197244


Training: 100%|██████████| 125/125 [00:12<00:00, 10.29it/s]


Epoch 3 - Average training loss: 0.2679609753489494


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 53.84it/s]


Epoch 3 - Dev F1 Score: 0.3709981167608286


Training: 100%|██████████| 125/125 [00:12<00:00, 10.29it/s]


Epoch 4 - Average training loss: 0.2154887993633747


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 54.03it/s]


Epoch 4 - Dev F1 Score: 0.4058536585365854


Training: 100%|██████████| 125/125 [00:12<00:00, 10.31it/s]


Epoch 5 - Average training loss: 0.1902212354838848


Evaluating: 100%|██████████| 50/50 [00:01<00:00, 42.45it/s]


Epoch 5 - Dev F1 Score: 0.45201238390092885


Training: 100%|██████████| 125/125 [00:11<00:00, 10.44it/s]


Epoch 6 - Average training loss: 0.1764991851449013


Evaluating: 100%|██████████| 50/50 [00:01<00:00, 40.47it/s]


Epoch 6 - Dev F1 Score: 0.44373673036093414


Training: 100%|██████████| 125/125 [00:11<00:00, 10.50it/s]


Epoch 7 - Average training loss: 0.14478787258267403


Evaluating: 100%|██████████| 50/50 [00:01<00:00, 44.69it/s]


Epoch 7 - Dev F1 Score: 0.45286885245901637


Training: 100%|██████████| 125/125 [00:12<00:00, 10.33it/s]


Epoch 8 - Average training loss: 0.13758896723389624


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 53.65it/s]


Epoch 8 - Dev F1 Score: 0.45603271983640087


Training: 100%|██████████| 125/125 [00:12<00:00, 10.36it/s]


Epoch 9 - Average training loss: 0.12043536245822907


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 54.16it/s]


Epoch 9 - Dev F1 Score: 0.4783068783068783


Training: 100%|██████████| 125/125 [00:12<00:00, 10.33it/s]


Epoch 10 - Average training loss: 0.1157145794481039


Evaluating: 100%|██████████| 50/50 [00:01<00:00, 47.21it/s]


Epoch 10 - Dev F1 Score: 0.4807492195629553


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 53.58it/s]


Learning rate: 1e-05 - F1 Score: 0.4807492195629553

Best model saved at: /content/drive/MyDrive/testBA/PhoBERT_model/best_model.pth


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 50/50 [00:01<00:00, 50.00it/s]



Development Set Classification Report:
              precision    recall  f1-score   support

       Skill       0.51      0.57      0.54       428

   micro avg       0.51      0.57      0.54       428
   macro avg       0.51      0.57      0.54       428
weighted avg       0.51      0.57      0.54       428



Evaluating: 100%|██████████| 50/50 [00:00<00:00, 54.81it/s]



Test Set Classification Report:
              precision    recall  f1-score   support

       Skill       0.57      0.58      0.57       540

   micro avg       0.57      0.58      0.57       540
   macro avg       0.57      0.58      0.57       540
weighted avg       0.57      0.58      0.57       540

Test predictions saved to /content/drive/MyDrive/testBA/PhoBERT_model/best_model_output.csv
