In [1]:
!pip install git+https://github.com/datquocnguyen/transformers

Collecting git+https://github.com/datquocnguyen/transformers
  Cloning https://github.com/datquocnguyen/transformers to /tmp/pip-req-build-tmytc3cz
  Running command git clone --filter=blob:none --quiet https://github.com/datquocnguyen/transformers /tmp/pip-req-build-tmytc3cz
  Resolved https://github.com/datquocnguyen/transformers to commit 83cb6dab92114f8772a0f3d955b58bcb039dfb06
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.32.0.dev0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m48.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25ldone
[?25h  Created 

In [2]:
from transformers import TrainingArguments,Trainer
import os

import datasets
from torch.utils.data import DataLoader
import torch
import numpy as np



In [3]:
from transformers import BertForQuestionAnswering, PhobertTokenizerFast, AutoModelForQuestionAnswering

tokenizer = PhobertTokenizerFast.from_pretrained("vinai/phobert-base")
model = AutoModelForQuestionAnswering.from_pretrained("/kaggle/input/phobert-finetuned/phobert_tuned_vn/phobert_tuned_vn")

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

In [4]:
tokenizer.is_fast

True

In [5]:
max_length = tokenizer.model_max_length
stride = 128

In [6]:
from transformers import EvalPrediction
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred: EvalPrediction):
    #print(pred)
    start_positions = pred.label_ids[0]
    start_predictions = pred.predictions[0].argmax(-1)
    end_positions = pred.label_ids[1]
    end_predictions = pred.predictions[1].argmax(-1)
    print('label',start_positions,'\n predict',end_predictions)
    
    # Compute Exact Match (EM) score
    em_score = sum([1 if sp == lp and ep == le else 0 for sp, ep, lp, le in zip(start_predictions, end_predictions, start_positions, end_positions)]) / len(start_positions)
    
    f1_start = f1_score(start_positions, start_predictions, average='macro')
    f1_end = f1_score(end_positions, end_predictions, average='macro')
    return {
        'f1_start': f1_start,
        'f1_end': f1_end,
        'f1': min(f1_start, f1_end),
        'em_score': em_score  # Exact Match score
    }


def tokenize_function(example):
    encoding = tokenizer(example['context'], example['question'], truncation=True, padding='max_length',
                         max_length=tokenizer.model_max_length
                        )
    start_positions = encoding.char_to_token(example['answer_start_idx'])
    end_idx = example['answer_start_idx'] + len(example['answer_text']) - 1
    end_positions = encoding.char_to_token(max(0, end_idx))
    if start_positions is None:
        start_positions = tokenizer.model_max_length
    if end_positions is None:
        end_positions = tokenizer.model_max_length
    if (len(example['answer_text']) == 0):
        start_positions = tokenizer.model_max_length
        end_positions = tokenizer.model_max_length
    return {'input_ids': encoding['input_ids'],
          'attention_mask': encoding['attention_mask'],
          'start_positions': start_positions,
          'end_positions': end_positions}


def get_dataloader(train_path, valid_path, num_proc=10):
    train_set = datasets.load_from_disk(train_path)
    valid_set = datasets.load_from_disk(valid_path)
    print("Train set: ", len(train_set))
    print("Valid set: ", len(valid_set))

    # Filter out examples that are longer than the tokenizer's max length
    train_set = train_set.filter(lambda example: len(tokenizer(example['context'], truncation=False)['input_ids']) <= tokenizer.model_max_length)
    valid_set = valid_set.filter(lambda example: len(tokenizer(example['context'], truncation=False)['input_ids']) <= tokenizer.model_max_length)
    
#     # Filter out examples that have empty answers
#     train_set = train_set.filter(lambda example: len(example["answer_text"]) > 0)
#     valid_set = valid_set.filter(lambda example: len(example["answer_text"]) > 0)

    train_set = train_set.shuffle().map(tokenize_function, batched=False, num_proc=num_proc)
    valid_set = valid_set.map(tokenize_function, batched=False, num_proc=num_proc)
    
    print("Train set: ", len(train_set))
    print("Valid set: ", len(valid_set))
    return train_set, valid_set

In [7]:
# tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

print(model)
print(model.config)

train_dataset, valid_dataset = get_dataloader(
    train_path='/kaggle/input/squadv2/processed/train.dataset',
    valid_path='/kaggle/input/squadv2/processed/valid.dataset'
)

RobertaForQuestionAnswering(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Lay

  0%|          | 0/20 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (318 > 256). Running this sequence through the model will result in indexing errors


  0%|          | 0/1 [00:00<?, ?ba/s]

              

#0:   0%|          | 0/1275 [00:00<?, ?ex/s]

  

#2:   0%|          | 0/1275 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/1274 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/1275 [00:00<?, ?ex/s]

#4:   0%|          | 0/1274 [00:00<?, ?ex/s]

  

#6:   0%|          | 0/1274 [00:00<?, ?ex/s]

#5:   0%|          | 0/1274 [00:00<?, ?ex/s]

#7:   0%|          | 0/1274 [00:00<?, ?ex/s]

#8:   0%|          | 0/1274 [00:00<?, ?ex/s]

#9:   0%|          | 0/1274 [00:00<?, ?ex/s]

              

#0:   0%|          | 0/57 [00:00<?, ?ex/s]

    

#1:   0%|          | 0/57 [00:00<?, ?ex/s]

#2:   0%|          | 0/57 [00:00<?, ?ex/s]

#3:   0%|          | 0/57 [00:00<?, ?ex/s]

#5:   0%|          | 0/57 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/57 [00:00<?, ?ex/s]

 

#7:   0%|          | 0/56 [00:00<?, ?ex/s]

#6:   0%|          | 0/56 [00:00<?, ?ex/s]

#8:   0%|          | 0/56 [00:00<?, ?ex/s]

#9:   0%|          | 0/56 [00:00<?, ?ex/s]

Train set:  12743
Valid set:  566


In [8]:
print(train_dataset)

Dataset({
    features: ['context', 'question', 'answer_text', 'answer_start_idx', 'answer_word_start_idx', 'answer_word_end_idx', 'input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 12743
})


In [9]:
training_args = TrainingArguments("/kaggle/working/model-bin/test",
                                  num_train_epochs= 5, # 8, #10,
                                  learning_rate=1e-5, #1e-4
                                  warmup_ratio=0.05,
                                  weight_decay=0.01,
                                  per_device_train_batch_size=4,
                                  per_device_eval_batch_size=4,
                                  gradient_accumulation_steps=1,
                                  logging_dir='/kaggle/working/log',
                                  logging_steps=5,
                                  group_by_length=True, 
                                  save_strategy="epoch",
                                  metric_for_best_model='f1',
                                  save_total_limit=2,
                                  #remove_unused_columns=False, #this is IMPORTANT: not to get error
                                  #eval_steps=200,
                                  load_best_model_at_end=True,
                                  #save_steps=200,
                                  evaluation_strategy="epoch",
)

In [10]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    compute_metrics = compute_metrics
)

trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss,F1 Start,F1 End,F1,Em Score
1,0.8794,0.819185,0.586687,0.7017,0.586687,0.704947
2,0.9289,0.79787,0.623819,0.724272,0.623819,0.733216
3,0.6055,0.884592,0.65001,0.745891,0.65001,0.745583
4,0.3088,0.90378,0.639545,0.752617,0.639545,0.743816
5,0.0034,0.980593,0.645252,0.749641,0.645252,0.75265


label [  4  12  33  80  74  18  13  14  28  17  26   1   3  24  39  85  29  25
   7  16   7  11  34   4  14  19  28   1  22  42  69  80  24  59  17  91
  51   1  23  44   2  77   1  25  17   6  15   1  22  14   1  30   1  51
  31  33  51   2  29   1  11  38  11  45  22  17  14   8   8   1  27  14
   4  20  10  28   1  19  37  13   1  19  19  18  14   6  29  24  17   4
   1  26  62   1   4  31  13  19   9   1  76  16  31  50  16  48  14  55
   9   1  27  12  14  12  31  38  48  32  22  14  16  50 136  13  11  23
   3   1   1   3   8  35  86   7   1   1 150  48  18   7   1  36  51  19
  26   1   5   5  12   1  39   3  11   1   9   1  57   7  16  40   1   2
  41   7  28   6   1  44   3  11   9  36   1  44   4   5   1  48   1   1
  16  12  18 108  53   9  27   7   5  35  15 204   1  39  47  11  10  35
  31  29   5   1   9  16  23   5   9  31  54   2  29  29  36  12  24  37
   3  23   1  31  18  23  28   3  14  88  13  18  12  42   3  52  74  66
  43  23  51  57  49  11  52  14  66   7  28 



label [  4  12  33  80  74  18  13  14  28  17  26   1   3  24  39  85  29  25
   7  16   7  11  34   4  14  19  28   1  22  42  69  80  24  59  17  91
  51   1  23  44   2  77   1  25  17   6  15   1  22  14   1  30   1  51
  31  33  51   2  29   1  11  38  11  45  22  17  14   8   8   1  27  14
   4  20  10  28   1  19  37  13   1  19  19  18  14   6  29  24  17   4
   1  26  62   1   4  31  13  19   9   1  76  16  31  50  16  48  14  55
   9   1  27  12  14  12  31  38  48  32  22  14  16  50 136  13  11  23
   3   1   1   3   8  35  86   7   1   1 150  48  18   7   1  36  51  19
  26   1   5   5  12   1  39   3  11   1   9   1  57   7  16  40   1   2
  41   7  28   6   1  44   3  11   9  36   1  44   4   5   1  48   1   1
  16  12  18 108  53   9  27   7   5  35  15 204   1  39  47  11  10  35
  31  29   5   1   9  16  23   5   9  31  54   2  29  29  36  12  24  37
   3  23   1  31  18  23  28   3  14  88  13  18  12  42   3  52  74  66
  43  23  51  57  49  11  52  14  66   7  28 



label [  4  12  33  80  74  18  13  14  28  17  26   1   3  24  39  85  29  25
   7  16   7  11  34   4  14  19  28   1  22  42  69  80  24  59  17  91
  51   1  23  44   2  77   1  25  17   6  15   1  22  14   1  30   1  51
  31  33  51   2  29   1  11  38  11  45  22  17  14   8   8   1  27  14
   4  20  10  28   1  19  37  13   1  19  19  18  14   6  29  24  17   4
   1  26  62   1   4  31  13  19   9   1  76  16  31  50  16  48  14  55
   9   1  27  12  14  12  31  38  48  32  22  14  16  50 136  13  11  23
   3   1   1   3   8  35  86   7   1   1 150  48  18   7   1  36  51  19
  26   1   5   5  12   1  39   3  11   1   9   1  57   7  16  40   1   2
  41   7  28   6   1  44   3  11   9  36   1  44   4   5   1  48   1   1
  16  12  18 108  53   9  27   7   5  35  15 204   1  39  47  11  10  35
  31  29   5   1   9  16  23   5   9  31  54   2  29  29  36  12  24  37
   3  23   1  31  18  23  28   3  14  88  13  18  12  42   3  52  74  66
  43  23  51  57  49  11  52  14  66   7  28 



label [  4  12  33  80  74  18  13  14  28  17  26   1   3  24  39  85  29  25
   7  16   7  11  34   4  14  19  28   1  22  42  69  80  24  59  17  91
  51   1  23  44   2  77   1  25  17   6  15   1  22  14   1  30   1  51
  31  33  51   2  29   1  11  38  11  45  22  17  14   8   8   1  27  14
   4  20  10  28   1  19  37  13   1  19  19  18  14   6  29  24  17   4
   1  26  62   1   4  31  13  19   9   1  76  16  31  50  16  48  14  55
   9   1  27  12  14  12  31  38  48  32  22  14  16  50 136  13  11  23
   3   1   1   3   8  35  86   7   1   1 150  48  18   7   1  36  51  19
  26   1   5   5  12   1  39   3  11   1   9   1  57   7  16  40   1   2
  41   7  28   6   1  44   3  11   9  36   1  44   4   5   1  48   1   1
  16  12  18 108  53   9  27   7   5  35  15 204   1  39  47  11  10  35
  31  29   5   1   9  16  23   5   9  31  54   2  29  29  36  12  24  37
   3  23   1  31  18  23  28   3  14  88  13  18  12  42   3  52  74  66
  43  23  51  57  49  11  52  14  66   7  28 



label [  4  12  33  80  74  18  13  14  28  17  26   1   3  24  39  85  29  25
   7  16   7  11  34   4  14  19  28   1  22  42  69  80  24  59  17  91
  51   1  23  44   2  77   1  25  17   6  15   1  22  14   1  30   1  51
  31  33  51   2  29   1  11  38  11  45  22  17  14   8   8   1  27  14
   4  20  10  28   1  19  37  13   1  19  19  18  14   6  29  24  17   4
   1  26  62   1   4  31  13  19   9   1  76  16  31  50  16  48  14  55
   9   1  27  12  14  12  31  38  48  32  22  14  16  50 136  13  11  23
   3   1   1   3   8  35  86   7   1   1 150  48  18   7   1  36  51  19
  26   1   5   5  12   1  39   3  11   1   9   1  57   7  16  40   1   2
  41   7  28   6   1  44   3  11   9  36   1  44   4   5   1  48   1   1
  16  12  18 108  53   9  27   7   5  35  15 204   1  39  47  11  10  35
  31  29   5   1   9  16  23   5   9  31  54   2  29  29  36  12  24  37
   3  23   1  31  18  23  28   3  14  88  13  18  12  42   3  52  74  66
  43  23  51  57  49  11  52  14  66   7  28 

TrainOutput(global_step=7965, training_loss=0.5992311342255424, metrics={'train_runtime': 2671.7899, 'train_samples_per_second': 23.847, 'train_steps_per_second': 2.981, 'total_flos': 8324261427717120.0, 'train_loss': 0.5992311342255424, 'epoch': 5.0})

In [12]:
trainer.save_model("/kaggle/model")