# **QA in Vietnamese with PhoBERT fine-tuning**

First, we need to install required libraries and download the pre-trained PhoBERT model.

In [2]:
!pip install transformers -q
!pip install datasets -q

### **1. Import libraries**

In [3]:
import os, sys, argparse, gc
from typing import Any

try:
    import pandas as pd
except ImportError:
    !pip install pandas
    import pandas as pd

try:
    import numpy as np
except ImportError:
    !pip install numpy
    import numpy as np

try:
    import tensorflow as tf
except ImportError:
    !pip install tensorflow
    import tensorflow as tf

try:
    from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering, DefaultDataCollator
except ImportError:
    !pip install transformers
    from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering, DefaultDataCollator

try:
    from torch.utils.data import Dataset
except ImportError:
    !pip install torch
    from torch.utils.data import Dataset

### **2. Run code**

Check the TensorFlow version and enable GPU memory growth

In [4]:
# Report the TensorFlow version and, when GPUs are visible, turn on memory
# growth for each of them.
print("TF Version: ", tf.__version__)
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

TF Version:  2.17.0


Define model name

In [5]:
#########################################################
# Pretrained model identifiers. MODEL_PHOBERT_LARGE is the default value of
# the `--model` argument, which is later passed to `from_pretrained`.
MODEL_PHOBERT_BASE = 'vinai/phobert-base'
MODEL_PHOBERT_LARGE = 'vinai/phobert-large'
#########################################################

Design arguments:
- `--model`: model name
- `--lr`: learning rate
- `--bs`: batch size
- `--epochs`: number of epochs
- `--maxlen`: maximum length of input
- `--stride`: stride
- `--use_fast`: use fast tokenizers

In [6]:
def get_arguments(argv=None):
    """Build the run configuration from argparse defaults.

    Args:
        argv: Optional list of command-line style tokens to parse. When
            omitted, an empty list is parsed so that every option takes its
            default value (the notebook kernel's own ``sys.argv`` is never
            consulted).

    Returns:
        argparse.Namespace with ``model``, ``lr``, ``bs``, ``epochs``,
        ``maxlen``, ``stride`` and ``use_fast`` attributes.
    """

    def _str2bool(value):
        # argparse's ``type=bool`` treats every non-empty string (including
        # "False") as True, so boolean flags are parsed explicitly instead.
        if isinstance(value, bool):
            return value
        lowered = str(value).strip().lower()
        if lowered in {"1", "true", "t", "yes", "y"}:
            return True
        if lowered in {"0", "false", "f", "no", "n"}:
            return False
        raise argparse.ArgumentTypeError("Expected a boolean value, got {!r}".format(value))

    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, default=MODEL_PHOBERT_LARGE, help="Pretrained model bert")
    parser.add_argument("--lr", type=float, default=1e-5, help="Learning rate")
    parser.add_argument("--bs", type=int, default=16, help="Batch size")
    parser.add_argument("--epochs", type=int, default=20, help="Number of epochs")
    parser.add_argument("--maxlen", type=int, default=256, help="Max sentence length")
    parser.add_argument("--stride", type=int, default=128, help="Stride value for window slide")
    parser.add_argument("--use_fast", type=_str2bool, default=True, help="Tokenize sentence with fast bpe")

    return parser.parse_args(args=[] if argv is None else argv)

Data preprocessing

In [7]:
def read_data(path):
    """Load the CSV file at ``path`` and return it as a pandas DataFrame."""
    return pd.read_csv(path)

def save_data(data, path):
    """Write ``data`` to ``path`` as UTF-8 encoded CSV, omitting the index column."""
    data.to_csv(
        path,
        encoding='utf-8',
        index=False,
    )

# Read each dataset split and write it straight back to the same path as
# UTF-8 CSV without an index column.
for csv_path in (
    'datasets/ViWikiQA1.0/ws_train.csv',
    'datasets/ViWikiQA1.0/ws_dev.csv',
    'datasets/ViWikiQA1.0/ws_test.csv',
):
    data = read_data(csv_path)
    save_data(data, csv_path)

Preprocess data:
- Our data is a table with the columns `question`, `context`, `answer_start`, `answer_end`, and `answer`

- Need to create columns: `input_ids`, `attention_mask`, `start_positions`, `end_positions`

In [8]:
#########################################################
def preprocess_dataset(ds: pd.DataFrame, tokenizer: Any, maxlen: int):
    """Tokenize question/context pairs and attach token-level answer spans.

    Args:
        ds: Table (or mapping of columns) providing ``question``, ``context``,
            ``answer_start``, ``answer_end`` and ``answer``.
        tokenizer: Callable tokenizer that encodes (questions, contexts) pairs
            and also exposes ``encode(text, add_special_tokens=False)``.
        maxlen: Sequence length every example is truncated/padded to.

    Returns:
        The tokenizer output with two extra entries, ``start_positions`` and
        ``end_positions`` (``torch.long`` tensors, one value per example).
        Examples without an answer, or whose answer does not fit inside the
        (possibly truncated) context, are labelled ``(0, 0)``.
    """
    # Imported locally so the function does not rely on a later notebook cell
    # having imported torch first.
    import torch

    # Token ids the span search below keys on. NOTE(review): presumably the
    # tokenizer's <pad> and </s> ids for PhoBERT -- confirm against the vocab.
    PAD_ID = 1
    SEP_ID = 2

    questions = [q.strip() for q in ds["question"]]
    contexts = [str(t) for t in ds["context"]]
    inputs = tokenizer(
        questions,
        contexts,
        max_length=maxlen,
        truncation="only_second",
        return_token_type_ids=True,
        padding="max_length",
    )

    # Materialise the columns as plain lists so the accesses below are
    # positional; indexing a pandas Series with ``series[i]`` is label-based
    # and breaks on frames whose index is not 0..n-1.
    answer_starts = list(ds["answer_start"])
    answer_ends = list(ds["answer_end"])
    answers = list(ds["answer"])
    assert len(answer_starts) == len(answer_ends)

    start_positions = []
    end_positions = []

    for i, (start_char, end_char) in enumerate(zip(answer_starts, answer_ends)):
        # A (0, 0) character span marks an example without an answer.
        if start_char == 0 and end_char == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        answer = answers[i]
        context = contexts[i]
        input_ids = inputs["input_ids"][i]

        # Find the start of the context: skip to the first separator, then
        # step over it and the token that follows it.
        idx = 0
        while input_ids[idx] != SEP_ID:
            idx += 1
        idx += 2
        context_start = idx

        # The context ends just before padding begins (or at the end of the
        # sequence); a trailing separator is not part of the context.
        while idx < len(input_ids) and input_ids[idx] != PAD_ID:
            idx += 1
        context_end = idx - 1
        if input_ids[context_end] == SEP_ID:
            context_end -= 1

        # Convert the character offset to a token offset by counting the
        # tokens of the context text that precedes the answer.
        pre_ans = tokenizer.encode(context[:start_char], add_special_tokens=False)
        ans_ids = tokenizer.encode(answer, add_special_tokens=False)

        start_position = context_start + len(pre_ans)
        end_position = start_position + len(ans_ids) - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if start_position < context_start or end_position > context_end:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            start_positions.append(start_position)
            end_positions.append(end_position)

    inputs["start_positions"] = torch.tensor(start_positions, dtype=torch.long)
    inputs["end_positions"] = torch.tensor(end_positions, dtype=torch.long)
    return inputs

We wrap the tokenized and preprocessed inputs in a PyTorch `Dataset` so that a `DataLoader` can batch them and feed them into the model.

In [9]:
class QADataset(Dataset):
    """Map-style dataset over a dict of per-example columns.

    ``inputs`` maps each feature name to an indexable collection with one
    entry per example; ``inputs["input_ids"]`` defines the dataset length.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __len__(self):
        return len(self.inputs["input_ids"])

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per feature."""
        item = {}
        for key, val in self.inputs.items():
            element = val[idx]
            if isinstance(element, torch.Tensor):
                # Re-wrapping an existing tensor with torch.tensor() emits a
                # copy-construct UserWarning on every access; copy explicitly.
                item[key] = element.clone().detach()
            else:
                item[key] = torch.tensor(element)
        return item


Print out data information

In [10]:
def data_info(df):
    """Print a quick overview of ``df``: shape, columns, head, tail, info and describe."""
    print("Dataframe shape:", df.shape)
    print("Columns:", df.columns)
    print("Dataframe head:", df.head())
    print("Dataframe tail:", df.tail())
    # DataFrame.info() prints its report itself and returns None, so it is
    # called on its own line rather than inside print() (which would append a
    # stray "None" after the report).
    print("Dataframe info:")
    df.info()
    print("Dataframe describe:", df.describe())

Generate the dataset by dropping the `title` column and filling missing values in the `answer` column with empty strings

In [11]:
def generate_dataset(file_name: str, tokenizer: Any, maxlen: int):
    """Read a QA CSV file and turn it into a ``QADataset``.

    The ``title`` column is removed and missing ``answer`` values are replaced
    by empty strings before the frame is handed to ``preprocess_dataset``.
    """
    frame = (
        pd.read_csv(file_name)
        .drop(columns="title")
        .fillna({"answer": ""})
    )
    return QADataset(preprocess_dataset(frame, tokenizer, maxlen))

In [12]:
# #########################################################
# class myCallback(tf.keras.callbacks.Callback):
#     def __init__(self, saved_model_name: str):
#         super().__init__()

#         self.min_loss = sys.float_info.max
#         self.min_val_loss = sys.float_info.max

#         self.saved_model_name = saved_model_name

#     def on_epoch_end(self, epoch, logs={}):
#         min_loss = logs.get('loss')
#         min_val_loss = logs.get('val_loss')

#         if min_loss <= self.min_loss and min_val_loss <= self.min_val_loss:
#             self.min_loss = min_loss
#             self.min_val_loss = min_val_loss

#             print("\nsave model at epoch {}".format(epoch+1))
#             # self.model.save("models/{}.h5".format(self.saved_model_name))
#             self.model.save("model/vinai-phobert-large", save_format='tf')

In [13]:
# Unpack the run configuration from the argparse defaults.
args = get_arguments()
model_name = args.model
lr = args.lr
batch_size = 2 # hard-coded override; args.bs is not used
epochs = 10 # hard-coded override; args.epochs is not used
maxlen = args.maxlen
stride = args.stride
use_fast = args.use_fast

Model information

In [14]:
# Print the effective run configuration between two banner lines.
banner = "##############################"
print(banner)
for label, value in (
    ("Model :", model_name),
    ("Learning Rate :", lr),
    ("Batch Size :", batch_size),
    ("Epochs :", epochs),
    ("Max Token Length :", maxlen),
    ("Stride :", stride),
    ("Use Fast :", use_fast),
):
    print(label, value)
print(banner)

##############################
Model : vinai/phobert-large
Learning Rate : 1e-05
Batch Size : 2
Epochs : 10
Max Token Length : 256
Stride : 128
Use Fast : True
##############################


In [15]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AdamW
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup
from torch.cuda.amp import GradScaler, autocast

In [16]:
# Select the compute device: CUDA when it is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Define collate function

In [17]:
def collate_fn(batch):
    """Merge a list of per-example dicts into one dict of batched tensors.

    ``input_ids`` and ``attention_mask`` are stacked along a new first
    dimension; ``start_positions`` and ``end_positions`` are gathered into
    one-dimensional tensors. Any other keys in the examples are dropped.
    """

    def _stacked(key):
        return torch.stack([example[key] for example in batch])

    def _gathered(key):
        return torch.tensor([example[key] for example in batch])

    collated = {}
    collated['input_ids'] = _stacked('input_ids')
    collated['attention_mask'] = _stacked('attention_mask')
    collated['start_positions'] = _gathered('start_positions')
    collated['end_positions'] = _gathered('end_positions')
    return collated

Define the training loop, using PyTorch as the framework.

Validation runs at the end of every epoch, alongside training.

In [1]:
import os

from transformers.models.roberta import RobertaForQuestionAnswering
from tqdm import tqdm
# Fine-tune the question-answering model with PyTorch: build the datasets and
# loaders, train with mixed precision, validate after every epoch, keep the
# checkpoint with the lowest validation loss, and stop early when the
# validation loss has not improved for `early_stop_patience` epochs.
if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=use_fast)
    dataset_tr = generate_dataset(
        "datasets/ViWikiQA1.0/ws_train.csv",
        tokenizer, maxlen
    )
    dataset_val = generate_dataset(
        "datasets/ViWikiQA1.0/ws_dev.csv",
        tokenizer, maxlen
    )

    # Create DataLoader for training and validation datasets
    train_loader = DataLoader(dataset_tr, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(dataset_val, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    # Use the learning rate from the run configuration (the value printed in
    # the configuration summary) instead of a separate hard-coded constant.
    learning_rate = lr

    # Load the pre-trained model for question answering
    model: RobertaForQuestionAnswering = AutoModelForQuestionAnswering.from_pretrained(model_name).to(device)

    # Prepare the optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Linear decay over the whole run, without warmup
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=len(train_loader) * epochs
    )

    # Mixed-precision setup: autocast and the gradient scaler must target the
    # device type the model actually runs on, not always 'cpu'.
    amp_device_type = device.type
    scaler = torch.amp.GradScaler(amp_device_type)

    # Early stopping parameters
    early_stop_patience = 3
    best_val_loss = float('inf')
    early_stop_counter = 0

    # Per-epoch average losses, written to CSV after training
    history = {'loss': [], 'val_loss': []}

    print("[INFO] Start training...")
    for epoch in tqdm(range(epochs), desc="Training", unit="epoch"):
        model.train()
        total_loss = 0

        # Iterate over training batches
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            with torch.amp.autocast(device_type=amp_device_type):
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    start_positions=start_positions,
                    end_positions=end_positions
                )
                loss = outputs.loss

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        history['loss'].append(avg_loss)

        # Validation phase
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                start_positions = batch['start_positions'].to(device)
                end_positions = batch['end_positions'].to(device)

                with torch.amp.autocast(device_type=amp_device_type):
                    outputs = model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions
                    )
                    val_loss = outputs.loss

                total_val_loss += val_loss.item()

        avg_val_loss = total_val_loss / len(val_loader)
        history['val_loss'].append(avg_val_loss)

        # Early stopping logic: checkpoint on improvement, otherwise count
        # the epochs without improvement.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            early_stop_counter = 0
            model.save_pretrained("checkpoints/{}".format(model_name.replace("/", "-")))
        else:
            early_stop_counter += 1

        if early_stop_counter >= early_stop_patience:
            print(f"[INFO] Early stopping at epoch {epoch + 1}")
            break

    # Save training history to CSV; make sure the target directory exists so
    # a finished run is not lost to a missing folder.
    os.makedirs("checkpoints/plots", exist_ok=True)
    hist_df = pd.DataFrame(history)
    hist_df.to_csv("checkpoints/plots/{}_bs{}_lr{}.csv".format(model_name.replace("/", "-"), batch_size, learning_rate), index=False)

  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

In [49]:
# if __name__ == "__main__":    
#     tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=use_fast)
#     data_collator = DefaultDataCollator(return_tensors="tf")
#     dataset_tr = generate_dataset(
#         "datasets/ViWikiQA1.0/ws_train.csv",
#         tokenizer, data_collator, maxlen,
#         stride, batch_size
#     )
#     dataset_val = generate_dataset(
#         "datasets/ViWikiQA1.0/ws_dev.csv",
#         tokenizer, data_collator, maxlen, stride,
#         batch_size
#     )

#     # Define the learning rate variable
#     learning_rate = tf.Variable(0.001, trainable=False)

#     # During training, update the learning rate as needed
#     # For example, set a new learning rate of 0.0001
#     tf.keras.backend.set_value(learning_rate, 0.0001)

#     model = TFAutoModelForQuestionAnswering.from_pretrained(model_name)
#     model.compile(optimizer= 'adam')

#     # Train in mixed-precision float16
#     tf.keras.mixed_precision.set_global_policy("mixed_float16")

#     cb = myCallback(model_name.replace("/", "-"))

#     history = model.fit(
#         dataset_tr,
#         validation_data=dataset_val,
#         epochs=epochs
#     )

#     hist = pd.DataFrame(history.history)
#     hist.to_csv("checkpoints/plots/{}_bs{}_lr{}.csv".format(model_name.replace("/", "-"), batch_size, lr))

Map: 100%|██████████| 18572/18572 [00:32<00:00, 566.62 examples/s]
Map: 100%|██████████| 2285/2285 [00:03<00:00, 608.44 examples/s]
All model checkpoint layers were used when initializing TFRobertaForQuestionAnswering.

Some layers of TFRobertaForQuestionAnswering were not initialized from the model checkpoint at vinai/phobert-large and are newly initialized: ['qa_outputs']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10



InvalidArgumentError: Graph execution error:

Detected at node tf_roberta_for_question_answering_7/roberta/embeddings/Gather_1 defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\tornado\platform\asyncio.py", line 205, in start

  File "C:\Users\HP\AppData\Local\Programs\Python\Python311\Lib\asyncio\base_events.py", line 608, in run_forever

  File "C:\Users\HP\AppData\Local\Programs\Python\Python311\Lib\asyncio\base_events.py", line 1936, in _run_once

  File "C:\Users\HP\AppData\Local\Programs\Python\Python311\Lib\asyncio\events.py", line 84, in _run

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\ipykernel\kernelbase.py", line 545, in dispatch_queue

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\ipykernel\kernelbase.py", line 534, in process_one

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\ipykernel\ipkernel.py", line 362, in execute_request

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\ipykernel\kernelbase.py", line 778, in execute_request

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\ipykernel\ipkernel.py", line 449, in do_execute

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\IPython\core\interactiveshell.py", line 3075, in run_cell

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\IPython\core\interactiveshell.py", line 3130, in _run_cell

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\IPython\core\async_helpers.py", line 128, in _pseudo_sync_runner

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\IPython\core\interactiveshell.py", line 3334, in run_cell_async

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\IPython\core\interactiveshell.py", line 3517, in run_ast_nodes

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code

  File "C:\Users\HP\AppData\Local\Temp\ipykernel_4556\564697334.py", line 30, in <module>

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\transformers\modeling_tf_utils.py", line 1229, in fit

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\tf_keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\tf_keras\src\engine\training.py", line 1804, in fit

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\tf_keras\src\engine\training.py", line 1398, in train_function

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\tf_keras\src\engine\training.py", line 1381, in step_function

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\tf_keras\src\engine\training.py", line 1370, in run_step

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\transformers\modeling_tf_utils.py", line 1672, in train_step

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\tf_keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\tf_keras\src\engine\training.py", line 588, in __call__

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\tf_keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\tf_keras\src\engine\base_layer.py", line 1136, in __call__

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\tf_keras\src\utils\traceback_utils.py", line 96, in error_handler

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\transformers\modeling_tf_utils.py", line 1707, in run_call_with_unpacked_inputs

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\transformers\models\roberta\modeling_tf_roberta.py", line 1724, in call

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\tf_keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\tf_keras\src\engine\base_layer.py", line 1136, in __call__

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\tf_keras\src\utils\traceback_utils.py", line 96, in error_handler

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\transformers\modeling_tf_utils.py", line 1707, in run_call_with_unpacked_inputs

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\transformers\models\roberta\modeling_tf_roberta.py", line 745, in call

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\tf_keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\tf_keras\src\engine\base_layer.py", line 1136, in __call__

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\tf_keras\src\utils\traceback_utils.py", line 96, in error_handler

  File "e:\General_Subjects\Natural Language Processing\Lab-NLP\christ\Lib\site-packages\transformers\models\roberta\modeling_tf_roberta.py", line 164, in call

indices[0,384] = 386 is not in [0, 258)
	 [[{{node tf_roberta_for_question_answering_7/roberta/embeddings/Gather_1}}]] [Op:__inference_train_function_94272]

## **3. Load Test Data**

In [None]:
# Load the test spreadsheet and drop the `title` column. A forward slash is
# used in the path: the backslash form is an invalid escape sequence in a
# normal string literal and only resolves on Windows.
df_test = pd.read_excel("datasets/Corona_NLP_test.xlsx").drop(columns="title")
df_test.info()

In [None]:
# Count whitespace-separated words per context and plot their distribution.
df_test["context_len"] = df_test["context"].map(lambda text: len(text.split()))
df_test["context_len"].hist()

In [None]:
# `datasets` is installed in the first cell but was never imported, so this
# cell raised NameError on a fresh kernel; import it here where it is used.
import datasets

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
ds_test = datasets.Dataset.from_dict(df_test)

# Tokenize the test set in batches and keep only the columns produced by
# preprocess_dataset.
dataset_test = ds_test.map(
    lambda ds: preprocess_dataset(ds, tokenizer, maxlen),
    batched=True,
    remove_columns=ds_test.column_names,
)

In [None]:
from torch.utils.data import DataLoader

# NOTE: this overrides the batch size used for training in the cells above.
batch_size = 16
# Expose the dataset columns as torch tensors; with the default (Python list)
# format the DataLoader's default collate would transpose each sequence into
# a list of per-position tensors instead of one (batch, seq_len) tensor.
test_loader = DataLoader(dataset_test.with_format("torch"), batch_size=batch_size)

## **4. Load model**

In [None]:
# The training loop stores the best model with `save_pretrained`, which
# writes a directory (config + weights). Such a directory is restored with
# `from_pretrained`; `torch.load` expects a single serialized file.
checkpoint_dir = "checkpoints/{}".format(model_name.replace("/", "-"))
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint_dir).to(device)

# Set the model to evaluation mode
model.eval()

# Print model architecture summary
print(model)

In [None]:

# #########################################################
# #MODEL_PHOBERT_BASE = 'vinai/phobert-base'
# MODEL_PHOBERT_LARGE = 'vinai/phobert-large'

# #########################################################
# def get_arguments():
#     parser = argparse.ArgumentParser()
#     parser.add_argument("--model", type=str, default=MODEL_PHOBERT_LARGE, help="Pretrained model bert")
#     parser.add_argument("--lr", type=float, default=1e-5, help="Learning rate")
#     parser.add_argument("--bs", type=int, default=16, help="Batch size")
#     parser.add_argument("--epochs", type=int, default=20, help="Number of epochs")
#     parser.add_argument("--maxlen", type=int, default=512, help="Max sentence length")
#     parser.add_argument("--stride", type=int, default=128, help="Stride value for window slide")
#     parser.add_argument("--use_fast", type=bool, default=True, help="Tokenize sentence with fast bpe")

#     return parser.parse_args(args=[])

# #########################################################
# def preprocess_dataset(ds: pd.DataFrame, tokenizer: Any, maxlen: int):
#     questions = [q.strip() for q in ds["question"]]
#     contexts = [str(t) for t in ds["context"]]
#     inputs = tokenizer(
#         questions,
#         contexts,
#         max_length=maxlen,
#         truncation="only_second",
#         return_token_type_ids=True,
#         padding="max_length",
#     )

#     answer_starts = ds["answer_start"]
#     answer_ends = ds["answer_end"]
#     answers = ds["answer"]
#     start_positions = []
#     end_positions = []
#     assert len(answer_starts) == len(answer_ends)

#     for i in range(len(answer_starts)):
#         start_char = answer_starts[i]
#         end_char = answer_ends[i]
#         if start_char == 0 and end_char == 0:
#             start_positions.append(0)
#             end_positions.append(0)
#             continue

#         answer = answers[i]
#         context = contexts[i]
#         input_ids = inputs["input_ids"][i]

#         # Find the start and end of the context
#         idx = 0
#         while input_ids[idx] != 2:
#             idx += 1
#         idx += 2
#         context_start = idx
#         # print("context start:", context_start, ",id:", input_ids[context_start])
#         while idx < len(input_ids) and input_ids[idx] != 1:
#             idx += 1
#         context_end = idx - 1
#         if input_ids[context_end] == 2:
#             context_end -= 1
#         # print("context end:", context_end, "id:", input_ids[context_end])

#         pre_ans = tokenizer.encode(context[:start_char], add_special_tokens=False)
#         ans_ids = tokenizer.encode(answer, add_special_tokens=False)

#         start_position = context_start + len(pre_ans)
#         end_position = start_position + len(ans_ids) - 1

#         # If the answer is not fully inside the context, label is (0, 0)
#         if start_position < context_start or end_position > context_end:
#             start_positions.append(0)
#             end_positions.append(0)
#         else:
#             # Otherwise it's the start and end token positions
#             start_positions.append(start_position)
#             end_positions.append(end_position)

#     inputs["start_positions"] = np.array(start_positions, dtype=np.int32)
#     inputs["end_positions"] = np.array(end_positions, dtype=np.int32)
#     return inputs

# def generate_dataset(file_name: str, tokenizer: Any, data_collator: Any, maxlen: int, stride: int, batch_size: int, model_name: str):
#     df = pd.read_csv(file_name)
#     df.drop("title", axis=1, inplace=True)
#     df.fillna({"answer": ""}, inplace=True)
#     ds = datasets.Dataset.from_dict(df)

#     dataset =  ds.map(
#        lambda x: preprocess_dataset(x, tokenizer, maxlen),
#        batched=True,
#        remove_columns=ds.column_names,
#     )
    
#     return dataset.to_tf_dataset(
#         columns=[
#             "input_ids",
#             "start_positions",
#             "end_positions",
#             "attention_mask",
#             "token_type_ids",
#         ],
#         collate_fn=data_collator,
#         shuffle=True,
#         batch_size=batch_size,
#     )

#########################################################
# class myCallback(tf.keras.callbacks.Callback):
#     def __init__(self, saved_model_name: str):
#         super().__init__()

#         self.min_loss = sys.float_info.max
#         self.min_val_loss = sys.float_info.max

#         self.saved_model_name = saved_model_name

#     def on_epoch_end(self, epoch, logs={}):
#         min_loss = logs.get('loss')
#         min_val_loss = logs.get('val_loss')

#         if min_loss <= self.min_loss and min_val_loss <= self.min_val_loss:
#             self.min_loss = min_loss
#             self.min_val_loss = min_val_loss

#             print("\nsave model at epoch {}".format(epoch+1))
#             # self.model.save("models/{}.h5".format(self.saved_model_name))
#             self.model.save("/content/gdrive/MyDrive/Research/NLP-Labs/Lab7.1_BERT-Question-and-Answering-Vietnamese/model/vinai-phobert-large", save_format='tf')
            


## **5. Predict**

In [None]:
# Tokenizing the input with PyTorch-compatible tensors
def tokenize_question_context(question, context, tokenizer, maxlen, stride):
    """Encode one stripped question/context pair as PyTorch tensors.

    The pair is padded to ``maxlen``; when it is too long only the context
    (the second sequence) is truncated. ``stride`` is forwarded to the
    tokenizer unchanged.
    """
    encode_options = {
        "max_length": maxlen,
        "truncation": "only_second",
        "stride": stride,
        "padding": "max_length",
        "return_tensors": "pt",
    }
    return tokenizer(question.strip(), context.strip(), **encode_options)

  
# Prediction function using a PyTorch model
def predict(model, question, context, tokenizer, maxlen, stride):
    """Extract candidate answers for ``question`` from ``context``.

    Args:
        model: Question-answering model whose output exposes ``start_logits``
            and ``end_logits``.
        question: Question text.
        context: Passage expected to contain the answer.
        tokenizer: Tokenizer used both to encode the pair and to decode the
            predicted token span.
        maxlen: Maximum encoded sequence length.
        stride: Stride forwarded to the tokenizer.

    Returns:
        List of ``(answer_text, score)`` tuples, one per encoded row whose
        predicted span is valid; ``score`` is the sum of the best start and
        end logits. Rows predicting ``(0, 0)`` or an end before the start are
        skipped.
    """
    # Tokenize input
    inputs = tokenize_question_context(question, context, tokenizer, maxlen, stride)

    # Run on whatever device the model already lives on. Picking CUDA just
    # because it is available breaks when the model was left on the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Run the model in evaluation mode
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract start and end logits and move to CPU if necessary
    start_logits = outputs.start_logits.cpu().numpy()
    end_logits = outputs.end_logits.cpu().numpy()

    # Get the positions with the highest scores
    starts = np.argmax(start_logits, axis=1)
    ends = np.argmax(end_logits, axis=1)

    # Calculate scores
    scores = np.max(start_logits, axis=1) + np.max(end_logits, axis=1)

    # Keep valid spans only and decode them
    answers = []
    for idx, (start, end) in enumerate(zip(starts, ends)):
        if start == 0 and end == 0:  # Skip if no valid answer
            continue
        if end < start:  # Skip if end is before start
            continue
        ans_ids = inputs["input_ids"][idx][start:end + 1]
        answer = tokenizer.decode(ans_ids, skip_special_tokens=True)
        answers.append((answer, scores[idx]))

    return answers


# Keep only the test rows whose context is longer than 500 words and preview them.
df = df_test.loc[df_test["context_len"] > 500]
df.head()

In [None]:
# Preview the first rows of the test DataFrame.
df_test.head()

In [None]:
# Row of the test DataFrame to run the demo prediction on
idx = 4

# Look up and show the question for that row
question = df_test.at[idx, "question"]
print("Question:", question)

# Look up and show the matching context
context = df_test.at[idx, "context"]
print("Context:", context)

# Ask the fine-tuned model for the answer span(s)
answers = predict(
    model,
    question,
    context,
    tokenizer,
    maxlen,
    stride,
)

# Show the predicted (answer, score) pairs
print("Answer:", answers)