# Reader: DistilBERT

In [None]:
!apt install git-lfs
!pip install -qq transformers[sentencepiece,torch] datasets evaluate accelerate --upgrade

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 20 not upgraded.
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m336.6/336.6 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m64.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub from inside the notebook
# (the training arguments further down set push_to_hub=True).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import numpy as np
from tqdm.auto import tqdm
import collections

import torch

from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering
from transformers import TrainingArguments
from transformers import Trainer
import evaluate

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



In [None]:

# Checkpoint used for both the tokenizer and the question-answering model.
MODEL_NAME =  "distilbert-base-uncased"
# Maximum length of each tokenized feature; passed as `max_length` to the tokenizer.
MAX_LENGTH = 384
# Passed as `stride` to the tokenizer when a long context is split into several
# features. In the Hugging Face tokenizer API this is the number of tokens that
# consecutive chunks share (their overlap), not the distance between chunk starts.
STRIDE = 128



In [None]:
DATASET_NAME = "squad_v2" # dataset identifier, downloaded from the Hugging Face Hub
raw_datasets = load_dataset(DATASET_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) # load the tokenizer that matches MODEL_NAME

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def preprocess_training_examples(examples):
  """Tokenize a batch of question/context pairs and label the answer span.

  Questions are stripped of surrounding whitespace and tokenized together
  with their contexts. Only the context (the second sequence) is truncated,
  so a long context yields several overlapping features. For every feature
  the token indices of the answer are stored in ``start_positions`` and
  ``end_positions``; the pair ``(0, 0)`` is used when the example has no
  answer or the answer is not fully contained in that feature's window.

  Args:
    examples: batch with ``question``, ``context`` and ``answers`` columns;
      each answer provides ``text`` and ``answer_start`` lists.

  Returns:
    The tokenizer output with ``offset_mapping`` and
    ``overflow_to_sample_mapping`` removed and the two label columns added.
  """
  stripped_questions = [question.strip() for question in examples['question']]

  inputs = tokenizer(
      stripped_questions,
      examples['context'],
      max_length=MAX_LENGTH,
      truncation="only_second",
      stride=STRIDE,
      return_overflowing_tokens=True,
      return_offsets_mapping=True,
      padding="max_length",
  )

  # Character offsets of every token, one list per feature.
  all_offsets = inputs.pop("offset_mapping")
  # Index of the source example each feature was cut from.
  feature_to_example = inputs.pop("overflow_to_sample_mapping")
  all_answers = examples['answers']

  start_positions, end_positions = [], []

  for feature_idx, offsets in enumerate(all_offsets):
    seq_ids = inputs.sequence_ids(feature_idx)

    # Locate the first and last token that belong to sequence 1 (the context).
    ctx_first = 0
    while seq_ids[ctx_first] != 1:
      ctx_first += 1
    ctx_last = ctx_first
    while seq_ids[ctx_last + 1] == 1:
      ctx_last += 1

    answer = all_answers[feature_to_example[feature_idx]]

    # Default label: position 0 for both ends (no answer inside this window).
    start_token, end_token = 0, 0

    if len(answer['text']) != 0:
      # Only the first listed answer is used for training.
      answer_start_char = answer['answer_start'][0]
      answer_end_char = answer['answer_start'][0] + len(answer['text'][0])

      answer_inside_window = (
          offsets[ctx_first][0] <= answer_start_char
          and offsets[ctx_last][1] >= answer_end_char
      )
      if answer_inside_window:
        # Walk forward to the last token that starts at or before the answer start.
        cursor = ctx_first
        while cursor <= ctx_last and offsets[cursor][0] <= answer_start_char:
          cursor += 1
        start_token = cursor - 1

        # Walk backward to the first token that ends at or after the answer end.
        cursor = ctx_last
        while cursor >= ctx_first and offsets[cursor][1] >= answer_end_char:
          cursor -= 1
        end_token = cursor + 1

    start_positions.append(start_token)
    end_positions.append(end_token)

  inputs['start_positions'] = start_positions
  inputs['end_positions'] = end_positions

  return inputs

In [None]:
# Tokenize the training split batch by batch. The raw columns are removed
# because the preprocessing function returns its own set of columns.
train_split = raw_datasets["train"]
train_dataset = train_split.map(
    preprocess_training_examples,
    batched=True,
    remove_columns=train_split.column_names,
)

# Number of raw examples vs. number of tokenized features.
len(train_split), len(train_dataset)

Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

(130319, 131754)

In [None]:
def preprocess_validation_examples(examples):
  """Tokenize a batch of validation question/context pairs for evaluation.

  Long contexts are split into overlapping features. For each feature the id
  of the example it came from is stored under ``example_id``, and every
  ``offset_mapping`` entry whose token is not part of sequence 1 (the
  context, which is passed second to the tokenizer) is replaced by ``None``.

  Args:
    examples: batch with ``id``, ``question`` and ``context`` columns.

  Returns:
    The tokenizer output with the masked ``offset_mapping`` and the added
    ``example_id`` column; ``overflow_to_sample_mapping`` is removed.
  """
  stripped_questions = [question.strip() for question in examples["question"]]

  inputs = tokenizer(
      stripped_questions,
      examples["context"],
      max_length=MAX_LENGTH,
      truncation="only_second",
      stride=STRIDE,
      return_overflowing_tokens=True,
      return_offsets_mapping=True,
      padding="max_length",
  )

  # Index of the source example each feature was cut from.
  feature_to_example = inputs.pop("overflow_to_sample_mapping")
  source_ids = examples["id"]
  example_ids = []

  for feature_idx in range(len(inputs['input_ids'])):
    example_ids.append(source_ids[feature_to_example[feature_idx]])

    # Keep character offsets only for context tokens.
    seq_ids = inputs.sequence_ids(feature_idx)
    inputs['offset_mapping'][feature_idx] = [
        span if seq_id == 1 else None
        for seq_id, span in zip(seq_ids, inputs['offset_mapping'][feature_idx])
    ]

  inputs['example_id'] = example_ids
  return inputs

In [None]:
# Tokenize the validation split batch by batch, dropping the raw columns.
validation_split = raw_datasets['validation']
validation_dataset = validation_split.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=validation_split.column_names,
)

# Number of raw examples vs. number of tokenized features.
len(validation_split), len(validation_dataset)

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

(11873, 12134)

In [None]:
# Load the MODEL_NAME checkpoint through the question-answering auto class.
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)
# Training configuration: evaluate and save once per epoch, train for three
# epochs in fp16, and push the result to the Hub.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm the keyword against the installed version.
args = TrainingArguments(
    output_dir = "distilbert-base-uncased-squad2",
    evaluation_strategy = "epoch",
    save_strategy="epoch",
    learning_rate = 2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
    push_to_hub=True
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Wire the model, training arguments, tokenized datasets and tokenizer together.
# NOTE(review): the recorded training table shows "No log" for validation loss;
# the validation features are built without start/end labels, so presumably no
# evaluation loss can be computed — confirm whether per-epoch evaluation is wanted.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer
)
# Run fine-tuning.
trainer.train()

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mphuochuunguyen2009[0m ([33mphuochuunguyen2009-localcompany[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
1,1.186,No log
2,0.8918,No log


Epoch,Training Loss,Validation Loss
1,1.186,No log
2,0.8918,No log
3,0.6779,No log


TrainOutput(global_step=49410, training_loss=1.016939915415971, metrics={'train_runtime': 5108.5678, 'train_samples_per_second': 77.372, 'train_steps_per_second': 9.672, 'total_flos': 3.873165421863629e+16, 'train_loss': 1.016939915415971, 'epoch': 3.0})

In [None]:
# Upload the trained model to the Hub with an explicit commit message.
trainer.push_to_hub(commit_message="Training complete")


CommitInfo(commit_url='https://huggingface.co/phuoc2k9/distilbert-base-uncased-squad2/commit/23fc6341ec8e35bfa9c91d4e416c4ac2784df652', commit_message='Training complete', commit_description='', oid='23fc6341ec8e35bfa9c91d4e416c4ac2784df652', pr_url=None, repo_url=RepoUrl('https://huggingface.co/phuoc2k9/distilbert-base-uncased-squad2', endpoint='https://huggingface.co', repo_type='model', repo_id='phuoc2k9/distilbert-base-uncased-squad2'), pr_revision=None, pr_num=None)

In [None]:
# Load the SQuAD v2 metric; compute_metrics calls metric.compute on it.
metric = evaluate.load("squad_v2")

Downloading builder script:   0%|          | 0.00/6.47k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

In [None]:
# Number of top-scoring start / end token candidates examined per feature.
N_BEST = 20
# Maximum allowed answer length, measured in tokens.
MAX_ANS_LENGTH = 30

def compute_metrics(start_logits,end_losgits,features,examples,null_score_diff_threshold=0.0):
  """Convert start/end logits into SQuAD v2 predictions and score them.

  For every example, the N_BEST highest start and end logits of each of its
  features are combined into candidate spans. Spans that touch a non-context
  token, run backwards, or exceed MAX_ANS_LENGTH tokens are discarded. The
  best remaining span is compared with the "no answer" score, taken from
  token position 0 (the position the training labels use for unanswerable
  questions); the empty string is predicted when the no-answer score beats
  the best span by more than ``null_score_diff_threshold``.

  Args:
    start_logits: per-feature start logits, indexable by feature index.
    end_losgits: per-feature end logits. The misspelled name is kept so that
      existing keyword callers keep working.
    features: tokenized features providing ``example_id`` and
      ``offset_mapping`` (``None`` for non-context tokens).
    examples: raw examples providing ``id``, ``context`` and ``answers``.
    null_score_diff_threshold: margin by which the no-answer score must
      exceed the best span score before an empty answer is predicted.

  Returns:
    The result of ``metric.compute`` for the predictions and references.
  """
  # Bind the misspelled parameter to the intended name. The original body read
  # `end_logits` directly and therefore silently depended on a notebook global.
  end_logits = end_losgits

  # Group feature indices by the example they were cut from.
  example_to_features = collections.defaultdict(list)
  for idx,feature in enumerate(features):
    example_to_features[feature['example_id']].append(idx)

  predicted_answers = []
  for example in tqdm(examples):
    example_id = example['id']
    context = example['context']
    answers = []
    # Lowest no-answer score over all features of this example.
    null_score = None

    for feature_index in example_to_features[example_id]:
      start_logit = start_logits[feature_index]
      end_logit = end_logits[feature_index]
      offsets = features[feature_index]['offset_mapping']

      feature_null_score = float(start_logit[0]) + float(end_logit[0])
      if null_score is None or feature_null_score < null_score:
        null_score = feature_null_score

      start_indexes = np.argsort(start_logit)[-1:-N_BEST-1:-1].tolist()
      end_indexes = np.argsort(end_logit)[-1:-N_BEST-1:-1].tolist()

      for start_index in start_indexes:
        for end_index in end_indexes:
          # Skip tokens outside the context (question, special and padding tokens).
          if offsets[start_index] is None or offsets[end_index] is None:
            continue
          # Skip reversed spans and spans longer than the allowed maximum.
          if end_index < start_index or end_index - start_index+1 > MAX_ANS_LENGTH:
            continue

          answers.append({
              "text":context[offsets[start_index][0]:offsets[end_index][1]],
              "logit_score":float(start_logit[start_index]) + float(end_logit[end_index])
          })

    best_answer = max(answers,key=lambda x:x['logit_score']) if answers else None
    predict_no_answer = best_answer is None or (
        null_score is not None
        and null_score - best_answer['logit_score'] > null_score_diff_threshold
    )

    # no_answer_probability must be a probability. The decision is already
    # encoded in prediction_text, so report a hard 1.0 / 0.0 here.
    if predict_no_answer:
      answer_dict = {
          "id":example_id,
          "prediction_text":"",
          "no_answer_probability":1.0
      }
    else:
      answer_dict = {
          "id":example_id,
          "prediction_text":best_answer['text'],
          "no_answer_probability":0.0
      }
    predicted_answers.append(answer_dict)

  theoretical_answers = [
      {'id':ex['id'],'answers':ex['answers']} for ex in examples
  ]
  return metric.compute(
      predictions=predicted_answers,
      references=theoretical_answers
  )

In [None]:
# trainer.predict returns three values; only the first (the predictions) is used.
predictions, _,_ = trainer.predict(validation_dataset)
# The predictions unpack into the start logits and the end logits.
start_logits,end_logits = predictions

# Score the predictions against the raw validation examples.
results = compute_metrics(
    start_logits,
    end_logits,
    validation_dataset,
    raw_datasets['validation']
)
results

  0%|          | 0/11873 [00:00<?, ?it/s]

{'exact': 48.479743956876945,
 'f1': 52.377425485872166,
 'total': 11873,
 'HasAns_exact': 75.42172739541161,
 'HasAns_f1': 83.22826801514165,
 'HasAns_total': 5928,
 'NoAns_exact': 21.6148023549201,
 'NoAns_f1': 21.6148023549201,
 'NoAns_total': 5945,
 'best_exact': 65.72896487829529,
 'best_exact_thresh': -11.17578125,
 'best_f1': 67.59187175707329,
 'best_f1_thresh': -9.734375}

# Retriever: FAISS

In [None]:
!pip install -qq transformers[sentencepiece] datasets==2.16.1 evaluate==0.4.1
! sudo apt-get install libomp-dev
!pip install faiss-cpu

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.5.1+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cuda-cupti-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-cupti-cu12 12.5.82 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-nvrtc-cu12 12.5.82 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cuda-runtime-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-runtime-cu12 12.5.82 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cudnn-cu12==9.1.0.70; platform_sy

In [None]:
import numpy as np
import collections
import torch
import faiss
import evaluate

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from transformers import AutoModelForQuestionAnswering
from transformers import TrainingArguments
from transformers import Trainer
from tqdm.auto import tqdm

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
DATASET_NAME = "squad_v2"
# Request the train and validation splits together as a single dataset.
raw_datasets = load_dataset(DATASET_NAME, split="train+validation")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [None]:
# Keep only examples that have at least one answer text.
raw_datasets = raw_datasets.filter(lambda x:len(x['answers']['text'])>0)
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Model used by get_embeddings to embed questions; moved to the selected device.
model = AutoModel.from_pretrained(MODEL_NAME).to(device)

Filter:   0%|          | 0/142192 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
def cls_pooling(model_output):
  """Return the hidden state at position 0 of every sequence in the batch.

  Args:
    model_output: object exposing ``last_hidden_state``, indexed as
      ``[batch, position, ...]``.

  Returns:
    ``last_hidden_state`` sliced to position 0 along the second axis.
  """
  hidden_states = model_output.last_hidden_state
  return hidden_states[:, 0]
def get_embeddings(text_list):
  """Embed text with the module-level model and return the pooled vectors.

  The text is tokenized with padding and truncation, moved to ``device``,
  passed through ``model``, and reduced with ``cls_pooling``.

  Args:
    text_list: a string or list of strings accepted by the tokenizer.

  Returns:
    Tensor with one pooled embedding row per input text, on ``device``.
    It carries no autograd history.
  """
  encoded_input = tokenizer(text_list,padding=True,
                             truncation = True,
                             return_tensors="pt")
  encoded_input = {k: v.to(device) for k,v in encoded_input.items()}
  # Inference only: skip autograd bookkeeping so no graph is built and kept
  # alive for every call (callers detach the result anyway).
  with torch.no_grad():
    model_output = model(**encoded_input)

  return cls_pooling(model_output)

# Name of the dataset column that stores each question's embedding.
EMBEDDING_COLUMN = "question_embedding"

def _embed_question(example):
  """Return the embedding column for a single example's question."""
  # get_embeddings returns one row per input text; take the only row.
  question_vector = get_embeddings(example['question']).detach().cpu().numpy()[0]
  return {EMBEDDING_COLUMN: question_vector}

embeddings_dataset = raw_datasets.map(_embed_question)

Map:   0%|          | 0/92749 [00:00<?, ? examples/s]

In [None]:
# Build a FAISS index over the embedding column so it can be searched by vector.
embeddings_dataset.add_faiss_index(column=EMBEDDING_COLUMN)


  0%|          | 0/93 [00:00<?, ?it/s]

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'question_embedding'],
    num_rows: 92749
})

In [None]:
input_question = "When did Beyonce start becoming popular?"

# Embed the query the same way the indexed questions were embedded and
# convert it to a NumPy array for the index lookup.
input_quest_embedding = get_embeddings([input_question]).cpu().detach().numpy()

# Number of nearest neighbours to retrieve.
TOP_K = 5
scores,samples = embeddings_dataset.get_nearest_examples(
    EMBEDDING_COLUMN, input_quest_embedding, k=TOP_K
)

# Show each retrieved question with its context. The recorded output shows
# 0.0 for the identical question, so lower scores appear to mean closer matches.
for idx,(score,question,context) in enumerate(zip(scores,samples['question'],samples['context'])):
  print(f"Top {idx+1}\tScore: {score}")
  print(f"Question: {question}")
  print(f'Context: {context}')
  print()

Top 1	Score: 0.0
Question: When did Beyonce start becoming popular?
Context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".

Top 2	Score: 2.613530397415161
Question: When did Beyoncé rise to fame?
Context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress.

In [None]:
from transformers import pipeline

PIPELINE_NAME = "question-answering"
# NOTE(review): this rebinds MODEL_NAME (previously "distilbert-base-uncased") to a
# Hub checkpoint whose name differs from the "distilbert-base-uncased-squad2" output
# directory trained in the Reader section — confirm this is the intended reader.
MODEL_NAME= "thangduong0509/distilbert-finetuned-squadv2"
pipe = pipeline(PIPELINE_NAME,model=MODEL_NAME)


config.json:   0%|          | 0.00/561 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
print(f"Input question: {input_question}")
# Run the reader on every retrieved (question, context) pair.
# NOTE(review): the retrieved question, not `input_question`, is passed to the
# pipeline — confirm this is intended.
for idx,(score,question,context) in enumerate(zip(scores,samples['question'],samples['context'])):
  answer = pipe(question=question,context=context)
  print(f"Top {idx+1}\tScore: { score}")
  print(f"Context: { context}")
  print(f"Answer: {answer}")

Input question: When did Beyonce start becoming popular?
Top 1	Score: 0.0
Context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Answer: {'score': 0.6091992855072021, 'start': 276, 'end': 286, 'answer': 'late 1990s'}
Top 2	Score: 2.613530397415161
Context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an Ameri