In [10]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DefaultDataCollator
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import pipeline

In [11]:
# Authenticate with the Hugging Face Hub; the training run further down is configured with push_to_hub=True.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
# Work on the first 5,000 examples of the SQuAD training split.
squad = load_dataset("squad", split="train[:5000]")
# Hold out 20% for evaluation. The seed is fixed so that the split (and the
# example printed below) is identical on every Restart & Run All.
squad = squad.train_test_split(test_size=0.2, seed=42)
print(squad["train"][0])

{'id': '56ce362faab44d1400b885c7', 'title': 'Sino-Tibetan_relations_during_the_Ming_dynasty', 'context': "With the death of Zhengde and ascension of Jiajing, the politics at court shifted in favor of the Neo-Confucian establishment which not only rejected the Portuguese embassy of Fernão Pires de Andrade (d. 1523), but had a predisposed animosity towards Tibetan Buddhism and lamas. Evelyn S. Rawski, a professor in the Department of History of the University of Pittsburgh, writes that the Ming's unique relationship with Tibetan prelates essentially ended with Jiajing's reign while Ming influence in the Amdo region was supplanted by the Mongols.", 'question': 'What establishment did the politics at court shift in favor of? ', 'answers': {'text': ['Neo-Confucian establishment'], 'answer_start': [98]}}


In [13]:
# Tokenizer for the distilbert-base-uncased checkpoint; preprocess_function below reads this global.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [14]:
def preprocess_function(examples):
    """Tokenize a batch of QA examples and attach start/end token labels.

    Each question is stripped of surrounding whitespace and tokenized together
    with its context (``max_length=384``, ``truncation="only_second"``,
    ``padding="max_length"``). The character offsets returned by the tokenizer
    are used to convert the first answer's character span into token indices.

    Args:
        examples: A batch with ``"question"``, ``"context"`` and ``"answers"``
            entries. Every answer is a mapping with ``"text"`` and
            ``"answer_start"`` lists, of which only the first element is used.

    Returns:
        The tokenizer output (with ``"offset_mapping"`` removed) extended with
        ``"start_positions"`` and ``"end_positions"``. An example whose answer
        is missing, or is not completely contained in the tokenized context,
        is labelled ``(0, 0)``.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    # The offsets are only needed to compute the labels, so drop them from the output.
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # No annotated answer: nothing to locate, use the (0, 0) label.
        if not answer["text"] or not answer["answer_start"]:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token that belong to the context (sequence id 1).
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0).
        # Both ends of the answer must fall within the context's character
        # range; checking only for "entirely outside" would let a partially
        # truncated answer through and push its end label past context_end.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Last context token that starts at or before the answer's first character.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First context token (scanning backwards) that ends at or after the answer's end.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [15]:
# Run the preprocessing over both splits in batches. The original dataset
# columns are removed, so only what preprocess_function returns is kept.
tokenized_squad = squad.map(
    preprocess_function,
    remove_columns=squad["train"].column_names,
    batched=True,
)

# Collator used by the Trainer to assemble batches from the tokenized examples.
data_collator = DefaultDataCollator()

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [16]:
# Load DistilBERT for question answering. As the warning printed below reports, the
# qa_outputs.* weights are not in the checkpoint and are newly initialized.
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Fine-tuning configuration: checkpoints are written to "a_qa_model" and the
# result is pushed to the Hub; evaluation runs once per epoch.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="a_qa_model",
    push_to_hub=True,
    evaluation_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
)

# Wire the model, tokenized splits, tokenizer and collator into the Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
)

# Kept as the last expression so the TrainOutput summary is displayed.
trainer.train()

  0%|          | 0/750 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 2.485596179962158, 'eval_runtime': 7.3489, 'eval_samples_per_second': 136.074, 'eval_steps_per_second': 8.573, 'epoch': 1.0}
{'loss': 2.7624, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.0}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 1.9034978151321411, 'eval_runtime': 7.4677, 'eval_samples_per_second': 133.909, 'eval_steps_per_second': 8.436, 'epoch': 2.0}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 1.8137667179107666, 'eval_runtime': 7.3476, 'eval_samples_per_second': 136.098, 'eval_steps_per_second': 8.574, 'epoch': 3.0}
{'train_runtime': 293.616, 'train_samples_per_second': 40.87, 'train_steps_per_second': 2.554, 'train_loss': 2.3418420003255207, 'epoch': 3.0}


TrainOutput(global_step=750, training_loss=2.3418420003255207, metrics={'train_runtime': 293.616, 'train_samples_per_second': 40.87, 'train_steps_per_second': 2.554, 'train_loss': 2.3418420003255207, 'epoch': 3.0})

In [18]:
# Upload the fine-tuned model to the Hugging Face Hub (requires the login performed earlier).
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/11acc/a_qa_model/commit/ec4e253bb6ca553c7f824aa84bbe7dc3589d0332', commit_message='End of training', commit_description='', oid='ec4e253bb6ca553c7f824aa84bbe7dc3589d0332', pr_url=None, pr_revision=None, pr_num=None)

Now we use the fine-tuned model to answer a question about a text document loaded from disk.

In [27]:
def read_text_from_file(file_path, encoding="utf-8"):
    """Return the entire contents of a text file, or ``None`` if it is missing.

    Args:
        file_path: Path of the file to read.
        encoding: Encoding used to decode the file. Defaults to UTF-8 so the
            result does not depend on the platform's default encoding.

    Returns:
        The file contents as one string. If the file does not exist, a message
        is printed and ``None`` is returned instead of raising.
    """
    try:
        with open(file_path, 'r', encoding=encoding) as file:
            # A single read() replaces the readline loop and its repeated
            # string concatenation; the resulting text is the same.
            return file.read()
    except FileNotFoundError:
        print(f"File '{file_path}' not found.")
        return None

# Example usage: load the document that is used as the question-answering context.
# NOTE: read_text_from_file returns None when the file is missing, so `context` may be None here.
file_path = "./docs/Taiwan_text.txt"
context = read_text_from_file(file_path)

In [29]:
# Question to ask about the document loaded into `context`.
question = "What actions can Congress take to make sure the President follows the Taiwan Relations Act?"

# Build a question-answering pipeline from "a_qa_model" (the same name used as the Trainer's output_dir)
# and run it; the returned dict is displayed as the cell output.
question_answerer = pipeline("question-answering", model="a_qa_model")
question_answerer(question=question, context=context)

{'score': 0.11351613700389862,
 'start': 159,
 'end': 170,
 'answer': 'legislation'}

In [21]:
# Tokenize the question/context pair and return PyTorch tensors.
# The context comes from a file and can be arbitrarily long, so truncate it
# (never the question) to the tokenizer's maximum length instead of passing an
# over-long sequence to the model.
# NOTE(review): text cut off by truncation cannot be returned as an answer —
# for long documents prefer the pipeline above or split the context into windows.
tokenizer = AutoTokenizer.from_pretrained("a_qa_model")
inputs = tokenizer(question, context, truncation="only_second", return_tensors="pt")

In [22]:
# Pass your inputs to the model and return the logits.
# The model is reloaded from "a_qa_model"; torch.no_grad() skips gradient tracking for this forward pass.
model = AutoModelForQuestionAnswering.from_pretrained("a_qa_model")
with torch.no_grad():
    outputs = model(**inputs)

In [23]:
# Get the highest-scoring token index from the model output for the start and end positions.
# NOTE(review): the two argmax calls are independent, so nothing here guarantees
# answer_end_index >= answer_start_index — confirm how an inverted span should be handled.
answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

In [24]:
# Decode the predicted tokens to get the answer.
# The slice ends at answer_end_index + 1 so that the end token itself is included.
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

'176 billion parameters and can generate text in 46 languages natural languages and 13'