In [1]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
import collections
import numpy as np

print("PyTorch and CUDA available:", torch.cuda.is_available())


PyTorch and CUDA available: True


In [17]:
dataset_name = "tydiqa"
task = "primary_task"
print(f"Loading a sample of the '{dataset_name}' dataset...")

train_dataset = load_dataset(dataset_name, task, split='train[:1000]')
validation_dataset = load_dataset(dataset_name, task, split='validation[:200]')

Loading a sample of the 'tydiqa' dataset...


In [3]:
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
print(train_dataset)
print(train_dataset.column_names)

Dataset({
    features: ['passage_answer_candidates', 'question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'],
    num_rows: 10000
})
['passage_answer_candidates', 'question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url']


In [18]:
MAX_LENGTH = 384
DOC_STRIDE = 128

def preprocess_function(examples):
    questions = [q.strip() for q in examples["question_text"]]
    contexts = examples["document_plaintext"]
    answers = examples["annotations"]

    inputs = tokenizer(
        questions,
        contexts,
        max_length=MAX_LENGTH,
        truncation="only_second",
        stride=DOC_STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")

    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        annotation = answers[sample_idx]


        answer = annotation.get('minimal_answer', {})
        start_char = answer.get('span_start', -1)
        span_text = answer.get('span_text', "")

        if start_char == -1 or not span_text:
            start_positions.append(0)
            end_positions.append(0)
            continue

        end_char = start_char + len(span_text)
        sequence_ids = inputs.sequence_ids(i)

        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            token_start_index = context_start
            while token_start_index <= context_end and offset[token_start_index][0] < start_char:
                token_start_index += 1

            token_end_index = context_end
            while token_end_index >= context_start and offset[token_end_index][1] > end_char:
                token_end_index -= 1

            start_positions.append(token_start_index)
            end_positions.append(token_end_index)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

print("Preprocessing data...")

tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_validation = validation_dataset.map(preprocess_function, batched=True, remove_columns=validation_dataset.column_names)

Preprocessing data...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [19]:
training_args = TrainingArguments(
    output_dir="./qa_results",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
    fp16=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [20]:
print("Starting training... 🚀")
trainer.train()
print("Training finished! ✅")




Starting training... 🚀


Step,Training Loss
500,0.0003
1000,0.0
1500,0.0
2000,0.0
2500,0.0
3000,0.0
3500,0.0
4000,0.0
4500,0.0
5000,0.0


Training finished! ✅

--- Testing the model on a sample ---


KeyError: 'question'

In [22]:
print("\n--- Testing the model on a sample ---")
random_sample = validation_dataset[10] # اخترنا عينة عشوائية للتقييم
# --- التعديل هنا ---
# استخدام أسماء الأعمدة الصحيحة من مجموعة البيانات الأصلية
question = random_sample['question_text']
context = random_sample['document_plaintext']
# التحقق بأمان من وجود إجابة فعلية
actual_answer = random_sample.get('annotations', {}).get('minimal_answer', {}).get('span_text', ['N/A'])[0]


inputs = tokenizer(question, context, return_tensors="pt")


Token indices sequence length is longer than the specified maximum sequence length for this model (1524 > 512). Running this sequence through the model will result in indexing errors



--- Testing the model on a sample ---


In [25]:
print("\n--- Testing the model on a sample ---")
random_sample = validation_dataset[10] # اخترنا عينة عشوائية للتقييم
# --- التعديل هنا ---
# استخدام أسماء الأعمدة الصحيحة من مجموعة البيانات الأصلية
question = random_sample['question_text']
context = random_sample['document_plaintext']
# التحقق بأمان من وجود إجابة فعلية
actual_answer = random_sample.get('annotations', {}).get('minimal_answer', {}).get('span_text', ['N/A'])[0]



inputs = tokenizer(
    question,
    context,
    return_tensors="pt",
    max_length=MAX_LENGTH,
    truncation="only_second",
    padding="max_length"
)

device = "cuda" if torch.cuda.is_available() else "cpu"
inputs = {k: v.to(device) for k, v in inputs.items()}
model.to(device)

with torch.no_grad():
    outputs = model(**inputs)

answer_start_index = torch.argmax(outputs.start_logits)
answer_end_index = torch.argmax(outputs.end_logits)

predict_answer_tokens = inputs['input_ids'][0, answer_start_index : answer_end_index + 1]
predicted_answer = tokenizer.decode(predict_answer_tokens)

print(f"\n[Context]: {context[:500]}...")
print(f"\n[Question]: {question}")
print(f"\n[Predicted Answer]: {predicted_answer}")
print(f"\n[Actual Answer]: {actual_answer}")




--- Testing the model on a sample ---

[Context]: 
بطولة كأس العالم لكرة القدم هي أهم مسابقة كرة قدم دولية يقيمها الاتحاد الدولي لكرة القدم (الـفيفا). أقيمت البطولة أول مرة عام 1930م وتقام بطولة كأس العالم لكرة القدم حتى الآن كل أربع سنوات باستثناء بطولتي عام 1942 و1946م اللتين ألغيتا بسبب الحرب العالمية الثانية[1]. ويعد المنتخب الفرنسي هو البطل الحالي للبطولة بعد فوزه في المباراة النهائية أمام المنتخب الكرواتي في نهائي كاس العالم 2018[2] تقام المباراة النهائية من مباراة على شوطين، وفي حالة التعادل يتم اللجوء إلى وقت إضافي ثم الركلات الترجيحية....

[Question]: كم عدد مرات فوز الأوروغواي ببطولة كاس العالم لكرو القدم؟

[Predicted Answer]: <s>

[Actual Answer]: N/A


Please note that since the model is trained on a very small dataset, the generated answers may not be complete or accurate. For high-quality, reliable results, the model must be trained on a much larger dataset (e.g., the full training set) and for more epochs (e.g., 3-5).
