# Вопрос-ответ

In [1]:
!pip install -q transformers datasets torch scipy scikit-learn accelerate evaluate nltk rouge_score sentencepiece sacrebleu

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [17]:
from datasets import load_dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Trainer,
    Seq2SeqTrainingArguments
)
import torch
from torch.utils.data import Dataset
import random

class QADataset(Dataset):
    def __init__(self, contexts, questions, answers, tokenizer):
        # Format: "<start_question> {question} <end_question> <start_context> {context} <end_context>"
        self.inputs = tokenizer(
            [f"<start_question> {q} <end_question> <start_context> {c} <end_context>" for q, c in zip(questions, contexts)],
            max_length=512,
            truncation=True,
            padding='max_length',
            return_tensors="pt"
        )
        self.targets = tokenizer(
            answers,
            max_length=128,
            truncation=True,
            padding='max_length',
            return_tensors="pt"
        )

    def __len__(self):
        return len(self.targets["input_ids"])

    def __getitem__(self, idx):
        return {
            "input_ids": self.inputs["input_ids"][idx],
            "attention_mask": self.inputs["attention_mask"][idx],
            "labels": self.targets["input_ids"][idx]
        }

def train_qa_model():
    # Load model and tokenizer
    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    model = T5ForConditionalGeneration.from_pretrained("t5-small").cuda()

    # Add special tokens for QA task
    special_tokens_dict = {
        'additional_special_tokens': [
            '<start_question>', '<end_question>',
            '<start_context>', '<end_context>'
        ]
    }
    tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    # Load SQuAD dataset
    dataset = load_dataset("squad", split="train[:10%]", trust_remote_code=True)

    # Take share of the loaded data if needed
    total_examples = len(dataset)
    subsample_size = total_examples

    # Randomly sample indices
    all_indices = list(range(total_examples))
    selected_indices = random.sample(all_indices, subsample_size)

    # Get subsampled data
    contexts = [dataset[i]["context"] for i in selected_indices]
    questions = [dataset[i]["question"] for i in selected_indices]
    answers = [dataset[i]["answers"]["text"][0] for i in selected_indices]  # Taking first answer if multiple exist

    # Create dataset
    train_size = int(len(contexts) * 0.8)
    train_dataset = QADataset(
        contexts[:train_size],
        questions[:train_size],
        answers[:train_size],
        tokenizer
    )
    eval_dataset = QADataset(
        contexts[train_size:],
        questions[train_size:],
        answers[train_size:],
        tokenizer
    )

    # Training configuration
    training_args = Seq2SeqTrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        report_to="tensorboard",
        learning_rate=1e-4,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        weight_decay=0.01,
        save_total_limit=2,
        num_train_epochs=1,
        predict_with_generate=True,
        logging_dir="./logs",
        logging_steps=5,
        push_to_hub=False,
        save_strategy="epoch"
    )

    # Training
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset
    )

    trainer.train()
    trainer.save_model("./qa_model")
    return model, tokenizer

def answer_question(context, question, model, tokenizer):
    input_text = f"<start_question> {question} <end_question> <start_context> {context} <end_context>"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

    # Move inputs to CUDA
    inputs = {k: v.cuda() for k, v in inputs.items()}

    answer_ids = model.generate(
        inputs["input_ids"],
        max_new_tokens =128,
        min_new_tokens =1,
        num_beams=4,
        length_penalty=0.6,
        early_stopping=True
    )
    return tokenizer.decode(answer_ids[0], skip_special_tokens=True)

In [18]:
main()



Epoch,Training Loss,Validation Loss
1,0.0176,0.012601



Context: 
    The Apollo program was the third United States human spaceflight program carried out
    by NASA. It accomplished landing the first humans on the Moon from 1969 to 1972.
    During the Apollo 11 mission, astronauts Neil Armstrong and Buzz Aldrin landed their
    lunar module and walked on the lunar surface, while Michael Collins remained in lunar orbit.
    

Answering questions:

Q: Who was the first person to walk on the Moon?
A: Neil Armstrong and Buzz Aldrin

Q: What was the name of the space program?
A: Apollo

Q: How many astronauts landed on the Moon during Apollo 11?
A: Neil Armstrong
