In [1]:
pip install transformers[torch]


Collecting accelerate>=0.20.3 (from transformers[torch])
  Obtaining dependency information for accelerate>=0.20.3 from https://files.pythonhosted.org/packages/53/fe/0251ccd9e0015c705e772da0fb2c96cdafd87b1d7dd45dc13dca7ced0eb7/accelerate-0.29.3-py3-none-any.whl.metadata
  Downloading accelerate-0.29.3-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
   ---------------------------------------- 0.0/297.6 kB ? eta -:--:--
   ---------------------------------------- 0.0/297.6 kB ? eta -:--:--
   - -------------------------------------- 10.2/297.6 kB ? eta -:--:--
   ---- ---------------------------------- 30.7/297.6 kB 262.6 kB/s eta 0:00:02
   ----- --------------------------------- 41.0/297.6 kB 245.8 kB/s eta 0:00:02
   --------- ----------------------------- 71.7/297.6 kB 357.2 kB/s eta 0:00:01
   ------------ -------------------------- 92.2/297.6 kB 403.5 kB/s eta 0:00:01
   -------------- ----------------------- 112.6/297.6 kB 435.7 kB/s eta 

In [11]:
pip install transformers torch pandas datasets





In [75]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForQuestionAnswering, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

data = {
    "context": [
               # "Helicobacter pylori infection is strongly associated with gastric cancer and peptic ulcer disease.",
               # "The standard treatment for H. pylori involves a combination of antibiotics such as amoxicillin and clarithromycin.",
               # "H. pylori is a Gram-negative, microaerophilic bacterium found in the stomach.",
               # "Over half of the world's population harbors H. pylori in their upper gastrointestinal tract.",
              #  "Infection with H. pylori is usually asymptomatic, but it can cause gastric problems and is a risk factor for stomach cancer.",
              #  "The urease test is a diagnostic tool for detecting H. pylori infection based on the bacterium's ability to hydrolyze urea.",
               # "H. pylori's ability to produce urease is a key factor in its survival in the acidic environment of the stomach.",
                "Triple therapy for H. pylori infection includes a proton pump inhibitor and two antibiotics for 14 days.",
                #"The breath test for H. pylori involves drinking a liquid containing urea, which the bacteria can break down if present.",
                "H. pylori was first identified in 1982 by Barry Marshall and Robin Warren, who later won the Nobel Prize for this discovery."],
    "question": [ 
                #"What diseases are associated with Helicobacter pylori?",
                # "What antibiotics are used to treat H. pylori?",
               # "What type of bacterium is H. pylori?",
                # "How common is H. pylori infection worldwide?",
                # "What are the possible consequences of an H. pylori infection?",
                 #"What does the urease test diagnose?",
                 #"Why is urease important for H. pylori?",
                 "What does triple therapy for H. pylori involve?",
                # "How does the breath test for H. pylori work?",
                # "Who discovered H. pylori and what was the recognition for this work?"],
    "answer": [#"Gastric cancer and peptic ulcer disease",
              # "Amoxicillin and clarithromycin",
              # "A Gram-negative, microaerophilic bacterium",
            #  "Over half of the world's population",
            #   "Gastric problems and stomach cancer",
              # "H. pylori infection",
               #"Helps survival in acidic stomach conditions",
               "A proton pump inhibitor and two antibiotics for 14 days",
              # "Drinking a liquid with urea, which H. pylori breaks down if present",
               "Barry Marshall and Robin Warren, Nobel Prize"]
}

df = pd.DataFrame(data)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

class QADataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer

    def __getitem__(self, index):
        question = self.data.iloc[index]['question']
        context = self.data.iloc[index]['context']
        answer_text = self.data.iloc[index]['answer']

        # Encode the inputs
        inputs = self.tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
        input_ids = inputs["input_ids"].squeeze()

        # Find the position of the answer in the context
        answer_start_index = context.find(answer_text)
        answer_end_index = answer_start_index + len(answer_text)

        # Locate the positions of the start and end of the answer in the tokens
        answer_start_token = self.tokenizer.encode(context[:answer_start_index], add_special_tokens=False, return_tensors="pt").size(1)
        answer_end_token = answer_start_token + self.tokenizer.encode(answer_text, add_special_tokens=False, return_tensors="pt").size(1) - 1

        return {
            "input_ids": input_ids,
            "attention_mask": inputs["attention_mask"].squeeze(),
            "start_positions": torch.tensor(answer_start_token),
            "end_positions": torch.tensor(answer_end_token)
        }

    def __len__(self):
        return self.len


In [76]:
val_df

Unnamed: 0,context,question,answer
1,H. pylori was first identified in 1982 by Barr...,Who discovered H. pylori and what was the reco...,"Barry Marshall and Robin Warren, Nobel Prize"


In [77]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

train_dataset = QADataset(train_df, tokenizer)
val_dataset = QADataset(val_df, tokenizer)


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [78]:
from sklearn.metrics import accuracy_score

def compute_metrics(p):
    # Извлечение истинных значений
    true_starts, true_ends = p.label_ids

    # Извлечение предсказаний
    predictions = p.predictions
    pred_starts = predictions[0].argmax(-1)
    pred_ends = predictions[1].argmax(-1)

    # Вычисление точности для начальной и конечной позиций
    start_accuracy = accuracy_score(true_starts, pred_starts)
    end_accuracy = accuracy_score(true_ends, pred_ends)

    return {
        'start_accuracy': start_accuracy,
        'end_accuracy': end_accuracy
    }


training_args = TrainingArguments(
    output_dir='./result',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=500,
    save_total_limit=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

eval_results = trainer.evaluate()
print(eval_results)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


Step,Training Loss,Validation Loss


{'eval_loss': 5.970719337463379, 'eval_start_accuracy': 0.0, 'eval_end_accuracy': 1.0, 'eval_runtime': 1.0322, 'eval_samples_per_second': 0.969, 'eval_steps_per_second': 0.969, 'epoch': 3.0}


In [79]:
from transformers import pipeline

# Создаем пайплайн для ответов на вопросы с использованием модели и токенизатора
qa_pipeline = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer
)

# Пример вопроса и контекста из валидационного набора данных
#example = val_df.iloc[1]
for idx, example in val_df.iterrows():
    context = example['context']
    question = example['question']
    

# Выполнение предсказания
    result = qa_pipeline({
    'question': question,
    'context': context
    })

    print(f"Question: {question}")
    print(f"Context: {context}")
    print(f"Answer: {result['answer']}")
    print(f"Score: {result['score']}")
    print(f"Start position: {result['start']}")
    print(f"End position: {result['end']}")


Question: Who discovered H. pylori and what was the recognition for this work?
Context: H. pylori was first identified in 1982 by Barry Marshall and Robin Warren, who later won the Nobel Prize for this discovery.
Answer: 1982 by Barry Marshall and Robin Warren,
Score: 0.003282047575339675
Start position: 34
End position: 74
