In [1]:
from datasets import load_dataset, get_dataset_split_names

get_dataset_split_names("curaihealth/medical_questions_pairs")

  from .autonotebook import tqdm as notebook_tqdm


['train']

In [2]:
get_dataset_split_names("ruslanmv/ai-medical-chatbot")

['train']

In [2]:
ds = load_dataset("curaihealth/medical_questions_pairs")

In [None]:
ds = ds["train"].train_test_split(test_size=0.2, seed=42)

In [4]:
ds["train"][0]

{'dr_id': 4,
 'question_1': 'What are the signs of having frostbite?',
 'question_2': 'What exactly is the treatment for frostbite?',
 'label': 0}

In [5]:
ds["train"][:3]

{'dr_id': [4, 2, 2],
 'question_1': ['What are the signs of having frostbite?',
  "I'm due for my birth control shot on the 24 but I've been having uterus pain on the right side of my uterus I'm not sure what it can be?",
  "If I don't have the egg white cervical musus am I still ovualating?"],
 'question_2': ['What exactly is the treatment for frostbite?',
  'After how long of stopping the birth control shots can one get pregnant?',
  'Is it imperative that one has egg white cervical mucous while ovulating?'],
 'label': [0, 0, 1]}

In [6]:
ds

DatasetDict({
    train: Dataset({
        features: ['dr_id', 'question_1', 'question_2', 'label'],
        num_rows: 2438
    })
    test: Dataset({
        features: ['dr_id', 'question_1', 'question_2', 'label'],
        num_rows: 610
    })
})

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [8]:
print(f"The max length for the tokenizer is: {tokenizer.model_max_length}")

The max length for the tokenizer is: 512


In [None]:
tokenizer(
    ds["train"][0]["question_1"], ds["train"][0]["question_2"], return_tensors="pt"
)

{'input_ids': tensor([[  101,  2054,  2024,  1996,  5751,  1997,  2383, 10097, 16313,  2063,
          1029,   102,  2054,  3599,  2003,  1996,  3949,  2005, 10097, 16313,
          2063,  1029,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
def tokenization(example):
    return tokenizer(
        example["question_1"], example["question_2"], return_tensors="pt", padding=True
    )


tokenized_ds = ds.map(tokenization, batched=True)
print(tokenized_ds)

Map: 100%|██████████| 610/610 [00:00<00:00, 5041.63 examples/s]

DatasetDict({
    train: Dataset({
        features: ['dr_id', 'question_1', 'question_2', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2438
    })
    test: Dataset({
        features: ['dr_id', 'question_1', 'question_2', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 610
    })
})





In [None]:
tokenized_ds["train"].features

{'dr_id': Value(dtype='int32', id=None),
 'question_1': Value(dtype='string', id=None),
 'question_2': Value(dtype='string', id=None),
 'label': ClassLabel(names=[0, 1], id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import wandb

wandb.init(project="medical_bot", name="bert_similarity")

[34m[1mwandb[0m: Currently logged in as: [33marz-mikhail[0m ([33marz-mikhail-rtu-mirea[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./model",
    # evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=20,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    report_to="wandb",  # логирование в W&B
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    compute_metrics=lambda p: {
        "accuracy": (p.predictions.argmax(-1) == p.label_ids).mean()
    },
)
trainer.train()



ValueError: You have to specify either input_ids or inputs_embeds