In [1]:
import tempfile

import numpy as np
import torch
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer

ModuleNotFoundError: No module named 'datasets'

In [None]:
from datasets import load_dataset

seed = 123
dataset = load_dataset('KaungHtetCho/MedicalQA')

# Downsample first: only the 0.1% "test" slice of the original train split is
# kept as the working set (despite its name, train_set is that whole subsample).
reduce_dataset_split = dataset["train"].train_test_split(test_size=0.001, seed=seed)
train_set = reduce_dataset_split['test']

# Split the working set into 90% training data and a 10% held-out portion.
train_test_split = train_set.train_test_split(test_size=0.1, seed=seed)

# Halve the held-out portion (50% validation, 50% test).
val_test_split = train_test_split['test'].train_test_split(test_size=0.5, seed=seed)
validation_set, test_set = val_test_split['train'], val_test_split['test']

# Store the three final splits back on the dataset object.
dataset["train"], dataset["validation"], dataset["test"] = (
    train_test_split['train'],
    validation_set,
    test_set,
)

In [None]:
def concatenate_utterances(example):
    """Merge one patient/doctor exchange into a single 'dialog' string.

    Args:
        example: Mapping for one dataset row. Must provide string values under
            'Patient' and 'Doctor'; 'Description' is optional.

    Returns:
        A new dict with every field of ``example`` except 'Description',
        'Patient' and 'Doctor', plus 'dialog' set to the patient text, a single
        space, and the doctor text.

    Raises:
        KeyError: if 'Patient' or 'Doctor' is missing.
    """
    source_fields = ('Description', 'Patient', 'Doctor')
    # Build a fresh dict instead of deleting keys from the caller's mapping, so
    # the input row is left untouched and a missing 'Description' is harmless.
    merged = {key: value for key, value in example.items() if key not in source_fields}
    merged['dialog'] = example['Patient'] + " " + example['Doctor']
    return merged

# Build the 'dialog' column row by row and drop the three raw source columns.
dataset = dataset.map(
    concatenate_utterances,
    remove_columns=['Description', 'Patient', 'Doctor'],
)

In [None]:
# Display the dataset object to inspect it after preprocessing.
dataset

In [None]:
# Load the pretrained DialoGPT-small weights and the matching tokenizer
model = GPT2LMHeadModel.from_pretrained('microsoft/DialoGPT-small')
tokenizer = GPT2Tokenizer.from_pretrained('microsoft/DialoGPT-small')

# Reuse the end-of-sequence token as the padding token; the encoding step pads
# every example to a fixed length and therefore needs a pad token to be set.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Encode the dataset
def encode(examples):
    """Tokenize the 'dialog' text and build language-modelling labels.

    Each dialog is truncated/padded to 128 tokens. Labels are a copy of the
    input ids in which every padding position is replaced by -100, the index
    the model's loss ignores. The attention mask (not the pad token id) is used
    to find padding because the pad token is the same as the EOS token.

    Args:
        examples: Mapping with a 'dialog' entry that is either a single string
            or a list of strings (batched ``map``).

    Returns:
        The tokenizer output with an added 'labels' entry shaped like
        'input_ids'.
    """
    ignore_index = -100

    def mask_padding(token_ids, mask):
        # Keep real tokens, blank out positions the attention mask marks as padding.
        return [tok if keep else ignore_index for tok, keep in zip(token_ids, mask)]

    encoded = tokenizer(
        examples['dialog'],
        truncation=True,
        padding='max_length',
        max_length=128,
        return_attention_mask=True,
    )
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']

    if len(input_ids) > 0 and isinstance(input_ids[0], (list, tuple)):
        # Batched input: one id sequence per dialog.
        encoded['labels'] = [
            mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)
        ]
    else:
        # Single example: a flat sequence of ids.
        encoded['labels'] = mask_padding(input_ids, attention_mask)
    return encoded

# Run encode over the dataset in batched mode; `dataset` itself is left bound to
# the un-tokenized data and the result is kept under a separate name.
encoded_dataset = dataset.map(encode, batched=True)

In [None]:
# Define training arguments
# NOTE(review): warmup_steps=500 may exceed the total number of optimizer steps
# on the heavily subsampled training split — confirm against the dataset size.
training_args = TrainingArguments(
    output_dir=tempfile.mkdtemp(),    # checkpoints go to a fresh temporary directory
    num_train_epochs=1,               # total number of training epochs
    per_device_train_batch_size=2,    # batch size per device during training
    per_device_eval_batch_size=2,     # batch size for evaluation
    warmup_steps=500,                 # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                # strength of weight decay
    logging_dir=None,                 # directory for storing logs
    # fp16 mixed precision needs a GPU; requesting it unconditionally makes
    # TrainingArguments fail on CPU-only machines, so enable it only with CUDA.
    fp16=torch.cuda.is_available(),
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['validation']
)

In [None]:
# Evaluate before fine-tuning
pre_eval_results = trainer.evaluate(encoded_dataset['validation'])

# Get predictions for the validation set before fine-tuning, for at most 10 samples.
# The validation split is a small fraction of an already subsampled dataset, so
# cap the count at its length instead of assuming 10 rows exist.
num_preview_samples = min(10, len(encoded_dataset['validation']))
pre_val_predictions = trainer.predict(
    encoded_dataset['validation'].select(range(num_preview_samples))
)

In [None]:
# Fine-tune the model
# Runs training with the Trainer built earlier (its train split and TrainingArguments).
trainer.train()