In [1]:
import shutil

# Copy the dataset from the Kaggle input directory into the working directory;
# the rest of the notebook reads the copy through `dataset_path`.
original_dataset_path = "/kaggle/input/medqa-senior/dataset.txt"
destination_path = "/kaggle/working/dataset_copy.txt"

# shutil.copyfile returns the destination it wrote to, so bind it directly.
dataset_path = shutil.copyfile(original_dataset_path, destination_path)

In [36]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments


# Base checkpoint: the tokenizer and the initial weights both come from it.
model_name = "gpt2-medium"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# mlm=False: no masked-language-model objective is requested from the collator.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Training examples are read from the copied dataset file with block_size=128.
# NOTE(review): TextDataset appears to be deprecated in recent transformers
# releases -- confirm against the installed version.
dataset = TextDataset(file_path=dataset_path, tokenizer=tokenizer, block_size=128)

# Checkpoints are written under ./output; any existing contents may be overwritten.
training_args = TrainingArguments(
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_total_limit=2,
    save_steps=10_000,
    overwrite_output_dir=True,
    output_dir="./output",
)

trainer = Trainer(
    train_dataset=dataset,
    data_collator=data_collator,
    args=training_args,
    model=model,
)

trainer.train()

# Only the model is saved here; the tokenizer is not written to this directory.
model.save_pretrained("./fine_tuned_model")



dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


In [58]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import textwrap
import re

# Load the fine-tuned GPT-2 weights from the directory the training cell saved to.
model_path = "./fine_tuned_model"  # Path to your fine-tuned model
# The tokenizer is loaded from the base checkpoint by name, not from model_path.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
# generate_response() passes tokenizer.pad_token_id to generate(); reuse the EOS
# token when the tokenizer does not define a pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_path)
# Matches the rest of a line up to a '?' that is followed by a newline or a
# space; answer() splits on it to drop the echoed question. The previous
# character class `[\n\n| ]` also contained a literal '|', so "?|" was treated
# as a question boundary as well.
pattern = r".*\?[\n ]"

# Define a function to generate responses to questions
def generate_response(question, max_length=100):
    """Generate text for ``question`` with the module-level model and tokenizer.

    Args:
        question: Prompt text; it is encoded as-is and handed to ``model.generate``.
        max_length: Forwarded unchanged as ``max_length`` to ``model.generate``.

    Returns:
        The first returned sequence decoded with ``skip_special_tokens=True``.
    """
    prompt_ids = tokenizer.encode(question, return_tensors="pt")
    generation_kwargs = {
        "max_length": max_length,
        "num_return_sequences": 1,
        "pad_token_id": tokenizer.pad_token_id,
    }
    sequences = model.generate(prompt_ids, **generation_kwargs)
    # A single sequence is requested, so only index 0 is decoded.
    return tokenizer.decode(sequences[0], skip_special_tokens=True)

def _extract_answer(response):
    """Strip the echoed question and any trailing partial sentence from ``response``.

    The text is first cut after its last '.', then split on the module-level
    ``pattern``; the piece following the first match is returned, stripped.

    Fallbacks (both cases previously raised ``IndexError`` in ``answer``):
      * no '.' anywhere in the text -> the whole text is kept instead of '';
      * ``pattern`` does not match  -> the truncated text itself is returned.
    """
    last_period = response.rfind('.')
    # rfind returns -1 when there is no period; slicing to [:0] would discard everything.
    complete = response[:last_period + 1] if last_period != -1 else response
    pieces = re.split(pattern, complete)
    # pieces[0] is whatever precedes the first match; pieces[1] is what follows it.
    body = pieces[1] if len(pieces) > 1 else complete
    return body.strip()


def answer(question):
    """Generate a response to ``question`` and print it wrapped to 80 columns.

    Args:
        question: Prompt passed to ``generate_response``.

    Returns:
        None. The text is printed with a ``"Response:"`` prefix.
    """
    response = generate_response(question)
    print("Response:", textwrap.fill(_extract_answer(response), width=80))

In [42]:
# Example usage: answer() generates a response for the question and prints it
# wrapped to 80 columns with a "Response:" prefix.
question = "What is diabetes?"
answer(question)

Response: Diabetes is a chronic disease that affects the blood sugar level. It affects
about one in every 100 people.  People with diabetes have trouble getting enough
blood sugar (sugar) to support normal blood pressure and heart function.


In [39]:
# Follow-up prompt on the same topic; rebinds the module-level `question`.
question = "How does diabetes affect blood sugar levels?"
answer(question)

Response: Diabetes affects the body's ability to use sugar as fuel. When blood sugar
levels are too high, the body cannot use glucose properly.


In [44]:
# Risk-oriented prompt; rebinds the module-level `question`.
question = "What are the risks of Diabetes?"
answer(question)

Response: Diabetes is a chronic disease that affects the body's ability to use glucose. It
affects the body's ability to use insulin, the hormone that regulates blood
sugar. Insulin helps the body use glucose to fuel the body's cells.  Diabetes
affects about one in every 100 people.  People with diabetes have a higher risk
of developing other types of heart disease, stroke, kidney disease, and certain
cancers.


In [51]:
# Definition-style prompt on a different condition; rebinds `question`.
question = "What is Alzheimer's disease?"
answer(question)

Response: Alzheimer's disease is a progressive brain disease that affects the brain and
body. It is caused by a buildup of plaques and tangles in the brain. The brain
is made up of nerve cells called neurons. The brain is divided into two parts:
the frontal lobe, which controls movement, and the temporal lobe, which controls
thinking and memory. Alzheimer's affects the brain's wiring and connections
between these two parts.


In [60]:
# Prints the raw generated text (prompt included) straight from
# generate_response() rather than going through answer().
question = "What are the common symptoms of Alzheimer's disease?"
print(generate_response(question))

What are the common symptoms of Alzheimer's disease?

Symptoms of Alzheimer's disease include:

Memory loss

Difficulty concentrating

Difficulty thinking

Difficulty remembering things

Difficulty thinking clearly

Difficulty remembering things

Difficulty remembering things

Difficulty thinking clearly

Difficulty thinking clearly

Difficulty thinking clearly

Difficulty thinking clearly

Difficulty thinking clearly

Difficulty thinking clearly

Difficulty thinking clearly
