<a href="https://colab.research.google.com/github/mobarakol/tutorial_notebooks/blob/main/LLM_GPT2_QA_Finetune_MedQuad.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MedQuAD NLP Dataset
Kaggle: https://www.kaggle.com/datasets/dibyasankhapal/medquad-nlp-dataset


In [1]:
!gdown 1CTpMAMLcBfTLXyqYGkUkEfuN4C1aQ1BK
!unzip -q medquad.zip

Downloading...
From: https://drive.google.com/uc?id=1CTpMAMLcBfTLXyqYGkUkEfuN4C1aQ1BK
To: /content/medquad.zip
100% 5.09M/5.09M [00:00<00:00, 11.4MB/s]


In [2]:
!pip -q install datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Cleaning dataset

In [3]:
import numpy as np
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from transformers import pipeline
import torch
from datasets import load_dataset
import os

# Load the MedQuAD CSV from its relative path
url = 'medquad/medquad.csv'
df = pd.read_csv(url)
print(df.head(2))  # Peek at the raw rows

# Normalise every string (object-dtype) column: lowercase the text and
# collapse each run of whitespace into a single space
text_columns = df.select_dtypes(include=['object']).columns
for col in text_columns:
    lowercased = df[col].str.lower()
    df[col] = lowercased.str.split().str.join(' ')

# Same two rows after normalisation
print(df.head(2))

# --- Missing values ---------------------------------------------------------
# Rows with a null in any column are set aside for the audit, then dropped.
missing_values = df[df.isnull().any(axis=1)]
df = df.dropna()

# --- Duplicates -------------------------------------------------------------
# A single mask drives both the audit and the drop so the two can never
# disagree. keep='first' marks only the later repeats of each
# (question, answer) pair, i.e. exactly the rows that are removed; the first
# occurrence stays in df and therefore must not appear in the audit.
duplicate_mask = df.duplicated(subset=['question', 'answer'], keep='first')
duplicates = df[duplicate_mask]
df = df[~duplicate_mask]

# Everything that was removed, in removal order (missing rows, then duplicates)
removed_records = pd.concat([missing_values, duplicates])

# Save the removed records to a CSV file for auditing
if not removed_records.empty:
    removed_records.to_csv('removed_records_audit.csv', index=False)

                   question  \
0  What is (are) Glaucoma ?   
1    What causes Glaucoma ?   

                                              answer           source  \
0  Glaucoma is a group of diseases that can damag...  NIHSeniorHealth   
1  Nearly 2.7 million people have glaucoma, a lea...  NIHSeniorHealth   

  focus_area  
0   Glaucoma  
1   Glaucoma  
                   question  \
0  what is (are) glaucoma ?   
1    what causes glaucoma ?   

                                              answer           source  \
0  glaucoma is a group of diseases that can damag...  nihseniorhealth   
1  nearly 2.7 million people have glaucoma, a lea...  nihseniorhealth   

  focus_area  
0   glaucoma  
1   glaucoma  


Preparing Dataloader:

In [4]:
# Fetch the 100 categories (focus_area) with the most records
top_100_categories = df['focus_area'].value_counts().nlargest(100).index.tolist()

# Per-category samples are collected in lists and concatenated once at the
# end; growing a DataFrame with pd.concat inside the loop re-copies all
# previously collected rows on every iteration.
train_parts = []
val_parts = []

for category in top_100_categories:
    # All rows of this category (filtered once, reused for both samples)
    category_rows = df[df['focus_area'] == category]

    # Select 4 records per category for training
    train_samples = category_rows.sample(n=4, random_state=42)

    # Select 1 record per category for validation (note: excluding training samples)
    remaining_rows = category_rows[~category_rows.index.isin(train_samples.index)]
    val_samples = remaining_rows.sample(n=1, random_state=42)

    train_parts.append(train_samples)
    val_parts.append(val_samples)

# pd.concat raises on an empty list, so fall back to an empty frame
train_data = pd.concat(train_parts) if train_parts else pd.DataFrame()
val_data = pd.concat(val_parts) if val_parts else pd.DataFrame()

print(f"Training set size: {train_data.shape[0]}")  # Should be 400
print(f"Validation set size: {val_data.shape[0]}")  # Should be 100

# Combine Questions and Answers for train and val data
## sequence = '<question>' + question + '<answer>' + answer + '<end>'

# 5.1 Combine Question and Answer for training and validation
def combine_text(df):
    """Build one training string per row of a question/answer frame.

    Each row becomes '<question>' + question + '<answer>' + answer + '<end>',
    with no extra whitespace around the tags.

    Args:
        df: DataFrame with 'question' and 'answer' columns.

    Returns:
        The result of applying the row formatter along axis=1 (one formatted
        string per row, on the same index as ``df``).
    """
    def _format_row(row):
        return "<question>{}<answer>{}<end>".format(row['question'], row['answer'])

    return df.apply(_format_row, axis=1)

# Turn every sampled row into a single '<question>…<answer>…<end>' string
train_sequences = combine_text(train_data)
val_sequences = combine_text(val_data)

print(train_sequences.head())
print(val_sequences.head())

# Train and validation text for all Q&As:
# join the sequences with '\n' so each file holds one example per line
train_text = '\n'.join(train_sequences)
val_text = '\n'.join(val_sequences)

# Save the training and validation strings as separate text files.
# The encoding is given explicitly: without it open() falls back to the
# platform's locale encoding, so non-ASCII characters in the answers could be
# written differently (or fail to encode) depending on the machine.
with open('train_data.txt', 'w', encoding='utf-8') as f:
    f.write(train_text)

with open('val_data.txt', 'w', encoding='utf-8') as f:
    f.write(val_text)

Training set size: 400
Validation set size: 100
380     <question>what is (are) breast cancer ?<answer...
385     <question>what are the treatments for breast c...
775     <question>what are the treatments for breast c...
393     <question>who is at risk for breast cancer? ?<...
1166    <question>what is (are) prostate cancer ?<answ...
dtype: object
402      <question>what is (are) breast cancer ?<answer...
1309     <question>how to prevent prostate cancer ?<ans...
9262     <question>what is (are) stroke ?<answer>a stro...
1441     <question>what are the treatments for skin can...
10262    <question>what are the genetic changes related...
dtype: object


In [5]:
import numpy as np
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from transformers import pipeline
import torch
from datasets import load_dataset
import os

# GPT-2 tokenizer, extended with the markers used in the MedQuAD sequences:
# '<question>' opens an example, '<answer>' separates question from answer,
# '<end>' closes it, and '<pad>' fills sequences up to a fixed length.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
special_tokens = {
    'pad_token': '<pad>',
    'bos_token': '<question>',
    'eos_token': '<end>',
    'sep_token': '<answer>',
}
tokenizer.add_special_tokens(special_tokens)

# Pre-trained GPT-2 with its language-modelling head (use_cache passed as False at load time)
model = GPT2LMHeadModel.from_pretrained('gpt2', use_cache = False)

def tokenize_function(examples):
    """Tokenize a batch of examples with the module-level ``tokenizer``.

    Args:
        examples: Mapping with a 'text' entry holding the raw sequences.

    Returns:
        The tokenizer output for the batch, with every sequence truncated or
        padded to a fixed length of 1024 tokens.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding='max_length', max_length=1024)  # alternative: max_length=512
    return encoded

# Build the train/validation splits from the text files written earlier
data_files = {'train': 'train_data.txt', 'validation': 'val_data.txt'}
dataset = load_dataset("text", data_files=data_files)

# Return PyTorch tensors when rows are accessed; this helps in the training process.
dataset.set_format("torch")

# Tokenize in batches across 4 worker processes; the raw 'text' column is
# dropped so only the tokenizer outputs remain.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=['text'],
)

# Optional: report how many records each split holds.
for split_name in ('train', 'validation'):
    print(f"printing {split_name}. Total number of records are {len(dataset[split_name])}")

# Create a DataCollator object for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer = tokenizer,
    mlm=False,  # No masked language modeling for GPT-2.
    return_tensors="pt"
)

# Optional: show what the collator produces for the first few training samples.
# min() keeps this safe when the split holds fewer than 5 records.
num_samples = min(5, len(tokenized_datasets["train"]))
print(num_samples)

for i in range(num_samples):
    # Collate one sample at a time so each printed batch has a leading dimension of 1
    batch = data_collator([tokenized_datasets["train"][i]])
    print(f"\nData Collator Output for sample {i}:{batch}")
    for key, value in batch.items():
        print(f"{key}: shape {value.shape}, dtype {value.dtype}")

# Decode a sample to check content. Skipped when the split is empty, since
# indexing row 0 would fail in that case.
if num_samples:
    print("\nDecoded content of the first sample:")
    decoded = tokenizer.decode(tokenized_datasets['train'][0]['input_ids'])
    # Only the first 500 characters: the full sequence is padded to the fixed
    # tokenization length and would otherwise end in a long run of pad tokens.
    print(decoded[:500])


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map (num_proc=4):   0%|          | 0/400 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

printing train. Total number of records are 400
printing validation. Total number of records are 100
5

Data Collator Output for sample 0:{'input_ids': tensor([[50258, 10919,   318,  ..., 50257, 50257, 50257]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[50258, 10919,   318,  ...,  -100,  -100,  -100]])}
input_ids: shape torch.Size([1, 1024]), dtype torch.int64
attention_mask: shape torch.Size([1, 1024]), dtype torch.int64
labels: shape torch.Size([1, 1024]), dtype torch.int64

Data Collator Output for sample 1:{'input_ids': tensor([[50258, 10919,   389,  ..., 50257, 50257, 50257]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[50258, 10919,   389,  ...,  -100,  -100,  -100]])}
input_ids: shape torch.Size([1, 1024]), dtype torch.int64
attention_mask: shape torch.Size([1, 1024]), dtype torch.int64
labels: shape torch.Size([1, 1024]), dtype torch.int64

Data Collator Output for sample 2:{'input_ids': tensor([[50258, 10919,   389, 

In [None]:
# Moves the model to GPU (if available)
# for faster training and inference by leveraging parallel
# computation on the GPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model.to(device)

# Adjusts the model's token embedding matrix to include new tokens
# added to the tokenizer, such as special tokens.
model.resize_token_embeddings(len(tokenizer))

# Specify training arguments
training_args = TrainingArguments(
    output_dir="/content/gpt_finetuned_MedQuAD",  # Output directory
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",  # Save checkpoint at the end of each epoch to match eval strategy
    #learning_rate= 5e-5, #0.001,
    warmup_steps=500,
    per_device_train_batch_size=2,  # Adjust based on available GPU memory
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",  # Directory for storing logs
    #load_best_model_at_end=True,  # Load the best model at the end of training
    #logging_steps=500,  # Log every 500 steps
    #save_steps=500,  # Save checkpoint every 500 steps
    #gradient_accumulation_steps=4,  # Simulate larger batch size
    # Mixed precision is only enabled when a CUDA device is present, so the
    # CPU fallback chosen above stays usable instead of failing on fp16.
    fp16=use_cuda
)

# Train the GPT-2 model using the training arguments above; the collator
# builds the language-modeling labels for each batch.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator
)

trainer.train()  # Start training

# Save the resulting trained model and tokenizer side by side in the output
# directory so both can be reloaded together.
model.save_pretrained(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)
print("Model and tokenizer saved.")

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
