In [1]:
import torch
print(torch.cuda.is_available())


True


In [None]:
!pip install  transformers datasets accelerate huggingface_hub

In [7]:
import transformers
print(transformers.__version__)

4.53.1


In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset


In [2]:
import pandas as pd

df = pd.read_parquet("hf://datasets/AnonymousSub/MedQuAD_47441_Question_Answer_Pairs/data/train-00000-of-00001-4401d00b2bdd1863.parquet")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [14]:
print(df.info())




<class 'pandas.core.frame.DataFrame'>
Index: 16407 entries, 0 to 47440
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Questions  16407 non-null  object
 1   Answers    16407 non-null  object
 2   prompt     16407 non-null  object
 3   response   16407 non-null  object
dtypes: object(4)
memory usage: 640.9+ KB
None


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47441 entries, 0 to 47440
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Questions  47441 non-null  object
 1   Answers    16407 non-null  object
dtypes: object(2)
memory usage: 741.4+ KB


In [20]:
# Normalize weird phrasing
df["prompt"] = df["prompt"].str.replace(r"\(are\)", "is", regex=True)

# Step 1: Select and rename
df["prompt"] = df["Questions"].str.strip()
df["response"] = df["Answers"].str.strip()

# ✅ Step 2: Drop real NaN first
df = df.dropna(subset=["prompt", "response"])

# ✅ Step 3: Drop if still any empty or 'None' as string
df = df[(df["prompt"].str.lower() != "none") & (df["response"].str.lower() != "none")]
df = df[(df["prompt"].str.strip() != "") & (df["response"].str.strip() != "")]

# ✅ Step 4: Now convert to string for safety
df["prompt"] = df["prompt"].astype(str)
df["response"] = df["response"].astype(str)

# ✅ Final format
final_df = df[["prompt", "response"]]

# ✅ Confirm
print(final_df.sample(3))
print(f"\n✅ Cleaned dataset ready with {len(final_df)} samples.")




                                                  prompt  \
6330   What are the genetic changes related to famili...   
30879  What are the symptoms of Blue rubber bleb nevu...   
6824   How many people are affected by Alpers-Huttenl...   

                                                response  
6330   Mutations in the PNKD gene cause familial paro...  
30879  What are the signs and symptoms of Blue rubber...  
6824   The prevalence of Alpers-Huttenlocher syndrome...  

✅ Cleaned dataset ready with 16407 samples.


In [21]:
final_df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 16407 entries, 0 to 47440
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   prompt    16407 non-null  object
 1   response  16407 non-null  object
dtypes: object(2)
memory usage: 384.5+ KB


In [22]:
from datasets import Dataset
dff = Dataset.from_pandas(final_df)
print(dff[0])

{'prompt': 'What is (are) Hepatitis B: What Asian and Pacific Islander Americans Need to Know ?', 'response': 'Hepatitis B is a liver disease spread through contact with blood, semen, or other body fluids from a person infected with the hepatitis B virus. The disease is most commonly spread from an infected mother to her infant at birth. Hepatitis B is also spread through sex, wound-to-wound contact, and contact with items that may have blood on them, such as shaving razors, toothbrushes, syringes, and tattoo and body piercing needles.\n                \nHepatitis B is not spread through casual contact such as shaking hands or hugging; nor is it spread by sharing food or beverages, by sneezing and coughing, or through breastfeeding.', '__index_level_0__': 0}


In [23]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [24]:
def tokenize_function(examples):
    # Combine prompt + response for each example in batch
    full_texts = [p + " " + r for p, r in zip(examples["prompt"], examples["response"])]

    # Tokenize all at once (batch)
    tokenized = tokenizer(
        full_texts,
        truncation=True,
        padding="max_length",
        max_length=512
    )

    # GPT-style: labels = input_ids
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized


In [25]:
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("gpt2")

model.resize_token_embeddings(len(tokenizer))  # Token embeddings adjust kiye
model.config.pad_token_id = tokenizer.pad_token_id

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [26]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./gpt2-medical-finetuned",       # Output directory
    eval_strategy="no",                      # No evaluation
    per_device_train_batch_size=4,              # Batch size per device
    num_train_epochs=1,                         # Number of training epochs
    save_steps=500,                             # Save model every 500 steps
    # eval_steps=500,                           # Commented, kyunki eval_strategy="no"
    logging_steps=100,                          # Log every 100 steps
    warmup_steps=100,                           # Learning rate warmup
    weight_decay=0.01,                          # Regularization
    save_total_limit=2,                         # Max saved checkpoints
    logging_dir="./logs",                       # Logging directory
    fp16=True,                                  # Use mixed precision (if GPU supports)
    report_to="none"                            # Disable external reporting
)

tokenized_data = dff.map(tokenize_function, batched=True)





Map:   0%|          | 0/16407 [00:00<?, ? examples/s]

In [27]:
from transformers import Trainer
trainer = Trainer(
    model = model,
    args= training_args,
    train_dataset=tokenized_data,
    tokenizer = tokenizer
)

  trainer = Trainer(


In [28]:
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,2.5038
200,1.0401
300,0.9559
400,0.8387
500,0.9476
600,0.8785
700,0.8836
800,0.8682
900,0.8248
1000,0.8326


TrainOutput(global_step=4102, training_loss=0.852257413320109, metrics={'train_runtime': 1226.3653, 'train_samples_per_second': 13.379, 'train_steps_per_second': 3.345, 'total_flos': 4287018369024000.0, 'train_loss': 0.852257413320109, 'epoch': 1.0})

In [2]:
model.save_pretrained('gpt2-medical-finetuned1')

NameError: name 'model' is not defined

In [3]:
tokenizer.save_pretrained("gpt2-medical-finetuned1")

NameError: name 'tokenizer' is not defined

In [35]:
from transformers import pipeline

# Load fine-tuned model
medical_bot = pipeline(
    "text-generation",
    model="gpt2-medical-finetuned1",
    tokenizer="gpt2-medical-finetuned1",
    pad_token_id=50256  # GPT-2 ka eos_token_id
)

# Test with prompt
prompt = "What is the best treatment for diabetes?"
response = medical_bot(
    prompt,
    max_new_tokens=100,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.7
)[0]['generated_text']
print("🔍 Model Response:\n", response)

Device set to use cuda:0


🔍 Model Response:
 What is the best treatment for diabetes? The goals of treatment for diabetes include
               
- making blood glucose levels safe and regular  - increasing the number of insulin-producing cells in the body  - lowering the risk of developing diabetes
               
For example, a person may want to take blood glucose monitoring tests to monitor blood sugar levels. These tests can help determine whether a person has diabetes.


**🧠 1. Load Fine-Tuned Model**

In [33]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2-medical-finetuned1')
model = AutoModelForCausalLM.from_pretrained('gpt2-medical-finetuned1')

# Test function
def test_model(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test with prompt
prompt = "What is the best treatment for diabetes?"
print("🔍 Model Response:\n", test_model(prompt))

🔍 Model Response:
 What is the best treatment for diabetes? A treatment that focuses on the kidneys is currently being tested in people with diabetes who are at high risk for kidney failure.
               
Diabetes is a condition that can result from a variety of causes. Some cases of diabetes are caused by mutations in the ALDH gene, while other cases are caused by new mutations in the gene and occur in people with no history of the disorder in their family. Diabetes is caused by mutations in


**2. Define Test Function**



**3. Run Test (Give Prompt!)**

In [34]:
test_prompts = [
    "What is the first sign of Hepatitis B?",
    "How can I manage high blood pressure?",
    "Symptoms of type 2 diabetes?",
    "Is Hepatitis B contagious?"
]

for p in test_prompts:
    print(f"\n🧪 Prompt: {p}")
    print("🔍 Response:", test_model(p))



🧪 Prompt: What is the first sign of Hepatitis B?
🔍 Response: What is the first sign of Hepatitis B? ? The first sign of hepatitis B is the first sign of symptoms. People who have been exposed to high levels of hepatitis B or another infection can develop liver inflammation. People who have not had hepatitis B or other infections before begin to develop liver inflammation may have liver problems or die from a liver transplant.
               
After initial symptoms, people may develop liver disease and liver failure. These liver problems usually occur in the liver, but

🧪 Prompt: How can I manage high blood pressure?
🔍 Response: How can I manage high blood pressure?
               
Most people who are overweight or obese are at risk for high blood pressure.
               
High blood pressure can also affect your heart and other organs. It can affect your blood pressure, too.
                
High blood pressure can also affect your blood sugar (blood sugar

🧪 Prompt: Symptoms of type 

In [None]:
{
  "metadata": {
    "widgets": {
      "state": {}
    }
  }
}