In [None]:
import torch

# Report whether PyTorch can see a CUDA device.
cuda_ready = torch.cuda.is_available()
print(cuda_ready)

In [None]:
!pip install  transformers datasets accelerate huggingface_hub

In [None]:
import transformers

# Show which transformers release this run is using.
hf_version = transformers.__version__
print(hf_version)

In [None]:
import os

# Opt out of Weights & Biases through the environment; this is set before the
# Hugging Face libraries are imported below.
os.environ.update(WANDB_DISABLED="true")

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
)
from datasets import load_dataset


In [None]:
import pandas as pd

# Location of the question/answer parquet file (read through the hf:// filesystem).
MEDQUAD_PARQUET = "hf://datasets/AnonymousSub/MedQuAD_47441_Question_Answer_Pairs/data/train-00000-of-00001-4401d00b2bdd1863.parquet"

df = pd.read_parquet(MEDQUAD_PARQUET)

In [None]:
# DataFrame.info() writes its summary to stdout and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df.info()




In [None]:
def clean_qa_pairs(raw):
    """Build a cleaned prompt/response frame from the raw question/answer columns.

    Args:
        raw: DataFrame with "Questions" and "Answers" text columns.

    Returns:
        A new DataFrame with string columns "prompt" and "response" and a fresh
        0..n-1 index. Rows whose question or answer is missing, empty, or the
        literal text "none" (any casing) are dropped. ``raw`` is not modified.
    """
    prompt = (
        raw["Questions"]
        .str.strip()
        # Collapse "is (are)" to "is" first; blindly replacing "(are)" with "is"
        # would turn "What is (are) X" into "What is is X".
        # NOTE(review): assumes questions are phrased "What is (are) X ?" - confirm against the data.
        .str.replace(r"\bis\s*\(are\)", "is", regex=True)
        # Any other stand-alone "(are)" keeps the previous normalization.
        .str.replace(r"\(are\)", "is", regex=True)
    )
    response = raw["Answers"].str.strip()

    # assign() returns a copy, so later filtering never writes into a view of `raw`.
    pairs = raw.assign(prompt=prompt, response=response)[["prompt", "response"]]

    # Drop real NaN values first so the string comparisons below see text only.
    pairs = pairs.dropna(subset=["prompt", "response"])

    # Drop rows that are empty or contain the placeholder string "none".
    for column in ("prompt", "response"):
        text = pairs[column]
        pairs = pairs[(text.str.lower() != "none") & (text.str.strip() != "")]

    # Reset the index so the gaps left by filtering do not leak downstream.
    return pairs.astype(str).reset_index(drop=True)


final_df = clean_qa_pairs(df)

# Confirm
print(final_df.sample(3))
print(f"\n✅ Cleaned dataset ready with {len(final_df)} samples.")




In [None]:
from datasets import Dataset

# preserve_index=False: the cleaned frame has had rows filtered out, and a
# non-default pandas index would otherwise be stored as an extra
# "__index_level_0__" column in the Dataset.
dff = Dataset.from_pandas(final_df, preserve_index=False)
print(dff[0])

In [None]:
from transformers import AutoTokenizer

base_checkpoint = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def tokenize_function(examples, max_length=512):
    """Tokenize a batch of prompt/response pairs for causal-LM fine-tuning.

    Args:
        examples: Batch mapping with parallel "prompt" and "response" lists of
            strings (the shape ``Dataset.map(..., batched=True)`` passes in).
        max_length: Sequence length every example is truncated/padded to.

    Returns:
        The tokenizer output with an added "labels" entry. Labels equal
        ``input_ids`` on real tokens and -100 on padding positions.
    """
    # Append one real end-of-sequence marker so the model still sees where an
    # answer stops once padding no longer contributes to the loss.
    eos = tokenizer.eos_token or ""
    full_texts = [
        f"{question} {answer}{eos}"
        for question, answer in zip(examples["prompt"], examples["response"])
    ]

    # Tokenize the whole batch in one call.
    tokenized = tokenizer(
        full_texts,
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    # The pad token is the EOS token here, so copying input_ids verbatim would
    # train on every padding position. -100 is the index the loss ignores.
    tokenized["labels"] = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized


In [None]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")

# Size the token embeddings to the tokenizer's vocabulary, then tell the model
# config which token id is used for padding.
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
import torch
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./gpt2-medical-finetuned",       # Output directory for checkpoints
    eval_strategy="no",                         # No evaluation, so eval_steps is left unset
    per_device_train_batch_size=4,              # Batch size per device
    num_train_epochs=1,                         # Number of training epochs
    save_steps=500,                             # Save a checkpoint every 500 steps
    logging_steps=100,                          # Log every 100 steps
    warmup_steps=100,                           # Learning rate warmup
    weight_decay=0.01,                          # Regularization
    save_total_limit=2,                         # Max saved checkpoints
    logging_dir="./logs",                       # Logging directory
    fp16=torch.cuda.is_available(),             # Mixed precision only when a CUDA GPU is present
    report_to="none"                            # Disable external reporting
)

# Tokenize once up front. The raw text columns are dropped so only the tensors
# the model consumes are kept in the training set.
tokenized_data = dff.map(
    tokenize_function,
    batched=True,
    remove_columns=dff.column_names,
)





In [None]:
import inspect

from transformers import Trainer

# Newer transformers releases accept the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
# Pick whichever the installed Trainer supports, since the install is unpinned.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    **{_tokenizer_kwarg: tokenizer},
)

In [None]:
# Run fine-tuning with the Trainer built from `training_args` and `tokenized_data`.
# Left as a bare expression so the call's return value is shown as the cell output.
trainer.train()

In [None]:
# Write the model to the local directory 'gpt2-medical-finetuned1'.
model.save_pretrained('gpt2-medical-finetuned1')

In [None]:
# Save the tokenizer into the same directory as the model so both can be
# loaded back from that one path.
tokenizer.save_pretrained("gpt2-medical-finetuned1")

In [None]:
from transformers import pipeline

# Build a text-generation pipeline from the saved fine-tuned directory.
finetuned_dir = "gpt2-medical-finetuned1"
medical_bot = pipeline(
    "text-generation",
    model=finetuned_dir,
    tokenizer=finetuned_dir,
    pad_token_id=50256,  # GPT-2's eos_token_id
)

# Sampling settings used for the smoke test below.
sampling_options = dict(
    max_new_tokens=100,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.7,
)

# Try a single prompt and print the first generation.
prompt = "What is the best treatment for diabetes?"
generations = medical_bot(prompt, **sampling_options)
response = generations[0]['generated_text']
print("🔍 Model Response:\n", response)

**🧠 1. Load Fine-Tuned Model**

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Reload the tokenizer and the model from the saved checkpoint directory.
checkpoint_dir = 'gpt2-medical-finetuned1'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForCausalLM.from_pretrained(checkpoint_dir)

# Test function
def test_model(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Run one prompt through test_model and print what comes back.
prompt = "What is the best treatment for diabetes?"
generated_text = test_model(prompt)
print("🔍 Model Response:\n", generated_text)

**2. Test Function** (`test_model` is defined in the cell above)



**3. Run the Test (Provide Your Own Prompts)**

In [None]:
# Prompts to run through the fine-tuned model, one at a time.
test_prompts = [
    "What is the first sign of Hepatitis B?",
    "How can I manage high blood pressure?",
    "Symptoms of type 2 diabetes?",
    "Is Hepatitis B contagious?",
]

for question in test_prompts:
    print(f"\n🧪 Prompt: {question}")
    answer = test_model(question)
    print("🔍 Response:", answer)


In [None]:
# Shell command: recursively (-r) add the gpt2-medical-finetuned1 directory to model.zip.
!zip -r model.zip gpt2-medical-finetuned1

In [None]:
# NOTE(review): this cell only evaluates a literal dict (shown as the cell
# output); nothing is assigned and no later cell reads it. It looks like
# notebook-level "metadata.widgets" JSON pasted into a code cell - TODO confirm
# and remove.
{
  "metadata": {
    "widgets": {
      "state": {}
    }
  }
}

In [None]:
# The archive path below lives under /content/drive, so presumably Google Drive
# has to be mounted first. In Colab that is:
#   from google.colab import drive
#   drive.mount('/content/drive')

import os
import zipfile

zip_path = "/content/drive/MyDrive/finetuned models/model.zip"
extract_path = "/content/model"

# Unpack every member of the archive into extract_path.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

# Show what was extracted (kept as the last expression so the cell displays it).
os.listdir(extract_path)

In [None]:
from huggingface_hub import upload_folder

# Set the target repository here:
repo_id = "samirk10/fine-tune"  # replace with your actual repo ID
local_model_path = "model/gpt2-medical-finetuned1"  # your fine-tuned model folder

upload_options = dict(
    repo_id=repo_id,
    folder_path=local_model_path,
    path_in_repo=".",  # upload into the repository root
    commit_message="Pushing fine-tuned GPT-2 medical model 🚀",
)

# Kept as the last expression so the call's return value is displayed.
upload_folder(**upload_options)
