In [None]:
# Mount Google Drive at /content/drive. Later cells read the dataset CSV from
# this mount and write the model checkpoints back to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Downgrade numpy and patch HuggingFace inside current Colab kernel
!pip install --quiet --force-reinstall numpy==1.24.4
!pip uninstall -y tensorflow jax jaxlib thinc pymc blosc2 treescope xarray-einstats


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m98.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.24.4 which is incompatible.
treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.24.4 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1.24.4 which is incompatible.
pymc 5.23.0 requires numpy>=1.25.0, but you have numpy 1.24.4 which is incompatible.
jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.24.4 which is incompatible.
blosc2 3.5.0 requires numpy>=1.26, but you have numpy 1.24.4 which is incompatible.
jax 0.5.2 requires numpy>=1.25, but you have numpy 1.24.4 which is incompatible.
xarray-einstats 0.9.1 requires numpy>=1.25, but you have numpy 1.24.4 which is inco

In [None]:
# ✅ Clean manual setup
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import get_peft_model, LoraConfig, TaskType
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import os


In [None]:
import pandas as pd
import random

# Step 1: read the cleaned clinical-drugs table from the mounted Drive into `df`
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/cleaned_clinical_drugs_dataset.csv"
)

# Step 2: question builders. Each one receives a dataset row and returns the
# question part of a training example.
def _ask_how_it_works(row):
    """Indication-specific question built from the drug name and the EFO term."""
    return f"Can you explain how {row['drug_name']} works for {row['efo_term']}?"


def _ask_mechanism(row):
    """Direct mechanism-of-action question for the drug."""
    return f"What is the mechanism of action of {row['drug_name']}?"


def _ask_chat_style(row):
    """Chat-transcript style question that ends with the 'Bot:' cue."""
    return f"User: What does {row['drug_name']} do?\nBot:"


templates = [_ask_how_it_works, _ask_mechanism, _ask_chat_style]


# Step 3: turn one row into a complete training text.
def format_prompt(row):
    """Build the training text for ``row``.

    One builder is drawn at random from ``templates`` to produce the question;
    the row's ``mechanism_of_action`` is appended after a single space as the
    answer.
    """
    question = random.choice(templates)(row)
    answer = row['mechanism_of_action']
    return f"{question} {answer}"

# Drop rows that miss any field used by the prompt templates *before*
# formatting. An f-string renders a missing value as the literal text "nan",
# so the formatted column is never null and a dropna on "text" afterwards
# would not remove anything.
required_columns = ["drug_name", "efo_term", "mechanism_of_action"]
df = df.dropna(subset=required_columns).reset_index(drop=True)

# Fixed seed so each row gets the same randomly chosen template on every run.
random.seed(42)
df['text'] = df.apply(format_prompt, axis=1)

# NOTE(review): the recorded run prepared 553,992 examples while a later cell
# reports only 850 unique drug names, so many texts are probably duplicates.
# Consider de-duplicating before training - TODO confirm.

# Show status
print(f"✅ Total examples prepared: {len(df)}")


✅ Total examples prepared: 553992


In [None]:
from transformers import AutoModelForCausalLM
from peft import get_peft_model, LoraConfig, TaskType
from transformers import AutoTokenizer

# Tokenizer: pad_token is pointed at EOS so fixed-length padding has a token to use.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token  # required for GPT-2-based models

# Load base model
base_model = AutoModelForCausalLM.from_pretrained("distilgpt2")
# Reusing EOS as the pad token adds no new vocabulary entry, so this call
# leaves the embedding size unchanged. It is kept as a guard in case special
# tokens are added to the tokenizer later.
base_model.resize_token_embeddings(len(tokenizer))

# LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=["c_attn", "c_proj"],
    # The targeted GPT-2 layers are Conv1D modules, which store their weight
    # as (fan_in, fan_out); the PEFT documentation says to set this flag to
    # True for such layers.
    fan_in_fan_out=True,
    task_type=TaskType.CAUSAL_LM
)

# Wrap the base model with the LoRA adapter, report how many parameters are
# trainable, then move the wrapped model to the GPU when one is available.
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()
model = model.to("cuda" if torch.cuda.is_available() else "cpu")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



trainable params: 405,504 || all params: 82,318,080 || trainable%: 0.4926


In [None]:
from torch.utils.data import Dataset, DataLoader

# Custom Dataset class for biomedical chatbot
class DrugChatDataset(Dataset):
    """Turns formatted prompt/answer strings into fixed-length causal-LM examples.

    Args:
        texts: Indexable collection of training strings.
        tokenizer: Callable tokenizer. It is invoked with ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` and must return
            a mapping with ``"input_ids"`` and ``"attention_mask"`` tensors
            that carry a leading batch axis of size 1.
        max_length: Length every example is truncated or padded to.
    """

    # Label value that the cross-entropy loss skips (PyTorch's default ignore_index).
    IGNORE_INDEX = -100

    def __init__(self, texts, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.texts = texts
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for example ``idx``.

        ``labels`` equal ``input_ids`` on real tokens. Padding positions are set
        to ``IGNORE_INDEX`` so they do not contribute to the loss, except the
        first padding position, which stays as a target.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        # Remove only the batch axis; a bare squeeze() would also collapse the
        # sequence axis when max_length is 1.
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        labels = input_ids.clone()
        padding = attention_mask == 0
        if padding.any():
            # Keep the first padding position as a target. When the pad token
            # is the EOS token, this is the one place where the model is
            # taught to end its answer.
            first_pad = int(torch.nonzero(padding)[0])
            padding[first_pad] = False
        # Without this mask the loss is averaged over every padding position too.
        labels[padding] = self.IGNORE_INDEX

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

# Wrap the formatted texts in the dataset and serve them as shuffled mini-batches of 8
train_dataset = DrugChatDataset(texts=df["text"].tolist(), tokenizer=tokenizer)
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
)


In [None]:
import torch.nn as nn
from torch.optim import AdamW
from tqdm import tqdm
import os

# Fine-tune the model and write a checkpoint to Drive after every epoch.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Parameters with requires_grad=False never receive a gradient, so leaving
# them out of the optimizer and of the clipping pass changes no update; it
# only avoids walking the frozen weights on every step.
trainable_params = [p for p in model.parameters() if p.requires_grad]

optimizer = AdamW(trainable_params, lr=2e-4)
epochs = 3  # number of full passes over the training data

save_dir = "/content/drive/MyDrive/drugbot-distilgpt2-lora-checkpoints"
os.makedirs(save_dir, exist_ok=True)

loss_values = []  # average training loss of each completed epoch

print("🚀 Starting Training...\n")
for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    # The description is constant within an epoch, so it is set once here
    # instead of on every step.
    loop = tqdm(train_dataloader, leave=True, desc=f"Epoch {epoch+1}")

    for step, batch in enumerate(loop):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss
        loss.backward()

        # Gradient clipping for stability
        torch.nn.utils.clip_grad_norm_(trainable_params, max_norm=1.0)

        optimizer.step()
        optimizer.zero_grad()

        # Read the scalar once per step; each .item() call copies the value
        # from the device to the host.
        step_loss = loss.item()
        loop.set_postfix(step=step, loss=step_loss)
        total_loss += step_loss

    avg_loss = total_loss / len(train_dataloader)
    loss_values.append(avg_loss)

    print(f"\n✅ Epoch {epoch+1} complete. Average Loss: {avg_loss:.4f}")

    # Save model and tokenizer after each epoch so an interrupted run keeps
    # every epoch that finished.
    checkpoint_path = os.path.join(save_dir, f"epoch{epoch+1}_model")
    model.save_pretrained(checkpoint_path)
    tokenizer.save_pretrained(checkpoint_path)
    print(f"💾 Saved checkpoint to: {checkpoint_path}\n")


🚀 Starting Training...



Epoch 1: 100%|██████████| 69249/69249 [2:27:49<00:00,  7.81it/s, loss=0.0772, step=69248]



✅ Epoch 1 complete. Average Loss: 0.0598
💾 Saved checkpoint to: /content/drive/MyDrive/drugbot-distilgpt2-lora-checkpoints/epoch1_model



Epoch 2:   1%|          | 780/69249 [01:39<2:26:09,  7.81it/s, loss=0.0273, step=779]


KeyboardInterrupt: 

In [None]:
# Preview the first three formatted training texts
for number in range(1, 4):
    print(f"\n--- Example {number} ---\n")
    print(df['text'].iloc[number - 1])



--- Example 1 ---

What is the mechanism of action of EFAVIRENZ? Human immunodeficiency virus type 1 reverse transcriptase inhibitor

--- Example 2 ---

User: What does EFAVIRENZ do?
Bot: Human immunodeficiency virus type 1 reverse transcriptase inhibitor

--- Example 3 ---

Can you explain how EFAVIRENZ works for type 2 diabetes mellitus? Human immunodeficiency virus type 1 reverse transcriptase inhibitor


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Directory of the epoch-1 checkpoint written by the training loop
model_path = "/content/drive/MyDrive/drugbot-distilgpt2-lora-checkpoints/epoch1_model"

# Restore the model, then the tokenizer that was saved next to it
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Switch to inference mode; as the last expression this also echoes the module tree
model.eval()


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): lora.Linear(
            (base_layer): Conv1D(nf=2304, nx=768)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=768, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=2304, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (c_proj): lora.Linear(
            (base_layer): Conv1D(nf=768, nx=768)
            (lo

In [None]:
# Single greedy generation for a chat-style prompt.
prompt = "User: What is aspirin used for?\nBot:"
# Place the encoded prompt on the model's device; otherwise generation fails
# with a device mismatch whenever the model lives on the GPU.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


User: What is aspirin used for?
Bot: Glucocorticoid receptor agonist


In [None]:
import pandas as pd

# Reload the table from Drive and display its first five rows
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/cleaned_clinical_drugs_dataset.csv"
)
df.head(n=5)


Unnamed: 0,drug_id,drug_name,SMILES,logD,logP,psa,drug_likeness,max_phase,IC50,target,organism,target_type,mechanism_of_action,efo_term,efo_id,mesh_heading,mesh_id,toxicity_alert,pIC50
0,CHEMBL223228,EFAVIRENZ,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,4.46,4.46,38.33,0.73,4.0,0.75,Human immunodeficiency virus type 1 reverse tr...,Human immunodeficiency virus 1,SINGLE PROTEIN,Human immunodeficiency virus type 1 reverse tr...,AIDS,EFO:0000765,Acquired Immunodeficiency Syndrome,D000163,triple bond,9.124939
1,CHEMBL223228,EFAVIRENZ,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,4.46,4.46,38.33,0.73,4.0,0.75,Human immunodeficiency virus type 1 reverse tr...,Human immunodeficiency virus 1,SINGLE PROTEIN,Human immunodeficiency virus type 1 reverse tr...,immunodeficiency disease,MONDO:0021094,Autoimmune Diseases,D001327,triple bond,9.124939
2,CHEMBL223228,EFAVIRENZ,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,4.46,4.46,38.33,0.73,4.0,0.75,Human immunodeficiency virus type 1 reverse tr...,Human immunodeficiency virus 1,SINGLE PROTEIN,Human immunodeficiency virus type 1 reverse tr...,type 2 diabetes mellitus,MONDO:0005148,"Diabetes Mellitus, Type 2",D003924,triple bond,9.124939
3,CHEMBL223228,EFAVIRENZ,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,4.46,4.46,38.33,0.73,4.0,0.75,Human immunodeficiency virus type 1 reverse tr...,Human immunodeficiency virus 1,SINGLE PROTEIN,Human immunodeficiency virus type 1 reverse tr...,hepatitis C virus infection,EFO:0003047,Hepatitis C,D006526,triple bond,9.124939
4,CHEMBL223228,EFAVIRENZ,O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1,4.46,4.46,38.33,0.73,4.0,0.75,Human immunodeficiency virus type 1 reverse tr...,Human immunodeficiency virus 1,SINGLE PROTEIN,Human immunodeficiency virus type 1 reverse tr...,infection,EFO:0000544,Infections,D007239,triple bond,9.124939


In [None]:
import pandas as pd

# Reload the table and collect each distinct drug name in order of first appearance
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/cleaned_clinical_drugs_dataset.csv"
)  # Replace with actual path if needed
unique_drugs = pd.unique(df["drug_name"]).tolist()

# Report how many distinct names there are, plus the first ten as a sample
drug_count = len(unique_drugs)
print(f"🔬 Total unique drugs: {drug_count}")
print("🧪 Sample drugs:", unique_drugs[:10])


🔬 Total unique drugs: 850
🧪 Sample drugs: ['EFAVIRENZ', 'NEVIRAPINE', 'CTS-1027', 'EMIVIRINE', 'FINASTERIDE', 'FLUTAMIDE', 'ALLOPURINOL', 'NILUTAMIDE', 'SIROLIMUS', 'SAQUINAVIR MESYLATE']


In [None]:
# Spot-check the model on a handful of drug names from the dataset.
test_drugs = ['EFAVIRENZ', 'NEVIRAPINE', 'FINASTERIDE', 'ALLOPURINOL', 'SIROLIMUS']

# Generation below samples (do_sample=True); seed the generator so the printed
# answers are the same on every run.
torch.manual_seed(42)

for drug in test_drugs:
    prompt = f"What is {drug} used for?"
    # Place the encoded prompt on the model's device; otherwise generation
    # fails with a device mismatch whenever the model lives on the GPU.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=80,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )

    # The decoded sequence contains the prompt followed by the generated text.
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"💬 {drug}: {response}\n")


💬 EFAVIRENZ: What is EFAVIRENZ used for? Human immunodeficiency virus type 1 reverse transcriptase inhibitor

💬 NEVIRAPINE: What is NEVIRAPINE used for? Human immunodeficiency virus type 1 reverse transcriptase inhibitor

💬 FINASTERIDE: What is FINASTERIDE used for? Steroid 5-alpha-reductase 2 inhibitor

💬 ALLOPURINOL: What is ALLOPURINOL used for? Xanthine dehydrogenase inhibitor

💬 SIROLIMUS: What is SIROLIMUS used for? FK506-binding protein 1A inhibitor

