In [None]:
# Record the Python interpreter version of this runtime.
!python --version

In [None]:
# Show the NVIDIA GPU(s) visible to this runtime.
!nvidia-smi


### Tokens & APIs


In [None]:
import os

# Hugging Face access token, passed as `token=` when the gated base model and
# tokenizer are downloaded later in the notebook. It is read from the HF_TOKEN
# environment variable so the secret is never saved inside the notebook file;
# an unset variable yields the empty string.
huggingface_token = os.environ.get("HF_TOKEN", "")


In [None]:
# Placeholder credential. NOTE(review): no other cell visible in this notebook
# reads `some_token` — confirm whether it is still needed.
some_token = ""

In [None]:
# Weights & Biases API key placeholder. NOTE(review): the training arguments
# report to "wandb", but no visible cell passes this value to wandb — confirm
# how the login is expected to happen.
wandb_token = ""

In [None]:
import os

# OpenAI API key, exported to OPENAI_API_KEY in the RAG section. It is read from
# the environment so the secret is never saved inside the notebook file; an
# unset variable yields the empty string.
openai_key = os.environ.get("OPENAI_API_KEY", "")

## Install Libraries

In [None]:
!pip install transformers accelerate bitsandbytes peft

In [None]:
# !pip uninstall -y bitsandbytes
!pip install -U bitsandbytes


In [None]:
# pip install --upgrade transformers accelerate bitsandbytes

In [None]:
pip install datasets

In [None]:
# Sanity check: bitsandbytes must import cleanly before the quantized model load.
import bitsandbytes as bnb
print(bnb.__version__)  # Should print a valid version number


Enter the huggingface token when prompted

In [None]:
# Interactive Hugging Face login; prompts for the access token.
!huggingface-cli login

In [None]:
# Colab only: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Work from the Drive datasets folder; later cells use relative paths
# (e.g. "qa_dataset.jsonl", "./llama2_finetune", "chroma_db").
%cd /content/drive/MyDrive/datasets

## Model Load

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig

# Base checkpoint that is quantized here and fine-tuned with LoRA below.
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"


# Configuration of 4-bit quantization using bitsandbytes
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Enable 4-bit quantization
    bnb_4bit_quant_type="nf4",  # Use the NormalFloat4 (NF4) quantization type
    bnb_4bit_use_double_quant=True,  # Enable double quantization
    bnb_4bit_compute_dtype=torch.float16  # Run computation in float16
)

# Load model with 4-bit quantization; device_map="auto" lets the loader decide
# where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Inspect where the model was placed and which dtype it reports
print(model.hf_device_map)  # Show which devices the model is loaded on
print(model.dtype)  # Expected to print torch.float16 (compute dtype) — TODO confirm


In [None]:
# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# from peft import PeftModel

# MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
# fine_tuned_checkpoint = "/content/drive/MyDrive/datasets/llama_checkpoint_3000"
# output_path = "/content/drive/MyDrive/datasets/llama"



## QA Dataset

In [None]:
from datasets import load_dataset

# Loads a JSONL file where each line is {"question": ..., "answer": ...}.
# The path is relative to the current working directory.
dataset = load_dataset("json", data_files="qa_dataset.jsonl", split="train")


In [None]:
# Reuse the end-of-sequence token for padding so that padding="max_length"
# in tokenize_qa has a pad token to use.
tokenizer.pad_token = tokenizer.eos_token  # for LLaMA2

In [None]:
def tokenize_qa(example):
    """Tokenize one QA record into fixed-length causal-LM training features.

    Args:
        example: Mapping with "question" and "answer" string fields.

    Returns:
        The tokenizer output (512 tokens, padded/truncated) with an added
        "labels" list. Labels equal ``input_ids`` on real tokens and -100 on
        padding positions.
    """
    prompt = f"Q: {example['question']}\nA: {example['answer']}"
    tokens = tokenizer(prompt,
                       truncation=True,
                       max_length=512,
                       padding="max_length")

    # Padding must not contribute to the loss. The pad token is the EOS token in
    # this notebook, so padding is identified through the attention mask rather
    # than the token id; -100 is the index the loss ignores.
    tokens["labels"] = [
        token_id if is_real else -100
        for token_id, is_real in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens



In [None]:
# Apply tokenize_qa to every record, one example at a time (batched=False).
tokenized_dataset = dataset.map(tokenize_qa, batched=False)


## LoRA

In [None]:
from peft import prepare_model_for_kbit_training

# Prepare the quantized model for training before the LoRA adapters are attached.
# NOTE: `model` is rebound in place, so re-running this cell re-wraps the result.
model = prepare_model_for_kbit_training(model)


In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter settings; the inline note refers to "4. Experimental Section"
# of the accompanying write-up.
lora_config = LoraConfig( #4. Experimental Section
    r=96,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # adapters are attached to these projections only
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)


# Wrap the model with the adapters and report how many parameters will train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

## Training

In [None]:
from transformers import TrainingArguments

# Training hyper-parameters. With a per-device batch of 2 and 8 accumulation
# steps, each optimizer step sees 16 examples per device. Checkpoints are
# written to output_dir every 500 steps.
training_args = TrainingArguments(
    output_dir="./llama2_finetune",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=0.0002,
    adam_epsilon=1e-8,
    max_steps=3000,
    gradient_checkpointing=True,
    fp16=True,
    optim="adamw_torch",
    logging_steps=50,
    report_to="wandb",  # logs to WandB
    save_strategy="steps",
    save_steps=500,
    # load_best_model_at_end=True,
    eval_steps=500,  # NOTE(review): no eval dataset is passed to the Trainer below — confirm this is intended
    warmup_steps=100,
    weight_decay=0.01,
    # evaluation_strategy="no",
    max_grad_norm=0.3
)


In [None]:
from transformers import Trainer

# Build the trainer over the tokenized QA set; there is no evaluation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# trainer.train() # load from checkpoint after 500 steps



In [None]:
import os

from transformers.trainer_utils import get_last_checkpoint

# Resume from the newest checkpoint in output_dir when one exists; otherwise
# start from scratch. Passing resume_from_checkpoint=True unconditionally fails
# on the very first run, before any checkpoint has been written.
checkpoint_root = training_args.output_dir
last_checkpoint = (
    get_last_checkpoint(checkpoint_root) if os.path.isdir(checkpoint_root) else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)

# RAG

### Set env & install libraries

In [None]:
# Placeholder: directory of the saved LoRA checkpoint that is loaded with
# PeftModel.from_pretrained further down. Replace it before running.
fine_tuned_checkpoint = "your_checkpoint_dir"


In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
import os
# Export the key through the environment. NOTE: this overwrites any
# OPENAI_API_KEY that is already set, even when `openai_key` is empty.
os.environ["OPENAI_API_KEY"] = openai_key


In [None]:
# Embedding model id shared by the LlamaIndex and LangChain embedding wrappers below.
embedding_model_name = "BAAI/bge-large-en"

In [None]:
!pip install -U llama-index llama-index-core llama-index-vector-stores-chroma llama-index-llms-huggingface

# LangChain and Chroma for vector store + embeddings
!pip install -U langchain chromadb

In [None]:
pip install -U langchain-community

In [None]:
!pip install llama-index --upgrade

In [None]:
# Install HuggingFace embeddings + LLM support
!pip install llama-index-embeddings-huggingface
!pip install llama-index-llms-huggingface


In [None]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.node_parser import SentenceSplitter



### Get the model and tokenizer from the checkpoints

In [None]:
# PeftModel is needed below; its only other import in this notebook is commented out.
from peft import PeftModel

base_model_id = "meta-llama/Llama-2-7b-chat-hf"

# Same 4-bit NF4 settings as used for training.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)

# Load model with 4-bit quantization
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    quantization_config=bnb_config,
    token=huggingface_token
)

# Attach the fine-tuned LoRA adapter stored at `fine_tuned_checkpoint`.
model_from_checkpoint = PeftModel.from_pretrained(
    base_model,
    fine_tuned_checkpoint,
    token=huggingface_token
)


tokenizer = AutoTokenizer.from_pretrained(base_model_id, token=huggingface_token)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
from llama_index.core import Settings

# Make LlamaIndex embed with the same model as the rest of the notebook instead
# of its built-in default embedding model.
Settings.embed_model = HuggingFaceEmbedding(model_name=embedding_model_name) # needs to set internal embedding model to ours to not cause confliction to default of Llamaindex
# Settings.num_output = 312
# Settings.context_window = 3900

### Load the document to compile the Chroma vector db

In [None]:
# Load the source corpus. "document.json_path" is a placeholder file path;
# replace it before running.
documents = SimpleDirectoryReader(input_files=["document.json_path"]).load_data()

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

# Cut every loaded document into overlapping 1000-character pieces
# (200 characters of overlap), keeping the parent document's metadata.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

chunks = [
    Document(page_content=piece, metadata=source.metadata)
    for source in documents
    for piece in splitter.split_text(source.text)
]



In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# LangChain-side embedding wrapper for the shared embedding model.
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Embed all chunks and store them in a Chroma collection persisted under "chroma_db".
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory="chroma_db"
)


In [None]:
# Import paths for the split-package LlamaIndex layout (llama_index.core plus
# integration packages), matching the other llama_index imports in this notebook.
from llama_index.core import Document as LlamaDocument
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore

# Expose the Chroma collection created above to LlamaIndex.
chroma_vectorstore = ChromaVectorStore(chroma_collection=vectorstore._collection)
storage_context = StorageContext.from_defaults(vector_store=chroma_vectorstore)

# `chunks` holds LangChain documents; LlamaIndex needs its own Document type.
llama_chunks = [
    LlamaDocument(text=chunk.page_content, metadata=dict(chunk.metadata))
    for chunk in chunks
]

# NOTE(review): this inserts the chunks into the collection that
# Chroma.from_documents already filled — confirm the duplication is acceptable.
index = VectorStoreIndex.from_documents(llama_chunks, storage_context=storage_context)

#### Save the index

In [None]:
# Persist the index built in the previous cell. The target directory belongs to
# persist(), not to from_documents(). Rebuilding the index here would embed and
# insert every chunk a second time.
index.storage_context.persist(persist_dir="index_storage")


### Creating a Query Engine

In [None]:
# Default query engine over the index. NOTE(review): no visible cell sets
# Settings.llm, so answer synthesis presumably falls back to LlamaIndex's
# default LLM — confirm.
query_engine = index.as_query_engine()

In [None]:
# response = query_engine.query("What are latest discoveries in bio-related field?")
# print(response)


In [None]:
# q = "What methods can be used to assess the adsorption of enzymes onto graphite electrodes, and how can the activities of immobilized laccase be evaluated?"


# a = query_engine.query(q)
# print(a)

### Test the performance

The predicted letters are parsed automatically from free-form model output, so review each result CSV manually to confirm the extracted answers.

In [None]:
import json
import pandas as pd
from transformers import pipeline


# Placeholder: path to the multiple-choice benchmark (one JSON object per line).
mc_dataset_path = "define_path_to_mc_dataset.jsonl"

# Read as UTF-8 and skip blank lines so that a trailing newline or an empty
# separator line does not make json.loads fail.
with open(mc_dataset_path, "r", encoding="utf-8") as f:
    dataset = [json.loads(line) for line in f if line.strip()]

# Prompt engineer (depending on the prompt, model's behaviour to answer correctly differ.)
def format_prompt(q):
    """Build the letter-only multiple-choice prompt for question ``q``.

    The "Question:" and "Answer:" lines keep their 12-space indentation, so the
    returned text is exactly what the model has been prompted with so far.
    """
    instruction = (
        "You will be given a multiple-choice question. Respond with only the letter "
        "corresponding to the correct answer (A, B, C). Do not include explanations "
        "or restate the question."
    )
    indent = " " * 12
    return "\n".join([
        instruction,
        f"{indent}Question: {q}",
        f"{indent}Answer:",
    ])

def _node_text(item):
    """Return the plain text of one retrieved item (string, node, or document)."""
    if isinstance(item, str):
        return item
    get_content = getattr(item, "get_content", None)
    if callable(get_content):
        return get_content()
    for attr in ("text", "page_content"):
        value = getattr(item, attr, None)
        if isinstance(value, str):
            return value
    return str(item)


def _context_to_text(context):
    """Flatten whatever the retriever returned into a single context string."""
    if isinstance(context, (list, tuple)):
        return "\n".join(_node_text(item) for item in context)
    return str(context)


def _extract_choice(completion, choices=("A", "B", "C")):
    """Return the answer letter found in ``completion``, or "Unknown"."""
    import re

    alternatives = "|".join(re.escape(choice) for choice in choices)
    # Preferred: the completion opens with the letter ("B", "(b)", "C.").
    leading = re.match(
        rf"\W*({alternatives})\s*(?:[.):,]|$)", completion, flags=re.IGNORECASE
    )
    if leading:
        return leading.group(1).upper()
    # Fallback: first stand-alone capital letter ("The answer is B because ...").
    anywhere = re.search(rf"\b({alternatives})\b", completion)
    return anywhere.group(1).upper() if anywhere else "Unknown"


def evaluate_model(model, tokenizer, dataset, use_rag=False, retriever=None, temperature=0.1):
    """Run a multiple-choice benchmark and collect one result row per question.

    Args:
        model: Causal LM providing ``eval()``, ``device`` and ``generate()``.
        tokenizer: Tokenizer matching ``model``.
        dataset: Iterable of dicts with "question" and "answer" keys and an
            optional "category" key.
        use_rag: When True and ``retriever`` is given, retrieved context is
            prepended to the prompt.
        retriever: Object with ``retrieve(question)`` returning a list of
            strings or node-like objects, or any object convertible with ``str``.
        temperature: Accepted for backward compatibility; decoding is greedy
            (``do_sample=False``), so the value has no effect.

    Returns:
        pandas.DataFrame with the columns question, full_response, predicted,
        ground_truth and category.
    """
    model.eval()
    results = []

    for d in dataset:
        question = d["question"]
        gt_answer = d["answer"].strip().upper()
        category = d.get('category', 'Unknown')

        # If RAG, retrieve context
        if use_rag and retriever is not None:
            context = _context_to_text(retriever.retrieve(question))
            prompt = f"""Context:\n\n{context}\n\nQuestion:{question}\n\nAnswer:"""
        else:
            prompt = format_prompt(question)

        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        output = model.generate(
            **inputs,
            max_new_tokens=50,
            do_sample=False
        )

        # Full text (prompt + completion) is kept for the manual review of the CSV.
        decoded = tokenizer.decode(output[0], skip_special_tokens=True)

        # The prediction is parsed from the newly generated tokens only. Parsing
        # the full text lets the prompt's own words (e.g. the trailing "Answer:")
        # be mistaken for the model's answer.
        prompt_length = inputs["input_ids"].shape[-1]
        completion = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
        predicted = _extract_choice(completion)

        print(f"GT: {gt_answer}, Predicted: {predicted}, Full Response: {decoded}")

        results.append({
            "question": question,
            "full_response": decoded,
            "predicted": predicted,
            "ground_truth": gt_answer,
            "category": category,
        })

    results_df = pd.DataFrame(results)

    return results_df




#### Base model eval

In [None]:
# Evaluate on the multiple-choice records in `dataset`; `documents` is the RAG
# corpus, not the benchmark. PeftModel.from_pretrained injects the LoRA layers
# into `base_model` itself, so the adapters are switched off for this run to
# measure the real base model.
with model_from_checkpoint.disable_adapter():
    base_eval = evaluate_model(model_from_checkpoint, tokenizer, dataset, use_rag=False)
base_eval.to_csv('your_base_model_eval.csv', index=False)

#### BioLLM eval

In [None]:
# Fine-tuned model on the multiple-choice records in `dataset`; `documents` is
# the RAG corpus, not the benchmark.
bio_eval = evaluate_model(model_from_checkpoint, tokenizer, dataset, use_rag=False)
bio_eval.to_csv('your_bio_model_eval.csv', index=False)

#### BioLLM+RAG

In [None]:
class BioLLM:
    """Thin text-completion wrapper around a causal LM and its tokenizer."""

    def __init__(self, model, tokenizer):
        """Keep the model and tokenizer and remember the model's device."""
        self.model, self.tokenizer = model, tokenizer
        self.device = model.device

    def complete(self, prompt, max_tokens=100):
        """Greedily generate up to ``max_tokens`` new tokens for ``prompt``.

        Returns the decoded first output sequence with special tokens removed
        and surrounding whitespace stripped.
        """
        encoded = self.tokenizer(prompt, return_tensors="pt")
        encoded = encoded.to(self.device)
        generation_options = {"max_new_tokens": max_tokens, "do_sample": False}
        sequences = self.model.generate(**encoded, **generation_options)
        text = self.tokenizer.decode(sequences[0], skip_special_tokens=True)
        return text.strip()

# Wrap the fine-tuned model and tokenizer for simple prompt-in / text-out use.
biollm = BioLLM(model=model_from_checkpoint, tokenizer=tokenizer)

In [None]:
# evaluate_model needs the underlying model (eval / device / generate), which the
# BioLLM wrapper does not expose, so the wrapped model is passed instead. The
# benchmark records live in `dataset`; `documents` is the RAG corpus.
rag_eval = evaluate_model(biollm.model, tokenizer, dataset, use_rag=True, retriever=query_engine)
rag_eval.to_csv('your_rag_eval.csv', index=False)

# Interface

In [None]:
!pip install gradio transformers llama-index sentence-transformers
!pip install --upgrade gradio

### Load from saved index

In [None]:
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext, load_index_from_storage
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index.vector_stores.chroma import ChromaVectorStore

from langchain.vectorstores import Chroma

# LangChain-side embedding wrapper for the shared embedding model.
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Reconnect to ChromaDB
# NOTE(review): the store was written to "chroma_db" earlier in this notebook;
# "chromadb_dir" looks like a placeholder — confirm the directory.


vectorstore = Chroma(
    persist_directory="chromadb_dir",
    embedding_function=embedding_model,
)

# Hand the underlying Chroma collection to LlamaIndex.
chroma_collection = vectorstore._collection
chroma_vectorstore = ChromaVectorStore(chroma_collection=chroma_collection)


### Select the device: GPU if available, otherwise CPU

In [None]:
import torch

# Prefer the GPU and fall back to the CPU only when CUDA is unavailable.
# The checkpoint was loaded 4-bit quantized with device_map="auto", so it is
# already placed on its device(s) and is deliberately not moved with .to().
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# These two names are used below but are not imported anywhere else in the notebook.
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.retrievers import VectorIndexRetriever

# Load Chroma vector store
# The LangChain Chroma wrapper takes a LangChain embeddings object, not the
# LlamaIndex HuggingFaceEmbedding class.
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
# NOTE(review): 'chroma_db_dir' and 'index_dir' must match the directories the
# store and the index were persisted to — confirm both paths.
chroma_store = Chroma(
    persist_directory='chroma_db_dir',
    embedding_function=embedding_model,
)
chroma_collection = chroma_store._collection
chroma_vectorstore = ChromaVectorStore(chroma_collection=chroma_collection)

# Load LlamaIndex from storage
storage_context = StorageContext.from_defaults(
    persist_dir='index_dir',
    vector_store=chroma_vectorstore
)
index = load_index_from_storage(storage_context)

# Set up retriever and reranker
reranker = SentenceTransformerRerank(
    top_n=2, # define depending on your preference
    model="cross-encoder/ms-marco-MiniLM-L-6-v2",
    keep_retrieval_score=False
)


class RerankingRetriever:
    """Retrieve with a base retriever, then run the hits through post-processors.

    Node post-processors are applied here explicitly: passing
    `node_postprocessors` to VectorIndexRetriever does not apply them.
    """

    def __init__(self, base_retriever, node_postprocessors):
        self.base_retriever = base_retriever
        self.node_postprocessors = list(node_postprocessors)

    def retrieve(self, query):
        """Return the retrieved nodes for ``query`` after all post-processors ran."""
        nodes = self.base_retriever.retrieve(query)
        for postprocessor in self.node_postprocessors:
            nodes = postprocessor.postprocess_nodes(nodes, query_str=query)
        return nodes


retriever = RerankingRetriever(
    VectorIndexRetriever(
        index=index,
        similarity_top_k=5, # more the slower, the more accurate
    ),
    node_postprocessors=[reranker],
)


In [None]:
import os
import time
import torch
import gradio as gr

def chat_fn(message):
    """Answer one user question with retrieval-augmented generation.

    Retrieves context for ``message``, builds the prompt, generates up to 256
    new tokens with ``biollm`` and returns only the text after the prompt.

    Args:
        message: The user's question.

    Returns:
        The cleaned answer, or a warning string starting with "⚠️" when
        generation fails. Exceptions are reported in the return value, not raised.
    """
    try:
        # Retrieve relevant context
        retrieved_nodes = retriever.retrieve(message)
        context = "\n\n".join([node.text for node in retrieved_nodes])

        # Build prompt
        prompt = f"""You are a biomedical expert. Use the given context to answer the question in a concise and clear manner.

Context:
{context}

Question: {message}
Answer:"""

        # The wrapper instance defined in this notebook is named `biollm`.
        raw_response = biollm.complete(prompt, max_tokens=256).strip()

        if raw_response.startswith(prompt):
            # complete() returns prompt + completion: drop the echoed prompt exactly.
            # Splitting at the first "answer:" can cut inside the retrieved context.
            cleaned = raw_response[len(prompt):].strip()
        else:
            # Fallback when the decoded text does not reproduce the prompt verbatim.
            marker = "answer:"
            idx = raw_response.lower().find(marker)
            if idx != -1:
                cleaned = raw_response[idx + len(marker):].strip()
            else:
                cleaned = raw_response.strip()

        return cleaned

    except torch.cuda.OutOfMemoryError:
        return "⚠️ CUDA out of memory. Try a shorter query or reduce context length."

    except Exception as e:
        return f"⚠️ Error: {str(e)}"

    finally:
        # Runs on every path, including out-of-memory, so the cache is always released.
        torch.cuda.empty_cache()


In [None]:
def build_chat_interface():
    """Build the Gradio Blocks chat UI around ``chat_fn``.

    Returns:
        The ``gr.Blocks`` app, not yet launched.
    """
    def respond(message, history):
        """Append the (question, answer) pair and clear the textbox."""
        history = list(history or [])
        time.sleep(0.05)
        response = chat_fn(message)
        history.append((message, response))
        return "", history

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# BioLLM + RAG Chat")
        gr.Markdown("Ask me biomedical questions! I use BioLLM + literature RAG for answers.")

        chatbot = gr.Chatbot()
        msg = gr.Textbox(placeholder="Type your question and press Enter...")

        clear = gr.Button("Clear")

        # The chatbot component holds the conversation itself. A separate
        # gr.State was never reset by "Clear", so old turns reappeared on the
        # next message.
        msg.submit(respond, [msg, chatbot], [msg, chatbot])
        clear.click(lambda: ([], ""), None, [chatbot, msg])

    return demo


In [None]:
# Build and launch the UI. share=True requests a public share link and
# debug=True is passed through to Gradio's launch().
if __name__ == "__main__":
    chat_app = build_chat_interface()
    chat_app.launch(share=True, debug=True)