In [1]:
!pip install transformers datasets langchain chromadb bitsandbytes accelerate langchain_community
!pip install git+https://github.com/xlang-ai/instructor-embedding.git
!pip install sentence-transformers

/bin/bash: /home/niedag/GitHub-workspace/healthy-rag/.venv/bin/pip: /home/niedag/GitHub-workspace/HP-AI-hackathon-2025/.venv/bin/python: bad interpreter: No such file or directory
/bin/bash: /home/niedag/GitHub-workspace/healthy-rag/.venv/bin/pip: /home/niedag/GitHub-workspace/HP-AI-hackathon-2025/.venv/bin/python: bad interpreter: No such file or directory
/bin/bash: /home/niedag/GitHub-workspace/healthy-rag/.venv/bin/pip: /home/niedag/GitHub-workspace/HP-AI-hackathon-2025/.venv/bin/python: bad interpreter: No such file or directory


In [2]:
import os, torch, logging, time, atexit
import bitsandbytes as bnb
from getpass import getpass
from datasets import load_dataset
from langchain import HuggingFacePipeline, PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, pipeline, BitsAndBytesConfig
from tqdm import tqdm

from chromadb.config import Settings

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

if "HF_TOKEN" not in os.environ:
    os.environ["HF_TOKEN"] = getpass("Enter your Hugging Face otoken: ")
    print("HF_TOKEN is set:", "HF_TOKEN" in os.environ)

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda:0


In [3]:
model_name = "meta-llama/Llama-3.2-3B"
HUGGING_FACE_TOKEN = os.getenv("HF_TOKEN")

bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_dtype="nf4")

def load_model_and_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")
    return model,tokenizer

model,tokenizer = load_model_and_tokenizer()

def verify_model(model,tokenizer):
    input_text = "Testin 123 123 123"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(DEVICE)

print(f"Model test output: {verify_model(model,tokenizer)}")



2025-04-28 01:20:05,719 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.70s/it]


Model test output: None


In [4]:
def load_pubmed_qa_dataset(split="train", limit= None):
    dataset = load_dataset("pubmed_qa", "pqa_labeled", split=split)
    return dataset.select(range(min(limit,len(dataset)))) if limit else dataset


def init_chroma():
    embedding_fn = HuggingFaceEmbeddings(
        model_name = "sentence-transformers/all-mpnet0base-v2",
        model_kwar1gs = {"device" : DEVICE}
    )
    return Chroma(
        embedding_function=embedding_fn,
        client_settings=Settings(anonymized_telemetry=False)
    )

def ingest_data(collection, dataset):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size =1024, chunk_overlap=64)
    for entry in tqdm(dataset, desc="Ingesting data"):
        document = (
            f"Question: {entry.get('question', '')}\n",
            f"Context: {entry.get('context', '')}\n",
            f"Abstract: {entry.get('abstract', '')}\n",
            f"Long Answer: {entry.get('long)answer', '')}"
        )
        chunks = text_splitter.split_text(document)
        collection.add_texts(
            texts=chunks,
            metadatas=[{"source": entry["pubid"]}] * len(chunks)
        )

dataset = load_pubmed_qa_dataset(limit=1000)
collection = init_chroma()
ingest_data(collection, dataset)


Generating train split: 100%|██████████| 1000/1000 [00:00<00:00, 47750.99 examples/s]
  embedding_fn = HuggingFaceEmbeddings(


ValidationError: 1 validation error for HuggingFaceEmbeddings
model_kwar1gs
  Extra inputs are not permitted [type=extra_forbidden, input_value={'device': 'cuda:0'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/extra_forbidden

In [5]:
DEFAULT_SYSTEM_PROMPT = "You are an AI assistant specializing in medical literature. Answer accurately and concisely based on the given context."

def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
    return f"[INST] <<SYS>> {system_prompt} <</SYS>> {prompt} [/INST]".strip()


template = generate_prompt("{context}\nQuestion: {question}", system_prompt = "Use the context to answer the medical question.")
prompt = PromptTemplate(template = template, input_variables=["context", "question"])

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
text_pipeline = pipeline("text-generations", model=model, tokenizer=tokenizer, max_new_tokens = 500, temperature=0.1, top_p=0.95, repetition_penalty=1.15, streamer=streamer)
llm = HuggingFacePipeline(pipeline = text_pipeline)

def create_qa_chain(collection):
    return RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=collection.as_retriever(search_kwargs=("k":2)), return_source_documents=False, chain_type_kwargs={"prompt":prompt})

qa_chain = create_qa_chain(collection)
print("RetrievalQA chain created")

SyntaxError: invalid syntax (3042633771.py, line 15)

In [None]:
def interactive_query_loop(qa_chain):
    while True:
        query = input ("Enter your medical question (or 'quit' to exit): ")
        if query.lower() == 'quit':break
        try:
            start_time = time.time()
            result = qa_chain(query)
            print(f"Query: {query}\nAnswer: {result['reuslt']}\nResponse time: {time.time() - start_time :.2f} seconds")
        except Exception as e: 
            logging.error(f"Error processing query: {e}")

def graceful_shutdown():
    logging.info("Shutting down gracefully...")

atexit.register(graceful_shutdown)
interactive_query_loop(qa_chain)
