In [None]:
# Install necessary frameworks
!pip install -r requirements.txt
# Or install directly if requirements.txt is not present:
# !pip install llama-index llama-index-llms-huggingface llama-index-embeddings-huggingface transformers accelerate bitsandbytes datasets

## 1. Data Ingestion & Filtering
We reuse the fetch-and-filter logic from the original notebook for the Mol-Instructions dataset, but wrap each filtered sample in a LlamaIndex `Document` object.

In [None]:
import json
from pathlib import Path
from llama_index.core import Document
from datasets import load_dataset

class MolInstructionsLoader:
    """Loads and filters data, returning LlamaIndex Documents.

    Samples whose instruction/output text contains one of a fixed set of
    cancer-related keywords are kept. The filtered samples are cached as JSON
    in ``<cache_dir>/cancer_filtered.json`` and the dataset is only streamed
    when that file does not exist.
    """

    def __init__(self, cache_dir="./data"):
        """Prepare the cache location.

        Args:
            cache_dir: Directory holding the cached, filtered samples. It is
                created, including any missing parent directories.
        """
        self.cache_dir = Path(cache_dir)
        # parents=True so a nested cache_dir (e.g. "./data/mol") works on a fresh checkout.
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.filtered_file = self.cache_dir / "cancer_filtered.json"

    def load_documents(self, max_samples=2000):
        """Return the filtered samples as a list of LlamaIndex ``Document`` objects.

        Args:
            max_samples: Upper bound on the number of samples/documents.
                ``None`` means no limit.

        Returns:
            One ``Document`` per sample. The text combines the sample's
            instruction, input and output; the metadata carries the source
            name, the sample id and a fixed task label.
        """
        data = self._get_filtered_data(max_samples)

        print("Converting to LlamaIndex Documents...")
        documents = [
            Document(
                # We combine instruction, input, and output into the main text
                text=f"Instruction: {item['instruction']}\nInput: {item['input']}\nOutput: {item['output']}",
                # Metadata is crucial for Applied AI (filtering, tracing)
                metadata={
                    "source": "Mol-Instructions",
                    "id": item['id'],
                    "task": "mutation_analysis",
                },
            )
            for item in data
        ]

        print(f"Created {len(documents)} documents.")
        return documents

    def _get_filtered_data(self, max_samples):
        """Return at most ``max_samples`` filtered samples, preferring the cache.

        Args:
            max_samples: Upper bound on the number of samples returned.
                ``None`` means no limit.

        Returns:
            A list of dicts with the keys ``instruction``, ``input``,
            ``output`` and ``id``. An empty list is returned if the dataset
            cannot be opened.
        """
        if self.filtered_file.exists():
            print("Loading cached data...")
            with open(self.filtered_file, 'r', encoding="utf-8") as f:
                cached = json.load(f)
            if max_samples is None:
                return cached
            if len(cached) < max_samples:
                # The cache may have been written by a run with a smaller limit.
                print(
                    f"Cache holds only {len(cached)} samples (requested {max_samples}); "
                    f"delete {self.filtered_file} to re-download."
                )
            # Honour the requested limit even when the cache holds more samples.
            return cached[:max(max_samples, 0)]

        print("Downloading and filtering dataset...")
        # (Simplified filtering logic from original notebook)
        try:
            dataset = load_dataset("zjunlp/Mol-Instructions", "Molecule-oriented Instructions", split="train", streaming=True)
        except Exception as e:
            print(f"Error loading dataset: {e}")
            return []
        # NOTE(review): with streaming=True, failures may presumably also surface while
        # iterating below; those are not caught here -- confirm whether that is intended.

        cancer_keywords = ['cancer', 'tumor', 'mutation', 'melanoma', 'braf', 'tp53', 'v600e']
        filtered_data = []

        for example in dataset:
            if max_samples is not None and len(filtered_data) >= max_samples:
                break
            # Case-insensitive substring match over instruction + output.
            text = f"{example.get('instruction', '')} {example.get('output', '')}".lower()
            if any(k in text for k in cancer_keywords):
                filtered_data.append({
                    'instruction': example.get('instruction', ''),
                    'input': example.get('input', ''),
                    'output': example.get('output', ''),
                    # Sequential id among the kept samples.
                    'id': len(filtered_data),
                })

        # Do not cache an empty result: it would be served on every later run.
        if filtered_data:
            with open(self.filtered_file, 'w', encoding="utf-8") as f:
                json.dump(filtered_data, f)
        return filtered_data

## 2. Setup Embeddings & LLM (The "Applied" Stack)
We use `HuggingFaceEmbedding` for local embeddings and `HuggingFaceLLM` for the quantized model.

In [None]:
import torch
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import Settings
from transformers import BitsAndBytesConfig

# --- Step 1: embedding model ---
print("Loading Embeddings...")
EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_ID)

# --- Step 2: quantized LLM ---
# Note: This requires GPU. If running on CPU only, this cell might fail or be extremely slow.
print("Loading Quantized LLM...")
LLM_MODEL_ID = "unsloth/Llama-3.2-1B-Instruct"

# 4-bit NF4 weights with double quantization; compute runs in float16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# The same checkpoint id is used for the model and its tokenizer.
llm = HuggingFaceLLM(
    model_name=LLM_MODEL_ID,
    tokenizer_name=LLM_MODEL_ID,
    device_map="auto",
    context_window=2048,
    max_new_tokens=256,
    model_kwargs={"quantization_config": quantization_config},
    generate_kwargs={"temperature": 0.7, "do_sample": True},
)

# --- Step 3: register both models on LlamaIndex's global Settings ---
Settings.llm = llm
Settings.embed_model = embed_model

## 3. Indexing & Retrieval
We build a `VectorStoreIndex`. In a production "Applied AI" setting, you would persist this to disk (e.g., using ChromaDB) so you don't rebuild it every time.

In [None]:
from llama_index.core import VectorStoreIndex

# Tunables for this cell.
MAX_SAMPLES = 500  # upper bound on documents loaded
TOP_K = 3          # nodes retrieved per query

# Get the filtered samples as LlamaIndex Documents.
loader = MolInstructionsLoader()
documents = loader.load_documents(max_samples=MAX_SAMPLES)

# Build the vector index from the documents (nothing is persisted in this cell).
print("Building Vector Index...")
index = VectorStoreIndex.from_documents(documents)

# Query engine backed by the index.
query_engine = index.as_query_engine(similarity_top_k=TOP_K)

## 4. Running Queries
Now we can ask questions. For each query, the engine retrieves the most similar documents from the index and passes them to the LLM as context for generating the answer.

In [None]:
# Run one question through the query engine and print the generated answer.
question = "What is the clinical significance of BRAF V600E mutation in melanoma?"
response = query_engine.query(question)
print("\nResponse:")
print(response)

# Inspecting the retrieved nodes (Applied AI: Traceability)
print("\n--- Source Documents ---")
for node in response.source_nodes:
    # `score` is optional on retrieved nodes; formatting None with ":.3f" raises TypeError.
    score_label = f"{node.score:.3f}" if node.score is not None else "n/a"
    # Show only the first 200 characters of each source text.
    print(f"[Score: {score_label}] {node.text[:200]}...")