In [None]:
# getting all libraries and dependencies ready
import warnings
import os
import json
import pickle
import requests
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Dict, Optional


In [None]:
# Silence every warning category for the rest of the session so the notebook
# output stays readable.
# NOTE(review): this blanket filter also hides deprecation and runtime
# warnings raised by any library used later - consider narrowing it to specific
# categories or modules once the notebook is stable.
warnings.filterwarnings('ignore')

**STATEMENT: Developing a RAG-powered question-and-answer system that focuses on skin-cancer-related protein mutations.**

This is an effort to close a critical knowledge gap in clinical and molecular oncology within Africa and around the world. Skin cancer is rising across multiple regions of the continent due to increasing UV exposure, late diagnosis, limited access to dermatological specialists, and insufficient molecular diagnostic resources. Many health centres do not have immediate access to detailed genomic information, and when protein-level mutations are involved, the interpretation requires both specialized knowledge and reliable reference data. A Retrieval-Augmented Generation system provides a way to blend trusted scientific literature, curated genomic databases, and clinical guidelines into a single intelligent assistant that can respond to questions with precision and contextual clarity. The importance of such a system in Africa lies in its ability to reduce barriers to expert knowledge, especially in settings where oncologists, dermatopathologists, or molecular biologists may not be readily available.

This project is designed to strengthen clinical decision-support capacity by allowing medical personnel to ask direct questions about mutations associated with melanoma and other skin cancers, understanding how these mutations affect protein structure, disease progression, treatment response, and prognosis. Instead of relying solely on limited local expertise or outdated texts, clinicians can obtain up-to-date and evidence-based answers drawn from high-quality molecular databases, published dermatology research, and recognized cancer-genomics repositories. The RAG architecture ensures that the system retrieves factual information rather than inventing unsupported explanations, making it especially suitable for sensitive medical domains.

The solution aims to assist dermatologists who might need rapid clarification on the clinical significance of a mutation detected during biopsy evaluation. It supports pathologists who interpret histology and immunohistochemistry images but need molecular confirmation for difficult cases. It aids oncologists who formulate treatment plans influenced by mutation profiles such as BRAF, NRAS, or KIT alterations. It benefits molecular geneticists who analyse sequencing data in laboratories that often face staffing shortages. It also supports general practitioners operating in remote regions who frequently encounter skin lesions without specialist backup. In teaching hospitals, it provides an additional layer of academic support for medical students, resident doctors, and biomedical scientists learning how molecular mechanisms drive disease outcomes.

Ultimately, the system is being developed to expand access to reliable molecular oncology insights across African healthcare environments. Many facilities lack immediate access to advanced diagnostic equipment or specialists trained in protein mutation interpretation. A RAG-based assistant reduces this gap by delivering accurate, context-aware explanations that can guide patient triage, improve diagnostic confidence, and help clinicians make more informed decisions. It does not replace medical professionals but strengthens their capacity to deliver better care through improved access to molecular knowledge that is often scarce in the region. This project represents a step towards democratizing precision-medicine-level information for the African medical landscape, where timely understanding of protein mutations in skin cancer can significantly influence patient outcomes.


In [None]:
# installing the libraries for dataset loading, embeddings and retrieval (FAISS), 4-bit quantized LLM inference with lower memory use, and the Gradio UI
!pip install -q datasets transformers sentence-transformers faiss-cpu
!pip install -q bitsandbytes accelerate
!pip install -q gradio requests

In [None]:
# Data preparation and filtering

class MolInstructionsFilter:
    """Filter and cache cancer-related records from the Mol-Instructions dataset.

    Records are selected by case-insensitive substring matching of
    ``CANCER_KEYWORDS`` against each record's ``instruction`` and ``output``
    fields. The selected records are written to ``<cache_dir>/cancer_filtered.json``
    and re-used on subsequent calls.
    """

    # Keywords used to select records. They MUST be lower case because they are
    # compared against lower-cased text (the original 'V600E' / 'NRAS mutation'
    # entries could never match for that reason).
    # NOTE(review): 'protein' and 'therapy' are very broad and will match many
    # records unrelated to skin cancer - tighten if precision matters.
    CANCER_KEYWORDS = (
        'cancer', 'tumor', 'mutation', 'oncology', 'melanoma',
        'carcinoma', 'metastasis', 'oncogene', 'braf', 'tp53',
        'egfr', 'kras', 'protein', 'inhibitor', 'therapy',
        'v600e', 'nras mutation',
    )

    def __init__(self, cache_dir="./data"):
        """Create the cache directory (including parents) if it does not exist.

        Args:
            cache_dir: Directory in which the filtered JSON file is stored.
        """
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.filtered_file = self.cache_dir / "cancer_filtered.json"

    @classmethod
    def _is_cancer_related(cls, text: str) -> bool:
        """Return True if *text* contains any keyword, ignoring case."""
        lowered = text.lower()
        return any(keyword in lowered for keyword in cls.CANCER_KEYWORDS)

    def download_and_filter(self, max_samples=5000):
        """Return cancer-related records, downloading and filtering if needed.

        If the cached JSON file exists it is returned as-is (``max_samples`` is
        not applied to a cached file). Otherwise the dataset is streamed and
        the first ``max_samples`` matching records are collected and cached.

        Args:
            max_samples: Maximum number of matching records to collect.

        Returns:
            A list of dicts with keys ``instruction``, ``input``, ``output``
            and ``id``; an empty list if the dataset could not be loaded.
        """
        if self.filtered_file.exists():
            print("Filtered cancer mutation data already exists. Loading from cache...")
            with open(self.filtered_file, 'r', encoding='utf-8') as f:
                return json.load(f)

        # Imported only when a download is actually required, so cached runs
        # do not depend on the `datasets` package being importable.
        from datasets import load_dataset

        print("Downloading Mol-Instructions dataset...")

        try:
            dataset = load_dataset(
                "zjunlp/Mol-Instructions",
                "Molecule-oriented Instructions",
                split="train",
                streaming=True,  # stream records instead of materialising the whole dataset
            )
        except Exception as exc:
            # Best-effort: report the reason instead of failing silently, and
            # keep the original contract of returning an empty list.
            print(f"Could not load Mol-Instructions dataset: {exc}")
            return []

        filtered_data = []
        count = 0

        print("Filtering for cancer-related content...")
        for example in dataset:
            if count >= max_samples:
                break

            # Match on the instruction and the output text together.
            text = f"{example.get('instruction', '')} {example.get('output', '')}"
            if not self._is_cancer_related(text):
                continue

            filtered_data.append({
                'instruction': example.get('instruction', ''),
                'input': example.get('input', ''),
                'output': example.get('output', ''),
                'id': count,
            })
            count += 1

            if count % 100 == 0:
                print(f"Collected {count} samples...")

        # Persist the selection so later runs can skip the download.
        with open(self.filtered_file, 'w', encoding='utf-8') as f:
            json.dump(filtered_data, f, indent=2)

        print(f"Filtered {len(filtered_data)} cancer-related samples")
        return filtered_data


In [None]:
# inspecting the path of my filtered data
# obj = MolInstructionsFilter()
# print(obj.cache_dir)
# print(obj.filtered_file)

In [None]:
# trying to get an overview of the downloaded-json file
# json_obj = MolInstructionsFilter()
# data = json_obj.download_and_filter()
# print(data)

In [None]:
# Build the filter (its constructor creates the cache directory) and show, one
# per line: the object itself, the data directory, and the JSON cache path.
mf = MolInstructionsFilter()
print(mf, mf.cache_dir, mf.filtered_file, sep="\n")

**KNOWLEDGE BASE DESIGN/CONNECTIVITY: "UNIPROT"**

In [None]:
class UniProtCache:
    """Disk-backed cache in front of the UniProt REST search endpoint.

    Looked-up genes are stored in ``<cache_dir>/uniprot_cache.json`` so each
    gene is requested from the network at most once.
    """

    SEARCH_URL = "https://rest.uniprot.org/uniprotkb/search"

    def __init__(self, cache_dir="./data"):
        """Create the cache directory if needed and load any existing cache.

        Args:
            cache_dir: Directory holding ``uniprot_cache.json``.
        """
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.cache_file = self.cache_dir / "uniprot_cache.json"
        self.cache = self._load_cache()

        # Genes commonly discussed in the context of skin cancer.
        self.cancer_proteins = [
            'BRAF', 'TP53', 'NRAS', 'CDKN2A', 'PTEN',
            'KIT', 'NF1', 'MAP2K1', 'TERT', 'ARID2'
        ]

    def _load_cache(self):
        """Return the cached mapping, or an empty dict if missing or unreadable."""
        if not self.cache_file.exists():
            return {}
        try:
            with open(self.cache_file, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except (OSError, json.JSONDecodeError) as exc:
            # A truncated or corrupt cache should not prevent start-up; the
            # entries are simply fetched again.
            print(f"Ignoring unreadable UniProt cache {self.cache_file}: {exc}")
            return {}
        return loaded if isinstance(loaded, dict) else {}

    def _save_cache(self):
        """Write the in-memory cache to disk."""
        with open(self.cache_file, 'w', encoding='utf-8') as f:
            json.dump(self.cache, f, indent=2)

    @staticmethod
    def _extract_function(result: Dict) -> str:
        """Return the text of the first FUNCTION comment, or a placeholder.

        Tolerates a missing or empty ``comments`` / ``texts`` list, and skips
        comments of other types so unrelated text is not labelled as function.
        """
        for comment in result.get('comments') or []:
            if comment.get('commentType') != 'FUNCTION':
                continue
            for text in comment.get('texts') or []:
                value = text.get('value')
                if value:
                    return value
        return 'No function info'

    @classmethod
    def _parse_result(cls, gene_name: str, result: Dict) -> Dict:
        """Reduce one UniProt search result to the fields this project uses."""
        description = result.get('proteinDescription') or {}
        recommended = description.get('recommendedName') or {}
        full_name = recommended.get('fullName') or {}
        return {
            'gene': gene_name,
            'protein_name': full_name.get('value', 'Unknown'),
            'function': cls._extract_function(result),
            'accession': result.get('primaryAccession', ''),
            'sequence_length': (result.get('sequence') or {}).get('length', 0),
        }

    def fetch_protein_info(self, gene_name: str) -> Optional[Dict]:
        """Return protein info for a human gene, using the cache when possible.

        Args:
            gene_name: Gene symbol, e.g. ``"BRAF"``.

        Returns:
            A dict with keys ``gene``, ``protein_name``, ``function``,
            ``accession`` and ``sequence_length``; ``None`` if the lookup
            failed or returned no results. Network and parsing errors are
            printed, never raised.
        """
        if gene_name in self.cache:
            return self.cache[gene_name]

        try:
            # Pass the query via `params` so the gene name is URL-encoded.
            # organism_id 9606 restricts the search to human entries.
            response = requests.get(
                self.SEARCH_URL,
                params={
                    'query': f"gene:{gene_name} AND organism_id:9606",
                    'format': 'json',
                    'size': 1,
                },
                timeout=10,
            )
            if response.status_code != 200:
                print(f"UniProt returned HTTP {response.status_code} for {gene_name}")
                return None

            results = response.json().get('results')
            if not results:
                return None

            info = self._parse_result(gene_name, results[0])
            self.cache[gene_name] = info
            self._save_cache()
            return info
        except Exception as e:
            # Deliberately broad: this is a best-effort enrichment step and
            # callers rely on it returning None rather than raising.
            print(f"Error fetching {gene_name}: {e}")

        return None

    def preload_cancer_proteins(self):
        """Fetch every gene in ``cancer_proteins`` that is not cached yet."""
        print("Preloading cancer protein database...")
        for protein in self.cancer_proteins:
            if protein not in self.cache:
                print(f"  Fetching {protein}...")
                self.fetch_protein_info(protein)
        print(f"Cached {len(self.cache)} proteins")


**RAG PIPELINE SETUP: EMBEDDING, RETRIEVER, AUGMENTATIONS**

In [None]:
class CancerRAGRetriever:
    """Dense retriever over the filtered records, backed by a FAISS index.

    Documents are embedded with a SentenceTransformer model, L2-normalised and
    stored in an inner-product index, so scores are cosine similarities. The
    index and the document list are persisted in ``cache_dir``.
    """

    def __init__(self, cache_dir="./data"):
        """Load the embedding model and set up the on-disk locations.

        Args:
            cache_dir: Directory holding ``faiss_index.bin`` and ``documents.pkl``.
        """
        self.cache_dir = Path(cache_dir)
        # Make sure the directory exists so build_index() can write to it even
        # when this class is used without the other components.
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.index_file = self.cache_dir / "faiss_index.bin"
        self.docs_file = self.cache_dir / "documents.pkl"

        # A lightweight embedding model keeps memory use low.
        from sentence_transformers import SentenceTransformer
        print("Loading embedding model...")
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
        print("Embedding model loaded successfully")

        self.index = None
        self.documents = []

    def build_index(self, data: List[Dict]):
        """Load a previously saved index, or build and save one from *data*.

        Args:
            data: Records with keys ``instruction``, ``input``, ``output``, ``id``.

        If both cache files exist they are loaded and *data* is ignored. If no
        cache exists and *data* is empty, no index is built and ``retrieve``
        will return no results.
        """
        import faiss

        if self.index_file.exists() and self.docs_file.exists():
            # NOTE(review): the cached index is reused even if *data* changed;
            # delete the two cache files to force a rebuild.
            print("Loading existing FAISS index...")
            self.index = faiss.read_index(str(self.index_file))
            # SECURITY: pickle executes code on load - only safe because this
            # file is written locally by this class. Never load a documents.pkl
            # obtained from an untrusted source.
            with open(self.docs_file, 'rb') as f:
                self.documents = pickle.load(f)
            print(f"Loaded index with {len(self.documents)} documents")
            return

        if not data:
            # Nothing to embed (e.g. the dataset download failed); an empty
            # embedding matrix has no second dimension to size the index with.
            print("No documents available - skipping FAISS index build")
            self.index = None
            self.documents = []
            return

        print("Building FAISS index from scratch...")

        self.documents = []
        texts_to_embed = []

        for item in data:
            doc_text = f"{item['instruction']} {item['input']} {item['output']}"
            self.documents.append({
                'text': doc_text,
                'instruction': item['instruction'],
                'output': item['output'],
                'id': item['id']
            })
            texts_to_embed.append(doc_text)

        print("Generating embeddings...")
        # FAISS requires C-contiguous float32 input.
        embeddings = np.ascontiguousarray(
            self.embedder.encode(
                texts_to_embed,
                show_progress_bar=True,
                batch_size=32
            ),
            dtype=np.float32,
        )

        # Inner product on L2-normalised vectors equals cosine similarity.
        dimension = embeddings.shape[1]
        self.index = faiss.IndexFlatIP(dimension)
        faiss.normalize_L2(embeddings)
        self.index.add(embeddings)

        # Persist the index (FAISS binary) and the documents (pickle).
        faiss.write_index(self.index, str(self.index_file))
        with open(self.docs_file, 'wb') as f:
            pickle.dump(self.documents, f)

        print(f"The index has been built and saved with {len(self.documents)} documents")

    def _collect_hits(self, scores, indices) -> List[Dict]:
        """Pair scores with documents, dropping ids that are out of range.

        FAISS pads the result with id ``-1`` when fewer than ``top_k`` vectors
        are available; without the lower bound that id would silently select
        the last document through negative indexing.
        """
        hits = []
        for score, idx in zip(scores, indices):
            position = int(idx)
            if 0 <= position < len(self.documents):
                hits.append({
                    'score': float(score),
                    'document': self.documents[position]
                })
        return hits

    def retrieve(self, query: str, top_k: int = 3) -> List[Dict]:
        """Return up to *top_k* documents most similar to *query*.

        Args:
            query: Free-text question.
            top_k: Maximum number of results.

        Returns:
            A list of ``{'score': float, 'document': dict}`` entries in the
            order returned by the index; empty if no index has been built,
            there are no documents, or ``top_k`` is not positive.
        """
        if self.index is None or not self.documents or top_k <= 0:
            return []

        import faiss

        query_embedding = np.ascontiguousarray(
            self.embedder.encode([query]), dtype=np.float32
        )
        faiss.normalize_L2(query_embedding)

        scores, indices = self.index.search(query_embedding, top_k)
        return self._collect_hits(scores[0], indices[0])

**LLM GENERATION IN THE QUANTIZED MODEL**

In [None]:
class QuantizedLLM:
    """Lazily loaded causal LLM, quantized to 4 bits to reduce memory use."""

    def __init__(self, model_name="unsloth/Llama-3.2-1B-Instruct"):
        """Record which model to use; nothing is loaded until first needed.

        Args:
            model_name: Hugging Face model id. The default is a small
                instruction-tuned model chosen for its low memory footprint
                (author's estimate: about 1 GB of VRAM - not verified here).
        """
        self.model_name = model_name
        self.model = None
        self.tokenizer = None

    def load_model(self):
        """Load the tokenizer and the 4-bit quantized model (no-op if loaded)."""
        if self.model is not None:
            return

        print(f"Loading quantized LLM: {self.model_name}...")

        # Imported here so the heavy dependencies are only needed when a model
        # is actually loaded.
        from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
        import torch

        # 4-bit NF4 quantization with double quantization; matmuls run in fp16.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
        )

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        # SECURITY NOTE(review): trust_remote_code=True lets the model
        # repository run arbitrary Python on this machine. Confirm whether the
        # chosen model actually needs it, and only point model_name at
        # repositories you trust.
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True
        )

        print("Model loaded successfully")

    def generate(self, prompt: str, max_length: int = 512) -> str:
        """Generate a completion for *prompt* and return only the new text.

        Args:
            prompt: Full prompt text.
            max_length: Maximum number of NEW tokens to generate (passed as
                ``max_new_tokens``).

        Returns:
            The generated continuation with the prompt removed and surrounding
            whitespace stripped. Sampling is enabled, so output is not
            deterministic.
        """
        if self.model is None:
            self.load_model()

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        outputs = self.model.generate(
            **inputs,
            max_new_tokens=max_length,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            pad_token_id=self.tokenizer.eos_token_id
        )

        # The returned sequence starts with the prompt tokens. Drop them by
        # token count rather than by character count: the decoded text is not
        # guaranteed to reproduce the prompt byte-for-byte (special tokens are
        # skipped and whitespace may be normalised), so slicing the decoded
        # string by len(prompt) can cut the answer or leak prompt text.
        prompt_token_count = inputs["input_ids"].shape[-1]
        generated_tokens = outputs[0][prompt_token_count:]

        response = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
        return response.strip()

**MAIN RAG PIPELINE: STRUCTURED PROMPT**

In [None]:
class CancerMutationRAG:
    """End-to-end RAG pipeline: retrieve, enrich with UniProt data, generate."""

    # Gene symbols recognised in user questions. Includes every gene preloaded
    # by UniProtCache (ARID2 was previously missing) plus EGFR and KRAS.
    KNOWN_PROTEINS = (
        'BRAF', 'TP53', 'NRAS', 'CDKN2A', 'PTEN', 'KIT',
        'NF1', 'MAP2K1', 'EGFR', 'KRAS', 'TERT', 'ARID2',
    )

    def __init__(self):
        """Construct all components; heavy set-up is deferred to initialize()."""
        self.data_filter = MolInstructionsFilter()
        self.uniprot = UniProtCache()
        self.retriever = CancerRAGRetriever()
        self.llm = QuantizedLLM()
        self.initialized = False

    def initialize(self):
        """Prepare data, protein cache and retrieval index (idempotent)."""
        if self.initialized:
            return

        print("=" * 70)
        print("INITIALIZING CANCER MUTATION RAG SYSTEM")
        print("=" * 70)

        # Download / load the filtered dataset.
        filtered_data = self.data_filter.download_and_filter(max_samples=2000)

        # Warm the UniProt cache for the common skin-cancer genes.
        self.uniprot.preload_cancer_proteins()

        # Build or load the retrieval index.
        self.retriever.build_index(filtered_data)

        # The LLM itself is loaded lazily on the first generate() call.
        # Message corrected: this line runs after set-up has completed.
        print("\nSystem initialized!")
        print("=" * 70)

        self.initialized = True

    def query(self, question: str) -> Dict:
        """Answer *question* using retrieved documents and UniProt data.

        Args:
            question: Free-text user question.

        Returns:
            A dict with keys ``question``, ``answer``, ``retrieved_docs``,
            ``protein_info`` and ``context``.
        """
        if not self.initialized:
            self.initialize()

        # Was "\Processing" - an invalid escape sequence; a newline was intended.
        print(f"\nProcessing query: {question}")

        print("  → Retrieving relevant documents...")
        retrieved_docs = self.retriever.retrieve(question, top_k=3)

        # Look up every recognised gene mentioned in the question.
        print(" Fetching protein information...")
        proteins_mentioned = self._extract_proteins(question)
        protein_info = []
        for protein in proteins_mentioned:
            info = self.uniprot.fetch_protein_info(protein)
            if info:
                protein_info.append(info)

        context = self._build_context(retrieved_docs, protein_info)

        print("  → Generating answer with LLM...")
        prompt = self._build_prompt(question, context)
        answer = self.llm.generate(prompt, max_length=300)

        return {
            'question': question,
            'answer': answer,
            'retrieved_docs': retrieved_docs,
            'protein_info': protein_info,
            'context': context
        }

    def _extract_proteins(self, text: str) -> List[str]:
        """Return the known gene symbols mentioned in *text* (case-insensitive).

        A symbol only counts when it stands alone or is immediately followed
        by a mutation code such as ``V600E``. Plain substring matching would
        report KIT for "kitchen" and TERT for "tertiary".

        Returns:
            Matching symbols in ``KNOWN_PROTEINS`` order.
        """
        import re

        text_upper = text.upper()
        found = []
        for gene in self.KNOWN_PROTEINS:
            # Not preceded by a letter/digit; followed by end-of-text, a
            # non-alphanumeric character, or letter+digit (e.g. "BRAFV600E").
            pattern = rf"(?<![A-Z0-9]){re.escape(gene)}(?=[A-Z]\d|[^A-Z0-9]|$)"
            if re.search(pattern, text_upper):
                found.append(gene)
        return found

    @staticmethod
    def _truncate(text: str, limit: int) -> str:
        """Cut *text* to *limit* characters, adding '...' only if it was cut."""
        if len(text) <= limit:
            return text
        return f"{text[:limit]}..."

    def _build_context(self, docs: List[Dict], proteins: List[Dict]) -> str:
        """Format retrieved documents and protein records as prompt context.

        Only the two highest-ranked documents are included, each limited to
        300 characters; protein function text is limited to 200 characters.

        Returns:
            The context block, or an empty string when both inputs are empty.
        """
        context_parts = []

        if docs:
            context_parts.append("Relevant Information:")
            for i, doc in enumerate(docs[:2], 1):
                snippet = self._truncate(doc['document']['output'], 300)
                context_parts.append(f"{i}. {snippet}")

        if proteins:
            context_parts.append("\nProtein Database Information:")
            for protein in proteins:
                function_text = self._truncate(protein['function'], 200)
                context_parts.append(
                    f"- {protein['gene']}: {protein['protein_name']}\n"
                    f"  Function: {function_text}"
                )

        return "\n".join(context_parts)

    def _build_prompt(self, question: str, context: str) -> str:
        """Assemble the instruction, context and question into one prompt."""
        return f"""You are an expert in cancer biology and protein mutations. Answer the question based on the provided context.

Context:
{context}

Question: {question}

Answer (be concise and scientific):"""


In [None]:
# Instantiate the pipeline; the constructor creates the data filter, UniProt
# cache, retriever and LLM wrapper objects.
res = CancerMutationRAG()
# NOTE(review): this prints the bound-method object, not a rendered prompt.
# To see the prompt text, call res._build_prompt(question, context) instead.
print(res._build_prompt)

In [None]:
!apt-get install git

In [None]:
!git config --global user.email "akintoyesylvester1996@gmail.com"
!git config --global user.name "Akintoyefelix"

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!ls "/content/drive/MyDrive/Colab Notebooks"

In [None]:
!git clone https://github.com/Akintoyefelix/cancer_mutation

In [None]:
!cp /content/drive/MyDrive/skin_cancer_mutation.ipynb /content/cancer_mutation

In [None]:
%cd /content/cancer_mutation

In [None]:
!git add .

In [None]:
!git commit -m "sec_commit skincancer mutattion"

In [None]:
!git remote set-url origin https://Akintoyefelix:$token@github.com/Akintoyefelix/cancer_mutation.git

In [None]:
!git push origin main

In [None]:
!git push https://Akintoyefelix:token@github.com/Akintoyefelix/cancer_mutation.git

In [None]:
!grep -R "ghp_" -n

In [None]:
!git commit -m "Commiting Skin Cancer Q%A system "

In [None]:
!grep -n "token" skin_cancer_mutation.ipynb