In [1]:
# ==============================================================================
# CELL 1: Install ALL Dependencies (Run Once)
# ==============================================================================

# Progress message so the user knows the install step has started.
print("📦 Installing all dependencies...")

# VLM + RAG dependencies
# NOTE(review): none of the packages in this cell are version-pinned, so every
# run installs whatever pip resolves at that moment. Only bitsandbytes is
# force-upgraded (-U); -q just reduces pip's console output.
get_ipython().system('pip install -q -U bitsandbytes')
get_ipython().system('pip install -q transformers accelerate peft pillow sentencepiece protobuf')
get_ipython().system('pip install -q langchain langchain-community faiss-cpu sentence-transformers datasets pandas')

# OpenCV dependencies (headless OpenCV build plus the image-analysis helpers
# imported later by the feature-extraction cell)
get_ipython().system('pip install -q opencv-python-headless imutils scikit-image webcolors scikit-learn')

# Gradio (imported by the UI cell)
get_ipython().system('pip install -q gradio')

# These messages are printed unconditionally: system() results are not
# checked, so a failed pip command does not stop this cell.
print("✅ All dependencies installed!")
print("⏭️  Proceed to CELL 2")

📦 Installing all dependencies...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m100.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m69.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m476.1/476.1 kB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is t

In [2]:
# ==============================================================================
# CELL 2: HuggingFace Login (Run Once)
# ==============================================================================

from huggingface_hub import login
from google.colab import userdata
import getpass

print("⚠️ Please provide your Hugging Face Token (must have access to Llama 3.2 models)")
try:
    # Preferred path: read the token from the Colab secret store.
    token = userdata.get('HF_TOKEN')
    if not token:
        # An empty secret would otherwise be reported as a successful
        # "Colab secrets" login; route it to the manual prompt instead.
        raise ValueError("HF_TOKEN secret is empty")
    login(token=token)
    print("✅ Logged in using Colab secrets")
except Exception:
    # Fallback: any failure above (missing/denied secret, rejected token)
    # leads to an interactive prompt. `Exception` rather than a bare `except`
    # so that KeyboardInterrupt / SystemExit still stop the cell.
    HF_TOKEN = getpass.getpass("Enter your HuggingFace token: ")
    login(token=HF_TOKEN)
    print("✅ Logged in successfully")

print("⏭️  Proceed to CELL 3")

⚠️ Please provide your Hugging Face Token (must have access to Llama 3.2 models)
Enter your HuggingFace token: ··········
✅ Logged in successfully
⏭️  Proceed to CELL 3


In [3]:
# ==============================================================================
# CELL 3: Build RAG System (Run Once - Stays in Memory)
# ==============================================================================

print("🗃️ Preparing Corpus & Building Index...")

import pandas as pd
import re
from datasets import load_dataset
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# Configuration
# Keyword lists that decide which abstracts are kept for the RAG corpus.
SKIN_KEYWORDS = [
    'melanoma', 'skin cancer', 'basal cell carcinoma', 'squamous cell carcinoma',
    'dermatology', 'dermoscopy', 'cutaneous', 'skin lesion', 'nevus', 'mole',
]
CARDIO_KEYWORDS = [
    'cardiovascular', 'cardiac', 'heart', 'myocardial', 'coronary', 'stroke',
    'hypertension', 'infarction',
]
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"


def _mentions_keyword(lowered_text, keyword):
    """Return True when `keyword` occurs in the already lower-cased text.

    Multi-word keywords are matched as plain substrings; single-word
    keywords must match as whole words (regex word boundaries).
    """
    if ' ' in keyword:
        return keyword in lowered_text
    return re.search(r'\b' + re.escape(keyword) + r'\b', lowered_text) is not None


def classify_abstract(text):
    """Label an abstract by topic from keyword hits.

    Returns 'both', 'skin_cancer', 'cardio', or None when `text` is not a
    string or mentions no keyword from either list.
    """
    if not isinstance(text, str):
        return None

    lowered = text.lower()
    skin_hit = any(_mentions_keyword(lowered, kw) for kw in SKIN_KEYWORDS)
    cardio_hit = any(_mentions_keyword(lowered, kw) for kw in CARDIO_KEYWORDS)

    # (skin, cardio) -> label; the (False, False) case falls through to None.
    labels = {
        (True, True): 'both',
        (True, False): 'skin_cancer',
        (False, True): 'cardio',
    }
    return labels.get((skin_hit, cardio_hit))

print("📥 Loading & Processing Dataset...")
# Download the dataset and merge its train and test splits into one frame.
dataset = load_dataset("TimSchopf/medical_abstracts")
df = pd.concat([pd.DataFrame(dataset['train']), pd.DataFrame(dataset['test'])])
# Tag every abstract with a topic; classify_abstract returns None for
# abstracts that match neither keyword list, and those rows are dropped.
df['topic'] = df['medical_abstract'].apply(classify_abstract)
df = df[df['topic'].notna()].copy()
print(f"✅ Filtered to {len(df)} relevant abstracts")

# Create Documents: one Document per remaining abstract, carrying its topic
# label as metadata.
docs = [Document(page_content=row['medical_abstract'], metadata={'topic': row['topic']})
        for _, row in df.iterrows()]

# Build Index: split the documents into overlapping chunks, embed the chunks
# on CPU with EMBEDDING_MODEL, and store them in an in-memory FAISS index.
# `vectorstore` is the global that the Gradio cell queries later.
print("🏗️ Building FAISS Index (this may take a minute)...")
# NOTE(review): chunk_size / chunk_overlap are presumably measured in
# characters (the splitter's default length function) — confirm.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = splitter.split_documents(docs)
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL, model_kwargs={'device': 'cpu'})
vectorstore = FAISS.from_documents(chunks, embeddings)
print("✅ RAG system ready!")
print("⏭️  Proceed to CELL 4")


🗃️ Preparing Corpus & Building Index...
📥 Loading & Processing Dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/7.67M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11550 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2888 [00:00<?, ? examples/s]

✅ Filtered to 4009 relevant abstracts
🏗️ Building FAISS Index (this may take a minute)...


  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL, model_kwargs={'device': 'cpu'})


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ RAG system ready!
⏭️  Proceed to CELL 4


In [4]:
# ==============================================================================
# CELL 4: Load VLM Model (Run Once - Stays in Memory)
# ==============================================================================

print("🔄 Loading VLM Model...")

import torch
from transformers import MllamaForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from peft import PeftModel
import os

# Hub identifiers: the base vision-language model and the adapter that is
# layered on top of it below.
base_model = "meta-llama/Llama-3.2-11B-Vision-Instruct"
adapter_path = "DermaVLM/DermatoLLama-50k"

# 4-bit NF4 quantization with double quantization and fp16 compute;
# modules named "vision_model" are listed as skipped.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    llm_int8_skip_modules=["vision_model"]
)

# NOTE(review): the recorded output of this cell contains
# "`torch_dtype` is deprecated! Use `dtype` instead!" — consider switching the
# keyword once the minimum supported transformers version is confirmed.
model = MllamaForConditionalGeneration.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map="auto",
    torch_dtype=torch.float16
)
# The processor is loaded from the base model id, not from the adapter.
processor = AutoProcessor.from_pretrained(base_model)
# Rebind `model` to the PEFT-wrapped model; later cells use this global.
model = PeftModel.from_pretrained(model, adapter_path)
print("✅ VLM Model loaded!")
print("⏭️  Proceed to CELL 5")


`torch_dtype` is deprecated! Use `dtype` instead!


🔄 Loading VLM Model...


config.json:   0%|          | 0.00/5.07k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/89.4k [00:00<?, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/5.09k [00:00<?, ?B/s]

adapter_config.json: 0.00B [00:00, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/52.5M [00:00<?, ?B/s]

✅ VLM Model loaded!
⏭️  Proceed to CELL 5


In [5]:
# ==============================================================================
# CELL 5: Define OpenCV Feature Extraction Functions (Run Once)
# ==============================================================================

print("🔧 Setting up OpenCV feature extraction pipeline...")

import cv2
import numpy as np
import imutils
from skimage import morphology
from sklearn.cluster import KMeans
import webcolors
from PIL import Image
import matplotlib.pyplot as plt

# ==========================================
# Utility Functions
# ==========================================

def load_and_preprocess_pil(pil_image, max_dim=1024):
    """Turn a PIL image into a 3-channel RGB array no wider than `max_dim`.

    Grayscale (2-D) and RGBA (4-channel) inputs are converted to RGB first.
    Only a target width is passed to the resize call; images already narrower
    than `max_dim` keep their width.
    """
    pixels = np.array(pil_image)

    if pixels.ndim == 2:
        # Single-channel input
        pixels = cv2.cvtColor(pixels, cv2.COLOR_GRAY2RGB)
    elif pixels.shape[2] == 4:
        # Drop the alpha channel
        pixels = cv2.cvtColor(pixels, cv2.COLOR_RGBA2RGB)

    target_width = min(max_dim, pixels.shape[1])
    return imutils.resize(pixels, width=target_width)

def remove_hairs(img_rgb):
    """Inpaint thin dark structures (hair) out of an RGB image.

    A 9x9 black-hat transform on the grayscale image highlights the
    structures, a threshold of 10 turns that into a binary mask, the mask is
    dilated once, and the masked pixels are filled with Telea inpainting.
    Returns an RGB image.
    """
    grayscale = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2GRAY)
    blackhat = cv2.morphologyEx(
        grayscale,
        cv2.MORPH_BLACKHAT,
        cv2.getStructuringElement(cv2.MORPH_RECT, (9, 9)),
    )

    hair_mask = cv2.threshold(blackhat, 10, 255, cv2.THRESH_BINARY)[1]
    hair_mask = cv2.dilate(hair_mask, None, iterations=1)

    # Inpainting is performed on a BGR copy and converted back afterwards.
    img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)
    restored_bgr = cv2.inpaint(img_bgr, hair_mask, 3, cv2.INPAINT_TELEA)
    return cv2.cvtColor(restored_bgr, cv2.COLOR_BGR2RGB)

def segment_lesion(img_rgb, k=2, min_size=500):
    """Build a binary lesion mask by k-means clustering in LAB space.

    The cluster with the lowest mean L (lightness) channel is taken as the
    lesion. Connected regions smaller than `min_size` are removed and holes up
    to `min_size` are filled. Returns a uint8 mask with values 0 and 255.
    """
    lab_img = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2LAB)
    samples = lab_img.reshape((-1, 3)).astype(np.float32)
    cluster_map = (
        KMeans(n_clusters=k, random_state=42)
        .fit(samples)
        .labels_
        .reshape(img_rgb.shape[:2])
    )

    # Mean lightness per cluster; an empty cluster gets a sentinel (999) so it
    # can never be chosen as the darkest one.
    lightness = lab_img[:, :, 0]
    mean_lightness = []
    for cluster_id in range(k):
        members = cluster_map == cluster_id
        mean_lightness.append(lightness[members].mean() if members.any() else 999)

    darkest = int(np.argmin(mean_lightness))
    lesion = (cluster_map == darkest).astype(bool)
    lesion = morphology.remove_small_objects(lesion, min_size=min_size)
    lesion = morphology.remove_small_holes(lesion, area_threshold=min_size)
    return lesion.astype('uint8') * 255

# ==========================================
# Shape Analysis
# ==========================================

def compute_shape_features(mask):
    """Extract shape features from the largest external contour of `mask`.

    Args:
        mask: single-channel uint8 mask; non-zero pixels belong to the lesion.

    Returns:
        {} when no contour is found, otherwise a dict with
        'area_px', 'perimeter_px', 'circularity' (4*pi*area / perimeter^2),
        'diameter_px' (longer side of the bounding box) and 'asymmetry'.

        'asymmetry' is the summed absolute difference between the left half of
        the lesion and the mirrored right half, after rotating the mask so the
        contour's major axis is horizontal, divided by the number of pixels in
        the cropped lesion. The score is in raw mask units, so it scales with
        the mask's foreground value.
    """
    cnts = imutils.grab_contours(
        cv2.findContours(mask.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    )
    if not cnts:
        return {}

    c = max(cnts, key=cv2.contourArea)
    area = cv2.contourArea(c)
    perimeter = cv2.arcLength(c, True)
    # The small epsilon keeps a zero-length contour from dividing by zero.
    circularity = 4 * np.pi * area / (perimeter * perimeter + 1e-8)
    _, _, w, h = cv2.boundingRect(c)
    diameter_px = max(w, h)

    # Orientation of the contour points via PCA on their coordinates.
    ys = c[:, :, 1].flatten()
    xs = c[:, :, 0].flatten()
    cx, cy = xs.mean(), ys.mean()
    cov = np.cov(xs - cx, ys - cy)

    angle = 0.0
    # A single-point contour yields a NaN covariance; keep angle = 0 then.
    if np.all(np.isfinite(cov)):
        try:
            # The covariance matrix is symmetric, so eigh applies. Pick the
            # eigenvector of the LARGEST eigenvalue explicitly: np.linalg.eig
            # gives no ordering guarantee, which made the axis arbitrary.
            eigvals, eigvecs = np.linalg.eigh(cov)
            major_axis = eigvecs[:, int(np.argmax(eigvals))]
            angle = float(np.degrees(np.arctan2(major_axis[1], major_axis[0])))
        except np.linalg.LinAlgError:
            angle = 0.0

    # Rotate the mask about the contour centroid so the major axis is
    # horizontal, then compare the left half with the mirrored right half.
    M = cv2.getRotationMatrix2D((cx, cy), angle, 1.0)
    h_mask, w_mask = mask.shape
    rotated = cv2.warpAffine(mask, M, (w_mask, h_mask),
                             flags=cv2.INTER_NEAREST, borderValue=0)

    ys_nonzero, xs_nonzero = np.where(rotated > 0)
    if len(xs_nonzero) == 0:
        asym_score = 0.0
    else:
        minx, maxx = xs_nonzero.min(), xs_nonzero.max()
        miny, maxy = ys_nonzero.min(), ys_nonzero.max()
        crop = rotated[miny:maxy + 1, minx:maxx + 1]
        mid = crop.shape[1] // 2
        # Both halves are exactly `mid` columns wide; for an odd width the
        # centre column is left out of the comparison.
        left = crop[:, :mid]
        right = crop[:, -mid:] if mid > 0 else np.zeros_like(left)

        # Cast to int so the uint8 subtraction cannot wrap around.
        diff = np.sum(np.abs(left.astype(int) - np.fliplr(right).astype(int)))
        asym_score = diff / max(1, crop.size)

    return {
        'area_px': float(area),
        'perimeter_px': float(perimeter),
        'circularity': float(circularity),
        'diameter_px': int(diameter_px),
        'asymmetry': float(asym_score)
    }

# ==========================================
# Color Analysis
# ==========================================

def get_css3_names_to_hex():
    """Return a mapping of CSS3 color names to hex strings.

    Uses `webcolors.CSS3_NAMES_TO_HEX` when the installed webcolors exposes
    it; otherwise rebuilds the mapping from `webcolors.names('css3')`,
    skipping any name that `name_to_hex` rejects.
    """
    try:
        return webcolors.CSS3_NAMES_TO_HEX
    except AttributeError:
        # Attribute not available in this webcolors version: rebuild it.
        pass

    mapping = {}
    for css_name in webcolors.names('css3'):
        try:
            hex_value = webcolors.name_to_hex(css_name, spec='css3')
        except ValueError:
            continue
        mapping[css_name] = hex_value
    return mapping

CSS3_NAMES_TO_HEX = get_css3_names_to_hex()

def rgb_to_name(rgb_triplet):
    """Name an RGB triplet with the nearest CSS3 color.

    An exact CSS3 match is returned directly. Otherwise the name with the
    smallest squared Euclidean distance in RGB space is returned (the first
    one on ties), or None when the module-level name table is empty.
    """
    try:
        exact = tuple(int(channel) for channel in rgb_triplet)
        return webcolors.rgb_to_name(exact, spec='css3')
    except ValueError:
        r, g, b = rgb_triplet

        def squared_distance(css_name):
            rn, gn, bn = webcolors.hex_to_rgb(CSS3_NAMES_TO_HEX[css_name])
            return (r - rn) ** 2 + (g - gn) ** 2 + (b - bn) ** 2

        return min(CSS3_NAMES_TO_HEX, key=squared_distance, default=None)

def classify_dermatological_color(rgb):
    """Map an (r, g, b) triplet to a coarse dermatological color term.

    Rules are tried in order and the first match wins:
    white/depigmented, red/erythematous, pink, light brown/tan or
    medium brown, black/very dark brown, blue-gray (regression); anything
    else is reported as plain "brown".

    Args:
        rgb: iterable of three numeric channel values (r, g, b).

    Returns:
        One of the color-term strings listed above.
    """
    r, g, b = rgb
    if r > 200 and g > 180 and b > 180:
        return "white/depigmented"
    if r > 150 and g < 100 and b < 100:
        return "red/erythematous"
    if r > 180 and g > 120 and b > 120:
        return "pink"
    if r > 100 and g > 60:
        # The previous version had an `elif r > 100 and g > 60` here that
        # repeated this enclosing condition, so its "dark brown" branch could
        # never run. The dead branch is removed; results are unchanged.
        if r > 150 and g > 100:
            return "light brown/tan"
        return "medium brown"
    if r < 60 and g < 60 and b < 60:
        return "black/very dark brown"
    if b > r and b > g and r < 100:
        return "blue-gray (regression)"
    return "brown"

def analyze_colors(img_rgb, mask, n_colors=4):
    """Summarise the dominant colors of the masked region with k-means.

    Returns a list of dicts ('rgb', 'name', 'pct') ordered from the most to
    the least frequent cluster, or [] when the mask selects no pixels. The
    cluster count is capped at 6 and at the number of selected pixels.
    """
    lesion_pixels = img_rgb[mask > 0].reshape(-1, 3)
    pixel_count = lesion_pixels.shape[0]
    if pixel_count == 0:
        return []

    kmeans = KMeans(
        n_clusters=min(n_colors, 6, pixel_count),
        random_state=42,
    ).fit(lesion_pixels)
    assignments = kmeans.labels_
    center_rgb = kmeans.cluster_centers_.astype(int)
    sizes = np.bincount(assignments)

    def describe(cluster_id):
        # Plain Python ints so the tuple is safe to format and compare.
        rgb = tuple(int(v) for v in center_rgb[cluster_id].tolist())
        return {
            'rgb': rgb,
            'name': rgb_to_name(rgb),
            'pct': float(100.0 * sizes[cluster_id] / assignments.size),
        }

    # Largest cluster first.
    return [describe(cluster_id) for cluster_id in np.argsort(-sizes)]

def analyze_color_distribution(img_rgb, mask, centers, labels):
    """Split each color cluster's pixels into central vs peripheral shares.

    A pixel is "central" when its distance to the centroid of the masked
    region is less than half of the largest such distance, and "peripheral"
    otherwise.

    Args:
        img_rgb: unused; kept for interface compatibility.
        mask: 2-D array; non-zero pixels belong to the lesion.
        centers: one entry per cluster; only the number of entries is used.
        labels: cluster index for every masked pixel, in the same order that
            boolean indexing with `mask > 0` yields them.

    Returns:
        {cluster_index: {'central_pct': float, 'peripheral_pct': float}} for
        every cluster that owns at least one pixel; {} for an empty mask.
    """
    ys, xs = np.where(mask > 0)
    if ys.size == 0:
        # Previously this path crashed on max() of an empty array.
        return {}

    labels = np.asarray(labels)
    distances = np.sqrt((ys - ys.mean()) ** 2 + (xs - xs.mean()) ** 2)
    half_radius = distances.max() * 0.5

    distribution = {}
    for i, _ in enumerate(centers):
        color_distances = distances[labels == i]
        total = int(color_distances.size)
        if total == 0:
            continue
        central_count = int(np.sum(color_distances < half_radius))
        peripheral_count = total - central_count
        distribution[i] = {
            'central_pct': 100 * central_count / total,
            'peripheral_pct': 100 * peripheral_count / total
        }
    return distribution

# ==========================================
# Border & Texture Analysis
# ==========================================

def assess_border_quality(mask):
    """Assess border irregularity and definition of the largest contour.

    Args:
        mask: single-channel uint8 mask; non-zero pixels belong to the lesion.

    Returns:
        {} when no contour with a positive perimeter exists, otherwise a dict
        with 'definition' ("well-defined" / "poorly-defined"),
        'regularity' ("regular" / "irregular" / "highly irregular/notched"),
        'irregularity_score' (polygon-approximation vertices per 1000 px of
        perimeter) and 'border_width' (area of the dilate-minus-erode band
        divided by the perimeter).
    """
    # grab_contours copes with both the 2-tuple and 3-tuple return shapes of
    # cv2.findContours, matching how compute_shape_features reads contours.
    contours = imutils.grab_contours(
        cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    )
    if not contours:
        return {}

    contour = max(contours, key=cv2.contourArea)
    perimeter = cv2.arcLength(contour, True)
    if perimeter <= 0:
        # A degenerate (single-point) contour would divide by zero below;
        # report it the same way as "no contour".
        return {}

    epsilon = 0.02 * perimeter
    approx = cv2.approxPolyDP(contour, epsilon, True)
    irregularity_score = len(approx) / perimeter * 1000

    # Border definition (sharp vs blurry): pixels in the band between the
    # dilated and eroded mask, normalised by the perimeter.
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
    dilated = cv2.dilate(mask, kernel, iterations=1)
    eroded = cv2.erode(mask, kernel, iterations=1)
    border_zone = dilated - eroded
    border_width = np.sum(border_zone > 0) / perimeter

    border_definition = "well-defined" if border_width < 3 else "poorly-defined"
    if irregularity_score > 10:
        border_regularity = "highly irregular/notched"
    elif irregularity_score < 5:
        border_regularity = "regular"
    else:
        border_regularity = "irregular"

    return {
        'definition': border_definition,
        'regularity': border_regularity,
        'irregularity_score': float(irregularity_score),
        'border_width': float(border_width)
    }

def analyze_texture_patterns(img_rgb, mask):
    """Describe texture inside the masked region.

    Computes the grayscale intensity variance and the fraction of masked
    pixels that Canny (thresholds 50/150) marks as edges, then maps both
    numbers onto descriptive labels. Returns a dict with 'pattern',
    'surface', 'texture_variance' and 'edge_density'.
    """
    gray = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2GRAY)
    inside = mask > 0

    texture_variance = float(np.var(gray[inside]))

    edge_hits = cv2.Canny(gray, 50, 150)[inside]
    edge_density = float(np.sum(edge_hits > 0) / len(edge_hits))

    # Pigmentation pattern from edge density
    pattern = (
        "reticular/network pattern visible" if edge_density > 0.15
        else "irregular pigmentation pattern" if edge_density > 0.05
        else "homogeneous pigmentation"
    )

    # Surface label from intensity variance
    surface = (
        "highly textured/varied" if texture_variance > 1000
        else "moderately textured" if texture_variance > 500
        else "smooth/uniform"
    )

    return dict(
        pattern=pattern,
        surface=surface,
        texture_variance=texture_variance,
        edge_density=edge_density,
    )

# ==========================================
# ABCDE Risk Assessment
# ==========================================

def calculate_abcde_risk(colors, shape, border, size_mm):
    """Collect the ABCDE warning strings triggered by the measured features.

    Reads shape['asymmetry'], shape['circularity'],
    border['irregularity_score'], each color's 'rgb' entry and `size_mm`.
    Returns a list of messages in A, B, C, D order (empty when nothing
    triggers).
    """
    findings = []

    # A - Asymmetry
    if shape['asymmetry'] > 30:
        findings.append("A: Significant asymmetry detected")

    # B - Border: the circularity check only applies when the irregularity
    # score did not already trigger.
    if border['irregularity_score'] > 8:
        findings.append("B: Highly irregular border")
    elif shape['circularity'] < 0.3:
        findings.append("B: Irregular border shape")

    # C - Color count and specific pigment terms
    if len(colors) >= 4:
        findings.append("C: Multiple colors present (≥4 distinct tones)")

    derm_terms = [classify_dermatological_color(entry['rgb']) for entry in colors]
    pigment_checks = (
        ('black', "C: Black pigmentation present"),
        ('blue-gray', "C: Blue-gray areas (possible regression)"),
    )
    for needle, message in pigment_checks:
        if any(needle in term for term in derm_terms):
            findings.append(message)

    # D - Diameter (skipped when size_mm is missing or zero)
    if size_mm and size_mm > 6:
        findings.append(f"D: Diameter > 6mm ({size_mm:.1f}mm)")

    return findings

# ==========================================
# Main Description Generator
# ==========================================

def make_enhanced_description(colors, shape, texture, mask, border_info,
                              color_distribution, pixels_per_mm=None):
    """Render the extracted lesion features as a structured text report.

    Args:
        colors: list of dicts with 'rgb' and 'pct' keys, one per color zone.
        shape: dict providing 'diameter_px', 'area_px', 'circularity' and
            'asymmetry'.
        texture: dict providing 'pattern', 'surface' and 'texture_variance'.
        mask: not used by this function.
        border_info: dict providing 'regularity', 'definition' and
            'irregularity_score'.
        color_distribution: dict looked up by the position of each entry in
            `colors`; each value provides 'central_pct' and 'peripheral_pct'.
        pixels_per_mm: pixel-to-millimetre scale; None falls back to 10.

    Returns:
        A multi-line string with morphology, color, texture, border,
        asymmetry, ABCDE risk-factor and summary sections.

    Raises:
        KeyError: if any of the dict keys listed above is missing.
    """
    # Fallback scale when the caller gives none.
    # NOTE(review): 10 px/mm looks like a placeholder calibration — confirm.
    if pixels_per_mm is None:
        pixels_per_mm = 10

    diameter_mm = shape['diameter_px'] / pixels_per_mm
    area_mm2 = shape['area_px'] / (pixels_per_mm ** 2)

    # Color descriptions with spatial distribution
    color_descriptions = []
    for i, c in enumerate(colors):
        derm_color = classify_dermatological_color(c['rgb'])
        # The lookup key is the position of the color in `colors`; a missing
        # entry simply leaves the location suffix empty.
        dist = color_distribution.get(i, {})
        location = ""
        if dist:
            if dist['central_pct'] > 70:
                location = " (predominantly central)"
            elif dist['peripheral_pct'] > 70:
                location = " (predominantly peripheral)"
            else:
                location = " (mixed distribution)"
        color_descriptions.append(f"{derm_color} {c['pct']:.1f}%{location}")

    # Asymmetry description (thresholds apply to the raw asymmetry score)
    if shape['asymmetry'] > 40:
        asymmetry_desc = "markedly asymmetric"
    elif shape['asymmetry'] > 25:
        asymmetry_desc = "moderately asymmetric"
    else:
        asymmetry_desc = "relatively symmetric"

    # ABCDE risk factors
    risk_factors = calculate_abcde_risk(colors, shape, border_info, diameter_mm)

    # Build description
    description = f"""DERMATOLOGICAL LESION ANALYSIS:

MORPHOLOGY:
- Size: Approximately {diameter_mm:.1f}mm diameter, {area_mm2:.1f}mm² area
- Shape: {asymmetry_desc} with {border_info['regularity']} borders
- Border definition: {border_info['definition']}
- Overall circularity: {shape['circularity']:.3f} (1.0 = perfect circle)

COLOR ANALYSIS:
- Number of distinct color zones: {len(colors)}
- Color composition: {'; '.join(color_descriptions)}
- Color pattern: {'Variegated (multiple distinct colors)' if len(colors) >= 3 else 'Relatively uniform'}

SURFACE & TEXTURE:
- Pigmentation pattern: {texture['pattern']}
- Surface appearance: {texture['surface']}
- Texture complexity score: {texture['texture_variance']:.1f}

BORDER CHARACTERISTICS:
- Border regularity: {border_info['regularity']}
- Border definition: {border_info['definition']}
- Irregularity score: {border_info['irregularity_score']:.2f}

ASYMMETRY ASSESSMENT:
- Asymmetry score: {shape['asymmetry']:.1f}
- Classification: {asymmetry_desc}

ABCDE MELANOMA RISK FACTORS:
"""

    # One line per triggered risk factor, or a single "none detected" line.
    if risk_factors:
        for factor in risk_factors:
            description += f"⚠️  {factor}\n"
    else:
        description += "- No major ABCDE risk factors detected\n"

    # The closing sentence escalates once two or more risk factors are present.
    description += f"""
SUMMARY FOR CLINICAL CORRELATION:
This lesion presents with {len(colors)} distinct color zones, {border_info['regularity']} borders,
and {asymmetry_desc} morphology. {'Multiple concerning features warrant further evaluation.' if len(risk_factors) >= 2 else 'Clinical correlation recommended for definitive diagnosis.'}

NOTE: This is an automated image analysis. Clinical evaluation by a dermatologist
is essential for accurate diagnosis and management decisions.
"""
    return description

# ==========================================
# Main Pipeline Function
# ==========================================

def analyze_lesion_opencv(pil_image):
    """Run the complete OpenCV feature-extraction pipeline on one image.

    Steps: convert/resize the PIL image, remove hair artifacts, segment the
    lesion, extract shape / color / texture / border features, compute the
    spatial distribution of each color, and render the text description.

    Args:
        pil_image: input image (anything `load_and_preprocess_pil` accepts).

    Returns:
        (description, original RGB array, hair-removed RGB array, mask).
        On failure the first element is a string starting with
        "Error in OpenCV analysis:" and the other three are None; callers
        detect failure from that prefix.
    """
    try:
        # Convert PIL to OpenCV
        img = load_and_preprocess_pil(pil_image)

        # Preprocess: Remove hair artifacts
        img_nh = remove_hairs(img)

        # Segment lesion
        mask = segment_lesion(img_nh, k=3)

        # Shape and border features come back empty when no contour exists.
        # Report that explicitly instead of letting a later dict lookup fail
        # with an unhelpful message such as "'diameter_px'".
        shape = compute_shape_features(mask)
        border_info = assess_border_quality(mask)
        if not shape or not border_info:
            return ("Error in OpenCV analysis: no lesion region could be segmented",
                    None, None, None)

        colors = analyze_colors(img_nh, mask, n_colors=4)
        texture = analyze_texture_patterns(img_nh, mask)

        # Color spatial distribution
        pts = img_nh[mask > 0].reshape(-1, 3)
        n_clusters = min(len(colors), 4, pts.shape[0])
        if n_clusters > 0:
            km = KMeans(n_clusters=n_clusters, random_state=42).fit(pts)
            by_cluster = analyze_color_distribution(
                img_nh, mask, km.cluster_centers_, km.labels_
            )
            # `colors` is ordered by cluster size (largest first), while the
            # distribution is keyed by raw cluster label. The description
            # generator looks entries up by position in `colors`, so re-key by
            # size rank; otherwise locations attach to the wrong colors.
            # NOTE(review): assumes this fit reproduces the clustering inside
            # analyze_colors (same pixels, seed and cluster count) — confirm.
            rank_order = np.argsort(-np.bincount(km.labels_))
            color_distribution = {
                rank: by_cluster[int(cluster)]
                for rank, cluster in enumerate(rank_order)
                if int(cluster) in by_cluster
            }
        else:
            color_distribution = {}

        # Generate comprehensive description
        description = make_enhanced_description(
            colors, shape, texture, mask,
            border_info, color_distribution,
            pixels_per_mm=10
        )

        return description, img, img_nh, mask

    except Exception as e:
        # Deliberate best effort: the UI shows this string instead of crashing.
        return f"Error in OpenCV analysis: {str(e)}", None, None, None

print("✅ OpenCV feature extraction pipeline ready!")
print("⏭️  Proceed to CELL 6")


🔧 Setting up OpenCV feature extraction pipeline...
✅ OpenCV feature extraction pipeline ready!
⏭️  Proceed to CELL 6


In [None]:
# ==============================================================================
# CELL 6: Launch Gradio App (Main Interface)
# ==============================================================================

import gradio as gr
from PIL import Image
import io

print("🚀 Creating Gradio Interface...")

# Verify all systems loaded: touching each global raises NameError when the
# cell that defines it has not been run in this kernel session.
try:
    print("✅ Checking systems...")
    print(f"   Model: {type(model).__name__}")
    print(f"   Processor: {type(processor).__name__}")
    print(f"   Vectorstore: {type(vectorstore).__name__}")
    print("✅ All systems ready!")
except NameError as e:
    # Print a readable hint, then re-raise so the rest of this cell does not
    # go on without its prerequisites.
    print(f"❌ Error: {e}")
    print("⚠️  Please run CELLS 1-5 first!")
    raise

# Main analysis function
def _retrieve_literature(num_sources, analysis_mode_text):
    """Query the vector store and format the hits for the UI and for the prompt.

    NOTE: the query is a fixed list of dermatology keywords, so the retrieved
    abstracts do not depend on the uploaded image or on its measurements.

    Args:
        num_sources: Number of abstracts to retrieve (cast to ``int``).
        analysis_mode_text: Label of the active analysis mode; used as the
            heading of the source listing.

    Returns:
        tuple[str, str]: ``(retrieved_context, sources_text)`` where
        ``retrieved_context`` holds the numbered abstracts injected into the
        prompt and ``sources_text`` is the listing (with scores) shown to the user.
    """
    key_terms = [
        "melanoma", "atypical nevus", "dysplastic nevus",
        "asymmetry", "irregular borders", "multiple colors",
        "pigmented lesion", "ABCDE criteria", "basal cell carcinoma",
        "squamous cell carcinoma", "seborrheic keratosis",
        "dermatoscopy", "skin cancer", "benign nevus"
    ]
    query_text = " ".join(key_terms)
    results = vectorstore.similarity_search_with_score(query_text, k=int(num_sources))

    retrieved_context = ""
    sources_text = f"**{analysis_mode_text}**\n\n"
    sources_text += f"**Found {len(results)} relevant medical abstracts:**\n\n"

    # The "[Source i]" numbering must be identical in both strings: the prompt
    # asks the model to cite by that label and the user reads the same labels.
    for i, (doc, score) in enumerate(results, 1):
        sources_text += f"**[Source {i}]** (Relevance Score: {score:.4f})\n"
        sources_text += f"{doc.page_content}\n"
        sources_text += f"{'-'*80}\n\n"
        retrieved_context += f"\n[Source {i}]:\n{doc.page_content}\n"

    return retrieved_context, sources_text


def _build_analysis_prompt(precomputed_data, retrieved_context):
    """Assemble the text prompt sent to the model together with the image.

    Args:
        precomputed_data: Measurement text (OpenCV output or manual input), or a
            falsy value to request a purely visual analysis.
        retrieved_context: Numbered abstracts produced by ``_retrieve_literature``.

    Returns:
        str: The measurement-based prompt when ``precomputed_data`` is truthy,
        otherwise the direct visual-analysis prompt.
    """
    if precomputed_data:
        # Use quantitative measurements from OpenCV (or pasted by the user)
        return f"""
{precomputed_data}

================================================================================
RELEVANT MEDICAL LITERATURE:
================================================================================
{retrieved_context}

================================================================================
EVIDENCE-BASED ANALYSIS INSTRUCTIONS:
================================================================================

Based on the quantitative measurements provided above AND the medical literature,
provide a comprehensive structured diagnosis.

**MANDATORY CITATION RULE**: Cite sources using [Source 1], [Source 2], etc.

Structure your response with these EXACT sections:

1. DIFFERENTIAL DIAGNOSIS (Ranked by Likelihood):
   - AT LEAST 5 diagnoses with likelihood levels (Most Likely / Likely / Possible / Less Likely)
   - Reference the specific measurements (asymmetry score, color zones, size, irregularity score)
   - Cite literature for each diagnosis

2. CONCERNING FEATURES WITH EVIDENCE:
   - List specific quantitative features from the measurements
   - Explain clinical significance with citations
   - Example: "The asymmetry score of X is concerning because [Source Y]..."

3. COMPARISON TO LITERATURE PATTERNS:
   - How these measurements compare to literature patterns
   - Statistical context if available from sources

4. CLINICAL RECOMMENDATIONS:
   - Urgency level (Immediate/Urgent/Routine) with justification from sources
   - Specific next steps (biopsy type, excision, monitoring)
   - Follow-up timeline
   - Cite sources for recommendations

5. PATIENT COMMUNICATION GUIDANCE:
   - Clear, compassionate explanation of findings
   - What to expect in next steps
   - Address common concerns

Remember: Cite [Source X] for every clinical claim.
"""

    # Direct VLM analysis: no measurements, the model must describe the image itself
    direct_prompt = """
COMPREHENSIVE DERMATOLOGICAL LESION ANALYSIS:

Analyze this skin lesion image in detail:

MORPHOLOGICAL ASSESSMENT:
- Size estimation (diameter in mm)
- Shape (regular, irregular, asymmetric)
- Border characteristics (well-defined, poorly-defined, irregular, notching)
- Surface features

COLOR & PIGMENTATION:
- Number of distinct color zones
- Specific colors present (brown, black, red, blue, purple, tan)
- Distribution pattern (uniform, variegated, patchy)
- Presence of dots, globules, or streaks

CLINICAL FEATURES:
- Symmetry assessment (degree of asymmetry)
- ABCDE criteria evaluation

"""
    direct_prompt += f"""
================================================================================
RELEVANT MEDICAL LITERATURE:
================================================================================
{retrieved_context}

================================================================================
EVIDENCE-BASED ANALYSIS INSTRUCTIONS:
================================================================================

Based on your visual analysis of the image AND the medical literature,
provide a comprehensive structured diagnosis.

**MANDATORY CITATION RULE**: Cite sources using [Source 1], [Source 2], etc.

Structure your response with these EXACT sections:

1. DIFFERENTIAL DIAGNOSIS (Ranked by Likelihood):
   AT LEAST 5 diagnoses with likelihood levels. Cite literature.

2. CONCERNING FEATURES WITH EVIDENCE:
   List each concerning feature with citations.

3. COMPARISON TO LITERATURE PATTERNS:
   Compare to patterns described in sources.

4. CLINICAL RECOMMENDATIONS:
   Urgency level, actions, follow-up timeline. Cite sources.

5. PATIENT COMMUNICATION GUIDANCE:
   Clear, compassionate explanation.

Remember: Cite [Source X] for every clinical claim.
"""
    return direct_prompt


def _generate_diagnosis(image, analysis_prompt, final_tokens, final_temp):
    """Run the vision-language model on the image plus prompt and decode its answer.

    Args:
        image: RGB image passed to the processor.
        analysis_prompt: Text part of the single user message.
        final_tokens: Maximum number of new tokens (cast to ``int``).
        final_temp: Sampling temperature (cast to ``float``).

    Returns:
        str: The decoded completion, without the prompt tokens.
    """
    messages_final = [
        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": analysis_prompt}]}
    ]

    input_text_final = processor.apply_chat_template(messages_final, add_generation_prompt=True, tokenize=False)
    inputs_final = processor(images=image, text=input_text_final, return_tensors="pt").to(model.device)

    output_final = model.generate(
        **inputs_final,
        max_new_tokens=int(final_tokens),
        do_sample=True,
        temperature=float(final_temp)
    )
    # The generated sequence starts with the prompt tokens; slice them off so
    # only the newly generated text is decoded.
    prompt_length = inputs_final.input_ids.shape[1]
    return processor.decode(output_final[0][prompt_length:], skip_special_tokens=True)


def _save_report(analysis_mode_text, opencv_features, final_response, retrieved_context):
    """Write the plain-text report to ``analysis_<timestamp>.txt`` in the working directory.

    Args:
        analysis_mode_text: Label of the analysis mode that was used.
        opencv_features: OpenCV measurement text, or ``None`` to omit that section.
        final_response: Model-generated diagnosis text.
        retrieved_context: Numbered abstracts that were given to the model.

    Returns:
        str: Name of the file that was written.
    """
    from datetime import datetime

    # One timestamp for both the header and the file name so they always agree.
    now = datetime.now()
    separator = "=" * 80

    # Built outside the report f-string: nesting an f-string with escaped
    # backslashes inside a replacement field yields a literal "\n" on
    # Python 3.12+ and is a SyntaxError on earlier versions.
    opencv_section = ""
    if opencv_features is not None:
        opencv_section = f"OPENCV FEATURE EXTRACTION:\n{opencv_features}\n\n{separator}\n"

    full_report = f"""
SKIN LESION ANALYSIS REPORT
{separator}
Generated: {now.strftime('%Y-%m-%d %H:%M:%S')}
Analysis Mode: {analysis_mode_text}

{opencv_section}

EVIDENCE-BASED DIAGNOSIS:
{final_response}

RETRIEVED SOURCES:
{retrieved_context}

DISCLAIMER: For research and educational purposes only. NOT a substitute for
professional medical advice, diagnosis, or treatment. Consult a qualified dermatologist.
{separator}
"""

    filename = f"analysis_{now.strftime('%Y%m%d_%H%M%S')}.txt"
    # Explicit UTF-8: the report contains emoji (analysis mode label) and would
    # fail to encode under a non-UTF-8 default locale.
    with open(filename, 'w', encoding="utf-8") as f:
        f.write(full_report)
    return filename


def analyze_lesion_complete(image, use_opencv, opencv_data, final_tokens, final_temp, num_sources):
    """
    Complete analysis pipeline with OpenCV integration

    Steps: optional OpenCV feature extraction, literature retrieval, prompt
    construction, model generation, and saving a text report to disk.

    Relies on notebook-level names defined by earlier cells: ``model``,
    ``processor``, ``vectorstore`` and ``analyze_lesion_opencv``.

    Args:
        image: Uploaded image (PIL image, or an array accepted by
            ``Image.fromarray``); ``None`` when nothing was uploaded.
        use_opencv: When truthy, measurements come from ``analyze_lesion_opencv``
            and ``opencv_data`` is ignored.
        opencv_data: Optional manually supplied measurement text, used only when
            ``use_opencv`` is falsy and the text is not blank.
        final_tokens: Maximum number of new tokens to generate.
        final_temp: Sampling temperature.
        num_sources: Number of abstracts to retrieve.

    Returns:
        tuple[str, str, str]: ``(opencv_output, sources_text, final_response)``.
        On any failure the first element carries the message and the other two
        are empty strings.
    """
    if image is None:
        return "⚠️ Please upload an image first!", "", ""

    try:
        # Convert to PIL if needed
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)
        image = image.convert("RGB")

        # Determine analysis mode and get features
        if use_opencv:
            opencv_description, _img_orig, _img_hair_removed, mask = analyze_lesion_opencv(image)

            # The extractor reports failure as an "Error..." message with None
            # images. Checking that (instead of the substring "Error" anywhere)
            # keeps a valid description that merely mentions the word usable.
            if mask is None or opencv_description.startswith("Error"):
                return opencv_description, "", ""

            analysis_mode_text = "🔬 OpenCV Feature Extraction Used"
            precomputed_data = opencv_description
        elif opencv_data and opencv_data.strip():
            analysis_mode_text = "📊 Manual Pre-computed Data Used"
            precomputed_data = opencv_data
        else:
            analysis_mode_text = "👁️ Direct VLM Analysis (No Pre-computed Data)"
            precomputed_data = None

        # Phase 1: RAG retrieval
        retrieved_context, sources_text = _retrieve_literature(num_sources, analysis_mode_text)

        # Phase 2: prompt + generation
        analysis_prompt = _build_analysis_prompt(precomputed_data, retrieved_context)
        final_response = _generate_diagnosis(image, analysis_prompt, final_tokens, final_temp)

        # Phase 3: persist the report (the OpenCV section only exists in OpenCV mode)
        _save_report(
            analysis_mode_text,
            precomputed_data if use_opencv else None,
            final_response,
            retrieved_context,
        )

        # Return results
        opencv_output = precomputed_data if use_opencv else ""
        return opencv_output, sources_text, final_response

    except Exception as e:
        # Surface the full traceback in the UI instead of crashing the callback.
        import traceback
        error_msg = f"❌ Error: {str(e)}\n\n{traceback.format_exc()}"
        return error_msg, "", ""


# Create Gradio Interface
# Stylesheet handed to gr.Blocks(css=...) below. The selectors target the
# `scrollable-output` class and the `opencv_output` / `sources_output` /
# `final_output` element ids that are assigned to the three result textboxes.
custom_css = """
    .scrollable-output textarea {
        max-height: 500px !important;
        overflow-y: auto !important;
    }
    .gradio-container {
        max-width: 1400px !important;
    }
    #opencv_output, #sources_output, #final_output {
        max-height: 500px;
        overflow-y: auto;
    }
"""

# Build the UI: a header, an input column, a results column with three tabs,
# a help section, and the button wiring to analyze_lesion_complete.
with gr.Blocks(
    title="Complete Skin Cancer Analysis System",
    theme=gr.themes.Soft(),
    css=custom_css
) as demo:

    # Page header: what the system combines, plus the medical disclaimer.
    gr.Markdown("""
    # 🔬 Complete Skin Cancer Analysis System

    **OpenCV Feature Extraction + VLM + RAG**

    This integrated system combines:
    - 🎨 **OpenCV**: Automated quantitative feature extraction (size, color, asymmetry, borders, texture)
    - 🤖 **VLM**: Llama-3.2-11B-Vision-Instruct + DermaVLM LoRA adapter for advanced image analysis
    - 📚 **RAG**: Evidence-based diagnosis with medical literature citations

    ⚠️ **Medical Disclaimer**: For research and educational purposes only. NOT a substitute for professional medical advice, diagnosis, or treatment. Always consult a qualified dermatologist.
    """)

    with gr.Row():
        # Input column: image upload, analysis-mode toggle, optional manual
        # measurements and generation settings.
        with gr.Column(scale=1):
            gr.Markdown("### 📤 Step 1: Upload Image")
            # type="pil": the callback receives the upload as a PIL image.
            image_input = gr.Image(type="pil", label="Upload Skin Lesion Image")

            gr.Markdown("### 🔬 Step 2: Analysis Mode")
            # Passed to the callback as `use_opencv`; checked by default.
            use_opencv = gr.Checkbox(
                value=True,
                label="✅ Use OpenCV Feature Extraction (Recommended)",
                info="Automatically extract quantitative measurements"
            )

            gr.Markdown("### 📊 Step 3: Manual Data (Optional)")
            # Passed to the callback as `opencv_data`; only consulted when the
            # checkbox above is unchecked.
            opencv_data = gr.Textbox(
                label="Manual Pre-computed Measurements",
                placeholder="Only if NOT using OpenCV. Paste your own measurements here...",
                lines=8,
                info="Ignored if OpenCV checkbox is checked"
            )

            gr.Markdown("### ⚙️ Step 4: Model Parameters")
            # The three sliders feed the callback's `final_tokens`, `final_temp`
            # and `num_sources` parameters respectively.
            with gr.Accordion("Advanced Settings", open=False):
                final_tokens = gr.Slider(
                    512, 2048, value=1024, step=128,
                    label="Analysis Tokens",
                    info="More tokens = more detailed analysis"
                )
                final_temp = gr.Slider(
                    0.1, 1.0, value=0.5, step=0.1,
                    label="Temperature",
                    info="Lower = more precise citations"
                )
                num_sources = gr.Slider(
                    1, 10, value=5, step=1,
                    label="Number of Sources",
                    info="Medical abstracts to retrieve"
                )

            analyze_btn = gr.Button("🔬 Analyze Lesion", variant="primary", size="lg")

        # Results column: one tab per value returned by analyze_lesion_complete.
        with gr.Column(scale=2):
            gr.Markdown("### 📊 Analysis Results")

            with gr.Tabs():
                with gr.Tab("🔬 OpenCV Features"):
                    gr.Markdown("*Quantitative measurements extracted by OpenCV*")
                    # Receives the first return value, which is also where the
                    # callback puts its error and "please upload" messages.
                    opencv_output = gr.Textbox(
                        label="Automated Feature Extraction",
                        lines=15,
                        max_lines=30,
                        show_copy_button=True,
                        elem_id="opencv_output",
                        elem_classes="scrollable-output"
                    )

                with gr.Tab("📚 Literature Sources"):
                    gr.Markdown("*Retrieved medical abstracts*")
                    sources_output = gr.Textbox(
                        label="Medical Literature",
                        lines=12,
                        max_lines=25,
                        show_copy_button=True,
                        elem_id="sources_output",
                        elem_classes="scrollable-output"
                    )

                with gr.Tab("🏥 Evidence-Based Diagnosis"):
                    gr.Markdown("*Comprehensive analysis with literature citations*")
                    final_output = gr.Textbox(
                        label="Clinical Diagnosis & Recommendations",
                        lines=12,
                        max_lines=25,
                        show_copy_button=True,
                        elem_id="final_output",
                        elem_classes="scrollable-output"
                    )

    # Static help text: workflow, extracted features, output structure and how
    # to download the saved report file.
    with gr.Row():
        gr.Markdown("""
        ---
        ### 🎯 How It Works:

        **With OpenCV Enabled (Recommended):**
        1. 🎨 OpenCV extracts quantitative features from the image
        2. 📚 System retrieves relevant medical literature
        3. 🤖 VLM analyzes image + measurements + literature → Comprehensive diagnosis

        **Without OpenCV:**
        - VLM performs qualitative image analysis directly
        - Or you can paste your own pre-computed measurements

        ### ✨ OpenCV Features Extracted:
        - **Size**: Diameter (mm), Area (mm²)
        - **Shape**: Asymmetry score (0-100), Circularity (0-1)
        - **Colors**: 4+ color zones with percentages & spatial distribution
        - **Borders**: Irregularity score, Definition quality
        - **Texture**: Complexity score, Pigmentation pattern
        - **ABCDE**: Automated risk factor detection

        ### 📊 Output Structure:
        1. **5+ Ranked Differential Diagnoses** with likelihood levels
        2. **Concerning Features** with quantitative evidence & citations
        3. **Comparison to Literature** with pattern matching
        4. **Clinical Recommendations** with urgency level & next steps
        5. **Patient Communication** guidance

        ### 📥 Download Complete Report:
        After analysis, download the full report from Colab files panel (←) or:
        ```python
        from google.colab import files
        files.download('analysis_YYYYMMDD_HHMMSS.txt')
        ```
        """)

    # Connect button to function
    # `inputs` follows the parameter order of analyze_lesion_complete and
    # `outputs` follows the order of its returned 3-tuple.
    analyze_btn.click(
        fn=analyze_lesion_complete,
        inputs=[image_input, use_opencv, opencv_data, final_tokens, final_temp, num_sources],
        outputs=[opencv_output, sources_output, final_output]
    )

# Launch Gradio
# The feature summary is printed BEFORE launching: with debug=True,
# demo.launch() blocks this cell until the server is stopped, so anything
# printed after it is never visible while the app is in use.
print("\n" + "="*80)
print("🚀 Launching Complete Analysis System...")
print("="*80)
print("📋 Features:")
print("   • OpenCV automated feature extraction")
print("   • VLM vision analysis with DermaVLM")
print("   • RAG evidence-based diagnosis with citations")
print("   • 3-tab interface: Features → Literature → Diagnosis")
print("   • Downloadable comprehensive reports")
print("⚠️  Keep this cell running to use the app!")
print("="*80)

# NOTE(review): share=True publishes a public URL and server_name="0.0.0.0"
# listens on all interfaces; anyone with the link can submit images.
demo.launch(
    share=True,
    debug=True,
    show_error=True,
    server_name="0.0.0.0",
    server_port=7860
)

# Reached only once the blocking launch() call has returned, i.e. after the
# server was stopped or the cell was interrupted.
print("\n" + "="*80)
print("🛑 Gradio server stopped. Re-run this cell to start the app again.")
print("="*80)

🚀 Creating Gradio Interface...
✅ Checking systems...
   Model: PeftModelForCausalLM
   Processor: MllamaProcessor
   Vectorstore: FAISS
✅ All systems ready!


  with gr.Blocks(
  with gr.Blocks(



🚀 Launching Complete Analysis System...
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://75f016d85399f43da6.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
