# Agentic RAG with Local Ollama Model
This notebook demonstrates how to build a Retrieval-Augmented Generation (RAG) agent using LangGraph, LangChain, and a local model run via Ollama.

Adapted from: https://langchain-ai.github.io/langgraph/tutorials/rag/langgraph_agentic_rag/

## Materials
This notebook and all materials referenced here can be found on Sol at `/data/sse/ai-accelerated-spark`.

## 1. Import libraries

In [None]:
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import HumanMessage, AIMessage, BaseMessage
from langchain_core.tools import Tool
from langgraph.graph import Graph
from langchain.text_splitter import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import torch
from typing import List
from pydantic import BaseModel, Field
import os

# Export a browser-like User-Agent string through the process environment.
# NOTE(review): presumably this is picked up by the web loader used in section 2.1
# when it fetches the blog posts — confirm which component actually reads it.
os.environ["USER_AGENT"] = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0"

## 2. Preprocess documents
### 2.1. Fetch documents

In [None]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import NotebookLoader
from langchain.docstore.document import Document
import os

# Blog posts that make up the external GPU-acceleration knowledge base
urls = [
    "https://medium.com/cupy-team/announcing-cupy-v13-66979ee7fab0",
    "https://www.unum.cloud/blog/2022-01-26-cupy",
    "https://medium.com/rapids-ai/easy-cpu-gpu-arrays-and-dataframes-run-your-dask-code-where-youd-like-e349d92351d",
]

# Fetch every page independently so that one unreachable URL does not abort the others
docs = []
for url in urls:
    try:
        loaded_docs = WebBaseLoader(url).load()
    except Exception as e:
        print(f"⚠️ Could not load {url}: {e}")
    else:
        docs.extend(loaded_docs)
        print(f"✅ Loaded web content from {url}")

# Course notebooks that provide worked GPU-acceleration examples
notebook_dir = "../python_notebooks"
notebook_files = [
    "notebook-1-cupy.ipynb",
    "notebook-2-rapids-cudf.ipynb",
    "notebook-3-rapids-cuml.ipynb",
    "notebook-4-warp.ipynb",
]

# Ingest each notebook that exists on disk; report the ones that are absent or unreadable
for nb_file in notebook_files:
    nb_path = os.path.join(notebook_dir, nb_file)
    if not os.path.exists(nb_path):
        print(f"❌ Notebook not found: {nb_path}")
        continue
    try:
        nb_loader = NotebookLoader(nb_path, include_outputs=True, max_output_length=1000)
        nb_docs = nb_loader.load()
        docs.extend(nb_docs)
        print(f"✅ Loaded {nb_file}")
    except Exception as e:
        print(f"⚠️ Could not load {nb_file}: {e}")

# Hand-written summary of GPU-acceleration guidance, indexed alongside the fetched
# documents. The text below is a plain string literal: the lines starting with '#'
# inside it are Markdown headings, not Python comments.
# NOTE(review): the speedup ranges quoted in the text are stated, not measured in this
# notebook — confirm their source before relying on them.
gpu_acceleration_content = """
# GPU Acceleration with NVIDIA Rapids

## CuPy Performance Patterns
- Matrix operations show 5-50x speedup on GPU vs CPU
- Best performance for arrays > 1M elements
- Memory bandwidth is often the bottleneck
- Use .astype() to ensure optimal data types (float32)
- Kernel launch overhead affects small operations

## cuDF Performance Benefits  
- DataFrame operations can achieve 10-100x speedup
- GroupBy operations scale excellently on GPU
- String operations benefit significantly from GPU parallelization
- Best for datasets > 100K rows
- Memory management is crucial for large datasets

## cuML Machine Learning Acceleration
- K-Means clustering: 10-50x speedup typical
- Random Forest: 5-25x speedup
- Logistic Regression: 3-15x speedup
- UMAP/t-SNE: 10-100x speedup for dimensionality reduction

## Best Practices for GPU Acceleration
1. Keep data on GPU between operations
2. Use appropriate data types (prefer float32 over float64)
3. Batch operations to amortize kernel launch overhead
4. Profile memory usage and optimize transfers
5. Use @cupy.fuse for element-wise operations
6. Consider problem size - GPU overhead for small data

## When NOT to use GPU
- Very small datasets (< 10K elements)
- Sequential algorithms that don't parallelize
- Frequent CPU-GPU memory transfers
- Operations dominated by I/O
"""

# Wrap the curated text in a Document so it is split and embedded like the loaded
# sources; the "source" metadata value marks it as hand-curated.
docs.append(Document(page_content=gpu_acceleration_content, metadata={"source": "curated_gpu_guide"}))

# Report how many documents (web pages, notebooks, curated guide) were collected.
print(f"📚 Total documents loaded: {len(docs)}")

In [None]:
docs[0].page_content.strip()[:1000]

### 2.2. Split the fetched documents into smaller chunks for indexing into the vectorstore

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

docs_list = docs  # docs is already a flat list of Document objects

# Recursive splitter built through the tiktoken-encoder constructor.
# NOTE(review): chunk_size/chunk_overlap are presumably counted in tiktoken tokens here,
# which would make these chunks short (100) with 50 % overlap — confirm that is intended.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=100, chunk_overlap=50
)
# Split every loaded document into chunks for indexing in the vector store.
doc_splits = text_splitter.split_documents(docs_list)

In [None]:
doc_splits[0].page_content

## 3. Create a retriever tool
### 3.1. Use an in-memory vector store and the all-MiniLM-L6-v2 embeddings model

In [None]:
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_huggingface import HuggingFaceEmbeddings

# Sentence-embedding model used to embed the document chunks.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Build the in-memory vector store from the chunks; nothing is persisted, so the
# index is rebuilt every time this cell runs.
vectorstore = InMemoryVectorStore.from_documents(
    documents = doc_splits, embedding = embedding_model
)
# Retriever interface over the store, with default search settings.
retriever = vectorstore.as_retriever()

In [None]:
# TODO: Use ChromaDB for persistent vectorstore
# https://python.langchain.com/docs/integrations/vectorstores/

### 3.2. Create a retriever tool using LangChain's prebuilt `create_retriever_tool`

In [None]:
from langchain.tools.retriever import create_retriever_tool

# Expose the retriever as a tool. The second argument is the tool name and the third
# is the natural-language description passed along with it; the same name is used in
# the hand-written tool calls of the test cells further down.
retriever_tool = create_retriever_tool(
    retriever,
    "retrieve_python_gpu_acceleration",
    "Search and return information about accelerating Python code using the GPU with RAPIDS and CuPy.",
)

### 3.3. Test the tool

In [None]:
retriever_tool.invoke({"query": "How can I create a CuPy-backed Dask array for random data?"})

## 4. Generate query
### 4.1. Load local LLM

Start ollama using the terminal:
```bash
module load ollama/0.9.0
export OLLAMA_MODELS=/data/datasets/community/ollama
ollama-start
```

Check the available list of models using `ollama list`. Let me know via Slack if you would like to use and test other models.

In [None]:
from langchain_ollama import ChatOllama
import socket
from langchain_ollama.llms import OllamaLLM
from langchain.chat_models import init_chat_model

# The base URL below points at port 11434 on the host this kernel is running on.
host_node = socket.gethostname()
# temperature=0 requests deterministic decoding from the qwen3:14b model.
# NOTE(review): the base URL embeds a fixed username ("vpatel69@") in its userinfo
# part — confirm whether the server needs it; otherwise drop it or derive it from the
# current user so the notebook works unchanged for other people.
llm_model = init_chat_model("ollama:qwen3:14b", temperature=0, base_url=f"http://vpatel69@{host_node}:11434/")

### 4.2. Build a `generate_query_or_respond` node

In [None]:
from langgraph.graph import MessagesState
import re

def generate_query_or_respond(state: MessagesState):
    """Ask the tool-aware model for the next step of the conversation.

    The model receives the full message history from ``state`` with the
    retriever tool bound, so its reply is either a tool call or a direct answer.
    Any ``<think>...</think>`` section is stripped from the reply text before
    the reply is returned as the single new message.
    """
    tool_aware_llm = llm_model.bind_tools([retriever_tool])
    reply = tool_aware_llm.invoke(state["messages"])
    # Drop the model's reasoning trace so only the answer text remains
    reply.content = re.sub(r"<think>.*</think>", "", reply.content, flags=re.DOTALL).strip()
    return {"messages": [reply]}

### 4.3. Try a random input

In [None]:
# Smoke test with a general-knowledge question, to see how the node replies when the
# question is unrelated to the indexed material.
# The state is named `greeting_state` rather than `input` so the built-in input()
# is not shadowed for the rest of the notebook session.
greeting_state = {"messages": [{"role": "user", "content": "Hello! What is the color of the sky?"}]}
generate_query_or_respond(greeting_state)["messages"][-1].pretty_print()

### 4.4. Try a semantic search question

In [None]:
# Smoke test with a question about the indexed material, to see whether the node
# replies with a call to the retriever tool.
# The state is named `retrieval_state` rather than `input` so the built-in input()
# is not shadowed for the rest of the notebook session.
retrieval_state = {
    "messages": [
        {
            "role": "user",
            "content": "How can I create a CuPy-backed Dask array for random data?",
        }
    ]
}
generate_query_or_respond(retrieval_state)["messages"][-1].pretty_print()

## 5. Grade documents
### 5.1. Add conditional edge `grade_documents` to determine the relevance of retrieved documents

In [None]:
from pydantic import BaseModel, Field
from typing import Literal

# Prompt template for the relevance grader. It is filled via str.format with two
# fields: {context} (the retrieved text) and {question} (the user question).
GRADE_PROMPT = (
    "You are a grader assessing relevance of a retrieved document to a user question. \n "
    "Here is the retrieved document: \n\n {context} \n\n"
    "Here is the user question: {question} \n"
    "If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n"
    "Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."
)

class GradeDocuments(BaseModel):
    """Grade documents using a binary score for relevance check."""

    # Structured-output schema handed to the model by grade_documents; the field is a
    # free-form string, so the 'yes'/'no' convention is only stated in the description.
    binary_score: str = Field(
        description="Relevance score: 'yes' if relevant, or 'no' if not relevant"
    )


def grade_documents(
    state: MessagesState,
) -> Literal["generate_answer", "rewrite_question"]:
    """Route to the next node based on whether the retrieved text is relevant.

    The first message in ``state`` is taken as the user question and the last
    message as the retrieved context. The model is asked for a structured
    ``GradeDocuments`` verdict.

    Returns:
        ``"generate_answer"`` when the verdict is "yes" (case- and
        whitespace-insensitive), otherwise ``"rewrite_question"``.
    """
    question = state["messages"][0].content
    context = state["messages"][-1].content

    prompt = GRADE_PROMPT.format(question=question, context=context)
    response = llm_model.with_structured_output(GradeDocuments).invoke(
        [{"role": "user", "content": prompt}]
    )

    # binary_score is a free-form string, so "Yes" or " yes" must not be mistaken
    # for a negative verdict. A missing result counts as "not relevant" rather
    # than crashing the router.
    raw_score = getattr(response, "binary_score", None) or ""
    if raw_score.strip().lower() == "yes":
        return "generate_answer"
    return "rewrite_question"

### 5.2. Try with irrelevant documents in the tool response

In [None]:
from langchain_core.messages import convert_to_messages

# Hand-built conversation: user question, an assistant tool call, and a tool reply
# ("meow") that has nothing to do with the question.
# The state is named `irrelevant_state` rather than `input` so the built-in input()
# is not shadowed for the rest of the notebook session.
irrelevant_state = {
    "messages": convert_to_messages(
        [
            {
                "role": "user",
                "content": "How can I create a CuPy-backed Dask array for random data?",
            },
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "id": "1",
                        "name": "retrieve_python_gpu_acceleration",
                        "args": {"query": "creating CuPy-backed Dask arrays for random data"},
                    }
                ],
            },
            {"role": "tool", "content": "meow", "tool_call_id": "1"},
        ]
    )
}
grade_documents(irrelevant_state)

### 5.3. Try with relevant documents

In [None]:
# Hand-built conversation whose tool reply contains text that answers the question.
# The state is named `relevant_state` rather than `input` so the built-in input()
# is not shadowed for the rest of the notebook session.
relevant_state = {
    "messages": convert_to_messages(
        [
            {
                "role": "user",
                "content": "How can I create a CuPy-backed Dask array for random data?",
            },
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "id": "1",
                        "name": "retrieve_python_gpu_acceleration",
                        "args": {"query": "creating CuPy-backed Dask arrays for random data"},
                    }
                ],
            },
            {
                "role": "tool",
                "content": 'Now, we can leverage the array.backend configuration to create a CuPy-backed Dask array for random data:>>> with dask.config.set({“array.backend”: “cupy”}):…    darr = da.random.randint(0, 3, size=(10, 20), chunks=(2, 5)) #\n\n= rs.randint(0, 3, size=(10, 20), chunks=(2, 5))>>> darrdask.array<randint, shape=(10, 20), dtype=int64, chunksize=(2, 5), \\chunktype=cupy.ndarray>Now, we can leverage the array.backend configuration to create a CuPy-backed Dask array for random data:>>> with\n\nfor random array creation.',
                "tool_call_id": "1",
            },
        ]
    )
}
grade_documents(relevant_state)

## 6. Rewrite question
### 6.1. Build `rewrite_question` node

In [None]:
# Prompt template used to rephrase the user's question. It is filled via str.format
# with a single field, {question}.
REWRITE_PROMPT = (
    "Look at the input and try to reason about the underlying semantic intent / meaning.\n"
    "Here is the initial question:"
    "\n ------- \n"
    "{question}"
    "\n ------- \n"
    "Formulate an improved question:"
)


def rewrite_question(state: MessagesState):
    """Ask the model for an improved phrasing of the original question.

    The first message in ``state`` is treated as the question. The model's
    reply, with any ``<think>...</think>`` section removed, is returned as a new
    user message in plain-dict form.
    """
    original_question = state["messages"][0].content
    reply = llm_model.invoke(
        [{"role": "user", "content": REWRITE_PROMPT.format(question=original_question)}]
    )
    # Drop the model's reasoning trace so only the rewritten question remains
    reply.content = re.sub(r"<think>.*</think>", "", reply.content, flags=re.DOTALL).strip()
    return {"messages": [{"role": "user", "content": reply.content}]}

### 6.2. Test

In [None]:
# Hand-built conversation with an unhelpful tool reply, used to exercise the
# question-rewriting node on its own.
# The state is named `rewrite_state` rather than `input` so the built-in input()
# is not shadowed for the rest of the notebook session.
rewrite_state = {
    "messages": convert_to_messages(
        [
            {
                "role": "user",
                "content": "How can I create a CuPy-backed Dask array for random data?",
            },
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "id": "1",
                        "name": "retrieve_python_gpu_acceleration",
                        "args": {"query": "creating CuPy-backed Dask arrays for random data"},
                    }
                ],
            },
            {"role": "tool", "content": "meow", "tool_call_id": "1"},
        ]
    )
}

# rewrite_question returns its message as a plain dict, hence the ["content"] lookup.
response = rewrite_question(rewrite_state)
print(response["messages"][-1]["content"])

## 7. Generate an answer
### 7.1. Build `generate_answer` node

In [None]:
# Prompt template for the final answer. It is filled via str.format with two fields:
# {question} (the user question) and {context} (the retrieved text).
GENERATE_PROMPT = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Use three sentences maximum and keep the answer concise.\n"
    "Question: {question} \n"
    "Context: {context}"
)


def generate_answer(state: MessagesState):
    """Answer the question using the retrieved context.

    The first message in ``state`` is treated as the question and the last
    message as the context. The model's reply, with any ``<think>...</think>``
    section removed, is returned as the single new message.
    """
    history = state["messages"]
    filled_prompt = GENERATE_PROMPT.format(
        question=history[0].content,
        context=history[-1].content,
    )
    answer = llm_model.invoke([{"role": "user", "content": filled_prompt}])
    # Drop the model's reasoning trace so only the answer text remains
    answer.content = re.sub(r"<think>.*</think>", "", answer.content, flags=re.DOTALL).strip()
    return {"messages": [answer]}

### 7.2. Test

In [None]:
# Hand-built conversation whose tool reply contains text that answers the question,
# used to exercise the answer-generation node on its own.
# The state is named `answer_state` rather than `input` so the built-in input()
# is not shadowed for the rest of the notebook session.
answer_state = {
    "messages": convert_to_messages(
        [
            {
                "role": "user",
                "content": "How can I create a CuPy-backed Dask array for random data?",
            },
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "id": "1",
                        "name": "retrieve_python_gpu_acceleration",
                        "args": {"query": "creating CuPy-backed Dask arrays for random data"},
                    }
                ],
            },
            {
                "role": "tool",
                "content": 'Now, we can leverage the array.backend configuration to create a CuPy-backed Dask array for random data:>>> with dask.config.set({“array.backend”: “cupy”}):…    darr = da.random.randint(0, 3, size=(10, 20), chunks=(2, 5)) #\n\n= rs.randint(0, 3, size=(10, 20), chunks=(2, 5))>>> darrdask.array<randint, shape=(10, 20), dtype=int64, chunksize=(2, 5), \\chunktype=cupy.ndarray>Now, we can leverage the array.backend configuration to create a CuPy-backed Dask array for random data:>>> with\n\nfor random array creation.',
                "tool_call_id": "1",
            },
        ]
    )
}

response = generate_answer(answer_state)
response["messages"][-1].pretty_print()

## 8. Assemble the graph

In [None]:
from langgraph.graph import StateGraph, START, END
from langgraph.prebuilt import ToolNode
from langgraph.prebuilt import tools_condition

# State graph whose state is the running list of messages.
workflow = StateGraph(MessagesState)

# Define the nodes we will cycle between.
# Nodes added without an explicit name are referred to below by their function name.
workflow.add_node(generate_query_or_respond)
workflow.add_node("retrieve", ToolNode([retriever_tool]))
workflow.add_node(rewrite_question)
workflow.add_node(generate_answer)

# Every run starts by letting the model decide between retrieving and answering.
workflow.add_edge(START, "generate_query_or_respond")

# Decide whether to retrieve
workflow.add_conditional_edges(
    "generate_query_or_respond",
    # Assess LLM decision (call `retriever_tool` tool or respond to the user)
    tools_condition,
    {
        # Translate the condition outputs to nodes in our graph
        "tools": "retrieve",
        END: END,
    },
)

# Edges taken after the `retrieve` node is called.
workflow.add_conditional_edges(
    "retrieve",
    # grade_documents returns the name of the next node:
    # "generate_answer" or "rewrite_question"
    grade_documents,
)
# A generated answer ends the run; a rewritten question loops back to the model.
workflow.add_edge("generate_answer", END)
workflow.add_edge("rewrite_question", "generate_query_or_respond")

# Compile
graph = workflow.compile()

In [None]:
from IPython.display import Image, display

# Render the compiled graph as a Mermaid PNG and show it inline.
# NOTE(review): PNG rendering may depend on an external rendering service — confirm
# this cell works on nodes without outbound network access.
display(Image(graph.get_graph().draw_mermaid_png()))

## 9. Run the agentic RAG

In [None]:
# Imported here as well so this cell does not depend on the earlier test cells.
from langchain_core.messages import convert_to_messages

# Stream the run node by node and print the last message each node produced.
for chunk in graph.stream(
    {
        "messages": [
            {
                "role": "user",
                "content": "How can I create a CuPy-backed Dask array for random data?",
            }
        ]
    }
):
    for node, update in chunk.items():
        print("Update from node", node)
        # rewrite_question returns its message as a plain dict, which has no
        # pretty_print(); normalise to message objects before printing.
        convert_to_messages(update["messages"])[-1].pretty_print()
        print("\n\n")

## 10. Graphical User Interface using Gradio

In [None]:
import gradio as gr

def ask_graph(user_input, chat_history):
    """Run one question through the graph and extend the chat transcript.

    Only ``user_input`` is sent to the graph; ``chat_history`` is used solely to
    build the transcript shown in the UI.

    Returns:
        A pair ``("", transcript)``: an empty string for the textbox and the
        transcript with the new user/assistant turn appended.
    """
    final_state = graph.invoke({"messages": [{"role": "user", "content": user_input}]})
    answer = final_state["messages"][-1].content

    new_turn = [
        {"role": "user", "content": user_input},
        {"role": "assistant", "content": answer},
    ]
    transcript = new_turn if not chat_history else chat_history + new_turn

    return "", transcript

def clear_conversation():
    """Reset the UI: empty textbox and empty chat transcript.

    The chatbot is configured with ``type="messages"``, whose value is a list of
    message dicts, so the cleared transcript is an empty list rather than a string.
    """
    return "", []

# Layout: a chat transcript on top, and below it a textbox with Submit and Clear
# buttons in a narrower column.
with gr.Blocks(fill_height=True, fill_width=True) as demo:
    gr.Markdown("### Agentic RAG")

    with gr.Column():

        with gr.Row():
            # type="messages": the transcript is a list of {"role", "content"} dicts,
            # which is the shape ask_graph builds.
            chatbot = gr.Chatbot(height=350, type="messages")

        with gr.Row():
            with gr.Column(scale=4):
                query_input = gr.Textbox(
                    label="Enter text here", placeholder="Ask something...", lines=1
                    )
            with gr.Column(scale=1):
                with gr.Row():
                    submit_btn = gr.Button("⬆")
                # 🧹 Clear button
                with gr.Row():
                    clear_btn = gr.Button("🧹 Clear Conversation")

        # Clicking the button and pressing Enter in the textbox both run ask_graph,
        # which clears the textbox and updates the transcript.
        submit_btn.click(
            fn=ask_graph,
            inputs=[query_input, chatbot],
            outputs=[query_input, chatbot],
        )

        query_input.submit(
            fn=ask_graph,
            inputs=[query_input, chatbot],
            outputs=[query_input, chatbot],
        )

        # Reset both the textbox and the transcript.
        clear_btn.click(
            fn=clear_conversation,
            outputs=[query_input, chatbot],
        )

# NOTE(review): share=True asks Gradio for a publicly reachable link — confirm that
# exposing this app outside the cluster is intended and permitted.
demo.launch(share=True)

In [None]:
# Check dependencies for Enhanced GPU Mentor
import sys

# Third-party packages the GPU Mentor cells rely on: display name -> importable module name
required_packages = dict(
    plotly='plotly',
    gradio='gradio',
    pandas='pandas',
    numpy='numpy',
)

# Try to import each package and remember the ones that fail
missing_packages = []

for package, import_name in required_packages.items():
    try:
        __import__(import_name)
    except ImportError:
        print(f"❌ {package} - Missing")
        missing_packages.append(package)
    else:
        print(f"✅ {package} - Available")

if not missing_packages:
    print("\n🎉 All required packages are available!")
else:
    print("\nInstall missing packages with:")
    print("pip install " + " ".join(missing_packages))

# Modules needed by the Sol job-submission code below (all are standard-library modules)
print("\n--- Sol-specific checks ---")
sol_modules = ['subprocess', 'uuid', 'pathlib', 'json', 'tempfile']
for module in sol_modules:
    try:
        __import__(module)
    except ImportError:
        print(f"❌ {module} - Missing (this should not happen)")
    else:
        print(f"✅ {module} - Available")

# GPU Mentor: Enhanced RAG with Code Execution & Benchmarking

This enhanced version of the Agentic RAG system includes:
- **Code Execution on Sol**: Submit and execute user code on Sol's GPU nodes
- **Performance Benchmarking**: Compare CPU vs GPU performance with RAPIDS libraries
- **Code Optimization**: Automatically suggest GPU-accelerated alternatives
- **Interactive Learning**: Socratic questioning to guide learning

## Architecture Overview
1. **RAG Agent**: Existing system for answering questions about GPU acceleration
2. **Code Executor**: Submits jobs to Sol via SLURM
3. **Benchmark Engine**: Measures and compares CPU/GPU performance
4. **Code Optimizer**: Suggests RAPIDS/CuPy alternatives
5. **Enhanced UI**: Comprehensive interface for code playground and visualization

In [None]:
# Enhanced imports for GPU Mentor
import subprocess
import tempfile
import time
import json
import uuid
from pathlib import Path
from typing import Dict, List, Tuple, Optional
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import ast
import inspect

## 11. Sol Code Executor - SLURM Integration

In [None]:
class SolCodeExecutor:
    """
    Executes code on Sol supercomputer via SLURM job submission.
    Handles both CPU and GPU benchmarking jobs.

    Each job is identified by a short random ``job_id`` that is embedded in the
    names of every file the job creates, so results can be located and cleaned
    up afterwards. All files are written to ``base_work_dir``, which is also the
    working directory ``sbatch`` is launched from.

    NOTE(review): the default work dir lives under /tmp; confirm that this path
    is visible to the compute nodes that run the job, since the benchmark JSON
    is read back from this directory.
    """

    def __init__(self, base_work_dir="/tmp/gpu_mentor"):
        """Create the working directory (and any missing parents) if needed."""
        self.base_work_dir = Path(base_work_dir)
        # parents=True so that a nested, not-yet-existing path does not raise.
        self.base_work_dir.mkdir(parents=True, exist_ok=True)

    def create_slurm_script(self, code: str, job_type: str = "cpu",
                           time_limit: str = "00:15:00",
                           memory: str = "32G") -> Tuple[str, str]:
        """Create SLURM batch script for code execution.

        Args:
            code: User code; it is indented and embedded inside a timed
                ``try`` block of a generated Python benchmark script.
            job_type: ``"cpu"`` selects the CPU template; any other value
                selects the GPU template (which requests one GPU).
            time_limit: Value for ``#SBATCH --time``.
            memory: Value for ``#SBATCH --mem``.

        Returns:
            ``(script_content, job_id)`` where ``job_id`` is the 8-character
            identifier embedded in the script's file names.
        """

        job_id = str(uuid.uuid4())[:8]

        if job_type == "cpu":
            script_content = f"""#!/bin/bash
#SBATCH --job-name=gpu_mentor_cpu_{job_id}
#SBATCH --partition=general
#SBATCH --qos=public
#SBATCH --time={time_limit}
#SBATCH --cpus-per-task=8
#SBATCH --mem={memory}
#SBATCH --output=cpu_output_{job_id}.out
#SBATCH --error=cpu_error_{job_id}.err

# Load necessary modules
module load python/3.11
module load anaconda3

# Activate conda environment with CPU libraries
source activate base

# Create timing wrapper
cat > benchmark_cpu_{job_id}.py << 'SCRIPT_EOF'
import time
import sys
import traceback
import json

start_time = time.perf_counter()
try:
{self._indent_code(code)}
    execution_status = "success"
    error_message = ""
except Exception as e:
    execution_status = "error"
    error_message = str(e)
    traceback.print_exc()

end_time = time.perf_counter()
execution_time = end_time - start_time

# Save benchmark results
results = {{
    "execution_time": execution_time,
    "job_type": "cpu",
    "job_id": "{job_id}",
    "status": execution_status,
    "error": error_message
}}

with open("cpu_benchmark_{job_id}.json", "w") as f:
    json.dump(results, f)

print(f"CPU Execution time: {{execution_time:.4f}} seconds")
SCRIPT_EOF

# Execute the benchmark script
python benchmark_cpu_{job_id}.py
"""
        else:  # GPU job
            script_content = f"""#!/bin/bash
#SBATCH --job-name=gpu_mentor_gpu_{job_id}
#SBATCH --partition=general
#SBATCH --qos=public
#SBATCH --time={time_limit}
#SBATCH --cpus-per-task=8
#SBATCH --mem={memory}
#SBATCH --gres=gpu:1
#SBATCH --output=gpu_output_{job_id}.out
#SBATCH --error=gpu_error_{job_id}.err

# Load necessary modules
module load python/3.11
module load anaconda3
module load cuda/12.1

# Activate conda environment with GPU libraries
source activate rapids-23.08

# Create timing wrapper
cat > benchmark_gpu_{job_id}.py << 'SCRIPT_EOF'
import time
import sys
import traceback
import json

# Import GPU libraries
try:
    import cupy as cp
    import cudf
    import cuml
    gpu_available = True
except ImportError as e:
    print(f"GPU libraries not available: {{e}}")
    gpu_available = False

start_time = time.perf_counter()
try:
{self._indent_code(code)}
    execution_status = "success"
    error_message = ""
except Exception as e:
    execution_status = "error"
    error_message = str(e)
    traceback.print_exc()

end_time = time.perf_counter()
execution_time = end_time - start_time

# Save benchmark results
results = {{
    "execution_time": execution_time,
    "job_type": "gpu",
    "job_id": "{job_id}",
    "status": execution_status,
    "error": error_message,
    "gpu_available": gpu_available
}}

with open("gpu_benchmark_{job_id}.json", "w") as f:
    json.dump(results, f)

print(f"GPU Execution time: {{execution_time:.4f}} seconds")
SCRIPT_EOF

# Execute the benchmark script
python benchmark_gpu_{job_id}.py
"""

        return script_content, job_id

    def _indent_code(self, code: str, indent: str = "    ") -> str:
        """Prefix every line of ``code`` with ``indent`` so it can sit inside a ``try`` block."""
        return "\n".join(indent + line for line in code.split("\n"))

    def submit_job(self, script_content: str, job_id: str) -> Optional[str]:
        """Write the batch script to the work dir and submit it with ``sbatch``.

        Returns:
            The SLURM job id (last whitespace-separated token of sbatch's
            stdout), or ``None`` if submission failed for any reason; the
            failure is printed rather than raised.
        """
        script_path = self.base_work_dir / f"job_{job_id}.sh"

        with open(script_path, 'w') as f:
            f.write(script_content)

        try:
            # Submit job via sbatch; cwd is the work dir so the job's relative
            # output paths are resolved against it.
            result = subprocess.run(
                ["sbatch", str(script_path)],
                capture_output=True,
                text=True,
                cwd=self.base_work_dir
            )

            if result.returncode != 0:
                print(f"Error submitting job: Job submission failed: {result.stderr}")
                return None

            # Extract SLURM job ID from output
            return result.stdout.strip().split()[-1]

        except Exception as e:
            # Covers a missing sbatch binary as well as unexpected (empty) output.
            print(f"Error submitting job: {e}")
            return None

    def check_job_status(self, slurm_job_id: str) -> str:
        """Check the status of a SLURM job.

        Queries ``squeue`` first; if that reports nothing, falls back to
        ``sacct``. Returns the state string, ``"UNKNOWN"`` when neither command
        reports one, or ``"ERROR"`` when running the commands raised.
        """
        try:
            result = subprocess.run(
                ["squeue", "-j", slurm_job_id, "-h", "-o", "%T"],
                capture_output=True,
                text=True
            )

            if result.returncode == 0 and result.stdout.strip():
                return result.stdout.strip()

            # Job might be completed, check sacct
            result = subprocess.run(
                ["sacct", "-j", slurm_job_id, "-n", "-o", "State"],
                capture_output=True,
                text=True
            )
            if result.returncode == 0 and result.stdout.strip():
                # Only the first token of sacct's output is used.
                return result.stdout.strip().split()[0]
            return "UNKNOWN"
        except Exception as e:
            print(f"Error checking job status: {e}")
            return "ERROR"

    def get_job_results(self, job_id: str, job_type: str) -> Dict:
        """Retrieve benchmark results from completed job.

        Reads ``{job_type}_benchmark_{job_id}.json`` from the work dir and
        returns its parsed content, or ``{"error": "Results file not found"}``
        when the file does not exist.
        """
        result_file = self.base_work_dir / f"{job_type}_benchmark_{job_id}.json"

        if not result_file.exists():
            return {"error": "Results file not found"}

        with open(result_file, 'r') as f:
            return json.load(f)

    def cleanup_job_files(self, job_id: str):
        """Clean up temporary job files.

        Removes the batch script, SLURM stdout/stderr files, generated
        benchmark scripts and result JSON files belonging to ``job_id``.
        Failures to delete a file are printed and do not stop the cleanup.
        """
        patterns = [
            f"job_{job_id}.sh",
            f"*_output_{job_id}.out",
            f"*_error_{job_id}.err",
            # Generated scripts are named benchmark_cpu_<id>.py / benchmark_gpu_<id>.py
            f"benchmark_*_{job_id}.py",
            f"*_benchmark_{job_id}.json"
        ]

        for pattern in patterns:
            for file_path in self.base_work_dir.glob(pattern):
                try:
                    file_path.unlink()
                except Exception as e:
                    print(f"Error cleaning up {file_path}: {e}")

# Initialize the Sol executor with its default working directory.
# Instantiating it creates that directory as a side effect (see __init__).
sol_executor = SolCodeExecutor()

## 12. Code Optimizer - GPU Acceleration Suggestions

In [None]:
class CodeOptimizer:
    """
    Analyzes user code and suggests GPU-accelerated alternatives using RAPIDS and CuPy.

    The rewriting is purely textual: literal substrings are replaced by their GPU
    counterparts. No parsing is done, so the output is a suggestion that should be
    reviewed, not a guaranteed-correct translation.
    """

    def __init__(self):
        # Literal substring -> replacement, grouped by the library they belong to.
        # The same table is used to detect which libraries a snippet uses.
        self.optimization_patterns = {
            # NumPy to CuPy optimizations
            'numpy': {
                'import numpy as np': 'import cupy as np',
                'np.array(': 'cp.array(',
                'np.random.': 'cp.random.',
                'np.linalg.': 'cp.linalg.',
                'np.fft.': 'cp.fft.',
                '.cpu()': '',  # Remove .cpu() calls
            },

            # Pandas to cuDF optimizations
            'pandas': {
                'import pandas as pd': 'import cudf as pd',
                'pd.DataFrame(': 'cudf.DataFrame(',
                'pd.Series(': 'cudf.Series(',
                'pd.read_csv(': 'cudf.read_csv(',
                'pd.read_parquet(': 'cudf.read_parquet(',
                '.to_pandas()': '',  # Remove .to_pandas() calls
            },

            # Scikit-learn to cuML optimizations
            'sklearn': {
                'from sklearn.': 'from cuml.',
                'sklearn.': 'cuml.',
            },

            # Dask optimizations
            'dask': {
                # Real newlines (not a literal backslash-n) so the result is valid Python.
                'import dask.array as da': (
                    'import dask.array as da\n'
                    '# Configure Dask to use CuPy backend\n'
                    'import dask\n'
                    'dask.config.set({"array.backend": "cupy"})'
                ),
                'import dask.dataframe as dd': 'import dask_cudf as dd',
            }
        }

    def analyze_code(self, code: str) -> Dict[str, object]:
        """Analyze code for optimization opportunities.

        Returns a dict with:
            libraries_detected: library groups whose patterns occur in ``code``.
            optimization_opportunities: always an empty list (reserved).
            estimated_speedup: largest heuristic factor among the operations
                found, or 1.0 when none are found.
            gpu_compatible: always True (not refined by this method).
            warnings: one message per detected pattern that may need CPU data.
        """
        analysis = {
            'libraries_detected': [],
            'optimization_opportunities': [],
            'estimated_speedup': 1.0,
            'gpu_compatible': True,
            'warnings': []
        }

        # Detect libraries used: one entry per group, regardless of how many
        # of its patterns match.
        for lib_type, patterns in self.optimization_patterns.items():
            if any(pattern in code for pattern in patterns):
                analysis['libraries_detected'].append(lib_type)

        # Check for GPU incompatible operations
        incompatible_patterns = [
            'matplotlib.pyplot',  # Plotting might need CPU arrays
            'pickle.dump',        # Serialization issues
            'multiprocessing',    # GPU memory management conflicts
        ]

        for pattern in incompatible_patterns:
            if pattern in code:
                analysis['warnings'].append(f"Detected {pattern} - may require CPU data conversion")

        # Estimate potential speedup based on operations.
        # NOTE: '@' is a plain substring test, so decorators also match it.
        compute_intensive_ops = [
            'np.dot', 'np.matmul', '@',  # Matrix operations
            'np.fft', 'scipy.fft',       # FFT operations
            '.groupby(', '.agg(',        # Aggregation operations
        ]

        speedup_factors = []
        for op in compute_intensive_ops:
            if op not in code:
                continue
            if 'matmul' in op or 'dot' in op or '@' in op:
                speedup_factors.append(10.0)  # Matrix ops
            elif 'fft' in op:
                speedup_factors.append(15.0)  # FFT ops
            else:
                speedup_factors.append(3.0)   # Other ops

        # Loops that could be vectorized
        if 'for ' in code and 'range(' in code:
            speedup_factors.append(5.0)

        if speedup_factors:
            analysis['estimated_speedup'] = max(speedup_factors)

        return analysis

    def suggest_optimizations(self, code: str) -> str:
        """Generate GPU-optimized version of the code.

        Applies every replacement in ``optimization_patterns``, then prepends
        the imports the rewritten code needs (``cp`` / ``cudf``) when they are
        referenced but not imported, and finally a CuPy memory-pool preamble
        when the result mentions cupy.
        """
        import re  # local import: keeps the class usable without relying on other cells

        optimized_code = code

        # Apply optimization patterns
        for patterns in self.optimization_patterns.values():
            for old_pattern, new_pattern in patterns.items():
                optimized_code = optimized_code.replace(old_pattern, new_pattern)

        # The replacements introduce `cp.` and `cudf.` references; make sure
        # those names are actually imported in the generated code.
        missing_imports = []
        uses_cp = re.search(r"\bcp\.", optimized_code)
        imports_cp = re.search(r"^\s*import cupy as cp\b", optimized_code, flags=re.MULTILINE)
        if uses_cp and not imports_cp:
            missing_imports.append("import cupy as cp")

        uses_cudf = re.search(r"\bcudf\.", optimized_code)
        imports_cudf = re.search(r"^\s*import cudf\s*$", optimized_code, flags=re.MULTILINE)
        if uses_cudf and not imports_cudf:
            missing_imports.append("import cudf")

        if missing_imports:
            optimized_code = "\n".join(missing_imports) + "\n" + optimized_code

        # Add memory pool for better performance
        if 'cupy' in optimized_code:
            memory_pool_code = """
# Enable CuPy memory pool for better performance
import cupy
mempool = cupy.get_default_memory_pool()
pinned_mempool = cupy.get_default_pinned_memory_pool()
"""
            optimized_code = memory_pool_code + optimized_code

        return optimized_code

    def create_benchmark_code(self, original_code: str, optimized_code: str) -> Tuple[str, str]:
        """Create side-by-side benchmark versions.

        Returns:
            ``(cpu_benchmark, gpu_benchmark)``: the original code prefixed with
            NumPy/pandas imports, and the optimized code prefixed with
            CuPy/cuDF imports.
        """

        cpu_benchmark = f"""
# CPU Version Benchmark
import time
import numpy as np
import pandas as pd

{original_code}
"""

        gpu_benchmark = f"""
# GPU Version Benchmark  
import time
import cupy as cp
import cudf as pd

{optimized_code}

# Convert final results back to CPU for comparison if needed
# result = cp.asnumpy(result) if hasattr(result, 'get') else result
"""

        return cpu_benchmark, gpu_benchmark

# Initialize the code optimizer.
# This single instance is passed to both BenchmarkEngine and EnhancedGPUMentor
# in the cells below.
code_optimizer = CodeOptimizer()

## 13. Benchmark Engine - Performance Comparison

In [None]:
class BenchmarkEngine:
    """
    Coordinates CPU vs GPU benchmarking using Sol's compute resources.

    The engine asks the code optimizer for an analysis and a GPU variant of the
    user's code, submits one CPU job and one GPU job through the Sol executor,
    polls both until they finish or a timeout expires, and converts the two job
    results into performance metrics, recommendations and a Plotly figure.
    """

    # Job states that end the polling loop.
    # NOTE(review): other scheduler end states (e.g. CANCELLED, TIMEOUT) are not
    # listed because check_job_status() is defined elsewhere -- confirm which
    # strings it can return.
    _TERMINAL_STATES = ("COMPLETED", "FAILED")

    # Seconds to wait between two status polls.
    _POLL_INTERVAL_SECONDS = 10

    def __init__(self, sol_executor: SolCodeExecutor, code_optimizer: CodeOptimizer):
        """Store the collaborators and start with an empty benchmark history.

        Args:
            sol_executor: Object used to create, submit, poll, read and clean up jobs.
            code_optimizer: Object used to analyze code and build benchmark scripts.
        """
        self.sol_executor = sol_executor
        self.code_optimizer = code_optimizer
        # One {"timestamp", "results"} entry is appended per finished run.
        self.benchmark_history = []

    def run_comprehensive_benchmark(self, user_code: str, timeout: int = 300) -> Dict:
        """
        Run comprehensive CPU vs GPU benchmark.

        Args:
            user_code: Original user code to benchmark
            timeout: Maximum wait time for jobs to complete (seconds)

        Returns:
            Dictionary with benchmark results (see ``_process_results``) plus a
            ``"timed_out"`` flag, or ``{"error": ...}`` when a job could not be
            submitted.
        """

        print("🔍 Analyzing code for optimization opportunities...")
        analysis = self.code_optimizer.analyze_code(user_code)

        print("⚡ Generating GPU-optimized version...")
        optimized_code = self.code_optimizer.suggest_optimizations(user_code)

        # Create benchmark versions
        cpu_code, gpu_code = self.code_optimizer.create_benchmark_code(user_code, optimized_code)

        print("🚀 Submitting jobs to Sol...")

        # Submit CPU job
        cpu_script, cpu_job_id = self.sol_executor.create_slurm_script(
            cpu_code, job_type="cpu", time_limit="00:15:00"
        )
        cpu_slurm_id = self.sol_executor.submit_job(cpu_script, cpu_job_id)

        # Submit GPU job
        gpu_script, gpu_job_id = self.sol_executor.create_slurm_script(
            gpu_code, job_type="gpu", time_limit="00:15:00"
        )
        gpu_slurm_id = self.sol_executor.submit_job(gpu_script, gpu_job_id)

        if not cpu_slurm_id or not gpu_slurm_id:
            return {"error": "Failed to submit jobs to Sol"}

        print(f"✅ Jobs submitted: CPU ({cpu_slurm_id}), GPU ({gpu_slurm_id})")
        print("⏳ Waiting for jobs to complete...")

        # Poll with a monotonic clock so wall-clock adjustments cannot stretch or
        # shorten the wait, and never sleep past the deadline.
        deadline = time.monotonic() + timeout
        cpu_status = gpu_status = "PENDING"
        finished = False

        while True:
            cpu_status = self.sol_executor.check_job_status(cpu_slurm_id)
            gpu_status = self.sol_executor.check_job_status(gpu_slurm_id)

            print(f"📊 Status - CPU: {cpu_status}, GPU: {gpu_status}")

            if cpu_status in self._TERMINAL_STATES and gpu_status in self._TERMINAL_STATES:
                finished = True
                break

            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break

            time.sleep(min(self._POLL_INTERVAL_SECONDS, remaining))

        if not finished:
            # Results are still collected on a best-effort basis, but the caller
            # is told that the jobs had not finished.
            print(f"⚠️ Timed out after {timeout}s (CPU: {cpu_status}, GPU: {gpu_status})")

        try:
            # Collect results
            print("📈 Collecting benchmark results...")
            cpu_results = self.sol_executor.get_job_results(cpu_job_id, "cpu")
            gpu_results = self.sol_executor.get_job_results(gpu_job_id, "gpu")

            # Calculate performance metrics
            benchmark_results = self._process_results(
                cpu_results, gpu_results, analysis, user_code, optimized_code
            )
            benchmark_results["timed_out"] = not finished

            # Store in history
            self.benchmark_history.append({
                "timestamp": datetime.now().isoformat(),
                "results": benchmark_results
            })
        finally:
            # Cleanup runs even when collecting or processing results raises, so
            # failed runs do not leave job files behind.
            self.sol_executor.cleanup_job_files(cpu_job_id)
            self.sol_executor.cleanup_job_files(gpu_job_id)

        return benchmark_results

    def _process_results(self, cpu_results: Dict, gpu_results: Dict,
                        analysis: Dict, original_code: str, optimized_code: str) -> Dict:
        """Process and format benchmark results.

        Args:
            cpu_results: Result dict of the CPU job; ``status`` and
                ``execution_time`` are read from it. ``None`` is tolerated.
            gpu_results: Result dict of the GPU job, same keys.
            analysis: Code analysis; ``estimated_speedup`` is read from it.
            original_code: The user's code, echoed into the result.
            optimized_code: The GPU variant, echoed into the result.

        Returns:
            Dict with the inputs, ``performance_metrics`` (empty unless both jobs
            succeeded with positive timings) and ``recommendations``.
        """

        results = {
            "analysis": analysis,
            "original_code": original_code,
            "optimized_code": optimized_code,
            "cpu_results": cpu_results,
            "gpu_results": gpu_results,
            "performance_metrics": {},
            "recommendations": []
        }

        cpu = cpu_results or {}
        gpu = gpu_results or {}

        # Calculate performance metrics
        if cpu.get("status") == "success" and gpu.get("status") == "success":

            # "or 0" also covers an execution_time that is present but None.
            cpu_time = cpu.get("execution_time") or 0
            gpu_time = gpu.get("execution_time") or 0

            if cpu_time > 0 and gpu_time > 0:
                speedup = cpu_time / gpu_time
                # A missing or zero estimate falls back to 1.0 instead of
                # dividing by zero.
                estimated_speedup = analysis.get("estimated_speedup") or 1.0
                efficiency = (speedup / estimated_speedup) * 100

                results["performance_metrics"] = {
                    "cpu_execution_time": cpu_time,
                    "gpu_execution_time": gpu_time,
                    "speedup_factor": speedup,
                    "efficiency_percent": efficiency,
                    "time_saved": cpu_time - gpu_time,
                    "percent_improvement": ((cpu_time - gpu_time) / cpu_time) * 100
                }

        # Generate recommendations
        results["recommendations"] = self._generate_recommendations(results)

        return results

    def _generate_recommendations(self, results: Dict) -> List[str]:
        """Generate educational recommendations based on benchmark results.

        Args:
            results: Dict holding ``performance_metrics`` and ``analysis``.

        Returns:
            List of human-readable recommendation strings; the first one reflects
            the measured speedup, or states that no measurement is available.
        """
        recommendations = []

        metrics = results.get("performance_metrics") or {}
        speedup = metrics.get("speedup_factor")

        if speedup is None:
            # Without timings nothing can be said about GPU benefit; do not
            # report a failed benchmark as "limited benefit".
            recommendations.append("❓ Benchmark timings are unavailable, so GPU speedup could not be measured. Check the CPU and GPU job output for errors.")
        elif speedup > 5:
            recommendations.append("🎉 Excellent GPU acceleration! This workload benefits significantly from parallel processing.")
        elif speedup > 2:
            recommendations.append("✅ Good GPU speedup achieved. Consider optimizing memory access patterns for even better performance.")
        elif speedup > 1.1:
            recommendations.append("📈 Modest improvement with GPU. This workload may be memory-bound or have limited parallelism.")
        else:
            recommendations.append("⚠️ Limited GPU benefit. Consider if this workload has sufficient computational complexity.")

        # Check for optimization opportunities
        analysis = results.get("analysis") or {}
        libraries = analysis.get("libraries_detected", [])
        if "numpy" in libraries:
            recommendations.append("💡 Consider using CuPy's memory pool for better performance with repeated operations.")

        if "pandas" in libraries:
            recommendations.append("📊 cuDF provides GPU-accelerated dataframe operations similar to pandas.")

        if analysis.get("warnings"):
            recommendations.append("⚠️ Some operations may require CPU-GPU memory transfers. Profile memory usage.")

        return recommendations

    def create_visualization(self, benchmark_results: Dict) -> go.Figure:
        """Create interactive visualization of benchmark results.

        Args:
            benchmark_results: Dict as returned by ``run_comprehensive_benchmark``.

        Returns:
            A grouped bar chart of CPU vs GPU execution time with a speedup
            annotation, or a figure containing only a "not available" note when
            ``performance_metrics`` is empty.
        """

        metrics = benchmark_results.get("performance_metrics", {})

        if not metrics:
            # Create error visualization
            fig = go.Figure()
            fig.add_annotation(
                text="Benchmark data not available",
                xref="paper", yref="paper",
                x=0.5, y=0.5, showarrow=False,
                font=dict(size=20)
            )
            return fig

        # Create comparison chart
        fig = go.Figure()

        # Execution time comparison
        fig.add_trace(go.Bar(
            name='CPU',
            x=['Execution Time'],
            y=[metrics["cpu_execution_time"]],
            marker_color='lightcoral',
            text=[f"{metrics['cpu_execution_time']:.3f}s"],
            textposition='auto'
        ))

        fig.add_trace(go.Bar(
            name='GPU',
            x=['Execution Time'],
            y=[metrics["gpu_execution_time"]],
            marker_color='lightblue',
            text=[f"{metrics['gpu_execution_time']:.3f}s"],
            textposition='auto'
        ))

        # Add speedup annotation
        speedup = metrics.get("speedup_factor", 1.0)
        fig.add_annotation(
            text=f"🚀 {speedup:.1f}x Speedup",
            xref="paper", yref="paper",
            x=0.7, y=0.9,
            showarrow=False,
            font=dict(size=16, color="green"),
            bgcolor="lightyellow",
            bordercolor="orange",
            borderwidth=2
        )

        fig.update_layout(
            title="CPU vs GPU Performance Comparison",
            yaxis_title="Execution Time (seconds)",
            barmode='group',
            template="plotly_white"
        )

        return fig

# Initialize the benchmark engine.
# NOTE: section 15 below defines a second class that is also named
# BenchmarkEngine and whose __init__ takes no arguments. This instance is bound
# to the Sol-backed class defined above; re-running this cell after the
# section 15 cell has executed would call the other constructor instead.
benchmark_engine = BenchmarkEngine(sol_executor, code_optimizer)

## 14. Enhanced GPU Mentor Agent

In [None]:
class EnhancedGPUMentor:
    """
    Enhanced GPU Mentor that combines RAG capabilities with code execution and analysis.
    Integrates code input directly with LLM for comprehensive responses.

    For each request the mentor queries the RAG graph and, when code is supplied,
    also analyzes it, produces a GPU variant, executes it in-process and derives
    Socratic questions and learning objectives from the analysis.
    """

    # Number of most recent executions listed under "recent_outputs".
    _RECENT_OUTPUT_LIMIT = 10

    def __init__(self, rag_graph, benchmark_engine: BenchmarkEngine, code_optimizer: CodeOptimizer):
        """Store the collaborators and start with empty histories.

        Args:
            rag_graph: Object whose ``invoke({"messages": [...]})`` returns a dict
                with a ``"messages"`` list; the last message's ``content`` is used.
            benchmark_engine: Stored on the instance; not called by this class.
            code_optimizer: Used to analyze code and suggest a GPU variant.
        """
        self.rag_graph = rag_graph
        self.benchmark_engine = benchmark_engine
        self.code_optimizer = code_optimizer
        self.conversation_history = []
        self.code_execution_results = []

    def process_user_input(self, user_input: str, code: str = None) -> Dict:
        """
        Process user input with optional code, feeding both to LLM for integrated response.

        Args:
            user_input: The user's question.
            code: Optional Python source. Blank or ``None`` means "no code".
                WARNING: non-blank code is executed inside this process.

        Returns:
            Dict with ``text_response``, ``code_analysis``, ``code_output``,
            ``optimized_code``, ``socratic_questions`` and ``learning_objectives``.
            The code-related entries keep their empty defaults when no code is given.
        """

        response = {
            "text_response": "",
            "code_analysis": None,
            "code_output": None,
            "optimized_code": None,
            "socratic_questions": [],
            "learning_objectives": []
        }

        # Create enhanced prompt that includes code context if provided
        enhanced_prompt = self._create_enhanced_prompt(user_input, code)

        # Get RAG response with code context
        rag_result = self.rag_graph.invoke({
            "messages": [{"role": "user", "content": enhanced_prompt}]
        })
        response["text_response"] = rag_result["messages"][-1].content

        # If code is provided, analyze and execute it
        if code and code.strip():
            print("🔍 Analyzing provided code...")

            # Analyze code for optimization opportunities
            analysis = self.code_optimizer.analyze_code(code)
            response["code_analysis"] = analysis

            # Generate optimized version
            optimized_code = self.code_optimizer.suggest_optimizations(code)
            response["optimized_code"] = optimized_code

            # Execute code and capture output
            print("⚡ Executing code...")
            try:
                code_output = self._execute_code_safely(code)
                response["code_output"] = code_output

                # Store execution results
                self.code_execution_results.append({
                    "timestamp": datetime.now().isoformat(),
                    "code": code,
                    "output": code_output,
                    "analysis": analysis
                })

            except Exception as e:
                response["code_output"] = {"error": f"Code execution failed: {str(e)}"}

            # Generate educational content based on code and context
            response["socratic_questions"] = self._generate_socratic_questions(analysis, user_input, code)
            response["learning_objectives"] = self._generate_learning_objectives(analysis, code)

        # Store conversation
        self.conversation_history.append({
            "user_input": user_input,
            "code": code,
            "response": response,
            "timestamp": datetime.now().isoformat()
        })

        return response

    def _create_enhanced_prompt(self, user_input: str, code: str = None) -> str:
        """Create enhanced prompt that includes code context for the LLM.

        Returns ``user_input`` unchanged when ``code`` is ``None`` or blank;
        otherwise returns a prompt embedding both the question and the code.
        """

        if not code or not code.strip():
            return user_input

        enhanced_prompt = f"""
User Question: {user_input}

User's Python Code:
```python
{code}
```

Please analyze this code in the context of the user's question. Consider:
1. How the code relates to the question being asked
2. Potential GPU acceleration opportunities in this specific code
3. Any issues, optimizations, or improvements you can suggest
4. Educational insights about GPU acceleration concepts demonstrated in this code

Provide a comprehensive response that addresses both the question and the code together.
"""
        return enhanced_prompt

    def _execute_code_safely(self, code: str) -> Dict:
        """Execute code in-process and capture its output.

        SECURITY: despite the name, this is not a sandbox. The snippet receives
        the real ``__builtins__`` (``open``, ``__import__`` and so on) and runs
        inside the current process. Only pass code you trust.

        The snippet runs in a single namespace so that functions, lambdas and
        comprehensions it defines can see its own top-level names. stdout and
        stderr are captured for the duration of the run.

        Args:
            code: Python source to execute.

        Returns:
            Dict with ``stdout``, ``stderr``, ``variables`` (name -> short text
            summary of names the snippet bound), ``execution_time`` (seconds
            spent in the snippet itself), ``status`` (``"success"``/``"error"``)
            and, on failure, ``error`` (the exception message).
        """

        import contextlib
        import importlib
        import io

        execution_result = {
            "stdout": "",
            "stderr": "",
            "variables": {},
            "execution_time": 0,
            "status": "success"
        }

        # Execution namespace: full builtins plus a few explicit names.
        namespace = {
            '__builtins__': __builtins__,
            'print': print,
            'len': len,
            'range': range,
            'enumerate': enumerate,
            'zip': zip,
            'sum': sum,
            'max': max,
            'min': min,
            'abs': abs,
            'round': round,
            'type': type,
            'str': str,
            'int': int,
            'float': float,
            'list': list,
            'dict': dict,
            'tuple': tuple,
            'set': set,
        }

        # Add commonly used libraries when they are installed. This happens
        # before the timer starts so import cost is not billed to the snippet.
        optional_libraries = (
            ("numpy", ("np", "numpy")),
            ("pandas", ("pd", "pandas")),
            ("matplotlib.pyplot", ("plt",)),
        )
        for module_name, aliases in optional_libraries:
            try:
                module = importlib.import_module(module_name)
            except ImportError:
                continue
            for alias in aliases:
                namespace[alias] = module

        # Snapshot used afterwards to tell snippet-defined names from preloaded ones.
        preloaded = dict(namespace)

        stdout_capture = io.StringIO()
        stderr_capture = io.StringIO()

        start_time = time.perf_counter()
        try:
            with contextlib.redirect_stdout(stdout_capture), contextlib.redirect_stderr(stderr_capture):
                # One dict serves as both globals and locals; with two separate
                # dicts the code would run as if inside a class body and nested
                # scopes could not resolve the snippet's own variables.
                exec(code, namespace)
        except Exception as e:
            execution_result["status"] = "error"
            execution_result["error"] = str(e)
        finally:
            execution_result["execution_time"] = time.perf_counter() - start_time
            execution_result["stdout"] = stdout_capture.getvalue()
            execution_result["stderr"] = stderr_capture.getvalue()

        # Variables are only summarized for runs that completed.
        if execution_result["status"] == "success":
            execution_result["variables"] = self._summarize_variables(namespace, preloaded)

        return execution_result

    @staticmethod
    def _summarize_variables(namespace: Dict, preloaded: Dict) -> Dict:
        """Describe the names a snippet bound, as short strings.

        Names starting with ``_`` and names still bound to the exact object that
        was preloaded (for example after ``import numpy as np``) are skipped.
        """
        summary = {}
        for name, value in namespace.items():
            if name.startswith('_'):
                continue
            if name in preloaded and preloaded[name] is value:
                continue
            try:
                # Only store basic info about complex objects
                if hasattr(value, 'shape'):  # numpy arrays, pandas objects
                    summary[name] = f"{type(value).__name__} with shape {value.shape}"
                elif hasattr(value, '__len__') and len(value) > 100:
                    summary[name] = f"{type(value).__name__} with {len(value)} elements"
                elif isinstance(value, (int, float, str, bool, list, dict, tuple)) and len(str(value)) < 1000:
                    summary[name] = str(value)
                else:
                    summary[name] = f"{type(value).__name__} object"
            except Exception:
                # Objects with misbehaving __len__/__str__ get the generic label.
                summary[name] = f"{type(value).__name__} object"
        return summary

    def _generate_socratic_questions(self, analysis: Dict, user_context: str, code: str) -> List[str]:
        """Generate Socratic questions based on code analysis and user context.

        Args:
            analysis: Code analysis; ``libraries_detected`` and
                ``estimated_speedup`` are read from it.
            user_context: The user's question (currently not used).
            code: The user's source, scanned for loops and function definitions.

        Returns:
            At most three questions, in the order they were generated.
        """
        questions = []

        libraries = analysis.get("libraries_detected", [])
        estimated_speedup = analysis.get("estimated_speedup", 1.0)

        # Code-specific questions
        if "numpy" in libraries:
            questions.extend([
                "Looking at your NumPy operations, which ones do you think would benefit most from GPU acceleration?",
                "How might the memory access patterns in your code affect GPU performance?",
                "What would happen to performance if you increased the array sizes by 10x?"
            ])

        if "pandas" in libraries:
            questions.extend([
                "Which pandas operations in your code are most computationally expensive?",
                "How would you modify this code to work with cuDF instead of pandas?",
                "What considerations should you make when transferring data between CPU and GPU?"
            ])

        # Context-aware questions
        if "for " in code and "range(" in code:
            questions.append("Could you vectorize any of these loops to improve performance?")

        if "def " in code:
            questions.append("How could you modify this function to accept both CPU and GPU arrays?")

        if estimated_speedup > 5:
            questions.append("Your code has high parallelization potential. What makes it suitable for GPU acceleration?")
        elif estimated_speedup < 2:
            questions.append("This code may not benefit much from GPU acceleration. Can you identify why?")

        return questions[:3]  # Limit to avoid overwhelming

    def _generate_learning_objectives(self, analysis: Dict, code: str) -> List[str]:
        """Generate specific learning objectives based on the code and analysis.

        Args:
            analysis: Code analysis; ``libraries_detected`` is read from it.
            code: The user's source, scanned for ``"for "`` and ``"def "``.

        Returns:
            List of objective strings (possibly empty).
        """
        objectives = []

        libraries = analysis.get("libraries_detected", [])

        if "numpy" in libraries:
            objectives.extend([
                "Understand when to use CuPy vs NumPy for your specific operations",
                "Learn about GPU memory management for array operations",
                "Master efficient data transfer between CPU and GPU"
            ])

        if "pandas" in libraries:
            objectives.extend([
                "Compare cuDF vs pandas for your data processing workflow",
                "Understand GPU memory requirements for dataframe operations",
                "Learn efficient groupby and aggregation patterns on GPU"
            ])

        # Code-specific objectives
        if "for " in code:
            objectives.append("Explore vectorization techniques to eliminate loops")

        if "def " in code:
            objectives.append("Design functions that work efficiently with both CPU and GPU data")

        return objectives

    def generate_tutorial_content(self, topic: str) -> str:
        """Generate comprehensive tutorial content on specific GPU acceleration topics.

        Args:
            topic: Subject inserted into the tutorial prompt.

        Returns:
            The ``content`` of the last message returned by the RAG graph.
        """

        tutorial_prompt = f"""
        Create a comprehensive tutorial on {topic} for GPU acceleration. Include:
        1. Conceptual explanation
        2. Code examples comparing CPU vs GPU approaches
        3. Performance considerations
        4. Best practices
        5. Common pitfalls to avoid
        
        Focus on practical, hands-on learning with RAPIDS and CuPy libraries.
        """

        result = self.rag_graph.invoke({
            "messages": [{"role": "user", "content": tutorial_prompt}]
        })

        return result["messages"][-1].content

    def get_execution_summary(self) -> Dict:
        """Get summary of all code execution results.

        Counts, average time and library usage cover every recorded execution,
        so ``successful_executions + failed_executions == total_executions``.
        Only ``recent_outputs`` is limited to the latest entries.

        Returns:
            ``{"message": ...}`` when nothing has been executed, otherwise a dict
            with ``total_executions``, ``successful_executions``,
            ``failed_executions``, ``average_execution_time`` (mean over
            successful runs, 0 if none), ``common_libraries`` (``(name, count)``
            pairs, most frequent first) and ``recent_outputs``.
        """
        all_results = self.code_execution_results
        if not all_results:
            return {"message": "No code executed yet"}

        execution_times = []
        libraries_count = {}

        for result in all_results:
            output = result.get("output", {})
            if output.get("status") == "success":
                execution_times.append(output.get("execution_time", 0))

            # Count libraries
            for lib in result.get("analysis", {}).get("libraries_detected", []):
                libraries_count[lib] = libraries_count.get(lib, 0) + 1

        recent_outputs = []
        for result in all_results[-self._RECENT_OUTPUT_LIMIT:]:
            output = result.get("output", {})
            recent_outputs.append({
                "timestamp": result.get("timestamp"),
                "status": output.get("status", "unknown"),
                "execution_time": output.get("execution_time", 0),
                "output_length": len(output.get("stdout", ""))
            })

        successful = len(execution_times)
        average_time = sum(execution_times) / successful if execution_times else 0

        return {
            "total_executions": len(all_results),
            "successful_executions": successful,
            "failed_executions": len(all_results) - successful,
            "average_execution_time": average_time,
            "common_libraries": sorted(libraries_count.items(), key=lambda x: x[1], reverse=True),
            "recent_outputs": recent_outputs
        }

# Initialize the enhanced GPU mentor.
# `graph` is not defined in this section; it is used as the mentor's rag_graph,
# i.e. the object whose .invoke({"messages": [...]}) produces the text response.
gpu_mentor = EnhancedGPUMentor(graph, benchmark_engine, code_optimizer)

## 15. Enhanced Gradio Interface - GPU Mentor Playground

In [None]:
class BenchmarkEngine:
    """Comprehensive benchmarking engine for CPU vs GPU performance comparison."""
    
    def __init__(self):
        self.benchmark_results = []
        self.predefined_benchmarks = self._setup_predefined_benchmarks()
    
    def _setup_predefined_benchmarks(self):
        """Setup predefined benchmarks based on NVIDIA Rapids techniques."""
        return {
            "Matrix Operations": {
                "description": "Compare NumPy vs CuPy for large matrix operations",
                "categories": ["Linear Algebra", "Array Processing"],
                "benchmarks": [
                    {
                        "name": "Matrix Multiplication",
                        "cpu_code": """
import numpy as np
import time

# Setup
size = {size}
A = np.random.rand(size, size).astype(np.float32)
B = np.random.rand(size, size).astype(np.float32)

# Benchmark
start_time = time.perf_counter()
C = np.matmul(A, B)
cpu_time = time.perf_counter() - start_time

result = {{"execution_time": cpu_time, "result_shape": C.shape, "result_sum": float(np.sum(C))}}
""",
                        "gpu_code": """
import cupy as cp
import time

# Setup
size = {size}
A = cp.random.rand(size, size).astype(cp.float32)
B = cp.random.rand(size, size).astype(cp.float32)

# Benchmark
start_time = time.perf_counter()
C = cp.matmul(A, B)
cp.cuda.Device().synchronize()
gpu_time = time.perf_counter() - start_time

result = {{"execution_time": gpu_time, "result_shape": C.shape, "result_sum": float(cp.sum(C))}}
""",
                        "sizes": [256, 512, 1024, 2048],
                        "metric": "execution_time"
                    },
                    {
                        "name": "Singular Value Decomposition",
                        "cpu_code": """
import numpy as np
import time

# Setup
size = {size}
A = np.random.rand(size, size).astype(np.float32)

# Benchmark
start_time = time.perf_counter()
U, s, Vt = np.linalg.svd(A)
cpu_time = time.perf_counter() - start_time

result = {{"execution_time": cpu_time, "singular_values": len(s), "min_sv": float(np.min(s))}}
""",
                        "gpu_code": """
import cupy as cp
import time

# Setup
size = {size}
A = cp.random.rand(size, size).astype(cp.float32)

# Benchmark
start_time = time.perf_counter()
U, s, Vt = cp.linalg.svd(A)
cp.cuda.Device().synchronize()
gpu_time = time.perf_counter() - start_time

result = {{"execution_time": gpu_time, "singular_values": len(s), "min_sv": float(cp.min(s))}}
""",
                        "sizes": [128, 256, 512, 1024],
                        "metric": "execution_time"
                    }
                ]
            },
            
            "DataFrame Operations": {
                "description": "Compare Pandas vs cuDF for data processing tasks",
                "categories": ["Data Processing", "Analytics"],
                "benchmarks": [
                    {
                        "name": "GroupBy Aggregation",
                        "cpu_code": """
import pandas as pd
import numpy as np
import time

# Setup
n = {size}
df = pd.DataFrame({{
    'group': np.random.choice(['A', 'B', 'C', 'D', 'E'], n),
    'value1': np.random.randn(n),
    'value2': np.random.randn(n),
    'value3': np.random.randint(1, 100, n)
}})

# Benchmark
start_time = time.perf_counter()
result_df = df.groupby('group').agg({{
    'value1': ['mean', 'std', 'min', 'max'],
    'value2': ['sum', 'count'],
    'value3': ['median']
}})
cpu_time = time.perf_counter() - start_time

result = {{"execution_time": cpu_time, "groups": len(result_df), "total_rows": len(df)}}
""",
                        "gpu_code": """
import cudf
import numpy as np
import time

# Setup
n = {size}
df = cudf.DataFrame({{
    'group': np.random.choice(['A', 'B', 'C', 'D', 'E'], n),
    'value1': np.random.randn(n),
    'value2': np.random.randn(n),
    'value3': np.random.randint(1, 100, n)
}})

# Benchmark
start_time = time.perf_counter()
result_df = df.groupby('group').agg({{
    'value1': ['mean', 'std', 'min', 'max'],
    'value2': ['sum', 'count'],
    'value3': ['mean']  # cuDF doesn't support median in groupby
}})
gpu_time = time.perf_counter() - start_time

result = {{"execution_time": gpu_time, "groups": len(result_df), "total_rows": len(df)}}
""",
                        "sizes": [100000, 500000, 1000000, 2000000],
                        "metric": "execution_time"
                    },
                    {
                        "name": "String Operations",
                        "cpu_code": """
import pandas as pd
import numpy as np
import time

# Setup
n = {size}
df = pd.DataFrame({{
    'text': ['sample_text_' + str(i) for i in range(n)],
    'category': np.random.choice(['cat', 'dog', 'bird'], n)
}})

# Benchmark
start_time = time.perf_counter()
df['text_upper'] = df['text'].str.upper()
df['text_length'] = df['text'].str.len()
df['contains_sample'] = df['text'].str.contains('sample')
cpu_time = time.perf_counter() - start_time

result = {{"execution_time": cpu_time, "processed_strings": len(df), "avg_length": df['text_length'].mean()}}
""",
                        "gpu_code": """
import cudf
import numpy as np
import time

# Setup
n = {size}
df = cudf.DataFrame({{
    'text': ['sample_text_' + str(i) for i in range(n)],
    'category': np.random.choice(['cat', 'dog', 'bird'], n)
}})

# Benchmark
start_time = time.perf_counter()
df['text_upper'] = df['text'].str.upper()
df['text_length'] = df['text'].str.len()
df['contains_sample'] = df['text'].str.contains('sample')
gpu_time = time.perf_counter() - start_time

result = {{"execution_time": gpu_time, "processed_strings": len(df), "avg_length": df['text_length'].mean()}}
""",
                        "sizes": [50000, 100000, 250000, 500000],
                        "metric": "execution_time"
                    }
                ]
            },
            
            "Machine Learning": {
                "description": "Compare scikit-learn vs cuML for ML algorithms",
                "categories": ["Machine Learning", "Classification"],
                "benchmarks": [
                    {
                        "name": "K-Means Clustering",
                        "cpu_code": """
import numpy as np
from sklearn.cluster import KMeans
import time

# Setup
n_samples = {size}
n_features = 20
X = np.random.rand(n_samples, n_features).astype(np.float32)

# Benchmark
start_time = time.perf_counter()
kmeans = KMeans(n_clusters=8, random_state=42, n_init=10)
labels = kmeans.fit_predict(X)
cpu_time = time.perf_counter() - start_time

result = {{"execution_time": cpu_time, "n_clusters": 8, "inertia": float(kmeans.inertia_)}}
""",
                        "gpu_code": """
import cupy as cp
from cuml.cluster import KMeans
import time

# Setup
n_samples = {size}
n_features = 20
X = cp.random.rand(n_samples, n_features).astype(cp.float32)

# Benchmark
start_time = time.perf_counter()
kmeans = KMeans(n_clusters=8, random_state=42, n_init=10)
labels = kmeans.fit_predict(X)
cp.cuda.Device().synchronize()
gpu_time = time.perf_counter() - start_time

result = {{"execution_time": gpu_time, "n_clusters": 8, "inertia": float(kmeans.inertia_)}}
""",
                        "sizes": [10000, 50000, 100000, 200000],
                        "metric": "execution_time"
                    }
                ]
            },
            
            "Mathematical Functions": {
                "description": "Compare NumPy vs CuPy for mathematical operations",
                "categories": ["Mathematics", "Signal Processing"],
                "benchmarks": [
                    {
                        "name": "FFT Computation",
                        "cpu_code": """
import numpy as np
import time

# Setup
n = {size}
x = np.random.randn(n).astype(np.complex64)

# Benchmark
start_time = time.perf_counter()
fft_result = np.fft.fft(x)
cpu_time = time.perf_counter() - start_time

result = {{"execution_time": cpu_time, "fft_size": len(fft_result), "max_magnitude": float(np.max(np.abs(fft_result)))}}
""",
                        "gpu_code": """
import cupy as cp
import time

# Setup
n = {size}
x = cp.random.randn(n).astype(cp.complex64)

# Benchmark
start_time = time.perf_counter()
fft_result = cp.fft.fft(x)
cp.cuda.Device().synchronize()
gpu_time = time.perf_counter() - start_time

result = {{"execution_time": gpu_time, "fft_size": len(fft_result), "max_magnitude": float(cp.max(cp.abs(fft_result)))}}
""",
                        "sizes": [8192, 32768, 131072, 524288],
                        "metric": "execution_time"
                    }
                ]
            }
        }
    
    def run_benchmark(self, category, benchmark_name, size):
        """Execute the CPU and GPU variants of one predefined benchmark.

        The benchmark is looked up by ``benchmark_name`` under ``category`` in
        ``self.predefined_benchmarks``. ``size`` is substituted into both code
        templates, each template is run with ``exec`` in its own empty
        namespace, and the ``result`` variable of that namespace is collected.

        Args:
            category: Key of ``self.predefined_benchmarks``.
            benchmark_name: ``"name"`` of an entry in that category's
                ``"benchmarks"`` list.
            size: Value formatted into the ``{size}`` placeholder of the
                code templates.

        Returns:
            ``None`` when the category or the benchmark is unknown. Otherwise
            a dict with the keys ``benchmark``, ``category``, ``size``,
            ``cpu_result``, ``gpu_result``, ``speedup``, ``winner`` and
            ``error``; the same dict is appended to
            ``self.benchmark_results``.
        """
        catalog = self.predefined_benchmarks
        if category not in catalog:
            return None

        # First entry whose name matches, or None when there is no such entry
        spec = next(
            (entry for entry in catalog[category]["benchmarks"]
             if entry["name"] == benchmark_name),
            None,
        )
        if not spec:
            return None

        # Fill the {size} placeholder of both templates
        cpu_source = spec["cpu_code"].format(size=size)
        gpu_source = spec["gpu_code"].format(size=size)

        report = {
            "benchmark": benchmark_name,
            "category": category,
            "size": size,
            "cpu_result": None,
            "gpu_result": None,
            "speedup": None,
            "winner": None,
            "error": None
        }

        try:
            # CPU variant: a failure here aborts the whole benchmark
            cpu_namespace = {}
            exec(cpu_source, cpu_namespace)
            cpu_outcome = cpu_namespace.get("result", {})
            report["cpu_result"] = cpu_outcome

            # GPU variant: failures are recorded but keep the CPU numbers
            try:
                gpu_namespace = {}
                exec(gpu_source, gpu_namespace)
                gpu_outcome = gpu_namespace.get("result", {})
                report["gpu_result"] = gpu_outcome

                # Speedup needs a timing from both sides
                if (cpu_outcome and gpu_outcome
                        and "execution_time" in cpu_outcome
                        and "execution_time" in gpu_outcome):
                    cpu_elapsed = cpu_outcome["execution_time"]
                    gpu_elapsed = gpu_outcome["execution_time"]

                    # A zero GPU time leaves speedup/winner unset
                    if gpu_elapsed > 0:
                        ratio = cpu_elapsed / gpu_elapsed
                        report["speedup"] = ratio
                        report["winner"] = "GPU" if ratio > 1 else "CPU"

            except ImportError as missing_lib:
                report["error"] = f"GPU libraries not available: {missing_lib!s}"
                report["gpu_result"] = {"error": "GPU libraries not available"}
            except Exception as gpu_failure:
                report["error"] = f"GPU execution failed: {gpu_failure!s}"
                report["gpu_result"] = {"error": str(gpu_failure)}

        except Exception as cpu_failure:
            report["error"] = f"Benchmark execution failed: {cpu_failure!s}"

        # Every attempted run is kept in the history, failed ones included
        self.benchmark_results.append(report)
        return report
    
    def get_benchmark_categories(self):
        """Return the category names of ``self.predefined_benchmarks`` as a new list."""
        return [category_name for category_name in self.predefined_benchmarks]
    
    def get_benchmarks_for_category(self, category):
        """Return the benchmark names registered under ``category``.

        An unknown category yields an empty list.
        """
        if category not in self.predefined_benchmarks:
            return []

        names = []
        for spec in self.predefined_benchmarks[category]["benchmarks"]:
            names.append(spec["name"])
        return names
    
    def get_benchmark_sizes(self, category, benchmark_name):
        """Return the ``"sizes"`` list of the named benchmark in ``category``.

        An unknown category or benchmark name yields an empty list.
        """
        if category not in self.predefined_benchmarks:
            return []

        candidates = self.predefined_benchmarks[category]["benchmarks"]
        # First entry with a matching name wins
        match = next(
            (spec for spec in candidates if spec["name"] == benchmark_name),
            None,
        )
        return match["sizes"] if match is not None else []
    
    def format_benchmark_results(self, results):
        """Render one benchmark result dict as a Markdown report.

        Args:
            results: Dict shaped like the one built by ``run_benchmark``
                (keys ``benchmark``, ``category``, ``size``, ``cpu_result``,
                ``gpu_result``, ``speedup``, ``winner``, ``error``), or a
                falsy value such as ``None``.

        Returns:
            A Markdown string. Falsy input yields a generic error line. When
            only the GPU half of the benchmark failed, the CPU section is
            still rendered and is followed by the GPU error. Any other
            recorded error yields just the error line.
        """
        if not results:
            return "❌ Error: Unknown error"

        cpu_result = results.get("cpu_result")
        # The key may be present with a None value, so normalise to a dict
        gpu_result = results.get("gpu_result") or {}
        gpu_error = gpu_result.get("error")

        # run_benchmark also sets the top-level "error" when only the GPU half
        # failed. In that case the CPU numbers are still worth showing, so the
        # early exit is limited to failures that left nothing to report.
        if results.get("error") and not (cpu_result and gpu_error):
            return f"❌ Error: {results['error']}"

        def metric_lines(metrics):
            """Bullet lines for every metric except the execution time."""
            return "".join(
                f"- **{key.replace('_', ' ').title()}:** {value}\n"
                for key, value in metrics.items()
                if key != "execution_time"
            )

        output = f"## 🏁 Benchmark Results: {results['benchmark']}\n\n"
        output += f"**Category:** {results['category']}  \n"
        output += f"**Problem Size:** {results['size']:,}  \n\n"

        # CPU Results
        if cpu_result:
            cpu_time = cpu_result.get("execution_time", 0)
            output += "### 🖥️ CPU Performance (NumPy/Pandas/scikit-learn)\n"
            output += f"- **Execution Time:** {cpu_time:.4f} seconds\n"
            output += metric_lines(cpu_result)
            output += "\n"

        # GPU Results
        if gpu_result and not gpu_error:
            gpu_time = gpu_result.get("execution_time", 0)
            output += "### 🚀 GPU Performance (CuPy/cuDF/cuML)\n"
            output += f"- **Execution Time:** {gpu_time:.4f} seconds\n"
            output += metric_lines(gpu_result)
            output += "\n"

            # Speedup analysis (skipped when no speedup was computed)
            speedup = results.get("speedup")
            if speedup:
                winner = results["winner"]
                output += "### 📊 Performance Analysis\n"
                output += f"- **Speedup:** {speedup:.2f}x\n"
                output += f"- **Winner:** {winner} 🏆\n"

                if speedup > 1:
                    output += f"- **Performance Gain:** {((speedup - 1) * 100):.1f}% faster on GPU\n"
                    if speedup > 10:
                        output += "- **Analysis:** 🔥 Excellent GPU acceleration! This workload is highly parallel.\n"
                    elif speedup > 3:
                        output += "- **Analysis:** ✅ Good GPU performance gain.\n"
                    else:
                        output += "- **Analysis:** ⚡ Moderate GPU acceleration.\n"
                else:
                    output += f"- **Performance Loss:** {((1 - speedup) * 100):.1f}% slower on GPU\n"
                    output += "- **Analysis:** ⚠️ GPU overhead dominates for this problem size. Try larger datasets.\n"

        elif gpu_error:
            output += "### ❌ GPU Results\n"
            output += f"Error: {gpu_error}\n\n"
            output += "💡 **Note:** GPU libraries (CuPy, cuDF, cuML) need to be installed for GPU benchmarks.\n"

        return output
    
    def get_recent_results(self, limit=5):
        """Return up to ``limit`` of the most recently stored benchmark results.

        Args:
            limit: Maximum number of results to return, newest last.

        Returns:
            A new list holding the last ``limit`` entries of
            ``self.benchmark_results``; empty when nothing is stored or when
            ``limit`` is zero or negative.
        """
        # Guard explicitly: ``seq[-0:]`` is the whole sequence, not an empty one.
        if limit <= 0:
            return []
        return self.benchmark_results[-limit:]

# Create the notebook-level BenchmarkEngine instance (constructed with no arguments)
benchmark_engine = BenchmarkEngine()

In [None]:
class EducationalContentEnhancer:
    """Static catalogue of GPU-acceleration code examples and performance tips.

    The catalogue is built once in ``__init__`` and exposed through
    ``code_examples`` (operation key -> example dict) and
    ``performance_insights`` (guideline lists plus per-library tips).
    """
    
    def __init__(self):
        self.code_examples = self._load_code_examples()
        self.performance_insights = self._load_performance_insights()
    
    def _load_code_examples(self):
        """Return the curated CPU/GPU example pairs, keyed by operation.

        Each entry holds ``cpu_code``, ``gpu_code``, ``expected_speedup`` and
        ``key_points``.
        """
        return {
            "matrix_multiplication": {
                "cpu_code": """
# CPU Version with NumPy
import numpy as np
import time

size = 2048
A = np.random.rand(size, size).astype(np.float32)
B = np.random.rand(size, size).astype(np.float32)

start_time = time.perf_counter()
C = np.matmul(A, B)
cpu_time = time.perf_counter() - start_time
print(f"CPU time: {cpu_time:.4f} seconds")
""",
                "gpu_code": """
# GPU Version with CuPy  
import cupy as cp
import time

size = 2048
A = cp.random.rand(size, size).astype(cp.float32)
B = cp.random.rand(size, size).astype(cp.float32)

start_time = time.perf_counter()
C = cp.matmul(A, B)
cp.cuda.Device().synchronize()
gpu_time = time.perf_counter() - start_time
print(f"GPU time: {gpu_time:.4f} seconds")
""",
                "expected_speedup": "10-50x",
                "key_points": [
                    "Use float32 for better GPU performance",
                    "Synchronize GPU for accurate timing",
                    "Performance scales with matrix size"
                ]
            },
            
            "dataframe_groupby": {
                "cpu_code": """
# CPU Version with Pandas
import pandas as pd
import numpy as np
import time

n = 1000000
df = pd.DataFrame({
    'group': np.random.choice(['A', 'B', 'C', 'D'], n),
    'value1': np.random.randn(n),
    'value2': np.random.randn(n)
})

start_time = time.perf_counter()
result = df.groupby('group').agg({
    'value1': ['mean', 'std'],
    'value2': ['sum', 'count']
})
cpu_time = time.perf_counter() - start_time
print(f"CPU time: {cpu_time:.4f} seconds")
""",
                "gpu_code": """
# GPU Version with cuDF
import cudf
import numpy as np
import time

n = 1000000
df = cudf.DataFrame({
    'group': np.random.choice(['A', 'B', 'C', 'D'], n),
    'value1': np.random.randn(n),
    'value2': np.random.randn(n)
})

start_time = time.perf_counter()
result = df.groupby('group').agg({
    'value1': ['mean', 'std'],
    'value2': ['sum', 'count']
})
gpu_time = time.perf_counter() - start_time
print(f"GPU time: {gpu_time:.4f} seconds")
""",
                "expected_speedup": "5-20x",
                "key_points": [
                    "cuDF API is nearly identical to pandas",
                    "Best performance with large datasets",
                    "GPU memory considerations for large DataFrames"
                ]
            },
            
            "element_wise_operations": {
                "cpu_code": """
# CPU Version - Element-wise operations
import numpy as np
import time

n = 10000000
x = np.random.rand(n).astype(np.float32)
y = np.random.rand(n).astype(np.float32)

start_time = time.perf_counter()
# Multiple operations create intermediate arrays
result = np.sqrt(x**2 + y**2)
mean_result = np.mean(result)
cpu_time = time.perf_counter() - start_time
print(f"CPU time: {cpu_time:.4f} seconds")
""",
                "gpu_code": """
# GPU Version with CuPy and kernel fusion
import cupy as cp
import time

n = 10000000
x = cp.random.rand(n).astype(cp.float32)
y = cp.random.rand(n).astype(cp.float32)

@cp.fuse()
def fused_distance_mean(x, y):
    return cp.mean(cp.sqrt(x**2 + y**2))

start_time = time.perf_counter()
mean_result = fused_distance_mean(x, y)
cp.cuda.Device().synchronize()
gpu_time = time.perf_counter() - start_time
print(f"GPU time: {gpu_time:.4f} seconds")
""",
                "expected_speedup": "3-15x",
                "key_points": [
                    "Use @cp.fuse() to reduce kernel launches",
                    "Avoid creating unnecessary intermediate arrays",
                    "Memory bandwidth often limits performance"
                ]
            }
        }
    
    def _load_performance_insights(self):
        """Return general guidelines plus per-library optimization tips.

        Top-level keys: ``general_principles``, ``when_to_use_gpu``,
        ``when_not_to_use_gpu`` (lists of strings) and
        ``optimization_techniques`` (library name -> list of tips).
        """
        return {
            "general_principles": [
                "GPU acceleration benefits scale with problem size",
                "Memory bandwidth often bottlenecks GPU performance",
                "Minimize CPU-GPU data transfers",
                "Use appropriate data types (float32 vs float64)",
                "Batch operations to amortize kernel launch overhead"
            ],
            
            "when_to_use_gpu": [
                "Large datasets (>100K elements for arrays, >50K rows for DataFrames)",
                "Highly parallel operations (matrix multiplication, element-wise ops)",
                "Repetitive computations that stay on GPU",
                "Machine learning with large feature spaces"
            ],
            
            "when_not_to_use_gpu": [
                "Small datasets where overhead dominates",
                "Sequential algorithms that don't parallelize well",
                "Code with frequent CPU-GPU transfers",
                "I/O bound operations"
            ],
            
            "optimization_techniques": {
                "cupy": [
                    "Use @cp.fuse() for element-wise operations",
                    "Keep data on GPU between operations",
                    "Use streams for concurrent operations",
                    "Profile with cupyx.profiler.benchmark"
                ],
                "cudf": [
                    "Use appropriate dtypes to save memory",
                    "Leverage GPU-accelerated string operations",
                    "Use .query() for efficient filtering",
                    "Batch operations on large DataFrames"
                ],
                "cuml": [
                    "Use single precision (float32) when possible",
                    "Leverage GPU memory for large datasets",
                    "Use appropriate algorithm parameters",
                    "Consider data preprocessing on GPU"
                ]
            }
        }
    
    def get_example_for_operation(self, operation_type):
        """Return the example dict for ``operation_type``, or ``None`` if unknown.

        String keys are matched leniently: surrounding whitespace and letter
        case are ignored, and spaces or hyphens count as underscores, so
        ``"Matrix Multiplication"`` finds ``"matrix_multiplication"``.
        """
        if isinstance(operation_type, str):
            # Catalogue keys are lower-case snake_case
            operation_type = (
                operation_type.strip().lower().replace(" ", "_").replace("-", "_")
            )
        return self.code_examples.get(operation_type)
    
    def get_optimization_tips(self, library):
        """Return the optimization tips for ``library``, or ``[]`` if unknown.

        String names are matched ignoring case and surrounding whitespace, so
        the usual spellings ``"CuPy"``, ``"cuDF"`` and ``"cuML"`` all work.
        """
        if isinstance(library, str):
            # Catalogue keys are lower-case ("cupy", "cudf", "cuml")
            library = library.strip().lower()
        techniques = self.performance_insights.get("optimization_techniques", {})
        return techniques.get(library, [])
    
    def get_performance_guidelines(self):
        """Return the full ``performance_insights`` dict (the stored object, not a copy)."""
        return self.performance_insights

# Create the notebook-level EducationalContentEnhancer instance; its
# constructor builds the example and insight catalogues immediately
content_enhancer = EducationalContentEnhancer()

In [None]:
class PerformanceVisualizer:
    """Turn benchmark result dicts into text charts and explanatory summaries.

    The result dicts are expected to carry the keys ``benchmark``,
    ``category``, ``size``, ``cpu_result``, ``gpu_result``, ``speedup`` and
    ``error``. A ``speedup`` that is missing or ``None`` is treated as 1.
    """
    
    def __init__(self):
        self.visualization_templates = self._setup_visualization_templates()
    
    def _setup_visualization_templates(self):
        """Return the Markdown templates keyed by visualization name."""
        return {
            "speedup_chart": """
## 📈 Performance Speedup Analysis

```
Benchmark: {benchmark_name}
Category: {category}
Problem Size: {size:,}

CPU Time:    {cpu_time:.4f}s  ████████████████████████
GPU Time:    {gpu_time:.4f}s  {gpu_bar}
Speedup:     {speedup:.2f}x   {speedup_indicator}
```

**Performance Insights:**
{insights}
""",
            
            "scaling_analysis": """
## 📊 Performance Scaling Analysis

**How performance changes with problem size:**

{scaling_data}

**Key Observations:**
- GPU advantage increases with larger problem sizes
- Overhead is more significant for smaller datasets
- Memory bandwidth becomes the limiting factor at large scales
""",
            
            "comparison_matrix": """
## 🏆 Technology Comparison

| Operation Type | CPU Library | GPU Library | Typical Speedup | Best Use Case |
|----------------|-------------|-------------|-----------------|---------------|
| Matrix Ops     | NumPy       | CuPy        | 10-50x         | Linear algebra, large arrays |
| DataFrame Ops  | Pandas      | cuDF        | 5-20x          | Data processing, analytics |
| ML Algorithms  | scikit-learn| cuML        | 5-25x          | Large datasets, feature engineering |
| Math Functions | NumPy       | CuPy        | 3-15x          | Signal processing, numerical computing |

**💡 Selection Guidelines:**
- **Problem Size**: GPU benefits increase with larger datasets
- **Memory**: Consider GPU memory limitations for very large data
- **Pipeline**: Keep operations on GPU to avoid transfer overhead
"""
        }
    
    @staticmethod
    def _resolve_speedup(benchmark_result):
        """Return the recorded speedup, or 1 when it is missing or ``None``."""
        # A plain ``.get("speedup", 1)`` is not enough: the key can be present
        # with a None value, which would break comparisons and formatting.
        speedup = benchmark_result.get("speedup")
        return 1 if speedup is None else speedup
    
    def create_speedup_visualization(self, benchmark_result):
        """Render a text bar chart comparing CPU and GPU time.

        Args:
            benchmark_result: Result dict, or a falsy value.

        Returns:
            The filled-in ``speedup_chart`` template, or a short error line
            when the result is missing, carries an ``error``, or lacks a
            usable CPU or GPU result.
        """
        if not benchmark_result or benchmark_result.get("error"):
            return "❌ No valid benchmark results to visualize"
        
        cpu_result = benchmark_result.get("cpu_result", {})
        gpu_result = benchmark_result.get("gpu_result", {})
        
        if not cpu_result or not gpu_result or gpu_result.get("error"):
            return "❌ Incomplete benchmark results for visualization"
        
        cpu_time = cpu_result.get("execution_time", 0)
        gpu_time = gpu_result.get("execution_time", 0)
        speedup = self._resolve_speedup(benchmark_result)
        
        # The CPU bar in the template is 24 blocks wide; the GPU bar is scaled
        # against it and capped so that a much slower GPU run cannot produce
        # an arbitrarily long line.
        max_bar_length = 24
        max_gpu_bar_length = 2 * max_bar_length
        if cpu_time > 0:
            gpu_bar_length = max(1, int((gpu_time / cpu_time) * max_bar_length))
            if gpu_bar_length > max_gpu_bar_length:
                gpu_bar = "█" * max_gpu_bar_length + "…"
            else:
                gpu_bar = "█" * gpu_bar_length
        else:
            gpu_bar = "█"
        
        # Speedup indicator
        if speedup > 10:
            speedup_indicator = "🔥 Excellent acceleration!"
        elif speedup > 3:
            speedup_indicator = "✅ Good performance gain"
        elif speedup > 1:
            speedup_indicator = "⚡ Moderate improvement"
        else:
            speedup_indicator = "⚠️ GPU overhead dominates"
        
        # Generate insights
        insights = self._generate_performance_insights(benchmark_result)
        
        return self.visualization_templates["speedup_chart"].format(
            benchmark_name=benchmark_result.get("benchmark", "Unknown"),
            category=benchmark_result.get("category", "Unknown"),
            size=benchmark_result.get("size", 0),
            cpu_time=cpu_time,
            gpu_time=gpu_time,
            gpu_bar=gpu_bar,
            speedup=speedup,
            speedup_indicator=speedup_indicator,
            insights=insights
        )
    
    def _generate_performance_insights(self, benchmark_result):
        """Return newline-joined bullet insights derived from size, category and speedup."""
        insights = []
        speedup = self._resolve_speedup(benchmark_result)
        category = benchmark_result.get("category", "")
        size = benchmark_result.get("size", 0)
        
        # Size-based insights
        if size < 1000:
            insights.append("• Small problem size - GPU overhead may limit benefits")
        elif size < 100000:
            insights.append("• Medium problem size - good balance of performance and overhead")
        else:
            insights.append("• Large problem size - excellent candidate for GPU acceleration")
        
        # Category-specific insights
        if "Matrix" in category:
            if speedup > 10:
                insights.append("• Matrix operations scale excellently on GPU due to high parallelism")
            else:
                insights.append("• Consider larger matrices or float32 data type for better GPU performance")
        
        elif "DataFrame" in category:
            if speedup > 5:
                insights.append("• DataFrame operations benefit from GPU's high memory bandwidth")
            else:
                insights.append("• Try larger datasets or more complex operations for better GPU utilization")
        
        elif "Machine Learning" in category:
            if speedup > 5:
                insights.append("• ML algorithms show good GPU acceleration with parallel computations")
            else:
                insights.append("• Consider hyperparameter tuning or larger feature spaces")
        
        # Performance-based insights
        if speedup < 1:
            insights.append("• GPU overhead exceeds benefits - consider CPU for this workload")
        elif speedup > 20:
            insights.append("• Exceptional GPU performance - this workload is highly parallel")
        
        return "\n".join(insights) if insights else "• Standard GPU acceleration performance"
    
    def create_educational_summary(self, benchmark_result):
        """Return a Markdown summary explaining the benchmark outcome.

        Args:
            benchmark_result: Result dict; ``None`` is treated as an empty
                dict so the method never fails on a missing result.
        """
        benchmark_result = benchmark_result or {}
        category = benchmark_result.get("category", "Unknown")
        speedup = self._resolve_speedup(benchmark_result)
        
        summary = f"""
### 🎓 Educational Summary

**What happened in this benchmark:**
"""
        
        if "Matrix" in category:
            summary += """
1. **CPU Processing**: NumPy used optimized BLAS libraries but was limited by sequential processing
2. **GPU Processing**: CuPy leveraged thousands of CUDA cores for parallel matrix computations
3. **Key Factor**: Matrix multiplication is embarrassingly parallel, ideal for GPU architecture
"""
        
        elif "DataFrame" in category:
            summary += """
1. **CPU Processing**: Pandas processed data sequentially with some multi-threading
2. **GPU Processing**: cuDF utilized GPU's high memory bandwidth and parallel cores
3. **Key Factor**: GroupBy operations benefit from GPU's ability to process many groups simultaneously
"""
        
        elif "Machine Learning" in category:
            summary += """
1. **CPU Processing**: scikit-learn used optimized CPU algorithms
2. **GPU Processing**: cuML leveraged GPU parallelism for distance calculations and updates
3. **Key Factor**: ML algorithms with many data points benefit from massive parallelization
"""
        
        # Add learning objectives
        summary += f"""
**Learning Objectives Achieved:**
• Demonstrated {speedup:.1f}x performance improvement with GPU acceleration
• Showed real-world application of NVIDIA Rapids ecosystem
• Illustrated when GPU acceleration provides significant benefits
• Experienced hands-on performance comparison

**Next Steps to Explore:**
• Try different problem sizes to see how speedup scales
• Experiment with different data types (float32 vs float64)
• Explore memory usage patterns between CPU and GPU implementations
"""
        
        return summary

# Create the notebook-level PerformanceVisualizer instance; its constructor
# prepares the visualization templates
perf_visualizer = PerformanceVisualizer()

In [None]:
import gradio as gr
import json

def chat_with_mentor(message, code, chat_history):
    """Forward a question plus optional code to the GPU mentor and format the reply.

    The mentor response is expected to provide ``text_response``,
    ``code_analysis``, ``code_output`` and ``socratic_questions``; each
    non-empty part is appended to the assistant message as its own section.

    Args:
        message: The user's question.
        code: Optional code accompanying the question.
        chat_history: List of ``{"role", "content"}`` dicts, or ``None``.

    Returns:
        A 5-tuple ``("", "", chat_history, code_output, optimized_code)``.
        If anything raises, the error text becomes the assistant message and
        the last two items are ``None`` and ``""``.
    """
    try:
        # The mentor sees the question and the code together
        response = gpu_mentor.process_user_input(message, code)

        reply = response["text_response"]

        # Section: static code analysis
        analysis = response["code_analysis"]
        if analysis:
            reply += f"\n\n**📊 Code Analysis:**\n"
            reply += f"• Libraries detected: {', '.join(analysis['libraries_detected'])}\n"
            reply += f"• Estimated speedup potential: {analysis['estimated_speedup']:.1f}x\n"
            reply += f"• GPU compatible: {'✅' if analysis['gpu_compatible'] else '❌'}\n"

            if analysis['warnings']:
                reply += f"• ⚠️ Warnings: {'; '.join(analysis['warnings'])}\n"

        # Section: result of executing the submitted code
        output = response["code_output"]
        if output:
            reply += f"\n\n**⚡ Code Execution Results:**\n"

            if output.get("status") != "success":
                reply += f"• ❌ Execution failed: {output.get('error', 'Unknown error')}\n"
                if output.get("stderr"):
                    reply += f"• 🚨 Error details:\n```\n{output['stderr']}\n```\n"
            else:
                reply += f"• ✅ Execution successful ({output.get('execution_time', 0):.3f}s)\n"

                if output.get("stdout"):
                    reply += f"• 📄 Output:\n```\n{output['stdout']}\n```\n"

                if output.get("variables"):
                    reply += f"• 📊 Variables created: {', '.join(output['variables'].keys())}\n"
                    # Only the first three variables get a detail line
                    detailed = list(output['variables'].items())[:3]
                    for var_name, var_info in detailed:
                        reply += f"  - `{var_name}`: {var_info}\n"

        # Section: follow-up questions for the learner
        questions = response["socratic_questions"]
        if questions:
            reply += f"\n\n**🤔 Think About This:**\n"
            for number, question in enumerate(questions, start=1):
                reply += f"{number}. {question}\n"

        # Start a fresh history when none was supplied
        chat_history = [] if chat_history is None else chat_history

        # Show the submitted code under the question in the user's bubble
        user_message = message
        if code and code.strip():
            user_message += f"\n\n```python\n{code}\n```"

        chat_history.extend([
            {"role": "user", "content": user_message},
            {"role": "assistant", "content": reply},
        ])

        return "", "", chat_history, response.get("code_output"), response.get("optimized_code", "")

    except Exception as failure:
        error_msg = f"❌ Error: {failure!s}"
        chat_history = [] if chat_history is None else chat_history
        chat_history.extend([
            {"role": "user", "content": message},
            {"role": "assistant", "content": error_msg},
        ])
        return "", "", chat_history, None, ""

def analyze_code_only(code):
    """Analyze code for GPU optimization opportunities without running it.

    Args:
        code: Source code to analyze. ``None``, an empty string and
            whitespace-only input are all treated as "no code".

    Returns:
        A ``(analysis_text, optimized_code)`` tuple. When no code is given,
        or when the analysis raises, the first item is an explanatory message
        and the second is ``""``.
    """
    
    # Guard None as well as blank text; chat_with_mentor guards the same way.
    if not code or not code.strip():
        return "Please provide code to analyze.", ""
    
    try:
        analysis = code_optimizer.analyze_code(code)
        optimized_code = code_optimizer.suggest_optimizations(code)
        
        # chr(10) stands in for "\n" inside the f-string expression below
        analysis_text = f"""
**🔍 Code Analysis Results:**
• Libraries detected: {', '.join(analysis['libraries_detected'])}
• Estimated speedup potential: {analysis['estimated_speedup']:.1f}x
• GPU compatible: {'✅ Yes' if analysis['gpu_compatible'] else '❌ No'}

**⚡ Optimization Opportunities:**
• Matrix operations: {'✅ Detected' if any(op in code for op in ['np.dot', 'np.matmul', '@']) else '❌ None'}
• Array operations: {'✅ Detected' if 'numpy' in analysis['libraries_detected'] else '❌ None'}
• DataFrame operations: {'✅ Detected' if 'pandas' in analysis['libraries_detected'] else '❌ None'}
• Loop vectorization: {'✅ Possible' if 'for ' in code and 'range(' in code else '❌ None'}

**⚠️ Considerations:**
{chr(10).join('• ' + warning for warning in analysis['warnings']) if analysis['warnings'] else '• None detected'}

**💡 Recommendations:**
• Consider using CuPy for NumPy operations on large arrays
• Try cuDF for pandas operations on large datasets  
• Use memory pools for repeated GPU operations
• Profile memory usage for optimal batch sizes
"""
        
        return analysis_text, optimized_code
        
    except Exception as e:
        return f"Error analyzing code: {str(e)}", ""

def get_tutorial(topic):
    """Generate tutorial content for a topic via the GPU mentor.

    Args:
        topic: Topic name. ``None``, an empty string and whitespace-only
            input are all treated as "no topic".

    Returns:
        The generated tutorial, a prompt asking for a topic when none was
        given, or an error message if generation raises.
    """
    
    # Guard None as well as blank text so a cleared input cannot raise.
    if not topic or not topic.strip():
        return "Please specify a topic for the tutorial."
    
    try:
        return gpu_mentor.generate_tutorial_content(topic)
    except Exception as e:
        return f"Error generating tutorial: {str(e)}"

def clear_chat():
    """Return the reset values for the chat outputs: four ``None`` followed by ``""``."""
    cleared = (None,) * 4
    return cleared + ("",)

# Sample code examples for quick testing.
# Maps a display label to a self-contained Python snippet. All snippets are
# CPU-only (NumPy / pandas) and print a short summary of what they computed.
sample_codes = {
    # Element-wise arithmetic followed by reductions on two 1-D arrays
    "Simple Array Operations": '''import numpy as np

# Create arrays
n = 1000
x = np.random.rand(n)
y = np.random.rand(n)

# Basic operations
result = np.sqrt(x**2 + y**2)
mean_result = np.mean(result)

print(f"Array size: {n}")
print(f"Mean result: {mean_result:.4f}")
print(f"Max result: {np.max(result):.4f}")''',

    # Dense square matrix product with np.dot
    "Matrix Multiplication": '''import numpy as np

# Create matrices
n = 500
A = np.random.rand(n, n)
B = np.random.rand(n, n)

# Matrix multiplication
C = np.dot(A, B)

print(f"Matrix size: {n}x{n}")
print(f"Result shape: {C.shape}")
print(f"Result sum: {np.sum(C):.2f}")''',
    
    # pandas groupby with several aggregations per column
    "DataFrame Operations": '''import pandas as pd
import numpy as np

# Create dataset
n = 10000
df = pd.DataFrame({
    'x': np.random.randn(n),
    'y': np.random.randn(n),
    'group': np.random.choice(['A', 'B', 'C'], n)
})

# Compute statistics
result = df.groupby('group').agg({
    'x': ['mean', 'std'],
    'y': ['sum', 'count']
})

print(f"Dataset size: {len(df)} rows")
print("Grouped results:")
print(result)''',
    
    # Damped sine wave and its summary statistics
    "Mathematical Functions": '''import numpy as np

# Generate data
n = 5000
x = np.linspace(0, 4*np.pi, n)
y = np.sin(x) * np.exp(-x/10)

# Compute statistics
mean_y = np.mean(y)
std_y = np.std(y)
max_y = np.max(y)

print(f"Data points: {n}")
print(f"Mean: {mean_y:.4f}")
print(f"Std: {std_y:.4f}")
print(f"Max: {max_y:.4f}")''',

    # Row-by-row Python loop, written that way on purpose as a vectorization candidate
    "Data Processing Loop": '''import numpy as np

# Create data
data = np.random.rand(1000, 10)
results = []

# Process data (can be vectorized)
for i in range(len(data)):
    row_sum = np.sum(data[i])
    row_mean = np.mean(data[i])
    results.append(row_sum * row_mean)

final_result = np.array(results)
print(f"Processed {len(data)} rows")
print(f"Final result shape: {final_result.shape}")
print(f"Average result: {np.mean(final_result):.4f}")'''
}

# Create the enhanced Gradio interface
with gr.Blocks(title="GPU Mentor - Enhanced AI Tutor", theme=gr.themes.Soft()) as demo:
    
    gr.Markdown("# 🚀 Enhanced GPU Mentor: AI Tutor with Integrated Code Execution")
    
    with gr.Tabs():
        
        # Features tab: a single static Markdown panel listing the app's
        # capabilities. It creates no component that is referenced elsewhere.
        with gr.Tab("ℹ️ Features"):
            gr.Markdown("""
            ## 🚀 Enhanced GPU Mentor Features
            
            **🔗 Integrated LLM + Code**: Ask questions about your code - the AI sees both your question and code together
            
            **⚡ Live Code Execution**: Run your Python code instantly and see the output in the chat
            
            **🔍 Smart Analysis**: Get optimization suggestions and GPU acceleration opportunities
            
            **📚 Educational Guidance**: Socratic questions and learning objectives based on your actual code
            
            **🎯 Multi-Modal Support**: Handles text questions, code analysis, and execution all in one interface
            
            **🚀 GPU Optimization**: Automatic detection of optimization opportunities and GPU-compatible code suggestions
            
            **📊 Performance Insights**: Real-time analysis of code performance and potential speedup estimates
            """)
        
        # Chat Playground tab: conversation view, message box with Send/Clear
        # buttons, and a collapsible code editor with a sample-code loader.
        # The components bound to names here (chatbot, message_input,
        # submit_btn, clear_btn, code_input, sample_dropdown, load_sample_btn)
        # are connected to their handlers in the wiring section of this cell.
        with gr.Tab("💬 Chat Playground"):
            with gr.Column():
                # Main conversation area
                chatbot = gr.Chatbot(
                    label="GPU Mentor Conversation",
                    height=500,
                    type="messages",
                    show_copy_button=True
                )
                
                # Integrated input area at bottom of conversation
                with gr.Row():
                    # Wide column for the message box, narrow column for the buttons
                    with gr.Column(scale=3):
                        message_input = gr.Textbox(
                            label="",
                            placeholder="Ask about GPU acceleration, optimization, or explain your code...",
                            lines=2,
                            show_label=False
                        )
                    with gr.Column(scale=1):
                        submit_btn = gr.Button("💬 Send", variant="primary", size="lg")
                        clear_btn = gr.Button("🧹 Clear", size="sm")
                
                # Code input area (collapsible, closed by default)
                with gr.Accordion("📝 Python Code (Optional)", open=False):
                    code_input = gr.Code(
                        label="",
                        language="python",
                        lines=8,
                        show_label=False,
                        # placeholder="# Enter your Python code here (optional)\n# The AI will analyze and execute it along with your question"
                    )
                    
                    # Sample code selector: choices are the keys of sample_codes
                    with gr.Row():
                        sample_dropdown = gr.Dropdown(
                            choices=list(sample_codes.keys()),
                            label="Load Sample Code",
                            value=None,
                            scale=2
                        )
                        load_sample_btn = gr.Button("📂 Load", scale=1)
        
        # Code Analysis tab: left column holds the input editor, the Analyze
        # button and the textual results; right column shows the optimized
        # code. analyze_btn is wired to analyze_code_only, which fills
        # analysis_results and optimized_code.
        with gr.Tab("🔍 Code Analysis & Optimization"):
            with gr.Row():
                with gr.Column():
                    analyze_code = gr.Code(
                        label="Code to Analyze",
                        language="python",
                        lines=15
                    )
                    
                    analyze_btn = gr.Button("🔍 Analyze Code", variant="primary")
                    
                    analysis_results = gr.Textbox(
                        label="Analysis Results",
                        lines=15
                    )
                
                with gr.Column():
                    optimized_code = gr.Code(
                        label="GPU-Optimized Version",
                        language="python",
                        lines=20
                    )
        
        # Performance Benchmarking tab: selection controls on the left, a
        # Markdown results panel on the right, then a static comparison table
        # and a history / quick-benchmark section underneath.
        with gr.Tab("🏁 Performance Benchmarking"):
            gr.Markdown("## 🚀 CPU vs GPU Performance Comparison")
            gr.Markdown("""
            Compare the performance of CPU and GPU implementations across different workloads.
            This interactive benchmarking tool demonstrates real-world GPU acceleration benefits.
            """)
            
            with gr.Row():
                with gr.Column(scale=1):
                    # Benchmark selection controls
                    benchmark_category = gr.Dropdown(
                        choices=benchmark_engine.get_benchmark_categories(),
                        label="📂 Benchmark Category",
                        value="Matrix Operations"
                    )
                    
                    # Starts with no choices; update_benchmark_options supplies
                    # them when the category dropdown changes.
                    benchmark_name = gr.Dropdown(
                        choices=[],
                        label="🎯 Specific Benchmark",
                        value=None
                    )
                    
                    # Starts with no choices; update_size_options supplies them
                    # when the benchmark dropdown changes.
                    benchmark_size = gr.Dropdown(
                        choices=[],
                        label="📏 Problem Size",
                        value=None
                    )
                    
                    run_benchmark_btn = gr.Button("🏃‍♂️ Run Benchmark", variant="primary", size="lg")
                    
                    # Benchmark status (read-only; receives the second value
                    # returned by run_selected_benchmark)
                    benchmark_status = gr.Textbox(
                        label="Status",
                        value="Select benchmark parameters and click 'Run Benchmark'",
                        interactive=False,
                        lines=2
                    )
                
                with gr.Column(scale=2):
                    # Results display (receives the first value returned by
                    # run_selected_benchmark; shows this help text until then)
                    benchmark_results = gr.Markdown(
                        label="📊 Benchmark Results",
                        value="""
### 🎯 Ready to Benchmark!

Select a category, benchmark, and problem size from the left panel, then click **Run Benchmark** to see CPU vs GPU performance comparison.

**Available Categories:**
- **Matrix Operations**: Linear algebra operations (NumPy vs CuPy)
- **DataFrame Operations**: Data processing tasks (Pandas vs cuDF)  
- **Machine Learning**: ML algorithms (scikit-learn vs cuML)
- **Mathematical Functions**: Mathematical computations (NumPy vs CuPy)

**What You'll Learn:**
- Real-world GPU acceleration benefits
- Performance scaling with problem size
- When GPU acceleration is most effective
- Memory and computational trade-offs

**💡 Pro Tips:**
- Start with Matrix Operations for dramatic speedups
- DataFrame Operations work best with large datasets (>100K rows)
- ML Algorithms show consistent benefits across problem sizes
- Mathematical Functions benefit from kernel fusion techniques
"""
                    )
            
            # Technology comparison section (static reference table)
            with gr.Row():
                gr.Markdown("""
### 🏆 Technology Comparison Guide

| **Operation Type** | **CPU Library** | **GPU Library** | **Typical Speedup** | **Best Use Case** |
|-------------------|-----------------|-----------------|---------------------|-------------------|
| **Matrix Operations** | NumPy | CuPy | 10-50x | Linear algebra, large arrays |
| **DataFrame Operations** | Pandas | cuDF | 5-20x | Data processing, analytics |
| **ML Algorithms** | scikit-learn | cuML | 5-25x | Large datasets, feature engineering |
| **Math Functions** | NumPy | CuPy | 3-15x | Signal processing, numerical computing |

**🎯 Selection Guidelines:**
- **Problem Size**: GPU benefits increase with larger datasets
- **Memory**: Consider GPU memory limitations for very large data  
- **Pipeline**: Keep operations on GPU to avoid transfer overhead
- **Data Type**: Use float32 when possible for better GPU performance
""")
            
            # Recent benchmarks section
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### 📈 Recent Benchmark History")
                    recent_benchmarks_btn = gr.Button("🔍 Show Recent Results")
                    # Hidden initially; show_recent_benchmarks returns an
                    # update that makes it visible.
                    recent_benchmarks = gr.JSON(label="Recent Benchmarks", visible=False)
                    
                    # Quick benchmark buttons for common operations; each is
                    # wired to run_quick_benchmark with its own label.
                    gr.Markdown("### ⚡ Quick Benchmarks")
                    with gr.Row():
                        quick_matrix_btn = gr.Button("Matrix Ops", size="sm")
                        quick_dataframe_btn = gr.Button("DataFrame Ops", size="sm") 
                        quick_ml_btn = gr.Button("ML Algorithms", size="sm")
                        quick_math_btn = gr.Button("Math Functions", size="sm")
        
        # Tutorial Generator tab: a topic box, a Generate button, and a
        # Markdown panel. generate_tutorial_btn is wired to get_tutorial, whose
        # return value is rendered in tutorial_content.
        with gr.Tab("📚 Personalized Tutorials"):
            with gr.Column():
                tutorial_topic = gr.Textbox(
                    label="Tutorial Topic",
                    placeholder="e.g., 'CuPy memory management', 'cuDF vs pandas performance', 'vectorizing loops'...",
                    lines=1
                )
                
                generate_tutorial_btn = gr.Button("📝 Generate Tutorial", variant="primary")
                
                # Placeholder text shown until a tutorial has been generated
                tutorial_content = gr.Markdown(
                    label="Tutorial Content",
                    value="Enter a topic above to generate a personalized tutorial."
                )
        
        # Execution Summary tab (replaces Performance): summary_btn is wired to
        # gpu_mentor.get_execution_summary(), and the result is displayed in
        # the execution_summary JSON viewer.
        with gr.Tab("📈 Code Execution Summary"):
            with gr.Column():
                gr.Markdown("### Your Code Execution History")
                
                summary_btn = gr.Button("📊 View Execution Summary")
                execution_summary = gr.JSON(label="Execution Summary")
    
    
    # Supporting functions for interface
    def get_tutorial(topic):
        """Generate tutorial content for the given topic.

        Args:
            topic: Text from the "Tutorial Topic" box. ``None``, an empty
                string, and a whitespace-only string all count as missing.

        Returns:
            The value returned by ``gpu_mentor.generate_tutorial_content``,
            a prompt asking for a topic when none was entered, or an error
            message string if generation raises.
        """
        # The emptiness check runs outside the try block, so guard against
        # None explicitly instead of letting .strip() raise AttributeError.
        if not topic or not topic.strip():
            return "Please enter a topic to generate a tutorial."

        try:
            return gpu_mentor.generate_tutorial_content(topic)
        except Exception as e:
            return f"Error generating tutorial: {str(e)}"
    
    def clear_chat():
        """Reset the chat: an empty history plus two ``None`` placeholders.

        The three values line up with the three outputs the Clear button is
        wired to (the chatbot and two ``gr.State`` components).
        """
        empty_history = []
        return empty_history, None, None
    
    # Benchmarking functions
    def update_benchmark_options(category):
        """Refresh the benchmark dropdown for a category and reset the size dropdown.

        Args:
            category: Selected benchmark category; a falsy value clears both
                dropdowns.

        Returns:
            A pair of ``gr.Dropdown`` updates: the first for the benchmark
            dropdown (pre-selecting the first benchmark when there is one),
            the second an empty size dropdown.
        """
        if category:
            benchmarks = benchmark_engine.get_benchmarks_for_category(category)
            first_benchmark = benchmarks[0] if benchmarks else None
            name_update = gr.Dropdown(choices=benchmarks, value=first_benchmark)
        else:
            name_update = gr.Dropdown(choices=[], value=None)

        # Sizes depend on the chosen benchmark, so they are always cleared here.
        size_update = gr.Dropdown(choices=[], value=None)
        return name_update, size_update
    
    def update_size_options(category, benchmark_name):
        """Refresh the problem-size dropdown for the selected benchmark.

        Args:
            category: Selected benchmark category.
            benchmark_name: Selected benchmark within that category.

        Returns:
            A ``gr.Dropdown`` update listing the sizes reported by
            ``benchmark_engine.get_benchmark_sizes`` with the first one
            pre-selected, or an empty dropdown when either argument is falsy.
        """
        if category and benchmark_name:
            sizes = benchmark_engine.get_benchmark_sizes(category, benchmark_name)
            first_size = sizes[0] if sizes else None
            return gr.Dropdown(choices=sizes, value=first_size)

        return gr.Dropdown(choices=[], value=None)
    
    def run_selected_benchmark(category, benchmark_name, size):
        """Run the selected benchmark and return enhanced results with visualizations."""
        if not all([category, benchmark_name, size]):
            return "❌ Please select all benchmark parameters", "Please select category, benchmark, and size"
        
        status_msg = f"🏃‍♂️ Running {benchmark_name} benchmark with size {size:,}..."
        
        try:
            results = benchmark_engine.run_benchmark(category, benchmark_name, size)
            
            if not results:
                return "❌ Benchmark failed to execute", "Benchmark execution failed"
            
            # Create comprehensive results with visualizations
            formatted_results = ""
            
            # Add basic benchmark results
            basic_results = benchmark_engine.format_benchmark_results(results)
            formatted_results += basic_results
            
            # Add performance visualization if successful
            if results.get("speedup") and not results.get("error"):
                formatted_results += "\n\n---\n\n"
                formatted_results += perf_visualizer.create_speedup_visualization(results)
                
                # Add educational summary
                formatted_results += "\n\n---\n\n"
                formatted_results += perf_visualizer.create_educational_summary(results)
                
                # Add code examples for this category
                formatted_results += "\n\n---\n\n"
                formatted_results += f"### 💻 Code Examples\n\n"
                
                if "Matrix" in category:
                    example = content_enhancer.get_example_for_operation("matrix_multiplication")
                elif "DataFrame" in category:
                    example = content_enhancer.get_example_for_operation("dataframe_groupby")
                else:
                    example = content_enhancer.get_example_for_operation("element_wise_operations")
                
                if example:
                    formatted_results += f"**CPU Implementation:**\n```python\n{example['cpu_code']}\n```\n\n"
                    formatted_results += f"**GPU Implementation:**\n```python\n{example['gpu_code']}\n```\n\n"
                    formatted_results += f"**Expected Speedup:** {example['expected_speedup']}\n\n"
                    formatted_results += "**Key Optimization Points:**\n"
                    for point in example['key_points']:
                        formatted_results += f"• {point}\n"
            
            # Success message
            if results and results.get("speedup"):
                success_msg = f"✅ Completed! GPU Speedup: {results['speedup']:.2f}x"
                if results['speedup'] > 10:
                    success_msg += " 🔥 Excellent acceleration!"
                elif results['speedup'] > 3:
                    success_msg += " ✅ Good performance!"
                elif results['speedup'] > 1:
                    success_msg += " ⚡ Moderate improvement"
                else:
                    success_msg += " ⚠️ GPU overhead detected"
            else:
                success_msg = "✅ Benchmark completed"
                
            return formatted_results, success_msg
            
        except Exception as e:
            error_msg = f"❌ Benchmark failed: {str(e)}"
            error_details = f"""
## ❌ Benchmark Error

**Error Details:** {str(e)}

**Possible Causes:**
• GPU libraries (CuPy, cuDF, cuML) may not be installed
• Insufficient GPU memory for the selected problem size
• CUDA driver/runtime issues

**Troubleshooting:**
• Try a smaller problem size
• Check GPU memory availability
• Verify RAPIDS installation: `conda list | grep -E "(cupy|cudf|cuml)"`

**Note:** CPU benchmarks should still work even without GPU libraries.
"""
            return error_details, error_msg
    
    def show_recent_benchmarks():
        """Summarize recent benchmark runs for the history viewer.

        Returns:
            A ``(payload, visibility_update)`` pair. ``payload`` is a list with
            one dict per recent result (keys ``benchmark``, ``category``,
            ``size``, ``speedup``, ``winner``), or a single message dict when
            ``benchmark_engine.get_recent_results()`` returns nothing.
            ``visibility_update`` is a ``gr.JSON`` update that makes the
            viewer visible.
        """
        recent = benchmark_engine.get_recent_results()
        if not recent:
            return {"message": "No recent benchmarks"}, gr.JSON(visible=True)

        # Field name -> fallback used when a result lacks that field
        field_defaults = {
            "benchmark": "Unknown",
            "category": "Unknown",
            "size": 0,
            "speedup": "N/A",
            "winner": "N/A",
        }
        summary = [
            {field: result.get(field, fallback) for field, fallback in field_defaults.items()}
            for result in recent
        ]

        return summary, gr.JSON(visible=True)
    
    def run_quick_benchmark(benchmark_type):
        """Run one of the preset benchmarks behind the quick-benchmark buttons.

        Args:
            benchmark_type: Preset label, e.g. ``"Matrix Ops"``.

        Returns:
            The ``(report, status)`` pair from ``run_selected_benchmark`` for
            the preset, or an error pair when the label is not a known preset.
        """
        # Preset label -> (category, benchmark name, problem size)
        quick_benchmarks = {
            "Matrix Ops": ("Matrix Operations", "Matrix Multiplication", 1024),
            "DataFrame Ops": ("DataFrame Operations", "GroupBy Aggregation", 500000),
            "ML Algorithms": ("Machine Learning", "K-Means Clustering", 50000),
            "Math Functions": ("Mathematical Functions", "FFT Computation", 131072)
        }

        preset = quick_benchmarks.get(benchmark_type)
        if preset is None:
            return "❌ Quick benchmark not found", "Error"

        return run_selected_benchmark(*preset)

    # Event handlers
    def load_sample_code(sample_name):
        """Return the sample snippet stored under ``sample_name``.

        Falls back to an empty string when no name is given or the name is
        not a key of ``sample_codes``.
        """
        if not sample_name:
            return ""
        return sample_codes.get(sample_name, "")
    
    # Wire up the interface

    # Sample code: selecting an entry or pressing "Load" both copy the chosen
    # snippet into the chat tab's code editor.
    sample_dropdown.change(load_sample_code, inputs=[sample_dropdown], outputs=[code_input])
    load_sample_btn.click(load_sample_code, inputs=[sample_dropdown], outputs=[code_input])

    # Chat. The two gr.State() outputs are created inline and are not read
    # anywhere else in this cell — presumably they only absorb extra return
    # values of chat_with_mentor; TODO confirm against its definition.
    submit_btn.click(
        chat_with_mentor,
        inputs=[message_input, code_input, chatbot],
        outputs=[message_input, code_input, chatbot, gr.State(), gr.State()]
    )

    clear_btn.click(clear_chat, outputs=[chatbot, gr.State(), gr.State()])

    # Code analysis tab
    analyze_btn.click(
        analyze_code_only,
        inputs=[analyze_code],
        outputs=[analysis_results, optimized_code]
    )

    # Tutorial tab
    generate_tutorial_btn.click(
        get_tutorial,
        inputs=[tutorial_topic],
        outputs=[tutorial_content]
    )

    # Execution summary tab
    summary_btn.click(
        lambda: gpu_mentor.get_execution_summary(),
        outputs=[execution_summary]
    )

    # Benchmarking event handlers: category -> benchmark list -> size list
    benchmark_category.change(
        update_benchmark_options,
        inputs=[benchmark_category],
        outputs=[benchmark_name, benchmark_size]
    )

    benchmark_name.change(
        update_size_options,
        inputs=[benchmark_category, benchmark_name],
        outputs=[benchmark_size]
    )

    # The category dropdown is created with a pre-selected value while the
    # benchmark and size dropdowns start empty, and the .change handlers above
    # only run after a change. Run the same update once on page load so the
    # dependent dropdowns are usable without re-selecting the category.
    demo.load(
        update_benchmark_options,
        inputs=[benchmark_category],
        outputs=[benchmark_name, benchmark_size]
    )

    run_benchmark_btn.click(
        run_selected_benchmark,
        inputs=[benchmark_category, benchmark_name, benchmark_size],
        outputs=[benchmark_results, benchmark_status]
    )

    # show_recent_benchmarks returns (payload, visibility update); both target
    # the same JSON component, hence it is listed twice.
    recent_benchmarks_btn.click(
        show_recent_benchmarks,
        outputs=[recent_benchmarks, recent_benchmarks]
    )

    # Quick benchmark buttons: each runs a fixed preset and writes to the same
    # results/status components as the main Run button.
    quick_matrix_btn.click(
        lambda: run_quick_benchmark("Matrix Ops"),
        outputs=[benchmark_results, benchmark_status]
    )

    quick_dataframe_btn.click(
        lambda: run_quick_benchmark("DataFrame Ops"),
        outputs=[benchmark_results, benchmark_status]
    )

    quick_ml_btn.click(
        lambda: run_quick_benchmark("ML Algorithms"),
        outputs=[benchmark_results, benchmark_status]
    )

    quick_math_btn.click(
        lambda: run_quick_benchmark("Math Functions"),
        outputs=[benchmark_results, benchmark_status]
    )

# Launch the enhanced interface.
# NOTE(review): share=True asks Gradio to create a public share link, and this
# app's Features tab advertises live execution of user-supplied Python code.
# Confirm that public exposure is intended; otherwise use share=False or add
# authentication.
demo.launch(share=True)

## 16. Example Usage & Testing

Let's test the GPU Mentor system with some example interactions:

## 🏁 Testing the Benchmarking System

The new benchmarking feature provides comprehensive CPU vs GPU performance comparisons across multiple domains:

### 🚀 New Features Added:

1. **Interactive Benchmarking Tab** - Compare CPU vs GPU performance across different workloads
2. **Educational Content** - Learn why GPU acceleration works for specific operations
3. **Visual Performance Analysis** - See speedup charts and scaling insights
4. **Code Examples** - Get optimized CPU and GPU implementations
5. **Quick Benchmark Buttons** - Run common benchmarks with one click

### 📊 Available Benchmark Categories:

- **Matrix Operations**: NumPy vs CuPy for linear algebra
- **DataFrame Operations**: Pandas vs cuDF for data processing  
- **Machine Learning**: scikit-learn vs cuML for ML algorithms
- **Mathematical Functions**: NumPy vs CuPy for numerical computing

### 🎯 Key Learning Objectives:

- Understand when GPU acceleration provides significant benefits
- Learn optimization techniques for different GPU libraries
- Experience hands-on performance comparison
- Discover scaling patterns with problem size
- Explore memory and computational trade-offs

### 💻 How to Use:

1. **Launch the Interface**: Run the Gradio interface cell
2. **Navigate to Benchmarking Tab**: Click on "🏁 Performance Benchmarking"
3. **Select Parameters**: Choose category, benchmark, and problem size
4. **Run Benchmark**: Click "Run Benchmark" to see CPU vs GPU comparison
5. **Explore Results**: Review performance analysis, visualizations, and code examples
6. **Try Quick Benchmarks**: Use the quick benchmark buttons for common operations

The system provides detailed explanations of why certain operations benefit from GPU acceleration and offers educational insights based on NVIDIA RAPIDS best practices.

In [None]:
# Smoke-test the enhanced GPU Mentor: one request that carries both a question
# and the code it refers to, followed by a direct check of safe code execution.
sample_numpy_code = """
import numpy as np

# Create large arrays
n = 1000
x = np.random.rand(n)
y = np.random.rand(n)

# Compute distance
result = np.sqrt(x**2 + y**2)
mean_distance = np.mean(result)

print(f"Array size: {n}")
print(f"Mean distance: {mean_distance:.4f}")
"""

print("=== Testing Enhanced GPU Mentor (LLM + Code Integration) ===")

# The question and the code are handed to the mentor in a single call
user_question = "How can I optimize this distance calculation for GPU acceleration?"

try:
    response = gpu_mentor.process_user_input(user_question, sample_numpy_code)

    # Text answer, cut to 300 characters for display
    print("🤖 AI Response (with code context):")
    answer_text = response["text_response"]
    if len(answer_text) > 300:
        print(answer_text[:300] + "...")
    else:
        print(answer_text)

    # Execution details are printed only when the response carries any
    print("\n⚡ Code Execution Results:")
    if response["code_output"]:
        output = response["code_output"]
        print(f"Status: {output.get('status', 'unknown')}")
        print(f"Execution time: {output.get('execution_time', 0):.4f}s")
        print(f"Output: {output.get('stdout', 'No output')}")
        print(f"Variables: {list(output.get('variables', {}).keys())}")

    print("\n🔍 Code Analysis:")
    if response["code_analysis"]:
        analysis = response["code_analysis"]
        print(f"Libraries: {analysis.get('libraries_detected', [])}")
        print(f"Speedup potential: {analysis.get('estimated_speedup', 1.0):.1f}x")

    # Optimized code, cut to 200 characters for display
    print("\n🚀 Optimized Code:")
    if response["optimized_code"]:
        optimized_text = response["optimized_code"]
        if len(optimized_text) > 200:
            print(optimized_text[:200] + "...")
        else:
            print(optimized_text)

    print("\n🤔 Socratic Questions:")
    for i, question in enumerate(response.get("socratic_questions", []), start=1):
        print(f"{i}. {question}")

except Exception as e:
    # Report the failure instead of aborting the notebook run
    print(f"Error testing enhanced mentor: {e}")

print("\n" + "=" * 50)
print("Testing Safe Code Execution...")

# A tiny NumPy snippet run directly through the mentor's execution helper
test_code = """
import numpy as np
data = np.array([1, 2, 3, 4, 5])
result = np.sum(data)
print(f"Sum: {result}")
"""

try:
    # NOTE(review): this calls a leading-underscore (private) method directly
    output = gpu_mentor._execute_code_safely(test_code)
    print(f"Execution status: {output['status']}")
    print(f"Execution time: {output['execution_time']:.4f}s")
    print(f"Output: {output['stdout']}")
    print(f"Variables: {output['variables']}")
except Exception as e:
    print(f"Error in safe execution: {e}")

In [None]:
# Exercise the RAG graph, the code analyzer, and the Socratic-question
# generator individually (no Sol execution and no benchmarking in this demo).
print("\n=== Testing Enhanced GPU Mentor ===")

# Simulated user interaction: a question plus a small NumPy snippet
user_question = "How can I accelerate matrix multiplication with CuPy?"
sample_code = """
import numpy as np
A = np.random.rand(500, 500)
B = np.random.rand(500, 500)
C = np.dot(A, B)
"""

try:
    # RAG response: print the first 200 characters of the last message
    rag_request = {"messages": [{"role": "user", "content": user_question}]}
    rag_result = gpu_mentor.rag_graph.invoke(rag_request)
    last_message = rag_result["messages"][-1]
    print("RAG Response:", last_message.content[:200] + "...")

    # Static analysis of the sample code
    analysis = gpu_mentor.code_optimizer.analyze_code(sample_code)
    print("\nCode Analysis:", analysis)

    # Socratic questions derived from the analysis and the user's question
    questions = gpu_mentor._generate_socratic_questions(analysis, user_question)
    print("\nSocratic Questions:")
    for i, q in enumerate(questions, start=1):
        print(f"{i}. {q}")

except Exception as e:
    # Report the failure instead of aborting the notebook run
    print(f"Error testing GPU Mentor: {e}")