## Step 1: Verify Dual GPU Environment

In [None]:
import subprocess
import os

print("="*70)
print("üîç SPLIT-GPU ENVIRONMENT CHECK")
print("="*70)

# Ask nvidia-smi for one CSV row per GPU. A missing binary or a non-zero exit
# status is treated as "no GPUs" so the cell reports that instead of crashing
# or counting an error message as a device.
try:
    result = subprocess.run(
        ["nvidia-smi", "--query-gpu=index,name,memory.total,memory.free", "--format=csv,noheader"],
        capture_output=True, text=True
    )
    smi_output = result.stdout if result.returncode == 0 else ""
except FileNotFoundError:
    smi_output = ""

# Keep only non-blank rows: splitting empty output would otherwise produce a
# single empty string and be reported as one detected GPU.
gpus = [row.strip() for row in smi_output.splitlines() if row.strip()]
print(f"\nüìä Detected {len(gpus)} GPU(s):")
for gpu in gpus:
    print(f"   {gpu}")

if len(gpus) >= 2:
    print("\n‚úÖ Dual T4 ready for split-GPU operation!")
    print("   GPU 0 ‚Üí llama-server (LLM)")
    print("   GPU 1 ‚Üí RAPIDS/Graphistry")
else:
    print("\n‚ö†Ô∏è Need 2 GPUs for split operation")

## Step 2: Install Dependencies

In [None]:
%%time
# Cell purpose: pip-install llcuda, cuGraph and Graphistry, then import each
# package and print its version so a failed install is visible immediately.
print("üì¶ Installing dependencies...")

# Install llcuda v2.2.0 (force fresh install to ensure correct binaries)
!pip install -q --no-cache-dir --force-reinstall git+https://github.com/llcuda/llcuda.git@v2.2.0

# Install cuGraph (matching Kaggle RAPIDS 25.6.0)
!pip install -q --extra-index-url=https://pypi.nvidia.com "cugraph-cu12==25.6.*"

# Install Graphistry
# NOTE(review): unlike the two installs above, this one is not version-pinned.
!pip install -q graphistry

# Verify installations
# llcuda is imported without a guard, so a missing llcuda stops the cell here.
import llcuda
print(f"\n‚úÖ llcuda {llcuda.__version__} installed")

# RAPIDS is treated as optional: an ImportError is printed, not raised.
# NOTE(review): this imports cudf before Step 6 sets CUDA_VISIBLE_DEVICES; if the
# import initializes CUDA, the later device mask may have no effect — confirm.
try:
    import cudf
    import cugraph
    print(f"‚úÖ cuDF {cudf.__version__}")
    print(f"‚úÖ cuGraph {cugraph.__version__}")
except ImportError as e:
    print(f"‚ö†Ô∏è RAPIDS: {e}")

# Graphistry is likewise optional at this point; a failed import is only reported.
try:
    import graphistry
    print(f"‚úÖ Graphistry {graphistry.__version__}")
except ImportError as e:
    print(f"‚ö†Ô∏è Graphistry: {e}")

## Step 3: Download GGUF Model

In [None]:
%%time
from huggingface_hub import hf_hub_download
import os

# Download a model that fits on single GPU (leaving GPU 1 free)
MODEL_REPO = "unsloth/gemma-3-4b-it-GGUF"
MODEL_FILE = "gemma-3-4b-it-Q4_K_M.gguf"

print(f"üì• Downloading {MODEL_FILE}...")
print(f"   This will run on GPU 0 only.")

model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
    local_dir="/kaggle/working/models"
)

size_gb = os.path.getsize(model_path) / (1024**3)
print(f"\n‚úÖ Model downloaded: {model_path}")
print(f"   Size: {size_gb:.2f} GB")

## Step 4: Start llama-server on GPU 0 Only

In [None]:
from llcuda.server import ServerManager

print("=" * 70, "üöÄ STARTING LLAMA-SERVER ON GPU 0", "=" * 70, sep="\n")

# Announce the intended split: everything on GPU 0, nothing on GPU 1 (kept for RAPIDS).
print("\nüìã Configuration:")
print("   GPU 0: 100% (llama-server)")
print("   GPU 1: 0% (reserved for RAPIDS)")

# All launch options are collected in one mapping and passed as keyword arguments.
server_options = {
    "model_path": model_path,
    "host": "127.0.0.1",
    "port": 8080,
    # GPU 0 only configuration
    "gpu_layers": 99,
    "tensor_split": "1.0,0.0",  # 100% on GPU 0, 0% on GPU 1
    # Optimize for single GPU
    "ctx_size": 4096,
    "flash_attention": True,
}

server = ServerManager()
server.start_server(**server_options)

# Report the outcome of the server's own health check.
server_is_healthy = server.check_server_health()
if not server_is_healthy:
    print("\n‚ùå Server failed to start")
else:
    print("\n‚úÖ llama-server running on GPU 0!")

## Step 5: Verify GPU Split

In [None]:
print("="*70)
print("üìä GPU MEMORY SPLIT VERIFICATION")
print("="*70)

!nvidia-smi --query-gpu=index,name,memory.used,memory.free --format=csv

import subprocess
result = subprocess.run(
    ["nvidia-smi", "--query-gpu=index,memory.free", "--format=csv,noheader,nounits"],
    capture_output=True, text=True
)

lines = result.stdout.strip().split('\n')
if len(lines) >= 2:
    gpu1_free = int(lines[1].split(',')[1].strip())
    print(f"\n‚úÖ GPU 1 has {gpu1_free} MiB free for RAPIDS!")

## Step 6: Initialize RAPIDS on GPU 1

In [None]:
import os

# Ask CUDA to expose only device 1 to this process. The variable is read when
# the CUDA runtime initializes, so it is honored only if nothing in this kernel
# has initialized CUDA yet.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

print("="*70)
print("üî• INITIALIZING RAPIDS ON GPU 1")
print("="*70)

import cudf
import cupy as cp

# Step 2 already imported cudf, which may have initialized CUDA with both GPUs
# visible; in that case the mask above is ignored. Check how many devices this
# process actually sees instead of assuming the filtered view.
visible_gpu_count = cp.cuda.runtime.getDeviceCount()
mask_applied = visible_gpu_count == 1

# Filtered view: the single visible device (index 0) is CUDA device 1.
# Unfiltered view: device 1 keeps index 1 and has to be selected explicitly.
device = cp.cuda.Device(0 if mask_applied else 1)
device.use()  # make it the current device for the allocations below

# Read the device name from the runtime rather than hard-coding it.
device_name = cp.cuda.runtime.getDeviceProperties(device.id)["name"]
if isinstance(device_name, bytes):
    device_name = device_name.decode()

# Verify we're on the right GPU
print(f"\nüìä RAPIDS GPU Info:")
if mask_applied:
    print(f"   Device: {device.id} (filtered view)")
else:
    print(f"   Device: {device.id} (selected explicitly; CUDA_VISIBLE_DEVICES was set after CUDA initialized)")
print(f"   Actual GPU: 1 ({device_name})")

# Test cuDF on GPU 1
test_df = cudf.DataFrame({
    'source': [0, 1, 2, 3, 4],
    'target': [1, 2, 3, 4, 0],
    'weight': [1.0, 2.0, 1.5, 0.5, 3.0]
})

print(f"\n‚úÖ cuDF working on GPU 1")
print(f"   Test DataFrame: {test_df.shape}")

## Step 7: Create Sample Graph Data

In [None]:
import cudf
import cugraph

print("=" * 70, "üìä CREATING SAMPLE GRAPH ON GPU 1", "=" * 70, sep="\n")

# Sample social network, written as (source, target) pairs.
edge_pairs = [
    (0, 1), (0, 2), (0, 3), (1, 2),
    (1, 4), (2, 3), (2, 5), (3, 4),
    (3, 6), (4, 5), (5, 6), (5, 7),
    (6, 7), (7, 8), (8, 9), (9, 0),
]
edges = cudf.DataFrame({
    'source': [src for src, _ in edge_pairs],
    'target': [dst for _, dst in edge_pairs],
})

# Node labels, keyed by the integer vertex id used in the edge list.
node_names = dict(enumerate([
    "Alice", "Bob", "Charlie", "Diana", "Eve",
    "Frank", "Grace", "Henry", "Ivy", "Jack",
]))

print(f"\nüìä Graph created:")
print(f"   Nodes: {len(node_names)}")
print(f"   Edges: {len(edges)}")

# Build the cuGraph graph from the cuDF edge list.
G = cugraph.Graph()
G.from_cudf_edgelist(edges, source='source', destination='target')

print(f"\n‚úÖ cuGraph graph created on GPU 1")

## Step 8: Run Graph Analytics on GPU 1

In [None]:
def _print_top_scores(scores, column, limit=5):
    """Print the first `limit` rows of `scores` as indented "name: score" lines.

    `scores` must offer `to_pandas()` and carry a 'vertex' column plus `column`.
    Vertex ids without an entry in `node_names` are shown as "Node <id>".
    """
    leading_rows = scores.to_pandas().head(limit)
    for _, row in leading_rows.iterrows():
        node_id = int(row['vertex'])
        label = node_names.get(node_id, f"Node {node_id}")
        print(f"   {label}: {row[column]:.4f}")


print("=" * 70, "üî¨ GPU-ACCELERATED GRAPH ANALYTICS", "=" * 70, sep="\n")

# PageRank, highest score first
print("\nüìä PageRank Analysis:")
pagerank = cugraph.pagerank(G).sort_values('pagerank', ascending=False)
_print_top_scores(pagerank, 'pagerank')

# Betweenness centrality, highest score first
print("\nüìä Betweenness Centrality:")
bc = cugraph.betweenness_centrality(G).sort_values('betweenness_centrality', ascending=False)
_print_top_scores(bc, 'betweenness_centrality')

print("\n‚úÖ Graph analytics computed on GPU 1")

## Step 9: Use LLM to Analyze Graph Results

In [None]:
import os

# Clear the device mask set in Step 6. llama-server is reached over HTTP below,
# so the request itself does not depend on this variable. pop() with a default
# keeps the cell re-runnable when the variable is already absent.
os.environ.pop("CUDA_VISIBLE_DEVICES", None)

from llcuda.api.client import LlamaCppClient

print("="*70)
print("ü§ñ LLM ANALYSIS OF GRAPH RESULTS")
print("="*70)

# Get top PageRank nodes (pagerank is already sorted highest-first)
top_nodes = pagerank.to_pandas().head(3)
top_names = [node_names[int(row['vertex'])] for _, row in top_nodes.iterrows()]

# Create prompt for LLM; the node count comes from node_names so the text
# stays correct if the sample graph changes.
prompt = f"""I have a social network graph with {len(node_names)} people. 
The PageRank analysis shows the most influential people are: {', '.join(top_names)}.
The betweenness centrality shows who are the key connectors in the network.

Based on this, what insights can you provide about the network structure? 
Keep your response to 3-4 sentences."""

# Talk to the llama-server started earlier on the local port.
client = LlamaCppClient(base_url="http://127.0.0.1:8080")

response = client.chat_completion(
    messages=[{"role": "user", "content": prompt}],
    max_tokens=200,
    temperature=0.7
)

print(f"\nüìù LLM Analysis (GPU 0):")
print(response.choices[0].message.content)

print("\n‚úÖ Simultaneous GPU operation:")
print("   GPU 0: LLM inference")
print("   GPU 1: Graph analytics (previously computed)")

## Step 10: Graphistry Visualization Setup

In [None]:
import graphistry
import pandas as pd

print("=" * 70, "üìä GRAPHISTRY VISUALIZATION", "=" * 70, sep="\n")

# Edge table in pandas (works in-notebook), with readable endpoint names attached.
edges_pd = edges.to_pandas().assign(
    source_name=lambda frame: frame['source'].map(node_names),
    target_name=lambda frame: frame['target'].map(node_names),
)

# Metric tables in pandas, ready to be joined onto the node list.
pagerank_pd = pagerank.to_pandas()
bc_pd = bc.to_pandas()

# Node table: one row per named node, joined with both metrics on node_id.
vertex_as_node_id = {'vertex': 'node_id'}
nodes_pd = (
    pd.DataFrame({
        'node_id': list(node_names),
        'name': list(node_names.values()),
    })
    .merge(pagerank_pd.rename(columns=vertex_as_node_id), on='node_id')
    .merge(bc_pd.rename(columns=vertex_as_node_id), on='node_id')
)

print(f"\nüìä Prepared for visualization:")
print(f"   Nodes: {len(nodes_pd)}")
print(f"   Edges: {len(edges_pd)}")

# Note: Graphistry requires registration for full visualization,
# so for demo purposes only the prepared data is shown.
print(f"\nüìã Node Metrics:")
metric_columns = ['name', 'pagerank', 'betweenness_centrality']
print(nodes_pd[metric_columns].to_string(index=False))

## Step 11: Interactive LLM + Graph Workflow

In [None]:
# Section banner: a 70-character rule above and below the title.
print("=" * 70, "üîÑ INTERACTIVE LLM + GRAPH WORKFLOW", "=" * 70, sep="\n")

def analyze_node(node_name):
    """Use LLM to analyze a specific node's network position.

    Looks up ``node_name`` in the module-level ``nodes_pd`` table, builds a
    prompt from that row's PageRank and betweenness-centrality scores, and
    sends it through the module-level ``client``.

    Args:
        node_name: Value to match against the ``name`` column of ``nodes_pd``.

    Returns:
        The message content of the first choice in the chat-completion response.

    Raises:
        IndexError: If no row of ``nodes_pd`` has that name.
    """
    matches = nodes_pd[nodes_pd['name'] == node_name]
    if matches.empty:
        # Same exception type that .iloc[0] raises on an empty frame, but with a
        # message that names the missing node.
        raise IndexError(f"No node named {node_name!r} in nodes_pd")
    node_data = matches.iloc[0]
    
    # Betweenness is described as bridging on shortest paths; describing it as
    # a connection count would hand the model a definition of degree instead.
    prompt = f"""Analyze the network position of {node_name}:
    - PageRank score: {node_data['pagerank']:.4f} (higher = more influential)
    - Betweenness centrality: {node_data['betweenness_centrality']:.4f} (higher = more often a bridge on shortest paths between other nodes)
    
    What does this tell us about {node_name}'s role in the network? Answer in 2 sentences."""
    
    response = client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=100,
        temperature=0.7
    )
    
    return response.choices[0].message.content

# Analyze the three nodes with the highest PageRank. The names are read from
# nodes_pd rather than hard-coded, so the loop follows the computed ranking.
print("\nüîç Node Analysis:")
for name in nodes_pd.nlargest(3, 'pagerank')['name']:
    print(f"\nüìå {name}:")
    analysis = analyze_node(name)
    print(f"   {analysis}")

## Step 12: Monitor Both GPUs

In [None]:
print("="*70)
print("üìä DUAL GPU MONITORING")
print("="*70)

!nvidia-smi

print("\nüí° Split-GPU Operation:")
print("   GPU 0: llama-server (GGUF model loaded)")
print("   GPU 1: RAPIDS memory (cuDF/cuGraph data structures)")

## Step 13: Cleanup

In [None]:
print("üõë Stopping llama-server...")
server.stop_server()

# Clear RAPIDS memory
import gc
del G, edges, pagerank, bc
gc.collect()

print("\n‚úÖ Resources cleaned up")
print("\nüìä Final GPU Status:")
!nvidia-smi --query-gpu=index,memory.used,memory.free --format=csv

## 📚 Summary

### Split-GPU Architecture:
- **GPU 0**: llama-server with `tensor_split=[1.0, 0.0]`
- **GPU 1**: RAPIDS/cuGraph via `CUDA_VISIBLE_DEVICES="1"`

### Key Integration Points:
1. ✅ LLM for natural language analysis
2. ✅ cuGraph for GPU-accelerated graph algorithms
3. ✅ Graphistry for visualization
4. ✅ Combined insights from both

### Code Pattern:
```python
# GPU 0: llama-server
config = ServerConfig(tensor_split=[1.0, 0.0], main_gpu=0)

# GPU 1: RAPIDS
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import cudf, cugraph  # Uses GPU 1
```

---

**Next:** [07-openai-api-client](07-openai-api-client-llcuda-v2.2.0.ipynb)