In [None]:
# Step 2: Install vLLM and the other pinned packages with uv
# (these two commands are run in the next two cells)
# !pip install uv
# !uv pip install -r requirements.txt

# Contents of requirements.txt:
# vllm==0.9.1
# llmcompressor==0.5.2
# pandas==2.3.0
# datasets==3.6.0
# lm-eval==0.4.7
# transformers<4.54.0 # https://github.com/vllm-project/vllm-ascend/issues/2046


In [None]:
# Install the uv package installer; the following cell uses it to install requirements.txt.
!pip install uv

In [None]:
# Install the packages listed in requirements.txt through uv's pip-compatible interface.
!uv pip install -r requirements.txt

In [None]:
# Login to OC Cluster
# Replace the placeholder token and server in the command below with your own values:
# !oc login --token=<Enter Token> --server=<Enter Server Detail>
# NOTE(review): avoid saving or committing this notebook with a real token filled in.
!oc login --token=sha256~xxxxxx --server=https://api.cluster-xxxxx

In [None]:
# Validate the environment
import subprocess
import sys
import pkg_resources
from packaging import version
import os

def check_package_version(package_name, expected_version):
    """Check that a package is installed at or above a minimum version.

    The installed version is looked up with the standard-library
    ``importlib.metadata`` rather than the deprecated ``pkg_resources`` API.

    Args:
        package_name: Distribution name as known to the installer (e.g. ``"vllm"``).
        expected_version: Minimum acceptable version string (e.g. ``"0.9.1"``).

    Returns:
        A ``(is_valid, installed_version)`` tuple. ``is_valid`` is True when the
        installed version is greater than or equal to ``expected_version``.
        When the package is not installed the tuple is ``(False, "Not installed")``.
    """
    # Imported locally and under an alias so it cannot clash with the
    # module-level ``version`` name imported from ``packaging``.
    from importlib import metadata as importlib_metadata

    try:
        installed_version = importlib_metadata.version(package_name)
    except importlib_metadata.PackageNotFoundError:
        return False, "Not installed"

    # Compare parsed versions, not raw strings, so "0.10.0" ranks above "0.9.1".
    meets_minimum = version.parse(installed_version) >= version.parse(expected_version)
    return meets_minimum, installed_version

def check_llama_serving_status():
    """Query the pods of the llama-serving namespace through the ``oc`` CLI.

    Returns:
        A ``(ok, info)`` tuple:
        * ``(True, str)`` when the query succeeds and prints no pods,
        * ``(True, list[str])`` with one ``"name: ready status"`` entry per pod
          row when pods are listed,
        * ``(False, str)`` describing the problem when the command exits
          non-zero, times out after 30 seconds, or raises any other exception.
    """
    oc_command = ["oc", "get", "pods", "-n", "llama-serving", "--no-headers"]
    try:
        completed = subprocess.run(
            oc_command,
            capture_output=True,
            text=True,
            timeout=30
        )

        # Guard clause: a non-zero exit code means the namespace could not be read.
        if completed.returncode != 0:
            return False, f"Error accessing namespace: {completed.stderr}"

        listing = completed.stdout.strip()
        if not listing:
            return True, "No pods found - default vLLM inference serving is disabled (optimal for labs)"

        # Keep only rows with at least the NAME, READY and STATUS columns.
        split_rows = (row.split() for row in listing.split('\n'))
        summaries = [
            f"{fields[0]}: {fields[1]} {fields[2]}"
            for fields in split_rows
            if len(fields) >= 3
        ]
        return True, summaries
    except subprocess.TimeoutExpired:
        return False, "Timeout accessing llama-serving namespace"
    except Exception as exc:
        return False, f"Error: {exc!s}"

def validate_environment():
    """Run every environment check and print a readable report.

    Verifies the minimum versions of vllm and llmcompressor, then reports
    whether the llama-serving namespace can be queried.

    Returns:
        True when every check passed, False when at least one failed.
    """
    banner = "=" * 50
    print("üîç Environment Validation Report")
    print(banner)

    # Minimum versions this notebook expects.
    minimum_versions = {
        "vllm": "0.9.1",
        "llmcompressor": "0.5.2"
    }

    any_failure = False

    for name, minimum in minimum_versions.items():
        ok, found = check_package_version(name, minimum)
        if not ok:
            print(f"‚ùå {name}: {found} (expected >= {minimum})")
            any_failure = True
            continue
        print(f"‚úÖ {name}: {found} (>= {minimum})")

    print("\nüîç llama-serving Status:")
    print("-" * 30)

    reachable, details = check_llama_serving_status()

    if not reachable:
        print(f"‚ùå llama-serving namespace: {details}")
        any_failure = True
    else:
        print("‚úÖ llama-serving namespace is accessible")
        # A list holds one entry per pod; anything else is a single message.
        if isinstance(details, list):
            for entry in details:
                print(f"  üì¶ {entry}")
        else:
            print(f"  ‚ÑπÔ∏è  {details}")

    print("\n" + banner)
    if any_failure:
        print("‚ö†Ô∏è  Some validation checks failed. Please review the issues above.")
    else:
        print("üéâ All validation checks passed! Environment is ready.")

    return not any_failure

# Run the validation; the returned boolean (True when every check passed)
# is displayed as the cell result.
validate_environment()

In [None]:
# Status Check
def quick_status_check():
    """Print a condensed readiness summary and return the overall result.

    Reports the installed vllm and llmcompressor versions and whether the
    llama-serving namespace can be queried. The llama-serving line only states
    that default inference serving is disabled when the namespace really has
    no pods; when pods were found it reports how many instead.

    Returns:
        True when both package checks pass and the namespace is accessible,
        False otherwise.
    """
    print("üöÄ Quick Environment Status Check")
    print("=" * 35)

    # Check vllm
    vllm_ok, vllm_ver = check_package_version("vllm", "0.9.1")
    print(f"{'‚úÖ' if vllm_ok else '‚ùå'} vllm: {vllm_ver}")

    # Check llmcompressor
    llmc_ok, llmc_ver = check_package_version("llmcompressor", "0.5.2")
    print(f"{'‚úÖ' if llmc_ok else '‚ùå'} llmcompressor: {llmc_ver}")

    # Check llama-serving. The second element is a list of pod summaries when
    # pods exist and a plain message otherwise, so it decides what to report.
    llama_ok, llama_info = check_llama_serving_status()
    if not llama_ok:
        llama_msg = 'issue detected'
    elif isinstance(llama_info, list):
        # Pods were found, so claiming that serving is disabled would be wrong.
        llama_msg = f"accessible, {len(llama_info)} pod(s) found - run full validation above for details"
    else:
        llama_msg = 'accessible, default vLLM inference serving is disabled (optimal for labs)'
    print(f"{'‚úÖ' if llama_ok else '‚ùå'} llama-serving: {llama_msg}")

    # Overall status
    all_good = vllm_ok and llmc_ok and llama_ok
    print(f"\n{'üéâ' if all_good else '‚ö†Ô∏è'} Overall: {'All systems ready!' if all_good else 'Issues found - run full validation above'}")

    return all_good

# Run quick check; the returned boolean (True when all three checks passed)
# is displayed as the cell result.
quick_status_check()

In [None]:
# Test the model is running
# Posts a small completion request (one prompt, at most 50 tokens) to the
# /v1/completions endpoint on localhost:8000 and pretty-prints the JSON reply.
!curl -X POST -H "Content-Type: application/json" -d '{ \
    "prompt": "What is the capital of France?", \
    "max_tokens": 50 \
}' http://localhost:8000/v1/completions | python -m json.tool

In [None]:
# Download Benchmarking Tool
# The benchmark script used later (vllm/benchmarks/benchmark_serving.py) lives in
# the vLLM source repository, so clone it into the working directory.
!git clone https://github.com/vllm-project/vllm.git

In [None]:
# Check out the v0.9.1 tag of the cloned repository.
# cd and checkout are chained in one command because each "!" line runs in its own subshell.
!cd vllm && git checkout v0.9.1

In [None]:
# Enter token for Hugging Face
# The token is taken from an already-set HF_TOKEN environment variable or typed
# at a hidden prompt, so it is neither stored in the notebook source nor echoed
# into the saved cell output (``%env NAME=value`` prints the value it sets).
import getpass
import os

if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass.getpass("Hugging Face token (input hidden): ")

In [None]:
# Run Benchmarking tool
!python vllm/benchmarks/benchmark_serving.py \
--backend vllm --model RedHatAI/Llama-3.2-1B-Instruct-FP8 \
--num-prompts 100 --dataset-name random  --random-input 200 --random-output 200 --port 8000

In [None]:
# Display Nvidia Benchmarking Performance
# Embeds NVIDIA's published NIM benchmark page in a 600x600 frame below the cell.
from IPython.display import IFrame

NVIDIA_NIM_BENCHMARK_URL = "https://docs.nvidia.com/nim/benchmarking/llm/latest/performance.html#llama-3-1-8b-instruct-results"

IFrame(src=NVIDIA_NIM_BENCHMARK_URL, width=600, height=600)

In [None]:
# Inspect what you are running on the OpenShift cluster:
# switch to the ai-roadshow project, then list its pods.
!oc project ai-roadshow
!oc get pods

In [None]:
# Run the nvidia-smi utility in the Pod to retrieve the details of the NVIDIA GPU
# (the command is executed inside the pod named vllm-0).
!oc exec vllm-0 -- nvidia-smi