<a href="https://colab.research.google.com/github/look4pritam/LargeLanguageModels/blob/master/Notebooks/vLLM/vLLM-Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# vLLM with Goolge Colab

### Install Python 'vllm' 'transformers' and 'accelerate' packages.

In [1]:
!pip install vllm transformers accelerate

Collecting vllm
  Downloading vllm-0.8.2-cp38-abi3-manylinux1_x86_64.whl.metadata (27 kB)
Collecting numpy<2.0.0 (from vllm)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting blake3 (from vllm)
  Downloading blake3-1.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting fastapi>=0.115.0 (from fastapi[standard]>=0.115.0->vllm)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm)
  Downloading prometheus_fastapi_instrumentator-7.1.0-py3-none-any.whl.metadata (13 kB)
Collecting tiktoken>=0.6.0 (from vllm)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting lm-format-enforcer<0.11,>=0.10.11 (from vllm)
  Downloading lm_format_enforce

### Download 'unsloth/Llama-3.2-3B-Instruct' model.

In [1]:
from huggingface_hub import snapshot_download

model_id = "unsloth/Llama-3.2-3B-Instruct"
local_dir = "/content/Llama-3.2-3B-Instruct"
snapshot_download(repo_id=model_id, local_dir=local_dir)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/945 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.74k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

'/content/Llama-3.2-3B-Instruct'

In [2]:
!vllm serve --help

2025-04-06 02:09:13.703395: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743905353.985458    2249 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743905354.062291    2249 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-06 02:09:14.660507: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
INFO 04-06 02:09:20 [__init__.py:239] Automatically detected platform cuda.
usage: vllm serve <model_tag> [options]



In [None]:
!vllm serve '/content/Llama-3.2-3B-Instruct'\
  --dtype half \
  --gpu-memory-utilization 0.95 \
  --max-model-len 16384 \
  --host 0.0.0.0 \
  --port 8000 \
  --trust-remote-code

### Run vLLM server.

In [3]:
import subprocess
import time
import os

In [4]:
vllm_process = subprocess.Popen([
    'vllm',
    'serve',
    'unsloth/Llama-3.2-1B-Instruct',
    '--trust-remote-code',
    '--dtype', 'half',
    '--max-model-len', '16384',
    '--host', '0.0.0.0',
    '--port', '8000',
    '--gpu-memory-utilization', '0.70'
], stdout=subprocess.PIPE, stderr=subprocess.PIPE, start_new_session=True)

### Check vLLM server status.

In [5]:
import requests

In [6]:
def check_vllm_status():
    try:
        response = requests.get("http://localhost:8000/health")
        if response.status_code == 200:
            print("vllm server is running")
            return True
    except requests.exceptions.ConnectionError:
        print("vllm server is not running")
        return False

In [8]:
try:
    # Monitor the process
    while True:
        if check_vllm_status() == True:
            print("The vllm server is ready to serve.")
            break
        else:
            print("The vllm server has stopped.")
            stdout, stderr = vllm_process.communicate(timeout=200)
            print(f"STDOUT: {stdout.decode('utf-8')}")
            print(f"STDERR: {stderr.decode('utf-8')}")
            break
        time.sleep(5)  # Check every second
except KeyboardInterrupt:
    print("Stopping the check of vllm...")

vllm server is not running
The vllm server has stopped.


TimeoutExpired: Command '['vllm', 'serve', 'unsloth/Llama-3.2-1B-Instruct', '--trust-remote-code', '--dtype', 'half', '--max-model-len', '16384', '--host', '0.0.0.0', '--port', '8000', '--gpu-memory-utilization', '0.70']' timed out after 200 seconds

### Run a simple client.

In [9]:
import requests
import json
from pydantic import BaseModel
import requests

In [10]:
class QuestionRequest(BaseModel):
    question: str
    model: str = "unsloth/Llama-3.2-1B-Instruct"

In [11]:
def ask_model(question: str, model: str):
    """
    Sends a request to the model server and fetches a response.
    """
    url = "http://localhost:8000/v1/chat/completions"
    headers = {"Content-Type": "application/json"}
    data = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": question
            }
        ]
    }

    response = requests.post(url, headers=headers, json=data)
    response.raise_for_status()  # Raise exception for HTTP errors
    return response.json()

In [12]:
result = ask_model("What is the capital of India?", "unsloth/Llama-3.2-1B-Instruct")
print(json.dumps(result, indent=2))

{
  "id": "chatcmpl-077a4c265abd411aa096185cbd982999",
  "object": "chat.completion",
  "created": 1743905657,
  "model": "unsloth/Llama-3.2-1B-Instruct",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "reasoning_content": null,
        "content": "The capital of India is New Delhi.",
        "tool_calls": []
      },
      "logprobs": null,
      "finish_reason": "stop",
      "stop_reason": null
    }
  ],
  "usage": {
    "prompt_tokens": 42,
    "total_tokens": 51,
    "completion_tokens": 9,
    "prompt_tokens_details": null
  },
  "prompt_logprobs": null
}
