<a href="https://colab.research.google.com/github/orca3/llm-model-serving/blob/main/ch02/ch2_Batching.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --quiet vllm transformers tiktoken

In [None]:
import torch
import gc
import time

# Unload models and clean up gpu memory cache
def free_gpu(model):
    """Best-effort release of GPU memory once a model is no longer needed.

    Args:
        model: The object being discarded, or ``None``. Only this function's
            local reference is dropped here; the caller must also delete or
            rebind its own variable (e.g. ``del llm``) before the object can
            actually be reclaimed.

    Returns:
        None.
    """
    # Drop the local reference without truth-testing the argument: `if model:`
    # raises for objects with an ambiguous truth value (e.g. a multi-element
    # tensor) and is False for empty container-like modules.
    del model

    # Collect garbage *before* emptying the CUDA cache: empty_cache() only
    # returns cached blocks that are no longer occupied, so unreachable
    # objects have to be collected first for their memory to be released.
    gc.collect()

    if torch.cuda.is_available():
        # Hand unoccupied cached blocks back to the device.
        torch.cuda.empty_cache()
        # Restart peak-memory tracking so later measurements start fresh.
        torch.cuda.reset_peak_memory_stats()

free_gpu(None)

In [None]:
# Define the prompt: a long, multi-paragraph instruction asking for a formal
# book chapter on the history of AI, kept as one triple-quoted string.
# NOTE(review): `prompt` is not referenced by any other cell visible in this
# notebook section — confirm it is used elsewhere or remove it.
prompt = """You are an expert AI historian writing a detailed chapter for a book titled "The Evolution of Human-AI Collaboration."

Begin by summarizing the early stages of artificial intelligence in the 1950s, touching on symbolic logic and rule-based systems. Then transition into the rise of machine learning, particularly deep learning in the 2010s.

Afterward, describe how large language models like GPT transformed human-computer interaction, enabling applications in education, creative writing, customer support, and software development.

Finally, reflect on the societal and ethical implications of AI, such as misinformation, bias, and the alignment problem.

Write in a formal tone, with rich detail and examples in each era."""

In [None]:
import time
from vllm import LLM, SamplingParams

# Create the vLLM engine for the Qwen2.5 0.5B checkpoint and bind it to `llm`.
# NOTE(review): trust_remote_code=True permits running code shipped with the
# model repository — confirm this checkpoint actually requires it.
llm = LLM(
    "Qwen/Qwen2.5-0.5B",
    max_model_len=2048,
    trust_remote_code=True,
    dtype="float16",
)


In [None]:
import torch
import gc
import time
from vllm import LLM, SamplingParams
from transformers import pipeline

# Prompts for batch generation, 4 input sequences
prompts = [
    "What is the meaning of life?",
    "Write a short story about a robot learning to love.",
    "Explain quantum physics in simple terms.",
    "Translate 'Hello, world!' into Spanish."
]

sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    max_tokens=100
)


def _timed_generate(engine, batch, params):
    """Run ``engine.generate(batch, params)`` and time it.

    Args:
        engine: Object exposing ``generate(prompts, sampling_params)``.
        batch: List of prompt strings submitted in a single call.
        params: Sampling parameters forwarded unchanged to ``generate``.

    Returns:
        Tuple ``(outputs, elapsed_seconds)``.
    """
    # perf_counter() is the clock intended for measuring short intervals;
    # time.time() follows the wall clock and can jump if it is adjusted.
    started = time.perf_counter()
    outputs = engine.generate(batch, params)
    return outputs, time.perf_counter() - started


# Untimed warm-up call so that any one-time first-call overhead is not charged
# to the first measurement (which would bias the batch-vs-single comparison).
llm.generate([prompts[0]], sampling_params)

# process all input sequences together in one batch
batch_outputs, batch_time = _timed_generate(llm, prompts, sampling_params)

print(f"\nvLLM generation time for {len(prompts)} prompts in a batch: {batch_time:.4f} seconds")

# process the first input sequence only
single_outputs, single_time = _timed_generate(llm, [prompts[0]], sampling_params)

print(f"\nvLLM generation time for 1 prompts: {single_time:.4f} seconds")

# Keep the names the original cell left behind (they held the single-prompt
# run) so any later cell that reads them keeps working.
vllm_outputs, vllm_time = single_outputs, single_time

