In [1]:
import torch
# Environment sanity check: report the installed PyTorch build and whether a
# CUDA device is usable before running any GPU benchmark.
print(torch.__version__)  # the recorded run shows a +cu128 build
print(torch.cuda.is_available())  # True when a CUDA GPU can be used
# get_device_name(0) raises on a machine without CUDA, so only query the
# device name when a device is actually available instead of crashing here.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # the recorded run shows an RTX 5080
else:
    print("No CUDA device available")

2.8.0.dev20250503+cu128
True
NVIDIA GeForce RTX 5080 Laptop GPU


# ⚡ Chain of Thought (CoT) — CPU vs GPU Speed Test

This notebook compares inference time on CPU and on GPU for the same CoT-style prompt, using the `google/flan-t5-small` model.

Make sure you have a CUDA-compatible GPU and PyTorch with GPU support installed.

In [2]:
from transformers import pipeline
import torch
import time

# Select the pipeline device index: 0 is the first CUDA GPU, -1 is the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = -1
print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"Using device: {'GPU' if device == 0 else 'CPU'}")

# Build the text2text-generation pipeline for flan-t5-small on that device.
generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    device=device,
)


CUDA Available: True
Using device: GPU


Device set to use cuda:0


## 🧪 CoT Prompt Test
We'll test inference time with the same prompt on both CPU and GPU.

In [3]:
# Short arithmetic CoT prompt. Note: a later cell reassigns `prompt_cot`
# before any timed run, so this value is not the one that gets benchmarked.
prompt_cot = (
    "If you have 5 cars and each car has 4 wheels, how many wheels total?\n"
    "\n"
    "Let's think step by step."
)


In [6]:
# Open-ended CoT prompt used by the CPU and GPU timing cells below.
prompt_cot = (
    "Explain in detail how the French Revolution influenced modern "
    "democratic systems. Let's think step by step."
)


In [7]:
# Run on CPU
# device=-1 places the pipeline on the CPU.
generator_cpu = pipeline("text2text-generation", model="google/flan-t5-small", device=-1)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() is wall-clock time and can jump if the system
# clock is adjusted mid-run.
# NOTE: this is a single cold run, so any first-call overhead is included.
start_cpu = time.perf_counter()
output_cpu = generator_cpu(prompt_cot, max_new_tokens=300)[0]["generated_text"]
end_cpu = time.perf_counter()

print("🧠 CPU Output:")
print(output_cpu)
print(f"⏱ CPU Inference Time: {end_cpu - start_cpu:.2f} seconds")


Device set to use cpu


🧠 CPU Output:
The French Revolution influenced modern democratic systems. The French Revolution influenced modern democratic systems. The answer: yes.
⏱ CPU Inference Time: 0.52 seconds


In [8]:
# Run on GPU (if available)
if torch.cuda.is_available():
    # device=0 places the pipeline on the first CUDA device.
    generator_gpu = pipeline("text2text-generation", model="google/flan-t5-small", device=0)

    # CUDA work is queued asynchronously; drain anything still pending (such as
    # the weight transfer) so it is not billed to the inference measurement.
    torch.cuda.synchronize()
    # perf_counter() is the monotonic, high-resolution clock meant for timing.
    start_gpu = time.perf_counter()
    # Use the same generation budget as the CPU run (max_new_tokens=300);
    # a smaller cap here would make the CPU/GPU comparison unfair.
    output_gpu = generator_gpu(prompt_cot, max_new_tokens=300)[0]["generated_text"]
    # Make sure every queued kernel has finished before stopping the clock.
    torch.cuda.synchronize()
    end_gpu = time.perf_counter()
    # NOTE: this is a single cold run, so any first-call overhead is included.

    print("🚀 GPU Output:")
    print(output_gpu)
    print(f"⏱ GPU Inference Time: {end_gpu - start_gpu:.2f} seconds")
else:
    print("🚫 No GPU available.")


Device set to use cuda:0


🚀 GPU Output:
The French Revolution influenced modern democratic systems. The French Revolution influenced modern democratic systems. The answer: yes.
⏱ GPU Inference Time: 0.33 seconds
