# GPU & LLM Deployment on Colab: From Zero to Practical
Safe for free Colab (T4/L4/A100 when available).

## 0) GPU Runtime Check & Basics

In [1]:
import torch, platform, os, textwrap, subprocess, sys
print("Python:", sys.version)
print("Pytorch", torch.__version__)
print("CUDA available:", torch.cuda.is_available)

Python: 3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]
Pytorch 2.8.0+cu126
CUDA available: <function is_available at 0x7ac5fcd980e0>


In [3]:
if torch.cuda.is_available():
  print("GPU count", torch.cuda.device_count())
  print("GPU name:", torch.cuda.get_device_name(0))
  !nvidia-smi || true
else:
  print("No GPU detected")


GPU count 1
GPU name: Tesla T4
Sun Sep  7 14:33:50 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   65C    P8             11W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                 

## 1) Install Core Libraries
- `transformers`, `accelerate` for LLMs
- `bitsandbytes` for 8-bit/4-bit loading (VRAM savings)
- `datasets` for toy tasks

In [4]:
%%bash
pip -q install --upgrade transformers accelerate bitsandbytes datasets sentencepiece
python - << 'PY'
import torch, sys
print("Torch:", torch.__version__)
print("CUDA:", torch.version.cuda if hasattr(torch.version, "cuda") else None)
print("Is CUDA available:", torch.cuda.is_available())
PY

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 42.2/42.2 kB 4.3 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 11.6/11.6 MB 72.5 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 61.3/61.3 MB 16.2 MB/s eta 0:00:00
Torch: 2.8.0+cu126
CUDA: 12.6
Is CUDA available: True


## 2) PyTorch on GPU: Tensors, Memory, and Speed

In [10]:
import torch, time
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device: ", device)

# Allocate big tensors on CPU vs GPU and measure speed
N = 4096
# torch.randn(N, N) function in PyTorch creates a square tensor of size N x N filled with random
# numbers. These random numbers are drawn from a standard normal distribution,
# meaning they have a mean of 0 and a variance of 1.
a_cpu = torch.randn(N,N)
b_cpu = torch.randn(N,N)

start = time.time()
#c_cpu = torch.matmul(a_cpu, b_cpu)
c_cpu = a_cpu @ b_cpu
end = time.time()
cpu_time = end - start
print("CPU time:", cpu_time)


Using device:  cuda
CPU time: 2.9028379917144775


In [11]:
if torch.cuda.is_available():
  a_gpu = a_cpu.to(device, non_blocking=True)
  b_gpu = b_cpu.to(device, non_blocking=True)
  torch.cuda.synchronize()
  start = time.time()
  c_gpu = a_gpu @ b_gpu
  torch.cuda.synchronize()
  gpu_time = time.time() - start
  print(f"Matrix multiplication time CPU: {cpu_time} | GPU: {gpu_time}")
else:
  print(f"Matrix multiplication time CPU: {cpu_time} | GPU: N/A")

Matrix multiplication time CPU: 2.9028379917144775 | GPU: 0.14554810523986816
