In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import time

In [2]:
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.fc = nn.Linear(32 * 7 * 7, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x


In [3]:
import os

In [4]:
os.chdir("/mnt/data/")

In [5]:
try:
    os.mkdir("MNIST")
except Exception as e:
    print("Directory already there")
os.chdir("MNIST")

Directory already there


In [14]:
# Load MNIST, this is just an example
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
trainset = torchvision.datasets.MNIST(root='.', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True, num_workers=2,drop_last=True)

testset = torchvision.datasets.MNIST(root='.', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=1000, shuffle=False, num_workers=2)


In [15]:
# "Naive" model
model = SimpleCNN().cuda()

In [16]:
# Compile the model
compiled_model = torch.compile(model,backend='cudagraphs')

In [17]:
# Optimizer, loss
optimizer = optim.Adam(compiled_model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()


In [18]:
# Helpers
def train(model, trainloader, optimizer, criterion, epochs=1):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        start_time = time.time()
        for i, data in enumerate(trainloader, 0):
            inputs, labels = data[0].cuda(), data[1].cuda()
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        epoch_time = time.time()-start_time
        print(f"Epoch {epoch + 1}, Loss: {running_loss / len(trainloader)}, time: {epoch_time:.2f}s")

def evaluate(model, testloader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data[0].cuda(), data[1].cuda()
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f"Accuracy: {100 * correct / total}%")

In [19]:
torch.set_float32_matmul_precision('high')

In [20]:
next(iter(trainloader))[0].shape

torch.Size([64, 1, 28, 28])

In [21]:
# Train the compiled model
print("Training compiled model:")
start_time = time.time()
train(compiled_model, trainloader, optimizer, criterion, epochs=3)
evaluate(compiled_model, testloader)
end_time = time.time()

print(end_time - start_time)

#Train the uncompiled model.
start_time = time.time()

model2 = SimpleCNN().cuda()
optimizer2 = optim.Adam(model2.parameters(), lr=0.001)
print("Training uncompiled model:")
train(model2, trainloader, optimizer2, criterion, epochs=3)
evaluate(model2, testloader)

end_time = time.time()


print(end_time - start_time)


Training compiled model:
Epoch 1, Loss: 0.1820743108931591, time: 5.71s
Epoch 2, Loss: 0.05601658393766059, time: 5.73s
Epoch 3, Loss: 0.04027422228071435, time: 5.90s




Accuracy: 98.83%
18.16615915298462
Training uncompiled model:
Epoch 1, Loss: 0.1786678190191009, time: 5.85s
Epoch 2, Loss: 0.05807021551740901, time: 5.63s
Epoch 3, Loss: 0.04264528390537975, time: 5.82s
Accuracy: 98.59%
18.048738956451416


In [106]:
class ComplexModel(nn.Module):
    def __init__(self):
        super(ComplexModel, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(1024, 2048),
            nn.ReLU(),
            nn.Linear(2048, 4096),
            nn.ReLU(),
            nn.Linear(4096, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 100),
        )

    def forward(self, x):
        return self.layers(x)

In [107]:
# Random data
batch_size = 128
input_size = 1024
output_size = 100
inputs = torch.randn(batch_size, input_size).cuda()
targets = torch.randn(batch_size, output_size).cuda()

# Models
model = ComplexModel().cuda()
compiled_model = torch.compile(model,mode="max-autotune",backend='inductor')

# Loss and optimizers
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
compiled_optimizer = optim.Adam(compiled_model.parameters(), lr=0.001)

# Training loop
def train(model, optimizer, criterion, inputs, targets, epochs=10):
    model.train()
    start_time = time.time()
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    end_time = time.time()
    return end_time - start_time

# Train uncompiled model
uncompiled_time = train(model, optimizer, criterion, inputs, targets)
print(f"Uncompiled training time: {uncompiled_time:.4f} seconds")

# Warmup
compiled_time = train(compiled_model, compiled_optimizer, criterion, inputs, targets)
print(f"Compiled training time: {compiled_time:.4f} seconds")

print(f"Speedup: {uncompiled_time / compiled_time:.2f}x")

  return F.mse_loss(input, target, reduction=self.reduction)


Uncompiled training time: 0.3453 seconds


AUTOTUNE addmm(128x1, 128x32, 32x1)
  triton_mm_702 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=16, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2
  triton_mm_703 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=16, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
  triton_mm_704 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=16, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_706 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=16, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_707 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=16, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_708 0.0041 ms 100.0% A

E0313 08:33:12.997363 160764 site-packages/torch/_inductor/select_algorithm.py:1477] [2/0] Exception `ptxas` failed with error code -2
E0313 08:33:12.997363 160764 site-packages/torch/_inductor/select_algorithm.py:1477] [2/0] `ptxas` stderr:
E0313 08:33:12.997363 160764 site-packages/torch/_inductor/select_algorithm.py:1477] [2/0] 
E0313 08:33:12.997363 160764 site-packages/torch/_inductor/select_algorithm.py:1477] [2/0] Repro command: /home/nikolas/miniconda3/lib/python3.9/site-packages/triton/backends/nvidia/bin/ptxas -lineinfo -v --gpu-name=sm_86 /tmp/tmpyb375rly.ptx -o /tmp/tmpyb375rly.ptx.o
E0313 08:33:12.997363 160764 site-packages/torch/_inductor/select_algorithm.py:1477] [2/0]  for benchmark choice TritonTemplateCaller(/tmp/torchinductor_nikolas/xt/cxtpo4gcqqa66qioxrafdkpahmf5cogsaubxphjbsksltk25rney.py, ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8)
AUTOTUNE mm(128x1, 1x

KeyboardInterrupt: 

In [None]:
# Train uncompiled model
uncompiled_time = train(model, optimizer, criterion, inputs, targets)
print(f"Uncompiled training time: {uncompiled_time:.4f} seconds")


In [27]:
# Post JIT, damn this took a while.
compiled_time = train(compiled_model, compiled_optimizer, criterion, inputs, targets)
print(f"Compiled training time: {compiled_time:.4f} seconds")

print(f"Speedup: {uncompiled_time / compiled_time:.2f}x")

Compiled training time: 0.0453 seconds
Speedup: 1.72x


In [28]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.amp import autocast, GradScaler
import gc

In [29]:
torch.cuda.empty_cache()
gc.collect()

0

In [31]:
from torch.utils.data import TensorDataset, DataLoader

In [85]:
targets.shape

torch.Size([256])

In [92]:
inputs.sum(axis=1).shape

torch.Size([153600])

In [97]:
import numpy as np

In [119]:
# Random data
batch_size = 1024*150
input_size = 256
output_size = 10
inputs = torch.randn(batch_size, input_size).cuda()
targets = torch.tensor(np.expand_dims([1 if x else 0  for x in inputs.sum(axis=1) > torch.mean(inputs.sum(axis=1))],axis=1))
targets = targets.to(dtype=torch.float).cuda()
# Create TensorDataset
dataset = TensorDataset(inputs, targets)

# Create DataLoader
trainloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
# Helpers
class ComplexModel(nn.Module):
    def __init__(self):
        super(ComplexModel, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(256, 2048),
            nn.ReLU(),
            nn.Linear(2048, 4096),
            nn.ReLU(),
            nn.Linear(4096, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        )

    def forward(self, x):
        return self.layers(x)

    
def get_memory_usage():
    pid = os.getpid()
    process = psutil.Process(pid)
    memory_info = process.memory_info()
    return memory_info.rss / (1024 * 1024)  # Return memory usage in MB

def get_cuda_memory_usage():
    allocated_memory = torch.cuda.memory_allocated() / (1024 * 1024)  # MB
    cached_memory = torch.cuda.memory_reserved() / (1024 * 1024) #MB
    return allocated_memory, cached_memory

def train(model, trainloader, optimizer, criterion, epochs=1):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        start_time = time.time()
        for i, data in enumerate(trainloader, 0):
            inputs, labels = data[0].cuda(), data[1].cuda()
            optimizer.zero_grad()
          
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        
        epoch_time = time.time()-start_time
        print(f"Outputs dtype: {outputs.dtype}")
        allocated_memory, cached_memory = get_cuda_memory_usage()
        print(f"Epoch {epoch + 1}, Loss: {running_loss / len(trainloader)}, time: {epoch_time:.2f}s, Allocated Memory: {allocated_memory:.2f} MB, Cached Memory: {cached_memory:.2f} MB")
        torch.cuda.empty_cache()
        gc.collect()
def train2(model, trainloader, optimizer, criterion,scaler, epochs=1):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        start_time = time.time()
        for i, data in enumerate(trainloader, 0):
            inputs, labels = data[0].cuda(), data[1].cuda()
            optimizer.zero_grad()
            with autocast(device_type='cuda'):
                outputs = model(inputs)
                loss = criterion(outputs, labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            running_loss += loss.item()
        epoch_time = time.time()-start_time
        print(f"Outputs dtype: {outputs.dtype}")
        allocated_memory, cached_memory = get_cuda_memory_usage()
        print(f"Epoch {epoch + 1}, Loss: {running_loss / len(trainloader)}, time: {epoch_time:.2f}s, Allocated Memory: {allocated_memory:.2f} MB, Cached Memory: {cached_memory:.2f} MB")
        torch.cuda.empty_cache()
        gc.collect()


In [120]:
torch.cuda.empty_cache()
gc.collect()

602

In [121]:
model = ComplexModel().cuda()
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
scaler = GradScaler()
print("Training with autocast:")
train2(model, trainloader, optimizer, criterion, scaler, epochs=5)


Training with autocast:
Outputs dtype: torch.float16
Epoch 1, Loss: 0.0, time: 2.01s, Allocated Memory: 4578.44 MB, Cached Memory: 9780.00 MB
Outputs dtype: torch.float16
Epoch 2, Loss: 0.0, time: 2.00s, Allocated Memory: 4578.44 MB, Cached Memory: 9932.00 MB
Outputs dtype: torch.float16
Epoch 3, Loss: 0.0, time: 2.00s, Allocated Memory: 4578.44 MB, Cached Memory: 9932.00 MB
Outputs dtype: torch.float16
Epoch 4, Loss: 0.0, time: 2.01s, Allocated Memory: 4578.44 MB, Cached Memory: 9932.00 MB
Outputs dtype: torch.float16
Epoch 5, Loss: 0.0, time: 1.99s, Allocated Memory: 4578.44 MB, Cached Memory: 9932.00 MB


In [122]:
torch.cuda.empty_cache()
gc.collect()

0

In [123]:
model = ComplexModel().cuda()
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()
print("Training without autocast:")
train(model, trainloader, optimizer, criterion, epochs=5)


Training without autocast:
Outputs dtype: torch.float32
Epoch 1, Loss: 0.44498562812805176, time: 2.31s, Allocated Memory: 4579.57 MB, Cached Memory: 14580.00 MB
Outputs dtype: torch.float32
Epoch 2, Loss: 0.4326412081718445, time: 2.30s, Allocated Memory: 4579.57 MB, Cached Memory: 14732.00 MB
Outputs dtype: torch.float32
Epoch 3, Loss: 0.3801073133945465, time: 2.29s, Allocated Memory: 4579.57 MB, Cached Memory: 14730.00 MB
Outputs dtype: torch.float32
Epoch 4, Loss: 0.24810852110385895, time: 2.28s, Allocated Memory: 4579.57 MB, Cached Memory: 14732.00 MB
Outputs dtype: torch.float32
Epoch 5, Loss: 0.23171819746494293, time: 2.29s, Allocated Memory: 4579.57 MB, Cached Memory: 14732.00 MB


In [124]:
torch.cuda.empty_cache()
gc.collect()

0

In [125]:
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, DistributedSampler
from datasets import load_dataset
from transformers import BertForSequenceClassification, BertTokenizerFast, AdamW

def setup_process(rank, world_size):
    # Initialize the distributed process group (using NCCL backend for GPUs)
    dist.init_process_group(
        backend="nccl",
        init_method="env://",
        world_size=world_size,
        rank=rank,
    )

def cleanup():
    dist.destroy_process_group()

def ddp_training(rank, world_size):
    setup_process(rank, world_size)
    device = torch.device(f"cuda:{rank}")
    
    # Use a clinical transformer model (Bio_ClinicalBERT) for medical text
    model_name = "emilyalsentzer/Bio_ClinicalBERT"
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)
    model.to(device)
    ddp_model = DDP(model, device_ids=[rank])
    
    # Load the MedNLI dataset (a medical natural language inference dataset)
    dataset = load_dataset("mednli", split="train")
    
    # Initialize tokenizer from the same model
    tokenizer = BertTokenizerFast.from_pretrained(model_name)
    
    # Tokenize the dataset: combine "premise" and "hypothesis" with truncation and padding.
    def tokenize_function(example):
        return tokenizer(example["premise"], example["hypothesis"],
                         truncation=True, padding="max_length", max_length=128)
    
    dataset = dataset.map(tokenize_function, batched=True)
    # Set the format for PyTorch tensors, keeping input_ids, attention_mask, and label.
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
    
    # Use DistributedSampler so each process gets a unique subset of data.
    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank, shuffle=True)
    dataloader = DataLoader(dataset, batch_size=8, sampler=sampler)
    
    # Set up the optimizer.
    optimizer = AdamW(ddp_model.parameters(), lr=2e-5)
    
    ddp_model.train()
    num_epochs = 2  # For demonstration, we use a small number of epochs.
    for epoch in range(num_epochs):
        # Set epoch for sampler to have proper shuffling
        sampler.set_epoch(epoch)
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)
            
            outputs = ddp_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # Only rank 0 logs to avoid duplicate output.
            if rank == 0:
                print(f"Epoch {epoch}, Loss: {loss.item()}")
    
    cleanup()

In [126]:
# Use the number of available GPUs for distributed training
world_size = torch.cuda.device_count()
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "12355"

# Spawn one process per GPU
mp.spawn(ddp_training, args=(world_size,), nprocs=world_size, join=True)


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/home/nikolas/miniconda3/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/home/nikolas/miniconda3/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'ddp_training' on <module '__main__' (built-in)>


ProcessExitedException: process 0 terminated with exit code 1