In [1]:
import time
import torch

import torch.optim as optim

from torch import nn
from torch.utils.data import DataLoader, TensorDataset

def print_trainable_summary(model: torch.nn.Module) -> None:
    total = 0
    trainable = 0
    trainable_names = []
    for name, p in model.named_parameters():
        n = p.numel()
        total += n
        if p.requires_grad:
            trainable += n
            trainable_names.append((name, n))
    print(
        f"Total params: {total:,}; Trainable params: {trainable:,} ({100 * trainable / total:.4f}%)"
    )
    print("Trainable parameter groups (name, numel):")
    for nm, nmels in trainable_names:
        print(" ", nm, nmels)

  import pynvml  # type: ignore[import]


In [3]:
class BigMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(BigMLP, self).__init__()
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, hidden_dim)
        self.layer3 = nn.Linear(hidden_dim, hidden_dim)
        self.layer4 = nn.Linear(hidden_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        x = torch.relu(self.layer3(x))
        x = torch.relu(self.layer4(x))
        x = self.output(x)
        return x

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

input_dim=10
hidden_dim = 6400
output_dim = 1
model = BigMLP(input_dim, hidden_dim, output_dim).to(device)
print_trainable_summary(model)

cuda
Total params: 122,976,001; Trainable params: 122,976,001 (100.0000%)
Trainable parameter groups (name, numel):
  layer1.weight 64000
  layer1.bias 6400
  layer2.weight 40960000
  layer2.bias 6400
  layer3.weight 40960000
  layer3.bias 6400
  layer4.weight 40960000
  layer4.bias 6400
  output.weight 6400
  output.bias 1


In [4]:
def get_gpu_utilization():
    if torch.cuda.is_available():
        gpu_memory = torch.cuda.memory_allocated() / 1024**2  # MB
        gpu_utilization = torch.cuda.utilization()  # percentage
        return gpu_memory, gpu_utilization
    return 0, 0

num_samples=1000
X = torch.randn(num_samples, input_dim)
y = torch.sum(X, dim=1) + torch.randn(num_samples) * 0.1  # sum of inputs with noise

start_time = time.time()
model.train()
total_loss = 0
epochs=10

# Convert dataset to DataLoader for batch processing
dataset = TensorDataset(X, y)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(epochs):
    epoch_loss = 0
    for inputs, labels in data_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        epoch_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    total_loss += epoch_loss
    elapsed_time = time.time() - start_time
    gpu_memory, gpu_utilization = get_gpu_utilization()

    # Print performance metrics every epoch
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}, "
            f"Time: {elapsed_time:.2f}s, "
            f"GPU Memory: {gpu_memory:.2f} MB, "
            f"GPU Utilization: {gpu_utilization}%")

total_time = time.time() - start_time

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [1/10], Loss: 53081.7775, Time: 3.13s, GPU Memory: 1892.72 MB, GPU Utilization: 98%
Epoch [2/10], Loss: 331.7734, Time: 4.62s, GPU Memory: 1892.72 MB, GPU Utilization: 97%
Epoch [3/10], Loss: 339.1843, Time: 6.10s, GPU Memory: 1892.72 MB, GPU Utilization: 98%
Epoch [4/10], Loss: 340.9896, Time: 7.58s, GPU Memory: 1892.72 MB, GPU Utilization: 98%
Epoch [5/10], Loss: 335.1193, Time: 9.06s, GPU Memory: 1892.72 MB, GPU Utilization: 98%
Epoch [6/10], Loss: 333.3865, Time: 10.54s, GPU Memory: 1892.72 MB, GPU Utilization: 98%
Epoch [7/10], Loss: 337.8707, Time: 12.01s, GPU Memory: 1892.72 MB, GPU Utilization: 98%
Epoch [8/10], Loss: 332.7873, Time: 13.49s, GPU Memory: 1892.72 MB, GPU Utilization: 98%
Epoch [9/10], Loss: 351.3964, Time: 14.97s, GPU Memory: 1892.72 MB, GPU Utilization: 97%
Epoch [10/10], Loss: 335.5929, Time: 16.45s, GPU Memory: 1892.72 MB, GPU Utilization: 98%


In [2]:
import torch
from torch import nn


class LoRAAdapter(nn.Module):
    """
    Wraps an existing nn.Linear (or nn.Embedding) and adds low-rank adapters.
    """

    def __init__(
        self, module: nn.Module, r: int = 4, alpha: int = None, init_scale: float = 1e-3
    ):
        super().__init__()
        self.module = module  # e.g., nn.Linear
        self.r = r
        if alpha is None:
            alpha = r
        self.scaling = alpha / r

        # For Linear: out_features x in_features -> we create A: r x in, B: out x r
        if isinstance(module, nn.Linear):
            in_f = module.in_features
            out_f = module.out_features
            self.A = nn.Parameter(torch.zeros(r, in_f))
            self.B = nn.Parameter(torch.zeros(out_f, r))
            nn.init.kaiming_uniform_(self.A, a=5**0.5)
            nn.init.zeros_(self.B)
        else:
            raise NotImplementedError
        
        # freeze the base
        for p in self.module.parameters():
            p.requires_grad = False

    def forward(self, x):
        base = self.module(x)
        delta = (x @ self.A.T) @ self.B.T
        return base + delta * self.scaling


def find_parent_and_attr(model, module_name: str):
    parts = module_name.split(".")
    parent = model
    for p in parts[:-1]:
        # handle list indices in module names like blocks.3.attn
        if p.isdigit():
            parent = parent[int(p)]
        else:
            parent = getattr(parent, p)
    return parent, parts[-1]


def apply_lora_to_linear_modules(
    model: nn.Module, target_modules: list[str], r=4, alpha=None
):
    for name, mod in model.named_modules():
        if isinstance(mod, nn.Linear) and any([tm in name for tm in target_modules]):
            print(name)
            parent, attr = find_parent_and_attr(model, name)
            wrapped = LoRAAdapter(mod, r=r, alpha=alpha)
            wrapped.to(mod.weight.device)
            setattr(parent, attr, wrapped)


In [3]:
class BigMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(BigMLP, self).__init__()
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, hidden_dim)
        self.layer3 = nn.Linear(hidden_dim, hidden_dim)
        self.layer4 = nn.Linear(hidden_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        x = torch.relu(self.layer3(x))
        x = torch.relu(self.layer4(x))
        x = self.output(x)
        return x

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

input_dim=10
hidden_dim = 6400
output_dim = 1
model = BigMLP(input_dim, hidden_dim, output_dim).to(device)
print_trainable_summary(model)

apply_lora_to_linear_modules(
    model, target_modules=["layer2", "layer3", "layer4"], r=8, alpha=32
)
print_trainable_summary(model)

cuda
Total params: 122,976,001; Trainable params: 122,976,001 (100.0000%)
Trainable parameter groups (name, numel):
  layer1.weight 64000
  layer1.bias 6400
  layer2.weight 40960000
  layer2.bias 6400
  layer3.weight 40960000
  layer3.bias 6400
  layer4.weight 40960000
  layer4.bias 6400
  output.weight 6400
  output.bias 1
layer2
layer3
layer4
Total params: 123,283,201; Trainable params: 384,001 (0.3115%)
Trainable parameter groups (name, numel):
  layer1.weight 64000
  layer1.bias 6400
  layer2.A 51200
  layer2.B 51200
  layer3.A 51200
  layer3.B 51200
  layer4.A 51200
  layer4.B 51200
  output.weight 6400
  output.bias 1


In [4]:
def get_gpu_utilization():
    if torch.cuda.is_available():
        gpu_memory = torch.cuda.memory_allocated() / 1024**2  # MB
        gpu_utilization = torch.cuda.utilization()  # percentage
        return gpu_memory, gpu_utilization
    return 0, 0

num_samples=1000
X = torch.randn(num_samples, input_dim)
y = torch.sum(X, dim=1) + torch.randn(num_samples) * 0.1  # sum of inputs with noise

start_time = time.time()
model.train()
total_loss = 0
epochs=10

# Convert dataset to DataLoader for batch processing
dataset = TensorDataset(X, y)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(epochs):
    epoch_loss = 0
    for inputs, labels in data_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        epoch_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    total_loss += epoch_loss
    elapsed_time = time.time() - start_time
    gpu_memory, gpu_utilization = get_gpu_utilization()

    # Print performance metrics every epoch
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}, "
            f"Time: {elapsed_time:.2f}s, "
            f"GPU Memory: {gpu_memory:.2f} MB, "
            f"GPU Utilization: {gpu_utilization}%")

total_time = time.time() - start_time

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [1/10], Loss: 296.7354, Time: 2.52s, GPU Memory: 490.94 MB, GPU Utilization: 2%
Epoch [2/10], Loss: 300.0705, Time: 2.76s, GPU Memory: 490.94 MB, GPU Utilization: 46%
Epoch [3/10], Loss: 298.0779, Time: 2.98s, GPU Memory: 490.94 MB, GPU Utilization: 62%
Epoch [4/10], Loss: 303.6243, Time: 3.19s, GPU Memory: 490.94 MB, GPU Utilization: 70%
Epoch [5/10], Loss: 300.2487, Time: 3.41s, GPU Memory: 490.94 MB, GPU Utilization: 71%
Epoch [6/10], Loss: 299.3167, Time: 3.63s, GPU Memory: 490.94 MB, GPU Utilization: 68%
Epoch [7/10], Loss: 297.3388, Time: 3.84s, GPU Memory: 490.94 MB, GPU Utilization: 71%
Epoch [8/10], Loss: 294.4271, Time: 4.07s, GPU Memory: 490.94 MB, GPU Utilization: 71%
Epoch [9/10], Loss: 300.3594, Time: 4.28s, GPU Memory: 490.94 MB, GPU Utilization: 68%
Epoch [10/10], Loss: 306.8770, Time: 4.50s, GPU Memory: 490.94 MB, GPU Utilization: 71%
