In [2]:
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision import models
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
from opacus import PrivacyEngine
from opacus.validators import ModuleValidator
import numpy as np
from tqdm import tqdm
import time

In [3]:
# Drop any cached CUDA allocations left behind by earlier runs in this kernel.
torch.cuda.empty_cache()

# --- Differential-privacy settings ---
EPSILON = 3.0          # target epsilon handed to the privacy engine
DELTA = 1e-5           # target delta handed to the privacy engine
MAX_GRAD_NORM = 1.5    # per-sample gradient clipping bound

# --- Optimisation settings ---
EPOCHS = 100
LR = 1e-3
BATCH_SIZE = 64
MAX_PHYSICAL_BATCH_SIZE = 64  # NOTE(review): not referenced by the visible cells

# --- Data location ---
DATA_ROOT = './cifar10'

In [4]:
# Commonly quoted per-channel mean / standard deviation for CIFAR-10 images
# scaled to [0, 1].
CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR10_STD_DEV = (0.2023, 0.1994, 0.2010)

# Preprocessing shared by the train and test splits.
# The network is trained from scratch on CIFAR-10, so inputs are normalised
# with the CIFAR-10 statistics defined above rather than ImageNet constants.
# NOTE(review): Resize(256) + CenterCrop(224) upsamples the dataset images to
# 224x224; confirm this is intended together with the 3x3 stride-1 stem that
# replaces `model.conv1`.
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=CIFAR10_MEAN, std=CIFAR10_STD_DEV),
])

In [5]:
# Training split: downloaded on demand, shuffled by the loader.
train_dataset = torchvision.datasets.CIFAR10(
    root=DATA_ROOT,
    train=True,
    download=True,
    transform=transform,
)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)

# Held-out split: same preprocessing, iterated in a fixed order.
test_dataset = torchvision.datasets.CIFAR10(
    root=DATA_ROOT,
    train=False,
    download=True,
    transform=transform,
)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

Files already downloaded and verified
Files already downloaded and verified


In [6]:
# ResNet-50 without pretrained weights and with a 10-way classification head.
model = models.resnet50(num_classes=10, pretrained=False)

# Replace the first convolution with a 3x3, stride-1, padding-1 layer (no bias).
model.conv1 = nn.Conv2d(
    in_channels=3,
    out_channels=64,
    kernel_size=3,
    stride=1,
    padding=1,
    bias=False,
)



In [7]:
# Collect (without raising) any layers the Opacus validator rejects, and let
# the validator substitute them only when something was reported.
errors = ModuleValidator.validate(model, strict=False)
model = ModuleValidator.fix(model) if errors else model

# Strict re-check; left as the final expression so the cell displays the result.
ModuleValidator.validate(model, strict=True)

[]

In [7]:
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# if torch.cuda.device_count() > 1:
#     model = nn.DataParallel(model)
#device_ids = [0, 1]  # Specify the GPU IDs you want to use
# model = nn.DataParallel(model, device_ids=device_ids)
# model.to('cuda')  # Move the model to the first GPU
import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

def setup_distributed():
    """Initialise ``torch.distributed`` (NCCL backend) for the current process.

    Rendezvous settings are read from the environment. Any of ``WORLD_SIZE``,
    ``RANK``, ``LOCAL_RANK``, ``MASTER_ADDR`` and ``MASTER_PORT`` that a
    launcher did not provide is filled in with a single-process default, so
    the function also works when called from a lone notebook kernel.

    The world size defaults to 1 rather than the number of visible GPUs: a
    single process that announces a larger world size would block in
    ``init_process_group`` waiting for ranks that are never started.

    Calling the function again in the same process is safe; an already
    initialised process group is reused instead of being created twice.

    Returns:
        tuple: ``(local_rank, global_rank, device)`` where ``device`` is
        ``torch.device(f'cuda:{local_rank}')``.

    Raises:
        RuntimeError: if CUDA is not available.
    """
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available. Distributed training requires CUDA.")

    # Only fill in values the launcher has not already provided.
    os.environ.setdefault('WORLD_SIZE', '1')
    os.environ.setdefault('RANK', '0')
    os.environ.setdefault('LOCAL_RANK', '0')
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ.setdefault('MASTER_PORT', '12355')

    # Select this process' GPU before the NCCL process group is created.
    local_rank = int(os.environ['LOCAL_RANK'])
    torch.cuda.set_device(local_rank)
    device = torch.device(f'cuda:{local_rank}')

    # Re-running the notebook cell must not try to initialise the group twice.
    if not dist.is_initialized():
        dist.init_process_group(backend='nccl')

    global_rank = dist.get_rank()

    return local_rank, global_rank, device

def prepare_model(model, device):
    """Place ``model`` on ``device`` and return it wrapped in DistributedDataParallel.

    Args:
        model: module to distribute.
        device: device the module is moved to; also passed as the only
            entry of ``device_ids``.

    Returns:
        The ``DistributedDataParallel`` wrapper around the moved module.
    """
    on_device = model.to(device)
    return DDP(on_device, device_ids=[device])

# Initialise the process group for this kernel and keep the rank information
# plus the CUDA device this process should use; later cells move the model and
# the batches onto `device`.
local_rank, global_rank, device = setup_distributed()

# NOTE(review): no visible cell tears the process group down again; consider
# calling dist.destroy_process_group() once training has finished.

In [8]:
# Classification loss used by the training loop.
criterion = nn.CrossEntropyLoss()

# NAdam over every model parameter; this optimizer is later handed to the
# privacy engine.
optimizer = optim.NAdam(params=model.parameters(), lr=LR)

In [9]:
# Attach Opacus to the model / optimizer / loader triple. The engine is asked
# to meet the (EPSILON, DELTA) target over EPOCHS epochs, with per-sample
# gradients clipped to MAX_GRAD_NORM.
privacy_engine = PrivacyEngine()
model, optimizer, train_loader = privacy_engine.make_private_with_epsilon(
    # objects to be wrapped
    module=model,
    optimizer=optimizer,
    data_loader=train_loader,
    # privacy budget and training horizon
    target_epsilon=EPSILON,
    target_delta=DELTA,
    epochs=EPOCHS,
    # clipping bound
    max_grad_norm=MAX_GRAD_NORM,
)



In [10]:
def adjust_privacy_parameters(epoch, dp_optimizer=None, total_epochs=None,
                              initial_norm=1.5, final_norm=0.5):
    """Linearly decay the per-sample clipping bound as training progresses.

    The bound is written to the optimizer's ``max_grad_norm`` attribute, which
    is the value an Opacus ``DPOptimizer`` uses when clipping. Assigning it to
    the ``PrivacyEngine`` object, as this function used to do, only creates an
    unused attribute and leaves the clipping unchanged.

    Args:
        epoch: current epoch index (0-based).
        dp_optimizer: optimizer whose ``max_grad_norm`` is updated. Defaults
            to the notebook-level ``optimizer``.
        total_epochs: number of epochs over which the decay runs. Defaults to
            the notebook-level ``EPOCHS``.
        initial_norm: clipping bound at epoch 0.
        final_norm: clipping bound once ``epoch`` reaches ``total_epochs``.

    Returns:
        float: the clipping bound that was applied.
    """
    target = optimizer if dp_optimizer is None else dp_optimizer
    horizon = EPOCHS if total_epochs is None else total_epochs

    # Clamp so an out-of-range epoch can never push the bound past either end.
    progress = min(max(epoch / horizon, 0.0), 1.0)
    new_norm = initial_norm - progress * (initial_norm - final_norm)

    target.max_grad_norm = new_norm
    return new_norm


In [11]:
def train(model, train_loader, optimizer, criterion, device, current_epoch=None):
    """Run one training epoch and return ``(mean_batch_loss, accuracy_percent)``.

    Args:
        model: network to optimise; switched to training mode.
        train_loader: iterable yielding ``(inputs, targets)`` batches.
        optimizer: optimizer that is zeroed and stepped once per batch.
        criterion: loss callable applied to ``(outputs, targets)``.
        device: device every batch is moved to.
        current_epoch: epoch index forwarded to ``adjust_privacy_parameters``.
            When omitted, the notebook-level loop variable ``epoch`` is used,
            which is what earlier call sites rely on.

    Returns:
        tuple: the loss averaged over the batches that were processed and the
        accuracy in percent. Each value is ``0.0`` when there was nothing to
        average (empty loader / zero samples) instead of raising
        ``ZeroDivisionError``.
    """
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    num_batches = 0
    for batch_idx, (inputs, targets) in enumerate(tqdm(train_loader)):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        num_batches += 1
        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

        # Re-evaluate the privacy schedule every 100 batches (including the first).
        if batch_idx % 100 == 0:
            # Fall back to the module-level `epoch` only when the caller did
            # not pass the epoch explicitly.
            adjust_privacy_parameters(epoch if current_epoch is None else current_epoch)

    mean_loss = train_loss / num_batches if num_batches else 0.0
    accuracy = 100. * correct / total if total else 0.0
    return mean_loss, accuracy

In [None]:
# Move the model to this process' device and wrap it for distributed training.
# NOTE(review): in this notebook the wrap happens after the optimizer was built
# from `model.parameters()` and after `make_private_with_epsilon` replaced
# `model`; presumably the distributed wrap should come before both — TODO
# confirm against the Opacus distributed-training guidance and reorder cells.
model = prepare_model(model, device)

In [12]:
# Train for EPOCHS epochs, reporting loss, accuracy and wall-clock time per epoch.
# `epoch` is deliberately the loop variable name: `train` reads it as a global.
for epoch in range(EPOCHS):
    start_time = time.time()
    train_loss, train_accuracy = train(
        model=model,
        train_loader=train_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
    )
    end_time = time.time()
    print(f'Epoch: {epoch+1}, Loss: {train_loss:.4f}, Accuracy: {train_accuracy:.2f}%, Duration: {end_time - start_time:.2f}s')

  0%|          | 0/781 [00:01<?, ?it/s]


RuntimeError: You are trying to call the hook of a dead Module!

In [None]:
def test(model, test_loader, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()

    accuracy = 100 * correct / total
    print(f'Test Accuracy: {accuracy:.2f}%')
    return accuracy

In [1]:
import socket

def is_port_free(port):
    """Report whether a TCP/IPv4 socket can currently be bound to ``port``.

    A throwaway socket is bound to ``('', port)`` and closed again.

    Args:
        port: port number to probe.

    Returns:
        bool: ``True`` if the bind succeeded, ``False`` if creating or binding
        the socket raised a socket error.
    """
    try:
        probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    except OSError:
        return False

    try:
        probe.bind(('', port))
    except OSError:
        return False
    else:
        return True
    finally:
        # Always release the probe socket, whatever the outcome.
        probe.close()

# Probe the port used as the default MASTER_PORT for the distributed rendezvous.
port_to_check = 12355
print(
    f"Port {port_to_check} is free"
    if is_port_free(port_to_check)
    else f"Port {port_to_check} is in use"
)

Port 12355 is free
