In [None]:
# Install dependencies with the %pip magic so they land in the environment of the running kernel
%pip install torch neptune_scale GPUtil psutil

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import time
import subprocess
import threading
import queue
import GPUtil
import psutil
from neptune_scale import Run
from uuid import uuid4

In [None]:
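# Set Neptune credentials as environment variables (uncomment and fill in your own values).
# Note: %env stores the value literally, so do not wrap it in quotes.
# %env NEPTUNE_API_TOKEN=YOUR_API_TOKEN
# %env NEPTUNE_PROJECT=WORKSPACE/PROJECT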

In [None]:
# Set parameters describing the hardware this run executes on
params = {
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    "num_gpus": torch.cuda.device_count(),
}

# Initialize Neptune Run object with a unique, "system-"-prefixed run id
run = Run(
    run_id=f"system-{uuid4()}"
)

# Log each parameter under a key that names what it actually holds.
# The device is logged as its string form (e.g. "cuda" / "cpu") rather than as a torch.device object.
run.log_configs(
    {
        "config/device": str(params["device"]),
        "config/num_gpus": params["num_gpus"],
    }
)

run.add_tags(tags=["GPU"], group_tags=True)
run.add_tags(tags=["monitor_gpu", "single-node"])

In [None]:
# Function to track system (CPU and GPU) usage and log to Neptune
## This function will be launched on a background thread to track the metrics at each step
def track_system_metrics(run, event):
    """Log one CPU/GPU metrics snapshot to Neptune every time `event` is set.

    Meant to be the target of a background thread. The loop never returns on
    its own: each iteration blocks on ``event.wait()``, takes a snapshot and
    passes it to ``run.log_metrics`` with a locally incremented step number.

    Args:
        run: Object exposing ``log_metrics(data=..., step=...)`` (the Neptune
            ``Run`` in this notebook).
        event: ``threading.Event`` that the training loop sets after each step.

    Note:
        An Event is a single flag, so signals arriving faster than snapshots
        can be taken collapse into one. The step logged here counts snapshots
        taken, which can therefore fall behind the caller's own step count.
    """
    # psutil documents the first non-blocking cpu_percent() call as returning a
    # meaningless 0.0. Make that throwaway call here so every logged value
    # covers a real interval.
    psutil.cpu_percent()

    step_counter = 0
    while True:
        event.wait()  # Wait until the training step signals readiness
        # Reset the flag *before* collecting. Clearing only after logging would
        # erase a signal raised while this snapshot is being taken and logged.
        event.clear()
        step_counter += 1

        # Log system metrics to Neptune
        run.log_metrics(data=_collect_system_metrics(), step=step_counter)


def _collect_system_metrics():
    """Return a fresh dict of CPU and per-GPU readings keyed by metric path."""
    # A new dict per snapshot means readings for a GPU that is no longer
    # reported are not re-logged as stale values.
    metrics = {"system/monitor/CPU/percent": psutil.cpu_percent()}

    for gpu in GPUtil.getGPUs():
        prefix = f"system/monitor/GPU/{gpu.name}-{gpu.id}-{gpu.uuid}"
        # Memory figures are divided by 1024 and stored under *_GB keys
        # (assumes GPUtil reports them in MB - TODO confirm).
        metrics[f"{prefix}/memory_used_GB"] = gpu.memoryUsed / 1024
        metrics[f"{prefix}/memory_total_GB"] = gpu.memoryTotal / 1024
        # memoryUtil is scaled by 100 to store a percentage
        metrics[f"{prefix}/memory_utilized_percent"] = gpu.memoryUtil * 100
        metrics[f"{prefix}/memory_free_GB"] = gpu.memoryFree / 1024
        metrics[f"{prefix}/temperature_celsius"] = gpu.temperature

    return metrics

In [None]:
# Dummy dataset and model for demonstration
class SimpleDataset(torch.utils.data.Dataset):
    """Random in-memory dataset: `size` rows of 10 normal features with a 0/1 label each."""

    def __init__(self, size=1000):
        # Features are drawn first, then labels (same order of random draws as before)
        features = torch.randn(size, 10)
        labels = torch.randint(0, 2, (size,))
        self.data, self.target = features, labels

    def __len__(self):
        # Number of rows in the feature tensor
        return self.data.shape[0]

    def __getitem__(self, idx):
        sample = self.data[idx]
        label = self.target[idx]
        return sample, label


class SimpleModel(nn.Module):
    """Single linear layer mapping 10 input features to 2 output scores."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(in_features=10, out_features=2)

    def forward(self, x):
        # Apply the linear layer and hand back its raw output
        logits = self.fc(x)
        return logits


# Training setup: choose the device, build the model, then the loss, optimizer and data pipeline
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = SimpleModel()

# When more than one GPU is visible, wrap the model in DataParallel
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
dataset = SimpleDataset()
dataloader = torch.utils.data.DataLoader(dataset, shuffle=True, batch_size=32)

# Event the training loop sets after each step to wake the metrics thread
event = threading.Event()

# Launch the system-metrics tracker on a daemon background thread
system_monitoring_thread = threading.Thread(
    daemon=True,
    target=track_system_metrics,
    args=(run, event),
)
system_monitoring_thread.start()

# Training loop
num_epochs = 10
step_counter = 0  # total number of optimizer steps taken across all epochs
try:
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        num_batches = 0
        for inputs, labels in dataloader:
            # Fixed half-second pause per step (presumably to stretch the demo
            # so system metrics have time to accumulate - TODO confirm)
            time.sleep(0.5)
            step_counter += 1
            inputs, labels = inputs.to(device), labels.to(device)

            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            # Signal the system monitoring thread to log metrics once the
            # backward pass and optimizer step for this batch are complete
            event.set()

            # Track running loss
            running_loss += loss.item()
            num_batches += 1

        print(f"Epoch {epoch+1} complete.")
        # Report the loss that was being accumulated; skipped if the loader yielded no batches
        if num_batches:
            print(f"Epoch {epoch+1} mean loss: {running_loss / num_batches:.4f}")

    print("Training Finished!")
finally:
    # Close the run even when training raises or is interrupted, so it is
    # never left open by a failed cell.
    run.close()