# MNIST Example with Data Logging in DataFed

## Import Libraries

In [1]:
%load_ext autoreload
%autoreload 2

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

from datafed_torchflow.computer import get_system_info
from datafed_torchflow.pytorch import TorchLogger


## Paramters to Update

In [2]:
notebook_path = '/home/jca92/DataFed_TorchFlow/examples/Model_logger.ipynb'

## Builds the CNN

In [3]:
# Define the CNN architecture
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        # Convolutional layers
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        
        # Max pooling layer
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # Fully connected layers
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)  # Output layer for 10 classes (digits 0-9)
    
    def forward(self, x):
        # Apply convolutional layers with ReLU and max pooling
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))

        # Flatten the output
        x = x.view(-1, 64 * 7 * 7)

        # Apply fully connected layers with ReLU and final output
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        
        return x


## Define transformations for data preprocessing

In [4]:
transform = transforms.Compose([
    transforms.ToTensor(),  # Convert images to PyTorch tensors
    transforms.Normalize((0.1307,), (0.3081,))  # Normalize with mean and std of MNIST dataset
])


## Load the MNIST dataset


In [5]:
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# Create data loaders
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=1000, shuffle=False)


## Instantiate the model, loss function, and optimizer


In [6]:
model = SimpleCNN()
criterion = nn.CrossEntropyLoss()  # Loss function for multi-class classification
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer


## Training function


In [7]:
def train(model, device, train_loader, optimizer, criterion, epoch):
    model.train()  # Set the model to training mode
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()  # Zero the gradients

        # Forward pass
        output = model(data)
        loss = criterion(output, target)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        if batch_idx % 100 == 0:
            print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} '
                  f'({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}')


## Testing function

In [8]:
def test(model, device, test_loader, criterion):
    model.eval()  # Set the model to evaluation mode
    test_loss = 0
    correct = 0
    with torch.no_grad():  # Disable gradient calculation for evaluation
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)

            # Forward pass
            output = model(data)
            test_loss += criterion(output, target).item()  # Sum up the batch loss
            pred = output.argmax(dim=1, keepdim=True)  # Get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / len(test_loader.dataset)

    print(f'\nTest set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} '
          f'({accuracy:.2f}%)\n')


## Instantiate the DataFed Configuration

## Train Model

In [9]:
# Train and test the CNN
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

n_epochs = 5
for epoch in range(1, n_epochs + 1):
    train(model, device, train_loader, optimizer, criterion, epoch)
    test(model, device, test_loader, criterion)


Test set: Average loss: 0.0001, Accuracy: 9850/10000 (98.50%)


Test set: Average loss: 0.0000, Accuracy: 9914/10000 (99.14%)


Test set: Average loss: 0.0000, Accuracy: 9897/10000 (98.97%)


Test set: Average loss: 0.0000, Accuracy: 9900/10000 (99.00%)


Test set: Average loss: 0.0000, Accuracy: 9900/10000 (99.00%)



In [10]:
instance = TorchLogger(model, optimizer, 'delete/delete_me', script_path=notebook_path)

Unable to connect to pypi: <Fault -32500: 'RuntimeError: PyPI no longer supports the XMLRPC package_releases method. Use JSON or Simple API instead. See https://github.com/pypi/warehouse/issues/16642 and https://warehouse.pypa.io/api-reference/xml-rpc.html#deprecated-methods for more information.'>


In [16]:
instance.getMetadata()

{'layers': {'1-conv1': {'conv1': {'type': 'Conv2d',
    'layer_name': 'conv1',
    'config': {'training': False,
     'in_channels': 1,
     'out_channels': 32,
     'kernel_size': (3, 3),
     'stride': (1, 1),
     'padding': (1, 1),
     'dilation': (1, 1),
     'transposed': False,
     'output_padding': (0, 0),
     'groups': 1,
     'padding_mode': 'zeros'}}},
  '2-conv2': {'conv2': {'type': 'Conv2d',
    'layer_name': 'conv2',
    'config': {'training': False,
     'in_channels': 32,
     'out_channels': 64,
     'kernel_size': (3, 3),
     'stride': (1, 1),
     'padding': (1, 1),
     'dilation': (1, 1),
     'transposed': False,
     'output_padding': (0, 0),
     'groups': 1,
     'padding_mode': 'zeros'}}},
  '3-pool': {'pool': {'type': 'MaxPool2d',
    'layer_name': 'pool',
    'config': {'training': False,
     'kernel_size': 2,
     'stride': 2,
     'padding': 0,
     'dilation': 1,
     'return_indices': False,
     'ceil_mode': False}}},
  '4-fc1': {'fc1': {'type': 'L

In [11]:

# Function to save the model
def save_model(model, filename="mnist_cnn.pth"):
    """
    Saves the model's state dictionary.

    Args:
        model: The PyTorch model to save.
        filename (str): The file path to save the model state dictionary.
    """
    torch.save(model.state_dict(), filename)
    print(f"Model saved to {filename}")

# Function to load the model
def load_model(model, filename="mnist_cnn.pth"):
    """
    Loads the model's state dictionary.

    Args:
        model: The PyTorch model to load the state dictionary into.
        filename (str): The file path to load the model state dictionary from.
    """
    model.load_state_dict(torch.load(filename))
    model.eval()  # Set the model to evaluation mode
    print(f"Model loaded from {filename}")



# Save the model after training
save_model(model, "mnist_cnn.pth")

# Load the model (optional)
# load_model(model, "mnist_cnn.pth")


Model saved to mnist_cnn.pth


In [None]:
import json
import os
import ipykernel
import requests
from notebook import notebookapp

def get_notebook_name():
    kernel_id = os.path.basename(ipykernel.connect.get_connection_file()).split('-')[1]
    servers = list(notebookapp.list_running_servers())
    if servers:
        for server in servers:
            response = requests.get(f"{server['url']}api/sessions", params={'token': server.get('token', '')})
            for session in json.loads(response.text):
                if session['kernel']['id'] == kernel_id:
                    return session['notebook']['path']

In [14]:
from m3util.globus.globus import check_globus_endpoint

check_globus_endpoint(instance.df_api.endpointDefaultGet())

'Endpoint 7d7d5294-23aa-11ef-af02-21fa2ca908a5 is not active'

In [13]:
check_globus_endpoint(instance.df_api.endpointDefaultGet())
from m3util.globus.globus import check_globus_file_access

check_globus_file_access(instance.df_api.endpointDefaultGet, instance.local_path)

GlobusAccessError: Unexpected error: Error accessing '/home/ferroelectric/DataFed_TorchFlow-1/examples': Usage: globus ls [OPTIONS] ENDPOINT_ID[:PATH]

Error: Invalid value for 'ENDPOINT_ID[:PATH]': '<bound method API.endpointDefaultGet of <datafed_torchflow.datafed.DataFed object at 0x7f3b74d44590>>' is not a valid UUID.