### Comparing Training and Evaluation Performance of a Simple Neural Network on CPU and GPU using TensorFlow and PyTorch:

- uses PyTorch's 'torch.device' to switch which device is being used
- the selected device (CPU or GPU) performs all computations for training (forward and backward passes) and inference

## Plan:
1. Model is defined by PyTorch's nn.Module
2. Data is loaded using DataLoader
3. Training: for each epoch, the model processes the training data batch by batch. For each batch, the data and target labels are moved to 'device', the forward pass computes the output, the loss is computed using 'CrossEntropyLoss', the backward pass computes the gradients, and the optimizer updates the model parameters
4. Inference: the model processes the test data batch by batch. For each batch, the data and target labels are moved to 'device' and the model performs a forward pass to compute predictions. In addition, accuracy, inference time, throughput, CPU usage and memory usage are calculated


# BEST Code to test both GPU and CPU

### My GPU:
Chipset Model: Apple M2
- Type: GPU (Graphics Processing Unit)
- Bus: Built-In
- Total Number of Cores: 8

### MY CPU:
Model Name: MacBook Air
- Model Identifier: Mac14,2
- Chip: Apple M2
- Total Number of Cores: 8 (4 performance and 4 efficiency)

## Import Packages

In [1]:
#import the necessary packages
#standard library
import time

#third-party
import psutil
import tensorflow as tf
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms  #'transforms' supplies Compose/ToTensor/Normalize used in Data Preparation


## Data Preparation

In [3]:
#preprocessing pipeline applied to every MNIST image
#ToTensor - turns a PIL Image / NumPy array into a PyTorch tensor
#Normalize - normalizes the tensor with mean 0.5 and standard deviation 0.5
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

#MNIST train and test splits, stored under ./data (download=True fetches them if needed)
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

#dataloaders serve 64 samples per batch
#training data is shuffled, test data keeps its original order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=64)

## Define Model

In [4]:
#small fully connected classifier for 28x28 inputs
class SimpleNN(nn.Module):
    """Three linear layers (784 -> 128 -> 64 -> 10) with ReLU between them."""

    def __init__(self):
        super().__init__()
        #input size 28 * 28 matches a flattened image; 10 outputs, one per class
        self.fc1 = nn.Linear(28 * 28, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)

    def forward(self, x):
        """Return the raw fc3 output: one score per class for each sample."""
        #collapse everything after the batch dimension into a single axis
        flat = x.flatten(start_dim=1)
        hidden_a = torch.relu(self.fc1(flat))
        hidden_b = torch.relu(self.fc2(hidden_a))
        #no activation on the last layer; the caller's loss works on these scores
        return self.fc3(hidden_b)

## System Metrics Function

In [5]:
#snapshot of system load: CPU usage and memory usage, both as percentages
def get_system_metrics():
    """Return a (cpu_percent, memory_percent) tuple obtained from psutil."""
    return psutil.cpu_percent(), psutil.virtual_memory().percent

## Training Loop

In [6]:
#function for training loop
#number of epochs defaults to 5
def train_model(model, train_loader, criterion, optimizer, device, num_epochs=5):
    """Train `model` on `train_loader` and print per-epoch statistics.

    Args:
        model: network to train; it is moved to `device` before training.
        train_loader: iterable yielding (data, target) batches.
        criterion: loss function, called as criterion(output, target).
        optimizer: optimizer that updates the model parameters.
        device: device the model and every batch are moved to.
        num_epochs: number of passes over `train_loader`.

    Returns:
        Total training time in seconds, summed over all epochs.
    """
    model.train()  #sets model to training mode
    model.to(device)  #move model to the specified device
    total_training_time = 0.0  #accumulated over all epochs

    #loops over every epoch
    for epoch in range(num_epochs):
        #psutil.cpu_percent() reports usage since its previous call, so reset the
        #baseline here; the reading taken after the epoch then covers this epoch only
        get_system_metrics()

        #perf_counter is monotonic and has a higher resolution than time.time()
        start_time = time.perf_counter()
        epoch_loss = 0.0
        num_batches = 0  #counted by hand so loaders without __len__ also work
        #inner loop iterates over the batches of data from the training dataset
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)  #move batch to the specified device
            optimizer.zero_grad()  #clears the gradients of optimized tensors
            output = model(data)  #forward pass
            loss = criterion(output, target)  #how well the predictions match the targets
            loss.backward()  #backward pass computes gradients
            optimizer.step()  #updates model params
            #.item() copies the scalar to the host, which also waits for the device to finish the batch
            epoch_loss += loss.item()
            num_batches += 1

        total_epoch_time = time.perf_counter() - start_time  #time taken for this epoch
        total_training_time += total_epoch_time

        cpu_usage, memory_usage = get_system_metrics()

        #an empty loader has no mean loss; report nan instead of dividing by zero
        mean_loss = epoch_loss / num_batches if num_batches else float('nan')
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {mean_loss:.4f}, Time: {total_epoch_time:.2f}s, CPU Usage: {cpu_usage}%, Memory Usage: {memory_usage}%')

    #print total training time after all epochs
    print(f'Total Training Time: {total_training_time:.2f}s')
    return total_training_time

## Inference Loop

In [7]:
#helper: block until work queued on an accelerator has finished
def _synchronize(device):
    """Wait for pending kernels on `device` so wall-clock timings are meaningful."""
    device_type = torch.device(device).type
    if device_type == 'cuda':
        torch.cuda.synchronize()
    elif device_type == 'mps':
        torch.mps.synchronize()
    #other device types (e.g. cpu) are not synchronized here


#inference loop
#evaluates the model on the test data and reports accuracy, timing and system load
def evaluate_model(model, test_loader, device):
    """Evaluate `model` on `test_loader` and print the resulting metrics.

    Args:
        model: network to evaluate; it is moved to `device` first.
        test_loader: iterable yielding (data, target) batches.
        device: device the model and every batch are moved to.

    Returns:
        Tuple (accuracy, avg_inference_time, throughput). A metric that is
        undefined (no samples, no batches or zero measured time) is nan.
    """
    model.eval()  #sets model to evaluation mode
    model.to(device)  #move model to the specified device
    #initialize metrics
    total_correct = 0
    total_samples = 0
    total_inference_time = 0.0
    num_batches = 0  #counted by hand so loaders without __len__ also work

    with torch.no_grad():  #disables gradient calculation (reduces memory usage and speeds up)
        for data, target in test_loader:  #loops through batches from the test dataset
            data, target = data.to(device), target.to(device)  #move batch to the specified device
            _synchronize(device)  #finish the transfer so it is not billed to the forward pass
            start_time = time.perf_counter()
            output = model(data)
            #accelerators may return before the computation is done; wait before stopping the clock
            _synchronize(device)
            total_inference_time += time.perf_counter() - start_time

            predicted = output.argmax(dim=1)  #class with the highest score for each sample
            total_correct += (predicted == target).sum().item()  #count of correct predictions
            total_samples += target.size(0)  #number of samples in the current batch
            num_batches += 1

    nan = float('nan')
    accuracy = total_correct / total_samples if total_samples else nan
    avg_inference_time = total_inference_time / num_batches if num_batches else nan
    #number of samples processed per second of measured forward-pass time
    throughput = total_samples / total_inference_time if total_inference_time > 0 else nan

    cpu_usage, memory_usage = get_system_metrics()

    print(f'Accuracy: {accuracy:.4f}, Average Inference Time: {avg_inference_time:.4f}s, Throughput: {throughput:.2f} samples/s, CPU Usage: {cpu_usage}%, Memory Usage: {memory_usage}%')
    return accuracy, avg_inference_time, throughput


## Run Experiment

In [8]:
#end-to-end workflow for one device: build the model, train it, then evaluate it
def run_experiment(device):
    """Train and evaluate a fresh SimpleNN on `device` using the module-level data loaders."""
    net = SimpleNN()
    #cross-entropy loss for the classification task, optimized with plain SGD (lr=0.01)
    loss_function = nn.CrossEntropyLoss()
    sgd = optim.SGD(net.parameters(), lr=0.01)

    #five training epochs followed by one pass over the test set
    train_model(net, train_loader, loss_function, sgd, device, num_epochs=5)
    evaluate_model(net, test_loader, device)

# Testing CPU

In [9]:
device = torch.device('cpu')
print(f"Running on {device}")


#run the experiment on the device announced above (CPU)
run_experiment(device)

Running on cpu
Epoch [1/5], Loss: 1.0473, Time: 2.37s, CPU Usage: 18.6%, Memory Usage: 83.8%
Epoch [2/5], Loss: 0.3819, Time: 2.11s, CPU Usage: 78.0%, Memory Usage: 83.8%
Epoch [3/5], Loss: 0.3256, Time: 2.13s, CPU Usage: 94.1%, Memory Usage: 84.1%
Epoch [4/5], Loss: 0.2956, Time: 2.14s, CPU Usage: 90.2%, Memory Usage: 83.5%
Epoch [5/5], Loss: 0.2710, Time: 2.15s, CPU Usage: 89.9%, Memory Usage: 83.3%
Total Training Time: 10.90s
Accuracy: 0.9241, Average Inference Time: 0.0001s, Throughput: 765998.98 samples/s, CPU Usage: 75.4%, Memory Usage: 83.7%


# Testing GPU

In [15]:
#use the MPS (Apple GPU) backend when it is available, otherwise fall back to the CPU
if torch.backends.mps.is_available():
    device = torch.device('mps')
    print(f"Running on {device}")
else:
    device = torch.device('cpu')
    print("MPS device not found, running on CPU")

#pass the selected device; a hard-coded 'mps' would fail on machines without MPS
run_experiment(device)

Running on mps
Epoch [1/5], Loss: 0.9877, Time: 3.90s, CPU Usage: 11.0%, Memory Usage: 84.9%
Epoch [2/5], Loss: 0.3814, Time: 3.31s, CPU Usage: 18.1%, Memory Usage: 84.5%
Epoch [3/5], Loss: 0.3263, Time: 3.29s, CPU Usage: 17.2%, Memory Usage: 83.8%
Epoch [4/5], Loss: 0.2946, Time: 3.35s, CPU Usage: 21.1%, Memory Usage: 84.2%
Epoch [5/5], Loss: 0.2707, Time: 3.34s, CPU Usage: 18.6%, Memory Usage: 84.8%
Total Training Time: 17.20s
Accuracy: 0.9236, Average Inference Time: 0.0002s, Throughput: 352506.95 samples/s, CPU Usage: 38.3%, Memory Usage: 85.3%


## OTHER METHOD:

In [2]:
#fetch MNIST (handwritten digits 0 to 9) through Keras
#x_* hold the images, y_* hold the matching labels
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

#divide by 255 so pixel values lie between 0 and 1 (helps the model train)
x_train = x_train / 255.0
x_test = x_test / 255.0


#The purpose of this model is to classify handwritten digit images from the MNIST dataset into one of the 10 digit classes (0-9)

#build the model
#the input shape is declared with an explicit Input object; passing input_shape= to the
#first layer is deprecated in recent Keras versions and triggers a warning
model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(28, 28)),
    tf.keras.layers.Flatten(),  #flatten the 28x28 images into a 1D array of 784 elements
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),  #dropout layer to reduce overfitting by dropping 20% of the input units
    tf.keras.layers.Dense(10)  #output layer with 10 neurons (one for each digit class), no activation
])

#compile the model
#from_logits=True because the last Dense layer has no softmax
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])

#function to train and evaluate the model on a specific device
def run_experiment(device_name):
    """Train and evaluate a fresh Keras model under `tf.device(device_name)`.

    Uses the module-level x_train/y_train, x_test/y_test and loss_fn.

    Args:
        device_name: TensorFlow device string, e.g. '/CPU:0' or '/GPU:0'.

    Returns:
        Tuple (training_time, evaluation): seconds spent in model.fit and the
        value returned by model.evaluate on the test data.
    """
    with tf.device(device_name):
        #rebuild the model for each device so every run starts from fresh weights
        #explicit Input object instead of the deprecated input_shape= layer argument
        model = tf.keras.models.Sequential([
            tf.keras.Input(shape=(28, 28)),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(10)
        ])
        model.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])

        #train the model for 5 epochs; perf_counter is monotonic and suited to timing
        start_time = time.perf_counter()
        model.fit(x_train, y_train, epochs=5)
        training_time = time.perf_counter() - start_time  #total training time in seconds

        #evaluate the model on the test data
        evaluation = model.evaluate(x_test, y_test, verbose=2)

        return training_time, evaluation

#CPU baseline
cpu_time, cpu_eval = run_experiment('/CPU:0')
print(f"CPU Training time: {cpu_time:.2f} seconds, Evaluation: {cpu_eval}")

#repeat on the GPU only when TensorFlow can see one
if not tf.config.list_physical_devices('GPU'):
    print("No GPU found")
else:
    gpu_time, gpu_eval = run_experiment('/GPU:0')
    print(f"GPU Training time: {gpu_time:.2f} seconds, Evaluation: {gpu_eval}")


  super().__init__(**kwargs)
2024-08-06 15:09:27.084739: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2024-08-06 15:09:27.084793: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-08-06 15:09:27.084797: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-08-06 15:09:27.085439: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-08-06 15:09:27.085476: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 582us/step - accuracy: 0.8652 - loss: 0.4686
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 586us/step - accuracy: 0.9564 - loss: 0.1510
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 570us/step - accuracy: 0.9664 - loss: 0.1099
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 585us/step - accuracy: 0.9743 - loss: 0.0848
Epoch 5/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 699us/step - accuracy: 0.9772 - loss: 0.0741
313/313 - 0s - 381us/step - accuracy: 0.9779 - loss: 0.0715
CPU Training time: 6.31 seconds, Evaluation: [0.07153843343257904, 0.9779000282287598]
Epoch 1/5


2024-08-06 15:09:34.311313: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.8561 - loss: 0.4903
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.9549 - loss: 0.1531
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.9664 - loss: 0.1094
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.9735 - loss: 0.0849
Epoch 5/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.9765 - loss: 0.0729
313/313 - 1s - 2ms/step - accuracy: 0.9768 - loss: 0.0734
GPU Training time: 40.91 seconds, Evaluation: [0.07337352633476257, 0.9768000245094299]
