### To compare running the model on a CPU and GPU:

- uses PyTorch's 'torch.device' to switch which device is being used
- the CPU performs all computations for training (forward and backward passes) and inference

## Plan:
1. Model is defined by PyTorch's nn.Module
2. Data is loaded using DataLoader
3. Training: for each epoch, the model processes the training data in batches. For each batch, the data and target labels are moved to 'device', a forward pass computes the output, the loss is computed using 'CrossEntropyLoss', a backward pass computes the gradients, and the optimizer updates the model parameters
4. Inference: the model processes the test data in batches. For each batch, the data and target labels are moved to 'device' and the model performs a forward pass to compute predictions. In addition, accuracy, inference time, throughput, CPU usage and memory usage are calculated


# BEST Code to test both GPU and CPU

In [None]:
#imports required by this cell; without them a fresh kernel has no `tf` or `time` in scope
import time

import tensorflow as tf

#load and prepare the MNIST dataset
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
#divide the image arrays by 255.0 before feeding them to the network
x_train, x_test = x_train / 255.0, x_test / 255.0

#build the model
#an explicit Input layer replaces Flatten(input_shape=...), which Keras 3 warns about
model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(28, 28)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10)  #no activation: the layer outputs logits
])

#compile the model
#from_logits=True matches the activation-free final Dense layer
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])

#function to train and evaluate the model on a specific device
def run_experiment(device_name, epochs=5):
    """Train and evaluate a freshly built classifier under `tf.device(device_name)`.

    A new model is built on every call so that runs on different devices do
    not share weights or optimizer state.

    Args:
        device_name: TensorFlow device string, e.g. '/CPU:0' or '/GPU:0'.
        epochs: number of passes over the training data (default 5).

    Returns:
        A tuple (training_time, evaluation): the wall-clock seconds spent in
        model.fit, and the value returned by model.evaluate on the test set.
    """
    with tf.device(device_name):
        #explicit Input layer instead of Flatten(input_shape=...), which Keras 3 warns about
        model = tf.keras.models.Sequential([
            tf.keras.Input(shape=(28, 28)),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(10)
        ])
        #the loss is created here so the function does not rely on a global loss_fn
        model.compile(
            optimizer='adam',
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=['accuracy'],
        )

        #train the model; perf_counter is a monotonic clock meant for measuring intervals
        start_time = time.perf_counter()
        model.fit(x_train, y_train, epochs=epochs)
        training_time = time.perf_counter() - start_time

        #evaluate the model
        evaluation = model.evaluate(x_test, y_test, verbose=2)

        return training_time, evaluation

#baseline: train and evaluate with ops placed on the CPU
cpu_time, cpu_eval = run_experiment('/CPU:0')
print(f"CPU Training time: {cpu_time:.2f} seconds, Evaluation: {cpu_eval}")

#repeat on the first GPU only when TensorFlow reports at least one
gpu_devices = tf.config.list_physical_devices('GPU')
if not gpu_devices:
    print("No GPU found")
else:
    gpu_time, gpu_eval = run_experiment('/GPU:0')
    print(f"GPU Training time: {gpu_time:.2f} seconds, Evaluation: {gpu_eval}")


Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 583us/step - accuracy: 0.8614 - loss: 0.4786
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 595us/step - accuracy: 0.9540 - loss: 0.1542
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 720us/step - accuracy: 0.9670 - loss: 0.1107
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 765us/step - accuracy: 0.9735 - loss: 0.0870
Epoch 5/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 654us/step - accuracy: 0.9768 - loss: 0.0724
313/313 - 0s - 403us/step - accuracy: 0.9773 - loss: 0.0702
CPU Training time: 6.73 seconds, Evaluation: [0.07018803060054779, 0.9772999882698059]
Epoch 1/5
[1m 376/1875[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m6s[0m 4ms/step - accuracy: 0.7387 - loss: 0.8844

# Testing CPU

## Import Packages

In [2]:
#import the necessary packages
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import time
import psutil


## Data Preparation

In [3]:
#preprocessing pipeline applied to every image when it is read from the dataset
#ToTensor - converts PIL Image/ Numpy Arrays into PyTorch tensor
#Normalize - normalizes the tensor with mean 0.5 and standard deviation 0.5
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,), std=(0.5,)),
])

#MNIST train and test splits, downloaded into ./data when not already present
train_dataset, test_dataset = (
    datasets.MNIST(root='./data', train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)

#dataloaders serving 64 samples per batch
#only the training loader reshuffles at every epoch; the test loader keeps a fixed order
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)


## Define Model

In [4]:
#define a simple neural network
class SimpleNN(nn.Module):
    """Fully connected classifier: 784 -> 128 -> 64 -> 10 with ReLU in between.

    The forward pass returns one raw score per class (10 values per sample);
    no softmax or argmax is applied inside the model.
    """

    def __init__(self):
        super().__init__()
        #three linear layers; 28 * 28 input features, 10 output scores
        self.fc1 = nn.Linear(28 * 28, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)

    def forward(self, x):
        """Return the 10 class scores for each sample in the batch `x`."""
        #collapse every dimension after the batch dimension into one feature vector
        flat = x.flatten(start_dim=1)
        hidden1 = self.fc1(flat).relu()
        hidden2 = self.fc2(hidden1).relu()
        #final layer has no activation
        return self.fc3(hidden2)

## System Metrics Function

In [5]:
#function to get system metrics (cpu usage and memory)
def get_system_metrics(interval=None):
    """Return system-wide CPU and memory utilisation as percentages.

    Args:
        interval: forwarded to psutil.cpu_percent. With None (the default) the
            call returns immediately and reports CPU usage since the previous
            cpu_percent call, so the first reading in a session is not
            meaningful. Pass a number of seconds to block and sample over
            that window instead.

    Returns:
        A tuple (cpu_usage, memory_percent).
    """
    cpu_usage = psutil.cpu_percent(interval=interval)
    memory_percent = psutil.virtual_memory().percent
    return cpu_usage, memory_percent

## Training Loop

In [6]:
#function for training loop
#set number of epoch to 5
def train_model(model, train_loader, criterion, optimizer, device, num_epochs=5):
    """Train `model` on `device` and print per-epoch loss, time and system usage.

    Args:
        model: the nn.Module to train; it is moved to `device` in place.
        train_loader: iterable of (data, target) batches that supports len().
        criterion: loss function called as criterion(output, target).
        optimizer: optimizer already bound to the model's parameters.
        device: torch.device or device string such as 'cpu' or 'cuda'.
        num_epochs: number of passes over train_loader (default 5).

    Raises:
        ValueError: if train_loader yields no batches.
    """
    num_batches = len(train_loader)
    if num_batches == 0:
        #fail early with a clear message instead of dividing by zero after the first epoch
        raise ValueError('train_loader yields no batches')

    target_device = torch.device(device)
    uses_cuda = target_device.type == 'cuda'

    model.to(target_device)  #move model to the specified device
    model.train()  #sets model to training mode
    total_training_time = 0.0

    #discard one reading so the first epoch's CPU usage covers only that epoch
    #(the CPU percentage is measured relative to the previous call)
    get_system_metrics()

    for epoch in range(num_epochs):
        #perf_counter is a monotonic, high-resolution clock meant for intervals
        start_time = time.perf_counter()
        epoch_loss = 0.0
        for data, target in train_loader:
            data, target = data.to(target_device), target.to(target_device)
            optimizer.zero_grad()  #clears gradients left over from the previous batch
            output = model(data)  #forward pass
            loss = criterion(output, target)
            loss.backward()  #backward pass computes the gradients
            optimizer.step()  #updates model params
            epoch_loss += loss.item()

        if uses_cuda:
            #wait for queued GPU work so the timer covers the whole epoch
            torch.cuda.synchronize(target_device)
        total_epoch_time = time.perf_counter() - start_time
        total_training_time += total_epoch_time

        cpu_usage, memory_usage = get_system_metrics()

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss/num_batches:.4f}, Time: {total_epoch_time:.2f}s, CPU Usage: {cpu_usage}%, Memory Usage: {memory_usage}%')

    #print total training time after all epochs
    print(f'Total Training Time: {total_training_time:.2f}s')

## Inference Loop

In [7]:
#inference loop
#create function for evaluation, with model and test data as parameters
def evaluate_model(model, test_loader, device):
    """Run inference over `test_loader` on `device` and print summary metrics.

    Prints accuracy, average forward-pass time per batch, throughput in
    samples per second, and the current CPU and memory usage. Only the
    forward pass is timed; moving the batch to the device is excluded.

    Args:
        model: the nn.Module to evaluate; it is moved to `device` in place.
        test_loader: iterable of (data, target) batches that supports len().
        device: torch.device or device string such as 'cpu' or 'cuda'.

    Raises:
        ValueError: if test_loader yields no batches.
    """
    num_batches = len(test_loader)
    if num_batches == 0:
        #fail early with a clear message instead of dividing by zero below
        raise ValueError('test_loader yields no batches')

    target_device = torch.device(device)
    uses_cuda = target_device.type == 'cuda'

    model.to(target_device)  #move model to the specified device
    model.eval()  #sets model to evaluation mode

    #initialize metrics
    total_correct = 0
    total_samples = 0
    total_inference_time = 0.0

    with torch.no_grad():  #no gradients are needed for inference
        for data, target in test_loader:
            data, target = data.to(target_device), target.to(target_device)

            if uses_cuda:
                #make sure earlier GPU work is finished before starting the timer
                torch.cuda.synchronize(target_device)
            start_time = time.perf_counter()
            output = model(data)
            if uses_cuda:
                #CUDA kernels run asynchronously; wait so the forward pass is fully timed
                torch.cuda.synchronize(target_device)
            total_inference_time += time.perf_counter() - start_time

            predicted = output.argmax(dim=1)  #index of the highest score for each sample
            total_correct += (predicted == target).sum().item()
            total_samples += target.size(0)

    accuracy = total_correct / total_samples if total_samples else 0.0
    avg_inference_time = total_inference_time / num_batches
    #guard against a zero elapsed time on very small inputs
    throughput = total_samples / total_inference_time if total_inference_time > 0 else float('inf')

    cpu_usage, memory_usage = get_system_metrics()

    print(f'Accuracy: {accuracy:.4f}, Average Inference Time: {avg_inference_time:.4f}s, Throughput: {throughput:.2f} samples/s, CPU Usage: {cpu_usage}%, Memory Usage: {memory_usage}%')


## Run Experiment

In [8]:
#function to run the entire workflow on a specified device
def run_experiment(device):
    """Train a fresh SimpleNN on `device` for 5 epochs, then evaluate it.

    Uses the module-level train_loader and test_loader. Results are printed
    by train_model and evaluate_model; nothing is returned.
    """
    net = SimpleNN()
    #plain SGD with learning rate 0.01 over all of the network's parameters
    sgd = optim.SGD(net.parameters(), lr=0.01)
    #cross-entropy between the network's class scores and the integer labels
    cross_entropy = nn.CrossEntropyLoss()

    train_model(net, train_loader, cross_entropy, sgd, device, num_epochs=5)
    evaluate_model(net, test_loader, device)

In [9]:
#best device available to PyTorch on this machine (not used by the CPU run below)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


#run the experiment on CPU
#report the device that is actually used rather than the auto-detected one,
#so the message stays correct on machines that do have CUDA
cpu_device = 'cpu'
print(f"Running on {cpu_device}")
run_experiment(cpu_device)

Running on cpu
Epoch [1/5], Loss: 1.0167, Time: 2.28s, CPU Usage: 15.6%, Memory Usage: 84.4%
Epoch [2/5], Loss: 0.3849, Time: 2.12s, CPU Usage: 89.7%, Memory Usage: 84.3%
Epoch [3/5], Loss: 0.3249, Time: 2.08s, CPU Usage: 84.4%, Memory Usage: 84.0%
Epoch [4/5], Loss: 0.2922, Time: 2.09s, CPU Usage: 84.7%, Memory Usage: 84.3%
Epoch [5/5], Loss: 0.2660, Time: 2.09s, CPU Usage: 86.8%, Memory Usage: 84.1%
Total Training Time: 10.66s
Accuracy: 0.9298, Average Inference Time: 0.0001s, Throughput: 1009022.32 samples/s, CPU Usage: 81.5%, Memory Usage: 84.3%


In [10]:
# #run the experiment on GPU (if available)
# if torch.cuda.is_available():
#     print("Running on GPU:")
#     run_experiment('cuda')
# else:
#     print("CUDA is not available. Skipping GPU run.")


# Testing GPU

In [11]:
#import the necessary packages
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import time
import psutil


In [15]:
#pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116


Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu116
Collecting torchaudio
  Downloading torchaudio-2.4.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.4 kB)
INFO: pip is looking at multiple versions of torchaudio to determine which version is compatible with other requirements. This could take a while.
  Downloading torchaudio-2.3.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.4 kB)
Downloading torchaudio-2.3.1-cp311-cp311-macosx_11_0_arm64.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: torchaudio
Successfully installed torchaudio-2.3.1
Note: you may need to restart the kernel to use updated packages.


In [20]:
# !python -m pip install tensorflow-macos
# !python -m pip install tensorflow-metal



Collecting tensorflow-macos
  Downloading tensorflow_macos-2.16.2-cp311-cp311-macosx_12_0_arm64.whl.metadata (3.3 kB)
Collecting tensorflow==2.16.2 (from tensorflow-macos)
  Downloading tensorflow-2.16.2-cp311-cp311-macosx_12_0_arm64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow==2.16.2->tensorflow-macos)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow==2.16.2->tensorflow-macos)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=23.5.26 (from tensorflow==2.16.2->tensorflow-macos)
  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow==2.16.2->tensorflow-macos)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow==2.16.2->tensorflow-macos)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting h5py>=3.

In [22]:
#imports required by this cell; without them a fresh kernel has no `tf` in scope
import time

import tensorflow as tf

#load and prepare the MNIST dataset
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
#divide the image arrays by 255.0 before feeding them to the network
x_train, x_test = x_train / 255.0, x_test / 255.0

#build the model
#an explicit Input layer replaces Flatten(input_shape=...), which Keras 3 warns about
model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(28, 28)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10)  #no activation: the layer outputs logits
])

#compile the model
#from_logits=True matches the activation-free final Dense layer
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])

#train the model; perf_counter is a monotonic clock meant for measuring intervals
start_time = time.perf_counter()
model.fit(x_train, y_train, epochs=5)
end_time = time.perf_counter()

print(f"Training time: {end_time - start_time:.2f} seconds")

#evaluate the model (left as the last expression so the notebook displays the result)
model.evaluate(x_test, y_test, verbose=2)


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


  super().__init__(**kwargs)
2024-08-02 10:36:53.548946: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2024-08-02 10:36:53.548998: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-08-02 10:36:53.549003: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-08-02 10:36:53.550452: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-08-02 10:36:53.553078: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/5


2024-08-02 10:36:55.587642: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.8605 - loss: 0.4788
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.9562 - loss: 0.1473
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.9687 - loss: 0.1069
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.9728 - loss: 0.0881
Epoch 5/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.9757 - loss: 0.0783
Training time: 43.66 seconds
313/313 - 1s - 3ms/step - accuracy: 0.9775 - loss: 0.0721


[0.072133369743824, 0.9775000214576721]

# BEST Code to test both GPU and CPU

In [24]:
#imports required by this cell; without them a fresh kernel has no `tf` in scope
import time

import tensorflow as tf

#load and prepare the MNIST dataset
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
#divide the image arrays by 255.0 before feeding them to the network
x_train, x_test = x_train / 255.0, x_test / 255.0

#build the model
#an explicit Input layer replaces Flatten(input_shape=...), which Keras 3 warns about
model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(28, 28)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10)  #no activation: the layer outputs logits
])

#compile the model
#from_logits=True matches the activation-free final Dense layer
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])

#function to train and evaluate the model on a specific device
def run_experiment(device_name, epochs=5):
    """Train and evaluate a freshly built classifier under `tf.device(device_name)`.

    A new model is built on every call so that runs on different devices do
    not share weights or optimizer state.

    Args:
        device_name: TensorFlow device string, e.g. '/CPU:0' or '/GPU:0'.
        epochs: number of passes over the training data (default 5).

    Returns:
        A tuple (training_time, evaluation): the wall-clock seconds spent in
        model.fit, and the value returned by model.evaluate on the test set.
    """
    with tf.device(device_name):
        #explicit Input layer instead of Flatten(input_shape=...), which Keras 3 warns about
        model = tf.keras.models.Sequential([
            tf.keras.Input(shape=(28, 28)),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(10)
        ])
        #the loss is created here so the function does not rely on a global loss_fn
        model.compile(
            optimizer='adam',
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=['accuracy'],
        )

        #train the model; perf_counter is a monotonic clock meant for measuring intervals
        start_time = time.perf_counter()
        model.fit(x_train, y_train, epochs=epochs)
        training_time = time.perf_counter() - start_time

        #evaluate the model
        evaluation = model.evaluate(x_test, y_test, verbose=2)

        return training_time, evaluation

#baseline: train and evaluate with ops placed on the CPU
cpu_time, cpu_eval = run_experiment('/CPU:0')
print(f"CPU Training time: {cpu_time:.2f} seconds, Evaluation: {cpu_eval}")

#repeat on the first GPU only when TensorFlow reports at least one
gpu_devices = tf.config.list_physical_devices('GPU')
if not gpu_devices:
    print("No GPU found")
else:
    gpu_time, gpu_eval = run_experiment('/GPU:0')
    print(f"GPU Training time: {gpu_time:.2f} seconds, Evaluation: {gpu_eval}")


Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 590us/step - accuracy: 0.8573 - loss: 0.4895
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 590us/step - accuracy: 0.9546 - loss: 0.1546
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 607us/step - accuracy: 0.9676 - loss: 0.1034
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 596us/step - accuracy: 0.9740 - loss: 0.0852
Epoch 5/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 587us/step - accuracy: 0.9769 - loss: 0.0742
313/313 - 0s - 364us/step - accuracy: 0.9784 - loss: 0.0701
CPU Training time: 6.28 seconds, Evaluation: [0.07007426768541336, 0.9783999919891357]
Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.8588 - loss: 0.4843
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.9570 - loss: 0.1