In [1]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torch import nn
import torch.nn.functional as F
import time

import numpy as np
import matplotlib.pyplot as plt

import psutil
import pynvml

from thop import profile
from thop import clever_format

In [2]:


# Load the MNIST dataset
transform = transforms.Compose([
    transforms.ToTensor(),  # Convert the image to a PyTorch tensor
    transforms.Normalize((0.1307,), (0.3081,))  # Normalize with the MNIST mean and std
])
train_data = datasets.MNIST(root='/scratch/joluseti/local_datasets', train=True, download=False, transform=transform)
test_data = datasets.MNIST(root='/scratch/joluseti/local_datasets', train=False, download=False, transform=transform)

# Create the dictionary
label_data_dict = {}
for data, label in train_data:
    if label not in label_data_dict:  # Add only one sample per label
        label_data_dict[label] = data
        # Break early if all labels are collected
        if len(label_data_dict) == 10:
            break



In [3]:
# For test data
test_data_dict = {}
for data, label in test_data:
    if label not in test_data_dict:
        test_data_dict[label] = []  # Initialize a list for each label
    test_data_dict[label].append(data)  # Append the data to the list

In [4]:
for label, data in test_data_dict.items():
    print(f"Label: {label}, Data shape: {len(data)}")

Label: 7, Data shape: 1028
Label: 2, Data shape: 1032
Label: 1, Data shape: 1135
Label: 0, Data shape: 980
Label: 4, Data shape: 982
Label: 9, Data shape: 1009
Label: 5, Data shape: 892
Label: 6, Data shape: 958
Label: 3, Data shape: 1010
Label: 8, Data shape: 974


In [5]:
test_loader = DataLoader(test_data, batch_size=256, shuffle=False)

In [6]:
image = label_data_dict[0].unsqueeze(0)
image.shape

torch.Size([1, 1, 28, 28])

In [7]:
class QuantLenet(nn.Module):
    def __init__(self):
        super().__init__()

        self.quant = torch.ao.quantization.QuantStub()

        self.conv1 = nn.Conv2d(in_channels = 1, out_channels = 6, kernel_size = 5, bias=False)
        self.pool = nn.MaxPool2d(kernel_size = 2, stride = 2) 
        self.conv2 = nn.Conv2d(in_channels = 6, out_channels = 16, kernel_size = 5, bias=False)
        self.fc1 = nn.Linear(in_features = 16 * 4 * 4, out_features = 120, bias = False)
        self.fc2 = nn.Linear(in_features = 120, out_features = 84, bias=False)
        self.fc3 = nn.Linear(in_features = 84, out_features = 10, bias=False)
        self.relu1 = nn.ReLU()
        self.relu2 = nn.ReLU()
        
        self.dequant = torch.ao.quantization.DeQuantStub()

    def forward(self, x):
        x = self.quant(x)
        x = self.relu1(self.conv1(x))
        x = self.pool(x)
        x = self.relu2(self.conv2(x))
        x = self.pool(x)
        y = x.reshape(-1, 16 * 4 * 4)
        x = x.reshape(-1, 16 * 4 * 4)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        x = self.dequant(x)

        return x, y

In [8]:
model_path = '/scratch/joluseti/618/model/float_model.pth'
float_model = torch.load(model_path, weights_only=False).to('cpu')
float_model.eval()

QuantLenet(
  (quant): QuantStub()
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1), bias=False)
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1), bias=False)
  (fc1): Linear(in_features=256, out_features=120, bias=False)
  (fc2): Linear(in_features=120, out_features=84, bias=False)
  (fc3): Linear(in_features=84, out_features=10, bias=False)
  (relu1): ReLU()
  (relu2): ReLU()
  (dequant): DeQuantStub()
)

In [9]:
# Load the float and quantized models
model = torch.jit.load('/scratch/joluseti/618/test/float.pth')

# Set both models to evaluation mode
model.eval()


RecursiveScriptModule(
  original_name=QuantLenet
  (quant): RecursiveScriptModule(original_name=QuantStub)
  (conv1): RecursiveScriptModule(original_name=Conv2d)
  (pool): RecursiveScriptModule(original_name=MaxPool2d)
  (conv2): RecursiveScriptModule(original_name=Conv2d)
  (fc1): RecursiveScriptModule(original_name=Linear)
  (fc2): RecursiveScriptModule(original_name=Linear)
  (fc3): RecursiveScriptModule(original_name=Linear)
  (relu1): RecursiveScriptModule(original_name=ReLU)
  (relu2): RecursiveScriptModule(original_name=ReLU)
  (dequant): RecursiveScriptModule(original_name=DeQuantStub)
)

In [10]:
output, _ = model(image)
output.shape

torch.Size([1, 10])

In [12]:
macs, params = profile(float_model, inputs=(image, ), verbose=False)
read_macs, read_params = clever_format([macs, params], "%.3f")
print(f"number of macs {macs}, number of parameters {params}")
print(f"number of macs {read_macs}, number of parameters {read_params}\n")

number of macs 281640.0, number of parameters 44190.0
number of macs 281.640K, number of parameters 44.190K



In [13]:
def count_parameters(model):
  """
  Counts the total number of trainable parameters in a PyTorch model.

  Args:
    model: A PyTorch nn.Module object.

  Returns:
    int: The total number of trainable parameters.
  """
  total_params = 0
  for param in model.parameters():
    if param.requires_grad:  # Only count trainable parameters
      num_params = 1
      for size in param.size():
        num_params *= size
      total_params += num_params
  return total_params

count_parameters(float_model)

44190

In [21]:
def get_gpu_info():
    UNIT = 1024 * 1024 * 1024
    pynvml.nvmlInit()
    gpuDeriveInfo = pynvml.nvmlSystemGetDriverVersion()

    gpuDeviceCount = pynvml.nvmlDeviceGetCount()
    #print("Number of GPU：", gpuDeviceCount )

    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    memoryInfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
    gpuTemperature = pynvml.nvmlDeviceGetTemperature(handle, 0)
    gpuEnergyUsage = pynvml.nvmlDeviceGetTotalEnergyConsumption(handle)
    gpuPowerState = pynvml.nvmlDeviceGetPowerState(handle)
    #gpuUtilRate = pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
    #gpuMemoryRate = pynvml.nvmlDeviceGetUtilizationRates(handle).memory
    print(f"memoryInfo.total: {memoryInfo.total/UNIT:.2f} GB;   memoryInfo.used: {memoryInfo.used/UNIT:.2f} GB;   memoryInfo.free: {memoryInfo.free/UNIT:.2f} GB;   gpuTemperature: {gpuTemperature} C;   gpuEnergyUsage: {gpuEnergyUsage}\n")


def get_current_energy():
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    gpuEnergyUsage = pynvml.nvmlDeviceGetTotalEnergyConsumption(handle)
    return gpuEnergyUsage

get_gpu_info()

memoryInfo.total: 80.00 GB;   memoryInfo.used: 1.06 GB;   memoryInfo.free: 78.94 GB;   gpuTemperature: 26 C;   gpuEnergyUsage: 1856224846297



In [18]:
device = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')
print(device)
print("GPU:",torch.cuda.get_device_name(0))

cuda:0
GPU: NVIDIA A100-SXM4-80GB MIG 1g.10gb


In [23]:
repeat = 200

def print_res(macs, total_time, total_energy):
    print(f"Average Energy (per inference) => {total_energy:.4f}")
    print(f"Average Time (per inference) => {total_time*1000:.4f}ms")
    print(f"GOPS/KJ => {((repeat*macs/(1024**3)) / (total_energy*(10**-3))):.4f}")
    print(f"GOPs => {((repeat*macs/(1024**3))/ total_time):.4f}")
    print("")

In [15]:
def benchmark_model(model, image, repeat):
    torch.cuda.empty_cache()

    model = model.to(device)
    input_tensor = image.to(device)

    model.eval()  # Set model to evaluation mode
    for _ in range(5):
        with torch.no_grad():
            model(input_tensor)

    total_time = 0
    total_energy = 0


    for _ in range(repeat):
      energy_start = get_current_energy()

      torch.cuda.synchronize()
      time_start = time.time()

      with torch.no_grad():
        output = model(input_tensor)

      torch.cuda.synchronize()
      time_end = time.time()

      energy_end = get_current_energy()

      total_time += (time_end - time_start)
      total_energy += (energy_end - energy_start)

    return total_time / repeat, total_energy / repeat

In [24]:
print(f"Initial GPU info before benchmarking model:")
get_gpu_info()
torch.cuda.empty_cache()

model.cuda()

print(f"Benchmarking model ...\n")

#torch.cuda.empty_cache()

total_time, total_energy = benchmark_model(model, image, repeat)

print_res(macs, total_time, total_energy)

print(f"GPU info after benchmarking model:")
get_gpu_info()

Initial GPU info before benchmarking model:
memoryInfo.total: 80.00 GB;   memoryInfo.used: 1.06 GB;   memoryInfo.free: 78.94 GB;   gpuTemperature: 25 C;   gpuEnergyUsage: 1856282772501

Benchmarking model ...

Average Energy (per inference) => 315.6650
Average Time (per inference) => 0.2017ms
GOPS/KJ => 0.1662
GOPs => 260.1334

GPU info after benchmarking model:
memoryInfo.total: 80.00 GB;   memoryInfo.used: 1.06 GB;   memoryInfo.free: 78.94 GB;   gpuTemperature: 25 C;   gpuEnergyUsage: 1856282859335



In [25]:
total_energy / total_time

1565301.9154450016

In [27]:
UNIT = 1024 * 1024 * 1024
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def get_gpu_info(handle):
    memoryInfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
    gpuTemperature = pynvml.nvmlDeviceGetTemperature(handle, 0)
    gpuEnergyUsage = pynvml.nvmlDeviceGetTotalEnergyConsumption(handle)
    print(f"memoryInfo.total: {memoryInfo.total/UNIT:.2f} GB;   memoryInfo.used: {memoryInfo.used/UNIT:.2f} GB;   memoryInfo.free: {memoryInfo.free/UNIT:.2f} GB;   gpuTemperature: {gpuTemperature} C;   gpuEnergyUsage: {gpuEnergyUsage}\n")


def get_current_energy(handle):
    gpuEnergyUsage = pynvml.nvmlDeviceGetTotalEnergyConsumption(handle)
    return gpuEnergyUsage

def print_res(macs, total_time, total_energy):
    power_watts = total_energy / total_time
    print(f"Average Energy (per inference) => {total_energy:.4f} Joules")
    print(f"Average Time (per inference) => {total_time*1000:.4f}ms")
    print(f"Average Power (per inference) => {power_watts:.4f} Watts")
    print(f"GOPS/W => {(repeat*macs/(1000**3)) / (total_energy*(10**-3)/total_time):.4f}") # fixed to divide by (1000**3)
    print(f"GOPs => {((repeat*macs/(1000**3))/ total_time):.4f}") # fixed to divide by (1000**3)
    print("")



def benchmark_model(model, image, repeat, handle):
    torch.cuda.empty_cache()

    model = model.to(device)
    input_tensor = image.to(device)

    model.eval()  # Set model to evaluation mode

    for _ in range(5): # warm up run
        with torch.no_grad():
          model(input_tensor)

    total_time = 0
    total_energy = 0

    for _ in range(repeat):
      energy_start = get_current_energy(handle)

      torch.cuda.synchronize()
      time_start = time.time()

      with torch.no_grad():
          output = model(input_tensor)

      torch.cuda.synchronize()
      time_end = time.time()

      energy_end = get_current_energy(handle)

      total_time += (time_end - time_start)
      total_energy += (energy_end - energy_start)

    return total_time / repeat, total_energy / repeat

In [29]:
pynvml.nvmlInit() # call init once
handle = pynvml.nvmlDeviceGetHandleByIndex(0) # get the handle
print(f"Initial GPU info before benchmarking model:")
get_gpu_info(handle)
torch.cuda.empty_cache()

model = model.to(device)

image = image.to(device)
print(f"Benchmarking model ...\n")


total_time, total_energy = benchmark_model(model, image, repeat, handle)

print_res(macs, total_time, total_energy)

print(f"GPU info after benchmarking model:")
get_gpu_info(handle)
pynvml.nvmlShutdown() # shutdown the device

Initial GPU info before benchmarking model:
memoryInfo.total: 80.00 GB;   memoryInfo.used: 1.06 GB;   memoryInfo.free: 78.94 GB;   gpuTemperature: 25 C;   gpuEnergyUsage: 1856353235037

Benchmarking model ...

Average Energy (per inference) => 275.4300 Joules
Average Time (per inference) => 0.2030ms
Average Power (per inference) => 1356979.2568 Watts
GOPS/W => 0.0000
GOPs => 277.5149

GPU info after benchmarking model:
memoryInfo.total: 80.00 GB;   memoryInfo.used: 1.06 GB;   memoryInfo.free: 78.94 GB;   gpuTemperature: 25 C;   gpuEnergyUsage: 1856353321896

