# Imports

In [1]:
import sys, argparse, json
sys.path.append('..') #adjust based on your system's directory
import torch, time, os 
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm
import psutil, gc
import time, platform

## Utils

### Performance

In [2]:
def get_cpu_info():
    """Print and return basic facts about the host CPU and operating system.

    Returns:
        dict: ``processor``, ``architecture``, ``system`` and ``platform``
        entries, each holding the value reported by the ``platform`` module
        function of the same name.
    """
    # Query everything once so the printed report and the returned dict agree.
    info = {
        'processor': platform.processor(),
        'architecture': platform.architecture(),
        'system': platform.system(),
        'platform': platform.platform()
    }

    print("CPU Information:")
    report_rows = (
        ("Processor", 'processor'),
        ("Architecture", 'architecture'),
        ("System", 'system'),
        ("Platform", 'platform'),
    )
    for label, key in report_rows:
        print(f"{label}: {info[key]}")

    return info

# RAM Information
def get_ram_info():
    """Print and return the total physical memory of the machine.

    The value is derived from ``os.sysconf`` (page size times number of
    physical pages) and expressed in units of 1024**3 bytes.

    Returns:
        float | None: total memory in GB, or ``None`` when ``os.sysconf`` or
        either of the two required sysconf names is unavailable.
    """
    required_names = ('SC_PAGE_SIZE', 'SC_PHYS_PAGES')
    if not hasattr(os, 'sysconf'):
        return None
    if any(name not in os.sysconf_names for name in required_names):
        return None

    bytes_per_page = os.sysconf('SC_PAGE_SIZE')
    page_count = os.sysconf('SC_PHYS_PAGES')
    total_ram_gb = (bytes_per_page * page_count) / (1024 ** 3)
    print(f"Total memory (GB): {total_ram_gb:.2f}")
    return total_ram_gb

#Load Data
def load_data(data_path, device):
    """Deserialize the object saved at ``data_path`` with ``torch.load``.

    Args:
        data_path: path of the file written by ``torch.save``.
        device: passed to ``torch.load`` as ``map_location``.

    Returns:
        Whatever object the file contains.
    """
    # NOTE: weights_only=False allows arbitrary pickled objects to be
    # unpickled, so this must only be pointed at trusted files.
    load_options = {'map_location': device, 'weights_only': False}
    return torch.load(data_path, **load_options)

#Load Model
def load_model(model_path, device):
    """Load a pickled model and return it in evaluation mode.

    Checkpoints saved from inside a wrapper that exposes the real network as
    ``.module`` (e.g. ``torch.nn.DataParallel``) are unwrapped; a model that
    was saved without such a wrapper is used as is instead of failing with
    ``AttributeError``.

    Args:
        model_path: path of the file written by ``torch.save``.
        device: passed to ``torch.load`` as ``map_location``.

    Returns:
        The (unwrapped) model after ``.eval()`` has been called on it.
    """
    # NOTE: weights_only=False allows arbitrary pickled objects to be
    # unpickled, so this must only be pointed at trusted checkpoints.
    model = torch.load(model_path, map_location=device, weights_only=False)
    # Unwrap only when a wrapper is present; otherwise keep the model itself.
    model = getattr(model, 'module', model)
    return model.eval()

#Use DataLoader for iterating over batches
def data_loader(data, batch_size):
    """Wrap ``data`` in a ``DataLoader`` that yields ``batch_size``-sized batches.

    ``drop_last`` is disabled, so when the dataset size is not a multiple of
    ``batch_size`` the final, smaller batch is still yielded.
    """
    loader_options = {'batch_size': batch_size, 'drop_last': False}
    return DataLoader(data, **loader_options)

def get_process_memory_mb():
    """Return the resident set size (RSS) of the current process in MB.

    The RSS is read through ``psutil`` for the PID reported by ``os.getpid()``
    and converted from bytes by dividing by 1024 twice.
    """
    current_pid = os.getpid()
    rss_bytes = psutil.Process(current_pid).memory_info().rss
    return rss_bytes / 1024 / 1024

def get_model_memory_mb(model):
    """Return the memory occupied by a model's parameters, in MB.

    Each parameter contributes ``numel() * element_size()`` bytes, so the
    result is correct for any parameter dtype (fp16, bf16, fp32, fp64, ...)
    rather than assuming 4 bytes per value. Only tensors returned by
    ``model.parameters()`` are counted.

    Args:
        model: object exposing ``parameters()`` that yields tensors
            (e.g. a ``torch.nn.Module``).

    Returns:
        float: total parameter size in MB (1 MB = 1024 * 1024 bytes).
    """
    param_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
    return param_bytes / 1024 / 1024

### Size Calculator

In [3]:
import torch
from typing import Union

def get_tensor_size_in_mb(tensor: torch.Tensor) -> Union[float, str]:
    """
    Report how many megabytes (MB) the elements of a PyTorch tensor occupy.

    The size is the element count multiplied by the per-element byte width
    for the tensor's dtype, divided by 1024 * 1024.

    Args:
        tensor (torch.Tensor): The tensor to measure.

    Returns:
        Union[float, str]: The size in MB as a float. A descriptive string is
                         returned instead when the argument is not a
                         torch.Tensor or when the computation raises.
    """
    if isinstance(tensor, torch.Tensor):
        try:
            # numel() is the product of the dimensions; element_size() is the
            # byte width of one value (e.g. 4 for float32, 8 for float64).
            byte_count = tensor.numel() * tensor.element_size()
            return byte_count / (1024 * 1024)
        except Exception as err:
            # Errors are reported as text rather than propagated.
            return f"An error occurred: {err}"

    # Anything that is not a tensor is rejected with a message.
    return "Input must be a PyTorch tensor."

### InfiniteSampler

In [4]:
from torch.utils.data.sampler import Sampler
import distributed
from typing import Any, Optional
import itertools

def _get_torch_dtype(size: int) -> Any:
    """Pick the integer dtype used to store indices for ``size`` items.

    ``torch.int32`` is chosen while ``size`` does not exceed 2**31;
    ``torch.int64`` is used for anything larger.
    """
    if size > 2**31:
        return torch.int64
    return torch.int32

def _generate_randperm_indices(*, size: int, generator: torch.Generator):
    """Lazily yield the indices ``0 .. size-1`` in a random order.

    One random draw is taken from ``generator`` per yielded index, so the
    sequence is reproducible for a given generator state.

    Args:
        size: number of indices to permute.
        generator: source of randomness for the draws.

    Yields:
        int: the next index of the permutation.
    """
    # The original author notes this matches PyTorch's CPU randperm, see:
    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/TensorFactories.cpp#L900-L921
    indices = torch.arange(size, dtype=_get_torch_dtype(size))
    for position in range(size):
        # Draw a slot from the not-yet-emitted tail [position, size).
        picked = torch.randint(position, size, size=(1,), generator=generator).item()

        # Swap unconditionally, even when picked == position (a no-op swap).
        chosen = indices[picked].item()
        indices[picked] = indices[position].item()
        indices[position] = chosen
        yield chosen

class InfiniteSampler(Sampler):
    """Sampler that yields dataset indices forever.

    Every pass over the ``sample_count`` indices (sequential, or freshly
    permuted when ``shuffle`` is set) is strided: only the indices at
    positions ``start, start + step, start + 2 * step, ...`` of that pass are
    emitted. The first ``advance`` emitted indices are skipped.

    Args:
        sample_count: number of indices in one pass.
        shuffle: emit a random permutation per pass instead of ``0..n-1``.
        seed: seed of the generator used when shuffling.
        start: offset within each pass; when ``None`` it is taken from
            ``distributed.get_global_rank()``.
        step: stride within each pass; when ``None`` it is taken from
            ``distributed.get_global_size()``.
        advance: number of leading indices to discard from the stream.
    """

    def __init__(
        self,
        *,
        sample_count: int,
        shuffle: bool = False,
        seed: int = 0,
        start: Optional[int] = None,
        step: Optional[int] = None,
        advance: int = 0,
    ):
        self._sample_count = sample_count
        self._seed = seed
        self._shuffle = shuffle

        # Fall back to the values reported by the `distributed` module only
        # when the caller did not pin the offset / stride explicitly.
        if start is None:
            start = distributed.get_global_rank()
        self._start = start
        if step is None:
            step = distributed.get_global_size()
        self._step = step

        self._advance = advance

    def __iter__(self):
        index_stream = self._shuffled_iterator() if self._shuffle else self._iterator()
        # Drop the first `advance` indices, then stream without end.
        yield from itertools.islice(index_stream, self._advance, None)

    def _iterator(self):
        """Endlessly repeat the strided slice of ``range(sample_count)``."""
        assert not self._shuffle

        while True:
            one_pass = range(self._sample_count)
            yield from itertools.islice(one_pass, self._start, None, self._step)

    def _shuffled_iterator(self):
        """Endlessly emit the strided slice of a fresh permutation per pass."""
        assert self._shuffle

        # The generator is created here rather than in __init__ so that
        # instances stay picklable (needed for mp.spawn).
        rng = torch.Generator().manual_seed(self._seed)

        while True:
            one_pass = _generate_randperm_indices(size=self._sample_count, generator=rng)
            yield from itertools.islice(one_pass, self._start, None, self._step)

# Configs

In [5]:
# Benchmark settings used by the cells below.
device = 'cpu'  # where the data/model are mapped and inference runs
batch_size = 512  # samples per DataLoader batch

# Serialized inputs (paths are relative to the notebook's working directory).
data_path = '../../../raw_data/1.pt'
model_path = '../Fine_Tune_Model/Mixed_Inception_z_VITAE_Base_Img_Full_New_Full.pt'

In [6]:
# Print the host's CPU/OS details and total RAM before benchmarking.
get_cpu_info()
# Kept as the cell's last expression: besides the printed line, the notebook
# displays the returned total-memory value.
get_ram_info()

CPU Information:
Processor: x86_64
Architecture: ('64bit', 'ELF')
System: Linux
Platform: Linux-4.18.0-553.47.1.el8_10.x86_64-x86_64-with-glibc2.28
Total memory (GB): 376.53


376.5285186767578

# Run Inference

In [None]:
# Deserialize the fine-tuned model (returned in eval mode) and the dataset,
# both mapped onto the configured device.
model = load_model(model_path, device)
data = load_data(data_path, device)

# Third field of the full dataset slice, kept for reference.
# NOTE(review): presumably the ground-truth redshifts, judging by the name - confirm.
real_redshift = data[:][2]

In [8]:
# Number of samples = length of the dataset's first tensor along dim 0.
sample_count = data.tensors[0].size(0)

# Sequential (non-shuffled) index stream that never ends, starting from the
# very first index; `seed` only matters when shuffling is enabled.
sampler = InfiniteSampler(sample_count=sample_count, shuffle=False, seed=7, advance=0)

In [9]:
# Batches are drawn in the order dictated by the infinite sampler, loaded in
# the main process (no worker subprocesses).
dataloader = DataLoader(
    data,
    sampler=sampler,
    batch_size=batch_size,
    drop_last=False,
    num_workers=0,
)

In [10]:
first_tensor = data.tensors[0]
print(first_tensor.shape)

# Footprint of the whole dataset (all tensors) and the per-sample average.
total_size = sum(get_tensor_size_in_mb(tensor) for tensor in data.tensors)
mb_per_sample = total_size / len(first_tensor)
print(f'Total size {total_size:0.5f}, MB per sample {mb_per_sample:0.5f}')
# one sample is ~ 0.019573 MB in size

# Number of batches needed to cover 1 GB (1024 MB) of data, rounded up.
mb_per_batch = batch_size * mb_per_sample
batch_per_GB = int(np.ceil(1024 / mb_per_batch))
print(f'Batches per GB {batch_per_GB}')

torch.Size([51267, 5, 32, 32])
Total size 1003.45984, MB per sample 0.01957
Batches per GB 103


In [None]:
# CPU inference benchmark: stream batches through the model and, after every
# ~1 GB of input, record timing / memory / throughput to cpu/<GB>.json.
# A summary of all checkpoints is written to cpu/total.json at the end.
total_batches = batch_per_GB * 1024  # 1024 checkpoints of ~1 GB each (~1 TB)
total_time = 0.0  # wall-clock seconds elapsed at the latest checkpoint
total_data_bits = 0  # bits of image + magnitude input fed to the model so far
total = []  # one execution_info record per checkpoint

# The reports are written into cpu/; make sure it exists before the first dump.
os.makedirs('cpu', exist_ok=True)

start = time.perf_counter()
try:
    with torch.no_grad():
        # The loop variable is `batch`, not `data`, so the dataset object
        # bound to `data` is not overwritten by the last batch.
        for i, batch in tqdm(enumerate(dataloader), total=total_batches):
            image = batch[0].to(device)  # Image to device
            magnitude = batch[1].to(device)  # Magnitude to device

            _ = model([image, magnitude])  # Model inference (output discarded)

            # Size of this batch's inputs: bytes -> bits
            image_bits = image.element_size() * image.nelement() * 8
            magnitude_bits = magnitude.element_size() * magnitude.nelement() * 8
            total_data_bits += image_bits + magnitude_bits

            # Only report at ~1 GB boundaries.
            batches_done = i + 1
            if batches_done % batch_per_GB != 0:
                continue
            GB_processed = batches_done // batch_per_GB
            print(f'Processed {GB_processed} GB.')

            # Benchmark results, cumulative since `start`.
            num_samples = batches_done * batch_size
            total_time = time.perf_counter() - start
            total_process_mem = get_process_memory_mb()
            avg_time_batch = total_time / batches_done
            execution_info = {
                'total_execution_time (seconds)': total_time,
                'total_process_memory (MB)': total_process_mem,
                'num_batches': batches_done,   # Number of batches
                'batch_size': batch_size,   # Batch size
                'device': device,   # Selected device
                'GB_processed': GB_processed,
                # Average execution time per batch
                'execution_time_per_batch': avg_time_batch,
                # Throughput in bits per second over all batches so far
                'throughput_bps': total_data_bits / total_time,
                # Samples per second; stored as a plain number (a stray
                # trailing comma used to turn it into a 1-element tuple).
                'sample_persec': num_samples / total_time,
            }

            total.append(execution_info)
            with open(f'cpu/{GB_processed}.json', 'w') as f:
                json.dump(execution_info, f, indent=4)

            # Stop condition, otherwise the infinite sampler keeps running
            # forever. total_batches is a multiple of batch_per_GB, so the
            # final batch always reaches this check.
            if batches_done >= total_batches:
                break
finally:
    # Write the summary even when the (multi-day) run is interrupted, so the
    # checkpoints gathered so far are not lost.
    with open('cpu/total.json', 'w') as f:
        json.dump(total, f, indent=4)

  0%|          | 103/105472 [03:19<56:07:21,  1.92s/it]

Processed 1 GB.


  0%|          | 206/105472 [06:45<56:21:39,  1.93s/it]

Processed 2 GB.


  0%|          | 309/105472 [10:02<57:53:15,  1.98s/it]

Processed 3 GB.


  0%|          | 412/105472 [13:20<56:59:43,  1.95s/it]

Processed 4 GB.


  0%|          | 515/105472 [16:37<56:57:51,  1.95s/it]

Processed 5 GB.


  1%|          | 618/105472 [20:04<56:28:26,  1.94s/it]

Processed 6 GB.


  1%|          | 645/105472 [20:56<56:34:24,  1.94s/it]