In [None]:
# run the script initnotebook.py in the cuurent folder
# Error when run multiple times becasue the directory changed
%run init_notebook.py

In [None]:
import torch
from  torch.utils.data import DataLoader
import torch.nn as nn
from models.builder import EncoderDecoder as segmodel
from dataloader.cfg_defaults import get_cfg_defaults
from config_cityscapes import *
import os
from dataloader.cityscapes_dataloader import CityscapesDataset
from val_segformer_rgbonly import val_cityscape
import torch.nn.functional as F
from utils.visualize import unnormalize_img_numpy
import matplotlib.pyplot as plt
import time
import psutil

In [None]:
config_path = 'dataloader/cityscapes_rgbd_config.yaml'
config_path = os.path.join(projectFolder, config_path)

cfg = get_cfg_defaults()
cfg.merge_from_file(config_path)
cfg.freeze()

data_mean = [0.291,  0.329,  0.291]
data_std = [0.190,  0.190,  0.185]

In [None]:
cityscapes_test = CityscapesDataset(cfg, split='val')
test_loader = DataLoader(cityscapes_test, batch_size=1, shuffle=False, num_workers=4) # batchsize?
print(f'total test sample: {len(cityscapes_test)} v_iteration:{len(test_loader)}')


In [None]:
pretrained_model_path = './pretrained/model_400.pth'
criterion = nn.CrossEntropyLoss(reduction='mean', ignore_index=config.background)

model = segmodel(cfg=config, criterion=criterion, norm_layer=nn.BatchNorm2d, test=True)
model = nn.DataParallel(model, device_ids = config.device_ids)
model.to(f'cuda:{model.device_ids[0]}', non_blocking=True)

In [None]:
saved_model_path = os.path.join(projectFolder, pretrained_model_path)
print(saved_model_path)
# exit()
state_dict = torch.load(saved_model_path)
model.load_state_dict(state_dict['model'], strict=False)
print(f'model loaded')
epoch = state_dict['epoch']


In [29]:
def measure_latency_cpu_usage(model, test_inputs):
    process = psutil.Process()
    cpu_start = process.cpu_percent()
    start = time.time()
    predictions = model(test_inputs)
    end = time.time()
    cpu_end = process.cpu_percent()
    latency = end - start
    cpu_usage = cpu_end - cpu_start
    return latency, cpu_usage


In [33]:
model.eval()
nDatapoint = 100

total_time = 0
total_latency = 0

with torch.no_grad():
    for idx, sample in enumerate(test_loader):
        if idx == nDatapoint:
            break
        imgs = sample['image']  
        imgs = imgs.to(f'cuda:{model.device_ids[0]}', non_blocking=True)
        img = imgs[:, :, :, :1024]

        out = model(img)
        # measure latency and cpu usage
        latency, cpu_usage = measure_latency_cpu_usage(model, img)
        total_latency += latency
        total_time += cpu_usage

input:  torch.Size([1, 3, 1024, 1024])
############### Stage 1 ##########################
tokenization:  torch.Size([1, 65536, 64])
+++++++++ block +++++ input:  torch.Size([1, 65536, 64]) 256 256
!!!!!!!!!!!!attention head:  2  !!!!!!!!!!
torch.Size([1, 65536, 64]) 256 256
input: MLP  torch.Size([1, 65536, 64]) 256 256
+++++++++ block +++++ input:  torch.Size([1, 65536, 64]) 256 256
!!!!!!!!!!!!attention head:  2  !!!!!!!!!!
torch.Size([1, 65536, 64]) 256 256
input: MLP  torch.Size([1, 65536, 64]) 256 256
+++++++++ block +++++ input:  torch.Size([1, 65536, 64]) 256 256
!!!!!!!!!!!!attention head:  2  !!!!!!!!!!
torch.Size([1, 65536, 64]) 256 256
input: MLP  torch.Size([1, 65536, 64]) 256 256
output:  torch.Size([1, 64, 256, 256])
******** End Stage 1 **************
############### Stage 2 ##########################
tokenization:  torch.Size([1, 16384, 128])
+++++++++ block +++++ input:  torch.Size([1, 16384, 128]) 128 128
!!!!!!!!!!!!attention head:  4  !!!!!!!!!!
torch.Size([1, 16384

In [34]:
print(f'average latency: {total_latency/nDatapoint}')
print(f'average cpu usage: {total_time/nDatapoint}')

average latency: 0.08709349393844605
average cpu usage: 101.02699999999996


In [36]:
import time
import torch

def measure_gpu_throughput(model, inputs, batch_size):
    inputs = inputs.to('cuda')
    model = model.to('cuda')
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    with torch.no_grad():
        for i in range(0, inputs.size(0), batch_size):
            output = model(inputs[i:i + batch_size])
    end.record()
    torch.cuda.synchronize()
    latency = start.elapsed_time(end)
    throughput = inputs.size(0) * batch_size / latency
    return throughput

throughput = measure_gpu_throughput(model, img, 4)
print(f'GPU throughput: {throughput} images/s')

input:  torch.Size([1, 3, 1024, 1024])
############### Stage 1 ##########################
tokenization:  torch.Size([1, 65536, 64])
+++++++++ block +++++ input:  torch.Size([1, 65536, 64]) 256 256
!!!!!!!!!!!!attention head:  2  !!!!!!!!!!
torch.Size([1, 65536, 64]) 256 256
input: MLP  torch.Size([1, 65536, 64]) 256 256
+++++++++ block +++++ input:  torch.Size([1, 65536, 64]) 256 256
!!!!!!!!!!!!attention head:  2  !!!!!!!!!!
torch.Size([1, 65536, 64]) 256 256
input: MLP  torch.Size([1, 65536, 64]) 256 256
+++++++++ block +++++ input:  torch.Size([1, 65536, 64]) 256 256
!!!!!!!!!!!!attention head:  2  !!!!!!!!!!
torch.Size([1, 65536, 64]) 256 256
input: MLP  torch.Size([1, 65536, 64]) 256 256
output:  torch.Size([1, 64, 256, 256])
******** End Stage 1 **************
############### Stage 2 ##########################
tokenization:  torch.Size([1, 16384, 128])
+++++++++ block +++++ input:  torch.Size([1, 16384, 128]) 128 128
!!!!!!!!!!!!attention head:  4  !!!!!!!!!!
torch.Size([1, 16384