# **Preparing**

Check the GPU provided (if GPU is not activated: Edit -> Notebook Settings -> Hardware Accelerator -> select GPU)

In [32]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"
# https://github.com/googlecolab/colabtools/issues/3409

In [14]:
!nvidia-smi

Sun Dec 31 08:39:10 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P0              27W /  70W |   1793MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [15]:
!nvcc -V

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


Download data

In [16]:
%cd /content/
!rm -rf *
!mkdir -p /content/src/build/
%cd /content/src/
!gdown -O data.zip 1Iey_nn0xAQBOUiq8Ib8FQYSmB7ouCTtu
!unzip data.zip
!rm data.zip

/content
/content/src
Downloading...
From: https://drive.google.com/uc?id=1Iey_nn0xAQBOUiq8Ib8FQYSmB7ouCTtu
To: /content/src/data.zip
100% 78.9M/78.9M [00:00<00:00, 234MB/s]
Archive:  data.zip
   creating: test/
  inflating: test/test.jpg           
  inflating: model.pth               
  inflating: model_jit.pth           
  inflating: paths.txt               


After completion, the directory structure includes:
```
- [src]
    - [build]: Torch C++ installation files
    - [test]: Sample images
    - paths.txt: Paths to the images
    - model_jit.pth: PyTorch model converted to TorchScript
```

#**PYTORCH**

In [17]:
%cd /content/src

/content/src


In [18]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torchvision.transforms as tf
from PIL import Image
import cv2
import pandas as pd

from time import time

In [19]:
BATCH_SIZE = 32
SCALE_SIZE = 300

In [20]:
def process_image(im):
    im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
    im = cv2.resize(im, (SCALE_SIZE, SCALE_SIZE), 0.0, 0.0, cv2.INTER_LINEAR)
    return im

class TestDataset:
    def __init__(self, paths_txt='paths.txt'):
        df = pd.read_csv(paths_txt, delimiter=' ', header=None)

        self.data = list(df[0])

        self.tf = tf.Compose([
            tf.ToTensor(),
            tf.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
        ])

    def __getitem__(self, i):
        im = cv2.imread(self.data[i])
        im = process_image(im)
        im = self.tf(im)
        return im

    def __len__(self):
        return len(self.data)

dataset = TestDataset()
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

In [None]:
device = torch.device('cuda')

script_model = torch.jit.load('model_jit.pth')
script_model = script_model.to(device)
script_model.eval()

In [22]:
# Warm up GPU
with torch.no_grad():
    for batch in dataloader:
        batch = batch.to(device)
        script_model(batch)

# Measure total time
t1 = time()
with torch.no_grad():
    for batch in dataloader:
        batch = batch.to(device)
        script_model(batch)
t2 = time()

print(f"Total time: {t2 - t1} second(s) (FPS: {len(dataloader.dataset) /(t2 - t1)})")

# Measure dataloading time
t1 = time()
with torch.no_grad():
    for batch in dataloader:
        batch = batch.to(device)
t2 = time()

print(f"Dataloading time: {t2 - t1} second(s) (FPS: {len(dataloader.dataset) /(t2 - t1)})")

# Measure inference time
with torch.no_grad():
    t = 0
    for batch in dataloader:
        batch = batch.to(device)
        t1 = time()
        script_model(batch)
        t2 = time()
        t += t2 - t1

print(f"Inference time: {t} second(s) (FPS: {len(dataloader.dataset) / t})")

Total time: 23.37158465385437 second(s) (FPS: 43.85667532522063)
Dataloading time: 10.550398349761963 second(s) (FPS: 97.15272978513896)
Inference time: 0.18727731704711914 second(s) (FPS: 5473.166831742411)


#**LIBTORCH**

In [34]:
%cd /content/src

/content/src


In [35]:
!sudo apt -qq install libgif-dev

libgif-dev is already the newest version (5.1.9-2build2).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.


In [36]:
!wget https://download.pytorch.org/libtorch/nightly/cu101/libtorch-cxx11-abi-shared-with-deps-latest.zip -q
!unzip -q libtorch-cxx11-abi-shared-with-deps-latest.zip

replace libtorch/lib/libasmjit.a? [y]es, [n]o, [A]ll, [N]one, [r]ename: A


In [48]:
%%writefile main.cpp
#include <torch/torch.h>
#include <torch/script.h>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>

#include <time.h>
#include <codecvt>
#include <locale>
#include <cstddef>
#include <cstdio>
#include <vector>
#include <chrono>
#include <fstream>
#include <iostream>
#include <string>

#define BATCH_SIZE 32
#define SCALE_SIZE 300

const at::Tensor MEAN = torch::tensor({0.485, 0.456, 0.406});
const at::Tensor STD = torch::tensor({0.229, 0.224, 0.225});

typedef std::pair<std::string, int> psi;

std::vector<psi> convert(std::string infile)
{
    std::fstream fi;
    fi.open(infile, std::fstream::in);

    std::string filename;
    int label;

    std::vector<psi> results;

    while (fi >> filename >> label)
    {
        results.push_back(psi(filename, label));
    }

    return results;
}

at::Tensor imageTransform(cv::Mat image)
{
    // Convert BGR to RGB
    cv::cvtColor(image, image, cv::COLOR_BGR2RGB);

    // Resize image
    cv::Mat resizedImage;
    cv::resize(image, resizedImage, cv::Size{SCALE_SIZE, SCALE_SIZE}, 0.0, 0.0, cv::INTER_LINEAR);

    // Convert to tensor
    at::Tensor image_tensor = torch::from_blob(resizedImage.data, {resizedImage.rows, resizedImage.cols, 3}, at::kByte);
    image_tensor = image_tensor / 255.0f;

    // Normalize
    image_tensor = (image_tensor - MEAN) / STD;

    // Transpose H, W, C -> C, H, W
    image_tensor = image_tensor.permute({2, 0, 1});

    return image_tensor;
}

class CustomDataset : public torch::data::Dataset<CustomDataset>
{
private:
    // Declare 2 vectors of tensors for images and labels
    std::vector<std::string> list_images; // list of path of images
public:
    // Constructor
    CustomDataset(std::string paths_txt)
    {
        std::vector<psi> paths = convert(paths_txt);
        for (int i = 0 ; i < paths.size(); i++)
        {
            list_images.push_back(paths.at(i).first);
        }
    };

    // Override get() function to return tensor at location index
    torch::data::Example<> get(size_t index) override
    {
        cv::Mat image = cv::imread(list_images.at(index), cv::IMREAD_ANYCOLOR);
        torch::Tensor sample_img = imageTransform(image);
        torch::Tensor sample_label = torch::zeros(1);
        return {sample_img, sample_label};
    };

    // Return the length of data
    torch::optional<size_t> size() const override
    {
        return list_images.size();
    };
};

int main(int argc, char **argv)
{
    auto custom_dataset = CustomDataset("../paths.txt").map(torch::data::transforms::Stack<>());
    auto dataset_size = custom_dataset.size().value();

    auto data_loader = torch::data::make_data_loader<torch::data::samplers::SequentialSampler>(
        std::move(custom_dataset),
        BATCH_SIZE
    );

    torch::Device device(torch::kCUDA);

    torch::jit::script::Module module;
    try
    {
        const std::string model_path{argv[1]};
        module = torch::jit::load(model_path);
        module.to(device);
        module.eval();
    }
    catch (const c10::Error &e)
    {
        std::cerr << "Error loading the model\n";
        std::cerr << e.msg();
        return -1;
    }

    torch::NoGradGuard no_grad_guard;

    // Warmup GPU(s)
    for (int i = 0; i < 2; i++) {
        for (torch::data::Example<>& batch: *data_loader) {
            torch::Tensor data = batch.data;                // Tensor B C W H
            std::vector<torch::jit::IValue> inputs;         // IValue
            inputs.push_back(data.to(device));              // Tensor2IValue
            module.forward(inputs).toTensor();
        }
    }

    auto t1 = std::chrono::system_clock::now();
    auto t2 = std::chrono::system_clock::now();
    auto total_time = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();

    // Measure total time
    t1 = std::chrono::system_clock::now();
    for (torch::data::Example<>& batch: *data_loader) {
        torch::Tensor data = batch.data;                // Tensor B C W H
        std::vector<torch::jit::IValue> inputs;         // IValue
        inputs.push_back(data.to(device));              // Tensor2IValue
        module.forward(inputs).toTensor();
    }
    t2 = std::chrono::system_clock::now();

    total_time = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
    std::cerr << "Total time: " << total_time * 1.0 / 1000 << " seconds";
    std::cerr << " (FPS: " << dataset_size / (total_time * 1.0 / 1000)  << ")" << std::endl;

    // Measure dataloading time
    t1 = std::chrono::system_clock::now();
    for (torch::data::Example<>& batch: *data_loader) {
        torch::Tensor data = batch.data;                // Tensor B C W H
        std::vector<torch::jit::IValue> inputs;         // IValue
        inputs.push_back(data.to(device));              // Tensor2IValue
    }
    t2 = std::chrono::system_clock::now();

    total_time = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
    std::cerr << "Dataloading time: " << total_time * 1.0 / 1000 << " seconds";
    std::cerr << " (FPS: " << dataset_size / (total_time * 1.0 / 1000)  << ")" << std::endl;

    // Measure model inference time
    total_time = 0;
    for (torch::data::Example<>& batch: *data_loader) {
        torch::Tensor data = batch.data;                // Tensor B C W H
        std::vector<torch::jit::IValue> inputs;         // IValue
        inputs.push_back(data.to(device));              // Tensor2IValue
        t1 = std::chrono::system_clock::now();
        module.forward(inputs).toTensor();
        t2 = std::chrono::system_clock::now();
        total_time += std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
    }
    std::cerr << "Inference time: " << total_time * 1.0 / 1000 << " seconds";
    std::cerr << " (FPS: " << dataset_size / (total_time * 1.0 / 1000)  << ")" << std::endl;

    return 0;
}

Writing main.cpp


In [49]:
%%writefile CMakeLists.txt
cmake_minimum_required(VERSION 3.1)
project(inference)
find_package(Torch REQUIRED)
find_package(OpenCV REQUIRED)
find_package(OpenCV COMPONENTS core imgproc highgui REQUIRED)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

include_directories(${OpenCV_INCLUDE_DIRS})
include_directories(${Torch_INCLUDE_DIRS})

add_executable(run main.cpp)
target_link_libraries(run ${TORCH_LIBRARIES} ${OpenCV_LIBRARIES})
target_compile_features(run PUBLIC cxx_range_for)
set_property(TARGET run PROPERTY CXX_STANDARD 14)

Overwriting CMakeLists.txt


In [51]:
%%shell
cd /content/src/build/
echo "$(pwd)"
cmake -DCMAKE_PREFIX_PATH=/content/src/libtorch ..

/content/src/build
  Compatibility with CMake < 3.5 will be removed from a future version of
  CMake.

  Update the VERSION argument <min> value or use a ...<max> suffix to tell
  CMake that the project does not need compatibility with older versions.

[0m
-- Caffe2: CUDA detected: 12.2
-- Caffe2: CUDA nvcc is: /usr/local/cuda/bin/nvcc
-- Caffe2: CUDA toolkit directory: /usr/local/cuda
-- Caffe2: Header version is: 12.2
-- Found cuDNN: v8.9.6  (include: /usr/include, library: /usr/lib/x86_64-linux-gnu/libcudnn.so)
  Failed to compute shorthash for libnvrtc.so
Call Stack (most recent call first):
  libtorch/share/cmake/Caffe2/Caffe2Config.cmake:88 (include)
  libtorch/share/cmake/Torch/TorchConfig.cmake:68 (find_package)
  CMakeLists.txt:4 (find_package)

[0m
-- Autodetected CUDA architecture(s):  7.5
-- Added CUDA NVCC flags for: -gencode;arch=compute_75,code=sm_75
-- Found OpenCV: /usr (found version "4.5.4") found components: core imgproc highgui 
-- Configuring done (0.3s)
-- Gene



In [52]:
%cd /content/src/build
!make -j4
%cd /content/src

/content/src/build
[ 50%] [32mBuilding CXX object CMakeFiles/run.dir/main.cpp.o[0m
[100%] [32m[1mLinking CXX executable run[0m
[100%] Built target run
/content/src


In [53]:
%cd /content/src/build
!./run ../model_jit.pth
%cd /content/src

/content/src/build
Total time: 8.466 seconds (FPS: 121.073)
Dataloading time: 8.461 seconds (FPS: 121.144)
Inference time: 0.147 seconds (FPS: 6972.79)
/content/src
