# **Chuẩn bị**

Kiểm tra GPU được cung cấp (nếu chưa mở GPU: Edit -> Notebook Settings -> Hardware Accelerator chọn GPU)

In [None]:
!nvidia-smi

Fri Jan  8 03:48:58 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.27.04    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   65C    P8    10W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!nvcc -V

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


Tải các tập tin cần thiết và chuẩn bị cấu trúc thư mục.

In [None]:
# Start from an empty /content workspace.
# NOTE(review): `rm -rf *` deletes everything in the current directory, so it is
# only safe when the preceding `%cd /content/` has succeeded.
%cd /content/
!rm -rf *
# Create the source tree together with the build directory used by the CMake cells.
!mkdir -p /content/src/build/
%cd /content/src/

# Download the data archive from Google Drive by file id and unzip it in place.
from google_drive_downloader import GoogleDriveDownloader as gdd
gdd.download_file_from_google_drive(file_id='1Iey_nn0xAQBOUiq8Ib8FQYSmB7ouCTtu',
                                    dest_path='./data.zip',
                                    unzip=True)
# The archive itself is not needed once it has been extracted.
!rm data.zip

/content
/content/src
Downloading 1Iey_nn0xAQBOUiq8Ib8FQYSmB7ouCTtu into ./data.zip... Done.
Unzipping...Done.


Sau khi hoàn tất, cấu trúc thư mục bao gồm:
- [src]:
    - [build]: các tập tin cài đặt của Torch C++
    - [test]: các ảnh mẫu
    - paths.txt: đường dẫn đến các ảnh (mỗi dòng gồm đường dẫn ảnh và nhãn, cách nhau bởi một khoảng trắng)
    - model_jit.pth: mô hình PyTorch đã được chuyển đổi sang TorchScript

# **PYTORCH**

In [None]:
%cd /content/src

/content/src


Import các thư viện cần thiết

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torchvision.transforms as tf
from PIL import Image
import cv2
import pandas as pd

from time import time

In [None]:
# Number of images per batch passed to the DataLoader.
BATCH_SIZE = 32
# Side length, in pixels, of the square every image is resized to.
SCALE_SIZE = 300

In [None]:
def process_image(im):
    """Convert a BGR image to RGB and resize it to SCALE_SIZE x SCALE_SIZE.

    Args:
        im: Image array in BGR channel order, as returned by ``cv2.imread``.

    Returns:
        The RGB image resized to ``(SCALE_SIZE, SCALE_SIZE)`` using bilinear
        interpolation.
    """
    im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
    # The interpolation flag is passed by keyword: in the Python binding the
    # third positional parameter of cv2.resize is `dst`, so the positional form
    # (0.0, 0.0, INTER_LINEAR) would fill dst/fx/fy instead of fx/fy/interpolation.
    im = cv2.resize(im, (SCALE_SIZE, SCALE_SIZE), interpolation=cv2.INTER_LINEAR)
    return im

class TestDataset:
    """Map-style dataset that loads and preprocesses the images listed in a text file.

    The listing is read as a space-delimited table without a header; only the
    first column (the image path) is used.
    """

    def __init__(self, paths_txt='paths.txt'):
        """Read the image paths and build the tensor transform.

        Args:
            paths_txt: Path to the space-delimited listing file.
        """
        df = pd.read_csv(paths_txt, delimiter=' ', header=None)

        # Column 0 holds the image paths; any further columns are ignored.
        self.data = list(df[0])

        # ToTensor scales pixels to [0, 1] and moves channels first; Normalize
        # then standardizes each channel with the given mean and std.
        self.tf = tf.Compose([
            tf.ToTensor(),
            tf.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
        ])

    def __getitem__(self, i):
        """Load, preprocess and return the image at index ``i`` as a tensor.

        Raises:
            OSError: If the image file is missing or cannot be decoded.
        """
        path = self.data[i]
        im = cv2.imread(path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than deep inside cvtColor.
        if im is None:
            raise OSError(f"Could not read image: {path}")
        im = process_image(im)
        im = self.tf(im)
        return im

    def __len__(self):
        """Return the number of listed images."""
        return len(self.data)

# Build the dataset from the default listing file ('paths.txt').
dataset = TestDataset()
# shuffle=False keeps the listing order; num_workers=0 loads batches in the main process.
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

In [None]:
# Run the benchmark on the CUDA device (assumes a GPU runtime is enabled).
device = torch.device('cuda')

# Load the TorchScript model, move it to the GPU and switch it to evaluation mode.
script_model = torch.jit.load('model_jit.pth')
script_model = script_model.to(device)
# eval() returns the module; as the last expression of the cell its structure is displayed.
script_model.eval()

RecursiveScriptModule(
  original_name=BaseModel
  (extractor): RecursiveScriptModule(
    original_name=EfficientNetExtractor
    (extractor): RecursiveScriptModule(
      original_name=EfficientNet
      (_conv_stem): RecursiveScriptModule(
        original_name=Conv2dStaticSamePadding
        (static_padding): RecursiveScriptModule(original_name=ZeroPad2d)
      )
      (_bn0): RecursiveScriptModule(original_name=BatchNorm2d)
      (_blocks): RecursiveScriptModule(
        original_name=ModuleList
        (0): RecursiveScriptModule(
          original_name=MBConvBlock
          (_depthwise_conv): RecursiveScriptModule(
            original_name=Conv2dStaticSamePadding
            (static_padding): RecursiveScriptModule(original_name=ZeroPad2d)
          )
          (_bn1): RecursiveScriptModule(original_name=BatchNorm2d)
          (_se_reduce): RecursiveScriptModule(
            original_name=Conv2dStaticSamePadding
            (static_padding): RecursiveScriptModule(original_name=I

In [None]:
# CUDA kernels are launched asynchronously: a forward call returns as soon as the
# work is queued. Every timer below is therefore read only after
# torch.cuda.synchronize(), otherwise the GPU time is either not counted at all
# or billed to whichever later call happens to block.

# Warm up GPU(s)
with torch.no_grad():
    for batch in dataloader:
        batch = batch.to(device)
        script_model(batch)
torch.cuda.synchronize()  # make sure warm-up work does not leak into the first measurement

# Measure total time
t1 = time()
with torch.no_grad():
    for batch in dataloader:
        batch = batch.to(device)
        script_model(batch)
torch.cuda.synchronize()  # wait for all queued forward passes to finish
t2 = time()

print(f"Total time: {t2 - t1} second(s) (FPS: {len(dataloader.dataset) /(t2 - t1)})")

# Measure dataloading time
t1 = time()
with torch.no_grad():
    for batch in dataloader:
        batch = batch.to(device)
torch.cuda.synchronize()  # include any host-to-device copies that are still pending
t2 = time()

print(f"Dataloading time: {t2 - t1} second(s) (FPS: {len(dataloader.dataset) /(t2 - t1)})")

# Measure inference time
with torch.no_grad():
    t = 0
    for batch in dataloader:
        batch = batch.to(device)
        torch.cuda.synchronize()  # the upload must not be billed to the forward pass
        t1 = time()
        script_model(batch)
        torch.cuda.synchronize()  # stop the clock only when the forward pass has really finished
        t2 = time()
        t += t2 - t1

print(f"Inference time: {t} second(s) (FPS: {len(dataloader.dataset) / t})")

Total time: 11.899887561798096 second(s) (FPS: 86.13526763820283)
Dataloading time: 11.577296495437622 second(s) (FPS: 88.53535023517206)
Inference time: 0.2382047176361084 second(s) (FPS: 4303.021410204954)


# **LIBTORCH**

In [None]:
%cd /content/src

/content/src


In [None]:
# Install the GIF development package system-wide.
# NOTE(review): presumably a build/link-time dependency of the C++ program below — confirm it is still required.
!sudo apt -qq install libgif-dev

The following NEW packages will be installed:
  libgif-dev
0 upgraded, 1 newly installed, 0 to remove and 16 not upgraded.
Need to get 20.6 kB of archives.
After this operation, 97.3 kB of additional disk space will be used.
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package libgif-dev.
(Reading database ... 145483 files and directories currently installed.)
Preparing to unpack .../libgif-dev_5.1.4-2ubuntu0.1_amd64.deb ...
Unpacking libgif-dev (5.1.4-2ubuntu0.1) ...
Setting up libgif-dev (5.1.4-2ubuntu0.1) ...


In [None]:
# Download and unpack the LibTorch C++ distribution (CUDA 10.1, cxx11 ABI, shared libraries with dependencies).
# NOTE(review): this is the "latest" nightly build, so the version changes over time and
# results may not be reproducible — consider pinning a release archive.
!wget https://download.pytorch.org/libtorch/nightly/cu101/libtorch-cxx11-abi-shared-with-deps-latest.zip -q
!unzip -q libtorch-cxx11-abi-shared-with-deps-latest.zip

In [None]:
%%shell
# Configure the project from an empty build directory, pointing CMake at the unpacked libtorch.
# NOTE(review): this needs /content/src/CMakeLists.txt and main.cpp to exist already, but the
# cells that write them come later in the notebook — confirm the execution order on a fresh run.
cd /content/src/build/
# Remove any previous CMake cache and build artifacts.
rm -rf  *
cmake -DCMAKE_PREFIX_PATH=/content/src/libtorch ..

-- The C compiler identification is GNU 7.5.0
-- The CXX compiler identification is GNU 7.5.0
-- Check for working C compiler: /usr/bin/cc
-- Check for working C compiler: /usr/bin/cc -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Detecting C compile features
-- Detecting C compile features - done
-- Check for working CXX compiler: /usr/bin/c++
-- Check for working CXX compiler: /usr/bin/c++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Looking for pthread.h
-- Looking for pthread.h - found
-- Looking for pthread_create
-- Looking for pthread_create - not found
-- Looking for pthread_create in pthreads
-- Looking for pthread_create in pthreads - not found
-- Looking for pthread_create in pthread
-- Looking for pthread_create in pthread - found
-- Found Threads: TRUE  
-- Found CUDA: /usr/local/cuda (found version "10.1") 
-- Caffe2:



In [None]:
%%writefile main.cpp
#include <torch/torch.h>
#include <torch/script.h>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>

#include <time.h>
#include <chrono>
#include <codecvt>
#include <cstddef>
#include <cstdio>
#include <fstream>
#include <iostream>
#include <locale>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

// Number of images per batch handed to the data loader.
#define BATCH_SIZE 32
// Side length, in pixels, of the square every image is resized to.
#define SCALE_SIZE 300

// Per-channel mean and standard deviation used by imageTransform to normalize
// images (applied along the last, channel, dimension after the BGR -> RGB conversion).
const at::Tensor MEAN = torch::tensor({0.485, 0.456, 0.406});
const at::Tensor STD = torch::tensor({0.229, 0.224, 0.225});

// (image path, integer label) pair as stored in the listing file.
typedef std::pair<std::string, int> psi;

/**
 * Parse a whitespace-separated "<path> <label>" listing file.
 *
 * Reading stops at end of file or at the first entry that cannot be parsed as
 * a path followed by an integer label.
 *
 * @param infile Path of the listing file.
 * @return The (path, label) pairs in file order.
 * @throws std::runtime_error if the file cannot be opened; previously this was
 *         reported as an empty list, which made a wrong path look like an
 *         empty dataset.
 */
std::vector<psi> convert(std::string infile)
{
    std::ifstream fi(infile);
    if (!fi.is_open())
    {
        throw std::runtime_error("Cannot open paths file: " + infile);
    }

    std::vector<psi> results;
    std::string filename;
    int label;

    while (fi >> filename >> label)
    {
        results.emplace_back(filename, label);
    }

    return results;
}

/**
 * Turn a BGR image into a normalized float tensor in C, H, W layout.
 *
 * Steps: BGR -> RGB, resize to SCALE_SIZE x SCALE_SIZE (bilinear), scale to
 * [0, 1], normalize with MEAN / STD per channel, permute H, W, C -> C, H, W.
 *
 * @param image Input image; assumed to hold 8-bit BGR pixels as produced by
 *              cv::imread — TODO confirm for other callers.
 * @return Normalized tensor of shape {3, SCALE_SIZE, SCALE_SIZE}.
 * @throws c10::Error if the image is empty.
 */
at::Tensor imageTransform(cv::Mat image)
{
    TORCH_CHECK(!image.empty(), "imageTransform: received an empty image");

    // Convert BGR to RGB. cv::Mat copies share their pixel storage, so the
    // result goes into a separate matrix instead of overwriting the caller's buffer.
    cv::Mat rgbImage;
    cv::cvtColor(image, rgbImage, cv::COLOR_BGR2RGB);

    // Resize image
    cv::Mat resizedImage;
    cv::resize(rgbImage, resizedImage, cv::Size{SCALE_SIZE, SCALE_SIZE}, 0.0, 0.0, cv::INTER_LINEAR);

    // Convert to tensor. from_blob only wraps resizedImage's memory without
    // owning it; the division right below produces a new float tensor with its
    // own storage, so nothing refers to the cv::Mat once this function returns.
    at::Tensor image_tensor = torch::from_blob(resizedImage.data, {resizedImage.rows, resizedImage.cols, 3}, at::kByte);
    image_tensor = image_tensor / 255.0f;

    // Normalize (MEAN and STD broadcast over the trailing channel dimension)
    image_tensor = (image_tensor - MEAN) / STD;

    // Transpose H, W, C -> C, H, W
    image_tensor = image_tensor.permute({2, 0, 1});

    return image_tensor;
}

/**
 * Dataset that lazily loads and preprocesses the images named in a listing file.
 *
 * Only the image paths of the listing are kept; every example carries a dummy
 * zero label because the benchmark does not use labels.
 */
class CustomDataset : public torch::data::Dataset<CustomDataset>
{
private:
    std::vector<std::string> list_images; // image paths, in listing order
public:
    /**
     * @param paths_txt Path of the "<path> <label>" listing file.
     */
    CustomDataset(std::string paths_txt)
    {
        const std::vector<psi> paths = convert(paths_txt);
        list_images.reserve(paths.size());
        for (const psi &entry : paths)
        {
            list_images.push_back(entry.first);
        }
    }

    /**
     * Load the image at `index` and return it as a preprocessed tensor.
     *
     * @throws std::out_of_range if `index` is past the end of the listing.
     * @throws c10::Error if the image cannot be read.
     */
    torch::data::Example<> get(size_t index) override
    {
        const std::string &path = list_images.at(index);
        // IMREAD_COLOR always decodes to 3-channel BGR (like cv2.imread's
        // default), whereas IMREAD_ANYCOLOR may return a 1-channel image that
        // the BGR -> RGB conversion and the 3-channel tensor view cannot handle.
        cv::Mat image = cv::imread(path, cv::IMREAD_COLOR);
        // imread reports failure with an empty matrix rather than an exception.
        TORCH_CHECK(!image.empty(), "Could not read image: ", path);

        torch::Tensor sample_img = imageTransform(image);
        torch::Tensor sample_label = torch::zeros(1); // placeholder label
        return {sample_img, sample_label};
    }

    // Return the number of listed images
    torch::optional<size_t> size() const override
    {
        return list_images.size();
    }
};

int main(int argc, char **argv)
{
    auto custom_dataset = CustomDataset("../paths.txt").map(torch::data::transforms::Stack<>());
    auto dataset_size = custom_dataset.size().value();

    auto data_loader = torch::data::make_data_loader<torch::data::samplers::SequentialSampler>(
        std::move(custom_dataset),
        BATCH_SIZE
    );

    torch::Device device(torch::kCUDA);

    torch::jit::script::Module module;
    try
    {
        const std::string model_path{argv[1]};
        module = torch::jit::load(model_path);
        module.to(device);
        module.eval();
    }
    catch (const c10::Error &e)
    {
        std::cerr << "Error loading the model\n";
        std::cerr << e.msg();
        return -1;
    }
    
    torch::NoGradGuard no_grad_guard;
 
    // Warmup GPU(s)
    for (int i = 0; i < 2; i++) {
        for (torch::data::Example<>& batch: *data_loader) {
            torch::Tensor data = batch.data;                // Tensor B C W H
            std::vector<torch::jit::IValue> inputs;         // IValue 
            inputs.push_back(data.to(device));              // Tensor2IValue
            module.forward(inputs).toTensor();
        }
    }
 
    auto t1 = std::chrono::system_clock::now();
    auto t2 = std::chrono::system_clock::now();
    auto total_time = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
 
    // Measure total time
    t1 = std::chrono::system_clock::now();
    for (torch::data::Example<>& batch: *data_loader) {
        torch::Tensor data = batch.data;                // Tensor B C W H
        std::vector<torch::jit::IValue> inputs;         // IValue 
        inputs.push_back(data.to(device));              // Tensor2IValue
        module.forward(inputs).toTensor();
    }
    t2 = std::chrono::system_clock::now();
 
    total_time = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
    std::cerr << "Total time: " << total_time * 1.0 / 1000 << " seconds";
    std::cerr << " (FPS: " << dataset_size / (total_time * 1.0 / 1000)  << ")" << std::endl;
 
    // Measure dataloading time
    t1 = std::chrono::system_clock::now();
    for (torch::data::Example<>& batch: *data_loader) {
        torch::Tensor data = batch.data;                // Tensor B C W H
        std::vector<torch::jit::IValue> inputs;         // IValue 
        inputs.push_back(data.to(device));              // Tensor2IValue
    }
    t2 = std::chrono::system_clock::now();
 
    total_time = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
    std::cerr << "Dataloading time: " << total_time * 1.0 / 1000 << " seconds";
    std::cerr << " (FPS: " << dataset_size / (total_time * 1.0 / 1000)  << ")" << std::endl;
 
    // Measure model inference time
    total_time = 0;
    for (torch::data::Example<>& batch: *data_loader) {
        torch::Tensor data = batch.data;                // Tensor B C W H
        std::vector<torch::jit::IValue> inputs;         // IValue 
        inputs.push_back(data.to(device));              // Tensor2IValue
        t1 = std::chrono::system_clock::now();
        module.forward(inputs).toTensor();
        t2 = std::chrono::system_clock::now();
        total_time += std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
    }
    std::cerr << "Inference time: " << total_time * 1.0 / 1000 << " seconds";
    std::cerr << " (FPS: " << dataset_size / (total_time * 1.0 / 1000)  << ")" << std::endl;

    return 0;
}

Overwriting main.cpp


In [None]:
%%writefile CMakeLists.txt
cmake_minimum_required(VERSION 3.1)
project(inference)

# Torch is located through CMAKE_PREFIX_PATH, which must point at the unpacked libtorch.
find_package(Torch REQUIRED)
# Request only the OpenCV modules whose headers main.cpp includes
# (a single call; a second find_package(OpenCV) is redundant).
find_package(OpenCV REQUIRED COMPONENTS core imgproc highgui)

# Compile with the flags libtorch itself was built with (e.g. its C++ ABI setting).
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

include_directories(${OpenCV_INCLUDE_DIRS})
# TorchConfig.cmake exports TORCH_INCLUDE_DIRS (upper case); the mixed-case
# spelling Torch_INCLUDE_DIRS is never set and expands to nothing.
include_directories(${TORCH_INCLUDE_DIRS})

add_executable(run main.cpp)
target_link_libraries(run ${TORCH_LIBRARIES} ${OpenCV_LIBRARIES})
target_compile_features(run PUBLIC cxx_range_for)
set_property(TARGET run PROPERTY CXX_STANDARD 14)

Overwriting CMakeLists.txt


In [None]:
# Compile the `run` executable inside the CMake build directory, then return to the source root.
%cd /content/src/build
!make -j4
%cd /content/src

/content/src/build
[35m[1mScanning dependencies of target run[0m
[ 50%] [32mBuilding CXX object CMakeFiles/run.dir/main.cpp.o[0m
[100%] [32m[1mLinking CXX executable run[0m
[100%] Built target run
/content/src


In [None]:
# Run the benchmark from build/: main.cpp opens "../paths.txt" relative to the
# working directory, and the model path is passed as the only argument.
%cd /content/src/build
!./run ../model_jit.pth 
%cd /content/src

/content/src/build
Total time: 11.912 seconds (FPS: 86.0477)
Dataloading time: 11.695 seconds (FPS: 87.6443)
Inference time: 0.198 seconds (FPS: 5176.77)
/content/src
