Low performance comparing to NVIDIA's official 3d software stack in jetson. #60

ZhenshengLee · 2021-04-13T01:18:37Z

Recently I tested cuda-pcl which only provides api for jetson.

The test in my jetson xavier shows that the performance of pass through filter in cupoch is nearly only a half of which with cuda-pcl

below is my test screenshot. The test resource is from cuda-pcl

When using cuda-pcl-filter, it only costs 0.586943ms to filter 119978 points to 5110 points.

But when using cupoch, it costs

I will try to find solution inside cupoch and try not to import 3rd party codes or use lower api from cuda.

Any Suggestions to improve the performance so that it can make full usage of jetson gpu? @neka-nat

Thanks!

[ ]managed memory allocation

ZhenshengLee · 2021-04-13T01:21:21Z

The cupoch code is below, which is inspired by main.cpp from cuda-pcl

#include <fstream>
#include <chrono>

// #include <pcl/io/pcd_io.h>
// #include <pcl/filters/passthrough.h>
// #include <pcl/filters/voxel_grid.h>

// #include "cuda_runtime.h"
// #include "lib/cudaFilter.h"

#include <cupoch/cupoch.h>
#include <gflags/gflags.h>

// DEFINE_string(target_cloud, "./res/pcd/ndt/room_scan1.pcd", "Filename of target_cloud for ndt alignment");
DEFINE_string(input_cloud, "./res/pcd/sample.pcd", "Filename of source_cloud for cuda-filter");

using namespace cupoch;

void Getinfo(void)
{
    cudaDeviceProp prop;

    int count = 0;
    cudaGetDeviceCount(&count);
    printf("\nGPU has cuda devices: %d\n", count);
    for (int i = 0; i < count; ++i)
    {
        cudaGetDeviceProperties(&prop, i);
        printf("----device id: %d info----\n", i);
        printf("  GPU : %s \n", prop.name);
        printf("  Capbility: %d.%d\n", prop.major, prop.minor);
        printf("  Global memory: %luMB\n", prop.totalGlobalMem >> 20);
        printf("  Const memory: %luKB\n", prop.totalConstMem >> 10);
        printf("  SM in a block: %luKB\n", prop.sharedMemPerBlock >> 10);
        printf("  warp size: %d\n", prop.warpSize);
        printf("  threads in a block: %d\n", prop.maxThreadsPerBlock);
        printf("  block dim: (%d,%d,%d)\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
        printf("  grid dim: (%d,%d,%d)\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    }
    printf("\n");
}

void testCupoch(std::shared_ptr<geometry::PointCloud>& cloudSrc, std::shared_ptr<geometry::PointCloud>& cloudDst)
{
    std::chrono::steady_clock::time_point t1 = std::chrono::steady_clock::now();
    std::chrono::steady_clock::time_point t2 = std::chrono::steady_clock::now();
    std::chrono::duration<double, std::ratio<1, 1000>> time_span =
        std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);

    std::cout << "\n------------checking CUDA ---------------- " << std::endl;
    std::cout << "CUDA Loaded " << cloudSrc->points_.size() << std::endl;

    {
        unsigned int nCount = cloudSrc->points_.size();
        unsigned int countLeft = 0;

        std::cout << "\n------------checking CUDA PassThrough ---------------- " << std::endl;

        // setP.type = type;
        // setP.dim = 0;
        // setP.upFilterLimits = 0.5;
        // setP.downFilterLimits = -0.5;
        // setP.limitsNegative = false;
        // filterTest.set(setP);

        t1 = std::chrono::steady_clock::now();
        // filterTest.filter(output, &countLeft, input, nCount);
        cloudDst = cloudSrc->PassThroughFilter(0, -0.5, 0.5);
        // cloudSrc->PassThroughFilter(1, m_min_height_low, m_max_height_low);
        // cloudSrc->PassThroughFilter(2, m_min_distance_low, m_max_distance_low);
        countLeft = cloudDst->points_.size();

        t2 = std::chrono::steady_clock::now();
        time_span = std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);
        std::cout << "CUDA PassThrough by Time: " << time_span.count() << " ms." << std::endl;
        std::cout << "CUDA PassThrough before filtering: " << nCount << std::endl;
        std::cout << "CUDA PassThrough after filtering: " << countLeft << std::endl;
    }

    // {
    //     unsigned int countLeft = 0;
    //     std::cout << "\n------------checking CUDA VoxelGrid---------------- " << std::endl;

    //     memset(outputData, 0, sizeof(float) * 4 * nCount);

    //     type = VOXELGRID;

    //     setP.type = type;
    //     setP.voxelX = 1;
    //     setP.voxelY = 1;
    //     setP.voxelZ = 1;

    //     filterTest.set(setP);
    //     int status = 0;
    //     cudaDeviceSynchronize();
    //     t1 = std::chrono::steady_clock::now();
    //     status = filterTest.filter(output, &countLeft, input, nCount);
    //     cudaDeviceSynchronize();
    //     t2 = std::chrono::steady_clock::now();

    //     if (status != 0)
    //         return;
    //     time_span = std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);
    //     std::cout << "CUDA VoxelGrid by Time: " << time_span.count() << " ms." << std::endl;
    //     std::cout << "CUDA VoxelGrid before filtering: " << nCount << std::endl;
    //     std::cout << "CUDA VoxelGrid after filtering: " << countLeft << std::endl;

    //     pcl::PointCloud<pcl::PointXYZ>::Ptr cloudNew(new pcl::PointCloud<pcl::PointXYZ>);
    //     cloudNew->width = countLeft;
    //     cloudNew->height = 1;
    //     cloudNew->points.resize(cloudNew->width * cloudNew->height);

    //     int check = 0;
    //     for (std::size_t i = 0; i < cloudNew->size(); ++i)
    //     {
    //         cloudNew->points[i].x = output[i * 4 + 0];
    //         cloudNew->points[i].y = output[i * 4 + 1];
    //         cloudNew->points[i].z = output[i * 4 + 2];
    //     }
    //     pcl::io::savePCDFileASCII("after-cuda-VoxelGrid.pcd", *cloudNew);
    // }


}

// void testPCL(pcl::PointCloud<pcl::PointXYZ>::Ptr cloudSrc, pcl::PointCloud<pcl::PointXYZ>::Ptr cloudDst)
// {
//     std::cout << "\n\n------------checking PCL ---------------- " << std::endl;
//     std::cout << "PCL(CPU) Loaded " << cloudSrc->width * cloudSrc->height
//               << " data points from PCD file with the following fields: " << pcl::getFieldsList(*cloudSrc) << std::endl;

//     std::chrono::steady_clock::time_point t1 = std::chrono::steady_clock::now();
//     std::chrono::steady_clock::time_point t2 = std::chrono::steady_clock::now();
//     std::chrono::duration<double, std::ratio<1, 1000>> time_span =
//         std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);

//     int nCount = cloudSrc->width * cloudSrc->height;
//     float* outputData = (float*)cloudDst->points.data();
//     {
//         std::cout << "\n------------checking PCL(CPU) PassThrough ---------------- " << std::endl;

//         memset(outputData, 0, sizeof(float) * 4 * nCount);

//         // Create the filtering object
//         pcl::PassThrough<pcl::PointXYZ> pass;
//         pass.setInputCloud(cloudSrc);
//         pass.setFilterFieldName("x");
//         pass.setFilterLimits(-0.5, 0.5);
//         pass.setFilterLimitsNegative(false);

//         t1 = std::chrono::steady_clock::now();
//         pass.filter(*cloudDst);
//         t2 = std::chrono::steady_clock::now();

//         time_span = std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);
//         std::cout << "PCL(CPU) PassThrough by Time: " << time_span.count() << " ms." << std::endl;

//         std::cout << "PointCloud before filtering: " << cloudSrc->width * cloudSrc->height << " data points ("
//                   << pcl::getFieldsList(*cloudSrc) << ")." << std::endl;
//         std::cout << "PointCloud after filtering: " << cloudDst->width * cloudDst->height << " data points ("
//                   << pcl::getFieldsList(*cloudDst) << ")." << std::endl;
//         pcl::io::savePCDFileASCII("after-pcl-PassThrough.pcd", *cloudDst);
//     }
//     {
//         std::cout << "\n------------checking PCL VoxelGrid---------------- " << std::endl;

//         memset(outputData, 0, sizeof(float) * 4 * nCount);

//         t1 = std::chrono::steady_clock::now();

//         // Create the filtering object
//         pcl::VoxelGrid<pcl::PointXYZ> sor;
//         sor.setInputCloud(cloudSrc);
//         sor.setLeafSize(1, 1, 1);
//         sor.filter(*cloudDst);

//         t2 = std::chrono::steady_clock::now();
//         time_span = std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);
//         std::cout << "PCL VoxelGrid by Time: " << time_span.count() << " ms." << std::endl;
//         std::cout << "PointCloud before filtering: " << cloudSrc->width * cloudSrc->height << " data points ("
//                   << pcl::getFieldsList(*cloudSrc) << ")." << std::endl;
//         std::cout << "PointCloud after filtering: " << cloudDst->width * cloudDst->height << " data points ("
//                   << pcl::getFieldsList(*cloudDst) << ")." << std::endl;

//         pcl::io::savePCDFileASCII("after-pcl-VoxelGrid.pcd", *cloudDst);
//     }
// }

int main(int argc, const char** argv)
{
    std::string file = "./sample.pcd";
    if (argc > 1)
        file = (argv[1]);

    Getinfo();

    // pcl::PointCloud<pcl::PointXYZ>::Ptr cloudSrc(new pcl::PointCloud<pcl::PointXYZ>);
    // pcl::PointCloud<pcl::PointXYZ>::Ptr cloudDst(new pcl::PointCloud<pcl::PointXYZ>);
    std::shared_ptr<geometry::PointCloud> input_cloud{std::make_shared<geometry::PointCloud>()};
    std::shared_ptr<geometry::PointCloud> output_cloud{std::make_shared<geometry::PointCloud>()};

    // if (pcl::io::loadPCDFile<pcl::PointXYZ>(file.c_str(), *cloudSrc) == -1)
    // {
    //     std::cout << "Error:can not open the file: " << file.c_str() << std::endl;
    //     return (-1);
    // }
    utility::InitializeAllocator();
    utility::SetVerbosityLevel(utility::VerbosityLevel::Warning);

    // Loading second scan of room from new perspective.
    input_cloud = io::CreatePointCloudFromFile(FLAGS_input_cloud);
    std::cout << "Loaded " << input_cloud->points_.size() << " data points from " << FLAGS_input_cloud << std::endl;

    testCupoch(input_cloud, output_cloud);
    // testPCL(cloudSrc, cloudDst);

    return 0;
}

neka-nat · 2021-04-13T04:51:03Z

Hi,
Thank you for reporting!
I've modified the pool allocator to work better.
Using the latest master and adding the following settings makes it about 30% faster.

InitializeAllocator(PoolAllocation, 1000000000);

ZhenshengLee · 2021-04-13T08:45:12Z

Thank you very much!

ZhenshengLee · 2021-04-21T02:37:43Z

x86 pc with qudro-p4000 gpu
PoolAllocation strategy
cuda-filter-cupoch context
filter 119978 points to 5510.

The passfilter function costs 0.096551ms

ZhenshengLee · 2021-04-21T03:04:09Z

The InitializeAllocator calling position does affect the behavior of pointcloud function

// speed up around 30% as you said
std::shared_ptr<geometry::PointCloud> input_cloud{std::make_shared<geometry::PointCloud>()};
std::shared_ptr<geometry::PointCloud> output_cloud{std::make_shared<geometry::PointCloud>()};
InitializeAllocator(PoolAllocation, 1000000000);
utility::SetVerbosityLevel(utility::VerbosityLevel::Warning);
//...

// changes back to the normal spped
InitializeAllocator(PoolAllocation, 1000000000);
utility::SetVerbosityLevel(utility::VerbosityLevel::Warning);
std::shared_ptr<geometry::PointCloud> input_cloud{std::make_shared<geometry::PointCloud>()};
std::shared_ptr<geometry::PointCloud> output_cloud{std::make_shared<geometry::PointCloud>()};

I know it is about allocation order of pointcloud, but it is abnormal.

neka-nat added the enhancement New feature or request label Apr 13, 2021

neka-nat closed this as completed Apr 15, 2021

This was referenced Apr 20, 2021

[BUG] performance in ros callback context. #62

Closed

[QST]About the best practice of memory copy. #61

Closed

ZhenshengLee mentioned this issue Aug 10, 2021

Add CudaManagedMemoryPool rmmAllocationMode. #67

Merged

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Low performance comparing to NVIDIA's official 3d software stack in jetson. #60

Low performance comparing to NVIDIA's official 3d software stack in jetson. #60

ZhenshengLee commented Apr 13, 2021

ZhenshengLee commented Apr 13, 2021

neka-nat commented Apr 13, 2021 •

edited

ZhenshengLee commented Apr 13, 2021

ZhenshengLee commented Apr 21, 2021

ZhenshengLee commented Apr 21, 2021

Low performance comparing to NVIDIA's official 3d software stack in jetson. #60

Low performance comparing to NVIDIA's official 3d software stack in jetson. #60

Comments

ZhenshengLee commented Apr 13, 2021

ZhenshengLee commented Apr 13, 2021

neka-nat commented Apr 13, 2021 • edited

ZhenshengLee commented Apr 13, 2021

ZhenshengLee commented Apr 21, 2021

ZhenshengLee commented Apr 21, 2021

neka-nat commented Apr 13, 2021 •

edited