-
-
Notifications
You must be signed in to change notification settings - Fork 106
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Low performance compared to NVIDIA's official 3D software stack on Jetson. #60
Comments
The cupoch code is below, which is inspired by main.cpp from cuda-pcl #include <fstream>
#include <chrono>
// #include <pcl/io/pcd_io.h>
// #include <pcl/filters/passthrough.h>
// #include <pcl/filters/voxel_grid.h>
// #include "cuda_runtime.h"
// #include "lib/cudaFilter.h"
#include <cupoch/cupoch.h>
#include <gflags/gflags.h>
// DEFINE_string(target_cloud, "./res/pcd/ndt/room_scan1.pcd", "Filename of target_cloud for ndt alignment");
DEFINE_string(input_cloud, "./res/pcd/sample.pcd", "Filename of source_cloud for cuda-filter");
using namespace cupoch;
void Getinfo(void)
{
cudaDeviceProp prop;
int count = 0;
cudaGetDeviceCount(&count);
printf("\nGPU has cuda devices: %d\n", count);
for (int i = 0; i < count; ++i)
{
cudaGetDeviceProperties(&prop, i);
printf("----device id: %d info----\n", i);
printf(" GPU : %s \n", prop.name);
printf(" Capbility: %d.%d\n", prop.major, prop.minor);
printf(" Global memory: %luMB\n", prop.totalGlobalMem >> 20);
printf(" Const memory: %luKB\n", prop.totalConstMem >> 10);
printf(" SM in a block: %luKB\n", prop.sharedMemPerBlock >> 10);
printf(" warp size: %d\n", prop.warpSize);
printf(" threads in a block: %d\n", prop.maxThreadsPerBlock);
printf(" block dim: (%d,%d,%d)\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
printf(" grid dim: (%d,%d,%d)\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
}
printf("\n");
}
// Benchmark cupoch's GPU PassThroughFilter on cloudSrc, writing the
// filtered result into cloudDst and printing point counts and wall time.
//
// @param cloudSrc  input cloud (already loaded on device); not modified.
// @param cloudDst  receives the filtered cloud (reassigned).
//
// NOTE: GPU work is launched asynchronously, so the device must be
// synchronized before each timestamp — otherwise steady_clock measures
// only kernel-launch overhead, not the filter itself. The cuda-pcl
// reference code (see the commented-out VoxelGrid block below) brackets
// its timing with cudaDeviceSynchronize() for exactly this reason.
void testCupoch(std::shared_ptr<geometry::PointCloud>& cloudSrc, std::shared_ptr<geometry::PointCloud>& cloudDst)
{
    std::chrono::steady_clock::time_point t1 = std::chrono::steady_clock::now();
    std::chrono::steady_clock::time_point t2 = std::chrono::steady_clock::now();
    std::chrono::duration<double, std::ratio<1, 1000>> time_span =
        std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);
    std::cout << "\n------------checking CUDA ---------------- " << std::endl;
    std::cout << "CUDA Loaded " << cloudSrc->points_.size() << std::endl;
    {
        unsigned int nCount = cloudSrc->points_.size();
        unsigned int countLeft = 0;
        std::cout << "\n------------checking CUDA PassThrough ---------------- " << std::endl;
        // setP.type = type;
        // setP.dim = 0;
        // setP.upFilterLimits = 0.5;
        // setP.downFilterLimits = -0.5;
        // setP.limitsNegative = false;
        // filterTest.set(setP);
        // Drain any pending device work so t1 marks the true start.
        cudaDeviceSynchronize();
        t1 = std::chrono::steady_clock::now();
        // filterTest.filter(output, &countLeft, input, nCount);
        // Filter on axis 0 (x), keeping points with x in [-0.5, 0.5].
        cloudDst = cloudSrc->PassThroughFilter(0, -0.5, 0.5);
        // cloudSrc->PassThroughFilter(1, m_min_height_low, m_max_height_low);
        // cloudSrc->PassThroughFilter(2, m_min_distance_low, m_max_distance_low);
        // Wait for the filter kernels to finish before stamping t2;
        // the count query is moved OUT of the timed region.
        cudaDeviceSynchronize();
        t2 = std::chrono::steady_clock::now();
        countLeft = cloudDst->points_.size();
        time_span = std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);
        std::cout << "CUDA PassThrough by Time: " << time_span.count() << " ms." << std::endl;
        std::cout << "CUDA PassThrough before filtering: " << nCount << std::endl;
        std::cout << "CUDA PassThrough after filtering: " << countLeft << std::endl;
    }
    // {
    //     unsigned int countLeft = 0;
    //     std::cout << "\n------------checking CUDA VoxelGrid---------------- " << std::endl;
    //     memset(outputData, 0, sizeof(float) * 4 * nCount);
    //     type = VOXELGRID;
    //     setP.type = type;
    //     setP.voxelX = 1;
    //     setP.voxelY = 1;
    //     setP.voxelZ = 1;
    //     filterTest.set(setP);
    //     int status = 0;
    //     cudaDeviceSynchronize();
    //     t1 = std::chrono::steady_clock::now();
    //     status = filterTest.filter(output, &countLeft, input, nCount);
    //     cudaDeviceSynchronize();
    //     t2 = std::chrono::steady_clock::now();
    //     if (status != 0)
    //         return;
    //     time_span = std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);
    //     std::cout << "CUDA VoxelGrid by Time: " << time_span.count() << " ms." << std::endl;
    //     std::cout << "CUDA VoxelGrid before filtering: " << nCount << std::endl;
    //     std::cout << "CUDA VoxelGrid after filtering: " << countLeft << std::endl;
    //     pcl::PointCloud<pcl::PointXYZ>::Ptr cloudNew(new pcl::PointCloud<pcl::PointXYZ>);
    //     cloudNew->width = countLeft;
    //     cloudNew->height = 1;
    //     cloudNew->points.resize(cloudNew->width * cloudNew->height);
    //     int check = 0;
    //     for (std::size_t i = 0; i < cloudNew->size(); ++i)
    //     {
    //         cloudNew->points[i].x = output[i * 4 + 0];
    //         cloudNew->points[i].y = output[i * 4 + 1];
    //         cloudNew->points[i].z = output[i * 4 + 2];
    //     }
    //     pcl::io::savePCDFileASCII("after-cuda-VoxelGrid.pcd", *cloudNew);
    // }
}
// void testPCL(pcl::PointCloud<pcl::PointXYZ>::Ptr cloudSrc, pcl::PointCloud<pcl::PointXYZ>::Ptr cloudDst)
// {
// std::cout << "\n\n------------checking PCL ---------------- " << std::endl;
// std::cout << "PCL(CPU) Loaded " << cloudSrc->width * cloudSrc->height
// << " data points from PCD file with the following fields: " << pcl::getFieldsList(*cloudSrc) << std::endl;
// std::chrono::steady_clock::time_point t1 = std::chrono::steady_clock::now();
// std::chrono::steady_clock::time_point t2 = std::chrono::steady_clock::now();
// std::chrono::duration<double, std::ratio<1, 1000>> time_span =
// std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);
// int nCount = cloudSrc->width * cloudSrc->height;
// float* outputData = (float*)cloudDst->points.data();
// {
// std::cout << "\n------------checking PCL(CPU) PassThrough ---------------- " << std::endl;
// memset(outputData, 0, sizeof(float) * 4 * nCount);
// // Create the filtering object
// pcl::PassThrough<pcl::PointXYZ> pass;
// pass.setInputCloud(cloudSrc);
// pass.setFilterFieldName("x");
// pass.setFilterLimits(-0.5, 0.5);
// pass.setFilterLimitsNegative(false);
// t1 = std::chrono::steady_clock::now();
// pass.filter(*cloudDst);
// t2 = std::chrono::steady_clock::now();
// time_span = std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);
// std::cout << "PCL(CPU) PassThrough by Time: " << time_span.count() << " ms." << std::endl;
// std::cout << "PointCloud before filtering: " << cloudSrc->width * cloudSrc->height << " data points ("
// << pcl::getFieldsList(*cloudSrc) << ")." << std::endl;
// std::cout << "PointCloud after filtering: " << cloudDst->width * cloudDst->height << " data points ("
// << pcl::getFieldsList(*cloudDst) << ")." << std::endl;
// pcl::io::savePCDFileASCII("after-pcl-PassThrough.pcd", *cloudDst);
// }
// {
// std::cout << "\n------------checking PCL VoxelGrid---------------- " << std::endl;
// memset(outputData, 0, sizeof(float) * 4 * nCount);
// t1 = std::chrono::steady_clock::now();
// // Create the filtering object
// pcl::VoxelGrid<pcl::PointXYZ> sor;
// sor.setInputCloud(cloudSrc);
// sor.setLeafSize(1, 1, 1);
// sor.filter(*cloudDst);
// t2 = std::chrono::steady_clock::now();
// time_span = std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);
// std::cout << "PCL VoxelGrid by Time: " << time_span.count() << " ms." << std::endl;
// std::cout << "PointCloud before filtering: " << cloudSrc->width * cloudSrc->height << " data points ("
// << pcl::getFieldsList(*cloudSrc) << ")." << std::endl;
// std::cout << "PointCloud after filtering: " << cloudDst->width * cloudDst->height << " data points ("
// << pcl::getFieldsList(*cloudDst) << ")." << std::endl;
// pcl::io::savePCDFileASCII("after-pcl-VoxelGrid.pcd", *cloudDst);
// }
// }
int main(int argc, const char** argv)
{
std::string file = "./sample.pcd";
if (argc > 1)
file = (argv[1]);
Getinfo();
// pcl::PointCloud<pcl::PointXYZ>::Ptr cloudSrc(new pcl::PointCloud<pcl::PointXYZ>);
// pcl::PointCloud<pcl::PointXYZ>::Ptr cloudDst(new pcl::PointCloud<pcl::PointXYZ>);
std::shared_ptr<geometry::PointCloud> input_cloud{std::make_shared<geometry::PointCloud>()};
std::shared_ptr<geometry::PointCloud> output_cloud{std::make_shared<geometry::PointCloud>()};
// if (pcl::io::loadPCDFile<pcl::PointXYZ>(file.c_str(), *cloudSrc) == -1)
// {
// std::cout << "Error:can not open the file: " << file.c_str() << std::endl;
// return (-1);
// }
utility::InitializeAllocator();
utility::SetVerbosityLevel(utility::VerbosityLevel::Warning);
// Loading second scan of room from new perspective.
input_cloud = io::CreatePointCloudFromFile(FLAGS_input_cloud);
std::cout << "Loaded " << input_cloud->points_.size() << " data points from " << FLAGS_input_cloud << std::endl;
testCupoch(input_cloud, output_cloud);
// testPCL(cloudSrc, cloudDst);
return 0;
}
|
Hi, InitializeAllocator(PoolAllocation, 1000000000); |
Thank you very much! |
The passfilter function costs 0.096551ms |
The // speed up around 30% as you said
std::shared_ptr<geometry::PointCloud> input_cloud{std::make_shared<geometry::PointCloud>()};
std::shared_ptr<geometry::PointCloud> output_cloud{std::make_shared<geometry::PointCloud>()};
InitializeAllocator(PoolAllocation, 1000000000);
utility::SetVerbosityLevel(utility::VerbosityLevel::Warning);
//...
// changes back to the normal speed
InitializeAllocator(PoolAllocation, 1000000000);
utility::SetVerbosityLevel(utility::VerbosityLevel::Warning);
std::shared_ptr<geometry::PointCloud> input_cloud{std::make_shared<geometry::PointCloud>()};
std::shared_ptr<geometry::PointCloud> output_cloud{std::make_shared<geometry::PointCloud>()}; I know it is about allocation order of pointcloud, but it is abnormal. |
Recently I tested cuda-pcl which only provides api for jetson.
The test on my Jetson Xavier shows that the pass-through filter in cupoch achieves only about half the performance of the one in cuda-pcl.
below is my test screenshot. The test resource is from cuda-pcl
When using cuda-pcl-filter, it only costs 0.586943ms to filter 119978 points to 5110 points.
But when using cupoch, it costs
I will try to find solution inside cupoch and try not to import 3rd party codes or use lower api from cuda.
Any Suggestions to improve the performance so that it can make full usage of jetson gpu? @neka-nat
Thanks!
[ ]managed memory allocation
The text was updated successfully, but these errors were encountered: