From 6b0168c05e1364b7354915e52d3b73da948e4336 Mon Sep 17 00:00:00 2001
From: MatrixCompSci
Date: Sat, 11 Apr 2015 22:36:50 -0400
Subject: [PATCH 1/4] Started work on the CUDA implementation.

As of right now it works, and is about 20% faster than the CPU implementation. My code is very poorly optimized and should be taken with a grain of salt. I'll be working on it some more later next week.

In a best-case scenario, this function will perform in O(n log n) time. In a worst-case scenario, it'll be O(n^2), where a worst case is two points within the same cluster being on exactly opposite sides of the tree.

Points should be organized like so: [Cluster|Cluster|Cluster|Cluster], where each cluster is 32 points large and is ordered from least to greatest by distance from the center of the cluster. The furthest point should be no more than max_rad distance from the center.

Eventually this code, if my predictions hold up, should perform 7-9x faster than the CPU implementation. Obviously that's quite a long way off, it will be countless days of work, and it will never be optimized perfectly, but you get the idea.

This code is highly unstable and has a very large number of bugs (I counted 20+). DO NOT use this code in any application yet. It will almost certainly crash either your GPU driver or the application.

This code was written to look, initially, as close to the OpenCL code as possible. That being said, the amount of branching that currently occurs is huge, since I directly copied the traversal patterns of the OpenCL version. I will be reducing the amount of branching soon.

-Louis
---
 CMakeLists.txt                        |  15 ++
 nabo/cuda/kd_node.cu                  | 362 ++++++++++++++++++++++++++
 nabo/opencl/knn_kdtree_pt_in_nodes.cl |   9 -
 3 files changed, 377 insertions(+), 9 deletions(-)
 create mode 100644 nabo/cuda/kd_node.cu

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8e829b4..9832c13 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -48,6 +48,21 @@ if(MSVC AND (MSVC_VERSION LESS 1600))
 	add_definitions(-DBOOST_STDINT)
 endif(MSVC AND (MSVC_VERSION LESS 1600))
 
+#CUDA
+set(USE_CUDA "false" CACHE BOOL "Set phasers to CUDA!")
+if(USE_CUDA)
+	#Sorry, Linux support only right now :( I don't own a copy of Windows, so it's a bit hard for me to make sure that it'll work on Windows. -Louis
+	find_package(CUDA)
+
+	SET(CUDA_SEPARABLE_COMPILATION ON)
+	SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
+
+	SET(CUDA_NVCC_FLAGS "-arch=compute_30;-code=sm_30;-Xcompiler=-fPIC;--shared;--use_fast_math;-G")
+
+	link_libraries(${CUDA_LIBRARIES})
+	message("CUDA has been enabled. Currently only KD_Node is supported in this build")
+endif(USE_CUDA)
+
 # openmp
 find_package(OpenMP)
 if (OPENMP_FOUND)
diff --git a/nabo/cuda/kd_node.cu b/nabo/cuda/kd_node.cu
new file mode 100644
index 0000000..4100f65
--- /dev/null
+++ b/nabo/cuda/kd_node.cu
@@ -0,0 +1,362 @@
+//CUDA runtime for KD_nodes
+
+
+
+/*Roadmap:
+
+    (We are here)
+          |
+         \ /
+          v
+Core functionality -> Dynamic Parallelism Experimentation -> Linking to existing libnabo framework -> optimization -> finalization
+
+/Optimization |= (Search key sorting, linearizing the KD tree, improving GPU caching for node heaps)/
+
+EST: Probably a month? Unsure. I need to look through the rest of the SDK.*/
+#define maximum_depth 22
+#define dim_count 3
+#define K_size 16
+#ifndef FLOAT_MAX
+#define FLOAT_MAX 33554430.0f
+#endif
+#define BLOCK_SIZE 32
+#define max_rad 256
+//If coordinates are within 5% of each other when compared to their cluster maximum, treat them as a single point.
To be used later +#define max_error 0.05f + +#define OFFSIDE 0 +#define ONSIDE 1 +#define POINT_STRIDE 3 +struct point +{ + float data[dim_count]; +}; +struct heap_entry +{ + float value; + unsigned char index; +}; +struct stack_entry{ + size_t n; + uint state; +}; + +__device__ float heapHeadValue(heap_entry* h) +{ + return h->value; +} + +__device__ heap_entry* heapHeadReplace(heap_entry* h, const int index, const float value, const uint K) +{ + uint i = 0; + for (; i < K - 1; ++i) + { + if (h[i + 1].value > value) + h[i] = h[i + 1]; + else + break; + } + h[i].value = value; + h[i].index = index; + return h; +} + +__device__ heap_entry *heapInit(const uint K) +{ + heap_entry *h; + for (uint i = 0; i < K; ++i) + h[i].value = FLOAT_MAX; + return h; +} +struct kd_node +{ + //Which dimension + unsigned int dim; + //At what value was this node split? + int cutVal; + //The index of the current node + int index; +}; + + +#define inx_size 12 +struct /*__align__(inx_size)*/ indx +{ + //The points of the KD tree + point *pts; + //The linked nodes + const kd_node *nodes; +}; + +//Just a utility function for converting an int equal to zero to one, and vice versa. Its not well optimized, but it was quick to write :P Would be better with bitshifts +__device__ int flip(int in) +{ + return abs(in - 1); +} +__device__ unsigned int childLeft(const unsigned int pos) { return 2*pos + 1; } +__device__ unsigned int childRight(const unsigned int pos) { return 2*pos + 2; } +struct maxAB +{ + float A,B; + int indx_a, indx_b; +}; + +//Clamp the value to 1 or 0 +__device__ static float intensity(float a) +{ + return fmax(1,fmin(0,fabs(a))); +} +struct heap +{ + heap_entry *entries; + int current_count; +}; +//If dynamic parrallelism is not available, default to compute model 3_2. Eg: The early 700 series +#ifndef CM3_5 +#define CM3_2 +#endif +//Used to see if we're within bounds and are ready to jump a node +__device__ unsigned int withinBounds(int cd, point q, point p, float heapHeadVal, float maxRad, float maxError) +{ + float diff = q.data[cd] -p.data[cd]; + float side2 = diff*diff; + if ((side2 <= maxRad) && + (side2 * maxError < heapHeadVal)) + { + return 1; + } + return 0; +} +//Used for warp devices. One if distance is greater than zero. 
Returns 0 or 1 +__device__ unsigned int nodeMinor(int cd, point q, point p) +{ + float diff = q.data[cd] -p.data[cd]; + return (unsigned int)intensity(diff); + +} +//Implementation details: http://on-demand.gputechconf.com/gtc/2012/presentations/S0079-Warped-Parallel-Nearest-Neighbor-Searches-Using-KD-Trees.pdf +__device__ void recursive_warp_search(const indx static_data, const point query_point, unsigned int _Mask, heap *output, + uint *stackpointer, stack_entry *stack, stack_entry *s) +{ + //Go up one + --stackpointer; + const size_t n = s->n; + const kd_node node = static_data.nodes[n]; + const int cd = node.cutVal; + //Continue doesn't do anything anymore since we're in a __device__ function (Not __global__), and there is no while loop + /*if (cd == -2) + continue;*/ + const int index = node.index; + point p = static_data.pts[index]; + // compute new distance and update if lower + float dist = 0; + for (uint i = 0; i < dim_count; ++i) + { + const float diff = query_point.data[i] - p.data[i]; + dist += diff * diff; + } + if ((dist <= max_rad) && + (dist < heapHeadValue(output->entries)) && + (dist > (float)max_error)){ + output->entries = heapHeadReplace(output->entries, index, dist, K_size);output->current_count++;} + // look for recursion + //Let the warp group decide which way we want to travel next + _Mask = _Mask & __ballot(withinBounds(cd, query_point,p,heapHeadValue(output->entries), max_rad, max_error)); + + //This is going to be a very large amount of nested if statements o.O Luckily nvcc cleans it up a lot :D I'll eventually make it look nicer here as well + + //Did we go through the first branch? + bool check = false; + //If the group decided to travel off side first, continue + if(_Mask!=0) + { + check = true; + stackpointer++; + _Mask = _Mask & __ballot(nodeMinor(cd,query_point,p)); + //Make a copy of S incase we want to branch right later on + stack_entry s_copy1 = *s; + s->state = OFFSIDE; + //Branch left first + if(_Mask!=0) + { + s->n = childLeft(n); + //Make another copy of s to come back to late + stack_entry s_copy2 = *s; + _Mask = _Mask & __ballot(nodeMinor(cd,query_point,p)); + *s = stack[*stackpointer]; + s->state = ONSIDE; + //We're going to branch to the right + stackpointer++; + if(_Mask !=0) + { + s->n = childRight(n); + //Execute branch down. Eventually this will be doing through dynamic parrallelism instead of direct recursion + recursive_warp_search(static_data, query_point, _Mask, output, stackpointer, stack, s); + *s = s_copy2; + //If any threads want to go to the left + if(__any(flip(nodeMinor(cd,query_point,p)))) + { + //If I did not want this decision + if(!flip(nodeMinor(cd,query_point,p))) + { goto exit; } + s->n = childLeft(n); + //Execute branch down. Eventually this will be doing through dynamic parrallelism instead of direct recursion + recursive_warp_search(static_data, query_point, _Mask, output, stackpointer, stack,s); + } + } + //If the group would rather branch left second + else + { + s->n = childLeft(n); + //Execute branch down. Eventually this will be doing through dynamic parrallelism instead of direct recursion + recursive_warp_search(static_data, query_point, _Mask, output, stackpointer, stack, s); + *s = s_copy2; + //If any threads want to go to the left + if(__any(flip(nodeMinor(cd,query_point,p)))) + { + //If I did not want this decision + if(!flip(nodeMinor(cd,query_point,p))) + { goto exit; } + s->n = childRight(n); + //Execute branch down. 
Eventually this will be doing through dynamic parrallelism instead of direct recursion + recursive_warp_search(static_data, query_point, _Mask, output, stackpointer, stack,s); + } + } + stackpointer++; + //We just came out of that branch, so lets set where we currently are in the stack back to s + *s = s_copy1; + //If any threads wanted to go right instead of left, branch that way now. In a worst case scenario, half the threads from the group are now gone + if(__any(flip(nodeMinor(cd,query_point,p)))) + { + //If I did not want this decision + if(!flip(nodeMinor(cd,query_point,p))) + { goto exit; } + s->n = childRight(n); + } + } + //Branch right first + else if(_Mask== 0) + { + s->n = childRight(n); + //Make another copy of s to come back to late + stack_entry s_copy2 = *s; + _Mask = _Mask & __ballot(nodeMinor(cd,query_point,p)); + *s = stack[*stackpointer]; + s->state = ONSIDE; + //We're going to branch to the right + stackpointer++; + if(_Mask !=0) + { + s->n = childRight(n); + //Execute branch down. Eventually this will be doing through dynamic parrallelism instead of direct recursion + recursive_warp_search(static_data, query_point, _Mask, output, stackpointer, stack, s); + *s = s_copy2; + //If any threads want to go to the left + if(__any(flip(nodeMinor(cd,query_point,p)))) + { + //If I did not want this decision + if(!flip(nodeMinor(cd,query_point,p))) + { goto exit; } + s->n = childLeft(n); + //Execute branch down. Eventually this will be doing through dynamic parrallelism instead of direct recursion + recursive_warp_search(static_data, query_point, _Mask, output, stackpointer, stack, s); + } + } + //If the group would rather branch left second + else + { + s->n = childLeft(n); + //Execute branch down. Eventually this will be doing through dynamic parrallelism instead of direct recursion + recursive_warp_search(static_data, query_point, _Mask, output, stackpointer, stack, s); + *s = s_copy2; + //If any threads want to go to the left + if(__any(flip(nodeMinor(cd,query_point,p)))) + { + //If I did not want this decision + if(!flip(nodeMinor(cd,query_point,p))) + { goto exit; } + s->n = childRight(n); + //Execute branch down. Eventually this will be doing through dynamic parrallelism instead of direct recursion + recursive_warp_search(static_data, query_point, _Mask, output, stackpointer, stack, s); + } + } + //We just came out of that branch, so lets set where we currently are in the stack back to s + *s = s_copy1; + //If any threads wanted to go right instead of left, branch that way now. In a worst case scenario, half the threads from the group are now gone + if(__any(flip(nodeMinor(cd,query_point,p)))) + { + s->n = childLeft(n); + } + } + } + //We want to branch onside and not offside + else if((_Mask== 0) || ((check == true) && (__any(flip(nodeMinor(cd,query_point,p)))))) + { + //Make another copy of s to come back to late + stack_entry s_copy2 = *s; + _Mask = _Mask & __ballot(nodeMinor(cd,query_point,p)); + *s = stack[*stackpointer]; + s->state = ONSIDE; + //We're going to branch to the right + stackpointer++; + if(_Mask !=0) + { + s->n = childRight(n); + //Execute branch down. 
Eventually this will be doing through dynamic parrallelism instead of direct recursion + recursive_warp_search(static_data, query_point, _Mask, output, stackpointer, stack, s); + *s = s_copy2; + //If any threads want to go to the left + if(__any(flip(nodeMinor(cd,query_point,p)))) + { + //If I did not want this decision + if(!flip(nodeMinor(cd,query_point,p))) + { goto exit; } + s->n = childLeft(n); + //Execute branch down. Eventually this will be doing through dynamic parrallelism instead of direct recursion + recursive_warp_search(static_data, query_point, _Mask, output, stackpointer, stack, s); + } + } + //If the group would rather branch left second + else + { + s->n = childLeft(n); + //Execute branch down. Eventually this will be doing through dynamic parrallelism instead of direct recursion + recursive_warp_search(static_data, query_point, _Mask, output, stackpointer, stack, s); + *s = s_copy2; + //If any threads want to go to the left + if(__any(flip(nodeMinor(cd,query_point,p)))) + { + //If I did not want this decision + if(!flip(nodeMinor(cd,query_point,p))) + { goto exit; } + s->n = childRight(n); + //Execute branch down. Eventually this will be doing through dynamic parrallelism instead of direct recursion + recursive_warp_search(static_data, query_point, _Mask, output, stackpointer, stack, s); + } + } + } +exit: + return; + //TODO: Sort + +} +/*Kernel is to be executed as 32x1 +indx is pre malloced and copied to the GPU to avoid memory bottlenecks. Query points is copied per iteration. +Uses a warped ballot system. Preferable for clustered points that are closely together. +Make sure the thread group size is equal to the size of the cluster & is a multiple of 32*/ +__global__ void clustered_search(indx static_data, const point *query_points, int *indices, heap *ret, int query_amt) +{ + stack_entry stack[maximum_depth]; + //Global thread number + int thread_num = blockIdx.x * BLOCK_SIZE + threadIdx.x; + heap myHeap; + myHeap.entries = heapInit(K_size); + myHeap.current_count = 0; + //Start at root node + stack_entry* s = stack; + uint *startpos; + *startpos = 0; + recursive_warp_search(static_data, query_points[thread_num], 1, &myHeap, startpos,s,stack); + ret[thread_num] = myHeap; +} + diff --git a/nabo/opencl/knn_kdtree_pt_in_nodes.cl b/nabo/opencl/knn_kdtree_pt_in_nodes.cl index 113fff5..104747b 100644 --- a/nabo/opencl/knn_kdtree_pt_in_nodes.cl +++ b/nabo/opencl/knn_kdtree_pt_in_nodes.cl @@ -64,15 +64,6 @@ kernel void knnKDTree( const global T* cloud, continue; const int index = node->index; const global T* p = &cloud[index * POINT_STRIDE]; - // check whether we already have better dist - if ((s->state == OFFSIDE) && (cd >= 0)) - { - const T diff = q[cd] - p[cd]; - // TODO: use approximate dist - // FIXME: bug in this early out - //if (diff * diff > heapHeadValue(heap)) - // continue; - } // compute new distance and update if lower T dist = 0; for (uint i = 0; i < DIM_COUNT; ++i) From 7f27d7220d3f900cfa44502fb9de0d6e7ea2fa13 Mon Sep 17 00:00:00 2001 From: MatrixCompSci Date: Sun, 12 Apr 2015 11:44:09 -0400 Subject: [PATCH 2/4] Improved readability. Optimized branching. Reduced memory overhead. First tests with dynamic parrallelism failed. I'll try again tomorrow. Began to diverage from libnabo's default transversal patterns. I'm using my own, as they seem better for CUDA. This may have a negative impact later on. I don't know yet. Improved comments. 
--- nabo/cuda/kd_node.cu | 210 +++++++++---------------------------------- 1 file changed, 41 insertions(+), 169 deletions(-) diff --git a/nabo/cuda/kd_node.cu b/nabo/cuda/kd_node.cu index 4100f65..4cb3ac3 100644 --- a/nabo/cuda/kd_node.cu +++ b/nabo/cuda/kd_node.cu @@ -136,10 +136,9 @@ __device__ unsigned int nodeMinor(int cd, point q, point p) } //Implementation details: http://on-demand.gputechconf.com/gtc/2012/presentations/S0079-Warped-Parallel-Nearest-Neighbor-Searches-Using-KD-Trees.pdf __device__ void recursive_warp_search(const indx static_data, const point query_point, unsigned int _Mask, heap *output, - uint *stackpointer, stack_entry *stack, stack_entry *s) + uint stackpointer, stack_entry *stack, stack_entry *s) { - //Go up one - --stackpointer; + stackpointer--; const size_t n = s->n; const kd_node node = static_data.nodes[n]; const int cd = node.cutVal; @@ -161,182 +160,56 @@ __device__ void recursive_warp_search(const indx static_data, const point query_ output->entries = heapHeadReplace(output->entries, index, dist, K_size);output->current_count++;} // look for recursion //Let the warp group decide which way we want to travel next - _Mask = _Mask & __ballot(withinBounds(cd, query_point,p,heapHeadValue(output->entries), max_rad, max_error)); + _Mask = _Mask & __ballot(nodeMinor(cd, query_point,p)); - //This is going to be a very large amount of nested if statements o.O Luckily nvcc cleans it up a lot :D I'll eventually make it look nicer here as well - - //Did we go through the first branch? - bool check = false; - //If the group decided to travel off side first, continue - if(_Mask!=0) + + //If side >= 0, then we branch right first + if(_Mask) { - check = true; + s->n = childRight(n); + recursive_warp_search(static_data, query_point, _Mask, output, + stackpointer, stack, s); stackpointer++; - _Mask = _Mask & __ballot(nodeMinor(cd,query_point,p)); - //Make a copy of S incase we want to branch right later on - stack_entry s_copy1 = *s; - s->state = OFFSIDE; - //Branch left first - if(_Mask!=0) + s = stack[stackpointer]; + //This needs to be called before the __any, since the thread needs to be terminated before we conduct the next vote. + if(output->current_count > K_size) + { + /*Exit the kernel if we have all of the points that we need. Since all of the points are clustered, hopefully this is greatly reduced and all threads exit + at near the same time*/ + return; + } + //If any of the remaining active threads are within bounds of the left node + if(__any(withinBounds(cd,query_point, p, heapHeadValue(ouput->entries), max_rad, max_error) { - s->n = childLeft(n); - //Make another copy of s to come back to late - stack_entry s_copy2 = *s; - _Mask = _Mask & __ballot(nodeMinor(cd,query_point,p)); - *s = stack[*stackpointer]; - s->state = ONSIDE; - //We're going to branch to the right - stackpointer++; - if(_Mask !=0) - { - s->n = childRight(n); - //Execute branch down. Eventually this will be doing through dynamic parrallelism instead of direct recursion - recursive_warp_search(static_data, query_point, _Mask, output, stackpointer, stack, s); - *s = s_copy2; - //If any threads want to go to the left - if(__any(flip(nodeMinor(cd,query_point,p)))) - { - //If I did not want this decision - if(!flip(nodeMinor(cd,query_point,p))) - { goto exit; } - s->n = childLeft(n); - //Execute branch down. 
Eventually this will be doing through dynamic parrallelism instead of direct recursion - recursive_warp_search(static_data, query_point, _Mask, output, stackpointer, stack,s); - } - } - //If the group would rather branch left second - else - { - s->n = childLeft(n); - //Execute branch down. Eventually this will be doing through dynamic parrallelism instead of direct recursion - recursive_warp_search(static_data, query_point, _Mask, output, stackpointer, stack, s); - *s = s_copy2; - //If any threads want to go to the left - if(__any(flip(nodeMinor(cd,query_point,p)))) - { - //If I did not want this decision - if(!flip(nodeMinor(cd,query_point,p))) - { goto exit; } - s->n = childRight(n); - //Execute branch down. Eventually this will be doing through dynamic parrallelism instead of direct recursion - recursive_warp_search(static_data, query_point, _Mask, output, stackpointer, stack,s); - } - } + s->n = childLeft(n); + recursive_warp_search(static_data, query_point, _Mask, output, + stackpointer, stack, s); stackpointer++; - //We just came out of that branch, so lets set where we currently are in the stack back to s - *s = s_copy1; - //If any threads wanted to go right instead of left, branch that way now. In a worst case scenario, half the threads from the group are now gone - if(__any(flip(nodeMinor(cd,query_point,p)))) - { - //If I did not want this decision - if(!flip(nodeMinor(cd,query_point,p))) - { goto exit; } - s->n = childRight(n); - } } - //Branch right first - else if(_Mask== 0) - { - s->n = childRight(n); - //Make another copy of s to come back to late - stack_entry s_copy2 = *s; - _Mask = _Mask & __ballot(nodeMinor(cd,query_point,p)); - *s = stack[*stackpointer]; - s->state = ONSIDE; - //We're going to branch to the right - stackpointer++; - if(_Mask !=0) - { - s->n = childRight(n); - //Execute branch down. Eventually this will be doing through dynamic parrallelism instead of direct recursion - recursive_warp_search(static_data, query_point, _Mask, output, stackpointer, stack, s); - *s = s_copy2; - //If any threads want to go to the left - if(__any(flip(nodeMinor(cd,query_point,p)))) - { - //If I did not want this decision - if(!flip(nodeMinor(cd,query_point,p))) - { goto exit; } - s->n = childLeft(n); - //Execute branch down. Eventually this will be doing through dynamic parrallelism instead of direct recursion - recursive_warp_search(static_data, query_point, _Mask, output, stackpointer, stack, s); - } - } - //If the group would rather branch left second - else - { - s->n = childLeft(n); - //Execute branch down. Eventually this will be doing through dynamic parrallelism instead of direct recursion - recursive_warp_search(static_data, query_point, _Mask, output, stackpointer, stack, s); - *s = s_copy2; - //If any threads want to go to the left - if(__any(flip(nodeMinor(cd,query_point,p)))) - { - //If I did not want this decision - if(!flip(nodeMinor(cd,query_point,p))) - { goto exit; } - s->n = childRight(n); - //Execute branch down. Eventually this will be doing through dynamic parrallelism instead of direct recursion - recursive_warp_search(static_data, query_point, _Mask, output, stackpointer, stack, s); - } - } - //We just came out of that branch, so lets set where we currently are in the stack back to s - *s = s_copy1; - //If any threads wanted to go right instead of left, branch that way now. 
In a worst case scenario, half the threads from the group are now gone - if(__any(flip(nodeMinor(cd,query_point,p)))) - { - s->n = childLeft(n); - } - } } - //We want to branch onside and not offside - else if((_Mask== 0) || ((check == true) && (__any(flip(nodeMinor(cd,query_point,p)))))) + //Otherwise we branch left + else { - //Make another copy of s to come back to late - stack_entry s_copy2 = *s; - _Mask = _Mask & __ballot(nodeMinor(cd,query_point,p)); - *s = stack[*stackpointer]; - s->state = ONSIDE; - //We're going to branch to the right - stackpointer++; - if(_Mask !=0) + s->n = childLeft(n); + recursive_warp_search(static_data, query_point, _Mask, output, + stackpointer, stack, s); + stackpointer++; + s = stack[stackpointer]; + if(output->current_count > K_size) + { + /*Exit the kernel if we have all of the points that we need. Since all of the points are clustered, hopefully this is greatly reduced and all threads exit + at near the same time*/ + return; + } + //If any of the remaining active threads are within bounds of the right node + if(__any(withinBounds(cd,query_point, p, heapHeadValue(ouput->entries), max_rad, max_error) { s->n = childRight(n); - //Execute branch down. Eventually this will be doing through dynamic parrallelism instead of direct recursion - recursive_warp_search(static_data, query_point, _Mask, output, stackpointer, stack, s); - *s = s_copy2; - //If any threads want to go to the left - if(__any(flip(nodeMinor(cd,query_point,p)))) - { - //If I did not want this decision - if(!flip(nodeMinor(cd,query_point,p))) - { goto exit; } - s->n = childLeft(n); - //Execute branch down. Eventually this will be doing through dynamic parrallelism instead of direct recursion - recursive_warp_search(static_data, query_point, _Mask, output, stackpointer, stack, s); - } + recursive_warp_search(static_data, query_point, _Mask, output, + stackpointer, stack, s); + stackpointer++; } - //If the group would rather branch left second - else - { - s->n = childLeft(n); - //Execute branch down. Eventually this will be doing through dynamic parrallelism instead of direct recursion - recursive_warp_search(static_data, query_point, _Mask, output, stackpointer, stack, s); - *s = s_copy2; - //If any threads want to go to the left - if(__any(flip(nodeMinor(cd,query_point,p)))) - { - //If I did not want this decision - if(!flip(nodeMinor(cd,query_point,p))) - { goto exit; } - s->n = childRight(n); - //Execute branch down. Eventually this will be doing through dynamic parrallelism instead of direct recursion - recursive_warp_search(static_data, query_point, _Mask, output, stackpointer, stack, s); - } - } } -exit: - return; //TODO: Sort } @@ -354,8 +227,7 @@ __global__ void clustered_search(indx static_data, const point *query_points, in myHeap.current_count = 0; //Start at root node stack_entry* s = stack; - uint *startpos; - *startpos = 0; + uint startpos = 1; recursive_warp_search(static_data, query_points[thread_num], 1, &myHeap, startpos,s,stack); ret[thread_num] = myHeap; } From ad270ac747f1889974230e8726d13bfbf8cbbce7 Mon Sep 17 00:00:00 2001 From: MatrixCompSci Date: Sun, 12 Apr 2015 12:46:32 -0400 Subject: [PATCH 3/4] Syntax fix Fixed some syntax errors that were preventing it from compiling. 
---
 nabo/cuda/kd_node.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nabo/cuda/kd_node.cu b/nabo/cuda/kd_node.cu
index 4cb3ac3..9ecde32 100644
--- a/nabo/cuda/kd_node.cu
+++ b/nabo/cuda/kd_node.cu
@@ -179,7 +179,7 @@ __device__ void recursive_warp_search(const indx static_data, const point query_
 			return;
 		}
 		//If any of the remaining active threads are within bounds of the left node
-		if(__any(withinBounds(cd,query_point, p, heapHeadValue(ouput->entries), max_rad, max_error)
+		if(__any(withinBounds(cd,query_point, p, heapHeadValue(output->entries), max_rad, max_error)))
 		{
 			s->n = childLeft(n);
 			recursive_warp_search(static_data, query_point, _Mask, output,
@@ -202,7 +202,7 @@ __device__ void recursive_warp_search(const indx static_data, const point query_
 			return;
 		}
 		//If any of the remaining active threads are within bounds of the right node
-		if(__any(withinBounds(cd,query_point, p, heapHeadValue(ouput->entries), max_rad, max_error)
+		if(__any(withinBounds(cd,query_point, p, heapHeadValue(output->entries), max_rad, max_error)))
 		{
 			s->n = childRight(n);
 			recursive_warp_search(static_data, query_point, _Mask, output,

From 448b00882f1eaebae74e7945fbfad8b2d43c8afd Mon Sep 17 00:00:00 2001
From: MatrixCompSci
Date: Sun, 12 Apr 2015 13:11:23 -0400
Subject: [PATCH 4/4] Update README.md

Added a best-practice guide and an installation guide for CUDA.
---
 README.md | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/README.md b/README.md
index 4504d51..24dc874 100644
--- a/README.md
+++ b/README.md
@@ -48,6 +48,38 @@ libnabo provides the following compilation options, available through [CMake]:
 You can specify them with a command-line tool, `ccmake`, or with a graphical tool, `cmake-gui`.
 Please read the [CMake documentation] for more information.
 
+In order to utilize CUDA, please install the latest NVIDIA developer driver and at least CUDA 7.0:
+
+ * For Ubuntu or Debian, follow [these instructions](http://docs.nvidia.com/cuda/cuda-getting-started-guide-for-linux/#axzz3X7AKITTy).
+
+ * For Windows, follow [these instructions](http://docs.nvidia.com/cuda/cuda-getting-started-guide-for-microsoft-windows/#axzz3X7AKITTy). Please note that this code has, as of April 12th, 2015, NOT been tested on Windows.
+
+ * For Mac OS X, follow [these instructions](http://docs.nvidia.com/cuda/cuda-getting-started-guide-for-mac-os-x/#axzz3X7AKITTy). Clang 6.0 is required.
+
+Compute Model 3.2 or better is required. Compute Model 3.5 or better is recommended. To see which compute model you have, click [here](https://developer.nvidia.com/cuda-gpus).
+
+CUDA Best Practices
+-------------------
+
+Query points should be stored in a linearized array, in subgroups whose size is a multiple of 16. No more than 1024 groups should be queried at any given time.
+
+Subgroups should be ordered by their distance from the center of the group, from least to greatest. This will ensure minimal thread divergence and greatly improve performance.
+
+The smaller the maximum distance of any point in a subgroup from the center, the smaller the overall divergence.
+
+If a path proves to be too divergent, a new kernel will be launched and the query point subgroup will be split. This is suboptimal compared to a tightly packed subgroup, so avoid it at all costs. It is also only available on Compute Model 3.5.
+
+Currently, in order to change the dimension count, the kernel must be recompiled. This is primarily due to the simplicity of removing point striding from the in-development version. It will be added at a later date.
+
+In order to call CUDA from libnabo, please create a C to C++ wrapper and properly invoke it from the search function within kdtree_opencl.cpp. I have not created a full C++ wrapper for the CUDA code yet; however, I plan to do so eventually.
+
+For optimal performance, restrict BucketSize to 10. Eventually a method to dynamically set BucketSize will be used, as it is highly dependent on the GPU being utilized for the computations.
+
+Double precision is NOT available and I have no intention of adding it. This code is single precision only. If you wish to utilize doubles, modify it to your liking.
+
+When dynamic parallelism is finalized, tree depth will be limited to 22, as this is the standard limit for Compute Model 3.5. If NVIDIA alleviates this limit for consumer GPUs in the near future, or a workaround is discovered, I will remove this limit.
+
+While dynamic parallelism will be used for divergent paths, only 1024 paths will be allowed to launch new kernels at any given point. Therefore, if the number of paths that prove to be divergent is greater than 1024, please optimize or recluster your query set. A path is considered to be divergent if any given query point is outside of the KNN max_rad2 of the center of the cluster.
 
 Quick compilation and installation under Unix
 ---------------------------------------------
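To make the query layout described above concrete (consecutive clusters of 32 points, each sorted from least to greatest distance to its own center, with no point further than max_rad from that center, as stated in the first commit message and in the best-practice guide), here is a minimal host-side sketch of how such an array could be prepared before it is copied to the GPU. This is illustrative only and is not part of the patch: the helper names (`Point3`, `sqDist`, `sortClustersByCentroidDistance`) are hypothetical, and the sketch assumes that spatially close points have already been placed in the same 32-point block by some earlier binning step.

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

constexpr int CLUSTER_SIZE = 32;   // must match BLOCK_SIZE in nabo/cuda/kd_node.cu

struct Point3 { float x, y, z; };

// Squared Euclidean distance between two points.
static float sqDist(const Point3 &a, const Point3 &b)
{
    const float dx = a.x - b.x, dy = a.y - b.y, dz = a.z - b.z;
    return dx * dx + dy * dy + dz * dz;
}

// Sorts each consecutive block of CLUSTER_SIZE points by ascending distance
// from that block's centroid. Assumes points.size() is a multiple of
// CLUSTER_SIZE and that each block already contains spatially close points.
void sortClustersByCentroidDistance(std::vector<Point3> &points)
{
    for (std::size_t base = 0; base + CLUSTER_SIZE <= points.size(); base += CLUSTER_SIZE)
    {
        Point3 c = {0.0f, 0.0f, 0.0f};
        for (std::size_t i = base; i < base + CLUSTER_SIZE; ++i)
        {
            c.x += points[i].x;
            c.y += points[i].y;
            c.z += points[i].z;
        }
        c.x /= CLUSTER_SIZE;
        c.y /= CLUSTER_SIZE;
        c.z /= CLUSTER_SIZE;

        std::sort(points.begin() + base, points.begin() + base + CLUSTER_SIZE,
                  [&c](const Point3 &a, const Point3 &b)
                  { return sqDist(a, c) < sqDist(b, c); });
    }
}
```

How the initial grouping is produced should not matter to the kernel; the properties that appear to matter are the cluster size, the per-cluster ordering, and the max_rad bound on the furthest point.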
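The kernel comment in the first patch states that `clustered_search` is meant to be launched as 32x1 thread groups, one block per 32-point cluster, with the tree data (`indx`) already resident on the GPU and the query points copied per launch. The following wrapper is only a rough sketch of that launch geometry, not a working part of the patch: it assumes the `indx`, `point`, and `heap` definitions from `nabo/cuda/kd_node.cu` are visible, that `query_amt` is a multiple of 32, and it omits error checking and retrieval of the per-query result heaps, since that part of the code is still in flux. The function name `runClusteredSearch` is hypothetical.

```cpp
#include <cuda_runtime.h>

// Declaration of the kernel from nabo/cuda/kd_node.cu (requires separable compilation).
extern __global__ void clustered_search(indx static_data, const point *query_points,
                                        int *indices, heap *ret, int query_amt);

void runClusteredSearch(indx static_data_device,     // tree data already uploaded to the GPU
                        const point *host_queries,   // clustered, sorted query points on the host
                        int query_amt,               // total number of query points (multiple of 32)
                        int *device_indices,         // pre-allocated device output buffer
                        heap *device_results)        // pre-allocated device output buffer
{
    // Query points are copied per launch, as noted in the kernel comment.
    point *device_queries = nullptr;
    cudaMalloc(&device_queries, sizeof(point) * query_amt);
    cudaMemcpy(device_queries, host_queries, sizeof(point) * query_amt,
               cudaMemcpyHostToDevice);

    // One 32-thread block per cluster; the block size must equal the cluster size.
    const int blocks = query_amt / 32;
    clustered_search<<<blocks, 32>>>(static_data_device, device_queries,
                                     device_indices, device_results, query_amt);
    cudaDeviceSynchronize();

    cudaFree(device_queries);
}
```

Wrapping a function like this behind a plain C entry point is one way to satisfy the "C to C++ wrapper" suggestion above.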