Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
harrism committed Mar 4, 2014
2 parents a8d965f + 144ad9c commit fee0a91
Show file tree
Hide file tree
Showing 14 changed files with 530 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "posts/parallel_reduction_with_shfl/cub"]
path = posts/parallel_reduction_with_shfl/cub
url = https://github.com/NVLabs/cub.git
6 changes: 6 additions & 0 deletions posts/parallel_reduction_with_shfl/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Build the reduction benchmark. -arch=sm_35 targets Kepler; presumably
# required for the __shfl_down warp-shuffle intrinsics used by the
# kernels — confirm against warp_reduce.h.
reduce: main.cu *.h
nvcc -O3 main.cu -o reduce -arch=sm_35

# Remove the built binary.
clean:
rm -f reduce

21 changes: 21 additions & 0 deletions posts/parallel_reduction_with_shfl/block_reduce.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#pragma once

#include "warp_reduce.h"

// Block-wide sum reduction. Must be called by EVERY thread in the block
// (it contains __syncthreads()). The block total is returned in thread 0;
// other threads' return values are only partials. Supports up to 1024
// threads per block (32 warps, matching the shared[] staging array).
__inline__ __device__
int blockReduceSum(int val) {
static __shared__ int shared[32]; // one partial sum per warp
int lane=threadIdx.x%warpSize;
int wid=threadIdx.x/warpSize;
val=warpReduceSum(val); // each warp first reduces its own values

//write reduced value to shared memory
if(lane==0) shared[wid]=val;
__syncthreads();

// Number of warps in the block, rounded UP. The original truncating
// division (blockDim.x/warpSize) silently dropped the partial last
// warp's sum whenever blockDim.x was not a multiple of warpSize.
int nWarps=(blockDim.x+warpSize-1)/warpSize;

//ensure we only grab a value from shared memory if that warp existed
val = (lane<nWarps) ? shared[lane] : int(0);
// NOTE(review): warpReduceSum (warp_reduce.h, not visible here) is assumed
// to be well-defined for a partially-populated warp — confirm.
if(wid==0) val=warpReduceSum(val);

return val;
}
1 change: 1 addition & 0 deletions posts/parallel_reduction_with_shfl/cub
Submodule cub added at 93696c
64 changes: 64 additions & 0 deletions posts/parallel_reduction_with_shfl/device_reduce_atomic.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#pragma once

#include "fake_atomic.h"


// One atomicAdd per thread: each thread privately accumulates a grid-stride
// slice of in[0..N), then folds its partial into *out.
// *out must be zeroed before launch (the host wrapper does this).
__global__ void device_reduce_atomic_kernel(int *in, int* out, int N) {
  const int stride = blockDim.x * gridDim.x;
  int acc = 0;
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < N; idx += stride)
    acc += in[idx];
  atomicAdd(out, acc);
}

// Host launcher: zeroes *out asynchronously, then launches
// device_reduce_atomic_kernel on the default stream. The total lands in
// out[0]; the caller must synchronize before reading it on the host.
void device_reduce_atomic(int *in, int* out, int N) {
int threads=256;
int blocks=min((N+threads-1)/threads,2048);
// Fix: N==0 used to yield blocks==0, an invalid launch configuration
// (cudaErrorInvalidConfiguration). One block with no work is harmless.
if(blocks<1) blocks=1;

cudaMemsetAsync(out,0,sizeof(int));
device_reduce_atomic_kernel<<<blocks,threads>>>(in,out,N);
}

// Vectorized variant: reads int2 pairs (halving the load count), then lets
// low-indexed threads pick up the odd tail element when N is odd.
// NOTE(review): the int2* cast assumes `in` is 8-byte aligned — true for a
// cudaMalloc base pointer, confirm for offset pointers.
__global__ void device_reduce_atomic_kernel_vector2(int *in, int* out, int N) {
  const int tid    = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;
  const int pairs  = N / 2;
  int2 *in2 = reinterpret_cast<int2*>(in);

  int acc = 0;
  for (int p = tid; p < pairs; p += stride) {
    int2 v = in2[p];
    acc += v.x + v.y;
  }
  const int tail = tid + pairs * 2;
  if (tail < N)
    acc += in[tail];

  atomicAdd(out, acc);
}

// Host launcher for the int2-vectorized atomic reduction (default stream).
// Zeroes *out; caller synchronizes before reading out[0].
void device_reduce_atomic_vector2(int *in, int* out, int N) {
int threads=256;
int blocks=min((N/2+threads-1)/threads,2048);
// Fix: for N<2 (including N==0) the expression above yields blocks==0 —
// an invalid launch configuration, and the lone tail element would never
// be added. Clamp to one block.
if(blocks<1) blocks=1;

cudaMemsetAsync(out,0,sizeof(int));
device_reduce_atomic_kernel_vector2<<<blocks,threads>>>(in,out,N);
}

// int4-vectorized variant: quarters the number of load instructions; up to
// three tail elements are handled by threads tid=0..2.
// NOTE(review): the int4* cast assumes `in` is 16-byte aligned — confirm.
__global__ void device_reduce_atomic_kernel_vector4(int *in, int* out, int N) {
  const int tid    = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;
  const int quads  = N / 4;
  int4 *in4 = reinterpret_cast<int4*>(in);

  int acc = 0;
  for (int q = tid; q < quads; q += stride) {
    int4 v = in4[q];
    acc += (v.x + v.y) + (v.z + v.w);
  }
  const int tail = tid + quads * 4;
  if (tail < N)
    acc += in[tail];

  atomicAdd(out, acc);
}

// Host launcher for the int4-vectorized atomic reduction (default stream).
// Zeroes *out; caller synchronizes before reading out[0].
void device_reduce_atomic_vector4(int *in, int* out, int N) {
int threads=256;
int blocks=min((N/4+threads-1)/threads,2048);
// Fix: for N<4 the expression above yields blocks==0 — an invalid launch
// configuration, and the tail elements would never be added.
if(blocks<1) blocks=1;

cudaMemsetAsync(out,0,sizeof(int));
device_reduce_atomic_kernel_vector4<<<blocks,threads>>>(in,out,N);
}
69 changes: 69 additions & 0 deletions posts/parallel_reduction_with_shfl/device_reduce_block_atomic.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#pragma once

#include "fake_atomic.h"
#include "block_reduce.h"

// Grid-stride accumulate, then one atomicAdd per BLOCK: partials are first
// collapsed with blockReduceSum, so only thread 0 touches *out.
// *out must be zeroed before launch (the host wrapper does this).
__global__ void device_reduce_block_atomic_kernel(int *in, int* out, int N) {
  const int stride = blockDim.x * gridDim.x;
  int acc = 0;
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < N; idx += stride)
    acc += in[idx];
  // blockReduceSum synchronizes, so every thread must reach this call.
  acc = blockReduceSum(acc);
  if (threadIdx.x == 0)
    atomicAdd(out, acc);
}

// Host launcher: zeroes *out, launches the block-atomic reduction on the
// default stream. Caller synchronizes before reading out[0].
void device_reduce_block_atomic(int *in, int* out, int N) {
int threads=256;
int blocks=min((N+threads-1)/threads,2048);
// Fix: N==0 used to yield blocks==0, an invalid launch configuration.
if(blocks<1) blocks=1;

cudaMemsetAsync(out,0,sizeof(int));
device_reduce_block_atomic_kernel<<<blocks,threads>>>(in,out,N);
}

// int2-vectorized block-atomic reduction: pairwise loads, per-block
// reduction via blockReduceSum, one atomicAdd per block from thread 0.
// NOTE(review): the int2* cast assumes `in` is 8-byte aligned — confirm.
__global__ void device_reduce_block_atomic_kernel_vector2(int *in, int* out, int N) {
  const int tid    = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;
  const int pairs  = N / 2;
  int2 *in2 = reinterpret_cast<int2*>(in);

  int acc = 0;
  for (int p = tid; p < pairs; p += stride) {
    int2 v = in2[p];
    acc += v.x + v.y;
  }
  const int tail = tid + pairs * 2;
  if (tail < N)
    acc += in[tail];
  // Unconditional: blockReduceSum contains __syncthreads().
  acc = blockReduceSum(acc);
  if (threadIdx.x == 0)
    atomicAdd(out, acc);
}

// Host launcher for the int2-vectorized block-atomic reduction.
// Zeroes *out; caller synchronizes before reading out[0].
void device_reduce_block_atomic_vector2(int *in, int* out, int N) {
int threads=256;
int blocks=min((N/2+threads-1)/threads,2048);
// Fix: N<2 used to yield blocks==0 — invalid launch, tail never summed.
if(blocks<1) blocks=1;

cudaMemsetAsync(out,0,sizeof(int));
device_reduce_block_atomic_kernel_vector2<<<blocks,threads>>>(in,out,N);
}

// int4-vectorized block-atomic reduction: quad loads, per-block reduction,
// one atomicAdd per block. Up to three tail elements go to threads tid=0..2.
// NOTE(review): the int4* cast assumes `in` is 16-byte aligned — confirm.
__global__ void device_reduce_block_atomic_kernel_vector4(int *in, int* out, int N) {
  const int tid    = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;
  const int quads  = N / 4;
  int4 *in4 = reinterpret_cast<int4*>(in);

  int acc = 0;
  for (int q = tid; q < quads; q += stride) {
    int4 v = in4[q];
    acc += (v.x + v.y) + (v.z + v.w);
  }
  const int tail = tid + quads * 4;
  if (tail < N)
    acc += in[tail];

  // Unconditional: blockReduceSum contains __syncthreads().
  acc = blockReduceSum(acc);
  if (threadIdx.x == 0)
    atomicAdd(out, acc);
}

// Host launcher for the int4-vectorized block-atomic reduction.
// Zeroes *out; caller synchronizes before reading out[0].
void device_reduce_block_atomic_vector4(int *in, int* out, int N) {
int threads=256;
int blocks=min((N/4+threads-1)/threads,2048);
// Fix: N<4 used to yield blocks==0 — invalid launch, tail never summed.
if(blocks<1) blocks=1;

cudaMemsetAsync(out,0,sizeof(int));
device_reduce_block_atomic_kernel_vector4<<<blocks,threads>>>(in,out,N);
}
69 changes: 69 additions & 0 deletions posts/parallel_reduction_with_shfl/device_reduce_stable.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#pragma once

#include "block_reduce.h"

// Atomic-free pass: each block writes ONE partial sum to out[blockIdx.x].
// With a fixed launch configuration the summation order is fixed, so the
// result is deterministic ("stable") run to run.
__global__ void device_reduce_stable_kernel(int *in, int* out, int N) {
  const int stride = blockDim.x * gridDim.x;
  int acc = 0;
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < N; idx += stride)
    acc += in[idx];
  // blockReduceSum synchronizes; all threads must reach it.
  acc = blockReduceSum(acc);
  if (threadIdx.x == 0)
    out[blockIdx.x] = acc;
}

// Two-pass deterministic reduction (no atomics). Pass 1 writes one partial
// per block into out[0..blocks); pass 2 reduces those partials with a single
// 1024-thread block, leaving the total in out[0].
// NOTE(review): out must have room for `blocks` ints (up to 1024) — confirm
// callers allocate accordingly.
void device_reduce_stable(int *in, int* out, int N) {
int threads=512;
int blocks=min((N+threads-1)/threads,1024);
// Fix: N==0 used to yield blocks==0, an invalid launch configuration.
if(blocks<1) blocks=1;

device_reduce_stable_kernel<<<blocks,threads>>>(in,out,N);
device_reduce_stable_kernel<<<1,1024>>>(out,out,blocks);
}

// int2-vectorized stable pass: pairwise loads, one partial per block in
// out[blockIdx.x]; the odd tail element goes to thread tid==0's range.
// NOTE(review): the int2* cast assumes `in` is 8-byte aligned — confirm.
__global__ void device_reduce_stable_kernel_vector2(int *in, int* out, int N) {
  const int tid    = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;
  const int pairs  = N / 2;
  int2 *in2 = reinterpret_cast<int2*>(in);

  int acc = 0;
  for (int p = tid; p < pairs; p += stride) {
    int2 v = in2[p];
    acc += v.x + v.y;
  }
  const int tail = tid + pairs * 2;
  if (tail < N)
    acc += in[tail];
  // Unconditional: blockReduceSum contains __syncthreads().
  acc = blockReduceSum(acc);
  if (threadIdx.x == 0)
    out[blockIdx.x] = acc;
}

// Two-pass deterministic reduction, int2-vectorized first pass.
// NOTE(review): out must hold `blocks` ints (up to 1024) — confirm callers.
void device_reduce_stable_vector2(int *in, int* out, int N) {
int threads=512;
int blocks=min((N/2+threads-1)/threads,1024);
// Fix: N<2 used to yield blocks==0 — invalid launch, tail never summed.
if(blocks<1) blocks=1;

device_reduce_stable_kernel_vector2<<<blocks,threads>>>(in,out,N);
device_reduce_stable_kernel<<<1,1024>>>(out,out,blocks);
}

// int4-vectorized stable pass: quad loads, one partial per block in
// out[blockIdx.x]; up to three tail elements handled by threads tid=0..2.
// NOTE(review): the int4* cast assumes `in` is 16-byte aligned — confirm.
__global__ void device_reduce_stable_kernel_vector4(int *in, int* out, int N) {
  const int tid    = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;
  const int quads  = N / 4;
  int4 *in4 = reinterpret_cast<int4*>(in);

  int acc = 0;
  for (int q = tid; q < quads; q += stride) {
    int4 v = in4[q];
    acc += (v.x + v.y) + (v.z + v.w);
  }
  const int tail = tid + quads * 4;
  if (tail < N)
    acc += in[tail];

  // Unconditional: blockReduceSum contains __syncthreads().
  acc = blockReduceSum(acc);
  if (threadIdx.x == 0)
    out[blockIdx.x] = acc;
}

// Two-pass deterministic reduction, int4-vectorized first pass.
// NOTE(review): out must hold `blocks` ints (up to 1024) — confirm callers.
void device_reduce_stable_vector4(int *in, int* out, int N) {
int threads=512;
int blocks=min((N/4+threads-1)/threads,1024);
// Fix: N<4 used to yield blocks==0 — invalid launch, tail never summed.
if(blocks<1) blocks=1;

device_reduce_stable_kernel_vector4<<<blocks,threads>>>(in,out,N);
device_reduce_stable_kernel<<<1,1024>>>(out,out,blocks);
}
69 changes: 69 additions & 0 deletions posts/parallel_reduction_with_shfl/device_reduce_warp_atomic.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#pragma once

#include "fake_atomic.h"
#include "warp_reduce.h"

// Grid-stride accumulate, then one atomicAdd per WARP: partials are folded
// with warpReduceSum and lane 0 of each warp commits the warp total.
// *out must be zeroed before launch (the host wrapper does this).
__global__ void device_reduce_warp_atomic_kernel(int *in, int* out, int N) {
  const int stride = blockDim.x * gridDim.x;
  int acc = 0;
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < N; idx += stride)
    acc += in[idx];
  acc = warpReduceSum(acc);
  if (threadIdx.x % warpSize == 0)   // lane 0 holds the warp's total
    atomicAdd(out, acc);
}

// Host launcher: zeroes *out, launches the warp-atomic reduction on the
// default stream. Caller synchronizes before reading out[0].
void device_reduce_warp_atomic(int *in, int* out, int N) {
int threads=256;
int blocks=min((N+threads-1)/threads,2048);
// Fix: N==0 used to yield blocks==0, an invalid launch configuration.
if(blocks<1) blocks=1;

cudaMemsetAsync(out,0,sizeof(int));
device_reduce_warp_atomic_kernel<<<blocks,threads>>>(in,out,N);
}

// int2-vectorized warp-atomic reduction: pairwise loads, warp-level fold,
// one atomicAdd per warp from lane 0.
// NOTE(review): the int2* cast assumes `in` is 8-byte aligned — confirm.
__global__ void device_reduce_warp_atomic_kernel_vector2(int *in, int* out, int N) {
  const int tid    = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;
  const int pairs  = N / 2;
  int2 *in2 = reinterpret_cast<int2*>(in);

  int acc = 0;
  for (int p = tid; p < pairs; p += stride) {
    int2 v = in2[p];
    acc += v.x + v.y;
  }
  const int tail = tid + pairs * 2;
  if (tail < N)
    acc += in[tail];
  acc = warpReduceSum(acc);
  if (threadIdx.x % warpSize == 0)
    atomicAdd(out, acc);
}

// Host launcher for the int2-vectorized warp-atomic reduction.
// Zeroes *out; caller synchronizes before reading out[0].
void device_reduce_warp_atomic_vector2(int *in, int* out, int N) {
int threads=256;
int blocks=min((N/2+threads-1)/threads,2048);
// Fix: N<2 used to yield blocks==0 — invalid launch, tail never summed.
if(blocks<1) blocks=1;

cudaMemsetAsync(out,0,sizeof(int));
device_reduce_warp_atomic_kernel_vector2<<<blocks,threads>>>(in,out,N);
}

// int4-vectorized warp-atomic reduction: quad loads, warp-level fold,
// one atomicAdd per warp. Up to three tail elements go to threads tid=0..2.
// NOTE(review): the int4* cast assumes `in` is 16-byte aligned — confirm.
__global__ void device_reduce_warp_atomic_kernel_vector4(int *in, int* out, int N) {
  const int tid    = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;
  const int quads  = N / 4;
  int4 *in4 = reinterpret_cast<int4*>(in);

  int acc = 0;
  for (int q = tid; q < quads; q += stride) {
    int4 v = in4[q];
    acc += (v.x + v.y) + (v.z + v.w);
  }
  const int tail = tid + quads * 4;
  if (tail < N)
    acc += in[tail];

  acc = warpReduceSum(acc);
  if (threadIdx.x % warpSize == 0)
    atomicAdd(out, acc);
}

// Host launcher for the int4-vectorized warp-atomic reduction.
// Zeroes *out; caller synchronizes before reading out[0].
void device_reduce_warp_atomic_vector4(int *in, int* out, int N) {
int threads=256;
int blocks=min((N/4+threads-1)/threads,2048);
// Fix: N<4 used to yield blocks==0 — invalid launch, tail never summed.
if(blocks<1) blocks=1;

cudaMemsetAsync(out,0,sizeof(int));
device_reduce_warp_atomic_kernel_vector4<<<blocks,threads>>>(in,out,N);
}
10 changes: 10 additions & 0 deletions posts/parallel_reduction_with_shfl/fake_atomic.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#pragma once

// Dead code: a stand-in for atomicAdd, compiled out via `#if 0`.
// If enabled, it performs a plain NON-atomic read-modify-write, so results
// under concurrent updates would be wrong — presumably intended only for
// single-threaded debugging or emulation builds; confirm before enabling.
#if 0
template <class T>
__device__ __inline__
void atomicAdd(T* ptr, T val) {
*ptr+=val;
}
#endif

17 changes: 17 additions & 0 deletions posts/parallel_reduction_with_shfl/fake_shfl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#pragma once

// Shared-memory emulation of the __shfl_down warp shuffle, presumably for
// debugging or pre-Kepler GPUs; enable by uncommenting the #define.
// NOTE(review): the staging array is MAX_BLOCK (512) ints, but some
// launchers in this repo use 1024-thread blocks — confirm before enabling.
//#define __shfl_down fake_shfl_down
#define MAX_BLOCK 512
__inline__ __device__
int fake_shfl_down(int val, int offset, int width=32) {
static __shared__ int shared[MAX_BLOCK];
int lane=threadIdx.x%32;

// Publish every thread's value, then read the value `offset` lanes ahead.
shared[threadIdx.x]=val;
__syncthreads();

// Out-of-width lanes get 0 (real __shfl_down leaves the value unchanged
// instead — NOTE(review): confirm callers don't rely on that difference).
val = (lane+offset<width) ? shared[threadIdx.x+offset] : 0;
// Second barrier keeps a subsequent call from overwriting shared[] before
// every thread has finished reading it.
__syncthreads();

return val;
}
17 changes: 17 additions & 0 deletions posts/parallel_reduction_with_shfl/getdata.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash

# Sweep problem sizes BASE^LOW .. BASE^HIGH and print one row of timings
# (from ./reduce) per size, preceded by a header row of column labels.
BASE=2
LOW=10
HIGH=27

# Column labels come from the benchmark's own output: the field before ':'.
HEADER=$(./reduce 100 1 | grep -v NUM_ELEMS | cut -d ":" -f 1)

HEADER="SIZE $HEADER"
echo $HEADER
for (( exp=$LOW; exp<=$HIGH; exp++ ))
do
size=$(echo "$BASE^$exp" | bc)
TIMES=$(./reduce $size 100 | grep -v NUM_ELEMS | cut -d ":" -f 4 | cut -f 2 -d " ")
bytes=$(echo "$size*4" | bc)
echo $bytes $TIMES
done
Loading

0 comments on commit fee0a91

Please sign in to comment.