Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
harrism committed Mar 4, 2014
2 parents a8d965f + 144ad9c commit fee0a91
Show file tree
Hide file tree
Showing 14 changed files with 530 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "posts/parallel_reduction_with_shfl/cub"]
path = posts/parallel_reduction_with_shfl/cub
url = https://github.com/NVLabs/cub.git
6 changes: 6 additions & 0 deletions posts/parallel_reduction_with_shfl/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Build the reduction benchmark. -arch=sm_35 targets Kepler; presumably
# required for the __shfl_down warp-shuffle intrinsics used by the
# kernels — confirm against warp_reduce.h.
reduce: main.cu *.h
nvcc -O3 main.cu -o reduce -arch=sm_35

# Remove the built binary.
clean:
rm -f reduce

21 changes: 21 additions & 0 deletions posts/parallel_reduction_with_shfl/block_reduce.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#pragma once

#include "warp_reduce.h"

// Block-wide sum reduction. Must be called by EVERY thread in the block
// (it contains __syncthreads()). The block total is returned in thread 0;
// other threads' return values are only partials. Supports up to 1024
// threads per block (32 warps, matching the shared[] staging array).
__inline__ __device__
int blockReduceSum(int val) {
static __shared__ int shared[32]; // one partial sum per warp
int lane=threadIdx.x%warpSize;
int wid=threadIdx.x/warpSize;
val=warpReduceSum(val); // each warp first reduces its own values

//write reduced value to shared memory
if(lane==0) shared[wid]=val;
__syncthreads();

// Number of warps in the block, rounded UP. The original truncating
// division (blockDim.x/warpSize) silently dropped the partial last
// warp's sum whenever blockDim.x was not a multiple of warpSize.
int nWarps=(blockDim.x+warpSize-1)/warpSize;

//ensure we only grab a value from shared memory if that warp existed
val = (lane<nWarps) ? shared[lane] : int(0);
// NOTE(review): warpReduceSum (warp_reduce.h, not visible here) is assumed
// to be well-defined for a partially-populated warp — confirm.
if(wid==0) val=warpReduceSum(val);

return val;
}
1 change: 1 addition & 0 deletions posts/parallel_reduction_with_shfl/cub
Submodule cub added at 93696c
64 changes: 64 additions & 0 deletions posts/parallel_reduction_with_shfl/device_reduce_atomic.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#pragma once

#include "fake_atomic.h"


// One atomicAdd per thread: each thread privately accumulates a grid-stride
// slice of in[0..N), then folds its partial into *out.
// *out must be zeroed before launch (the host wrapper does this).
__global__ void device_reduce_atomic_kernel(int *in, int* out, int N) {
  const int stride = blockDim.x * gridDim.x;
  int acc = 0;
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < N; idx += stride)
    acc += in[idx];
  atomicAdd(out, acc);
}

// Host launcher: zeroes *out asynchronously, then launches
// device_reduce_atomic_kernel on the default stream. The total lands in
// out[0]; the caller must synchronize before reading it on the host.
void device_reduce_atomic(int *in, int* out, int N) {
int threads=256;
int blocks=min((N+threads-1)/threads,2048);
// Fix: N==0 used to yield blocks==0, an invalid launch configuration
// (cudaErrorInvalidConfiguration). One block with no work is harmless.
if(blocks<1) blocks=1;

cudaMemsetAsync(out,0,sizeof(int));
device_reduce_atomic_kernel<<<blocks,threads>>>(in,out,N);
}

// Vectorized variant: reads int2 pairs (halving the load count), then lets
// low-indexed threads pick up the odd tail element when N is odd.
// NOTE(review): the int2* cast assumes `in` is 8-byte aligned — true for a
// cudaMalloc base pointer, confirm for offset pointers.
__global__ void device_reduce_atomic_kernel_vector2(int *in, int* out, int N) {
  const int tid    = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;
  const int pairs  = N / 2;
  int2 *in2 = reinterpret_cast<int2*>(in);

  int acc = 0;
  for (int p = tid; p < pairs; p += stride) {
    int2 v = in2[p];
    acc += v.x + v.y;
  }
  const int tail = tid + pairs * 2;
  if (tail < N)
    acc += in[tail];

  atomicAdd(out, acc);
}

// Host launcher for the int2-vectorized atomic reduction (default stream).
// Zeroes *out; caller synchronizes before reading out[0].
void device_reduce_atomic_vector2(int *in, int* out, int N) {
int threads=256;
int blocks=min((N/2+threads-1)/threads,2048);
// Fix: for N<2 (including N==0) the expression above yields blocks==0 —
// an invalid launch configuration, and the lone tail element would never
// be added. Clamp to one block.
if(blocks<1) blocks=1;

cudaMemsetAsync(out,0,sizeof(int));
device_reduce_atomic_kernel_vector2<<<blocks,threads>>>(in,out,N);
}

// int4-vectorized variant: quarters the number of load instructions; up to
// three tail elements are handled by threads tid=0..2.
// NOTE(review): the int4* cast assumes `in` is 16-byte aligned — confirm.
__global__ void device_reduce_atomic_kernel_vector4(int *in, int* out, int N) {
  const int tid    = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;
  const int quads  = N / 4;
  int4 *in4 = reinterpret_cast<int4*>(in);

  int acc = 0;
  for (int q = tid; q < quads; q += stride) {
    int4 v = in4[q];
    acc += (v.x + v.y) + (v.z + v.w);
  }
  const int tail = tid + quads * 4;
  if (tail < N)
    acc += in[tail];

  atomicAdd(out, acc);
}

// Host launcher for the int4-vectorized atomic reduction (default stream).
// Zeroes *out; caller synchronizes before reading out[0].
void device_reduce_atomic_vector4(int *in, int* out, int N) {
int threads=256;
int blocks=min((N/4+threads-1)/threads,2048);
// Fix: for N<4 the expression above yields blocks==0 — an invalid launch
// configuration, and the tail elements would never be added.
if(blocks<1) blocks=1;

cudaMemsetAsync(out,0,sizeof(int));
device_reduce_atomic_kernel_vector4<<<blocks,threads>>>(in,out,N);
}
69 changes: 69 additions & 0 deletions posts/parallel_reduction_with_shfl/device_reduce_block_atomic.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#pragma once

#include "fake_atomic.h"
#include "block_reduce.h"

// Grid-stride accumulate, then one atomicAdd per BLOCK: partials are first
// collapsed with blockReduceSum, so only thread 0 touches *out.
// *out must be zeroed before launch (the host wrapper does this).
__global__ void device_reduce_block_atomic_kernel(int *in, int* out, int N) {
  const int stride = blockDim.x * gridDim.x;
  int acc = 0;
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < N; idx += stride)
    acc += in[idx];
  // blockReduceSum synchronizes, so every thread must reach this call.
  acc = blockReduceSum(acc);
  if (threadIdx.x == 0)
    atomicAdd(out, acc);
}

// Host launcher: zeroes *out, launches the block-atomic reduction on the
// default stream. Caller synchronizes before reading out[0].
void device_reduce_block_atomic(int *in, int* out, int N) {
int threads=256;
int blocks=min((N+threads-1)/threads,2048);
// Fix: N==0 used to yield blocks==0, an invalid launch configuration.
if(blocks<1) blocks=1;

cudaMemsetAsync(out,0,sizeof(int));
device_reduce_block_atomic_kernel<<<blocks,threads>>>(in,out,N);
}

// int2-vectorized block-atomic reduction: pairwise loads, per-block
// reduction via blockReduceSum, one atomicAdd per block from thread 0.
// NOTE(review): the int2* cast assumes `in` is 8-byte aligned — confirm.
__global__ void device_reduce_block_atomic_kernel_vector2(int *in, int* out, int N) {
  const int tid    = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;
  const int pairs  = N / 2;
  int2 *in2 = reinterpret_cast<int2*>(in);

  int acc = 0;
  for (int p = tid; p < pairs; p += stride) {
    int2 v = in2[p];
    acc += v.x + v.y;
  }
  const int tail = tid + pairs * 2;
  if (tail < N)
    acc += in[tail];
  // Unconditional: blockReduceSum contains __syncthreads().
  acc = blockReduceSum(acc);
  if (threadIdx.x == 0)
    atomicAdd(out, acc);
}

// Host launcher for the int2-vectorized block-atomic reduction.
// Zeroes *out; caller synchronizes before reading out[0].
void device_reduce_block_atomic_vector2(int *in, int* out, int N) {
int threads=256;
int blocks=min((N/2+threads-1)/threads,2048);
// Fix: N<2 used to yield blocks==0 — invalid launch, tail never summed.
if(blocks<1) blocks=1;

cudaMemsetAsync(out,0,sizeof(int));
device_reduce_block_atomic_kernel_vector2<<<blocks,threads>>>(in,out,N);
}

// int4-vectorized block-atomic reduction: quad loads, per-block reduction,
// one atomicAdd per block. Up to three tail elements go to threads tid=0..2.
// NOTE(review): the int4* cast assumes `in` is 16-byte aligned — confirm.
__global__ void device_reduce_block_atomic_kernel_vector4(int *in, int* out, int N) {
  const int tid    = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;
  const int quads  = N / 4;
  int4 *in4 = reinterpret_cast<int4*>(in);

  int acc = 0;
  for (int q = tid; q < quads; q += stride) {
    int4 v = in4[q];
    acc += (v.x + v.y) + (v.z + v.w);
  }
  const int tail = tid + quads * 4;
  if (tail < N)
    acc += in[tail];

  // Unconditional: blockReduceSum contains __syncthreads().
  acc = blockReduceSum(acc);
  if (threadIdx.x == 0)
    atomicAdd(out, acc);
}

// Host launcher for the int4-vectorized block-atomic reduction.
// Zeroes *out; caller synchronizes before reading out[0].
void device_reduce_block_atomic_vector4(int *in, int* out, int N) {
int threads=256;
int blocks=min((N/4+threads-1)/threads,2048);
// Fix: N<4 used to yield blocks==0 — invalid launch, tail never summed.
if(blocks<1) blocks=1;

cudaMemsetAsync(out,0,sizeof(int));
device_reduce_block_atomic_kernel_vector4<<<blocks,threads>>>(in,out,N);
}
69 changes: 69 additions & 0 deletions posts/parallel_reduction_with_shfl/device_reduce_stable.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#pragma once

#include "block_reduce.h"

// Atomic-free pass: each block writes ONE partial sum to out[blockIdx.x].
// With a fixed launch configuration the summation order is fixed, so the
// result is deterministic ("stable") run to run.
__global__ void device_reduce_stable_kernel(int *in, int* out, int N) {
  const int stride = blockDim.x * gridDim.x;
  int acc = 0;
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < N; idx += stride)
    acc += in[idx];
  // blockReduceSum synchronizes; all threads must reach it.
  acc = blockReduceSum(acc);
  if (threadIdx.x == 0)
    out[blockIdx.x] = acc;
}

// Two-pass deterministic reduction (no atomics). Pass 1 writes one partial
// per block into out[0..blocks); pass 2 reduces those partials with a single
// 1024-thread block, leaving the total in out[0].
// NOTE(review): out must have room for `blocks` ints (up to 1024) — confirm
// callers allocate accordingly.
void device_reduce_stable(int *in, int* out, int N) {
int threads=512;
int blocks=min((N+threads-1)/threads,1024);
// Fix: N==0 used to yield blocks==0, an invalid launch configuration.
if(blocks<1) blocks=1;

device_reduce_stable_kernel<<<blocks,threads>>>(in,out,N);
device_reduce_stable_kernel<<<1,1024>>>(out,out,blocks);
}

// int2-vectorized stable pass: pairwise loads, one partial per block in
// out[blockIdx.x]; the odd tail element goes to thread tid==0's range.
// NOTE(review): the int2* cast assumes `in` is 8-byte aligned — confirm.
__global__ void device_reduce_stable_kernel_vector2(int *in, int* out, int N) {
  const int tid    = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;
  const int pairs  = N / 2;
  int2 *in2 = reinterpret_cast<int2*>(in);

  int acc = 0;
  for (int p = tid; p < pairs; p += stride) {
    int2 v = in2[p];
    acc += v.x + v.y;
  }
  const int tail = tid + pairs * 2;
  if (tail < N)
    acc += in[tail];
  // Unconditional: blockReduceSum contains __syncthreads().
  acc = blockReduceSum(acc);
  if (threadIdx.x == 0)
    out[blockIdx.x] = acc;
}

// Two-pass deterministic reduction, int2-vectorized first pass.
// NOTE(review): out must hold `blocks` ints (up to 1024) — confirm callers.
void device_reduce_stable_vector2(int *in, int* out, int N) {
int threads=512;
int blocks=min((N/2+threads-1)/threads,1024);
// Fix: N<2 used to yield blocks==0 — invalid launch, tail never summed.
if(blocks<1) blocks=1;

device_reduce_stable_kernel_vector2<<<blocks,threads>>>(in,out,N);
device_reduce_stable_kernel<<<1,1024>>>(out,out,blocks);
}

// int4-vectorized stable pass: quad loads, one partial per block in
// out[blockIdx.x]; up to three tail elements handled by threads tid=0..2.
// NOTE(review): the int4* cast assumes `in` is 16-byte aligned — confirm.
__global__ void device_reduce_stable_kernel_vector4(int *in, int* out, int N) {
  const int tid    = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;
  const int quads  = N / 4;
  int4 *in4 = reinterpret_cast<int4*>(in);

  int acc = 0;
  for (int q = tid; q < quads; q += stride) {
    int4 v = in4[q];
    acc += (v.x + v.y) + (v.z + v.w);
  }
  const int tail = tid + quads * 4;
  if (tail < N)
    acc += in[tail];

  // Unconditional: blockReduceSum contains __syncthreads().
  acc = blockReduceSum(acc);
  if (threadIdx.x == 0)
    out[blockIdx.x] = acc;
}

// Two-pass deterministic reduction, int4-vectorized first pass.
// NOTE(review): out must hold `blocks` ints (up to 1024) — confirm callers.
void device_reduce_stable_vector4(int *in, int* out, int N) {
int threads=512;
int blocks=min((N/4+threads-1)/threads,1024);
// Fix: N<4 used to yield blocks==0 — invalid launch, tail never summed.
if(blocks<1) blocks=1;

device_reduce_stable_kernel_vector4<<<blocks,threads>>>(in,out,N);
device_reduce_stable_kernel<<<1,1024>>>(out,out,blocks);
}
69 changes: 69 additions & 0 deletions posts/parallel_reduction_with_shfl/device_reduce_warp_atomic.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#pragma once

#include "fake_atomic.h"
#include "warp_reduce.h"

// Grid-stride accumulate, then one atomicAdd per WARP: partials are folded
// with warpReduceSum and lane 0 of each warp commits the warp total.
// *out must be zeroed before launch (the host wrapper does this).
__global__ void device_reduce_warp_atomic_kernel(int *in, int* out, int N) {
  const int stride = blockDim.x * gridDim.x;
  int acc = 0;
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < N; idx += stride)
    acc += in[idx];
  acc = warpReduceSum(acc);
  if (threadIdx.x % warpSize == 0)   // lane 0 holds the warp's total
    atomicAdd(out, acc);
}

// Host launcher: zeroes *out, launches the warp-atomic reduction on the
// default stream. Caller synchronizes before reading out[0].
void device_reduce_warp_atomic(int *in, int* out, int N) {
int threads=256;
int blocks=min((N+threads-1)/threads,2048);
// Fix: N==0 used to yield blocks==0, an invalid launch configuration.
if(blocks<1) blocks=1;

cudaMemsetAsync(out,0,sizeof(int));
device_reduce_warp_atomic_kernel<<<blocks,threads>>>(in,out,N);
}

// int2-vectorized warp-atomic reduction: pairwise loads, warp-level fold,
// one atomicAdd per warp from lane 0.
// NOTE(review): the int2* cast assumes `in` is 8-byte aligned — confirm.
__global__ void device_reduce_warp_atomic_kernel_vector2(int *in, int* out, int N) {
  const int tid    = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;
  const int pairs  = N / 2;
  int2 *in2 = reinterpret_cast<int2*>(in);

  int acc = 0;
  for (int p = tid; p < pairs; p += stride) {
    int2 v = in2[p];
    acc += v.x + v.y;
  }
  const int tail = tid + pairs * 2;
  if (tail < N)
    acc += in[tail];
  acc = warpReduceSum(acc);
  if (threadIdx.x % warpSize == 0)
    atomicAdd(out, acc);
}

// Host launcher for the int2-vectorized warp-atomic reduction.
// Zeroes *out; caller synchronizes before reading out[0].
void device_reduce_warp_atomic_vector2(int *in, int* out, int N) {
int threads=256;
int blocks=min((N/2+threads-1)/threads,2048);
// Fix: N<2 used to yield blocks==0 — invalid launch, tail never summed.
if(blocks<1) blocks=1;

cudaMemsetAsync(out,0,sizeof(int));
device_reduce_warp_atomic_kernel_vector2<<<blocks,threads>>>(in,out,N);
}

// int4-vectorized warp-atomic reduction: quad loads, warp-level fold,
// one atomicAdd per warp. Up to three tail elements go to threads tid=0..2.
// NOTE(review): the int4* cast assumes `in` is 16-byte aligned — confirm.
__global__ void device_reduce_warp_atomic_kernel_vector4(int *in, int* out, int N) {
  const int tid    = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;
  const int quads  = N / 4;
  int4 *in4 = reinterpret_cast<int4*>(in);

  int acc = 0;
  for (int q = tid; q < quads; q += stride) {
    int4 v = in4[q];
    acc += (v.x + v.y) + (v.z + v.w);
  }
  const int tail = tid + quads * 4;
  if (tail < N)
    acc += in[tail];

  acc = warpReduceSum(acc);
  if (threadIdx.x % warpSize == 0)
    atomicAdd(out, acc);
}

// Host launcher for the int4-vectorized warp-atomic reduction.
// Zeroes *out; caller synchronizes before reading out[0].
void device_reduce_warp_atomic_vector4(int *in, int* out, int N) {
int threads=256;
int blocks=min((N/4+threads-1)/threads,2048);
// Fix: N<4 used to yield blocks==0 — invalid launch, tail never summed.
if(blocks<1) blocks=1;

cudaMemsetAsync(out,0,sizeof(int));
device_reduce_warp_atomic_kernel_vector4<<<blocks,threads>>>(in,out,N);
}
10 changes: 10 additions & 0 deletions posts/parallel_reduction_with_shfl/fake_atomic.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#pragma once

// Dead code: a stand-in for atomicAdd, compiled out via `#if 0`.
// If enabled, it performs a plain NON-atomic read-modify-write, so results
// under concurrent updates would be wrong — presumably intended only for
// single-threaded debugging or emulation builds; confirm before enabling.
#if 0
template <class T>
__device__ __inline__
void atomicAdd(T* ptr, T val) {
*ptr+=val;
}
#endif

17 changes: 17 additions & 0 deletions posts/parallel_reduction_with_shfl/fake_shfl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#pragma once

// Shared-memory emulation of the __shfl_down warp shuffle, presumably for
// debugging or pre-Kepler GPUs; enable by uncommenting the #define.
// NOTE(review): the staging array is MAX_BLOCK (512) ints, but some
// launchers in this repo use 1024-thread blocks — confirm before enabling.
//#define __shfl_down fake_shfl_down
#define MAX_BLOCK 512
__inline__ __device__
int fake_shfl_down(int val, int offset, int width=32) {
static __shared__ int shared[MAX_BLOCK];
int lane=threadIdx.x%32;

// Publish every thread's value, then read the value `offset` lanes ahead.
shared[threadIdx.x]=val;
__syncthreads();

// Out-of-width lanes get 0 (real __shfl_down leaves the value unchanged
// instead — NOTE(review): confirm callers don't rely on that difference).
val = (lane+offset<width) ? shared[threadIdx.x+offset] : 0;
// Second barrier keeps a subsequent call from overwriting shared[] before
// every thread has finished reading it.
__syncthreads();

return val;
}
17 changes: 17 additions & 0 deletions posts/parallel_reduction_with_shfl/getdata.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash

# Sweep problem sizes BASE^LOW .. BASE^HIGH and print one row of timings
# (from ./reduce) per size, preceded by a header row of column labels.
BASE=2
LOW=10
HIGH=27

# Column labels come from the benchmark's own output: the field before ':'.
HEADER=$(./reduce 100 1 | grep -v NUM_ELEMS | cut -d ":" -f 1)

HEADER="SIZE $HEADER"
echo $HEADER
for (( exp=$LOW; exp<=$HIGH; exp++ ))
do
size=$(echo "$BASE^$exp" | bc)
TIMES=$(./reduce $size 100 | grep -v NUM_ELEMS | cut -d ":" -f 4 | cut -f 2 -d " ")
bytes=$(echo "$size*4" | bc)
echo $bytes $TIMES
done
Loading

0 comments on commit fee0a91

Please sign in to comment.