Rename constant and move definitions to common header.

moinakg · Apr 6, 2013 · a12a1fc · a12a1fc
1 parent 9c0149c
commit a12a1fc
Show file tree

Hide file tree

Showing 5 changed files with 63 additions and 102 deletions.
diff --git a/Makefile b/Makefile
@@ -119,13 +119,13 @@ all: build
 
 build: vecCrypt vecCrypt_strm vecCrypt_strm_cpuxor
 
-vecCrypt.o: vecCrypt.cu
+vecCrypt.o: vecCrypt.cu common.h
 	$(NVCC) $(NVCCFLAGS) $(EXTRA_NVCCFLAGS) -I$(CUDA_INC_PATH) $(GENCODE_FLAGS) -o $@ -c $<
 
-vecCrypt_strm.o: vecCrypt_strm.cu
+vecCrypt_strm.o: vecCrypt_strm.cu common.h
 	$(NVCC) $(NVCCFLAGS) $(EXTRA_NVCCFLAGS) -I$(CUDA_INC_PATH) $(GENCODE_FLAGS) -o $@ -c $<
 
-vecCrypt_strm_cpuxor.o: vecCrypt_strm_cpuxor.cu
+vecCrypt_strm_cpuxor.o: vecCrypt_strm_cpuxor.cu common.h
 	$(NVCC) $(NVCCFLAGS) $(EXTRA_NVCCFLAGS) -I$(CUDA_INC_PATH) $(GENCODE_FLAGS) -o $@ -c $<
 
 stream.o: stream.s

diff --git a/common.h b/common.h
@@ -0,0 +1,29 @@
+#ifndef __COMMON_H__
+#define __COMMON_H__
+
+#if defined(__CUDACC__) // NVCC
+   #define MY_ALIGN(n) __align__(n)
+#elif defined(__GNUC__) // GCC
+  #define MY_ALIGN(n) __attribute__((aligned(n)))
+#elif defined(_MSC_VER) // MSVC
+  #define MY_ALIGN(n) __declspec(align(n))
+#else
+  #error "Please provide a definition for MY_ALIGN macro for your host compiler!"
+#endif
+
+#define ROUNDS 20
+#ifndef UINT64_MAX
+#define	UINT64_MAX (18446744073709551615ULL)
+#endif
+
+#define THREADS_PER_BLOCK (128)
+#define XSALSA20_CRYPTO_KEYBYTES 32
+#define XSALSA20_CRYPTO_NONCEBYTES 24
+#define XSALSA20_BLOCKSZ 64
+#define CTR_INBLOCK_SZ (16)
+#define CTR_KS_SZ (XSALSA20_BLOCKSZ)
+#define BLOCKS_PER_CHUNK_1X 4
+#define BLOCKS_PER_CHUNK_2X 1
+#define NUM_STREAMS 16
+
+#endif
diff --git a/vecCrypt.cu b/vecCrypt.cu
@@ -27,29 +27,7 @@
 // includes CUDA
 #include <cuda_runtime.h>
 
-#if defined(__CUDACC__) // NVCC
-   #define MY_ALIGN(n) __align__(n)
-#elif defined(__GNUC__) // GCC
-  #define MY_ALIGN(n) __attribute__((aligned(n)))
-#elif defined(_MSC_VER) // MSVC
-  #define MY_ALIGN(n) __declspec(align(n))
-#else
-  #error "Please provide a definition for MY_ALIGN macro for your host compiler!"
-#endif
-
-#define ROUNDS 20
-#ifndef UINT64_MAX
-#define	UINT64_MAX (18446744073709551615ULL)
-#endif
-
-#define THREADS_PER_BLOCK (128)
-#define XSALSA20_CRYPTO_KEYBYTES 32
-#define XSALSA20_CRYPTO_NONCEBYTES 24
-#define XSALSA20_BLOCKSZ 64
-#define CTR_INBLOCK_SZ (16)
-#define CTR_KS_SZ (XSALSA20_BLOCKSZ)
-#define BLOCKS_PER_CHUNK_1X 4
-#define BLOCKS_PER_CHUNK_2X 1
+#include "common.h"
 
 extern "C" int crypto_stream_salsa20_amd64_xmm6_xor(unsigned char *c, unsigned char *m,
 		unsigned long long mlen, unsigned char *n, unsigned char *k);

diff --git a/vecCrypt_strm.cu b/vecCrypt_strm.cu
@@ -27,30 +27,7 @@
 // includes CUDA
 #include <cuda_runtime.h>
 
-#if defined(__CUDACC__) // NVCC
-   #define MY_ALIGN(n) __align__(n)
-#elif defined(__GNUC__) // GCC
-  #define MY_ALIGN(n) __attribute__((aligned(n)))
-#elif defined(_MSC_VER) // MSVC
-  #define MY_ALIGN(n) __declspec(align(n))
-#else
-  #error "Please provide a definition for MY_ALIGN macro for your host compiler!"
-#endif
-
-#define ROUNDS 20
-#ifndef UINT64_MAX
-#define	UINT64_MAX (18446744073709551615ULL)
-#endif
-
-#define THREADS_PER_BLOCK (128)
-#define XSALSA20_CRYPTO_KEYBYTES 32
-#define XSALSA20_CRYPTO_NONCEBYTES 24
-#define XSALSA20_BLOCKSZ 64
-#define CTR_INBLOCK_SZ (16)
-#define CTR_KS_SZ (XSALSA20_BLOCKSZ)
-#define BLOCKS_PER_CHUNK_1X 4
-#define BLOCKS_PER_CHUNK_2X 1
-#define NUM_ITERS 16
+#include "common.h"
 
 extern "C" int crypto_stream_salsa20_amd64_xmm6_xor(unsigned char *c, unsigned char *m,
 		unsigned long long mlen, unsigned char *n, unsigned char *k);
@@ -436,7 +413,7 @@ int main(int argc, char** argv)
     double gpuTime1, cpuTime1, cpuTime2, strt, en;
     uint64_t v_nonce;
     cudaDeviceProp deviceProp;
-    cudaStream_t strm[NUM_ITERS];
+    cudaStream_t strm[NUM_STREAMS];
 
     ParseArguments(argc, argv);
     cudaGetDeviceProperties(&deviceProp, 0);
@@ -449,7 +426,7 @@ int main(int argc, char** argv)
     if (NBLKS % blks_per_chunk) N++;
     size = NBLKS * XSALSA20_BLOCKSZ;
 
-    for (i = 0; i < NUM_ITERS; i++)
+    for (i = 0; i < NUM_STREAMS; i++)
         checkCudaErrors( cudaStreamCreate(&strm[i]) );
 
     // Allocate input vectors h_A and h_B in host memory
@@ -486,36 +463,36 @@ int main(int argc, char** argv)
     h_A1 = h_A;
     d_A1 = d_A;
     blk_off = 0;
-    for (i = 0; i < NUM_ITERS; i++) {
-        if (i == NUM_ITERS - 1) {
-            sz1 = NBLKS/NUM_ITERS * NUM_ITERS;
-            sz1 = NBLKS/NUM_ITERS + (NBLKS - sz1);
+    for (i = 0; i < NUM_STREAMS; i++) {
+        if (i == NUM_STREAMS - 1) {
+            sz1 = NBLKS/NUM_STREAMS * NUM_STREAMS;
+            sz1 = NBLKS/NUM_STREAMS + (NBLKS - sz1);
         } else {
-            sz1 = NBLKS/NUM_ITERS;
+            sz1 = NBLKS/NUM_STREAMS;
         }
         sz1_bytes = sz1 * XSALSA20_BLOCKSZ;
         N = sz1 / blks_per_chunk;
         if (sz1 % blks_per_chunk) N++;
 
-        checkCudaErrors( cudaMemcpyAsync(d_A1, h_A1, sz1_bytes, cudaMemcpyHostToDevice, strm[i%NUM_ITERS]) );
+        checkCudaErrors( cudaMemcpyAsync(d_A1, h_A1, sz1_bytes, cudaMemcpyHostToDevice, strm[i%NUM_STREAMS]) );
         blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
-        VecCrypt<<<blocksPerGrid, threadsPerBlock, 0, strm[i%NUM_ITERS]>>>(d_A1, N, sz1, v_nonce, blks_per_chunk, blk_off);
+        VecCrypt<<<blocksPerGrid, threadsPerBlock, 0, strm[i%NUM_STREAMS]>>>(d_A1, N, sz1, v_nonce, blks_per_chunk, blk_off);
         h_A1 += sz1_bytes;
         d_A1 += sz1_bytes;
         blk_off += sz1;
     }
 
     h_A1 = h_A;
     d_A1 = d_A;
-    for (i = 0; i < NUM_ITERS; i++) {
-        if (i == NUM_ITERS - 1) {
-            sz1 = NBLKS/NUM_ITERS * NUM_ITERS;
-            sz1 = NBLKS/NUM_ITERS + (NBLKS - sz1);
+    for (i = 0; i < NUM_STREAMS; i++) {
+        if (i == NUM_STREAMS - 1) {
+            sz1 = NBLKS/NUM_STREAMS * NUM_STREAMS;
+            sz1 = NBLKS/NUM_STREAMS + (NBLKS - sz1);
         } else {
-            sz1 = NBLKS/NUM_ITERS;
+            sz1 = NBLKS/NUM_STREAMS;
         }
         sz1_bytes = sz1 * XSALSA20_BLOCKSZ;
-        checkCudaErrors( cudaMemcpyAsync(h_A1, d_A1, sz1_bytes, cudaMemcpyDeviceToHost, strm[i%NUM_ITERS]) );
+        checkCudaErrors( cudaMemcpyAsync(h_A1, d_A1, sz1_bytes, cudaMemcpyDeviceToHost, strm[i%NUM_STREAMS]) );
         h_A1 += sz1_bytes;
         d_A1 += sz1_bytes;
     }

diff --git a/vecCrypt_strm_cpuxor.cu b/vecCrypt_strm_cpuxor.cu
@@ -29,30 +29,7 @@
 // includes CUDA
 #include <cuda_runtime.h>
 
-#if defined(__CUDACC__) // NVCC
-   #define MY_ALIGN(n) __align__(n)
-#elif defined(__GNUC__) // GCC
-  #define MY_ALIGN(n) __attribute__((aligned(n)))
-#elif defined(_MSC_VER) // MSVC
-  #define MY_ALIGN(n) __declspec(align(n))
-#else
-  #error "Please provide a definition for MY_ALIGN macro for your host compiler!"
-#endif
-
-#define ROUNDS 20
-#ifndef UINT64_MAX
-#define	UINT64_MAX (18446744073709551615ULL)
-#endif
-
-#define THREADS_PER_BLOCK (128)
-#define XSALSA20_CRYPTO_KEYBYTES 32
-#define XSALSA20_CRYPTO_NONCEBYTES 24
-#define XSALSA20_BLOCKSZ 64
-#define CTR_INBLOCK_SZ (16)
-#define CTR_KS_SZ (XSALSA20_BLOCKSZ)
-#define BLOCKS_PER_CHUNK_1X 4
-#define BLOCKS_PER_CHUNK_2X 1
-#define NUM_ITERS 16
+#include "common.h"
 
 extern "C" void xor_buffer_aligned(unsigned char *buf1, unsigned char *buf2, size_t nblks, int blksz);
 extern "C" int crypto_stream_salsa20_amd64_xmm6_xor(unsigned char *c, unsigned char *m,
@@ -449,16 +426,16 @@ proc_buf(void *dat)
 // Host code
 int main(int argc, char** argv)
 {
-    printf("Salsa20 Vector Encryption using CUDA streams and XOR on CPU with OpenMP\n");
+    printf("Salsa20 Vector Encryption using CUDA streams and multi-threaded XOR on CPU\n");
     unsigned int NBLKS = 4000000, N;
     int rv, blks_per_chunk, threadsPerBlock, blocksPerGrid;
     size_t size, i, sz1, sz1_bytes, blk_off;
     unsigned char k[32], *h_A1, *d_A1, *h_B1;
     double gpuTime1, cpuTime1, cpuTime2, strt, en;
     uint64_t v_nonce;
     cudaDeviceProp deviceProp;
-    cudaStream_t strm[NUM_ITERS];
-    struct tdat t_data[NUM_ITERS];
+    cudaStream_t strm[NUM_STREAMS];
+    struct tdat t_data[NUM_STREAMS];
 
     ParseArguments(argc, argv);
     cudaGetDeviceProperties(&deviceProp, 0);
@@ -471,7 +448,7 @@ int main(int argc, char** argv)
     if (NBLKS % blks_per_chunk) N++;
     size = NBLKS * XSALSA20_BLOCKSZ;
 
-    for (i = 0; i < NUM_ITERS; i++)
+    for (i = 0; i < NUM_STREAMS; i++)
         checkCudaErrors( cudaStreamCreate(&strm[i]) );
 
     // Allocate input vectors h_A and h_B in host memory
@@ -513,24 +490,24 @@ int main(int argc, char** argv)
     d_A1 = d_A;
     h_B1 = h_B;
     blk_off = 0;
-    for (i = 0; i < NUM_ITERS; i++) {
-        if (i == NUM_ITERS - 1) {
-            sz1 = NBLKS/NUM_ITERS * NUM_ITERS;
-            sz1 = NBLKS/NUM_ITERS + (NBLKS - sz1);
+    for (i = 0; i < NUM_STREAMS; i++) {
+        if (i == NUM_STREAMS - 1) {
+            sz1 = NBLKS/NUM_STREAMS * NUM_STREAMS;
+            sz1 = NBLKS/NUM_STREAMS + (NBLKS - sz1);
         } else {
-            sz1 = NBLKS/NUM_ITERS;
+            sz1 = NBLKS/NUM_STREAMS;
         }
         sz1_bytes = sz1 * XSALSA20_BLOCKSZ;
         N = sz1 / blks_per_chunk;
         if (sz1 % blks_per_chunk) N++;
 
         blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
-        VecCrypt<<<blocksPerGrid, threadsPerBlock, 0, strm[i%NUM_ITERS]>>>(d_A1, N, sz1, v_nonce, blks_per_chunk, blk_off);
-        checkCudaErrors( cudaMemcpyAsync(h_A1, d_A1, sz1_bytes, cudaMemcpyDeviceToHost, strm[i%NUM_ITERS]) );
+        VecCrypt<<<blocksPerGrid, threadsPerBlock, 0, strm[i%NUM_STREAMS]>>>(d_A1, N, sz1, v_nonce, blks_per_chunk, blk_off);
+        checkCudaErrors( cudaMemcpyAsync(h_A1, d_A1, sz1_bytes, cudaMemcpyDeviceToHost, strm[i%NUM_STREAMS]) );
         t_data[i].h_A1 = h_A1;
         t_data[i].h_B1 = h_B1;
         t_data[i].nblks = sz1;
-        t_data[i].strm = strm[i%NUM_ITERS];
+        t_data[i].strm = strm[i%NUM_STREAMS];
         pthread_create(&(t_data[i].tid), NULL, proc_buf, (void *)&(t_data[i]));
 
         h_A1 += sz1_bytes;
@@ -540,7 +517,7 @@ int main(int argc, char** argv)
     }
 
     checkCudaErrors( cudaDeviceSynchronize() );
-    for (i = 0; i < NUM_ITERS; i++)
+    for (i = 0; i < NUM_STREAMS; i++)
         pthread_join(t_data[i].tid, NULL);
 
     en = get_wtime_millis();