Skip to content

Commit

Permalink
Rename constant and move definitions to common header.
Browse files Browse the repository at this point in the history
  • Loading branch information
moinakg committed Apr 6, 2013
1 parent 9c0149c commit a12a1fc
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 102 deletions.
6 changes: 3 additions & 3 deletions Makefile
Expand Up @@ -119,13 +119,13 @@ all: build

build: vecCrypt vecCrypt_strm vecCrypt_strm_cpuxor

vecCrypt.o: vecCrypt.cu
vecCrypt.o: vecCrypt.cu common.h
$(NVCC) $(NVCCFLAGS) $(EXTRA_NVCCFLAGS) -I$(CUDA_INC_PATH) $(GENCODE_FLAGS) -o $@ -c $<

vecCrypt_strm.o: vecCrypt_strm.cu
vecCrypt_strm.o: vecCrypt_strm.cu common.h
$(NVCC) $(NVCCFLAGS) $(EXTRA_NVCCFLAGS) -I$(CUDA_INC_PATH) $(GENCODE_FLAGS) -o $@ -c $<

vecCrypt_strm_cpuxor.o: vecCrypt_strm_cpuxor.cu
vecCrypt_strm_cpuxor.o: vecCrypt_strm_cpuxor.cu common.h
$(NVCC) $(NVCCFLAGS) $(EXTRA_NVCCFLAGS) -I$(CUDA_INC_PATH) $(GENCODE_FLAGS) -o $@ -c $<

stream.o: stream.s
Expand Down
29 changes: 29 additions & 0 deletions common.h
@@ -0,0 +1,29 @@
#ifndef __COMMON_H__
#define __COMMON_H__

#if defined(__CUDACC__) // NVCC
#define MY_ALIGN(n) __align__(n)
#elif defined(__GNUC__) // GCC
#define MY_ALIGN(n) __attribute__((aligned(n)))
#elif defined(_MSC_VER) // MSVC
#define MY_ALIGN(n) __declspec(align(n))
#else
#error "Please provide a definition for MY_ALIGN macro for your host compiler!"
#endif

#define ROUNDS 20
#ifndef UINT64_MAX
#define UINT64_MAX (18446744073709551615ULL)
#endif

#define THREADS_PER_BLOCK (128)
#define XSALSA20_CRYPTO_KEYBYTES 32
#define XSALSA20_CRYPTO_NONCEBYTES 24
#define XSALSA20_BLOCKSZ 64
#define CTR_INBLOCK_SZ (16)
#define CTR_KS_SZ (XSALSA20_BLOCKSZ)
#define BLOCKS_PER_CHUNK_1X 4
#define BLOCKS_PER_CHUNK_2X 1
#define NUM_STREAMS 16

#endif
24 changes: 1 addition & 23 deletions vecCrypt.cu
Expand Up @@ -27,29 +27,7 @@
// includes CUDA
#include <cuda_runtime.h>

#if defined(__CUDACC__) // NVCC
#define MY_ALIGN(n) __align__(n)
#elif defined(__GNUC__) // GCC
#define MY_ALIGN(n) __attribute__((aligned(n)))
#elif defined(_MSC_VER) // MSVC
#define MY_ALIGN(n) __declspec(align(n))
#else
#error "Please provide a definition for MY_ALIGN macro for your host compiler!"
#endif

#define ROUNDS 20
#ifndef UINT64_MAX
#define UINT64_MAX (18446744073709551615ULL)
#endif

#define THREADS_PER_BLOCK (128)
#define XSALSA20_CRYPTO_KEYBYTES 32
#define XSALSA20_CRYPTO_NONCEBYTES 24
#define XSALSA20_BLOCKSZ 64
#define CTR_INBLOCK_SZ (16)
#define CTR_KS_SZ (XSALSA20_BLOCKSZ)
#define BLOCKS_PER_CHUNK_1X 4
#define BLOCKS_PER_CHUNK_2X 1
#include "common.h"

extern "C" int crypto_stream_salsa20_amd64_xmm6_xor(unsigned char *c, unsigned char *m,
unsigned long long mlen, unsigned char *n, unsigned char *k);
Expand Down
55 changes: 16 additions & 39 deletions vecCrypt_strm.cu
Expand Up @@ -27,30 +27,7 @@
// includes CUDA
#include <cuda_runtime.h>

#if defined(__CUDACC__) // NVCC
#define MY_ALIGN(n) __align__(n)
#elif defined(__GNUC__) // GCC
#define MY_ALIGN(n) __attribute__((aligned(n)))
#elif defined(_MSC_VER) // MSVC
#define MY_ALIGN(n) __declspec(align(n))
#else
#error "Please provide a definition for MY_ALIGN macro for your host compiler!"
#endif

#define ROUNDS 20
#ifndef UINT64_MAX
#define UINT64_MAX (18446744073709551615ULL)
#endif

#define THREADS_PER_BLOCK (128)
#define XSALSA20_CRYPTO_KEYBYTES 32
#define XSALSA20_CRYPTO_NONCEBYTES 24
#define XSALSA20_BLOCKSZ 64
#define CTR_INBLOCK_SZ (16)
#define CTR_KS_SZ (XSALSA20_BLOCKSZ)
#define BLOCKS_PER_CHUNK_1X 4
#define BLOCKS_PER_CHUNK_2X 1
#define NUM_ITERS 16
#include "common.h"

extern "C" int crypto_stream_salsa20_amd64_xmm6_xor(unsigned char *c, unsigned char *m,
unsigned long long mlen, unsigned char *n, unsigned char *k);
Expand Down Expand Up @@ -436,7 +413,7 @@ int main(int argc, char** argv)
double gpuTime1, cpuTime1, cpuTime2, strt, en;
uint64_t v_nonce;
cudaDeviceProp deviceProp;
cudaStream_t strm[NUM_ITERS];
cudaStream_t strm[NUM_STREAMS];

ParseArguments(argc, argv);
cudaGetDeviceProperties(&deviceProp, 0);
Expand All @@ -449,7 +426,7 @@ int main(int argc, char** argv)
if (NBLKS % blks_per_chunk) N++;
size = NBLKS * XSALSA20_BLOCKSZ;

for (i = 0; i < NUM_ITERS; i++)
for (i = 0; i < NUM_STREAMS; i++)
checkCudaErrors( cudaStreamCreate(&strm[i]) );

// Allocate input vectors h_A and h_B in host memory
Expand Down Expand Up @@ -486,36 +463,36 @@ int main(int argc, char** argv)
h_A1 = h_A;
d_A1 = d_A;
blk_off = 0;
for (i = 0; i < NUM_ITERS; i++) {
if (i == NUM_ITERS - 1) {
sz1 = NBLKS/NUM_ITERS * NUM_ITERS;
sz1 = NBLKS/NUM_ITERS + (NBLKS - sz1);
for (i = 0; i < NUM_STREAMS; i++) {
if (i == NUM_STREAMS - 1) {
sz1 = NBLKS/NUM_STREAMS * NUM_STREAMS;
sz1 = NBLKS/NUM_STREAMS + (NBLKS - sz1);
} else {
sz1 = NBLKS/NUM_ITERS;
sz1 = NBLKS/NUM_STREAMS;
}
sz1_bytes = sz1 * XSALSA20_BLOCKSZ;
N = sz1 / blks_per_chunk;
if (sz1 % blks_per_chunk) N++;

checkCudaErrors( cudaMemcpyAsync(d_A1, h_A1, sz1_bytes, cudaMemcpyHostToDevice, strm[i%NUM_ITERS]) );
checkCudaErrors( cudaMemcpyAsync(d_A1, h_A1, sz1_bytes, cudaMemcpyHostToDevice, strm[i%NUM_STREAMS]) );
blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
VecCrypt<<<blocksPerGrid, threadsPerBlock, 0, strm[i%NUM_ITERS]>>>(d_A1, N, sz1, v_nonce, blks_per_chunk, blk_off);
VecCrypt<<<blocksPerGrid, threadsPerBlock, 0, strm[i%NUM_STREAMS]>>>(d_A1, N, sz1, v_nonce, blks_per_chunk, blk_off);
h_A1 += sz1_bytes;
d_A1 += sz1_bytes;
blk_off += sz1;
}

h_A1 = h_A;
d_A1 = d_A;
for (i = 0; i < NUM_ITERS; i++) {
if (i == NUM_ITERS - 1) {
sz1 = NBLKS/NUM_ITERS * NUM_ITERS;
sz1 = NBLKS/NUM_ITERS + (NBLKS - sz1);
for (i = 0; i < NUM_STREAMS; i++) {
if (i == NUM_STREAMS - 1) {
sz1 = NBLKS/NUM_STREAMS * NUM_STREAMS;
sz1 = NBLKS/NUM_STREAMS + (NBLKS - sz1);
} else {
sz1 = NBLKS/NUM_ITERS;
sz1 = NBLKS/NUM_STREAMS;
}
sz1_bytes = sz1 * XSALSA20_BLOCKSZ;
checkCudaErrors( cudaMemcpyAsync(h_A1, d_A1, sz1_bytes, cudaMemcpyDeviceToHost, strm[i%NUM_ITERS]) );
checkCudaErrors( cudaMemcpyAsync(h_A1, d_A1, sz1_bytes, cudaMemcpyDeviceToHost, strm[i%NUM_STREAMS]) );
h_A1 += sz1_bytes;
d_A1 += sz1_bytes;
}
Expand Down
51 changes: 14 additions & 37 deletions vecCrypt_strm_cpuxor.cu
Expand Up @@ -29,30 +29,7 @@
// includes CUDA
#include <cuda_runtime.h>

#if defined(__CUDACC__) // NVCC
#define MY_ALIGN(n) __align__(n)
#elif defined(__GNUC__) // GCC
#define MY_ALIGN(n) __attribute__((aligned(n)))
#elif defined(_MSC_VER) // MSVC
#define MY_ALIGN(n) __declspec(align(n))
#else
#error "Please provide a definition for MY_ALIGN macro for your host compiler!"
#endif

#define ROUNDS 20
#ifndef UINT64_MAX
#define UINT64_MAX (18446744073709551615ULL)
#endif

#define THREADS_PER_BLOCK (128)
#define XSALSA20_CRYPTO_KEYBYTES 32
#define XSALSA20_CRYPTO_NONCEBYTES 24
#define XSALSA20_BLOCKSZ 64
#define CTR_INBLOCK_SZ (16)
#define CTR_KS_SZ (XSALSA20_BLOCKSZ)
#define BLOCKS_PER_CHUNK_1X 4
#define BLOCKS_PER_CHUNK_2X 1
#define NUM_ITERS 16
#include "common.h"

extern "C" void xor_buffer_aligned(unsigned char *buf1, unsigned char *buf2, size_t nblks, int blksz);
extern "C" int crypto_stream_salsa20_amd64_xmm6_xor(unsigned char *c, unsigned char *m,
Expand Down Expand Up @@ -449,16 +426,16 @@ proc_buf(void *dat)
// Host code
int main(int argc, char** argv)
{
printf("Salsa20 Vector Encryption using CUDA streams and XOR on CPU with OpenMP\n");
printf("Salsa20 Vector Encryption using CUDA streams and multi-threaded XOR on CPU\n");
unsigned int NBLKS = 4000000, N;
int rv, blks_per_chunk, threadsPerBlock, blocksPerGrid;
size_t size, i, sz1, sz1_bytes, blk_off;
unsigned char k[32], *h_A1, *d_A1, *h_B1;
double gpuTime1, cpuTime1, cpuTime2, strt, en;
uint64_t v_nonce;
cudaDeviceProp deviceProp;
cudaStream_t strm[NUM_ITERS];
struct tdat t_data[NUM_ITERS];
cudaStream_t strm[NUM_STREAMS];
struct tdat t_data[NUM_STREAMS];

ParseArguments(argc, argv);
cudaGetDeviceProperties(&deviceProp, 0);
Expand All @@ -471,7 +448,7 @@ int main(int argc, char** argv)
if (NBLKS % blks_per_chunk) N++;
size = NBLKS * XSALSA20_BLOCKSZ;

for (i = 0; i < NUM_ITERS; i++)
for (i = 0; i < NUM_STREAMS; i++)
checkCudaErrors( cudaStreamCreate(&strm[i]) );

// Allocate input vectors h_A and h_B in host memory
Expand Down Expand Up @@ -513,24 +490,24 @@ int main(int argc, char** argv)
d_A1 = d_A;
h_B1 = h_B;
blk_off = 0;
for (i = 0; i < NUM_ITERS; i++) {
if (i == NUM_ITERS - 1) {
sz1 = NBLKS/NUM_ITERS * NUM_ITERS;
sz1 = NBLKS/NUM_ITERS + (NBLKS - sz1);
for (i = 0; i < NUM_STREAMS; i++) {
if (i == NUM_STREAMS - 1) {
sz1 = NBLKS/NUM_STREAMS * NUM_STREAMS;
sz1 = NBLKS/NUM_STREAMS + (NBLKS - sz1);
} else {
sz1 = NBLKS/NUM_ITERS;
sz1 = NBLKS/NUM_STREAMS;
}
sz1_bytes = sz1 * XSALSA20_BLOCKSZ;
N = sz1 / blks_per_chunk;
if (sz1 % blks_per_chunk) N++;

blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
VecCrypt<<<blocksPerGrid, threadsPerBlock, 0, strm[i%NUM_ITERS]>>>(d_A1, N, sz1, v_nonce, blks_per_chunk, blk_off);
checkCudaErrors( cudaMemcpyAsync(h_A1, d_A1, sz1_bytes, cudaMemcpyDeviceToHost, strm[i%NUM_ITERS]) );
VecCrypt<<<blocksPerGrid, threadsPerBlock, 0, strm[i%NUM_STREAMS]>>>(d_A1, N, sz1, v_nonce, blks_per_chunk, blk_off);
checkCudaErrors( cudaMemcpyAsync(h_A1, d_A1, sz1_bytes, cudaMemcpyDeviceToHost, strm[i%NUM_STREAMS]) );
t_data[i].h_A1 = h_A1;
t_data[i].h_B1 = h_B1;
t_data[i].nblks = sz1;
t_data[i].strm = strm[i%NUM_ITERS];
t_data[i].strm = strm[i%NUM_STREAMS];
pthread_create(&(t_data[i].tid), NULL, proc_buf, (void *)&(t_data[i]));

h_A1 += sz1_bytes;
Expand All @@ -540,7 +517,7 @@ int main(int argc, char** argv)
}

checkCudaErrors( cudaDeviceSynchronize() );
for (i = 0; i < NUM_ITERS; i++)
for (i = 0; i < NUM_STREAMS; i++)
pthread_join(t_data[i].tid, NULL);

en = get_wtime_millis();
Expand Down

0 comments on commit a12a1fc

Please sign in to comment.