openwall · magnumripper · Apr 5, 2012 · Mar 26, 2012 · Mar 26, 2012 · Mar 30, 2012
diff --git a/src/Makefile b/src/Makefile
@@ -36,7 +36,7 @@ OMPFLAGS =
 #HAVE_NSS = -DHAVE_NSS
 
 # Change sm_10 to sm_20 if you have Fermi support (400 series or later)
-NVCC_FLAGS = -c -Xptxas -v -arch sm_10
+NVCC_FLAGS = -c -Xptxas -v -arch sm_13
 CUDAPATH = /usr/local/cuda/lib
 CUDA64PATH = /usr/local/cuda/lib64
 
@@ -136,7 +136,8 @@ CUDA_OBJS = \
 	cuda_mscash2_fmt.o cuda_mscash2.o \
 	cuda_rawsha256_fmt.o cuda_rawsha256.o \
 	cuda_rawsha224_fmt.o cuda_rawsha224.o \
-	cuda_mscash_fmt.o cuda_mscash.o
+	cuda_mscash_fmt.o cuda_mscash.o \
+	cuda_xsha512_fmt.o cuda_xsha512.o
 
 BENCH_DES_OBJS_ORIG = \
 	DES_fmt.o DES_std.o
@@ -1372,6 +1373,13 @@ cuda_rawsha224.o:  cuda_rawsha256.h cuda/rawsha256.cu cuda_common.o
 cuda_rawsha224_fmt.o: cuda_rawsha224.o cuda_rawsha256_fmt.c cuda_rawsha256.h
 	$(CC)  $(CFLAGS) -DSHA224 cuda_rawsha256_fmt.c -o cuda_rawsha224_fmt.o
 
+cuda_xsha512.o: cuda_xsha512.h cuda/xsha512.cu cuda_common.o
+	cd cuda; nvcc $(NVCC_FLAGS) xsha512.cu
+	mv cuda/xsha512.o cuda_xsha512.o
+
+cuda_xsha512_fmt.o: cuda_xsha512.o cuda_xsha512_fmt.c cuda_xsha512.h
+	$(CC) $(CFLAGS) cuda_xsha512_fmt.c -o cuda_xsha512_fmt.o
+
 ../run/john: $(JOHN_OBJS)
 	$(LD) $(JOHN_OBJS) $(LDFLAGS) -o ../run/john
 

diff --git a/src/cuda/xsha512.cu b/src/cuda/xsha512.cu
@@ -0,0 +1,260 @@
+/*
+ * This software is Copyright (c) 2011 Myrice <qqlddg at gmail dot com>
+ * and it is hereby released to the general public under the following terms:
+ * Redistribution and use in source and binary forms, with or without modification, are permitted.
+ * Thanks to Lukas Odzioba <lukas dot odzioba at gmail dot com>, his code helps me a lot
+*/
+#include "../cuda_xsha512.h"
+#include "cuda_common.cuh"
+
+extern "C" void cuda_xsha512(xsha512_key *host_password, xsha512_salt *host_salt, xsha512_hash* host_hash);
+extern "C" void cuda_xsha512_init();
+extern "C" int cuda_cmp_all(void *binary, int count);
+extern "C" void cuda_xsha512_cpy_hash(xsha512_hash* host_hash);
+
+static xsha512_key *cuda_password;
+static xsha512_hash *cuda_hash;
+static size_t password_size;
+static size_t hash_size;
+
+__constant__ uint64_t k[] = {
+	0x428a2f98d728ae22LL, 0x7137449123ef65cdLL, 0xb5c0fbcfec4d3b2fLL,
+	    0xe9b5dba58189dbbcLL,
+	0x3956c25bf348b538LL, 0x59f111f1b605d019LL, 0x923f82a4af194f9bLL,
+	    0xab1c5ed5da6d8118LL,
+	0xd807aa98a3030242LL, 0x12835b0145706fbeLL, 0x243185be4ee4b28cLL,
+	    0x550c7dc3d5ffb4e2LL,
+	0x72be5d74f27b896fLL, 0x80deb1fe3b1696b1LL, 0x9bdc06a725c71235LL,
+	    0xc19bf174cf692694LL,
+	0xe49b69c19ef14ad2LL, 0xefbe4786384f25e3LL, 0x0fc19dc68b8cd5b5LL,
+	    0x240ca1cc77ac9c65LL,
+	0x2de92c6f592b0275LL, 0x4a7484aa6ea6e483LL, 0x5cb0a9dcbd41fbd4LL,
+	    0x76f988da831153b5LL,
+	0x983e5152ee66dfabLL, 0xa831c66d2db43210LL, 0xb00327c898fb213fLL,
+	    0xbf597fc7beef0ee4LL,
+	0xc6e00bf33da88fc2LL, 0xd5a79147930aa725LL, 0x06ca6351e003826fLL,
+	    0x142929670a0e6e70LL,
+	0x27b70a8546d22ffcLL, 0x2e1b21385c26c926LL, 0x4d2c6dfc5ac42aedLL,
+	    0x53380d139d95b3dfLL,
+	0x650a73548baf63deLL, 0x766a0abb3c77b2a8LL, 0x81c2c92e47edaee6LL,
+	    0x92722c851482353bLL,
+	0xa2bfe8a14cf10364LL, 0xa81a664bbc423001LL, 0xc24b8b70d0f89791LL,
+	    0xc76c51a30654be30LL,
+	0xd192e819d6ef5218LL, 0xd69906245565a910LL, 0xf40e35855771202aLL,
+	    0x106aa07032bbd1b8LL,
+	0x19a4c116b8d2d0c8LL, 0x1e376c085141ab53LL, 0x2748774cdf8eeb99LL,
+	    0x34b0bcb5e19b48a8LL,
+	0x391c0cb3c5c95a63LL, 0x4ed8aa4ae3418acbLL, 0x5b9cca4f7763e373LL,
+	    0x682e6ff3d6b2b8a3LL,
+	0x748f82ee5defb2fcLL, 0x78a5636f43172f60LL, 0x84c87814a1f0ab72LL,
+	    0x8cc702081a6439ecLL,
+	0x90befffa23631e28LL, 0xa4506cebde82bde9LL, 0xbef9a3f7b2c67915LL,
+	    0xc67178f2e372532bLL,
+	0xca273eceea26619cLL, 0xd186b8c721c0c207LL, 0xeada7dd6cde0eb1eLL,
+	    0xf57d4f7fee6ed178LL,
+	0x06f067aa72176fbaLL, 0x0a637dc5a2c898a6LL, 0x113f9804bef90daeLL,
+	    0x1b710b35131c471bLL,
+	0x28db77f523047d84LL, 0x32caab7b40c72493LL, 0x3c9ebe0a15c9bebcLL,
+	    0x431d67c49c100d4cLL,
+	0x4cc5d4becb3e42b6LL, 0x597f299cfc657e2aLL, 0x5fcb6fab3ad6faecLL,
+	    0x6c44198c4a475817LL,
+};
+
+__constant__ xsha512_salt cuda_salt[1];
+__constant__ uint64_t cuda_b0[1];
+
+__device__ void xsha512_init(xsha512_ctx *ctx)
+{
+    ctx->H[0] = 0x6a09e667f3bcc908LL;
+	ctx->H[1] = 0xbb67ae8584caa73bLL;
+	ctx->H[2] = 0x3c6ef372fe94f82bLL;
+	ctx->H[3] = 0xa54ff53a5f1d36f1LL;
+	ctx->H[4] = 0x510e527fade682d1LL;
+	ctx->H[5] = 0x9b05688c2b3e6c1fLL;
+	ctx->H[6] = 0x1f83d9abfb41bd6bLL;
+	ctx->H[7] = 0x5be0cd19137e2179LL;
+	ctx->buflen = 0;
+}
+
+
+__device__ void xsha512_update(xsha512_ctx *ctx, const char *string, uint8_t length)
+{
+    uint8_t *off = &ctx->buffer[ctx->buflen];
+    memcpy(off, string, length);
+    ctx->buflen += length;
+}
+
+// The function below is from Lukas' crypt512-cuda
+__device__ void sha512_block(xsha512_ctx * ctx)
+{
+	int i;
+	uint64_t a = ctx->H[0];
+	uint64_t b = ctx->H[1];
+	uint64_t c = ctx->H[2];
+	uint64_t d = ctx->H[3];
+	uint64_t e = ctx->H[4];
+	uint64_t f = ctx->H[5];
+	uint64_t g = ctx->H[6];
+	uint64_t h = ctx->H[7];
+
+
+	uint64_t w[16];
+
+	uint64_t *data = (uint64_t *) ctx->buffer;
+    #pragma unroll 16
+	for (i = 0; i < 16; i++)
+		w[i] = SWAP64(data[i]);
+
+	uint64_t t1, t2;
+    #pragma unroll 16
+	for (i = 0; i < 16; i++) {
+
+		t1 = k[i] + w[i] + h + Sigma1(e) + Ch(e, f, g);
+		t2 = Maj(a, b, c) + Sigma0(a);
+
+		h = g;
+		g = f;
+		f = e;
+		e = d + t1;
+		d = c;
+		c = b;
+		b = a;
+		a = t1 + t2;
+
+	}
+    #pragma unroll 64
+	for (i = 16; i < 80; i++) {
+
+		w[i & 15] =sigma1(w[(i - 2) & 15]) + sigma0(w[(i - 15) & 15]) + w[(i -16) & 15] + w[(i - 7) & 15];
+		t1 = k[i] + w[i & 15] + h + Sigma1(e) + Ch(e, f, g);
+		t2 = Maj(a, b, c) + Sigma0(a);
+
+		h = g;
+		g = f;
+		f = e;
+		e = d + t1;
+		d = c;
+		c = b;
+		b = a;
+		a = t1 + t2;
+
+	}
+
+	ctx->H[0] += a;
+#if 0
+	ctx->H[1] += b;
+	ctx->H[2] += c;
+	ctx->H[3] += d;
+	ctx->H[4] += e;
+	ctx->H[5] += f;
+	ctx->H[6] += g;
+	ctx->H[7] += h;
+#endif
+}
+
+__device__ void xsha512_final(xsha512_ctx *ctx)
+{
+    //append 1 to ctx buffer
+    uint32_t length = ctx->buflen;
+    uint8_t *buffer8 = &ctx->buffer[length];
+
+    *buffer8++ = 0x80;
+
+    while(++length % 4 != 0)  {
+        *buffer8++ = 0;
+    }
+
+    uint32_t *buffer32 = (uint32_t*)buffer8;
+    for(uint32_t i = length; i < 128; i+=4) {// append 0 to 128
+        *buffer32++=0;
+    }
+
+    //append length to ctx buffer
+    uint64_t *buffer64 = (uint64_t *)ctx->buffer;
+    buffer64[15] = SWAP64((uint64_t) ctx->buflen * 8); 
+
+    sha512_block(ctx);
+}
+
+__device__ void xsha512(const char* password, uint8_t pass_len, uint64_t *hash, uint32_t offset)
+{
+    xsha512_ctx ctx;
+    xsha512_init(&ctx);
+    xsha512_update(&ctx, (const char*)cuda_salt[0].v, SALT_SIZE);
+    xsha512_update(&ctx, password, pass_len);
+    xsha512_final(&ctx);
+
+#if 0
+	#pragma unroll 8
+	for(uint32_t i = 0; i < 8; ++i) {
+		hash[hash_addr(i, idx)] = SWAP64(ctx.H[i]);
+	}
+#else
+	hash[hash_addr(0, offset)] = SWAP64(ctx.H[0]);
+#endif
+}
+
+__global__ void kernel_xsha512(xsha512_key *cuda_password, xsha512_hash *cuda_hash)
+{
+
+    uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+	for(uint32_t it = 0; it < (MAX_KEYS_PER_CRYPT/KEYS_PER_CRYPT); ++it) {		
+		uint32_t offset = idx+it*KEYS_PER_CRYPT;
+    	xsha512((const char*)cuda_password[offset].v, cuda_password[offset].length, (uint64_t*)cuda_hash, offset);
+	}
+}
+
+void cuda_xsha512_init()
+{
+    password_size = sizeof(xsha512_key) * MAX_KEYS_PER_CRYPT;
+    hash_size = sizeof(xsha512_hash) * MAX_KEYS_PER_CRYPT;
+	HANDLE_ERROR(cudaMalloc(&cuda_password, password_size));
+    HANDLE_ERROR(cudaMalloc(&cuda_hash, hash_size));
+}
+
+void cuda_xsha512_cpy_hash(xsha512_hash* host_hash)
+{
+	HANDLE_ERROR(cudaMemcpy(host_hash, cuda_hash, hash_size, cudaMemcpyDeviceToHost));
+}
+
+void cuda_xsha512(xsha512_key *host_password, xsha512_salt *host_salt, xsha512_hash* host_hash) 
+{
+	if(xsha512_key_changed) {
+	    HANDLE_ERROR(cudaMemcpy(cuda_password, host_password, password_size, cudaMemcpyHostToDevice));
+	}
+    HANDLE_ERROR(cudaMemcpyToSymbol(cuda_salt, host_salt, sizeof(xsha512_salt)));
+
+    dim3 dimGrid(BLOCKS);
+    dim3 dimBlock(THREADS);
+    kernel_xsha512 <<< dimGrid, dimBlock >>> (cuda_password, cuda_hash);
+}
+
+__global__ void kernel_cmp_all(int count, uint64_t* hash, uint8_t *result)
+{
+	uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+	if(idx == 0)
+		*result = 0;
+	__syncthreads();
+	for(uint32_t it = 0; it < (MAX_KEYS_PER_CRYPT/KEYS_PER_CRYPT); ++it) {		
+		uint32_t offset = idx+it*KEYS_PER_CRYPT;
+		if(offset < count){
+			if (cuda_b0[0] == hash[hash_addr(0, offset)])
+				*result = 1;
+		}
+	}
+}
+
+int cuda_cmp_all(void *binary, int count)
+{
+	uint64_t b0 = *(uint64_t *)binary;
+	HANDLE_ERROR(cudaMemcpyToSymbol(cuda_b0, &b0, sizeof(uint64_t)));
+	uint8_t result = 0;
+	uint8_t *cuda_result;
+	HANDLE_ERROR(cudaMalloc(&cuda_result, sizeof(uint8_t)));
+    dim3 dimGrid(BLOCKS);
+    dim3 dimBlock(THREADS);
+	kernel_cmp_all <<< dimGrid, dimBlock >>> (count, (uint64_t*)cuda_hash, cuda_result);
+	HANDLE_ERROR(cudaMemcpy(&result, cuda_result, sizeof(uint8_t), cudaMemcpyDeviceToHost));
+	return result;
+}
diff --git a/src/cuda_xsha512.h b/src/cuda_xsha512.h
@@ -0,0 +1,82 @@
+/*
+ * This software is Copyright (c) 2011 Myrice <qqlddg at gmail dot com>
+ * and it is hereby released to the general public under the following terms:
+ * Redistribution and use in source and binary forms, with or without modification, are permitted.
+ * Thanks to Lukas Odzioba <lukas dot odzioba at gmail dot com>, his code helps me a lot
+*/
+
+#ifndef _CUDA_XSHA512_H
+#define _CUDA_XSHA512_H
+
+#define uint8_t  unsigned char
+#define uint32_t unsigned int
+#define uint64_t unsigned long long int
+
+#define BLOCKS 32
+#define THREADS 512
+#define KEYS_PER_CRYPT (BLOCKS*THREADS)
+
+#define MIN_KEYS_PER_CRYPT	(KEYS_PER_CRYPT)
+#define MAX_KEYS_PER_CRYPT	(128*KEYS_PER_CRYPT)
+
+#define SALT_SIZE 4
+#if 0
+#define BINARY_SIZE 64
+#else
+#define BINARY_SIZE 8
+#endif
+
+#if 0
+#define PLAINTEXT_LENGTH		107
+#else
+#define PLAINTEXT_LENGTH		12
+#endif
+#define CIPHERTEXT_LENGTH		136
+
+extern uint8_t xsha512_key_changed;
+// Thanks for Lukas' code here
+# define SWAP64(n) \
+  (((n) << 56)					\
+   | (((n) & 0xff00) << 40)			\
+   | (((n) & 0xff0000) << 24)			\
+   | (((n) & 0xff000000) << 8)			\
+   | (((n) >> 8) & 0xff000000)			\
+   | (((n) >> 24) & 0xff0000)			\
+   | (((n) >> 40) & 0xff00)			\
+   | ((n) >> 56))
+
+
+#define rol(x,n) ((x << n) | (x >> (64-n)))
+#define ror(x,n) ((x >> n) | (x << (64-n)))
+#define Ch(x,y,z) ((x & y) ^ ( (~x) & z))
+#define Maj(x,y,z) ((x & y) ^ (x & z) ^ (y & z))
+#define Sigma0(x) ((ror(x,28))  ^ (ror(x,34)) ^ (ror(x,39)))
+#define Sigma1(x) ((ror(x,14))  ^ (ror(x,18)) ^ (ror(x,41)))
+#define sigma0(x) ((ror(x,1))  ^ (ror(x,8)) ^(x>>7))
+#define sigma1(x) ((ror(x,19)) ^ (ror(x,61)) ^(x>>6))
+
+#define hash_addr(j,idx) (((j)*(MAX_KEYS_PER_CRYPT))+(idx))
+
+
+typedef struct { // notice memory align problem
+	uint8_t buffer[128];	//1024bits
+	uint32_t buflen;
+	uint64_t H[8];
+} xsha512_ctx;
+
+
+typedef struct {
+    uint8_t v[SALT_SIZE]; // 32bits
+} xsha512_salt;
+
+typedef struct {
+    uint8_t length;
+    char v[PLAINTEXT_LENGTH+1];
+} xsha512_key;
+
+typedef struct {
+    uint64_t v[BINARY_SIZE / 8]; // up to 512 bits
+} xsha512_hash;
+
+#endif
+