In [None]:
mod = SourceModule("""
__global__ void get_row_indices(uint* )
{
    int pos_y = threadIdx.y + blockDim.y * blockIdx.y; // 画像のy方向の位置を取得。
    int pos_x = threadIdx.x + blockDim.x * blockIdx.x; // 画像のx方向の位置を取得。

    int idx = pos_y * height + pos_x; // 内部では1次元配列化しているので、インデックスはこのようになる

    image[idx] = 255 - image[idx];  //画素値を反転
}
""")


In [22]:
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
from pycuda import gpuarray
import numpy as np
from PIL import Image

In [None]:
def _generate_signature_matrix_cpu(numhash, numband, bandwidth, data):
    signature_matrix = np.zeros((numhash, data.shape[1]))
    for row in range(numhash):
        for col in range(data.shape[1]):
            idsets = np.where(data[:, col] >= 1)[0]
            if len(idsets) > 0: 
                signature_matrix[row, col] = minhash(idsets, seed = row)
            else:
                signature_matrix[row, col] = hash_fn(3511 * col, seed = row)
    return signature_matrix


In [95]:
#
# 以下CUDAカーネル
#

mod = SourceModule("""
__global__ void negative(unsigned char *image, int height, int width)
{
    int pos_y = threadIdx.y + blockDim.y * blockIdx.y; // 画像のy方向の位置を取得。
    int pos_x = threadIdx.x + blockDim.x * blockIdx.x; // 画像のx方向の位置を取得。

    int idx = pos_y * height + pos_x; // 内部では1次元配列化しているので、インデックスはこのようになる

    image[idx] = 255 - image[idx];  //画素値を反転
}
""")


#
# 画像の読みこみ
#
pil_image = Image.open("./lena_gray.png")   # 画像の読み込み
image = np.array(pil_image, np.uint8)       # PIL Image型からnumpy arrayに変換。

height, width = image.shape[:2] # 今回は512x512pxの画像を利用

#
# CUDA周りの準備
#
cuda_kernel = mod.get_function("negative")  # 上で定義したカーネルを呼び出す

block   = (512, 1, 1)                       # Block size (後述) 
grid    = (512, 1, 1)                      # Grid size (後述)


#
# カーネルを実行
#
cuda_kernel(cuda.InOut(image),              # imageを参照渡しする。
            np.int32(height),               # int型定数はnumpyで明示的に型を定義する
            np.int32(width), 
            block=block, grid=grid)         # BlockとGridも引数として与える

#
# 処理後の画像を保存
#
pil_output_image = Image.fromarray(image)
pil_output_image.save("out.png")

In [99]:
def negative_cpu(image):
    for row in range(image.shape[0]):
        for col in range(image.shape[1]):
            image[row, col] = -image[row, col]
    return image

In [100]:
%%timeit
negative_cpu(image)

92.8 ms ± 525 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [96]:
%%timeit
#
# CUDA周りの準備
#
cuda_kernel = mod.get_function("negative")  # 上で定義したカーネルを呼び出す

block   = (512, 1, 1)                       # Block size (後述) 
grid    = (512, 1, 1)                      # Grid size (後述)


#
# カーネルを実行
#
cuda_kernel(cuda.InOut(image),              # imageを参照渡しする。
            np.int32(height),               # int型定数はnumpyで明示的に型を定義する
            np.int32(width), 
            block=block, grid=grid)         # BlockとGridも引数として与える


# #
# # 処理後の画像を保存
# #
# pil_output_image = Image.fromarray(image)
# pil_output_image.save("out.png")

4.83 ms ± 6.67 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [94]:
mod = SourceModule("""
#include <stdio.h>
#include <cuda.h>
#include <cublas.h>
#define UINT4 uint

/* F, G and H are basic MD5 functions: selection, majority, parity */

#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
#define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
#define H(x, y, z) ((x) ^ (y) ^ (z))
#define I(x, y, z) ((y) ^ ((x) | (~z))) 

/* ROTATE_LEFT rotates x left n bits */
#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))

/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4 */
/* Rotation is separate from addition to prevent recomputation */
#define FF(a, b, c, d, x, s, ac) \
  {(a) += F ((b), (c), (d)) + (x) + (UINT4)(ac); \
   (a) = ROTATE_LEFT ((a), (s)); \
   (a) += (b); \
  }

#define GG(a, b, c, d, x, s, ac) \
  {(a) += G ((b), (c), (d)) + (x) + (UINT4)(ac); \
   (a) = ROTATE_LEFT ((a), (s)); \
   (a) += (b); \
  }

#define HH(a, b, c, d, x, s, ac) \
  {(a) += H ((b), (c), (d)) + (x) + (UINT4)(ac); \
   (a) = ROTATE_LEFT ((a), (s)); \
   (a) += (b); \
  }

#define II(a, b, c, d, x, s, ac) \
  {(a) += I ((b), (c), (d)) + (x) + (UINT4)(ac); \
   (a) = ROTATE_LEFT ((a), (s)); \
   (a) += (b); \
  }

__global__ void md5(uint *in, uint *hash) {
uint a, b, c, d;

const uint a0 = 0x67452301;
const uint b0 = 0xEFCDAB89;
const uint c0 = 0x98BADCFE;
const uint d0 = 0x10325476;

    a = a0;
    b = b0;
    c = c0;
    d = d0;

  /* Round 1 */
#define S11 7
#define S12 12
#define S13 17
#define S14 22
  FF ( a, b, c, d, in[ 0], S11, 3614090360); /* 1 */
  FF ( d, a, b, c, in[ 1], S12, 3905402710); /* 2 */
  FF ( c, d, a, b, in[ 2], S13,  606105819); /* 3 */
  FF ( b, c, d, a, in[ 3], S14, 3250441966); /* 4 */
  FF ( a, b, c, d, in[ 4], S11, 4118548399); /* 5 */
  FF ( d, a, b, c, in[ 5], S12, 1200080426); /* 6 */
  FF ( c, d, a, b, in[ 6], S13, 2821735955); /* 7 */
  FF ( b, c, d, a, in[ 7], S14, 4249261313); /* 8 */
  FF ( a, b, c, d, in[ 8], S11, 1770035416); /* 9 */
  FF ( d, a, b, c, in[ 9], S12, 2336552879); /* 10 */
  FF ( c, d, a, b, in[10], S13, 4294925233); /* 11 */
  FF ( b, c, d, a, in[11], S14, 2304563134); /* 12 */
  FF ( a, b, c, d, in[12], S11, 1804603682); /* 13 */
  FF ( d, a, b, c, in[13], S12, 4254626195); /* 14 */
  FF ( c, d, a, b, in[14], S13, 2792965006); /* 15 */
  FF ( b, c, d, a, in[15], S14, 1236535329); /* 16 */

  /* Round 2 */
#define S21 5
#define S22 9
#define S23 14
#define S24 20
  GG ( a, b, c, d, in[ 1], S21, 4129170786); /* 17 */
  GG ( d, a, b, c, in[ 6], S22, 3225465664); /* 18 */
  GG ( c, d, a, b, in[11], S23,  643717713); /* 19 */
  GG ( b, c, d, a, in[ 0], S24, 3921069994); /* 20 */
  GG ( a, b, c, d, in[ 5], S21, 3593408605); /* 21 */
  GG ( d, a, b, c, in[10], S22,   38016083); /* 22 */
  GG ( c, d, a, b, in[15], S23, 3634488961); /* 23 */
  GG ( b, c, d, a, in[ 4], S24, 3889429448); /* 24 */
  GG ( a, b, c, d, in[ 9], S21,  568446438); /* 25 */
  GG ( d, a, b, c, in[14], S22, 3275163606); /* 26 */
  GG ( c, d, a, b, in[ 3], S23, 4107603335); /* 27 */
  GG ( b, c, d, a, in[ 8], S24, 1163531501); /* 28 */
  GG ( a, b, c, d, in[13], S21, 2850285829); /* 29 */
  GG ( d, a, b, c, in[ 2], S22, 4243563512); /* 30 */
  GG ( c, d, a, b, in[ 7], S23, 1735328473); /* 31 */
  GG ( b, c, d, a, in[12], S24, 2368359562); /* 32 */

  /* Round 3 */
#define S31 4
#define S32 11
#define S33 16
#define S34 23
  HH ( a, b, c, d, in[ 5], S31, 4294588738); /* 33 */
  HH ( d, a, b, c, in[ 8], S32, 2272392833); /* 34 */
  HH ( c, d, a, b, in[11], S33, 1839030562); /* 35 */
  HH ( b, c, d, a, in[14], S34, 4259657740); /* 36 */
  HH ( a, b, c, d, in[ 1], S31, 2763975236); /* 37 */
  HH ( d, a, b, c, in[ 4], S32, 1272893353); /* 38 */
  HH ( c, d, a, b, in[ 7], S33, 4139469664); /* 39 */
  HH ( b, c, d, a, in[10], S34, 3200236656); /* 40 */
  HH ( a, b, c, d, in[13], S31,  681279174); /* 41 */
  HH ( d, a, b, c, in[ 0], S32, 3936430074); /* 42 */
  HH ( c, d, a, b, in[ 3], S33, 3572445317); /* 43 */
  HH ( b, c, d, a, in[ 6], S34,   76029189); /* 44 */
  HH ( a, b, c, d, in[ 9], S31, 3654602809); /* 45 */
  HH ( d, a, b, c, in[12], S32, 3873151461); /* 46 */
  HH ( c, d, a, b, in[15], S33,  530742520); /* 47 */
  HH ( b, c, d, a, in[ 2], S34, 3299628645); /* 48 */

  /* Round 4 */
#define S41 6
#define S42 10
#define S43 15
#define S44 21
  II ( a, b, c, d, in[ 0], S41, 4096336452); /* 49 */
  II ( d, a, b, c, in[ 7], S42, 1126891415); /* 50 */
  II ( c, d, a, b, in[14], S43, 2878612391); /* 51 */
  II ( b, c, d, a, in[ 5], S44, 4237533241); /* 52 */
  II ( a, b, c, d, in[12], S41, 1700485571); /* 53 */
  II ( d, a, b, c, in[ 3], S42, 2399980690); /* 54 */
  II ( c, d, a, b, in[10], S43, 4293915773); /* 55 */
  II ( b, c, d, a, in[ 1], S44, 2240044497); /* 56 */
  II ( a, b, c, d, in[ 8], S41, 1873313359); /* 57 */
  II ( d, a, b, c, in[15], S42, 4264355552); /* 58 */
  II ( c, d, a, b, in[ 6], S43, 2734768916); /* 59 */
  II ( b, c, d, a, in[13], S44, 1309151649); /* 60 */
  II ( a, b, c, d, in[ 4], S41, 4149444226); /* 61 */
  II ( d, a, b, c, in[11], S42, 3174756917); /* 62 */
  II ( c, d, a, b, in[ 2], S43,  718787259); /* 63 */
  II ( b, c, d, a, in[ 9], S44, 3951481745); /* 64 */
  
  a += a0;
  b += b0;
  c += c0;
  d += d0;
  hash[0] = a;
  hash[1] = b;
  hash[2] = c;
  hash[3] = d;

return;
}
""")

In [29]:
#  md5(uint *in, uint *hash)

In [63]:
#
# CUDA周りの準備
#
cuda_kernel = mod.get_function("md5")  # 上で定義したカーネルを呼び出す

block   = (512, 1, 1)                       # Block size (後述) 
grid    = (512, 1, 1)                      # Grid size (後述)

a = np.arange(16)
b = np.zeros(4, dtype = np.uint8)
b_gpu = gpuarray.to_gpu(b)
#
# カーネルを実行
#
cuda_kernel(
            cuda.InOut(np.uint8(np.arange(100))),               # int型定数はnumpyで明示的に型を定義する
            b_gpu, 
            block=block, grid=grid)         # BlockとGridも引数として与える


In [None]:
unsigned int RSHash(const std::string& str)
{
    unsigned int b    = 378551;
    unsigned int a    = 63689;
    unsigned int hash = 0;

    for(std::size_t i = 0; i < str.length(); i++)
    {
        hash = hash * a + str[i];
        a    = a * b;
    }

    return (hash & 0x7FFFFFFF);
 }

In [None]:
def rshash(key):
    a = 63689
    b = 378551
    hash_ = 0
    for i in range(len(key)):
        hash_ = hash_ * a + key[i]
        a = a * b
    return hash_ & 0x7FFFFFFF

In [75]:
0x7FFFFFFF

2147483647

In [72]:
ord("t")

116

In [92]:
def rshash(key):
    a = 63689
    b = 378551
    hash_ = 0
    for i in range(len(key)):
        hash_ = hash_ * a + key[i]
        a = a * b
    return hash_ & 0x7FFFFFFF

In [93]:
RSHash(np.arange(12))

684956120

In [64]:
b_gpu.get()

array([202, 217,  68, 145], dtype=uint8)

In [53]:
c = b_gpu.get()