In [1]:
import math

import numba
import numpy
from numba import cuda
import numpy as np
import random
import time
import copy


def init_input(plaintext, block_size):
    res = [[], [], [], []]
    binary = bin(plaintext)[2:]
    binary = binary.zfill(block_size)
    for i in range(block_size):
        group_index = int(i) % 4
        res[group_index].append(binary[i])
    for r in res:
        r.reverse()
    rl = res[3] + res[2] + res[1] + res[0]
    rl.reverse()
    initial_num = ''.join(rl)

    return int(initial_num, 2)


def key_schedule(key):
    k_3 = init_input(key >> 32 * 3, 32)
    k_2 = init_input(key >> 32 * 2, 32)
    k_1 = init_input(key >> 32 * 1, 32)
    k_0 = init_input(key >> 32 * 0, 32)
    block_size = 128 // 4
    sub_key = [k_0, k_1, k_2, k_3]
    for i in range(48 - 4):
        k = a8(a8(a8(sub_key[i + 3], block_size), block_size), block_size) ^ sub_key[i] ^ (i + 1)
        sub_key.append(k)
    return sub_key


# @cuda.jit
# def init_input_gpu(plaintext, block_size, temp_list):
# res = [[], [], [], []]
#
# binary = bin(plaintext)[2:]
# binary = binary.zfill(block_size)
# for i in range(block_size):
#     group_index = int(i) % 4
#     res[group_index].append(binary[i])
# for r in res:
#     r.reverse()
# rl = res[3] + res[2] + res[1] + res[0]
# rl.reverse()
# initial_num = ''.join(rl)
#
# temp_list[1] = int(initial_num, 2)


@cuda.jit
def g0(num, block_size, temp_list):
    res = 0
    mask_0 = 0xFF
    mask_1 = 0xFF00
    mask_2 = 0xFF0000
    mask_3 = 0xFF000000
    group_size = block_size // 4
    # y{0} = x{3} and x{2} xor x{0}
    res |= ((num >> (group_size * 3)) & mask_0) & ((num >> (group_size * 2)) & mask_0) ^ (num & mask_0)
    # y{3} = y{0} and x{1} xor x{3}
    res |= ((res << (group_size * 3)) & mask_3) & ((num << (group_size * 2)) & mask_3) ^ (num & mask_3)
    # y{2} = x{2}
    res |= (num & mask_2)
    # y{1} = x{1}
    res |= (num & mask_1)
    temp_list[3] = res


@cuda.jit
def g1(num, block_size, temp_list):
    res = 0
    mask_0 = 0xFF
    mask_1 = 0xFF00
    mask_2 = 0xFF0000
    mask_3 = 0xFF000000
    group_size = block_size // 4
    # y{2} = x{3} and x{1} xor x{2}
    res |= ((num >> (group_size * 1)) & mask_2) & ((num << (group_size * 1)) & mask_2) ^ (num & mask_2)
    # y{1} = y{2} and x{0} xor x{1}
    res |= ((res >> (group_size * 1)) & mask_1) & ((num << (group_size * 1)) & mask_1) ^ (num & mask_1)
    # y{3} = x{3}
    res |= (num & mask_3)
    # y{0} = x{0}
    res |= (num & mask_0)
    temp_list[4] = res


@cuda.jit
def rotation(rot_size, block_size, temp_list):
    num = temp_list[2]
    if rot_size == 0:
        temp_list[2] = num
    else:
        mask_0 = 0xFF
        group_size = block_size // 4
        n_3 = num >> (group_size * 3) & mask_0
        n_2 = num >> (group_size * 2) & mask_0
        n_1 = num >> (group_size * 1) & mask_0
        n_0 = num >> (group_size * 0) & mask_0

        n_3 = (n_3 << rot_size | n_3 >> (group_size - rot_size)) & mask_0

        n_2 = (n_2 << rot_size | n_2 >> (group_size - rot_size)) & mask_0

        n_1 = (n_1 << rot_size | n_1 >> (group_size - rot_size)) & mask_0

        n_0 = (n_0 << rot_size | n_0 >> (group_size - rot_size)) & mask_0

        res = n_3 << group_size * 3 | n_2 << group_size * 2 | n_1 << group_size | n_0
        temp_list[2] = res


def a8(num, block_size):
    t0 = 3
    t1 = 1
    step = block_size // 8
    res = num >> step
    x0 = num & 0xF
    x7 = num >> step * 7 & 0xF
    y6 = x7 ^ (x7 << t0)
    y7 = (x7 << t1 | x7 >> (step - t1)) ^ x0
    res = (y7 << step * 7) | (y6 << step * 6) | res
    return res


@cuda.jit
def perm(num, block_size, temp_list, perm_list):
    group_size = block_size // 4

    res0 = ((num & 0xFF >> (7 - perm_list[7])) & 0b1)
    res1 = ((((num >> group_size) & 0xFF) >> (7 - perm_list[0])) & 0b1)
    res2 = ((((num >> group_size * 2) & 0xFF) >> (7 - perm_list[0])) & 0b1)
    res3 = ((((num >> group_size * 3) & 0xFF) >> (7 - perm_list[0])) & 0b1)

    for j in range(1, group_size):
        res0 <<= 1
        res0 |= (((num & 0xFF) >> (7 - perm_list[j])) & 0b1)
    for j in range(1, group_size):
        res1 <<= 1
        res1 |= ((((num >> group_size) & 0xFF) >> (7 - perm_list[j])) & 0b1)
    for j in range(1, group_size):
        res2 <<= 1
        res2 |= ((((num >> group_size * 2) & 0xFF) >> (7 - perm_list[j])) & 0b1)
    for j in range(1, group_size):
        res3 <<= 1
        res3 |= ((((num >> group_size * 3) & 0xFF) >> (7 - perm_list[j])) & 0b1)

    res = res3 << group_size * 3 | res2 << group_size * 2 | res1 << group_size * 1 | res0
    temp_list[5] = res


@cuda.jit
def enc(rounds, word_size, keys, temp_list, perm_list):
    plaintext = temp_list[0]
    alpha = 0
    beta = 1
    block_size = word_size // 2
    ################
    temp_list[31] = block_size
    block_size = temp_list[31]
    ################
    mask = 0xFFFFFFFF

    if block_size == 64:
        mask = 0xFFFFFFFFFFFFFFFF
    p_l = plaintext >> block_size
    p_r = plaintext & 0xFFFFFFFF
    # init_input_gpu(p_l, block_size, temp_list)
    init_l = p_l
    # init_input_gpu(p_r, block_size, temp_list)
    init_r = p_r
    for i in range(rounds):
        ori_l = init_l

        temp_list[2] = init_l
        rotation(alpha, block_size, temp_list)
        rot_g0 = temp_list[2]

        temp_list[2] = init_l
        rotation(beta, block_size, temp_list)
        rot_g1 = temp_list[2]

        g0(rot_g0, block_size, temp_list)
        g0_out = temp_list[3]

        g1(rot_g1, block_size, temp_list)
        g1_out = temp_list[4]

        perm(g0_out ^ g1_out, block_size, temp_list, perm_list)

        perm_out = temp_list[5]
        temp_list[31] = init_r
        init_r = temp_list[31]

        temp_list[31] = i
        i = temp_list[31]

        init_l = perm_out ^ init_r #^ keys[i]
        init_r = ori_l

        temp_list[31] = init_l
        init_l = temp_list[31]
        temp_list[31] = init_r
        init_r = temp_list[31]

    temp_list[31] = init_r
    init_r = temp_list[32]
    temp_list[0] = init_l << block_size | init_r


@cuda.jit
def start_gpu_task(keys, input_diff, output_diff, rounds, result_collector, temp_list, word_size, perm_list):
    weight = 14
    thread_index = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    result_collector[thread_index] = 0
    res = result_collector[thread_index]
    used_list = temp_list[thread_index]
    start = thread_index * (2 ** weight)
    end = thread_index * (2 ** weight) + 2 ** weight
    for i in range(start, end):
        x1 = i
        if x1 > (x1 ^ input_diff):
            continue
        used_list[0] = x1
        enc(rounds, word_size, keys, used_list, perm_list)
        c1 = used_list[0]

        x2 = x1 ^ input_diff
        used_list[0] = x2
        enc(rounds, word_size, keys, used_list, perm_list)
        c2 = used_list[0]

        if c1 ^ c2 == output_diff:
            res += 1

        # c3 = c1 ^ output_diff
        # c4 = c2 ^ output_diff
        #
        # dec(c3, keys, ir, offset, rounds, used_list)
        # x3 = used_list[0]
        #
        # dec(c4, keys, ir, offset, rounds, used_list)
        # x4 = used_list[0]
        # if x3 ^ x4 == input_diff:
        #     res += 2
    result_collector[thread_index] = res


# GPU Tasks
def test():
    diff_left = 0x00080000
    diff_right = 0x00480000
    out_left = 0x00480000
    out_right = 0x00080000
    input_dff = diff_left << 32 | diff_right
    output_diff = out_left << 32 | out_right
    total = 2 ** 20
    rounds = 3
    counter = 0
    word_size = 64

    # GPU Setting
    threads_in_per_block = 2 ** 8
    blocks_in_per_grid = 2 ** 10
    total_threads = threads_in_per_block * blocks_in_per_grid

    result = numpy.zeros((total_threads,), dtype=numpy.uint64)
    temp_list = numpy.array([[0 for _ in range(32)] for _ in range(total_threads)], dtype=numpy.uint64)
    key = random.randint(0, 2 ** 128)
    sub_keys = key_schedule(key)
    perm_list = [7, 4, 1, 6, 3, 0, 5, 2]

    cuda_sub_keys = cuda.to_device(sub_keys)
    cuda_result = cuda.to_device(result)
    cuda_temp_list = cuda.to_device(temp_list)
    cuda_perm_list = cuda.to_device(perm_list)
    start_time = time.time()

    (start_gpu_task[blocks_in_per_grid, threads_in_per_block](cuda_sub_keys, input_dff, output_diff, rounds,
                                                              cuda_result,
                                                              cuda_temp_list, word_size, cuda_perm_list))

    res = numpy.zeros((1,), dtype=numpy.uint64)[0]
    for r in cuda_result:
        res += r
    if res == 0:
        tip = "Invalid"
    else:
        tip = math.log2(res / 2 ** (10 + 8 + 14))
    print("w:{}".format(tip))
    print(res)
    print("Task done, time:{}".format(time.time() - start_time))


test()


w:Invalid
0
Task done, time:60.179816007614136
