From 1857d6feb0ba5010f54e3a9c9a9209b944d1179c Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Sat, 14 Nov 2020 14:39:49 -0500
Subject: [PATCH 01/26] encrypt/decrypt

[ghstack-poisoned]
---
 test/test_csprng.py               |  43 ++++++++
 torchcsprng/csrc/aes.h            | 117 +++++++++++++++++++++
 torchcsprng/csrc/block_cipher_2.h | 164 ++++++++++++++++++++++++++++++
 torchcsprng/csrc/csprng.h         |  73 +++++++++++++
 4 files changed, 397 insertions(+)
 create mode 100644 torchcsprng/csrc/block_cipher_2.h

diff --git a/test/test_csprng.py b/test/test_csprng.py
index b4c5443..43c7980 100644
--- a/test/test_csprng.py
+++ b/test/test_csprng.py
@@ -354,5 +354,48 @@ def test_const_generator(self):
                     second = torch.empty(self.size, dtype=dtype, device=device).random_(generator=const_gen)
                     self.assertTrue((first - second).max().abs() == 0)
 
+    def test_encrypt_decrypt(self):
+        key_size_bytes = 16
+        block_size_bytes = 16
+
+        def sizeof(dtype):
+            if dtype == torch.bool:
+                return 1
+            elif dtype.is_floating_point:
+                return torch.finfo(dtype).bits // 8
+            else:
+                return torch.iinfo(dtype).bits // 8
+
+        for device in self.all_devices:
+            for key_dtype in self.all_dtypes:
+                key_size = key_size_bytes // sizeof(key_dtype)
+                key = torch.empty(key_size, dtype=key_dtype, device=device).random_()
+                for initial_dtype in self.all_dtypes:
+                    for encrypted_dtype in self.all_dtypes:
+                        for decrypted_dtype in self.all_dtypes:
+                            for initial_size in [0, 4, 8, 15, 16, 23, 42]:
+
+                                encrypted_size = (initial_size * sizeof(initial_dtype) + block_size_bytes - 1) // block_size_bytes * block_size_bytes // sizeof(encrypted_dtype)
+                                decrypted_size = (encrypted_size * sizeof(encrypted_dtype) + block_size_bytes - 1) // block_size_bytes * block_size_bytes // sizeof(decrypted_dtype)
+
+                                initial = torch.empty(initial_size, dtype=initial_dtype, device=device).random_()
+                                encrypted = torch.empty(encrypted_size, dtype=encrypted_dtype, device=device).random_()
+                                decrypted = torch.empty(decrypted_size, dtype=decrypted_dtype, device=device).random_()
+
+                                initial_np = initial.numpy().view(np.int8)
+                                decrypted_np = decrypted.numpy().view(np.int8)
+                                padding_size_bytes = initial_size * sizeof(initial_dtype) - decrypted_size * sizeof(decrypted_dtype)
+                                if padding_size_bytes != 0:
+                                    decrypted_np = decrypted_np[:padding_size_bytes]
+
+                                csprng.encrypt(initial, encrypted, key, "aes128", "ecb")
+
+                                if initial_size > 8:
+                                    self.assertFalse(np.array_equal(initial_np, decrypted_np))
+
+                                csprng.decrypt(encrypted, decrypted, key, "aes128", "ecb")
+
+                                self.assertTrue(np.array_equal(initial_np, decrypted_np))
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/torchcsprng/csrc/aes.h b/torchcsprng/csrc/aes.h
index 09596f1..bd41928 100644
--- a/torchcsprng/csrc/aes.h
+++ b/torchcsprng/csrc/aes.h
@@ -97,6 +97,24 @@ TORCH_CSPRNG_CONSTANT const uint8_t sbox[256] = {
   0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
   0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 };
 
+TORCH_CSPRNG_CONSTANT const uint8_t rsbox[256] = {
+    0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
+    0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
+    0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
+    0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
+    0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
+    0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
+    0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
+    0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
+    0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
+    0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
+    0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
+    0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
+    0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
+    0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
+    0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
+    0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d };
+
 // The round constant word array, Rcon[i], contains the values given by 
 // x to the power (i-1) being powers of x (x is denoted as {02}) in the field GF(2^8)
 TORCH_CSPRNG_CONSTANT const uint8_t Rcon[11] = {
@@ -104,6 +122,8 @@ TORCH_CSPRNG_CONSTANT const uint8_t Rcon[11] = {
 
 #define getSBoxValue(num) (sbox[(num)])
 
+#define getSBoxInvert(num) (rsbox[(num)])
+
 // This function produces Nb(Nr+1) round keys. The round keys are used in each round to decrypt the states. 
 TORCH_CSPRNG_HOST_DEVICE void KeyExpansion(uint8_t* RoundKey, const uint8_t* Key){
   unsigned int i, j, k;
@@ -257,6 +277,78 @@ TORCH_CSPRNG_HOST_DEVICE void MixColumns(state_t* state)
   }
 }
 
+TORCH_CSPRNG_HOST_DEVICE uint8_t Multiply(uint8_t x, uint8_t y)
+{
+  return (((y & 1) * x) ^
+          ((y>>1 & 1) * xtime(x)) ^
+          ((y>>2 & 1) * xtime(xtime(x))) ^
+          ((y>>3 & 1) * xtime(xtime(xtime(x)))) ^
+          ((y>>4 & 1) * xtime(xtime(xtime(xtime(x)))))); /* InvMixColumns only passes y in {0x09, 0x0b, 0x0d, 0x0e}, so bit 4 is never set and this term could be omitted */
+}
+
+// InvMixColumns is the inverse of MixColumns (FIPS-197): each group of four state
+// bytes (*state)[i][0..3] is multiplied by the matrix whose first row is
+// {0x0e, 0x0b, 0x0d, 0x09}; products come from Multiply() and are summed with XOR.
+TORCH_CSPRNG_HOST_DEVICE void InvMixColumns(state_t* state)
+{
+  int i;
+  uint8_t a, b, c, d;
+  for (i = 0; i < 4; ++i)
+  {
+    a = (*state)[i][0];
+    b = (*state)[i][1];
+    c = (*state)[i][2];
+    d = (*state)[i][3];
+
+    (*state)[i][0] = Multiply(a, 0x0e) ^ Multiply(b, 0x0b) ^ Multiply(c, 0x0d) ^ Multiply(d, 0x09);
+    (*state)[i][1] = Multiply(a, 0x09) ^ Multiply(b, 0x0e) ^ Multiply(c, 0x0b) ^ Multiply(d, 0x0d);
+    (*state)[i][2] = Multiply(a, 0x0d) ^ Multiply(b, 0x09) ^ Multiply(c, 0x0e) ^ Multiply(d, 0x0b);
+    (*state)[i][3] = Multiply(a, 0x0b) ^ Multiply(b, 0x0d) ^ Multiply(c, 0x09) ^ Multiply(d, 0x0e);
+  }
+}
+
+// InvSubBytes replaces every byte of the state matrix with its
+// entry in the inverse S-box (rsbox, looked up via getSBoxInvert).
+TORCH_CSPRNG_HOST_DEVICE void InvSubBytes(state_t* state)
+{
+  uint8_t i, j;
+  for (i = 0; i < 4; ++i)
+  {
+    for (j = 0; j < 4; ++j)
+    {
+      (*state)[j][i] = getSBoxInvert((*state)[j][i]);
+    }
+  }
+}
+
+TORCH_CSPRNG_HOST_DEVICE void InvShiftRows(state_t* state)
+{
+  uint8_t temp;
+
+  // Rotate row 1 (second index == 1) one column to the right; row 0 is left untouched
+  temp = (*state)[3][1];
+  (*state)[3][1] = (*state)[2][1];
+  (*state)[2][1] = (*state)[1][1];
+  (*state)[1][1] = (*state)[0][1];
+  (*state)[0][1] = temp;
+
+  // Rotate row 2 two columns to the right, done as two swaps (columns 0<->2 and 1<->3)
+  temp = (*state)[0][2];
+  (*state)[0][2] = (*state)[2][2];
+  (*state)[2][2] = temp;
+
+  temp = (*state)[1][2];
+  (*state)[1][2] = (*state)[3][2];
+  (*state)[3][2] = temp;
+
+  // Rotate row 3 three columns to the right (equivalently, one column to the left)
+  temp = (*state)[0][3];
+  (*state)[0][3] = (*state)[1][3];
+  (*state)[1][3] = (*state)[2][3];
+  (*state)[2][3] = (*state)[3][3];
+  (*state)[3][3] = temp;
+}
+
 TORCH_CSPRNG_HOST_DEVICE void encrypt(uint8_t* state, const uint8_t* key) {
   uint8_t RoundKey[176];
   KeyExpansion(RoundKey, key); 
@@ -284,4 +376,29 @@ TORCH_CSPRNG_HOST_DEVICE void encrypt(uint8_t* state, const uint8_t* key) {
   AddRoundKey(Nr, (state_t*)state, RoundKey);
 }
 
+TORCH_CSPRNG_HOST_DEVICE void decrypt(uint8_t* state, const uint8_t* key) {
+  uint8_t RoundKey[176];
+  KeyExpansion(RoundKey, key);
+
+  uint8_t round = 0;
+
+  // Start with the last round key (index Nr): decryption walks the key schedule backwards.
+  AddRoundKey(Nr, (state_t*)state, RoundKey);
+
+  // There will be Nr rounds, using round keys Nr-1 down to 0.
+  // Every round applies InvShiftRows, InvSubBytes and AddRoundKey;
+  // all but the final one (round key 0) then apply InvMixColumns.
+  // The loop has no condition: it exits through the break below.
+  for (round = (Nr - 1); ; --round)
+  {
+    InvShiftRows((state_t*)state);
+    InvSubBytes((state_t*)state);
+    AddRoundKey(round, (state_t*)state, RoundKey);
+    if (round == 0) {
+      break;
+    }
+    InvMixColumns((state_t*)state);
+  }
+}
+
 }}}
diff --git a/torchcsprng/csrc/block_cipher_2.h b/torchcsprng/csrc/block_cipher_2.h
new file mode 100644
index 0000000..d7a6925
--- /dev/null
+++ b/torchcsprng/csrc/block_cipher_2.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include "macros.h"
+#include <ATen/ATen.h>
+#include <ATen/native/TensorIterator.h>
+#include "OffsetCalculator.cuh"
+#include <ATen/Parallel.h>
+#include <cstdint>
+#include <mutex>
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+#include <c10/cuda/CUDAStream.h>
+#include <ATen/cuda/Exceptions.h>
+#endif
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+#define UNROLL_IF_CUDA #pragma unroll
+#else
+#define UNROLL_IF_CUDA
+#endif
+
+namespace torch {
+namespace csprng {
+
+template<typename cipher_t, typename input_index_calc_t, typename output_index_calc_t>
+TORCH_CSPRNG_HOST_DEVICE static void block_cipher_kernel_helper_2(int64_t idx, cipher_t cipher, int block_size,
+    void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc,
+    void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc) {
+
+  uint8_t block[block_size];
+  memset(&block, 0, block_size); // is it ok to use zeros as padding?
+
+  for (auto i = 0; i < block_size / input_type_size; ++i) {
+    const auto linear_index = idx * block_size / input_type_size + i;
+    for (auto j = 0; j < input_type_size; ++j) {
+      const auto byte_index = i * input_type_size + j;
+      if (linear_index < input_numel) {
+        block[byte_index] = reinterpret_cast<uint8_t*>(input_ptr)[input_index_calc(linear_index) + j];
+      }
+    }
+  }
+
+  cipher(idx, block);
+
+  for (auto i = 0; i < block_size / output_type_size; ++i) {
+    const auto linear_index = idx * block_size / output_type_size + i;
+    for (auto j = 0; j < output_type_size; ++j) {
+      const auto byte_index = i * output_type_size + j;
+      if (linear_index < output_numel) {
+        reinterpret_cast<uint8_t*>(output_ptr)[output_index_calc(linear_index) + j] = block[byte_index];
+      }
+    }
+  }
+}
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+template<typename cipher_t, typename input_index_calc_t, typename output_index_calc_t>
+__global__ static void block_cipher_kernel_cuda_2(cipher_t cipher, int block_size,
+    void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc,
+    void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc) {
+  const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
+  block_cipher_kernel_helper_2(idx, cipher, block_size,
+    input_ptr, input_numel, input_type_size, input_index_calc,
+    output_ptr, output_numel, output_type_size, output_index_calc);
+}
+#endif
+
+template<typename cipher_t, typename input_index_calc_t, typename output_index_calc_t>
+static void block_cipher_kernel_cpu_serial_2(int64_t begin, int64_t end, cipher_t cipher, int block_size,
+    void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc,
+    void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc) {
+  for (auto idx = begin; idx < end; ++idx) {
+    block_cipher_kernel_helper_2(idx, cipher, block_size,
+      input_ptr, input_numel, input_type_size, input_index_calc,
+      output_ptr, output_numel, output_type_size, output_index_calc);
+  }
+}
+
+template<typename cipher_t, typename input_index_calc_t, typename output_index_calc_t>
+static void block_cipher_kernel_cpu_2(int64_t total, cipher_t cipher, int block_size,
+    void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc,
+    void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc) {
+  if (total < at::internal::GRAIN_SIZE || at::get_num_threads() == 1) {
+    block_cipher_kernel_cpu_serial_2(0, total, cipher, block_size,
+      input_ptr, input_numel, input_type_size, input_index_calc,
+      output_ptr, output_numel, output_type_size, output_index_calc);
+  } else {
+    at::parallel_for(0, total, at::internal::GRAIN_SIZE, [&](int64_t begin, int64_t end) {
+      block_cipher_kernel_cpu_serial_2(begin, end, cipher, block_size,
+        input_ptr, input_numel, input_type_size, input_index_calc,
+        output_ptr, output_numel, output_type_size, output_index_calc);
+    });
+  }
+}
+
+template<typename cipher_t>
+void block_cipher_2(Tensor input, Tensor output, cipher_t cipher, int block_size) {
+  if (input.numel() == 0) {
+    return;
+  }
+
+  const auto input_ptr = input.data_ptr();
+  const auto input_numel = input.numel();
+  const auto input_type_size = input.element_size();
+  const auto input_iter = TensorIterator::nullary_op(input);
+  const auto input_offset_calc = make_offset_calculator<1>(input_iter);
+  const std::function<int(int)> input_index_calc_contiguous = [input_type_size] TORCH_CSPRNG_HOST_DEVICE (int li) -> int { // TODO: int or uint32_t?
+    return li * input_type_size;
+  };
+  const std::function<int(int)> input_index_calc_non_contiguous = [input_offset_calc] TORCH_CSPRNG_HOST_DEVICE (int li) -> int {  // TODO: int or uint32_t?
+    return input_offset_calc.get(li)[0];
+  };
+  const auto input_index_calc = input.is_contiguous() ? input_index_calc_contiguous : input_index_calc_non_contiguous;
+
+  const auto output_ptr = output.data_ptr();
+  const auto output_numel = output.numel();
+  const auto output_type_size = output.element_size();
+  const auto output_iter = TensorIterator::nullary_op(output);
+  const auto output_offset_calc = make_offset_calculator<1>(output_iter);
+  const std::function<int(int)> output_index_calc_contiguous = [output_type_size] TORCH_CSPRNG_HOST_DEVICE (int li) -> int { // TODO: int or uint32_t?
+    return li * output_type_size;
+  };
+  const std::function<int(int)> output_index_calc_non_contiguous = [output_offset_calc] TORCH_CSPRNG_HOST_DEVICE (int li) -> int {  // TODO: int or uint32_t?
+    return output_offset_calc.get(li)[0];
+  };
+  const auto output_index_calc = output.is_contiguous() ? output_index_calc_contiguous : output_index_calc_non_contiguous;
+
+  TORCH_CHECK((input_numel * input_type_size + block_size - 1) / block_size * block_size == output_numel * output_type_size, "wrong size");
+
+  const auto size_in_bytes = input_numel * input_type_size;
+
+  const auto block = 256;
+  const auto grid = (size_in_bytes + (block * block_size) - 1) / (block * block_size);
+
+  if (input.device().type() == at::kCPU) {
+    block_cipher_kernel_cpu_2(grid * block, cipher, block_size,
+        input_ptr, input_numel, input_type_size, input_index_calc,
+        output_ptr, output_numel, output_type_size, output_index_calc
+    );
+  } else if (input.device().type() == at::kCUDA) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+    auto stream = at::cuda::getCurrentCUDAStream();
+    block_cipher_kernel_cuda_2<<<grid, block, 0, stream>>>(cipher, block_size,
+        input_ptr, input_numel, input_type_size, input_index_calc,
+        output_ptr, output_numel, output_type_size, output_index_calc
+    );
+    AT_CUDA_CHECK(cudaGetLastError());
+#else
+    TORCH_CHECK(false, "torchcsprng was compiled without CUDA support");
+#endif
+  } else {
+    TORCH_CHECK(false, "block_cipher supports only CPU and CUDA devices");
+  }
+}
+
+}}
diff --git a/torchcsprng/csrc/csprng.h b/torchcsprng/csrc/csprng.h
index f1fa0f9..def4d31 100644
--- a/torchcsprng/csrc/csprng.h
+++ b/torchcsprng/csrc/csprng.h
@@ -18,6 +18,7 @@
 #include <random>
 #include "macros.h"
 #include "block_cipher.h"
+#include "block_cipher_2.h"
 #include "aes.h"
 
 #if defined(__CUDACC__) || defined(__HIPCC__)
@@ -416,6 +417,76 @@ Tensor& randperm_generator_out(Tensor& result, int64_t n, c10::optional<Generato
 
 // ====================================================================================================================
 
+// Let's assume that input and output have integral dtype, so there is no transform for now.
+Tensor encrypt_pybind(Tensor input, Tensor output, Tensor key, const std::string& cipher, const std::string& mode) {
+//  TORCH_CHECK(input.numel() * input.itemsize() == output.numel() * output.itemsize(), "input and output tensors must have the same size in byte");
+  if (cipher == "aes128") {
+    TORCH_CHECK(key.element_size() * key.numel() == 16, "key tensor must have 16 bytes(128 bits)");
+  } else {
+    TORCH_CHECK(false, "encrypt/decrypt supports \"aes128\" cipher, \"", cipher, "\" is not supported.");
+  }
+  const auto key_bytes = reinterpret_cast<uint8_t*>(key.contiguous().data_ptr());
+  if (mode == "ecb") {
+    block_cipher_2(input, output,
+      [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void {
+        aes::encrypt(block, key_bytes);
+      },
+      aes::block_t_size
+    );
+  } else if (mode == "ctr") {
+    block_cipher_2(input, output,
+      [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void {
+        uint8_t idx_block[aes::block_t_size];
+        *(reinterpret_cast<int64_t*>(idx_block)) = idx;
+        aes::encrypt(idx_block, key_bytes);
+        for (size_t i = 0; i < aes::block_t_size; i++) {
+          block[i] ^= idx_block[i];
+        }
+      },
+      aes::block_t_size
+    );
+  } else {
+    TORCH_CHECK(false, "encrypt/decrypt supports \"ecb\" and \"ctr\" modes, \"", mode, "\" is not supported.");
+  }
+  return output;
+}
+
+// Let's assume that input and output have integral dtype, so there is no transform for now.
+Tensor decrypt_pybind(Tensor input, Tensor output, Tensor key, std::string cipher, std::string mode) {
+//  TORCH_CHECK(input.numel() * input.itemsize() == output.numel() * output.itemsize(), "input and output tensors must have the same size in byte");
+  if (cipher == "aes128") {
+    TORCH_CHECK(key.element_size() * key.numel() == 16, "key tensor must have 16 bytes(128 bits)");
+  } else {
+    TORCH_CHECK(false, "encrypt/decrypt supports \"aes128\" cipher, \"", cipher, "\" is not supported.");
+  }
+  const auto key_bytes = reinterpret_cast<uint8_t*>(key.contiguous().data_ptr());
+  if (mode == "ecb") {
+    block_cipher_2(input, output,
+      [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void {
+        aes::decrypt(block, key_bytes);
+      },
+      aes::block_t_size
+    );
+  } else if (mode == "ctr") {
+    block_cipher_2(input, output,
+      [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void {
+        uint8_t idx_block[aes::block_t_size];
+        *(reinterpret_cast<int64_t*>(idx_block)) = idx;
+        aes::decrypt(idx_block, key_bytes);
+        for (size_t i = 0; i < aes::block_t_size; i++) {
+          block[i] ^= idx_block[i];
+        }
+      },
+      aes::block_t_size
+    );
+  } else {
+    TORCH_CHECK(false, "encrypt/decrypt supports \"ecb\" and \"ctr\" modes, \"", mode, "\" is not supported.");
+  }
+  return output;
+}
+
+// ====================================================================================================================
+
 Generator create_random_device_generator(c10::optional<std::string> token = c10::nullopt) {
   if (token.has_value()) {
     return make_generator<CSPRNGGeneratorImpl>(*token);
@@ -481,4 +552,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("create_mt19937_generator", &create_mt19937_generator, py::arg("seed") = nullptr);
   m.def("aes128_key_tensor", &aes128_key_tensor_pybind);
   m.def("create_const_generator", &create_const_generator);
+  m.def("encrypt", &encrypt_pybind);
+  m.def("decrypt", &decrypt_pybind);
 }

From 1737f817124c4bfbe5250ef9720e66decd53d2b9 Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Mon, 16 Nov 2020 15:16:38 -0500
Subject: [PATCH 02/26] Update on "encrypt/decrypt"

[ghstack-poisoned]
---
 test/test_csprng.py               | 32 ++++++++---------
 torchcsprng/csrc/block_cipher_2.h | 60 ++++++++++++++++++-------------
 torchcsprng/csrc/csprng.h         |  4 ++-
 3 files changed, 54 insertions(+), 42 deletions(-)

diff --git a/test/test_csprng.py b/test/test_csprng.py
index 43c7980..513d42d 100644
--- a/test/test_csprng.py
+++ b/test/test_csprng.py
@@ -374,28 +374,28 @@ def sizeof(dtype):
                     for encrypted_dtype in self.all_dtypes:
                         for decrypted_dtype in self.all_dtypes:
                             for initial_size in [0, 4, 8, 15, 16, 23, 42]:
+                                for mode in ["ecb", "ctr"]:
+                                    encrypted_size = (initial_size * sizeof(initial_dtype) + block_size_bytes - 1) // block_size_bytes * block_size_bytes // sizeof(encrypted_dtype)
+                                    decrypted_size = (encrypted_size * sizeof(encrypted_dtype) + block_size_bytes - 1) // block_size_bytes * block_size_bytes // sizeof(decrypted_dtype)
 
-                                encrypted_size = (initial_size * sizeof(initial_dtype) + block_size_bytes - 1) // block_size_bytes * block_size_bytes // sizeof(encrypted_dtype)
-                                decrypted_size = (encrypted_size * sizeof(encrypted_dtype) + block_size_bytes - 1) // block_size_bytes * block_size_bytes // sizeof(decrypted_dtype)
+                                    initial = torch.empty(initial_size, dtype=initial_dtype, device=device).random_()
+                                    encrypted = torch.empty(encrypted_size, dtype=encrypted_dtype, device=device).random_()
+                                    decrypted = torch.empty(decrypted_size, dtype=decrypted_dtype, device=device).random_()
 
-                                initial = torch.empty(initial_size, dtype=initial_dtype, device=device).random_()
-                                encrypted = torch.empty(encrypted_size, dtype=encrypted_dtype, device=device).random_()
-                                decrypted = torch.empty(decrypted_size, dtype=decrypted_dtype, device=device).random_()
+                                    initial_np = initial.numpy().view(np.int8)
+                                    decrypted_np = decrypted.numpy().view(np.int8)
+                                    padding_size_bytes = initial_size * sizeof(initial_dtype) - decrypted_size * sizeof(decrypted_dtype)
+                                    if padding_size_bytes != 0:
+                                        decrypted_np = decrypted_np[:padding_size_bytes]
 
-                                initial_np = initial.numpy().view(np.int8)
-                                decrypted_np = decrypted.numpy().view(np.int8)
-                                padding_size_bytes = initial_size * sizeof(initial_dtype) - decrypted_size * sizeof(decrypted_dtype)
-                                if padding_size_bytes != 0:
-                                    decrypted_np = decrypted_np[:padding_size_bytes]
+                                    csprng.encrypt(initial, encrypted, key, "aes128", mode)
 
-                                csprng.encrypt(initial, encrypted, key, "aes128", "ecb")
+                                    if initial_size > 8:
+                                        self.assertFalse(np.array_equal(initial_np, decrypted_np))
 
-                                if initial_size > 8:
-                                    self.assertFalse(np.array_equal(initial_np, decrypted_np))
+                                    csprng.decrypt(encrypted, decrypted, key, "aes128", mode)
 
-                                csprng.decrypt(encrypted, decrypted, key, "aes128", "ecb")
-
-                                self.assertTrue(np.array_equal(initial_np, decrypted_np))
+                                    self.assertTrue(np.array_equal(initial_np, decrypted_np))
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/torchcsprng/csrc/block_cipher_2.h b/torchcsprng/csrc/block_cipher_2.h
index d7a6925..0eca492 100644
--- a/torchcsprng/csrc/block_cipher_2.h
+++ b/torchcsprng/csrc/block_cipher_2.h
@@ -30,37 +30,47 @@
 namespace torch {
 namespace csprng {
 
-template<typename cipher_t, typename input_index_calc_t, typename output_index_calc_t>
-TORCH_CSPRNG_HOST_DEVICE static void block_cipher_kernel_helper_2(int64_t idx, cipher_t cipher, int block_size,
-    void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc,
-    void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc) {
-
-  uint8_t block[block_size];
-  memset(&block, 0, block_size); // is it ok to use zeros as padding?
-
+template<typename input_index_calc_t>
+TORCH_CSPRNG_HOST_DEVICE static void copy_input_to_block(int64_t idx, uint8_t* block, int block_size,
+    void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc) {
   for (auto i = 0; i < block_size / input_type_size; ++i) {
-    const auto linear_index = idx * block_size / input_type_size + i;
-    for (auto j = 0; j < input_type_size; ++j) {
-      const auto byte_index = i * input_type_size + j;
-      if (linear_index < input_numel) {
-        block[byte_index] = reinterpret_cast<uint8_t*>(input_ptr)[input_index_calc(linear_index) + j];
-      }
+    const auto linear_index = idx * (block_size / input_type_size) + i;
+    if (linear_index < input_numel) {
+      std::memcpy(
+          &(block[i * input_type_size]),
+          &(reinterpret_cast<uint8_t*>(input_ptr)[input_index_calc(linear_index)]),
+          input_type_size
+      );
     }
   }
+}
 
-  cipher(idx, block);
-
+template<typename output_index_calc_t>
+TORCH_CSPRNG_HOST_DEVICE static void copy_block_to_output(int64_t idx, uint8_t* block, int block_size,
+    void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc) {
   for (auto i = 0; i < block_size / output_type_size; ++i) {
     const auto linear_index = idx * block_size / output_type_size + i;
-    for (auto j = 0; j < output_type_size; ++j) {
-      const auto byte_index = i * output_type_size + j;
-      if (linear_index < output_numel) {
-        reinterpret_cast<uint8_t*>(output_ptr)[output_index_calc(linear_index) + j] = block[byte_index];
-      }
+    if (linear_index < output_numel) {
+      std::memcpy(
+          &(reinterpret_cast<uint8_t*>(output_ptr)[output_index_calc(linear_index)]),
+          &(block[i * output_type_size]),
+          output_type_size
+      );
     }
   }
 }
 
+template<typename cipher_t, typename input_index_calc_t, typename output_index_calc_t>
+TORCH_CSPRNG_HOST_DEVICE static void block_cipher_kernel_helper_2(int64_t idx, cipher_t cipher, int block_size,
+    void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc,
+    void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc) {
+  uint8_t block[block_size];
+  memset(&block, 0, block_size); // is it ok to use zeros as padding?
+  copy_input_to_block(idx, block, block_size, input_ptr, input_numel, input_type_size, input_index_calc);
+  cipher(idx, block);
+  copy_block_to_output(idx, block, block_size, output_ptr, output_numel, output_type_size, output_index_calc);
+}
+
 #if defined(__CUDACC__) || defined(__HIPCC__)
 template<typename cipher_t, typename input_index_calc_t, typename output_index_calc_t>
 __global__ static void block_cipher_kernel_cuda_2(cipher_t cipher, int block_size,
@@ -137,16 +147,16 @@ void block_cipher_2(Tensor input, Tensor output, cipher_t cipher, int block_size
 
   const auto size_in_bytes = input_numel * input_type_size;
 
-  const auto block = 256;
-  const auto grid = (size_in_bytes + (block * block_size) - 1) / (block * block_size);
-
   if (input.device().type() == at::kCPU) {
-    block_cipher_kernel_cpu_2(grid * block, cipher, block_size,
+    const auto total = (size_in_bytes + block_size - 1) / block_size;
+    block_cipher_kernel_cpu_2(total, cipher, block_size,
         input_ptr, input_numel, input_type_size, input_index_calc,
         output_ptr, output_numel, output_type_size, output_index_calc
     );
   } else if (input.device().type() == at::kCUDA) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
+    const auto block = 256;
+    const auto grid = (size_in_bytes + (block * block_size) - 1) / (block * block_size);
     auto stream = at::cuda::getCurrentCUDAStream();
     block_cipher_kernel_cuda_2<<<grid, block, 0, stream>>>(cipher, block_size,
         input_ptr, input_numel, input_type_size, input_index_calc,
diff --git a/torchcsprng/csrc/csprng.h b/torchcsprng/csrc/csprng.h
index def4d31..a1cb806 100644
--- a/torchcsprng/csrc/csprng.h
+++ b/torchcsprng/csrc/csprng.h
@@ -437,6 +437,7 @@ Tensor encrypt_pybind(Tensor input, Tensor output, Tensor key, const std::string
     block_cipher_2(input, output,
       [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void {
         uint8_t idx_block[aes::block_t_size];
+        std::memset(&idx_block, 0, aes::block_t_size);
         *(reinterpret_cast<int64_t*>(idx_block)) = idx;
         aes::encrypt(idx_block, key_bytes);
         for (size_t i = 0; i < aes::block_t_size; i++) {
@@ -471,8 +472,9 @@ Tensor decrypt_pybind(Tensor input, Tensor output, Tensor key, std::string ciphe
     block_cipher_2(input, output,
       [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void {
         uint8_t idx_block[aes::block_t_size];
+        std::memset(&idx_block, 0, aes::block_t_size);
         *(reinterpret_cast<int64_t*>(idx_block)) = idx;
-        aes::decrypt(idx_block, key_bytes);
+        aes::encrypt(idx_block, key_bytes);
         for (size_t i = 0; i < aes::block_t_size; i++) {
           block[i] ^= idx_block[i];
         }

From e4955f71fda61541a13dd0e5dadf55639e4761ab Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Tue, 17 Nov 2020 23:16:37 -0500
Subject: [PATCH 03/26] Update on "encrypt/decrypt"

[ghstack-poisoned]
---
 test/test_csprng.py               |  10 ++
 torchcsprng/csrc/block_cipher.h   | 251 +++++++++++++++++-------------
 torchcsprng/csrc/block_cipher_2.h | 174 ---------------------
 torchcsprng/csrc/csprng.h         |  88 +++++++++--
 4 files changed, 227 insertions(+), 296 deletions(-)
 delete mode 100644 torchcsprng/csrc/block_cipher_2.h

diff --git a/test/test_csprng.py b/test/test_csprng.py
index 513d42d..3ed2b1e 100644
--- a/test/test_csprng.py
+++ b/test/test_csprng.py
@@ -69,6 +69,11 @@ def test_random_to_kstest(self):
                 for dtype in self.num_dtypes:
                     t = torch.zeros(self.size, dtype=dtype, device=device).random_(to_, generator=gen)
                     res = stats.kstest(t.cpu(), stats.randint.cdf, args=(0, to_))
+                    # TODO REVERT!
+                    # if res.statistic >= 0.1:
+                    #     print(t[:10])
+                    #     print(dtype)
+                    #     print(res.statistic)
                     self.assertTrue(res.statistic < 0.1)
 
     @unittest.skipIf(not torch.cuda.is_available() or not csprng.supports_cuda(), "CUDA is not available or csprng was not compiled with CUDA support")
@@ -90,6 +95,11 @@ def test_random_from_to_kstest(self):
                             if from_ < to_:
                                 t = torch.zeros(self.size, dtype=dtype, device=device).random_(from_, to_, generator=gen)
                                 res = stats.kstest(t.cpu(), stats.randint.cdf, args=(from_, to_))
+                                # TODO REVERT!
+                                # if res.statistic >= 0.1:
+                                #     print(t[:10])
+                                #     print(dtype)
+                                #     print(res.statistic)
                                 self.assertTrue(res.statistic < 0.2)
 
     @unittest.skipIf(not torch.cuda.is_available() or not csprng.supports_cuda(), "CUDA is not available or csprng was not compiled with CUDA support")
diff --git a/torchcsprng/csrc/block_cipher.h b/torchcsprng/csrc/block_cipher.h
index 2ca313f..a612061 100644
--- a/torchcsprng/csrc/block_cipher.h
+++ b/torchcsprng/csrc/block_cipher.h
@@ -30,144 +30,181 @@
 namespace torch {
 namespace csprng {
 
-// Generates `block_t_size`-bytes random key Tensor on CPU 
-// using `generator`, which must be an instance of `at::CPUGeneratorImpl`
-// and passes it to the `device`.
-template<typename RNG>
-at::Tensor key_tensor(size_t block_t_size, c10::optional<at::Generator> generator) {
-  std::lock_guard<std::mutex> lock(generator->mutex());
-  auto gen = at::check_generator<RNG>(generator);
-  if (gen->key().defined()) {
-    return gen->key().clone();
-  }
-  auto t = torch::empty({static_cast<signed long>(block_t_size)}, torch::kUInt8);
-  using random_t = uint32_t;
-  constexpr size_t random_t_size = sizeof(random_t);
-  for (size_t i = 0; i < block_t_size / random_t_size; i++) {
-    const auto rand = gen->random();
-    for (size_t j = 0; j < random_t_size; j++) {
-      size_t k = i * random_t_size + j;
-      t[k] = static_cast<uint8_t>((rand >> (j * 8)) & 0xff);
+template<typename input_index_calc_t>
+TORCH_CSPRNG_HOST_DEVICE static void copy_input_to_block(int64_t idx, uint8_t* block, int block_size,
+    void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc) {
+  for (auto i = 0; i < block_size / input_type_size; ++i) {
+    const auto linear_index = idx * (block_size / input_type_size) + i;
+    if (linear_index < input_numel) {
+      std::memcpy(
+          &(block[i * input_type_size]),
+          &(reinterpret_cast<uint8_t*>(input_ptr)[input_index_calc(linear_index)]),
+          input_type_size
+      );
     }
   }
-  return t;
 }
 
-// A simple container for random state sub-blocks that implements RNG interface 
-// with random() and random64() methods, that are used by transformation function
-template<size_t size>
-struct RNGValues {
-  TORCH_CSPRNG_HOST_DEVICE RNGValues(uint64_t* vals) {
-    memcpy(&vals_, vals, size * sizeof(uint64_t));
-  }
-  uint32_t TORCH_CSPRNG_HOST_DEVICE random() { auto res = static_cast<uint32_t>(vals_[index]); index++; return res; }
-  uint64_t TORCH_CSPRNG_HOST_DEVICE random64() { auto res = vals_[index]; index++; return res; }
-private:
-  uint64_t vals_[size];
-  int index = 0;
-};
-
-// Runs a block cipher in a counter mode in approximately `numel / (block_t_size / sizeof(uint_t) / N)` CUDA threads,
-// without any assumption about target tensor layout. It uses `index_calc` to find memory locations of
-// the tensor elements.
-// `scalar_t`       is a scalar type equivalent of target tensor dtype
-// `uint_t`         is an unsigned integral type of sub-blocks that random state is divided to
-//                  (e.g, 16 bytes random state block can be divided into 16 uint8_t sub-blocks 
-//                  or 8 uint16_t sub-block or 4 uint32_t sub-block or 2 uint64_t sub-blocks)
-// `N`              is a number of sub-block which is used by `transform_func` 
-//                  to generate a random value of specific distribution (e.g. `normal` uses 2)
-// `numel`          is a number of elements in target tensor
-// `block_t_size`   is a number of bytes in cipher's block (e.g. 16 for AES128)
-// `cipher`         is a callable that receives a counter `idx` and returns an encrypted block
-// `transform_func` is a callable that converts N `uint_t` random state sub-blocks passed in RNGValues into target dtype `scalar_t`
-template<typename scalar_t, typename uint_t, size_t N = 1, typename cipher_t, typename transform_t, typename index_calc_t>
-TORCH_CSPRNG_HOST_DEVICE static void block_cipher_kernel_helper(int idx, scalar_t* data, int64_t numel, size_t block_t_size, cipher_t cipher, transform_t transform_func, index_calc_t index_calc) {
-  const int unroll_factor = block_t_size / sizeof(uint_t) / N;
-  if (unroll_factor * idx < numel) {
-    auto block = cipher(idx);
-    UNROLL_IF_CUDA
-    for (auto i = 0; i < unroll_factor; ++i) {
-      const auto li = unroll_factor * idx + i;
-      if (li < numel) {
-        uint64_t vals[N];
-        UNROLL_IF_CUDA
-        for (size_t j = 0; j < N; j++) {
-          vals[j] = (reinterpret_cast<uint_t*>(&block))[N * i + j];
-        }
-        RNGValues<N> rng(vals);
-        data[index_calc(li)] = transform_func(&rng);
-      }
+template<typename output_index_calc_t>
+TORCH_CSPRNG_HOST_DEVICE static void copy_block_to_output(int64_t idx, uint8_t* block, int block_size, int output_elem_per_block,
+    void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc) {
+//  std::cout << "output_elem_per_block = " << output_elem_per_block << std::endl;
+//  std::cout << "block_size = " << block_size << std::endl;
+//  std::cout << "output_type_size = " << output_type_size << std::endl;
+  for (auto i = 0; i < output_elem_per_block; ++i) {
+    const auto linear_index = idx * output_elem_per_block + i;
+    if (linear_index < output_numel) {
+      std::memcpy(
+          &(reinterpret_cast<uint8_t*>(output_ptr)[output_index_calc(linear_index)]),
+          &(block[i * output_type_size]),
+          output_type_size
+      );
     }
   }
 }
 
+template<typename cipher_t, typename input_index_calc_t, typename output_index_calc_t, typename transform_t>
+TORCH_CSPRNG_HOST_DEVICE static void block_cipher_kernel_helper(
+    int64_t idx, cipher_t cipher, int block_size, int output_elem_per_block,
+    void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc,
+    void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc,
+    transform_t transform) {
+  uint8_t block[block_size];
+  std::memset(&block, 0, block_size); // is it ok to use zeros as padding?
+  if (input_ptr != nullptr) {
+    copy_input_to_block(idx, block, block_size, input_ptr, input_numel, input_type_size, input_index_calc);
+  }
+  cipher(idx, block);
+  const auto new_block_size = transform(block);
+  copy_block_to_output(idx, block, new_block_size, output_elem_per_block, output_ptr, output_numel, output_type_size, output_index_calc);
+}
+
 #if defined(__CUDACC__) || defined(__HIPCC__)
-template<typename scalar_t, typename uint_t, size_t N = 1, typename cipher_t, typename transform_t, typename index_calc_t>
-__global__ static void block_cipher_kernel_cuda(scalar_t* data, int64_t numel, int block_t_size, cipher_t cipher, transform_t transform_func, index_calc_t index_calc) {
+template<typename cipher_t, typename input_index_calc_t, typename output_index_calc_t, typename transform_t>
+__global__ static void block_cipher_kernel_cuda(cipher_t cipher, int block_size, int output_elem_per_block,
+    void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc,
+    void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc,
+    transform_t transform) {
   const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
-  block_cipher_kernel_helper<scalar_t, uint_t, N>(idx, data, numel, block_t_size, cipher, transform_func, index_calc);
+  block_cipher_kernel_helper(idx, cipher, block_size, output_elem_per_block
+    input_ptr, input_numel, input_type_size, input_index_calc,
+    output_ptr, output_numel, output_type_size, output_index_calc,
+    transform);
 }
 #endif
 
-template<typename scalar_t, typename uint_t, size_t N = 1, typename cipher_t, typename transform_t, typename index_calc_t>
-static void block_cipher_kernel_cpu_serial(int64_t begin, int64_t end, scalar_t* data, int64_t numel, int block_t_size, cipher_t cipher, transform_t transform_func, index_calc_t index_calc) {
+template<typename cipher_t, typename input_index_calc_t, typename output_index_calc_t, typename transform_t>
+static void block_cipher_kernel_cpu_serial(int64_t begin, int64_t end, cipher_t cipher, int block_size, int output_elem_per_block,
+    void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc,
+    void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc,
+    transform_t transform) {
   for (auto idx = begin; idx < end; ++idx) {
-    block_cipher_kernel_helper<scalar_t, uint_t, N>(idx, data, numel, block_t_size, cipher, transform_func, index_calc);
+    block_cipher_kernel_helper(idx, cipher, block_size, output_elem_per_block,
+      input_ptr, input_numel, input_type_size, input_index_calc,
+      output_ptr, output_numel, output_type_size, output_index_calc,
+      transform);
   }
 }
 
-template<typename scalar_t, typename uint_t, size_t N = 1, typename cipher_t, typename transform_t, typename index_calc_t>
-static void block_cipher_kernel_cpu(int64_t total, scalar_t* data, int64_t numel, int block_t_size, cipher_t cipher, transform_t transform_func, index_calc_t index_calc) {
+template<typename cipher_t, typename input_index_calc_t, typename output_index_calc_t, typename transform_t>
+static void block_cipher_kernel_cpu(int64_t total, cipher_t cipher, int block_size, int output_elem_per_block,
+    void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc,
+    void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc,
+    transform_t transform_func) {
   if (total < at::internal::GRAIN_SIZE || at::get_num_threads() == 1) {
-    block_cipher_kernel_cpu_serial<scalar_t, uint_t, N>(0, total, data, numel, block_t_size, cipher, transform_func, index_calc);
+    block_cipher_kernel_cpu_serial(0, total, cipher, block_size, output_elem_per_block,
+      input_ptr, input_numel, input_type_size, input_index_calc,
+      output_ptr, output_numel, output_type_size, output_index_calc,
+      transform_func);
   } else {
     at::parallel_for(0, total, at::internal::GRAIN_SIZE, [&](int64_t begin, int64_t end) {
-      block_cipher_kernel_cpu_serial<scalar_t, uint_t, N>(begin, end, data, numel, block_t_size, cipher, transform_func, index_calc);
+      block_cipher_kernel_cpu_serial(begin, end, cipher, block_size, output_elem_per_block,
+        input_ptr, input_numel, input_type_size, input_index_calc,
+        output_ptr, output_numel, output_type_size, output_index_calc,
+        transform_func);
     });
   }
 }
 
-// Runs a block cipher in a counter mode in approximately `numel / (block_t_size / sizeof(uint_t) / N)` CUDA threads.
-// Each CUDA thread generates `block_t_size`-bytes random state and divides it into `block_t_size / sizeof(uint_t)` sub-blocks.
-// Then `transform_func` transforms `N` random state sub-blocks passed in a `RNGValues` to final random values of type `scalar_t`.
-template<typename scalar_t, typename uint_t, size_t N = 1, typename cipher_t, typename transform_t>
-void block_cipher_ctr_mode(at::TensorIterator& iter, int block_t_size, cipher_t cipher, transform_t transform_func) {
-  const auto numel = iter.numel();
-  if (numel == 0) {
-    return;
-  }
-  const int unroll_factor = block_t_size / sizeof(uint_t) / N;
-  const auto block = 256;
-  const auto grid = (numel + (block * unroll_factor) - 1) / (block * unroll_factor);
-  scalar_t* data = (scalar_t*)iter.data_ptr(0);
-  auto offset_calc = make_offset_calculator<1>(iter);
-  auto index_calc_identity = [] TORCH_CSPRNG_HOST_DEVICE (int li) -> int { return li; };
-  auto index_calc_offset = [offset_calc] TORCH_CSPRNG_HOST_DEVICE (int li) -> int { return offset_calc.get(li)[0] / sizeof(scalar_t); };
-  if (iter.device_type() == at::kCPU) {
-    if (iter.output(0).is_contiguous()) {
-      block_cipher_kernel_cpu<scalar_t, uint_t, N, cipher_t, transform_t>(
-        grid * block, data, numel, block_t_size, cipher, transform_func, index_calc_identity);
-    } else {
-      block_cipher_kernel_cpu<scalar_t, uint_t, N, cipher_t, transform_t>(
-        grid * block, data, numel, block_t_size, cipher, transform_func, index_calc_offset);
-    }
-  } else if (iter.device_type() == at::kCUDA) {
+template<typename cipher_t, typename input_index_calc_t, typename output_index_calc_t, typename transform_t>
+void block_cipher(
+    void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc,
+    void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc,
+    Device device, cipher_t cipher, int block_size, int output_elem_per_block, transform_t transform_func) {
+//  if (input.numel() == 0) {
+//    return;
+//  }
+//  TORCH_CHECK((input_numel * input_type_size + block_size - 1) / block_size * block_size == output_numel * output_type_size, "wrong size");
+
+//  const auto size_in_bytes = input_numel * input_type_size;
+//  const auto size_in_bytes = output_numel * output_type_size;
+
+  if (device.type() == at::kCPU) {
+//    const auto total = (size_in_bytes + block_size - 1) / block_size;
+//    const auto total = (size_in_bytes + block_size / N - 1) / block_size * N;
+    const auto total = (output_numel + output_elem_per_block - 1) / output_elem_per_block;
+    block_cipher_kernel_cpu(total,
+        cipher, block_size, output_elem_per_block,
+        input_ptr, input_numel, input_type_size, input_index_calc,
+        output_ptr, output_numel, output_type_size, output_index_calc,
+        transform_func
+    );
+  } else if (device.type() == at::kCUDA) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
+    const auto threads = 256;
+    const auto grid = (output_numel + (threads * output_elem_per_block) - 1) / (threads * output_elem_per_block);
     auto stream = at::cuda::getCurrentCUDAStream();
-    if (iter.output(0).is_contiguous()) {
-      block_cipher_kernel_cuda<scalar_t, uint_t, N, cipher_t, transform_t><<<grid, block, 0, stream>>>(
-        data, numel, block_t_size, cipher, transform_func, index_calc_identity);
-    } else {
-      block_cipher_kernel_cuda<scalar_t, uint_t, N, cipher_t, transform_t><<<grid, block, 0, stream>>>(
-        data, numel, block_t_size, cipher, transform_func, index_calc_offset);
-    }
+    block_cipher_kernel_cuda<<<grid, threads, 0, stream>>>(
+        cipher, block_size, output_elem_per_block
+        input_ptr, input_numel, input_type_size, input_index_calc,
+        output_ptr, output_numel, output_type_size, output_index_calc,
+        transform_func
+    );
     AT_CUDA_CHECK(cudaGetLastError());
 #else
-    TORCH_CHECK(false, "csprng was compiled without CUDA support");
+    TORCH_CHECK(false, "torchcsprng was compiled without CUDA support");
 #endif
   } else {
-    TORCH_CHECK(false, "block_cipher_ctr_mode supports only CPU and CUDA devices");
+    TORCH_CHECK(false, "block_cipher supports only CPU and CUDA devices");
   }
 }
 
+std::function<int(int)> create_index_calc(Tensor input) {
+  if (input.is_contiguous()) {
+    const auto input_type_size = input.element_size();
+    return [input_type_size] TORCH_CSPRNG_HOST_DEVICE (uint32_t li) -> uint32_t {
+      return li * input_type_size;
+    };
+  } else {
+    const auto input_iter = TensorIterator::nullary_op(input);
+    const auto input_offset_calc = make_offset_calculator<1>(input_iter);
+    return [input_offset_calc] TORCH_CSPRNG_HOST_DEVICE (uint32_t li) -> uint32_t {
+      return input_offset_calc.get(li)[0];
+    };
+  }
+}
+
+template<typename cipher_t>
+void block_cipher(Tensor input, Tensor output,
+                  cipher_t cipher, int block_size) {
+
+  const auto input_ptr = input.data_ptr();
+  const auto input_numel = input.numel();
+  const auto input_type_size = input.element_size();
+  const auto input_index_calc = create_index_calc(input);
+
+  const auto output_ptr = output.data_ptr();
+  const auto output_numel = output.numel();
+  const auto output_type_size = output.element_size();
+  const auto output_index_calc = create_index_calc(output);
+
+  const auto device = output.device();
+
+  block_cipher(
+      input_ptr, input_numel, input_type_size, input_index_calc,
+      output_ptr, output_numel, output_type_size, output_index_calc,
+      device, cipher, block_size, block_size / output_type_size,
+      [block_size] (auto x) { return block_size; });
+}
+
 }}
diff --git a/torchcsprng/csrc/block_cipher_2.h b/torchcsprng/csrc/block_cipher_2.h
deleted file mode 100644
index 0eca492..0000000
--- a/torchcsprng/csrc/block_cipher_2.h
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#pragma once
-
-#include "macros.h"
-#include <ATen/ATen.h>
-#include <ATen/native/TensorIterator.h>
-#include "OffsetCalculator.cuh"
-#include <ATen/Parallel.h>
-#include <cstdint>
-#include <mutex>
-
-#if defined(__CUDACC__) || defined(__HIPCC__)
-#include <c10/cuda/CUDAStream.h>
-#include <ATen/cuda/Exceptions.h>
-#endif
-
-#if defined(__CUDACC__) || defined(__HIPCC__)
-#define UNROLL_IF_CUDA #pragma unroll
-#else
-#define UNROLL_IF_CUDA
-#endif
-
-namespace torch {
-namespace csprng {
-
-template<typename input_index_calc_t>
-TORCH_CSPRNG_HOST_DEVICE static void copy_input_to_block(int64_t idx, uint8_t* block, int block_size,
-    void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc) {
-  for (auto i = 0; i < block_size / input_type_size; ++i) {
-    const auto linear_index = idx * (block_size / input_type_size) + i;
-    if (linear_index < input_numel) {
-      std::memcpy(
-          &(block[i * input_type_size]),
-          &(reinterpret_cast<uint8_t*>(input_ptr)[input_index_calc(linear_index)]),
-          input_type_size
-      );
-    }
-  }
-}
-
-template<typename output_index_calc_t>
-TORCH_CSPRNG_HOST_DEVICE static void copy_block_to_output(int64_t idx, uint8_t* block, int block_size,
-    void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc) {
-  for (auto i = 0; i < block_size / output_type_size; ++i) {
-    const auto linear_index = idx * block_size / output_type_size + i;
-    if (linear_index < output_numel) {
-      std::memcpy(
-          &(reinterpret_cast<uint8_t*>(output_ptr)[output_index_calc(linear_index)]),
-          &(block[i * output_type_size]),
-          output_type_size
-      );
-    }
-  }
-}
-
-template<typename cipher_t, typename input_index_calc_t, typename output_index_calc_t>
-TORCH_CSPRNG_HOST_DEVICE static void block_cipher_kernel_helper_2(int64_t idx, cipher_t cipher, int block_size,
-    void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc,
-    void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc) {
-  uint8_t block[block_size];
-  memset(&block, 0, block_size); // is it ok to use zeros as padding?
-  copy_input_to_block(idx, block, block_size, input_ptr, input_numel, input_type_size, input_index_calc);
-  cipher(idx, block);
-  copy_block_to_output(idx, block, block_size, output_ptr, output_numel, output_type_size, output_index_calc);
-}
-
-#if defined(__CUDACC__) || defined(__HIPCC__)
-template<typename cipher_t, typename input_index_calc_t, typename output_index_calc_t>
-__global__ static void block_cipher_kernel_cuda_2(cipher_t cipher, int block_size,
-    void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc,
-    void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc) {
-  const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
-  block_cipher_kernel_helper_2(idx, cipher, block_size,
-    input_ptr, input_numel, input_type_size, input_index_calc,
-    output_ptr, output_numel, output_type_size, output_index_calc);
-}
-#endif
-
-template<typename cipher_t, typename input_index_calc_t, typename output_index_calc_t>
-static void block_cipher_kernel_cpu_serial_2(int64_t begin, int64_t end, cipher_t cipher, int block_size,
-    void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc,
-    void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc) {
-  for (auto idx = begin; idx < end; ++idx) {
-    block_cipher_kernel_helper_2(idx, cipher, block_size,
-      input_ptr, input_numel, input_type_size, input_index_calc,
-      output_ptr, output_numel, output_type_size, output_index_calc);
-  }
-}
-
-template<typename cipher_t, typename input_index_calc_t, typename output_index_calc_t>
-static void block_cipher_kernel_cpu_2(int64_t total, cipher_t cipher, int block_size,
-    void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc,
-    void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc) {
-  if (total < at::internal::GRAIN_SIZE || at::get_num_threads() == 1) {
-    block_cipher_kernel_cpu_serial_2(0, total, cipher, block_size,
-      input_ptr, input_numel, input_type_size, input_index_calc,
-      output_ptr, output_numel, output_type_size, output_index_calc);
-  } else {
-    at::parallel_for(0, total, at::internal::GRAIN_SIZE, [&](int64_t begin, int64_t end) {
-      block_cipher_kernel_cpu_serial_2(begin, end, cipher, block_size,
-        input_ptr, input_numel, input_type_size, input_index_calc,
-        output_ptr, output_numel, output_type_size, output_index_calc);
-    });
-  }
-}
-
-template<typename cipher_t>
-void block_cipher_2(Tensor input, Tensor output, cipher_t cipher, int block_size) {
-  if (input.numel() == 0) {
-    return;
-  }
-
-  const auto input_ptr = input.data_ptr();
-  const auto input_numel = input.numel();
-  const auto input_type_size = input.element_size();
-  const auto input_iter = TensorIterator::nullary_op(input);
-  const auto input_offset_calc = make_offset_calculator<1>(input_iter);
-  const std::function<int(int)> input_index_calc_contiguous = [input_type_size] TORCH_CSPRNG_HOST_DEVICE (int li) -> int { // TODO: int or uint32_t?
-    return li * input_type_size;
-  };
-  const std::function<int(int)> input_index_calc_non_contiguous = [input_offset_calc] TORCH_CSPRNG_HOST_DEVICE (int li) -> int {  // TODO: int or uint32_t?
-    return input_offset_calc.get(li)[0];
-  };
-  const auto input_index_calc = input.is_contiguous() ? input_index_calc_contiguous : input_index_calc_non_contiguous;
-
-  const auto output_ptr = output.data_ptr();
-  const auto output_numel = output.numel();
-  const auto output_type_size = output.element_size();
-  const auto output_iter = TensorIterator::nullary_op(output);
-  const auto output_offset_calc = make_offset_calculator<1>(output_iter);
-  const std::function<int(int)> output_index_calc_contiguous = [output_type_size] TORCH_CSPRNG_HOST_DEVICE (int li) -> int { // TODO: int or uint32_t?
-    return li * output_type_size;
-  };
-  const std::function<int(int)> output_index_calc_non_contiguous = [output_offset_calc] TORCH_CSPRNG_HOST_DEVICE (int li) -> int {  // TODO: int or uint32_t?
-    return output_offset_calc.get(li)[0];
-  };
-  const auto output_index_calc = output.is_contiguous() ? output_index_calc_contiguous : output_index_calc_non_contiguous;
-
-  TORCH_CHECK((input_numel * input_type_size + block_size - 1) / block_size * block_size == output_numel * output_type_size, "wrong size");
-
-  const auto size_in_bytes = input_numel * input_type_size;
-
-  if (input.device().type() == at::kCPU) {
-    const auto total = (size_in_bytes + block_size - 1) / block_size;
-    block_cipher_kernel_cpu_2(total, cipher, block_size,
-        input_ptr, input_numel, input_type_size, input_index_calc,
-        output_ptr, output_numel, output_type_size, output_index_calc
-    );
-  } else if (input.device().type() == at::kCUDA) {
-#if defined(__CUDACC__) || defined(__HIPCC__)
-    const auto block = 256;
-    const auto grid = (size_in_bytes + (block * block_size) - 1) / (block * block_size);
-    auto stream = at::cuda::getCurrentCUDAStream();
-    block_cipher_kernel_cuda_2<<<grid, block, 0, stream>>>(cipher, block_size,
-        input_ptr, input_numel, input_type_size, input_index_calc,
-        output_ptr, output_numel, output_type_size, output_index_calc
-    );
-    AT_CUDA_CHECK(cudaGetLastError());
-#else
-    TORCH_CHECK(false, "torchcsprng was compiled without CUDA support");
-#endif
-  } else {
-    TORCH_CHECK(false, "block_cipher supports only CPU and CUDA devices");
-  }
-}
-
-}}
diff --git a/torchcsprng/csrc/csprng.h b/torchcsprng/csrc/csprng.h
index a1cb806..96618c7 100644
--- a/torchcsprng/csrc/csprng.h
+++ b/torchcsprng/csrc/csprng.h
@@ -18,7 +18,6 @@
 #include <random>
 #include "macros.h"
 #include "block_cipher.h"
-#include "block_cipher_2.h"
 #include "aes.h"
 
 #if defined(__CUDACC__) || defined(__HIPCC__)
@@ -58,6 +57,29 @@ struct CSPRNGGeneratorImpl : public c10::GeneratorImpl {
   Tensor key_;
 };
 
+// Generates `block_t_size`-bytes random key Tensor on CPU
+// using `generator`, which must be an instance of `at::CPUGeneratorImpl`
+// and passes it to the `device`.
+template<typename RNG>
+at::Tensor key_tensor(size_t block_t_size, c10::optional<at::Generator> generator) {
+  std::lock_guard<std::mutex> lock(generator->mutex());
+  auto gen = at::check_generator<RNG>(generator);
+  if (gen->key().defined()) {
+    return gen->key().clone();
+  }
+  auto t = torch::empty({static_cast<signed long>(block_t_size)}, torch::kUInt8);
+  using random_t = uint32_t;
+  constexpr size_t random_t_size = sizeof(random_t);
+  for (size_t i = 0; i < block_t_size / random_t_size; i++) {
+    const auto rand = gen->random();
+    for (size_t j = 0; j < random_t_size; j++) {
+      size_t k = i * random_t_size + j;
+      t[k] = static_cast<uint8_t>((rand >> (j * 8)) & 0xff);
+    }
+  }
+  return t;
+}
+
 template<typename RNG>
 Tensor aes128_key_tensor(Generator generator) {
   return key_tensor<RNG>(aes::block_t_size, generator);
@@ -65,6 +87,20 @@ Tensor aes128_key_tensor(Generator generator) {
 
 // ====================================================================================================================
 
+// A simple container for random state sub-blocks that implements RNG interface
+// with random() and random64() methods, that are used by transformation function
+template<size_t size>
+struct RNGValues {
+  TORCH_CSPRNG_HOST_DEVICE RNGValues(uint64_t* vals) {
+    memcpy(&vals_, vals, size * sizeof(uint64_t));
+  }
+  uint32_t TORCH_CSPRNG_HOST_DEVICE random() { auto res = static_cast<uint32_t>(vals_[index]); index++; return res; }
+  uint64_t TORCH_CSPRNG_HOST_DEVICE random64() { auto res = vals_[index]; index++; return res; }
+private:
+  uint64_t vals_[size];
+  int index = 0;
+};
+
 // Applies AES in CTR mode with the `key` for passed TensorIterator iter.
 // `scalar_t`       is a scalar type equivalent of target tensor dtype
 // `uint_t`         is an unsigned integral type of sub-blocks that random state is divided to
@@ -75,16 +111,38 @@ Tensor aes128_key_tensor(Generator generator) {
 // `key`            is a CUDA pointer to random key memory block
 // `transform_func` is a callable that converts N `uint_t` random state sub-blocks passed in RNGValues into target dtype `scalar_t`
 template<typename scalar_t, typename uint_t, size_t N = 1, typename transform_t>
-void aes_helper(TensorIterator& iter, const uint8_t* key, transform_t transform_func) {
-  block_cipher_ctr_mode<scalar_t, uint_t, N>(iter, aes::block_t_size,
-    [key] TORCH_CSPRNG_HOST_DEVICE (unsigned int idx) -> aes::block_t {
-      aes::block_t block;
-      memset(&block, 0, aes::block_t_size);
-      block.x = idx;
-      aes::encrypt(reinterpret_cast<uint8_t*>(&block), key);
-      return block;
+void aes_helper(TensorIterator& iter, const uint8_t* key_bytes, transform_t transform_func) {
+  auto  output = iter.tensor(0);
+  const auto index_calc = create_index_calc(output);
+  block_cipher(
+    nullptr, 0, 0, index_calc,
+    output.data_ptr(), output.numel(), output.element_size(), index_calc,
+    iter.device_type(),
+    [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void {
+      uint8_t idx_block[aes::block_t_size];
+      std::memset(&idx_block, 0, aes::block_t_size);
+      *(reinterpret_cast<int64_t*>(idx_block)) = idx;
+      aes::encrypt(idx_block, key_bytes);
+      for (size_t i = 0; i < aes::block_t_size; i++) {
+        block[i] ^= idx_block[i];
+      }
     },
-    transform_func
+    aes::block_t_size, aes::block_t_size / (N * sizeof(uint_t)),
+    [transform_func] (auto block) {
+      const auto n = aes::block_t_size / (N * sizeof(uint_t));
+//      std::cout << "N = " << N << std::endl;
+//      std::cout << "sizeof(uint_t) = " << sizeof(uint_t) << std::endl;
+//      std::cout << "n = " << n << std::endl;
+      for (size_t i = 0; i < n; ++i) {
+        uint64_t vals[N];
+        for (size_t j = 0; j < N; ++j) {
+          vals[j] = (reinterpret_cast<uint_t*>(block))[N * i + j];
+        }
+        RNGValues<N> rng(vals);
+        reinterpret_cast<scalar_t*>(block)[i] = transform_func(&rng);
+      }
+      return n * sizeof(uint_t);
+    }
   );
 }
 
@@ -152,7 +210,7 @@ struct RandomFromToKernel {
         std::is_same<scalar_t, int64_t>::value ||
         std::is_same<scalar_t, double>::value ||
         std::is_same<scalar_t, float>::value ||
-        std::is_same<scalar_t, at::BFloat16>::value) && range >= 1ULL << 32)
+        std::is_same<scalar_t, at::BFloat16>::value)/* TODO: && range >= 1ULL << 32*/)
       {
         random_from_to_kernel_helper<scalar_t, uint64_t>(iter, range, base, key);
       } else {
@@ -427,14 +485,14 @@ Tensor encrypt_pybind(Tensor input, Tensor output, Tensor key, const std::string
   }
   const auto key_bytes = reinterpret_cast<uint8_t*>(key.contiguous().data_ptr());
   if (mode == "ecb") {
-    block_cipher_2(input, output,
+    block_cipher(input, output,
       [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void {
         aes::encrypt(block, key_bytes);
       },
       aes::block_t_size
     );
   } else if (mode == "ctr") {
-    block_cipher_2(input, output,
+    block_cipher(input, output,
       [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void {
         uint8_t idx_block[aes::block_t_size];
         std::memset(&idx_block, 0, aes::block_t_size);
@@ -462,14 +520,14 @@ Tensor decrypt_pybind(Tensor input, Tensor output, Tensor key, std::string ciphe
   }
   const auto key_bytes = reinterpret_cast<uint8_t*>(key.contiguous().data_ptr());
   if (mode == "ecb") {
-    block_cipher_2(input, output,
+    block_cipher(input, output,
       [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void {
         aes::decrypt(block, key_bytes);
       },
       aes::block_t_size
     );
   } else if (mode == "ctr") {
-    block_cipher_2(input, output,
+    block_cipher(input, output,
       [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void {
         uint8_t idx_block[aes::block_t_size];
         std::memset(&idx_block, 0, aes::block_t_size);

From 583fb8025dcfbb78ec1dbd0f7938428d956514f4 Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Tue, 17 Nov 2020 23:21:22 -0500
Subject: [PATCH 04/26] Update on "encrypt/decrypt"

[ghstack-poisoned]
---
 torchcsprng/csrc/block_cipher.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchcsprng/csrc/block_cipher.h b/torchcsprng/csrc/block_cipher.h
index a612061..d4415f3 100644
--- a/torchcsprng/csrc/block_cipher.h
+++ b/torchcsprng/csrc/block_cipher.h
@@ -155,7 +155,7 @@ void block_cipher(
     const auto grid = (output_numel + (threads * output_elem_per_block) - 1) / (threads * output_elem_per_block);
     auto stream = at::cuda::getCurrentCUDAStream();
     block_cipher_kernel_cuda<<<grid, threads, 0, stream>>>(
-        cipher, block_size, output_elem_per_block
+        cipher, block_size, output_elem_per_block,
         input_ptr, input_numel, input_type_size, input_index_calc,
         output_ptr, output_numel, output_type_size, output_index_calc,
         transform_func

From 66b2e5506fb3be6dc2ad006c73c3a9321227c711 Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Tue, 17 Nov 2020 23:39:32 -0500
Subject: [PATCH 05/26] Update on "encrypt/decrypt"

[ghstack-poisoned]
---
 torchcsprng/csrc/block_cipher.h | 53 ++++++++++++++++-----------------
 torchcsprng/csrc/csprng.h       | 25 +++++++---------
 2 files changed, 35 insertions(+), 43 deletions(-)

diff --git a/torchcsprng/csrc/block_cipher.h b/torchcsprng/csrc/block_cipher.h
index d4415f3..ba1e427 100644
--- a/torchcsprng/csrc/block_cipher.h
+++ b/torchcsprng/csrc/block_cipher.h
@@ -46,11 +46,8 @@ TORCH_CSPRNG_HOST_DEVICE static void copy_input_to_block(int64_t idx, uint8_t* b
 }
 
 template<typename output_index_calc_t>
-TORCH_CSPRNG_HOST_DEVICE static void copy_block_to_output(int64_t idx, uint8_t* block, int block_size, int output_elem_per_block,
+TORCH_CSPRNG_HOST_DEVICE static void copy_block_to_output(int64_t idx, uint8_t* block, int output_elem_per_block,
     void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc) {
-//  std::cout << "output_elem_per_block = " << output_elem_per_block << std::endl;
-//  std::cout << "block_size = " << block_size << std::endl;
-//  std::cout << "output_type_size = " << output_type_size << std::endl;
   for (auto i = 0; i < output_elem_per_block; ++i) {
     const auto linear_index = idx * output_elem_per_block + i;
     if (linear_index < output_numel) {
@@ -63,9 +60,9 @@ TORCH_CSPRNG_HOST_DEVICE static void copy_block_to_output(int64_t idx, uint8_t*
   }
 }
 
-template<typename cipher_t, typename input_index_calc_t, typename output_index_calc_t, typename transform_t>
+template<int block_size, typename cipher_t, typename input_index_calc_t, typename output_index_calc_t, typename transform_t>
 TORCH_CSPRNG_HOST_DEVICE static void block_cipher_kernel_helper(
-    int64_t idx, cipher_t cipher, int block_size, int output_elem_per_block,
+    int64_t idx, cipher_t cipher, int output_elem_per_block,
     void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc,
     void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc,
     transform_t transform) {
@@ -75,50 +72,50 @@ TORCH_CSPRNG_HOST_DEVICE static void block_cipher_kernel_helper(
     copy_input_to_block(idx, block, block_size, input_ptr, input_numel, input_type_size, input_index_calc);
   }
   cipher(idx, block);
-  const auto new_block_size = transform(block);
-  copy_block_to_output(idx, block, new_block_size, output_elem_per_block, output_ptr, output_numel, output_type_size, output_index_calc);
+  transform(block);
+  copy_block_to_output(idx, block, output_elem_per_block, output_ptr, output_numel, output_type_size, output_index_calc);
 }
 
 #if defined(__CUDACC__) || defined(__HIPCC__)
-template<typename cipher_t, typename input_index_calc_t, typename output_index_calc_t, typename transform_t>
-__global__ static void block_cipher_kernel_cuda(cipher_t cipher, int block_size, int output_elem_per_block,
+template<int block_size, typename cipher_t, typename input_index_calc_t, typename output_index_calc_t, typename transform_t>
+__global__ static void block_cipher_kernel_cuda(cipher_t cipher, int output_elem_per_block,
     void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc,
     void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc,
     transform_t transform) {
   const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
-  block_cipher_kernel_helper(idx, cipher, block_size, output_elem_per_block
+  block_cipher_kernel_helper(idx, cipher, output_elem_per_block
     input_ptr, input_numel, input_type_size, input_index_calc,
     output_ptr, output_numel, output_type_size, output_index_calc,
     transform);
 }
 #endif
 
-template<typename cipher_t, typename input_index_calc_t, typename output_index_calc_t, typename transform_t>
-static void block_cipher_kernel_cpu_serial(int64_t begin, int64_t end, cipher_t cipher, int block_size, int output_elem_per_block,
+template<int block_size, typename cipher_t, typename input_index_calc_t, typename output_index_calc_t, typename transform_t>
+static void block_cipher_kernel_cpu_serial(int64_t begin, int64_t end, cipher_t cipher, int output_elem_per_block,
     void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc,
     void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc,
     transform_t transform) {
   for (auto idx = begin; idx < end; ++idx) {
-    block_cipher_kernel_helper(idx, cipher, block_size, output_elem_per_block,
+    block_cipher_kernel_helper<block_size>(idx, cipher, output_elem_per_block,
       input_ptr, input_numel, input_type_size, input_index_calc,
       output_ptr, output_numel, output_type_size, output_index_calc,
       transform);
   }
 }
 
-template<typename cipher_t, typename input_index_calc_t, typename output_index_calc_t, typename transform_t>
-static void block_cipher_kernel_cpu(int64_t total, cipher_t cipher, int block_size, int output_elem_per_block,
+template<int block_size, typename cipher_t, typename input_index_calc_t, typename output_index_calc_t, typename transform_t>
+static void block_cipher_kernel_cpu(int64_t total, cipher_t cipher, int output_elem_per_block,
     void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc,
     void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc,
     transform_t transform_func) {
   if (total < at::internal::GRAIN_SIZE || at::get_num_threads() == 1) {
-    block_cipher_kernel_cpu_serial(0, total, cipher, block_size, output_elem_per_block,
+    block_cipher_kernel_cpu_serial<block_size>(0, total, cipher, output_elem_per_block,
       input_ptr, input_numel, input_type_size, input_index_calc,
       output_ptr, output_numel, output_type_size, output_index_calc,
       transform_func);
   } else {
     at::parallel_for(0, total, at::internal::GRAIN_SIZE, [&](int64_t begin, int64_t end) {
-      block_cipher_kernel_cpu_serial(begin, end, cipher, block_size, output_elem_per_block,
+      block_cipher_kernel_cpu_serial<block_size>(begin, end, cipher, output_elem_per_block,
         input_ptr, input_numel, input_type_size, input_index_calc,
         output_ptr, output_numel, output_type_size, output_index_calc,
         transform_func);
@@ -126,11 +123,11 @@ static void block_cipher_kernel_cpu(int64_t total, cipher_t cipher, int block_si
   }
 }
 
-template<typename cipher_t, typename input_index_calc_t, typename output_index_calc_t, typename transform_t>
+template<int block_size, typename cipher_t, typename input_index_calc_t, typename output_index_calc_t, typename transform_t>
 void block_cipher(
     void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc,
     void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc,
-    Device device, cipher_t cipher, int block_size, int output_elem_per_block, transform_t transform_func) {
+    Device device, cipher_t cipher, int output_elem_per_block, transform_t transform_func) {
 //  if (input.numel() == 0) {
 //    return;
 //  }
@@ -143,8 +140,8 @@ void block_cipher(
 //    const auto total = (size_in_bytes + block_size - 1) / block_size;
 //    const auto total = (size_in_bytes + block_size / N - 1) / block_size * N;
     const auto total = (output_numel + output_elem_per_block - 1) / output_elem_per_block;
-    block_cipher_kernel_cpu(total,
-        cipher, block_size, output_elem_per_block,
+    block_cipher_kernel_cpu<block_size>(total,
+        cipher, output_elem_per_block,
         input_ptr, input_numel, input_type_size, input_index_calc,
         output_ptr, output_numel, output_type_size, output_index_calc,
         transform_func
@@ -154,7 +151,7 @@ void block_cipher(
     const auto threads = 256;
     const auto grid = (output_numel + (threads * output_elem_per_block) - 1) / (threads * output_elem_per_block);
     auto stream = at::cuda::getCurrentCUDAStream();
-    block_cipher_kernel_cuda<<<grid, threads, 0, stream>>>(
+    block_cipher_kernel_cuda<block_sizev><<<grid, threads, 0, stream>>>(
         cipher, block_size, output_elem_per_block,
         input_ptr, input_numel, input_type_size, input_index_calc,
         output_ptr, output_numel, output_type_size, output_index_calc,
@@ -184,9 +181,9 @@ std::function<int(int)> create_index_calc(Tensor input) {
   }
 }
 
-template<typename cipher_t>
+template<int block_size, typename cipher_t>
 void block_cipher(Tensor input, Tensor output,
-                  cipher_t cipher, int block_size) {
+                  cipher_t cipher) {
 
   const auto input_ptr = input.data_ptr();
   const auto input_numel = input.numel();
@@ -200,11 +197,11 @@ void block_cipher(Tensor input, Tensor output,
 
   const auto device = output.device();
 
-  block_cipher(
+  block_cipher<block_size>(
       input_ptr, input_numel, input_type_size, input_index_calc,
       output_ptr, output_numel, output_type_size, output_index_calc,
-      device, cipher, block_size, block_size / output_type_size,
-      [block_size] (auto x) { return block_size; });
+      device, cipher, block_size / output_type_size,
+      [] (auto x) {});
 }
 
 }}
diff --git a/torchcsprng/csrc/csprng.h b/torchcsprng/csrc/csprng.h
index 96618c7..2752d1a 100644
--- a/torchcsprng/csrc/csprng.h
+++ b/torchcsprng/csrc/csprng.h
@@ -114,7 +114,7 @@ template<typename scalar_t, typename uint_t, size_t N = 1, typename transform_t>
 void aes_helper(TensorIterator& iter, const uint8_t* key_bytes, transform_t transform_func) {
   auto  output = iter.tensor(0);
   const auto index_calc = create_index_calc(output);
-  block_cipher(
+  block_cipher<aes::block_t_size>(
     nullptr, 0, 0, index_calc,
     output.data_ptr(), output.numel(), output.element_size(), index_calc,
     iter.device_type(),
@@ -127,7 +127,7 @@ void aes_helper(TensorIterator& iter, const uint8_t* key_bytes, transform_t tran
         block[i] ^= idx_block[i];
       }
     },
-    aes::block_t_size, aes::block_t_size / (N * sizeof(uint_t)),
+    aes::block_t_size / (N * sizeof(uint_t)),
     [transform_func] (auto block) {
       const auto n = aes::block_t_size / (N * sizeof(uint_t));
 //      std::cout << "N = " << N << std::endl;
@@ -141,7 +141,6 @@ void aes_helper(TensorIterator& iter, const uint8_t* key_bytes, transform_t tran
         RNGValues<N> rng(vals);
         reinterpret_cast<scalar_t*>(block)[i] = transform_func(&rng);
       }
-      return n * sizeof(uint_t);
     }
   );
 }
@@ -485,14 +484,13 @@ Tensor encrypt_pybind(Tensor input, Tensor output, Tensor key, const std::string
   }
   const auto key_bytes = reinterpret_cast<uint8_t*>(key.contiguous().data_ptr());
   if (mode == "ecb") {
-    block_cipher(input, output,
+    block_cipher<aes::block_t_size>(input, output,
       [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void {
         aes::encrypt(block, key_bytes);
-      },
-      aes::block_t_size
+      }
     );
   } else if (mode == "ctr") {
-    block_cipher(input, output,
+    block_cipher<aes::block_t_size>(input, output,
       [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void {
         uint8_t idx_block[aes::block_t_size];
         std::memset(&idx_block, 0, aes::block_t_size);
@@ -501,8 +499,7 @@ Tensor encrypt_pybind(Tensor input, Tensor output, Tensor key, const std::string
         for (size_t i = 0; i < aes::block_t_size; i++) {
           block[i] ^= idx_block[i];
         }
-      },
-      aes::block_t_size
+      }
     );
   } else {
     TORCH_CHECK(false, "encrypt/decrypt supports \"ecb\" and \"ctr\" modes, \"", mode, "\" is not supported.");
@@ -520,14 +517,13 @@ Tensor decrypt_pybind(Tensor input, Tensor output, Tensor key, std::string ciphe
   }
   const auto key_bytes = reinterpret_cast<uint8_t*>(key.contiguous().data_ptr());
   if (mode == "ecb") {
-    block_cipher(input, output,
+    block_cipher<aes::block_t_size>(input, output,
       [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void {
         aes::decrypt(block, key_bytes);
-      },
-      aes::block_t_size
+      }
     );
   } else if (mode == "ctr") {
-    block_cipher(input, output,
+    block_cipher<aes::block_t_size>(input, output,
       [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void {
         uint8_t idx_block[aes::block_t_size];
         std::memset(&idx_block, 0, aes::block_t_size);
@@ -536,8 +532,7 @@ Tensor decrypt_pybind(Tensor input, Tensor output, Tensor key, std::string ciphe
         for (size_t i = 0; i < aes::block_t_size; i++) {
           block[i] ^= idx_block[i];
         }
-      },
-      aes::block_t_size
+      }
     );
   } else {
     TORCH_CHECK(false, "encrypt/decrypt supports \"ecb\" and \"ctr\" modes, \"", mode, "\" is not supported.");

From 6bd00308e7f24e9ce1b95c21989238932dd37b07 Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Tue, 17 Nov 2020 23:42:53 -0500
Subject: [PATCH 06/26] Update on "encrypt/decrypt"

[ghstack-poisoned]
---
 torchcsprng/csrc/block_cipher.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchcsprng/csrc/block_cipher.h b/torchcsprng/csrc/block_cipher.h
index ba1e427..62bb9a6 100644
--- a/torchcsprng/csrc/block_cipher.h
+++ b/torchcsprng/csrc/block_cipher.h
@@ -151,7 +151,7 @@ void block_cipher(
     const auto threads = 256;
     const auto grid = (output_numel + (threads * output_elem_per_block) - 1) / (threads * output_elem_per_block);
     auto stream = at::cuda::getCurrentCUDAStream();
-    block_cipher_kernel_cuda<block_sizev><<<grid, threads, 0, stream>>>(
+    block_cipher_kernel_cuda<block_size><<<grid, threads, 0, stream>>>(
         cipher, block_size, output_elem_per_block,
         input_ptr, input_numel, input_type_size, input_index_calc,
         output_ptr, output_numel, output_type_size, output_index_calc,

From 7465ef10dde96fce81e039509f507bcce4dcb0a3 Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Tue, 17 Nov 2020 23:48:32 -0500
Subject: [PATCH 07/26] Update on "encrypt/decrypt"

[ghstack-poisoned]
---
 torchcsprng/csrc/block_cipher.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchcsprng/csrc/block_cipher.h b/torchcsprng/csrc/block_cipher.h
index 62bb9a6..f1d9b2d 100644
--- a/torchcsprng/csrc/block_cipher.h
+++ b/torchcsprng/csrc/block_cipher.h
@@ -152,7 +152,7 @@ void block_cipher(
     const auto grid = (output_numel + (threads * output_elem_per_block) - 1) / (threads * output_elem_per_block);
     auto stream = at::cuda::getCurrentCUDAStream();
     block_cipher_kernel_cuda<block_size><<<grid, threads, 0, stream>>>(
-        cipher, block_size, output_elem_per_block,
+        cipher, output_elem_per_block,
         input_ptr, input_numel, input_type_size, input_index_calc,
         output_ptr, output_numel, output_type_size, output_index_calc,
         transform_func

From 247ca74ad46dfb7f066fa85c966edee1104e2b8f Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Tue, 17 Nov 2020 23:50:52 -0500
Subject: [PATCH 08/26] Update on "encrypt/decrypt"

[ghstack-poisoned]
---
 torchcsprng/csrc/block_cipher.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchcsprng/csrc/block_cipher.h b/torchcsprng/csrc/block_cipher.h
index f1d9b2d..631ffe4 100644
--- a/torchcsprng/csrc/block_cipher.h
+++ b/torchcsprng/csrc/block_cipher.h
@@ -83,7 +83,7 @@ __global__ static void block_cipher_kernel_cuda(cipher_t cipher, int output_elem
     void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc,
     transform_t transform) {
   const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
-  block_cipher_kernel_helper(idx, cipher, output_elem_per_block
+  block_cipher_kernel_helper(idx, cipher, output_elem_per_block,
     input_ptr, input_numel, input_type_size, input_index_calc,
     output_ptr, output_numel, output_type_size, output_index_calc,
     transform);

From f6a260a405de33a648c5b94a18214bf0f31de597 Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Wed, 18 Nov 2020 00:00:40 -0500
Subject: [PATCH 09/26] Update on "encrypt/decrypt"

[ghstack-poisoned]
---
 torchcsprng/csrc/block_cipher.h | 2 +-
 torchcsprng/csrc/csprng.h       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/torchcsprng/csrc/block_cipher.h b/torchcsprng/csrc/block_cipher.h
index 631ffe4..6db2cd5 100644
--- a/torchcsprng/csrc/block_cipher.h
+++ b/torchcsprng/csrc/block_cipher.h
@@ -201,7 +201,7 @@ void block_cipher(Tensor input, Tensor output,
       input_ptr, input_numel, input_type_size, input_index_calc,
       output_ptr, output_numel, output_type_size, output_index_calc,
       device, cipher, block_size / output_type_size,
-      [] (auto x) {});
+      [] TORCH_CSPRNG_HOST_DEVICE (auto x) {});
 }
 
 }}
diff --git a/torchcsprng/csrc/csprng.h b/torchcsprng/csrc/csprng.h
index 2752d1a..977b742 100644
--- a/torchcsprng/csrc/csprng.h
+++ b/torchcsprng/csrc/csprng.h
@@ -128,7 +128,7 @@ void aes_helper(TensorIterator& iter, const uint8_t* key_bytes, transform_t tran
       }
     },
     aes::block_t_size / (N * sizeof(uint_t)),
-    [transform_func] (auto block) {
+    [transform_func] TORCH_CSPRNG_HOST_DEVICE (auto block) {
       const auto n = aes::block_t_size / (N * sizeof(uint_t));
 //      std::cout << "N = " << N << std::endl;
 //      std::cout << "sizeof(uint_t) = " << sizeof(uint_t) << std::endl;

From 9cbd83f64bc512d8ddee7ded5030428dda576b40 Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Wed, 18 Nov 2020 00:06:58 -0500
Subject: [PATCH 10/26] Update on "encrypt/decrypt"

[ghstack-poisoned]
---
 torchcsprng/csrc/block_cipher.h | 2 +-
 torchcsprng/csrc/csprng.h       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/torchcsprng/csrc/block_cipher.h b/torchcsprng/csrc/block_cipher.h
index 6db2cd5..9b9aecf 100644
--- a/torchcsprng/csrc/block_cipher.h
+++ b/torchcsprng/csrc/block_cipher.h
@@ -201,7 +201,7 @@ void block_cipher(Tensor input, Tensor output,
       input_ptr, input_numel, input_type_size, input_index_calc,
       output_ptr, output_numel, output_type_size, output_index_calc,
       device, cipher, block_size / output_type_size,
-      [] TORCH_CSPRNG_HOST_DEVICE (auto x) {});
+      [] TORCH_CSPRNG_HOST_DEVICE (uint8_t* x) {});
 }
 
 }}
diff --git a/torchcsprng/csrc/csprng.h b/torchcsprng/csrc/csprng.h
index 977b742..23d2481 100644
--- a/torchcsprng/csrc/csprng.h
+++ b/torchcsprng/csrc/csprng.h
@@ -128,7 +128,7 @@ void aes_helper(TensorIterator& iter, const uint8_t* key_bytes, transform_t tran
       }
     },
     aes::block_t_size / (N * sizeof(uint_t)),
-    [transform_func] TORCH_CSPRNG_HOST_DEVICE (auto block) {
+    [transform_func] TORCH_CSPRNG_HOST_DEVICE (uint8_t* block) {
       const auto n = aes::block_t_size / (N * sizeof(uint_t));
 //      std::cout << "N = " << N << std::endl;
 //      std::cout << "sizeof(uint_t) = " << sizeof(uint_t) << std::endl;

From 5ae1104ac6b84025c6ac1a7a16eac6b8e677183d Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Wed, 18 Nov 2020 00:11:30 -0500
Subject: [PATCH 11/26] Update on "encrypt/decrypt"

[ghstack-poisoned]
---
 torchcsprng/csrc/block_cipher.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchcsprng/csrc/block_cipher.h b/torchcsprng/csrc/block_cipher.h
index 9b9aecf..bb1926e 100644
--- a/torchcsprng/csrc/block_cipher.h
+++ b/torchcsprng/csrc/block_cipher.h
@@ -83,7 +83,7 @@ __global__ static void block_cipher_kernel_cuda(cipher_t cipher, int output_elem
     void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc,
     transform_t transform) {
   const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
-  block_cipher_kernel_helper(idx, cipher, output_elem_per_block,
+  block_cipher_kernel_helper<block_size>(idx, cipher, output_elem_per_block,
     input_ptr, input_numel, input_type_size, input_index_calc,
     output_ptr, output_numel, output_type_size, output_index_calc,
     transform);

From 1ea40900d4707f75a5f9cd7c4b9c5b6d93e5896f Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Wed, 18 Nov 2020 00:28:43 -0500
Subject: [PATCH 12/26] Update on "encrypt/decrypt"

[ghstack-poisoned]
---
 torchcsprng/csrc/block_cipher.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchcsprng/csrc/block_cipher.h b/torchcsprng/csrc/block_cipher.h
index bb1926e..290d3a3 100644
--- a/torchcsprng/csrc/block_cipher.h
+++ b/torchcsprng/csrc/block_cipher.h
@@ -166,7 +166,7 @@ void block_cipher(
   }
 }
 
-std::function<int(int)> create_index_calc(Tensor input) {
+std::function<TORCH_CSPRNG_HOST_DEVICE int(int)> create_index_calc(Tensor input) {
   if (input.is_contiguous()) {
     const auto input_type_size = input.element_size();
     return [input_type_size] TORCH_CSPRNG_HOST_DEVICE (uint32_t li) -> uint32_t {

From a04096419a718177cbc0c35c305cd0a6eadc238c Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Wed, 18 Nov 2020 16:26:04 -0500
Subject: [PATCH 13/26] Update on "encrypt/decrypt"

[ghstack-poisoned]
---
 torchcsprng/csrc/block_cipher.h | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/torchcsprng/csrc/block_cipher.h b/torchcsprng/csrc/block_cipher.h
index 290d3a3..524e0c4 100644
--- a/torchcsprng/csrc/block_cipher.h
+++ b/torchcsprng/csrc/block_cipher.h
@@ -128,9 +128,9 @@ void block_cipher(
     void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc,
     void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc,
     Device device, cipher_t cipher, int output_elem_per_block, transform_t transform_func) {
-//  if (input.numel() == 0) {
-//    return;
-//  }
+  if (output_ptr == nullptr || output_numel == 0) {
+    return;
+  }
 //  TORCH_CHECK((input_numel * input_type_size + block_size - 1) / block_size * block_size == output_numel * output_type_size, "wrong size");
 
 //  const auto size_in_bytes = input_numel * input_type_size;
@@ -182,16 +182,26 @@ std::function<TORCH_CSPRNG_HOST_DEVICE int(int)> create_index_calc(Tensor input)
 }
 
 template<int block_size, typename cipher_t>
-void block_cipher(Tensor input, Tensor output,
-                  cipher_t cipher) {
-
+void block_cipher(Tensor input, Tensor output, cipher_t cipher) {
   const auto input_ptr = input.data_ptr();
   const auto input_numel = input.numel();
+
+  // Otherwise OffsetCalculator/IntDivider crashes with integer division by zero
+  if (input_ptr == nullptr || input_numel == 0) {
+    return;
+  }
+
   const auto input_type_size = input.element_size();
   const auto input_index_calc = create_index_calc(input);
 
   const auto output_ptr = output.data_ptr();
   const auto output_numel = output.numel();
+
+  // Otherwise OffsetCalculator/IntDivider crashes with integer division by zero
+  if (output_ptr == nullptr || output_numel == 0) {
+    return;
+  }
+
   const auto output_type_size = output.element_size();
   const auto output_index_calc = create_index_calc(output);
 

From 2ae9d3e663897d768849362205393cca64fbb434 Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Wed, 18 Nov 2020 16:34:31 -0500
Subject: [PATCH 14/26] Update on "encrypt/decrypt"

[ghstack-poisoned]
---
 torchcsprng/csrc/block_cipher.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/torchcsprng/csrc/block_cipher.h b/torchcsprng/csrc/block_cipher.h
index 524e0c4..2b75e7f 100644
--- a/torchcsprng/csrc/block_cipher.h
+++ b/torchcsprng/csrc/block_cipher.h
@@ -166,19 +166,19 @@ void block_cipher(
   }
 }
 
-std::function<TORCH_CSPRNG_HOST_DEVICE int(int)> create_index_calc(Tensor input) {
-  if (input.is_contiguous()) {
-    const auto input_type_size = input.element_size();
-    return [input_type_size] TORCH_CSPRNG_HOST_DEVICE (uint32_t li) -> uint32_t {
-      return li * input_type_size;
-    };
-  } else {
+auto create_index_calc(Tensor input) {
+//  if (input.is_contiguous()) {
+//    const auto input_type_size = input.element_size();
+//    return [input_type_size] TORCH_CSPRNG_HOST_DEVICE (uint32_t li) -> uint32_t {
+//      return li * input_type_size;
+//    };
+//  } else {
     const auto input_iter = TensorIterator::nullary_op(input);
     const auto input_offset_calc = make_offset_calculator<1>(input_iter);
     return [input_offset_calc] TORCH_CSPRNG_HOST_DEVICE (uint32_t li) -> uint32_t {
       return input_offset_calc.get(li)[0];
     };
-  }
+//  }
 }
 
 template<int block_size, typename cipher_t>

From cb7d77be20b24807d2b3b77864ce88da792f9c81 Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Wed, 18 Nov 2020 17:22:53 -0500
Subject: [PATCH 15/26] Update on "encrypt/decrypt"

[ghstack-poisoned]
---
 torchcsprng/csrc/block_cipher.h | 35 +++++++++++++++++++--------------
 torchcsprng/csrc/csprng.h       |  9 ++++++---
 2 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/torchcsprng/csrc/block_cipher.h b/torchcsprng/csrc/block_cipher.h
index 2b75e7f..0d8575c 100644
--- a/torchcsprng/csrc/block_cipher.h
+++ b/torchcsprng/csrc/block_cipher.h
@@ -166,20 +166,19 @@ void block_cipher(
   }
 }
 
-auto create_index_calc(Tensor input) {
-//  if (input.is_contiguous()) {
-//    const auto input_type_size = input.element_size();
-//    return [input_type_size] TORCH_CSPRNG_HOST_DEVICE (uint32_t li) -> uint32_t {
-//      return li * input_type_size;
+//auto create_index_calc(Tensor input) {
+////  if (input.is_contiguous()) {
+////    const auto input_type_size = input.element_size();
+////    return [input_type_size] TORCH_CSPRNG_HOST_DEVICE (uint32_t li) -> uint32_t {
+////      return li * input_type_size;
+////    };
+////  } else {
+//    const auto input_offset_calc = make_offset_calculator<1>(TensorIterator::nullary_op(input));
+//    return [input_offset_calc] TORCH_CSPRNG_HOST_DEVICE (uint32_t li) -> uint32_t {
+//      return input_offset_calc.get(li)[0];
 //    };
-//  } else {
-    const auto input_iter = TensorIterator::nullary_op(input);
-    const auto input_offset_calc = make_offset_calculator<1>(input_iter);
-    return [input_offset_calc] TORCH_CSPRNG_HOST_DEVICE (uint32_t li) -> uint32_t {
-      return input_offset_calc.get(li)[0];
-    };
-//  }
-}
+////  }
+//}
 
 template<int block_size, typename cipher_t>
 void block_cipher(Tensor input, Tensor output, cipher_t cipher) {
@@ -192,7 +191,10 @@ void block_cipher(Tensor input, Tensor output, cipher_t cipher) {
   }
 
   const auto input_type_size = input.element_size();
-  const auto input_index_calc = create_index_calc(input);
+  const auto input_offset_calc = make_offset_calculator<1>(TensorIterator::nullary_op(input));
+  const auto input_index_calc = [input_offset_calc] TORCH_CSPRNG_HOST_DEVICE (uint32_t li) -> uint32_t {
+    return input_offset_calc.get(li)[0];
+  };
 
   const auto output_ptr = output.data_ptr();
   const auto output_numel = output.numel();
@@ -203,7 +205,10 @@ void block_cipher(Tensor input, Tensor output, cipher_t cipher) {
   }
 
   const auto output_type_size = output.element_size();
-  const auto output_index_calc = create_index_calc(output);
+  const auto output_offset_calc = make_offset_calculator<1>(TensorIterator::nullary_op(output));
+  const auto output_index_calc = [output_offset_calc] TORCH_CSPRNG_HOST_DEVICE (uint32_t li) -> uint32_t {
+    return output_offset_calc.get(li)[0];
+  };
 
   const auto device = output.device();
 
diff --git a/torchcsprng/csrc/csprng.h b/torchcsprng/csrc/csprng.h
index 23d2481..a2c993d 100644
--- a/torchcsprng/csrc/csprng.h
+++ b/torchcsprng/csrc/csprng.h
@@ -113,10 +113,13 @@ struct RNGValues {
 template<typename scalar_t, typename uint_t, size_t N = 1, typename transform_t>
 void aes_helper(TensorIterator& iter, const uint8_t* key_bytes, transform_t transform_func) {
   auto  output = iter.tensor(0);
-  const auto index_calc = create_index_calc(output);
+  const auto output_offset_calc = make_offset_calculator<1>(TensorIterator::nullary_op(output));
+  const auto output_index_calc = [output_offset_calc] TORCH_CSPRNG_HOST_DEVICE (uint32_t li) -> uint32_t {
+    return output_offset_calc.get(li)[0];
+  };
   block_cipher<aes::block_t_size>(
-    nullptr, 0, 0, index_calc,
-    output.data_ptr(), output.numel(), output.element_size(), index_calc,
+    nullptr, 0, 0, output_index_calc,
+    output.data_ptr(), output.numel(), output.element_size(), output_index_calc,
     iter.device_type(),
     [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void {
       uint8_t idx_block[aes::block_t_size];

From fb23aae726498b53d647cf2377f5883957f537a2 Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Wed, 18 Nov 2020 19:28:01 -0500
Subject: [PATCH 16/26] Update on "encrypt/decrypt"

[ghstack-poisoned]
---
 test/test_csprng.py       | 4 ++--
 torchcsprng/csrc/csprng.h | 5 +----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/test/test_csprng.py b/test/test_csprng.py
index 3ed2b1e..b19a811 100644
--- a/test/test_csprng.py
+++ b/test/test_csprng.py
@@ -392,8 +392,8 @@ def sizeof(dtype):
                                     encrypted = torch.empty(encrypted_size, dtype=encrypted_dtype, device=device).random_()
                                     decrypted = torch.empty(decrypted_size, dtype=decrypted_dtype, device=device).random_()
 
-                                    initial_np = initial.numpy().view(np.int8)
-                                    decrypted_np = decrypted.numpy().view(np.int8)
+                                    initial_np = initial.cpu().numpy().view(np.int8)
+                                    decrypted_np = decrypted.cpu().numpy().view(np.int8)
                                     padding_size_bytes = initial_size * sizeof(initial_dtype) - decrypted_size * sizeof(decrypted_dtype)
                                     if padding_size_bytes != 0:
                                         decrypted_np = decrypted_np[:padding_size_bytes]
diff --git a/torchcsprng/csrc/csprng.h b/torchcsprng/csrc/csprng.h
index a2c993d..1cf03be 100644
--- a/torchcsprng/csrc/csprng.h
+++ b/torchcsprng/csrc/csprng.h
@@ -112,7 +112,7 @@ struct RNGValues {
 // `transform_func` is a callable that converts N `uint_t` random state sub-blocks passed in RNGValues into target dtype `scalar_t`
 template<typename scalar_t, typename uint_t, size_t N = 1, typename transform_t>
 void aes_helper(TensorIterator& iter, const uint8_t* key_bytes, transform_t transform_func) {
-  auto  output = iter.tensor(0);
+  const auto output = iter.tensor(0);
   const auto output_offset_calc = make_offset_calculator<1>(TensorIterator::nullary_op(output));
   const auto output_index_calc = [output_offset_calc] TORCH_CSPRNG_HOST_DEVICE (uint32_t li) -> uint32_t {
     return output_offset_calc.get(li)[0];
@@ -133,9 +133,6 @@ void aes_helper(TensorIterator& iter, const uint8_t* key_bytes, transform_t tran
     aes::block_t_size / (N * sizeof(uint_t)),
     [transform_func] TORCH_CSPRNG_HOST_DEVICE (uint8_t* block) {
       const auto n = aes::block_t_size / (N * sizeof(uint_t));
-//      std::cout << "N = " << N << std::endl;
-//      std::cout << "sizeof(uint_t) = " << sizeof(uint_t) << std::endl;
-//      std::cout << "n = " << n << std::endl;
       for (size_t i = 0; i < n; ++i) {
         uint64_t vals[N];
         for (size_t j = 0; j < N; ++j) {

From fa7970d2cecc93c6a4b1050fd73d770a731c3c81 Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Wed, 18 Nov 2020 19:31:29 -0500
Subject: [PATCH 17/26] Update on "encrypt/decrypt"

[ghstack-poisoned]
---
 torchcsprng/csrc/csprng.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchcsprng/csrc/csprng.h b/torchcsprng/csrc/csprng.h
index 1cf03be..56bb818 100644
--- a/torchcsprng/csrc/csprng.h
+++ b/torchcsprng/csrc/csprng.h
@@ -112,7 +112,7 @@ struct RNGValues {
 // `transform_func` is a callable that converts N `uint_t` random state sub-blocks passed in RNGValues into target dtype `scalar_t`
 template<typename scalar_t, typename uint_t, size_t N = 1, typename transform_t>
 void aes_helper(TensorIterator& iter, const uint8_t* key_bytes, transform_t transform_func) {
-  const auto output = iter.tensor(0);
+  auto output = iter.tensor(0);
   const auto output_offset_calc = make_offset_calculator<1>(TensorIterator::nullary_op(output));
   const auto output_index_calc = [output_offset_calc] TORCH_CSPRNG_HOST_DEVICE (uint32_t li) -> uint32_t {
     return output_offset_calc.get(li)[0];

From caefc3438b030a32e1378b0880981eda8a5521ab Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Wed, 18 Nov 2020 20:17:07 -0500
Subject: [PATCH 18/26] Update on "encrypt/decrypt"

[ghstack-poisoned]
---
 test/test_csprng.py             |  3 +++
 torchcsprng/csrc/block_cipher.h | 14 --------------
 2 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/test/test_csprng.py b/test/test_csprng.py
index b19a811..78e5438 100644
--- a/test/test_csprng.py
+++ b/test/test_csprng.py
@@ -404,6 +404,9 @@ def sizeof(dtype):
                                         self.assertFalse(np.array_equal(initial_np, decrypted_np))
 
                                     csprng.decrypt(encrypted, decrypted, key, "aes128", mode)
+                                    decrypted_np = decrypted.cpu().numpy().view(np.int8)
+                                    if padding_size_bytes != 0:
+                                        decrypted_np = decrypted_np[:padding_size_bytes]
 
                                     self.assertTrue(np.array_equal(initial_np, decrypted_np))
 
diff --git a/torchcsprng/csrc/block_cipher.h b/torchcsprng/csrc/block_cipher.h
index 0d8575c..e3822e5 100644
--- a/torchcsprng/csrc/block_cipher.h
+++ b/torchcsprng/csrc/block_cipher.h
@@ -166,20 +166,6 @@ void block_cipher(
   }
 }
 
-//auto create_index_calc(Tensor input) {
-////  if (input.is_contiguous()) {
-////    const auto input_type_size = input.element_size();
-////    return [input_type_size] TORCH_CSPRNG_HOST_DEVICE (uint32_t li) -> uint32_t {
-////      return li * input_type_size;
-////    };
-////  } else {
-//    const auto input_offset_calc = make_offset_calculator<1>(TensorIterator::nullary_op(input));
-//    return [input_offset_calc] TORCH_CSPRNG_HOST_DEVICE (uint32_t li) -> uint32_t {
-//      return input_offset_calc.get(li)[0];
-//    };
-////  }
-//}
-
 template<int block_size, typename cipher_t>
 void block_cipher(Tensor input, Tensor output, cipher_t cipher) {
   const auto input_ptr = input.data_ptr();

From 71e00c328428cbe344db4835ca324ec5e2d0a72e Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Wed, 18 Nov 2020 20:34:08 -0500
Subject: [PATCH 19/26] Update on "encrypt/decrypt"

[ghstack-poisoned]
---
 test/test_csprng.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/test/test_csprng.py b/test/test_csprng.py
index 78e5438..21f0642 100644
--- a/test/test_csprng.py
+++ b/test/test_csprng.py
@@ -69,11 +69,6 @@ def test_random_to_kstest(self):
                 for dtype in self.num_dtypes:
                     t = torch.zeros(self.size, dtype=dtype, device=device).random_(to_, generator=gen)
                     res = stats.kstest(t.cpu(), stats.randint.cdf, args=(0, to_))
-                    # TODO REVERT!
-                    # if res.statistic >= 0.1:
-                    #     print(t[:10])
-                    #     print(dtype)
-                    #     print(res.statistic)
                     self.assertTrue(res.statistic < 0.1)
 
     @unittest.skipIf(not torch.cuda.is_available() or not csprng.supports_cuda(), "CUDA is not available or csprng was not compiled with CUDA support")
@@ -95,11 +90,6 @@ def test_random_from_to_kstest(self):
                             if from_ < to_:
                                 t = torch.zeros(self.size, dtype=dtype, device=device).random_(from_, to_, generator=gen)
                                 res = stats.kstest(t.cpu(), stats.randint.cdf, args=(from_, to_))
-                                # TODO REVERT!
-                                # if res.statistic >= 0.1:
-                                #     print(t[:10])
-                                #     print(dtype)
-                                #     print(res.statistic)
                                 self.assertTrue(res.statistic < 0.2)
 
     @unittest.skipIf(not torch.cuda.is_available() or not csprng.supports_cuda(), "CUDA is not available or csprng was not compiled with CUDA support")

From 9469ece8e16c1bd67557b1120634aea998fe45ed Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Wed, 18 Nov 2020 21:06:10 -0500
Subject: [PATCH 20/26] Update on "encrypt/decrypt"

[ghstack-poisoned]
---
 torchcsprng/csrc/csprng.h | 86 +++++++++++++++++++++------------------
 1 file changed, 46 insertions(+), 40 deletions(-)

diff --git a/torchcsprng/csrc/csprng.h b/torchcsprng/csrc/csprng.h
index 56bb818..cf9fe0d 100644
--- a/torchcsprng/csrc/csprng.h
+++ b/torchcsprng/csrc/csprng.h
@@ -474,33 +474,57 @@ Tensor& randperm_generator_out(Tensor& result, int64_t n, c10::optional<Generato
 
 // ====================================================================================================================
 
-// Let's assume that input and output have integral dtype, so there is no transform for now.
-Tensor encrypt_pybind(Tensor input, Tensor output, Tensor key, const std::string& cipher, const std::string& mode) {
-//  TORCH_CHECK(input.numel() * input.itemsize() == output.numel() * output.itemsize(), "input and output tensors must have the same size in byte");
+void check_cipher(const std::string& cipher, Tensor key) {
   if (cipher == "aes128") {
     TORCH_CHECK(key.element_size() * key.numel() == 16, "key tensor must have 16 bytes(128 bits)");
   } else {
     TORCH_CHECK(false, "encrypt/decrypt supports \"aes128\" cipher, \"", cipher, "\" is not supported.");
   }
+}
+
+void aes_ecb_encrypt(Tensor input, Tensor output, uint8_t* key_bytes) {
+  block_cipher<aes::block_t_size>(input, output,
+    [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void {
+      aes::encrypt(block, key_bytes);
+    }
+  );
+}
+
+void aes_ecb_decrypt(Tensor input, Tensor output, uint8_t* key_bytes) {
+  block_cipher<aes::block_t_size>(input, output,
+    [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void {
+      aes::decrypt(block, key_bytes);
+    }
+  );
+}
+
+void aes_ctr_encrypt(Tensor input, Tensor output, uint8_t* key_bytes) {
+  block_cipher<aes::block_t_size>(input, output,
+    [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void {
+      uint8_t idx_block[aes::block_t_size];
+      std::memset(&idx_block, 0, aes::block_t_size);
+      *(reinterpret_cast<int64_t*>(idx_block)) = idx;
+      aes::encrypt(idx_block, key_bytes);
+      for (size_t i = 0; i < aes::block_t_size; i++) {
+        block[i] ^= idx_block[i];
+      }
+    }
+  );
+}
+
+void aes_ctr_decrypt(Tensor input, Tensor output, uint8_t* key_bytes) {
+  aes_ctr_encrypt(input, output, key_bytes);
+}
+
+// Let's assume that input and output have integral dtype, so there is no transform for now.
+Tensor encrypt_pybind(Tensor input, Tensor output, Tensor key, const std::string& cipher, const std::string& mode) {
+//  TORCH_CHECK(input.numel() * input.itemsize() == output.numel() * output.itemsize(), "input and output tensors must have the same size in byte");
+  check_cipher(cipher, key);
   const auto key_bytes = reinterpret_cast<uint8_t*>(key.contiguous().data_ptr());
   if (mode == "ecb") {
-    block_cipher<aes::block_t_size>(input, output,
-      [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void {
-        aes::encrypt(block, key_bytes);
-      }
-    );
+    aes_ecb_encrypt(input, output, key_bytes);
   } else if (mode == "ctr") {
-    block_cipher<aes::block_t_size>(input, output,
-      [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void {
-        uint8_t idx_block[aes::block_t_size];
-        std::memset(&idx_block, 0, aes::block_t_size);
-        *(reinterpret_cast<int64_t*>(idx_block)) = idx;
-        aes::encrypt(idx_block, key_bytes);
-        for (size_t i = 0; i < aes::block_t_size; i++) {
-          block[i] ^= idx_block[i];
-        }
-      }
-    );
+    aes_ctr_encrypt(input, output, key_bytes);
   } else {
     TORCH_CHECK(false, "encrypt/decrypt supports \"ecb\" and \"ctr\" modes, \"", mode, "\" is not supported.");
   }
@@ -510,30 +534,12 @@ Tensor encrypt_pybind(Tensor input, Tensor output, Tensor key, const std::string
 // Let's assume that input and output have integral dtype, so there is no transform for now.
 Tensor decrypt_pybind(Tensor input, Tensor output, Tensor key, std::string cipher, std::string mode) {
 //  TORCH_CHECK(input.numel() * input.itemsize() == output.numel() * output.itemsize(), "input and output tensors must have the same size in byte");
-  if (cipher == "aes128") {
-    TORCH_CHECK(key.element_size() * key.numel() == 16, "key tensor must have 16 bytes(128 bits)");
-  } else {
-    TORCH_CHECK(false, "encrypt/decrypt supports \"aes128\" cipher, \"", cipher, "\" is not supported.");
-  }
+  check_cipher(cipher, key);
   const auto key_bytes = reinterpret_cast<uint8_t*>(key.contiguous().data_ptr());
   if (mode == "ecb") {
-    block_cipher<aes::block_t_size>(input, output,
-      [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void {
-        aes::decrypt(block, key_bytes);
-      }
-    );
+    aes_ecb_decrypt(input, output, key_bytes);
   } else if (mode == "ctr") {
-    block_cipher<aes::block_t_size>(input, output,
-      [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void {
-        uint8_t idx_block[aes::block_t_size];
-        std::memset(&idx_block, 0, aes::block_t_size);
-        *(reinterpret_cast<int64_t*>(idx_block)) = idx;
-        aes::encrypt(idx_block, key_bytes);
-        for (size_t i = 0; i < aes::block_t_size; i++) {
-          block[i] ^= idx_block[i];
-        }
-      }
-    );
+    aes_ctr_decrypt(input, output, key_bytes);
   } else {
     TORCH_CHECK(false, "encrypt/decrypt supports \"ecb\" and \"ctr\" modes, \"", mode, "\" is not supported.");
   }

From 8b01c563dd229b763c5b85faef696bb9a4d05ab2 Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Wed, 18 Nov 2020 21:35:13 -0500
Subject: [PATCH 21/26] Update on "encrypt/decrypt"

[ghstack-poisoned]
---
 torchcsprng/csrc/aes.h    | 10 +---------
 torchcsprng/csrc/csprng.h | 13 +++++++++++--
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/torchcsprng/csrc/aes.h b/torchcsprng/csrc/aes.h
index bd41928..c4ece07 100644
--- a/torchcsprng/csrc/aes.h
+++ b/torchcsprng/csrc/aes.h
@@ -63,15 +63,7 @@ namespace aes {
     #define Nr 10       // The number of rounds in AES Cipher.
 #endif
 
-#if !defined(__CUDACC__) && !defined(__HIPCC__)
-struct ulonglong2 // TODO: should have something like `__builtin_align__(16)`
-{
-  unsigned long long int x, y;
-};
-#endif
-
-typedef ulonglong2 block_t;
-constexpr size_t block_t_size = sizeof(block_t);
+constexpr size_t block_t_size = 16;
 
 typedef uint8_t state_t[4][4];
 
diff --git a/torchcsprng/csrc/csprng.h b/torchcsprng/csrc/csprng.h
index cf9fe0d..21924db 100644
--- a/torchcsprng/csrc/csprng.h
+++ b/torchcsprng/csrc/csprng.h
@@ -518,7 +518,13 @@ void aes_ctr_decrypt(Tensor input, Tensor output, uint8_t* key_bytes) {
 
 // Let's assume that input and output have integral dtype, so there is no transform for now.
 Tensor encrypt_pybind(Tensor input, Tensor output, Tensor key, const std::string& cipher, const std::string& mode) {
-//  TORCH_CHECK(input.numel() * input.itemsize() == output.numel() * output.itemsize(), "input and output tensors must have the same size in byte");
+  const auto output_size_bytes = output.numel() * output.itemsize();
+  const auto input_size_bytes = input.numel() * input.itemsize();
+  const auto input_size_bytes_rounded = (input_size_bytes + aes::block_t_size - 1) / aes::block_t_size * aes::block_t_size;
+  TORCH_CHECK(output_size_bytes == input_size_bytes_rounded,
+              "output size in bytes(", output_size_bytes,
+              ") is not equal to input size in bytes rounded to block size(",
+              input_size_bytes_rounded, ")");
   check_cipher(cipher, key);
   const auto key_bytes = reinterpret_cast<uint8_t*>(key.contiguous().data_ptr());
   if (mode == "ecb") {
@@ -533,7 +539,10 @@ Tensor encrypt_pybind(Tensor input, Tensor output, Tensor key, const std::string
 
 // Let's assume that input and output have integral dtype, so there is no transform for now.
 Tensor decrypt_pybind(Tensor input, Tensor output, Tensor key, std::string cipher, std::string mode) {
-//  TORCH_CHECK(input.numel() * input.itemsize() == output.numel() * output.itemsize(), "input and output tensors must have the same size in byte");
+  const auto output_size_bytes = output.numel() * output.itemsize();
+  const auto input_size_bytes = input.numel() * input.itemsize();
+  TORCH_CHECK(output_size_bytes == input_size_bytes, "input and output tensors must have the same size in byte");
+  TORCH_CHECK(input_size_bytes % aes::block_t_size == 0, "input tensor size in bytes must divisible by cipher block size in bytes without reminder");
   check_cipher(cipher, key);
   const auto key_bytes = reinterpret_cast<uint8_t*>(key.contiguous().data_ptr());
   if (mode == "ecb") {

From 7f91368fa3d8db1b6e4769e9a9c1aa77255e7b2b Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Wed, 18 Nov 2020 21:38:08 -0500
Subject: [PATCH 22/26] Update on "encrypt/decrypt"

[ghstack-poisoned]
---
 torchcsprng/csrc/csprng.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchcsprng/csrc/csprng.h b/torchcsprng/csrc/csprng.h
index 21924db..c59fb23 100644
--- a/torchcsprng/csrc/csprng.h
+++ b/torchcsprng/csrc/csprng.h
@@ -542,7 +542,7 @@ Tensor decrypt_pybind(Tensor input, Tensor output, Tensor key, std::string ciphe
   const auto output_size_bytes = output.numel() * output.itemsize();
   const auto input_size_bytes = input.numel() * input.itemsize();
   TORCH_CHECK(output_size_bytes == input_size_bytes, "input and output tensors must have the same size in byte");
-  TORCH_CHECK(input_size_bytes % aes::block_t_size == 0, "input tensor size in bytes must divisible by cipher block size in bytes without reminder");
+  TORCH_CHECK(input_size_bytes % aes::block_t_size == 0, "input tensor size in bytes must divisible by cipher block size in bytes");
   check_cipher(cipher, key);
   const auto key_bytes = reinterpret_cast<uint8_t*>(key.contiguous().data_ptr());
   if (mode == "ecb") {

From 5d9c1c75c54b786f38332d47d72f8d746188017e Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Wed, 18 Nov 2020 22:41:18 -0500
Subject: [PATCH 23/26] Update on "torchcsprng.encrypt/torchcsprng.decrypt with
 AES128 ECB/CTR support"

This PR introduces two new methods to `torchcsprng` module:
* `torchcsprng.encrypt(input: Tensor, output: Tensor, key: Tensor, cipher: string, mode: string)`

> - `input` tensor can be any CPU or CUDA tensor of any dtype and any size in bytes (zero-padding is used to make its size in bytes divisible by the block size in bytes)
> - `output` tensor can have any dtype but must be on the same device as the `input` tensor, and its size in bytes must equal the `input` size in bytes rounded up to the block size in bytes (16 bytes for AES 128)
> - `key` tensor can be any CPU or CUDA tensor of any dtype with size in bytes equal to 16 for AES 128
> - `cipher` currently supports only one value: `"aes128"`
> - `mode` can currently be either [`"ecb"`](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Electronic_codebook_(ECB)) or [`"ctr"`](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Counter_(CTR))

* `torchcsprng.decrypt(input: Tensor, output: Tensor, key: Tensor, cipher: string, mode: string)`

> - `input` tensor can be any CPU or CUDA tensor of any dtype with size in bytes divisible by the block size in bytes (16 bytes for AES 128)
> - `output` tensor can have any dtype but must be on the same device and have the same size in bytes as the `input` tensor
> - `key` tensor can be any CPU or CUDA tensor of any dtype with size in bytes equal to 16 for AES 128
> - `cipher` currently supports only one value: `"aes128"`
> - `mode` can currently be either [`"ecb"`](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Electronic_codebook_(ECB)) or [`"ctr"`](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Counter_(CTR))

Also this PR unifies encryption/decryption with existing random number generation which uses AES128 in CTR mode

Fixes #77


[ghstack-poisoned]
---
 torchcsprng/csrc/csprng.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/torchcsprng/csrc/csprng.h b/torchcsprng/csrc/csprng.h
index c59fb23..4c50eb2 100644
--- a/torchcsprng/csrc/csprng.h
+++ b/torchcsprng/csrc/csprng.h
@@ -518,6 +518,7 @@ void aes_ctr_decrypt(Tensor input, Tensor output, uint8_t* key_bytes) {
 
 // Let's assume that input and output have integral dtype, so there is no transform for now.
 Tensor encrypt_pybind(Tensor input, Tensor output, Tensor key, const std::string& cipher, const std::string& mode) {
+  TORCH_CHECK(input.device() == output.device(), "input and output tensors must have the same device");
   const auto output_size_bytes = output.numel() * output.itemsize();
   const auto input_size_bytes = input.numel() * input.itemsize();
   const auto input_size_bytes_rounded = (input_size_bytes + aes::block_t_size - 1) / aes::block_t_size * aes::block_t_size;
@@ -526,7 +527,7 @@ Tensor encrypt_pybind(Tensor input, Tensor output, Tensor key, const std::string
               ") is not equal to input size in bytes rounded to block size(",
               input_size_bytes_rounded, ")");
   check_cipher(cipher, key);
-  const auto key_bytes = reinterpret_cast<uint8_t*>(key.contiguous().data_ptr());
+  const auto key_bytes = reinterpret_cast<uint8_t*>(key.cpu().contiguous().data_ptr());
   if (mode == "ecb") {
     aes_ecb_encrypt(input, output, key_bytes);
   } else if (mode == "ctr") {
@@ -539,12 +540,13 @@ Tensor encrypt_pybind(Tensor input, Tensor output, Tensor key, const std::string
 
 // Let's assume that input and output have integral dtype, so there is no transform for now.
 Tensor decrypt_pybind(Tensor input, Tensor output, Tensor key, std::string cipher, std::string mode) {
+  TORCH_CHECK(input.device() == output.device(), "input and output tensors must have the same device");
   const auto output_size_bytes = output.numel() * output.itemsize();
   const auto input_size_bytes = input.numel() * input.itemsize();
   TORCH_CHECK(output_size_bytes == input_size_bytes, "input and output tensors must have the same size in byte");
   TORCH_CHECK(input_size_bytes % aes::block_t_size == 0, "input tensor size in bytes must divisible by cipher block size in bytes");
   check_cipher(cipher, key);
-  const auto key_bytes = reinterpret_cast<uint8_t*>(key.contiguous().data_ptr());
+  const auto key_bytes = reinterpret_cast<uint8_t*>(key.cpu().contiguous().data_ptr());
   if (mode == "ecb") {
     aes_ecb_decrypt(input, output, key_bytes);
   } else if (mode == "ctr") {

From c1c75bba4908c7120b74217a402a18510b48796e Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Wed, 18 Nov 2020 22:48:33 -0500
Subject: [PATCH 24/26] Update on "torchcsprng.encrypt/torchcsprng.decrypt with
 AES128 ECB/CTR support"

This PR introduces two new methods to `torchcsprng` module:
* `torchcsprng.encrypt(input: Tensor, output: Tensor, key: Tensor, cipher: string, mode: string)`

> - `input` tensor can be any CPU or CUDA tensor of any dtype and any size in bytes (zero-padding is used to make its size in bytes divisible by the block size in bytes)
> - `output` tensor can have any dtype but must be on the same device as the `input` tensor, and its size in bytes must equal the `input` size in bytes rounded up to the block size in bytes (16 bytes for AES 128)
> - `key` tensor can have any dtype but must be on the same device as the `input` tensor, and its size in bytes must equal 16 for AES 128
> - `cipher` currently supports only one value: `"aes128"`
> - `mode` can currently be either [`"ecb"`](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Electronic_codebook_(ECB)) or [`"ctr"`](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Counter_(CTR))

* `torchcsprng.decrypt(input: Tensor, output: Tensor, key: Tensor, cipher: string, mode: string)`

> - `input` tensor can be any CPU or CUDA tensor of any dtype with size in bytes divisible by the block size in bytes (16 bytes for AES 128)
> - `output` tensor can have any dtype but must be on the same device and have the same size in bytes as the `input` tensor
> - `key` tensor can have any dtype but must be on the same device as the `input` tensor, and its size in bytes must equal 16 for AES 128
> - `cipher` currently supports only one value: `"aes128"`
> - `mode` can currently be either [`"ecb"`](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Electronic_codebook_(ECB)) or [`"ctr"`](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Counter_(CTR))

Also this PR unifies encryption/decryption with existing random number generation which uses AES128 in CTR mode

Fixes #77


[ghstack-poisoned]
---
 torchcsprng/csrc/csprng.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/torchcsprng/csrc/csprng.h b/torchcsprng/csrc/csprng.h
index 4c50eb2..57c4e54 100644
--- a/torchcsprng/csrc/csprng.h
+++ b/torchcsprng/csrc/csprng.h
@@ -518,7 +518,7 @@ void aes_ctr_decrypt(Tensor input, Tensor output, uint8_t* key_bytes) {
 
 // Let's assume that input and output have integral dtype, so there is no transform for now.
 Tensor encrypt_pybind(Tensor input, Tensor output, Tensor key, const std::string& cipher, const std::string& mode) {
-  TORCH_CHECK(input.device() == output.device(), "input and output tensors must have the same device");
+  TORCH_CHECK(input.device() == output.device() && input.device() == key.device(), "input, output and key tensors must have the same device");
   const auto output_size_bytes = output.numel() * output.itemsize();
   const auto input_size_bytes = input.numel() * input.itemsize();
   const auto input_size_bytes_rounded = (input_size_bytes + aes::block_t_size - 1) / aes::block_t_size * aes::block_t_size;
@@ -527,7 +527,7 @@ Tensor encrypt_pybind(Tensor input, Tensor output, Tensor key, const std::string
               ") is not equal to input size in bytes rounded to block size(",
               input_size_bytes_rounded, ")");
   check_cipher(cipher, key);
-  const auto key_bytes = reinterpret_cast<uint8_t*>(key.cpu().contiguous().data_ptr());
+  const auto key_bytes = reinterpret_cast<uint8_t*>(key.contiguous().data_ptr());
   if (mode == "ecb") {
     aes_ecb_encrypt(input, output, key_bytes);
   } else if (mode == "ctr") {
@@ -540,13 +540,13 @@ Tensor encrypt_pybind(Tensor input, Tensor output, Tensor key, const std::string
 
 // Let's assume that input and output have integral dtype, so there is no transform for now.
 Tensor decrypt_pybind(Tensor input, Tensor output, Tensor key, std::string cipher, std::string mode) {
-  TORCH_CHECK(input.device() == output.device(), "input and output tensors must have the same device");
+  TORCH_CHECK(input.device() == output.device() && input.device() == key.device(), "input, output and key tensors must have the same device");
   const auto output_size_bytes = output.numel() * output.itemsize();
   const auto input_size_bytes = input.numel() * input.itemsize();
   TORCH_CHECK(output_size_bytes == input_size_bytes, "input and output tensors must have the same size in byte");
   TORCH_CHECK(input_size_bytes % aes::block_t_size == 0, "input tensor size in bytes must divisible by cipher block size in bytes");
   check_cipher(cipher, key);
-  const auto key_bytes = reinterpret_cast<uint8_t*>(key.cpu().contiguous().data_ptr());
+  const auto key_bytes = reinterpret_cast<uint8_t*>(key.contiguous().data_ptr());
   if (mode == "ecb") {
     aes_ecb_decrypt(input, output, key_bytes);
   } else if (mode == "ctr") {

From c45ff91962cdc0022202d6ba5ffe03391e841288 Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Thu, 19 Nov 2020 17:45:01 -0500
Subject: [PATCH 25/26] Update on "torchcsprng.encrypt/torchcsprng.decrypt with
 AES128 ECB/CTR support"

This PR introduces two new methods to `torchcsprng` module:
* `torchcsprng.encrypt(input: Tensor, output: Tensor, key: Tensor, cipher: string, mode: string)`

> - `input` tensor can be any CPU or CUDA tensor of any dtype and any size in bytes (zero-padding is used to make its size in bytes divisible by the block size in bytes)
> - `output` tensor can have any dtype but must be on the same device as the `input` tensor, and its size in bytes must equal the `input` size in bytes rounded up to the block size in bytes (16 bytes for AES 128)
> - `key` tensor can have any dtype but must be on the same device as the `input` tensor, and its size in bytes must equal 16 for AES 128
> - `cipher` currently supports only one value: `"aes128"`
> - `mode` can currently be either [`"ecb"`](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Electronic_codebook_(ECB)) or [`"ctr"`](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Counter_(CTR))

* `torchcsprng.decrypt(input: Tensor, output: Tensor, key: Tensor, cipher: string, mode: string)`

> - `input` tensor can be any CPU or CUDA tensor of any dtype with size in bytes divisible by the block size in bytes (16 bytes for AES 128)
> - `output` tensor can have any dtype but must be on the same device and have the same size in bytes as the `input` tensor
> - `key` tensor can have any dtype but must be on the same device as the `input` tensor, and its size in bytes must equal 16 for AES 128
> - `cipher` currently supports only one value: `"aes128"`
> - `mode` can currently be either [`"ecb"`](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Electronic_codebook_(ECB)) or [`"ctr"`](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Counter_(CTR))

Also this PR unifies encryption/decryption with existing random number generation which uses AES128 in CTR mode

Fixes #77


Differential Revision: [D25080624](https://our.internmc.facebook.com/intern/diff/D25080624)

[ghstack-poisoned]
---
 torchcsprng/csrc/block_cipher.h | 6 ------
 torchcsprng/csrc/csprng.h       | 8 ++++----
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/torchcsprng/csrc/block_cipher.h b/torchcsprng/csrc/block_cipher.h
index e3822e5..d099b5a 100644
--- a/torchcsprng/csrc/block_cipher.h
+++ b/torchcsprng/csrc/block_cipher.h
@@ -131,14 +131,8 @@ void block_cipher(
   if (output_ptr == nullptr || output_numel == 0) {
     return;
   }
-//  TORCH_CHECK((input_numel * input_type_size + block_size - 1) / block_size * block_size == output_numel * output_type_size, "wrong size");
-
-//  const auto size_in_bytes = input_numel * input_type_size;
-//  const auto size_in_bytes = output_numel * output_type_size;
 
   if (device.type() == at::kCPU) {
-//    const auto total = (size_in_bytes + block_size - 1) / block_size;
-//    const auto total = (size_in_bytes + block_size / N - 1) / block_size * N;
     const auto total = (output_numel + output_elem_per_block - 1) / output_elem_per_block;
     block_cipher_kernel_cpu<block_size>(total,
         cipher, output_elem_per_block,
diff --git a/torchcsprng/csrc/csprng.h b/torchcsprng/csrc/csprng.h
index 57c4e54..57d584e 100644
--- a/torchcsprng/csrc/csprng.h
+++ b/torchcsprng/csrc/csprng.h
@@ -67,17 +67,17 @@ at::Tensor key_tensor(size_t block_t_size, c10::optional<at::Generator> generato
   if (gen->key().defined()) {
     return gen->key().clone();
   }
-  auto t = torch::empty({static_cast<signed long>(block_t_size)}, torch::kUInt8);
-  using random_t = uint32_t;
+  auto key = torch::empty({static_cast<signed long>(block_t_size)}, torch::kUInt8);
+  using random_t = typename std::result_of<decltype(&RNG::random)(RNG)>::type;
   constexpr size_t random_t_size = sizeof(random_t);
   for (size_t i = 0; i < block_t_size / random_t_size; i++) {
     const auto rand = gen->random();
     for (size_t j = 0; j < random_t_size; j++) {
       size_t k = i * random_t_size + j;
-      t[k] = static_cast<uint8_t>((rand >> (j * 8)) & 0xff);
+      key[k] = static_cast<uint8_t>((rand >> (j * 8)) & 0xff);
     }
   }
-  return t;
+  return key;
 }
 
 template<typename RNG>

From 36545c291cea483c9ce916f520f2cb5eb889d62f Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Mon, 30 Nov 2020 11:37:31 -0500
Subject: [PATCH 26/26] Update on "torchcsprng.encrypt/torchcsprng.decrypt with
 AES128 ECB/CTR support"

This PR introduces two new methods to `torchcsprng` module:
* `torchcsprng.encrypt(input: Tensor, output: Tensor, key: Tensor, cipher: string, mode: string)`

> - `input` tensor can be any CPU or CUDA tensor of any dtype and any size in bytes (zero-padding is used to make its size in bytes divisible by the block size in bytes)
> - `output` tensor can have any dtype but must be on the same device as the `input` tensor, and its size in bytes must equal the `input` size in bytes rounded up to the block size in bytes (16 bytes for AES 128)
> - `key` tensor can have any dtype but must be on the same device as the `input` tensor, and its size in bytes must equal 16 for AES 128
> - `cipher` currently supports only one value: `"aes128"`
> - `mode` can currently be either [`"ecb"`](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Electronic_codebook_(ECB)) or [`"ctr"`](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Counter_(CTR))

* `torchcsprng.decrypt(input: Tensor, output: Tensor, key: Tensor, cipher: string, mode: string)`

> - `input` tensor can be any CPU or CUDA tensor of any dtype with size in bytes divisible by the block size in bytes (16 bytes for AES 128)
> - `output` tensor can have any dtype but must be on the same device and have the same size in bytes as the `input` tensor
> - `key` tensor can have any dtype but must be on the same device as the `input` tensor, and its size in bytes must equal 16 for AES 128
> - `cipher` currently supports only one value: `"aes128"`
> - `mode` can currently be either [`"ecb"`](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Electronic_codebook_(ECB)) or [`"ctr"`](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Counter_(CTR))

Also this PR unifies encryption/decryption with existing random number generation which uses AES128 in CTR mode

Fixes #77


Differential Revision: [D25080624](https://our.internmc.facebook.com/intern/diff/D25080624)

[ghstack-poisoned]
---
 torchcsprng/csrc/block_cipher.h | 4 ++--
 torchcsprng/csrc/csprng.h       | 4 +---
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/torchcsprng/csrc/block_cipher.h b/torchcsprng/csrc/block_cipher.h
index d099b5a..8418e3d 100644
--- a/torchcsprng/csrc/block_cipher.h
+++ b/torchcsprng/csrc/block_cipher.h
@@ -37,7 +37,7 @@ TORCH_CSPRNG_HOST_DEVICE static void copy_input_to_block(int64_t idx, uint8_t* b
     const auto linear_index = idx * (block_size / input_type_size) + i;
     if (linear_index < input_numel) {
       std::memcpy(
-          &(block[i * input_type_size]),
+          block + i * input_type_size,
           &(reinterpret_cast<uint8_t*>(input_ptr)[input_index_calc(linear_index)]),
           input_type_size
       );
@@ -53,7 +53,7 @@ TORCH_CSPRNG_HOST_DEVICE static void copy_block_to_output(int64_t idx, uint8_t*
     if (linear_index < output_numel) {
       std::memcpy(
           &(reinterpret_cast<uint8_t*>(output_ptr)[output_index_calc(linear_index)]),
-          &(block[i * output_type_size]),
+          block + i * output_type_size,
           output_type_size
       );
     }
diff --git a/torchcsprng/csrc/csprng.h b/torchcsprng/csrc/csprng.h
index 57d584e..cf31c10 100644
--- a/torchcsprng/csrc/csprng.h
+++ b/torchcsprng/csrc/csprng.h
@@ -516,7 +516,6 @@ void aes_ctr_decrypt(Tensor input, Tensor output, uint8_t* key_bytes) {
   aes_ctr_encrypt(input, output, key_bytes);
 }
 
-// Let's assume that input and output have integral dtype, so there is no transform for now.
 Tensor encrypt_pybind(Tensor input, Tensor output, Tensor key, const std::string& cipher, const std::string& mode) {
   TORCH_CHECK(input.device() == output.device() && input.device() == key.device(), "input, output and key tensors must have the same device");
   const auto output_size_bytes = output.numel() * output.itemsize();
@@ -538,8 +537,7 @@ Tensor encrypt_pybind(Tensor input, Tensor output, Tensor key, const std::string
   return output;
 }
 
-// Let's assume that input and output have integral dtype, so there is no transform for now.
-Tensor decrypt_pybind(Tensor input, Tensor output, Tensor key, std::string cipher, std::string mode) {
+Tensor decrypt_pybind(Tensor input, Tensor output, Tensor key, const std::string& cipher, const std::string& mode) {
   TORCH_CHECK(input.device() == output.device() && input.device() == key.device(), "input, output and key tensors must have the same device");
   const auto output_size_bytes = output.numel() * output.itemsize();
   const auto input_size_bytes = input.numel() * input.itemsize();