diff --git a/test/test_csprng.py b/test/test_csprng.py
index b4c5443..21f0642 100644
--- a/test/test_csprng.py
+++ b/test/test_csprng.py
@@ -354,5 +354,51 @@ def test_const_generator(self):
                     second = torch.empty(self.size, dtype=dtype, device=device).random_(generator=const_gen)
                     self.assertTrue((first - second).max().abs() == 0)
 
+    def test_encrypt_decrypt(self):
+        key_size_bytes = 16
+        block_size_bytes = 16
+        # sizeof(dtype): size in bytes of one element of `dtype` (bool is counted as one byte).
+        def sizeof(dtype):
+            if dtype == torch.bool:
+                return 1
+            elif dtype.is_floating_point:
+                return torch.finfo(dtype).bits // 8
+            else:
+                return torch.iinfo(dtype).bits // 8
+        # Round-trip through every device, key dtype, (initial, encrypted, decrypted) dtype triple, size and mode.
+        for device in self.all_devices:
+            for key_dtype in self.all_dtypes:
+                key_size = key_size_bytes // sizeof(key_dtype)
+                key = torch.empty(key_size, dtype=key_dtype, device=device).random_()
+                for initial_dtype in self.all_dtypes:
+                    for encrypted_dtype in self.all_dtypes:
+                        for decrypted_dtype in self.all_dtypes:
+                            for initial_size in [0, 4, 8, 15, 16, 23, 42]:
+                                for mode in ["ecb", "ctr"]:
+                                    encrypted_size = (initial_size * sizeof(initial_dtype) + block_size_bytes - 1) // block_size_bytes * block_size_bytes // sizeof(encrypted_dtype)
+                                    decrypted_size = (encrypted_size * sizeof(encrypted_dtype) + block_size_bytes - 1) // block_size_bytes * block_size_bytes // sizeof(decrypted_dtype)
+                                    # The element counts above are byte sizes rounded up to whole 16-byte blocks.
+                                    initial = torch.empty(initial_size, dtype=initial_dtype, device=device).random_()
+                                    encrypted = torch.empty(encrypted_size, dtype=encrypted_dtype, device=device).random_()
+                                    decrypted = torch.empty(decrypted_size, dtype=decrypted_dtype, device=device).random_()
+                                    # Compare raw bytes through int8 views so the three dtypes are allowed to differ.
+                                    initial_np = initial.cpu().numpy().view(np.int8)
+                                    decrypted_np = decrypted.cpu().numpy().view(np.int8)
+                                    padding_size_bytes = initial_size * sizeof(initial_dtype) - decrypted_size * sizeof(decrypted_dtype)
+                                    if padding_size_bytes != 0:
+                                        decrypted_np = decrypted_np[:padding_size_bytes]  # negative stop drops the trailing padding bytes
+
+                                    csprng.encrypt(initial, encrypted, key, "aes128", mode)
+                                    # NOTE(review): the check below compares `initial` with the still-random `decrypted` buffer, not with `encrypted` - confirm intent.
+                                    if initial_size > 8:
+                                        self.assertFalse(np.array_equal(initial_np, decrypted_np))
+
+                                    csprng.decrypt(encrypted, decrypted, key, "aes128", mode)
+                                    decrypted_np = decrypted.cpu().numpy().view(np.int8)
+                                    if padding_size_bytes != 0:
+                                        decrypted_np = decrypted_np[:padding_size_bytes]
+
+                                    self.assertTrue(np.array_equal(initial_np, decrypted_np))
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/torchcsprng/csrc/aes.h b/torchcsprng/csrc/aes.h
index 09596f1..c4ece07 100644
--- a/torchcsprng/csrc/aes.h
+++ b/torchcsprng/csrc/aes.h
@@ -63,15 +63,7 @@ namespace aes {
     #define Nr 10       // The number of rounds in AES Cipher.
 #endif
 
-#if !defined(__CUDACC__) && !defined(__HIPCC__)
-struct ulonglong2 // TODO: should have something like `__builtin_align__(16)`
-{
-  unsigned long long int x, y;
-};
-#endif
-
-typedef ulonglong2 block_t;
-constexpr size_t block_t_size = sizeof(block_t);
+constexpr size_t block_t_size = 16; // cipher block size in bytes (same size as the 4x4-byte state_t)
 
 typedef uint8_t state_t[4][4];
 
@@ -97,6 +89,24 @@ TORCH_CSPRNG_CONSTANT const uint8_t sbox[256] = {
   0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
   0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 };
 
+TORCH_CSPRNG_CONSTANT const uint8_t rsbox[256] = { // inverse substitution table, read through getSBoxInvert()
+    0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
+    0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
+    0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
+    0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
+    0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
+    0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
+    0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
+    0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
+    0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
+    0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
+    0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
+    0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
+    0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
+    0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
+    0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
+    0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d };
+
 // The round constant word array, Rcon[i], contains the values given by 
 // x to the power (i-1) being powers of x (x is denoted as {02}) in the field GF(2^8)
 TORCH_CSPRNG_CONSTANT const uint8_t Rcon[11] = {
@@ -104,6 +114,8 @@ TORCH_CSPRNG_CONSTANT const uint8_t Rcon[11] = {
 
 #define getSBoxValue(num) (sbox[(num)])
 
+#define getSBoxInvert(num) (rsbox[(num)]) // rsbox lookup, counterpart of getSBoxValue
+
 // This function produces Nb(Nr+1) round keys. The round keys are used in each round to decrypt the states. 
 TORCH_CSPRNG_HOST_DEVICE void KeyExpansion(uint8_t* RoundKey, const uint8_t* Key){
   unsigned int i, j, k;
@@ -257,6 +269,78 @@ TORCH_CSPRNG_HOST_DEVICE void MixColumns(state_t* state)
   }
 }
 
+TORCH_CSPRNG_HOST_DEVICE uint8_t Multiply(uint8_t x, uint8_t y)
+{
+  // XOR together the repeatedly xtime()-ed copies of x selected by bits 0..4 of y.
+  // The visible caller (InvMixColumns) passes y <= 0x0e, so the bit-4 term is always zero there.
+  const uint8_t x2 = xtime(x), x4 = xtime(x2), x8 = xtime(x4), x16 = xtime(x8);
+  return ((y & 1) * x) ^ ((y >> 1 & 1) * x2) ^ ((y >> 2 & 1) * x4) ^
+         ((y >> 3 & 1) * x8) ^ ((y >> 4 & 1) * x16);
+}
+
+// InvMixColumns transforms each 4-byte group (*state)[i] of the state in place:
+// every output byte is the XOR of the four input bytes, each multiplied (see Multiply)
+// by one of the constants 0x0e, 0x0b, 0x0d, 0x09 in a rotated order.
+TORCH_CSPRNG_HOST_DEVICE void InvMixColumns(state_t* state)
+{
+  for (int col = 0; col < 4; ++col)
+  {
+    uint8_t* cell = (*state)[col];
+    // Snapshot all four inputs first: each output depends on every original byte.
+    const uint8_t s0 = cell[0];
+    const uint8_t s1 = cell[1];
+    const uint8_t s2 = cell[2];
+    const uint8_t s3 = cell[3];
+
+    cell[0] = Multiply(s0, 0x0e) ^ Multiply(s1, 0x0b) ^ Multiply(s2, 0x0d) ^ Multiply(s3, 0x09);
+    cell[1] = Multiply(s0, 0x09) ^ Multiply(s1, 0x0e) ^ Multiply(s2, 0x0b) ^ Multiply(s3, 0x0d);
+    cell[2] = Multiply(s0, 0x0d) ^ Multiply(s1, 0x09) ^ Multiply(s2, 0x0e) ^ Multiply(s3, 0x0b);
+    cell[3] = Multiply(s0, 0x0b) ^ Multiply(s1, 0x0d) ^ Multiply(s2, 0x09) ^ Multiply(s3, 0x0e);
+  }
+}
+
+// InvSubBytes replaces every byte of the state, in place, with the
+// rsbox entry it indexes (looked up through getSBoxInvert).
+TORCH_CSPRNG_HOST_DEVICE void InvSubBytes(state_t* state)
+{
+  for (uint8_t row = 0; row < 4; ++row)
+  {
+    for (uint8_t col = 0; col < 4; ++col)
+    {
+      uint8_t& cell = (*state)[col][row];
+      cell = getSBoxInvert(cell);
+    }
+  }
+}
+
+TORCH_CSPRNG_HOST_DEVICE void InvShiftRows(state_t* state)
+{
+  uint8_t temp;
+  // Rotates rows 1..3 in place, treating the state as [column][row]; entries with second index 0 are untouched.
+  // Row 1: rotate one column to the right
+  temp = (*state)[3][1];
+  (*state)[3][1] = (*state)[2][1];
+  (*state)[2][1] = (*state)[1][1];
+  (*state)[1][1] = (*state)[0][1];
+  (*state)[0][1] = temp;
+
+  // Row 2: rotate two columns (swap columns 0<->2 and 1<->3)
+  temp = (*state)[0][2];
+  (*state)[0][2] = (*state)[2][2];
+  (*state)[2][2] = temp;
+
+  temp = (*state)[1][2];
+  (*state)[1][2] = (*state)[3][2];
+  (*state)[3][2] = temp;
+
+  // Row 3: rotate three columns to the right (equivalently one column to the left)
+  temp = (*state)[0][3];
+  (*state)[0][3] = (*state)[1][3];
+  (*state)[1][3] = (*state)[2][3];
+  (*state)[2][3] = (*state)[3][3];
+  (*state)[3][3] = temp;
+}
+
 TORCH_CSPRNG_HOST_DEVICE void encrypt(uint8_t* state, const uint8_t* key) {
   uint8_t RoundKey[176];
   KeyExpansion(RoundKey, key); 
@@ -284,4 +368,29 @@ TORCH_CSPRNG_HOST_DEVICE void encrypt(uint8_t* state, const uint8_t* key) {
   AddRoundKey(Nr, (state_t*)state, RoundKey);
 }
 
+TORCH_CSPRNG_HOST_DEVICE void decrypt(uint8_t* state, const uint8_t* key) {
+  uint8_t RoundKey[176]; // expanded key schedule, filled by KeyExpansion below
+  KeyExpansion(RoundKey, key);
+  // Decrypts `state` in place; it is reinterpreted as a state_t (4x4 bytes) by every step below.
+  uint8_t round = 0;
+
+  // Start with round key number Nr: decryption walks the key schedule backwards.
+  AddRoundKey(Nr, (state_t*)state, RoundKey);
+
+  // Rounds Nr-1 down to 0: InvShiftRows, InvSubBytes, AddRoundKey(round),
+  // then InvMixColumns for every round except round 0.
+  // The loop has no condition because `round` is unsigned; the only exit is
+  // the explicit break at round 0, placed before InvMixColumns.
+  for (round = (Nr - 1); ; --round)
+  {
+    InvShiftRows((state_t*)state);
+    InvSubBytes((state_t*)state);
+    AddRoundKey(round, (state_t*)state, RoundKey);
+    if (round == 0) {
+      break;
+    }
+    InvMixColumns((state_t*)state);
+  }
+}
+
 }}}
diff --git a/torchcsprng/csrc/block_cipher.h b/torchcsprng/csrc/block_cipher.h
index 2ca313f..8418e3d 100644
--- a/torchcsprng/csrc/block_cipher.h
+++ b/torchcsprng/csrc/block_cipher.h
@@ -30,144 +30,173 @@
 namespace torch {
 namespace csprng {
 
-// Generates `block_t_size`-bytes random key Tensor on CPU 
-// using `generator`, which must be an instance of `at::CPUGeneratorImpl`
-// and passes it to the `device`.
-template<typename RNG>
-at::Tensor key_tensor(size_t block_t_size, c10::optional<at::Generator> generator) {
-  std::lock_guard<std::mutex> lock(generator->mutex());
-  auto gen = at::check_generator<RNG>(generator);
-  if (gen->key().defined()) {
-    return gen->key().clone();
-  }
-  auto t = torch::empty({static_cast<signed long>(block_t_size)}, torch::kUInt8);
-  using random_t = uint32_t;
-  constexpr size_t random_t_size = sizeof(random_t);
-  for (size_t i = 0; i < block_t_size / random_t_size; i++) {
-    const auto rand = gen->random();
-    for (size_t j = 0; j < random_t_size; j++) {
-      size_t k = i * random_t_size + j;
-      t[k] = static_cast<uint8_t>((rand >> (j * 8)) & 0xff);
+template<typename input_index_calc_t>
+TORCH_CSPRNG_HOST_DEVICE static void copy_input_to_block(int64_t idx, uint8_t* block, int block_size,
+    void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc) {
+  for (auto i = 0; i < block_size / input_type_size; ++i) { // one iteration per element slot of block number idx
+    const auto linear_index = idx * (block_size / input_type_size) + i;
+    if (linear_index < input_numel) { // slots past the end of the input keep the bytes already in `block`
+      std::memcpy(
+          block + i * input_type_size,
+          &(reinterpret_cast<uint8_t*>(input_ptr)[input_index_calc(linear_index)]), // index_calc result is used as a byte offset
+          input_type_size
+      );
     }
   }
-  return t;
 }
 
-// A simple container for random state sub-blocks that implements RNG interface 
-// with random() and random64() methods, that are used by transformation function
-template<size_t size>
-struct RNGValues {
-  TORCH_CSPRNG_HOST_DEVICE RNGValues(uint64_t* vals) {
-    memcpy(&vals_, vals, size * sizeof(uint64_t));
-  }
-  uint32_t TORCH_CSPRNG_HOST_DEVICE random() { auto res = static_cast<uint32_t>(vals_[index]); index++; return res; }
-  uint64_t TORCH_CSPRNG_HOST_DEVICE random64() { auto res = vals_[index]; index++; return res; }
-private:
-  uint64_t vals_[size];
-  int index = 0;
-};
-
-// Runs a block cipher in a counter mode in approximately `numel / (block_t_size / sizeof(uint_t) / N)` CUDA threads,
-// without any assumption about target tensor layout. It uses `index_calc` to find memory locations of
-// the tensor elements.
-// `scalar_t`       is a scalar type equivalent of target tensor dtype
-// `uint_t`         is an unsigned integral type of sub-blocks that random state is divided to
-//                  (e.g, 16 bytes random state block can be divided into 16 uint8_t sub-blocks 
-//                  or 8 uint16_t sub-block or 4 uint32_t sub-block or 2 uint64_t sub-blocks)
-// `N`              is a number of sub-block which is used by `transform_func` 
-//                  to generate a random value of specific distribution (e.g. `normal` uses 2)
-// `numel`          is a number of elements in target tensor
-// `block_t_size`   is a number of bytes in cipher's block (e.g. 16 for AES128)
-// `cipher`         is a callable that receives a counter `idx` and returns an encrypted block
-// `transform_func` is a callable that converts N `uint_t` random state sub-blocks passed in RNGValues into target dtype `scalar_t`
-template<typename scalar_t, typename uint_t, size_t N = 1, typename cipher_t, typename transform_t, typename index_calc_t>
-TORCH_CSPRNG_HOST_DEVICE static void block_cipher_kernel_helper(int idx, scalar_t* data, int64_t numel, size_t block_t_size, cipher_t cipher, transform_t transform_func, index_calc_t index_calc) {
-  const int unroll_factor = block_t_size / sizeof(uint_t) / N;
-  if (unroll_factor * idx < numel) {
-    auto block = cipher(idx);
-    UNROLL_IF_CUDA
-    for (auto i = 0; i < unroll_factor; ++i) {
-      const auto li = unroll_factor * idx + i;
-      if (li < numel) {
-        uint64_t vals[N];
-        UNROLL_IF_CUDA
-        for (size_t j = 0; j < N; j++) {
-          vals[j] = (reinterpret_cast<uint_t*>(&block))[N * i + j];
-        }
-        RNGValues<N> rng(vals);
-        data[index_calc(li)] = transform_func(&rng);
-      }
+template<typename output_index_calc_t>
+TORCH_CSPRNG_HOST_DEVICE static void copy_block_to_output(int64_t idx, uint8_t* block, int output_elem_per_block,
+    void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc) {
+  for (auto i = 0; i < output_elem_per_block; ++i) { // one iteration per output element produced by block number idx
+    const auto linear_index = idx * output_elem_per_block + i;
+    if (linear_index < output_numel) { // elements past the end of the output are dropped
+      std::memcpy(
+          &(reinterpret_cast<uint8_t*>(output_ptr)[output_index_calc(linear_index)]), // index_calc result is used as a byte offset
+          block + i * output_type_size,
+          output_type_size
+      );
     }
   }
 }
 
+template<int block_size, typename cipher_t, typename input_index_calc_t, typename output_index_calc_t, typename transform_t>
+TORCH_CSPRNG_HOST_DEVICE static void block_cipher_kernel_helper(
+    int64_t idx, cipher_t cipher, int output_elem_per_block,
+    void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc,
+    void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc,
+    transform_t transform) {
+  // Handles block number `idx`: gather input bytes, run `cipher`, run `transform`, scatter to the output.
+  uint8_t block[block_size] = {}; // zero-filled, so missing input bytes act as zero padding (author's open question: is that ok?)
+  if (input_ptr) {
+    copy_input_to_block(idx, block, block_size, input_ptr, input_numel, input_type_size, input_index_calc);
+  }
+  cipher(idx, block);
+  transform(block);
+  copy_block_to_output(idx, block, output_elem_per_block, output_ptr, output_numel, output_type_size, output_index_calc);
+}
+
 #if defined(__CUDACC__) || defined(__HIPCC__)
-template<typename scalar_t, typename uint_t, size_t N = 1, typename cipher_t, typename transform_t, typename index_calc_t>
-__global__ static void block_cipher_kernel_cuda(scalar_t* data, int64_t numel, int block_t_size, cipher_t cipher, transform_t transform_func, index_calc_t index_calc) {
+template<int block_size, typename cipher_t, typename input_index_calc_t, typename output_index_calc_t, typename transform_t>
+__global__ static void block_cipher_kernel_cuda(cipher_t cipher, int output_elem_per_block,
+    void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc,
+    void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc,
+    transform_t transform) {
   const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
-  block_cipher_kernel_helper<scalar_t, uint_t, N>(idx, data, numel, block_t_size, cipher, transform_func, index_calc);
+  block_cipher_kernel_helper<block_size>(idx, cipher, output_elem_per_block, // each thread handles the cipher block numbered by its global thread index
+    input_ptr, input_numel, input_type_size, input_index_calc,
+    output_ptr, output_numel, output_type_size, output_index_calc,
+    transform);
 }
 #endif
 
-template<typename scalar_t, typename uint_t, size_t N = 1, typename cipher_t, typename transform_t, typename index_calc_t>
-static void block_cipher_kernel_cpu_serial(int64_t begin, int64_t end, scalar_t* data, int64_t numel, int block_t_size, cipher_t cipher, transform_t transform_func, index_calc_t index_calc) {
+template<int block_size, typename cipher_t, typename input_index_calc_t, typename output_index_calc_t, typename transform_t>
+static void block_cipher_kernel_cpu_serial(int64_t begin, int64_t end, cipher_t cipher, int output_elem_per_block,
+    void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc,
+    void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc,
+    transform_t transform) {
   for (auto idx = begin; idx < end; ++idx) {
-    block_cipher_kernel_helper<scalar_t, uint_t, N>(idx, data, numel, block_t_size, cipher, transform_func, index_calc);
+    block_cipher_kernel_helper<block_size>(idx, cipher, output_elem_per_block, // block indices [begin, end) are processed one after another
+      input_ptr, input_numel, input_type_size, input_index_calc,
+      output_ptr, output_numel, output_type_size, output_index_calc,
+      transform);
   }
 }
 
-template<typename scalar_t, typename uint_t, size_t N = 1, typename cipher_t, typename transform_t, typename index_calc_t>
-static void block_cipher_kernel_cpu(int64_t total, scalar_t* data, int64_t numel, int block_t_size, cipher_t cipher, transform_t transform_func, index_calc_t index_calc) {
+template<int block_size, typename cipher_t, typename input_index_calc_t, typename output_index_calc_t, typename transform_t>
+static void block_cipher_kernel_cpu(int64_t total, cipher_t cipher, int output_elem_per_block,
+    void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc,
+    void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc,
+    transform_t transform_func) {
   if (total < at::internal::GRAIN_SIZE || at::get_num_threads() == 1) {
-    block_cipher_kernel_cpu_serial<scalar_t, uint_t, N>(0, total, data, numel, block_t_size, cipher, transform_func, index_calc);
+    block_cipher_kernel_cpu_serial<block_size>(0, total, cipher, output_elem_per_block, // small job or single thread: run all `total` blocks inline
+      input_ptr, input_numel, input_type_size, input_index_calc,
+      output_ptr, output_numel, output_type_size, output_index_calc,
+      transform_func);
   } else {
     at::parallel_for(0, total, at::internal::GRAIN_SIZE, [&](int64_t begin, int64_t end) {
-      block_cipher_kernel_cpu_serial<scalar_t, uint_t, N>(begin, end, data, numel, block_t_size, cipher, transform_func, index_calc);
+      block_cipher_kernel_cpu_serial<block_size>(begin, end, cipher, output_elem_per_block, // each parallel_for chunk runs its own [begin, end) range
+        input_ptr, input_numel, input_type_size, input_index_calc,
+        output_ptr, output_numel, output_type_size, output_index_calc,
+        transform_func);
     });
   }
 }
 
-// Runs a block cipher in a counter mode in approximately `numel / (block_t_size / sizeof(uint_t) / N)` CUDA threads.
-// Each CUDA thread generates `block_t_size`-bytes random state and divides it into `block_t_size / sizeof(uint_t)` sub-blocks.
-// Then `transform_func` transforms `N` random state sub-blocks passed in a `RNGValues` to final random values of type `scalar_t`.
-template<typename scalar_t, typename uint_t, size_t N = 1, typename cipher_t, typename transform_t>
-void block_cipher_ctr_mode(at::TensorIterator& iter, int block_t_size, cipher_t cipher, transform_t transform_func) {
-  const auto numel = iter.numel();
-  if (numel == 0) {
+template<int block_size, typename cipher_t, typename input_index_calc_t, typename output_index_calc_t, typename transform_t>
+void block_cipher(
+    void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc,
+    void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc,
+    Device device, cipher_t cipher, int output_elem_per_block, transform_t transform_func) {
+  if (output_ptr == nullptr || output_numel == 0) {
     return;
   }
-  const int unroll_factor = block_t_size / sizeof(uint_t) / N;
-  const auto block = 256;
-  const auto grid = (numel + (block * unroll_factor) - 1) / (block * unroll_factor);
-  scalar_t* data = (scalar_t*)iter.data_ptr(0);
-  auto offset_calc = make_offset_calculator<1>(iter);
-  auto index_calc_identity = [] TORCH_CSPRNG_HOST_DEVICE (int li) -> int { return li; };
-  auto index_calc_offset = [offset_calc] TORCH_CSPRNG_HOST_DEVICE (int li) -> int { return offset_calc.get(li)[0] / sizeof(scalar_t); };
-  if (iter.device_type() == at::kCPU) {
-    if (iter.output(0).is_contiguous()) {
-      block_cipher_kernel_cpu<scalar_t, uint_t, N, cipher_t, transform_t>(
-        grid * block, data, numel, block_t_size, cipher, transform_func, index_calc_identity);
-    } else {
-      block_cipher_kernel_cpu<scalar_t, uint_t, N, cipher_t, transform_t>(
-        grid * block, data, numel, block_t_size, cipher, transform_func, index_calc_offset);
-    }
-  } else if (iter.device_type() == at::kCUDA) {
+  // Dispatch by device. CPU: `total` is the number of cipher blocks. CUDA: one thread per cipher block, 256 threads per thread block.
+  if (device.type() == at::kCPU) {
+    const auto total = (output_numel + output_elem_per_block - 1) / output_elem_per_block;
+    block_cipher_kernel_cpu<block_size>(total,
+        cipher, output_elem_per_block,
+        input_ptr, input_numel, input_type_size, input_index_calc,
+        output_ptr, output_numel, output_type_size, output_index_calc,
+        transform_func
+    );
+  } else if (device.type() == at::kCUDA) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
+    const auto threads = 256;
+    const auto grid = (output_numel + (threads * output_elem_per_block) - 1) / (threads * output_elem_per_block);
     auto stream = at::cuda::getCurrentCUDAStream();
-    if (iter.output(0).is_contiguous()) {
-      block_cipher_kernel_cuda<scalar_t, uint_t, N, cipher_t, transform_t><<<grid, block, 0, stream>>>(
-        data, numel, block_t_size, cipher, transform_func, index_calc_identity);
-    } else {
-      block_cipher_kernel_cuda<scalar_t, uint_t, N, cipher_t, transform_t><<<grid, block, 0, stream>>>(
-        data, numel, block_t_size, cipher, transform_func, index_calc_offset);
-    }
+    block_cipher_kernel_cuda<block_size><<<grid, threads, 0, stream>>>(
+        cipher, output_elem_per_block,
+        input_ptr, input_numel, input_type_size, input_index_calc,
+        output_ptr, output_numel, output_type_size, output_index_calc,
+        transform_func
+    );
     AT_CUDA_CHECK(cudaGetLastError());
 #else
-    TORCH_CHECK(false, "csprng was compiled without CUDA support");
+    TORCH_CHECK(false, "torchcsprng was compiled without CUDA support");
 #endif
   } else {
-    TORCH_CHECK(false, "block_cipher_ctr_mode supports only CPU and CUDA devices");
+    TORCH_CHECK(false, "block_cipher supports only CPU and CUDA devices");
   }
 }
 
+template<int block_size, typename cipher_t>
+void block_cipher(Tensor input, Tensor output, cipher_t cipher) {
+  const auto input_ptr = input.data_ptr();
+  const auto input_numel = input.numel();
+  // Tensor-level entry point: builds byte-offset calculators for `input` and `output`, then runs `cipher` block by block.
+  // Otherwise OffsetCalculator/IntDivider crashes with integer division by zero
+  if (input_ptr == nullptr || input_numel == 0) {
+    return;
+  }
+
+  const auto input_type_size = input.element_size();
+  const auto input_offset_calc = make_offset_calculator<1>(TensorIterator::nullary_op(input));
+  const auto input_index_calc = [input_offset_calc] TORCH_CSPRNG_HOST_DEVICE (uint32_t li) -> uint32_t {
+    return input_offset_calc.get(li)[0];
+  };
+
+  const auto output_ptr = output.data_ptr();
+  const auto output_numel = output.numel();
+
+  // Otherwise OffsetCalculator/IntDivider crashes with integer division by zero
+  if (output_ptr == nullptr || output_numel == 0) {
+    return;
+  }
+
+  const auto output_type_size = output.element_size();
+  const auto output_offset_calc = make_offset_calculator<1>(TensorIterator::nullary_op(output));
+  const auto output_index_calc = [output_offset_calc] TORCH_CSPRNG_HOST_DEVICE (uint32_t li) -> uint32_t {
+    return output_offset_calc.get(li)[0];
+  };
+
+  const auto device = output.device();
+  // Elements per block follow the output dtype; the empty lambda means no post-cipher transform is applied.
+  block_cipher<block_size>(
+      input_ptr, input_numel, input_type_size, input_index_calc,
+      output_ptr, output_numel, output_type_size, output_index_calc,
+      device, cipher, block_size / output_type_size,
+      [] TORCH_CSPRNG_HOST_DEVICE (uint8_t* x) {});
+}
+
 }}
diff --git a/torchcsprng/csrc/csprng.h b/torchcsprng/csrc/csprng.h
index f1fa0f9..cf31c10 100644
--- a/torchcsprng/csrc/csprng.h
+++ b/torchcsprng/csrc/csprng.h
@@ -57,6 +57,29 @@ struct CSPRNGGeneratorImpl : public c10::GeneratorImpl {
   Tensor key_;
 };
 
+// Returns a `block_t_size`-byte uint8 key tensor for `generator`, which at::check_generator
+// validates as an `RNG`. If the generator already holds a key, a clone of it is returned;
+// otherwise the bytes are filled from successive gen->random() values, lowest byte first.
+template<typename RNG>
+at::Tensor key_tensor(size_t block_t_size, c10::optional<at::Generator> generator) {
+  std::lock_guard<std::mutex> lock(generator->mutex()); // NOTE(review): dereferences the optional before check_generator runs
+  auto gen = at::check_generator<RNG>(generator);
+  if (gen->key().defined()) {
+    return gen->key().clone();
+  }
+  auto key = torch::empty({static_cast<signed long>(block_t_size)}, torch::kUInt8);
+  using random_t = typename std::result_of<decltype(&RNG::random)(RNG)>::type; // whatever type RNG::random() returns
+  constexpr size_t random_t_size = sizeof(random_t);
+  for (size_t i = 0; i < block_t_size / random_t_size; i++) { // bytes beyond a whole multiple of random_t_size are not written
+    const auto rand = gen->random();
+    for (size_t j = 0; j < random_t_size; j++) {
+      size_t k = i * random_t_size + j;
+      key[k] = static_cast<uint8_t>((rand >> (j * 8)) & 0xff);
+    }
+  }
+  return key;
+}
+
 template<typename RNG>
 Tensor aes128_key_tensor(Generator generator) {
   return key_tensor<RNG>(aes::block_t_size, generator);
@@ -64,6 +87,20 @@ Tensor aes128_key_tensor(Generator generator) {
 
 // ====================================================================================================================
 
+// Fixed-size buffer of `size` uint64_t values exposing random()/random64(), so it can be
+// handed to a transformation function in place of a generator (aes_helper passes `&rng`).
+template<size_t size>
+struct RNGValues {
+  TORCH_CSPRNG_HOST_DEVICE RNGValues(uint64_t* vals) {
+    memcpy(&vals_, vals, size * sizeof(uint64_t)); // copies exactly `size` values out of `vals`
+  }
+  uint32_t TORCH_CSPRNG_HOST_DEVICE random() { auto res = static_cast<uint32_t>(vals_[index]); index++; return res; } // truncates to 32 bits
+  uint64_t TORCH_CSPRNG_HOST_DEVICE random64() { auto res = vals_[index]; index++; return res; }
+private:
+  uint64_t vals_[size];
+  int index = 0; // position of the next value to hand out; not bounds-checked
+};
+
 // Applies AES in CTR mode with the `key` for passed TensorIterator iter.
 // `scalar_t`       is a scalar type equivalent of target tensor dtype
 // `uint_t`         is an unsigned integral type of sub-blocks that random state is divided to
@@ -74,16 +111,37 @@ Tensor aes128_key_tensor(Generator generator) {
 // `key`            is a CUDA pointer to random key memory block
 // `transform_func` is a callable that converts N `uint_t` random state sub-blocks passed in RNGValues into target dtype `scalar_t`
 template<typename scalar_t, typename uint_t, size_t N = 1, typename transform_t>
-void aes_helper(TensorIterator& iter, const uint8_t* key, transform_t transform_func) {
-  block_cipher_ctr_mode<scalar_t, uint_t, N>(iter, aes::block_t_size,
-    [key] TORCH_CSPRNG_HOST_DEVICE (unsigned int idx) -> aes::block_t {
-      aes::block_t block;
-      memset(&block, 0, aes::block_t_size);
-      block.x = idx;
-      aes::encrypt(reinterpret_cast<uint8_t*>(&block), key);
-      return block;
+void aes_helper(TensorIterator& iter, const uint8_t* key_bytes, transform_t transform_func) {
+  auto output = iter.tensor(0);
+  const auto output_offset_calc = make_offset_calculator<1>(TensorIterator::nullary_op(output));
+  const auto output_index_calc = [output_offset_calc] TORCH_CSPRNG_HOST_DEVICE (uint32_t li) -> uint32_t {
+    return output_offset_calc.get(li)[0];
+  };
+  block_cipher<aes::block_t_size>(
+    nullptr, 0, 0, output_index_calc, // no input tensor: blocks start zeroed, so the XOR below yields the raw encrypted counter
+    output.data_ptr(), output.numel(), output.element_size(), output_index_calc,
+    iter.device_type(),
+    [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void {
+      uint8_t idx_block[aes::block_t_size];
+      std::memset(&idx_block, 0, aes::block_t_size);
+      std::memcpy(idx_block, &idx, sizeof(idx)); // byte copy: idx_block has no int64_t alignment guarantee, a cast-and-store is UB
+      aes::encrypt(idx_block, key_bytes);
+      for (size_t i = 0; i < aes::block_t_size; i++) {
+        block[i] ^= idx_block[i];
+      }
     },
-    transform_func
+    aes::block_t_size / (N * sizeof(uint_t)),
+    [transform_func] TORCH_CSPRNG_HOST_DEVICE (uint8_t* block) {
+      const auto n = aes::block_t_size / (N * sizeof(uint_t)); // number of output values produced per cipher block
+      for (size_t i = 0; i < n; ++i) {
+        uint64_t vals[N];
+        for (size_t j = 0; j < N; ++j) {
+          vals[j] = (reinterpret_cast<uint_t*>(block))[N * i + j];
+        }
+        RNGValues<N> rng(vals);
+        reinterpret_cast<scalar_t*>(block)[i] = transform_func(&rng); // NOTE(review): in-place; assumes sizeof(scalar_t) <= N * sizeof(uint_t) - confirm for all callers
+      }
+    }
   );
 }
 
@@ -151,7 +209,7 @@ struct RandomFromToKernel {
         std::is_same<scalar_t, int64_t>::value ||
         std::is_same<scalar_t, double>::value ||
         std::is_same<scalar_t, float>::value ||
-        std::is_same<scalar_t, at::BFloat16>::value) && range >= 1ULL << 32)
+        std::is_same<scalar_t, at::BFloat16>::value)/* TODO: && range >= 1ULL << 32*/)
       {
         random_from_to_kernel_helper<scalar_t, uint64_t>(iter, range, base, key);
       } else {
@@ -416,6 +474,89 @@ Tensor& randperm_generator_out(Tensor& result, int64_t n, c10::optional<Generato
 
 // ====================================================================================================================
 
+// Validates the cipher name and the key size; raises through TORCH_CHECK on failure.
+void check_cipher(const std::string& cipher, Tensor key) {
+  // Only "aes128" is accepted; any other name is rejected before the key is inspected.
+  TORCH_CHECK(cipher == "aes128", "encrypt/decrypt supports \"aes128\" cipher, \"", cipher, "\" is not supported.");
+  const auto key_size_bytes = key.element_size() * key.numel();
+  TORCH_CHECK(key_size_bytes == 16, "key tensor must have 16 bytes(128 bits)");
+}
+
+// ECB mode: every block of `input` is encrypted independently with aes::encrypt and written to `output`.
+void aes_ecb_encrypt(Tensor input, Tensor output, uint8_t* key_bytes) {
+  const auto encrypt_block = [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t /*idx*/, uint8_t* block) -> void {
+    aes::encrypt(block, key_bytes);
+  };
+  block_cipher<aes::block_t_size>(input, output, encrypt_block);
+}
+
+// ECB mode: every block of `input` is decrypted independently with aes::decrypt and written to `output`.
+void aes_ecb_decrypt(Tensor input, Tensor output, uint8_t* key_bytes) {
+  const auto decrypt_block = [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t /*idx*/, uint8_t* block) -> void {
+    aes::decrypt(block, key_bytes);
+  };
+  block_cipher<aes::block_t_size>(input, output, decrypt_block);
+}
+
+// Counter mode: each block of `input` is XORed with the AES encryption of its block index `idx`.
+void aes_ctr_encrypt(Tensor input, Tensor output, uint8_t* key_bytes) {
+  block_cipher<aes::block_t_size>(input, output,
+    [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void {
+      uint8_t idx_block[aes::block_t_size] = {}; // counter block: zero-padded idx, no nonce/IV is mixed in
+      std::memcpy(idx_block, &idx, sizeof(idx)); // byte copy: idx_block has no int64_t alignment guarantee, a cast-and-store is UB
+      aes::encrypt(idx_block, key_bytes);
+      for (size_t i = 0; i < aes::block_t_size; i++) {
+        block[i] ^= idx_block[i];
+      }
+    }
+  );
+}
+
+void aes_ctr_decrypt(Tensor input, Tensor output, uint8_t* key_bytes) { // CTR decryption applies the same XOR as encryption
+  aes_ctr_encrypt(input, output, key_bytes);
+}
+
+// Python binding: encrypts `input` into `output` with `key` and returns `output`. `output` must hold
+// exactly the input bytes rounded up to whole cipher blocks; invalid arguments raise through TORCH_CHECK.
+Tensor encrypt_pybind(Tensor input, Tensor output, Tensor key, const std::string& cipher, const std::string& mode) {
+  TORCH_CHECK(input.device() == output.device() && input.device() == key.device(), "input, output and key tensors must have the same device");
+  const auto output_size_bytes = output.numel() * output.itemsize();
+  const auto input_size_bytes = input.numel() * input.itemsize();
+  const auto input_size_bytes_rounded = (input_size_bytes + aes::block_t_size - 1) / aes::block_t_size * aes::block_t_size;
+  TORCH_CHECK(output_size_bytes == input_size_bytes_rounded, "output size in bytes(", output_size_bytes,
+              ") is not equal to input size in bytes rounded to block size(", input_size_bytes_rounded, ")");
+  check_cipher(cipher, key);
+  key = key.contiguous(); // rebind, not a temporary: a freshly allocated contiguous copy must outlive key_bytes
+  const auto key_bytes = reinterpret_cast<uint8_t*>(key.data_ptr());
+  TORCH_CHECK(mode == "ecb" || mode == "ctr", "encrypt/decrypt supports \"ecb\" and \"ctr\" modes, \"", mode, "\" is not supported.");
+  if (mode == "ecb") {
+    aes_ecb_encrypt(input, output, key_bytes);
+  } else {
+    aes_ctr_encrypt(input, output, key_bytes);
+  }
+  return output;
+}
+
+// Python binding: decrypts `input` into the equally sized `output` with `key` and returns `output`.
+Tensor decrypt_pybind(Tensor input, Tensor output, Tensor key, const std::string& cipher, const std::string& mode) {
+  TORCH_CHECK(input.device() == output.device() && input.device() == key.device(), "input, output and key tensors must have the same device");
+  const auto input_size_bytes = input.numel() * input.itemsize();
+  TORCH_CHECK(output.numel() * output.itemsize() == input_size_bytes, "input and output tensors must have the same size in byte");
+  TORCH_CHECK(input_size_bytes % aes::block_t_size == 0, "input tensor size in bytes must divisible by cipher block size in bytes");
+  check_cipher(cipher, key);
+  key = key.contiguous(); // rebind, not a temporary: a freshly allocated contiguous copy must outlive key_bytes
+  const auto key_bytes = reinterpret_cast<uint8_t*>(key.data_ptr());
+  TORCH_CHECK(mode == "ecb" || mode == "ctr", "encrypt/decrypt supports \"ecb\" and \"ctr\" modes, \"", mode, "\" is not supported.");
+  if (mode == "ecb") {
+    aes_ecb_decrypt(input, output, key_bytes);
+  } else {
+    aes_ctr_decrypt(input, output, key_bytes);
+  }
+  return output;
+}
+
+// ====================================================================================================================
+
 Generator create_random_device_generator(c10::optional<std::string> token = c10::nullopt) {
   if (token.has_value()) {
     return make_generator<CSPRNGGeneratorImpl>(*token);
@@ -481,4 +622,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("create_mt19937_generator", &create_mt19937_generator, py::arg("seed") = nullptr);
   m.def("aes128_key_tensor", &aes128_key_tensor_pybind);
   m.def("create_const_generator", &create_const_generator);
+  m.def("encrypt", &encrypt_pybind);
+  m.def("decrypt", &decrypt_pybind);
 }