# **Setup env**

In [1]:
!sudo apt-get update
!sudo apt-get install -y build-essential g++ libssl-dev

#check
!g++ --version
!ls /usr/include/openssl/evp.h
!nvcc --version

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:6 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:7 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:9 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:10 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,245 kB]
Get:11 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:12 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,927 kB]
Get:13 http://archive.ubuntu.com/ubuntu jammy

In [None]:
!df -h /

Filesystem      Size  Used Avail Use% Mounted on
overlay         113G   37G   76G  33% /


In [None]:
# check GPU
print('GPU T4')
!nvidia-smi
# check CPU
print('CPU')
!lscpu

GPU T4
Mon May  5 04:17:10 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   47C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                         

## **Generate file**

In [2]:
%%writefile generate_plaintext.cpp
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <stdexcept>
#include <random>

// unit changes
long long parse_size(const std::string& size_str) {
    long long multiplier = 1;
    std::string num_part = size_str;

    if (size_str.length() > 2) {
        std::string unit = size_str.substr(size_str.length() - 2);
        if (unit == "KB" || unit == "kb") {
            multiplier = 1024LL;
            num_part = size_str.substr(0, size_str.length() - 2);
        } else if (unit == "MB" || unit == "mb") {
            multiplier = 1024LL * 1024LL;
            num_part = size_str.substr(0, size_str.length() - 2);
        } else if (unit == "GB" || unit == "gb") {
            multiplier = 1024LL * 1024LL * 1024LL;
            num_part = size_str.substr(0, size_str.length() - 2);
        }else if (size_str.back() == 'B' || size_str.back() == 'b'){
             num_part = size_str.substr(0, size_str.length() - 1);
             if (num_part.empty()){ // Handle case like "B"
                return 1; // or throw error
             }
        }
    } else if (size_str.length() > 1 && (size_str.back() == 'B' || size_str.back() == 'b')) {
         num_part = size_str.substr(0, size_str.length() - 1);
         if (num_part.empty()){ // Handle case like "B"
             return 1; // or throw error
         }
    }


    try {
        long long num = std::stoll(num_part);
        return num * multiplier;
    } catch (const std::invalid_argument& e) {
        throw std::invalid_argument("Invalid size format: " + size_str);
    } catch (const std::out_of_range& e) {
         throw std::out_of_range("Size out of range: " + size_str);
    }
}

int main(int argc, char* argv[]) {
    if (argc != 3) {
        std::cerr << "Usage: " << argv[0] << " <output_filename> <size (e.g., 10MB, 512KB, 1GB)>" << std::endl;
        return 1;
    }

    std::string filename = argv[1];
    std::string size_arg = argv[2];
    long long target_size_bytes;

    try {
         target_size_bytes = parse_size(size_arg);
          if (target_size_bytes <= 0) {
             throw std::invalid_argument("Size must be positive.");
          }
    } catch (const std::exception& e) {
         std::cerr << "Error parsing size: " << e.what() << std::endl;
         return 1;
    }


    std::ofstream outfile(filename, std::ios::binary | std::ios::trunc);
    if (!outfile) {
        std::cerr << "Error opening file for writing: " << filename << std::endl;
        return 1;
    }

    // write random files data
    const size_t buffer_size = 4096; //chunk
    std::vector<char> buffer(buffer_size);
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<> dis(0, 255);
    long long bytes_written = 0;
    while (bytes_written < target_size_bytes) {
        long long remaining = target_size_bytes - bytes_written;
        size_t to_write = (remaining < buffer_size) ? static_cast<size_t>(remaining) : buffer_size;
        for (size_t i = 0; i < to_write; ++i) {
        buffer[i] = static_cast<char>(dis(gen));
        }
        outfile.write(buffer.data(), to_write);
        if (!outfile) {
             std::cerr << "Error writing to file: " << filename << std::endl;
             outfile.close();
             return 1; // if (error) -> break
        }
        bytes_written += to_write;
    }

    outfile.close();
    std::cout << "Successfully generated file '" << filename << "' with approximately " << size_arg << " (" << bytes_written << " bytes)." << std::endl;

    return 0;
}

Writing generate_plaintext.cpp


In [4]:
# compile
!g++ generate_plaintext.cpp -o generate_plaintext -std=c++11

# generate files
!./generate_plaintext plaintext_512KB.bin 512KB
!./generate_plaintext plaintext_10MB.bin 10MB
!./generate_plaintext plaintext_100MB.bin 100MB
!./generate_plaintext plaintext_500MB.bin 500MB
!./generate_plaintext plaintext_1GB.bin 1GB
!./generate_plaintext plaintext_3GB.bin 3GB

# check list files
!ls -lh plaintext_*.bin

Successfully generated file 'plaintext_512KB.bin' with approximately 512KB (524288 bytes).
Successfully generated file 'plaintext_10MB.bin' with approximately 10MB (10485760 bytes).
Successfully generated file 'plaintext_100MB.bin' with approximately 100MB (104857600 bytes).
Successfully generated file 'plaintext_500MB.bin' with approximately 500MB (524288000 bytes).
Successfully generated file 'plaintext_1GB.bin' with approximately 1GB (1073741824 bytes).
Successfully generated file 'plaintext_3GB.bin' with approximately 3GB (3221225472 bytes).
-rw-r--r-- 1 root root 100M May 12 03:41 plaintext_100MB.bin
-rw-r--r-- 1 root root  10M May 12 03:41 plaintext_10MB.bin
-rw-r--r-- 1 root root 1.0G May 12 03:42 plaintext_1GB.bin
-rw-r--r-- 1 root root 3.0G May 12 03:45 plaintext_3GB.bin
-rw-r--r-- 1 root root 500M May 12 03:41 plaintext_500MB.bin
-rw-r--r-- 1 root root 512K May 12 03:41 plaintext_512KB.bin


# **AES 128 Counter Mode với OPenMP**

In [5]:
%%writefile aes_ctr_openssl.cpp
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <chrono>
#include <stdexcept>
#include <cstring> // for memcpy, memset

// OpenSSL headers
#include <openssl/evp.h>
#include <openssl/err.h>
#include <openssl/rand.h> // random intialization vector - IV

// OpenMP header
#ifdef _OPENMP
#include <omp.h>
#endif

// Configuration
const int AES_KEY_SIZE = 128 / 8;
const int AES_BLOCK_SIZE = 128 / 8;

// Function OpenSSL error
void handle_openssl_errors() {
    ERR_print_errors_fp(stderr);
    throw std::runtime_error("OpenSSL error occurred.");
}

// Counter up
void increment_counter(unsigned char* counter) {
    for (int i = AES_BLOCK_SIZE - 1; i >= 0; --i) {
        if (++counter[i] != 0) {
            break;
        }
    }
}

// Function AES-128 CTR
// is_parallel: true -> parallel with OpenMP
void aes_128_ctr_encrypt_decrypt(const std::vector<unsigned char>& input,
                                 std::vector<unsigned char>& output,
                                 const unsigned char* key,
                                 const unsigned char* iv_counter_start, // IV
                                 bool is_parallel,
                                 int num_threads = 0)
{
    size_t input_len = input.size();
    if (input_len == 0) {
        output.resize(0);
        return;
    }
    output.resize(input_len);

    size_t num_blocks = (input_len + AES_BLOCK_SIZE - 1) / AES_BLOCK_SIZE;

    #ifdef _OPENMP
    if (is_parallel) {
        if (num_threads > 0) {
            omp_set_num_threads(num_threads);
        }
        std::cout << "Running with OpenMP, max threads: " << omp_get_max_threads() << std::endl;
    } else {
         std::cout << "Running sequentially (OpenMP disabled for this run)." << std::endl;
    }
    #else
    if (is_parallel) {
        std::cout << "Warning: Compiled without OpenMP support, running sequentially." << std::endl;
    } else {
         std::cout << "Running sequentially (no OpenMP)." << std::endl;
    }
    is_parallel = false; // senquential ifnot OpenMP
    #endif

    // Main
    #pragma omp parallel if(is_parallel)
    {
        EVP_CIPHER_CTX* ctx = EVP_CIPHER_CTX_new();
        if (!ctx) handle_openssl_errors();

        // intialize cipher AES-128 ECB
        if (1 != EVP_EncryptInit_ex(ctx, EVP_aes_128_ecb(), NULL, key, NULL)) {
              EVP_CIPHER_CTX_free(ctx);
             handle_openssl_errors();
        }
        EVP_CIPHER_CTX_set_padding(ctx, 0);

        unsigned char current_counter[AES_BLOCK_SIZE];
        unsigned char encrypted_counter[AES_BLOCK_SIZE];
        int out_len_temp = 0;

        #pragma omp for schedule(static) // task parallel
        for (size_t block_idx = 0; block_idx < num_blocks; ++block_idx) {
            // counter caculate
            memcpy(current_counter, iv_counter_start, AES_BLOCK_SIZE);
            // Tcounter up by block_idx
             unsigned long long block_offset = block_idx;
             for (int i = AES_BLOCK_SIZE - 1; i >= 0 && block_offset > 0; --i) {
                 unsigned long long current_val = current_counter[i];
                 unsigned long long add_val = block_offset & 0xFF; // Lấy 8 bit cuối
                 unsigned long long sum = current_val + add_val;
                 current_counter[i] = static_cast<unsigned char>(sum & 0xFF);
                 block_offset >>= 8;
                 if (sum > 0xFF) {
                     block_offset++; // offset up
                 }
             }
            // Counter encryption ~ Keystream generate AES-128 ECB
             if (1 != EVP_EncryptUpdate(ctx, encrypted_counter, &out_len_temp, current_counter, AES_BLOCK_SIZE)) {
                 #pragma omp critical
                 {
                    #ifdef _OPENMP
                    std::cerr << "OpenSSL EncryptUpdate error in thread " << omp_get_thread_num() << std::endl;
                    #else
                    std::cerr << "OpenSSL EncryptUpdate error during sequential execution." << std::endl;
                    #endif
                    ERR_print_errors_fp(stderr);
                 }
                 continue;
             }

            // XOR encrypted_counter with plaintext
            size_t current_block_start = block_idx * AES_BLOCK_SIZE;
            size_t current_block_size = AES_BLOCK_SIZE;
            // the last data block
            if (current_block_start + AES_BLOCK_SIZE > input_len) {
                current_block_size = input_len - current_block_start;
            }

            for (size_t i = 0; i < current_block_size; ++i) {
                output[current_block_start + i] = input[current_block_start + i] ^ encrypted_counter[i];
            }
        } // finish omp for

        // Release the thread context
        EVP_CIPHER_CTX_free(ctx);

    } // finish omp parallel
}


// read file
std::vector<unsigned char> read_file(const std::string& filename) {
    std::ifstream infile(filename, std::ios::binary | std::ios::ate);
    if (!infile) {
        throw std::runtime_error("Cannot open file for reading: " + filename);
    }
    std::streamsize size = infile.tellg();
    infile.seekg(0, std::ios::beg);
    std::vector<unsigned char> buffer(size);
    if (!infile.read(reinterpret_cast<char*>(buffer.data()), size)) {
        throw std::runtime_error("Error reading file: " + filename);
    }
    return buffer;
}

// write file
void write_file(const std::string& filename, const std::vector<unsigned char>& data) {
    std::ofstream outfile(filename, std::ios::binary | std::ios::trunc);
    if (!outfile) {
        throw std::runtime_error("Cannot open file for writing: " + filename);
    }
    if (!outfile.write(reinterpret_cast<const char*>(data.data()), data.size())) {
         throw std::runtime_error("Error writing file: " + filename);
    }
}


int main(int argc, char* argv[]) {
    if (argc < 4 || argc > 6) {
        std::cerr << "Usage: " << argv[0] << " <input_file> <output_file> <mode> [num_threads]" << std::endl;
        std::cerr << "  mode: sequential | parallel" << std::endl;
        std::cerr << "  num_threads (optional, for parallel mode): number of threads (default: OMP default)" << std::endl;
        return 1;
    }

    std::string input_filename = argv[1];
    std::string output_filename = argv[2];
    std::string mode = argv[3];
    bool run_parallel = false;
    int num_threads = 0;

    if (mode == "parallel") {
        run_parallel = true;
        #ifndef _OPENMP
        std::cerr << "Warning: Compiled without OpenMP. Running sequentially instead." << std::endl;
        run_parallel = false;
        #else
        if (argc == 5) {
            try {
                num_threads = std::stoi(argv[4]);
                if (num_threads <= 0) {
                     std::cerr << "Warning: num_threads must be positive. Using OpenMP default." << std::endl;
                     num_threads = 0;
                }
            } catch (const std::exception& e) {
                 std::cerr << "Warning: Invalid num_threads argument. Using OpenMP default. Error: " << e.what() << std::endl;
                 num_threads = 0;
            }
        }
        #endif
    } else if (mode != "sequential") {
        std::cerr << "Error: Invalid mode '" << mode << "'. Use 'sequential' or 'parallel'." << std::endl;
        return 1;
    }

    // Key and IV (example - in practice, the key needs to be kept confidential)
    unsigned char key[AES_KEY_SIZE] = {
        0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6,
        0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c
    };
    unsigned char iv_counter[AES_BLOCK_SIZE];
    memset(iv_counter, 0x00, AES_BLOCK_SIZE);

    try {
        // read input file
        std::cout << "Reading input file: " << input_filename << "..." << std::endl;
        auto start_read = std::chrono::high_resolution_clock::now();
        std::vector<unsigned char> plaintext = read_file(input_filename);
        auto end_read = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double, std::milli> read_duration = end_read - start_read;
        std::cout << "Read " << plaintext.size() << " bytes in " << read_duration.count() << " ms." << std::endl;

        // prepare buffer output
        std::vector<unsigned char> ciphertext;
        ciphertext.reserve(plaintext.size());

        // encrypting and excuting time
        std::cout << "Starting AES-128 CTR encryption (" << mode << ")..." << std::endl;
        auto start_encrypt = std::chrono::high_resolution_clock::now();  // timer

        aes_128_ctr_encrypt_decrypt(plaintext, ciphertext, key, iv_counter, run_parallel, num_threads);

        auto end_encrypt = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double, std::milli> encrypt_duration = end_encrypt - start_encrypt;
        std::cout << "Encryption finished." << std::endl;
        std::cout << "----------------------------------------" << std::endl;
        std::cout << "Mode:          " << mode << (run_parallel ? " (Threads: " + (num_threads > 0 ? std::to_string(num_threads) : "Default") + ")" : "") << std::endl;
        std::cout << "Input size:    " << plaintext.size() << " bytes" << std::endl;
        std::cout << "Output size:   " << ciphertext.size() << " bytes" << std::endl;
        std::cout << "Encryption Time: " << encrypt_duration.count() << " ms" << std::endl;
        std::cout << "----------------------------------------" << std::endl;


        // Ghi file output
        std::cout << "Writing output file: " << output_filename << "..." << std::endl;
        auto start_write = std::chrono::high_resolution_clock::now();
        write_file(output_filename, ciphertext);
        auto end_write = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double, std::milli> write_duration = end_write - start_write;
        std::cout << "Write finished in " << write_duration.count() << " ms." << std::endl;


    } catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << std::endl;
        // Clean up OpenSSL error queue
        ERR_free_strings();
        return 1;
    }

    // Clean up OpenSSL error queue
    ERR_free_strings();
    return 0;
}

Writing aes_ctr_openssl.cpp


## **Biên dịch và chạy thực nghiệm**

### **Tuần tự**

In [6]:
!echo "--- Re-compiling Sequential with absolute path ---"
!g++ /content/aes_ctr_openssl.cpp -o aes_ctr_seq -O3 -lssl -lcrypto -std=c++11
!echo "--- Compilation Attempt Finished ---"
!ls -l aes_ctr_seq

--- Re-compiling Sequential with absolute path ---
--- Compilation Attempt Finished ---
-rwxr-xr-x 1 root root 38424 May 12 03:45 aes_ctr_seq


In [9]:
# 10MB
!echo "Running Sequential 512KB"
!./aes_ctr_seq plaintext_512KB.bin ciphertext_512KB_seq.bin sequential

# 100MB
!echo "Running Sequential 10MB"
!./aes_ctr_seq plaintext_10MB.bin ciphertext_10MB_seq.bin sequential

# 100MB
!echo "Running Sequential 100MB"
!./aes_ctr_seq plaintext_100MB.bin ciphertext_100MB_seq.bin sequential

# 500MB
!echo "Running Sequential 500MB"
!./aes_ctr_seq plaintext_500MB.bin ciphertext_500MB_seq.bin sequential

# 1GB
!echo "Running Sequential 1GB"
!./aes_ctr_seq plaintext_1GB.bin ciphertext_1GB_seq.bin sequential

# 3GB
!echo "Running Sequential 3GB"
!./aes_ctr_seq plaintext_3GB.bin ciphertext_3GB_seq.bin sequential

# file output check
!ls -lh ciphertext_*_seq.bin

Running Sequential 512KB
Reading input file: plaintext_512KB.bin...
Read 524288 bytes in 0.499976 ms.
Starting AES-128 CTR encryption (sequential)...
Running sequentially (no OpenMP).
Encryption finished.
----------------------------------------
Mode:          sequential
Input size:    524288 bytes
Output size:   524288 bytes
Encryption Time: 3.70351 ms
----------------------------------------
Writing output file: ciphertext_512KB_seq.bin...
Write finished in 0.6259 ms.
Running Sequential 10MB
Reading input file: plaintext_10MB.bin...
Read 10485760 bytes in 10.3023 ms.
Starting AES-128 CTR encryption (sequential)...
Running sequentially (no OpenMP).
Encryption finished.
----------------------------------------
Mode:          sequential
Input size:    10485760 bytes
Output size:   10485760 bytes
Encryption Time: 36.9246 ms
----------------------------------------
Writing output file: ciphertext_10MB_seq.bin...
Write finished in 12.9044 ms.
Running Sequential 100MB
Reading input file: pl

### **Multi threads CPU**

In [11]:
# add -fopenmp flag for multi threads
!g++ aes_ctr_openssl.cpp -o aes_ctr_omp -O3 -fopenmp -lssl -lcrypto -std=c++11

# 512KB
!echo "Running Parallel (OMP Default Threads) 512KB"
!./aes_ctr_omp plaintext_512KB.bin ciphertext_512KB_omp.bin parallel 2

# 10MB
!echo " Running Parallel (OMP Default Threads) 10MB"
!./aes_ctr_omp plaintext_10MB.bin ciphertext_10MB_omp.bin parallel 2

# 100MB
!echo "Running Parallel (OMP Default Threads) 100MB"
!./aes_ctr_omp plaintext_100MB.bin ciphertext_100MB_omp.bin parallel 2

# 500MB
!echo "Running Parallel (OMP Default Threads) 500MB"
!./aes_ctr_omp plaintext_500MB.bin ciphertext_500MB_omp.bin parallel 2

# 1GB
!echo "Running Parallel (OMP Default Threads) 1GB"
!./aes_ctr_omp plaintext_1GB.bin ciphertext_1GB_omp.bin parallel 2

# 3GB
!echo "Running Parallel (OMP Default Threads) 3GB"
!./aes_ctr_omp plaintext_3GB.bin ciphertext_3GB_omp.bin parallel 2

# file output check
!ls -lh ciphertext_*_omp*.bin

Running Parallel (OMP Default Threads) 512KB
Reading input file: plaintext_512KB.bin...
Read 524288 bytes in 0.499919 ms.
Starting AES-128 CTR encryption (parallel)...
Running with OpenMP, max threads: 2
Encryption finished.
----------------------------------------
Mode:          parallel (Threads: 2)
Input size:    524288 bytes
Output size:   524288 bytes
Encryption Time: 6.42164 ms
----------------------------------------
Writing output file: ciphertext_512KB_omp.bin...
Write finished in 1.03809 ms.
 Running Parallel (OMP Default Threads) 10MB
Reading input file: plaintext_10MB.bin...
Read 10485760 bytes in 8.97493 ms.
Starting AES-128 CTR encryption (parallel)...
Running with OpenMP, max threads: 2
Encryption finished.
----------------------------------------
Mode:          parallel (Threads: 2)
Input size:    10485760 bytes
Output size:   10485760 bytes
Encryption Time: 37.9275 ms
----------------------------------------
Writing output file: ciphertext_10MB_omp.bin...
Write finishe

# **AES 128 Counter Mode với T4 GPU**


## Only XOR

In [22]:
%%writefile aes_ctr_gpu.cu
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <chrono>
#include <stdexcept>
#include <cstring> // for memcpy, memset

// CUDA headers
#include <cuda_runtime.h>
#include <device_launch_parameters.h> // Cho __global__

// OpenSSL headers (keystream on host)
#include <openssl/evp.h>
#include <openssl/err.h>

// Cấu hình
const int AES_KEY_SIZE = 128 / 8; // 16 bytes
const int AES_BLOCK_SIZE = 128 / 8; // 16 bytes
const int THREADS_PER_BLOCK = 256; // CUDA threads per block

// Macro CUDA error
#define CUDA_CHECK(err) { cudaError_t cuda_err = (err); if (cuda_err != cudaSuccess) { fprintf(stderr, "CUDA Error at %s:%d - %s\n", __FILE__, __LINE__, cudaGetErrorString(cuda_err)); throw std::runtime_error("CUDA Error"); } }

//  OpenSSL error
void handle_openssl_errors() {
    ERR_print_errors_fp(stderr);
    throw std::runtime_error("OpenSSL error occurred.");
}

// CUDA Kernel: XOR
__global__ void xor_kernel(const unsigned char* input,
                           unsigned char* output,
                           const unsigned char* keystream, // Keystream generated
                           size_t n_bytes)
{
    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < n_bytes) {
        output[idx] = input[idx] ^ keystream[idx];
    }
}

// Counter encryption function on HOST using OpenSSL ECB
// This function runs on CPU to prepare keystream for GPU
void generate_keystream_on_host(unsigned char* keystream_buffer, // Buffer for keystream
                               size_t total_bytes,
                               const unsigned char* key,
                               const unsigned char* iv_counter_start)
{
    EVP_CIPHER_CTX* ctx = EVP_CIPHER_CTX_new();
    if (!ctx) handle_openssl_errors();

    if (1 != EVP_EncryptInit_ex(ctx, EVP_aes_128_ecb(), NULL, key, NULL)) {
        EVP_CIPHER_CTX_free(ctx);
        handle_openssl_errors();
    }
    EVP_CIPHER_CTX_set_padding(ctx, 0); // padding off

    unsigned char current_counter[AES_BLOCK_SIZE];
    memcpy(current_counter, iv_counter_start, AES_BLOCK_SIZE);

    size_t num_blocks = (total_bytes + AES_BLOCK_SIZE - 1) / AES_BLOCK_SIZE;
    int out_len_temp = 0;

    for (size_t block_idx = 0; block_idx < num_blocks; ++block_idx) {
        // 1. Counter caculation
         unsigned char block_specific_counter[AES_BLOCK_SIZE];
         memcpy(block_specific_counter, iv_counter_start, AES_BLOCK_SIZE);
         unsigned long long block_offset = block_idx;
         for (int i = AES_BLOCK_SIZE - 1; i >= 0 && block_offset > 0; --i) {
             unsigned long long current_val = block_specific_counter[i];
             unsigned long long add_val = block_offset & 0xFF;
             unsigned long long sum = current_val + add_val;
             block_specific_counter[i] = static_cast<unsigned char>(sum & 0xFF);
             block_offset >>= 8;
             if (sum > 0xFF) { block_offset++; }
         }

        // 2. counter encrypting AES ECB on host
        size_t buffer_offset = block_idx * AES_BLOCK_SIZE;
        if (buffer_offset + AES_BLOCK_SIZE <= total_bytes) {
             if (1 != EVP_EncryptUpdate(ctx, keystream_buffer + buffer_offset, &out_len_temp, block_specific_counter, AES_BLOCK_SIZE)) {
                 EVP_CIPHER_CTX_free(ctx);
                 handle_openssl_errors();
             }
             // if (out_len_temp != AES_BLOCK_SIZE) { /* error */ }
        } else {
            // last block
             unsigned char temp_encrypted_counter[AES_BLOCK_SIZE];
              if (1 != EVP_EncryptUpdate(ctx, temp_encrypted_counter, &out_len_temp, block_specific_counter, AES_BLOCK_SIZE)) {
                 EVP_CIPHER_CTX_free(ctx);
                 handle_openssl_errors();
             }
            size_t remaining_bytes = total_bytes - buffer_offset;
             memcpy(keystream_buffer + buffer_offset, temp_encrypted_counter, remaining_bytes);
        }
    }

    EVP_CIPHER_CTX_free(ctx);
}


// read files
std::vector<unsigned char> read_file(const std::string& filename) {
    std::ifstream infile(filename, std::ios::binary | std::ios::ate);
    if (!infile) {
        throw std::runtime_error("Cannot open file for reading: " + filename);
    }
    std::streamsize size = infile.tellg();
    infile.seekg(0, std::ios::beg);
    std::vector<unsigned char> buffer(size);
    if (!infile.read(reinterpret_cast<char*>(buffer.data()), size)) {
        throw std::runtime_error("Error reading file: " + filename);
    }
    return buffer;
}

// write
void write_file(const std::string& filename, const std::vector<unsigned char>& data) {
    std::ofstream outfile(filename, std::ios::binary | std::ios::trunc);
    if (!outfile) {
        throw std::runtime_error("Cannot open file for writing: " + filename);
    }
    if (!outfile.write(reinterpret_cast<const char*>(data.data()), data.size())) {
         throw std::runtime_error("Error writing file: " + filename);
    }
}

int main(int argc, char* argv[]) {
    if (argc != 3) {
        std::cerr << "Usage: " << argv[0] << " <input_file> <output_file>" << std::endl;
        return 1;
    }

    std::string input_filename = argv[1];
    std::string output_filename = argv[2];

    // Key IV
    unsigned char key[AES_KEY_SIZE] = {
        0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6,
        0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c
    };
    unsigned char iv_counter[AES_BLOCK_SIZE];
    memset(iv_counter, 0x00, AES_BLOCK_SIZE);

    // Timer
    std::chrono::high_resolution_clock::time_point t_start, t_end;
    std::chrono::duration<double, std::milli> duration;

    // index Device (GPU)
    unsigned char *d_input = nullptr, *d_output = nullptr, *d_keystream = nullptr;
    std::vector<unsigned char> h_plaintext;
    std::vector<unsigned char> h_ciphertext;

    try {
        // 1. input to host memory (h_plaintext)
        t_start = std::chrono::high_resolution_clock::now();
        h_plaintext = read_file(input_filename);
        t_end = std::chrono::high_resolution_clock::now();
        duration = t_end - t_start;
        std::cout << "Read " << h_plaintext.size() << " bytes in " << duration.count() << " ms." << std::endl;
        if (h_plaintext.empty()) {
             std::cout << "Input file is empty. Nothing to do." << std::endl;
             return 0;
        }
        h_ciphertext.resize(h_plaintext.size()); // buffer output on host


        // 2. keystream on HOST
        std::cout << "Generating keystream on host using OpenSSL..." << std::endl;
        t_start = std::chrono::high_resolution_clock::now();
        size_t data_size = h_plaintext.size();
        size_t keystream_size = data_size;
        std::vector<unsigned char> h_keystream(keystream_size);
        generate_keystream_on_host(h_keystream.data(), keystream_size, key, iv_counter);
        t_end = std::chrono::high_resolution_clock::now();
        duration = t_end - t_start;
        std::cout << "Keystream generation finished in " << duration.count() << " ms." << std::endl;


        // 3. Allocate memory on Device (GPU)
        std::cout << "Allocating memory on GPU..." << std::endl;
        t_start = std::chrono::high_resolution_clock::now();
        CUDA_CHECK(cudaMalloc(&d_input, data_size));
        CUDA_CHECK(cudaMalloc(&d_output, data_size));
        CUDA_CHECK(cudaMalloc(&d_keystream, keystream_size));
        t_end = std::chrono::high_resolution_clock::now();
        duration = t_end - t_start;
        std::cout << "GPU Allocation finished in " << duration.count() << " ms." << std::endl;


        // 4. Copy data from Host to Device
        std::cout << "Copying data from Host to Device..." << std::endl;
        t_start = std::chrono::high_resolution_clock::now();
        CUDA_CHECK(cudaMemcpy(d_input, h_plaintext.data(), data_size, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_keystream, h_keystream.data(), keystream_size, cudaMemcpyHostToDevice));
        t_end = std::chrono::high_resolution_clock::now();
        duration = t_end - t_start;
        std::cout << "HtoD Copy finished in " << duration.count() << " ms." << std::endl;


        // 5. Launching CUDA Kernel
        std::cout << "Launching CUDA kernel..." << std::endl;
        size_t num_elements = data_size;
        int num_blocks = (num_elements + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
        dim3 gridDim(num_blocks);
        dim3 blockDim(THREADS_PER_BLOCK);

        // kernel excute time
        cudaEvent_t start_event, stop_event;
        CUDA_CHECK(cudaEventCreate(&start_event));
        CUDA_CHECK(cudaEventCreate(&stop_event));

        CUDA_CHECK(cudaEventRecord(start_event));

        xor_kernel<<<gridDim, blockDim>>>(d_input, d_output, d_keystream, num_elements);

        CUDA_CHECK(cudaEventRecord(stop_event));
        CUDA_CHECK(cudaEventSynchronize(stop_event));

        float kernel_time_ms = 0;
        CUDA_CHECK(cudaEventElapsedTime(&kernel_time_ms, start_event, stop_event));
        std::cout << "CUDA Kernel execution finished." << std::endl;

        CUDA_CHECK(cudaGetLastError());


        // 6. Copy results from Device to Host
        std::cout << "Copying result from Device to Host..." << std::endl;
        t_start = std::chrono::high_resolution_clock::now();
        CUDA_CHECK(cudaMemcpy(h_ciphertext.data(), d_output, data_size, cudaMemcpyDeviceToHost));
        t_end = std::chrono::high_resolution_clock::now();
        duration = t_end - t_start;
        std::cout << "DtoH Copy finished in " << duration.count() << " ms." << std::endl;


        // 7. Write file output
         t_start = std::chrono::high_resolution_clock::now();
         write_file(output_filename, h_ciphertext);
         t_end = std::chrono::high_resolution_clock::now();
         duration = t_end - t_start;
         std::cout << "Write finished in " << duration.count() << " ms." << std::endl;


        // Presentation
        std::cout << "----------------------------------------" << std::endl;
        std::cout << "Mode:          GPU (CUDA)" << std::endl;
        std::cout << "Input size:    " << h_plaintext.size() << " bytes" << std::endl;
        std::cout << "Output size:   " << h_ciphertext.size() << " bytes" << std::endl;
        std::cout << "Kernel Time:   " << kernel_time_ms << " ms" << std::endl;
        std::cout << "(Note: Total time includes data transfers & host processing)" << std::endl;
        std::cout << "----------------------------------------" << std::endl;

        // Release mem
        CUDA_CHECK(cudaFree(d_input));
        CUDA_CHECK(cudaFree(d_output));
        CUDA_CHECK(cudaFree(d_keystream));
        CUDA_CHECK(cudaEventDestroy(start_event));
        CUDA_CHECK(cudaEventDestroy(stop_event));


    } catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << std::endl;
         if (d_input) cudaFree(d_input);
         if (d_output) cudaFree(d_output);
         if (d_keystream) cudaFree(d_keystream);
        ERR_free_strings(); // Clean OpenSSL
        return 1;
    }

    ERR_free_strings(); // Clean OpenSSL
    return 0;
}

Overwriting aes_ctr_gpu.cu


### **Compile and run experiments**

In [23]:
# compile CUDA by nvcc
!nvcc aes_ctr_gpu.cu -o aes_ctr_gpu -O3 -lssl -lcrypto -std=c++11 -gencode arch=compute_75,code=sm_75

# check update file
!ls -l aes_ctr_gpu

-rwxr-xr-x 1 root root 1028264 May 12 04:38 aes_ctr_gpu


In [24]:
!echo " Running GPU (CUDA) 512KB"
!./aes_ctr_gpu plaintext_10MB.bin ciphertext_10MB_gpu.bin

# 10MB
!echo "Running GPU (CUDA) 10MB "
!./aes_ctr_gpu plaintext_10MB.bin ciphertext_10MB_gpu.bin

# 100MB
!echo "Running GPU (CUDA) 100MB"
!./aes_ctr_gpu plaintext_100MB.bin ciphertext_100MB_gpu.bin

# 500MB
!echo "Running GPU (CUDA) 500MB"
!./aes_ctr_gpu plaintext_500MB.bin ciphertext_500MB_gpu.bin

# 1GB
!echo "Running GPU (CUDA) 1GB"
!./aes_ctr_gpu plaintext_1GB.bin ciphertext_1GB_gpu.bin

# 3GB
!echo " Running GPU (CUDA) 3GB"
!./aes_ctr_gpu plaintext_3GB.bin ciphertext_3GB_gpu.bin

# check file output
!ls -lh ciphertext_*_gpu.bin

 Running GPU (CUDA) 512KB
Read 10485760 bytes in 8.67507 ms.
Generating keystream on host using OpenSSL...
Keystream generation finished in 26.8509 ms.
Allocating memory on GPU...
GPU Allocation finished in 203.354 ms.
Copying data from Host to Device...
HtoD Copy finished in 4.71022 ms.
Launching CUDA kernel...
CUDA Kernel execution finished.
Copying result from Device to Host...
DtoH Copy finished in 2.47949 ms.
Write finished in 17.0002 ms.
----------------------------------------
Mode:          GPU (CUDA)
Input size:    10485760 bytes
Output size:   10485760 bytes
Kernel Time:   0.398144 ms
(Note: Total time includes data transfers & host processing)
----------------------------------------
Running GPU (CUDA) 10MB 
Read 10485760 bytes in 8.37323 ms.
Generating keystream on host using OpenSSL...
Keystream generation finished in 26.6993 ms.
Allocating memory on GPU...
GPU Allocation finished in 175.815 ms.
Copying data from Host to Device...
HtoD Copy finished in 4.85499 ms.
Launchin

## Both Keystream genarate and XOR matrix


In [25]:
%%writefile aes_ctr_gpu_full.cu
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <chrono>
#include <stdexcept>
#include <cstring> // for memcpy, memset

// CUDA headers
#include <cuda_runtime.h>
#include <device_launch_parameters.h> // Cho __global__

// Configuration
const int AES_KEY_SIZE_BYTES = 128 / 8; // 16 bytes
const int AES_BLOCK_SIZE_BYTES = 128 / 8; // 16 bytes
const int AES_ROUNDS = 10; // AES 128 round
const int EXPANDED_KEY_SIZE_BYTES = AES_BLOCK_SIZE_BYTES * (AES_ROUNDS + 1); // 16 * (10 + 1) = 176 bytes
const int THREADS_PER_BLOCK = 256; // Số luồng CUDA mỗi block (có thể điều chỉnh)

// CUDA error
#define CUDA_CHECK(err) { cudaError_t cuda_err = (err); if (cuda_err != cudaSuccess) { fprintf(stderr, "CUDA Error at %s:%d - %s\\n", __FILE__, __LINE__, cudaGetErrorString(cuda_err)); throw std::runtime_error("CUDA Error"); } }

// AES S-Box
const unsigned char sbox_host[256] = {
    0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
    0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
    0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
    0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
    0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
    0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
    0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
    0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
    0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
    0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
    0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
    0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
    0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
    0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
    0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
    0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 };

// DEVICE (__constant__) for Kernel
__constant__ unsigned char sbox_device[256];

// Rcon table (constant) (HOST)
const unsigned char Rcon_host[11] = {
    0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 };

// The extension key is stored in the DEVICE's constant memory
__constant__ unsigned char d_expanded_key[EXPANDED_KEY_SIZE_BYTES];

// Helper Function for AES on Device
__device__ inline unsigned char gmul(unsigned char a) {
    return (a << 1) ^ (((a >> 7) & 1) * 0x1b);
}

__device__ void SubBytes(unsigned char state[AES_BLOCK_SIZE_BYTES]) {
    for (int i = 0; i < AES_BLOCK_SIZE_BYTES; ++i) {
        state[i] = sbox_device[state[i]];
    }
}

__device__ void ShiftRows(unsigned char state[AES_BLOCK_SIZE_BYTES]) {
    unsigned char tmp;
    tmp = state[1]; state[1] = state[5]; state[5] = state[9]; state[9] = state[13]; state[13] = tmp;
    tmp = state[2]; state[2] = state[10]; state[10] = tmp;
    tmp = state[6]; state[6] = state[14]; state[14] = tmp;
    tmp = state[3]; state[3] = state[15]; state[15] = state[11]; state[11] = state[7]; state[7] = tmp;
}

__device__ void MixColumns(unsigned char state[AES_BLOCK_SIZE_BYTES]) {
    unsigned char temp_col[4];
    unsigned char a, b;
    for(int j=0; j<4; ++j) {
        for(int i=0; i<4; ++i) {
            temp_col[i] = state[i*4+j];
        }
        // logic MixColumns chính xác
        a = temp_col[0]; b = temp_col[1];
        state[0*4+j] = gmul(a) ^ (gmul(b) ^ b) ^ temp_col[2] ^ temp_col[3];
        a = temp_col[1]; b = temp_col[2];
        state[1*4+j] = temp_col[0] ^ gmul(a) ^ (gmul(b) ^ b) ^ temp_col[3];
        a = temp_col[2]; b = temp_col[3];
        state[2*4+j] = temp_col[0] ^ temp_col[1] ^ gmul(a) ^ (gmul(b) ^ b);
        a = temp_col[3]; b = temp_col[0];
        state[3*4+j] = (gmul(b) ^ b) ^ temp_col[1] ^ temp_col[2] ^ gmul(a);
    }
}

__device__ void AddRoundKey(unsigned char state[AES_BLOCK_SIZE_BYTES], const unsigned char* roundKey) {
    for (int i = 0; i < AES_BLOCK_SIZE_BYTES; ++i) {
        state[i] ^= roundKey[i];
    }
}

__device__ void aes_encrypt_block_device(unsigned char state[AES_BLOCK_SIZE_BYTES], const unsigned char* expandedKey) {
    AddRoundKey(state, expandedKey);
    for (int round = 1; round < AES_ROUNDS; ++round) {
        SubBytes(state);
        ShiftRows(state);
        MixColumns(state);
        AddRoundKey(state, expandedKey + round * AES_BLOCK_SIZE_BYTES);
    }
    SubBytes(state);
    ShiftRows(state);
    AddRoundKey(state, expandedKey + AES_ROUNDS * AES_BLOCK_SIZE_BYTES);
}

// --- Kernels ---
__global__ void aes_ecb_encrypt_counter_kernel(unsigned char* d_keystream,
                                              const unsigned char* iv_counter_start,
                                              size_t num_total_blocks)
{
    size_t block_id = blockIdx.x * blockDim.x + threadIdx.x;

    if (block_id < num_total_blocks) {
        unsigned char current_counter[AES_BLOCK_SIZE_BYTES];
        unsigned char state[AES_BLOCK_SIZE_BYTES];

        for(int i=0; i < AES_BLOCK_SIZE_BYTES; ++i) {
            current_counter[i] = iv_counter_start[i];
        }

        unsigned long long offset_to_add = block_id;
        for (int i = AES_BLOCK_SIZE_BYTES - 1; i >= 0; --i) {
             unsigned int sum = current_counter[i] + (offset_to_add & 0xFF);
             current_counter[i] = (unsigned char)(sum & 0xFF);
             offset_to_add >>= 8;
             if (sum > 0xFF) {
                 offset_to_add++;
             }
             if (offset_to_add == 0) break;
        }

        for(int i=0; i < AES_BLOCK_SIZE_BYTES; ++i) {
             state[i] = current_counter[i];
        }
        aes_encrypt_block_device(state, d_expanded_key);

        unsigned char* keystream_ptr = d_keystream + block_id * AES_BLOCK_SIZE_BYTES;
        for(int i=0; i < AES_BLOCK_SIZE_BYTES; ++i) {
             keystream_ptr[i] = state[i];
        }
    }
}

__global__ void xor_kernel(const unsigned char* d_input,
                           unsigned char* d_output,
                           const unsigned char* d_keystream,
                           size_t n_bytes)
{
    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n_bytes) {
        d_output[idx] = d_input[idx] ^ d_keystream[idx];
    }
}

// Key Expansion on HOST
void key_expansion(const unsigned char* key, unsigned char* expandedKey) {
    memcpy(expandedKey, key, AES_KEY_SIZE_BYTES);
    unsigned char temp[4];
    int bytesGenerated = AES_KEY_SIZE_BYTES;
    int rconIteration = 1;
    unsigned char* last4Bytes;

    while (bytesGenerated < EXPANDED_KEY_SIZE_BYTES) {
        last4Bytes = expandedKey + bytesGenerated - 4;
        memcpy(temp, last4Bytes, 4);

        if (bytesGenerated % AES_KEY_SIZE_BYTES == 0) {
            unsigned char first_byte = temp[0];
            temp[0] = temp[1]; temp[1] = temp[2]; temp[2] = temp[3]; temp[3] = first_byte;
            for (int i = 0; i < 4; ++i) { temp[i] = sbox_host[temp[i]]; }
            temp[0] ^= Rcon_host[rconIteration++];
        }
        for (int i = 0; i < 4; ++i) { temp[i] ^= expandedKey[bytesGenerated - AES_KEY_SIZE_BYTES + i]; }
        memcpy(expandedKey + bytesGenerated, temp, 4);
        bytesGenerated += 4;
    }
}

// read files
std::vector<unsigned char> read_file(const std::string& filename) {
    std::ifstream infile(filename, std::ios::binary | std::ios::ate);
    if (!infile) { throw std::runtime_error("Cannot open file for reading: " + filename); }
    std::streamsize size = infile.tellg();
    infile.seekg(0, std::ios::beg);
    std::vector<unsigned char> buffer(size);
    if (size > 0) {
       if (!infile.read(reinterpret_cast<char*>(buffer.data()), size)) {
           throw std::runtime_error("Error reading file: " + filename);
       }
    }
    return buffer;
}

// write
void write_file(const std::string& filename, const std::vector<unsigned char>& data) {
    std::ofstream outfile(filename, std::ios::binary | std::ios::trunc);
    if (!outfile) { throw std::runtime_error("Cannot open file for writing: " + filename); }
     if (!data.empty()) {
       if (!outfile.write(reinterpret_cast<const char*>(data.data()), data.size())) {
            throw std::runtime_error("Error writing file: " + filename);
       }
     }
}

// Main
int main(int argc, char* argv[]) {
    if (argc != 3) {
        std::cerr << "Usage: " << argv[0] << " <input_file> <output_file>" << std::endl;
        return 1;
    }
    std::string input_filename = argv[1];
    std::string output_filename = argv[2];

    unsigned char key[AES_KEY_SIZE_BYTES] = {
        0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6,
        0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c
    };
    unsigned char iv_counter_start[AES_BLOCK_SIZE_BYTES];
    memset(iv_counter_start, 0x00, AES_BLOCK_SIZE_BYTES);

    std::chrono::high_resolution_clock::time_point t_overall_start, t_overall_end;
    std::chrono::duration<double, std::milli> duration_total, duration_read, duration_alloc,
                                              duration_h2d_plaintext_iv, duration_key_expansion_copy,
                                              duration_kernel_aes, duration_kernel_xor,
                                              duration_d2h, duration_write;
    float kernel_aes_ms = 0, kernel_xor_ms = 0;
    cudaEvent_t start_event, stop_event;
    CUDA_CHECK(cudaEventCreate(&start_event));
    CUDA_CHECK(cudaEventCreate(&stop_event));

    unsigned char *d_plaintext = nullptr, *d_ciphertext = nullptr, *d_keystream = nullptr, *d_iv_counter_start = nullptr;
    std::vector<unsigned char> h_plaintext;
    std::vector<unsigned char> h_ciphertext;
    unsigned char h_expanded_key[EXPANDED_KEY_SIZE_BYTES];

    t_overall_start = std::chrono::high_resolution_clock::now();
    try {
        auto t_key_exp_start = std::chrono::high_resolution_clock::now();
        key_expansion(key, h_expanded_key);
        CUDA_CHECK(cudaMemcpyToSymbol(d_expanded_key, h_expanded_key, EXPANDED_KEY_SIZE_BYTES));
        CUDA_CHECK(cudaMemcpyToSymbol(sbox_device, sbox_host, sizeof(sbox_host))); // Copy S-Box to device
        auto t_key_exp_end = std::chrono::high_resolution_clock::now();
        duration_key_expansion_copy = t_key_exp_end - t_key_exp_start;
        std::cout << "Key Expansion and S-Box copy to GPU finished in " << duration_key_expansion_copy.count() << " ms." << std::endl;

        auto t_read_start = std::chrono::high_resolution_clock::now();
        h_plaintext = read_file(input_filename);
        auto t_read_end = std::chrono::high_resolution_clock::now();
        duration_read = t_read_end - t_read_start;
        std::cout << "Read " << h_plaintext.size() << " bytes in " << duration_read.count() << " ms." << std::endl;

        if (h_plaintext.empty()) {
             std::cout << "Input file is empty. Exiting." << std::endl;
             write_file(output_filename, h_ciphertext);
             t_overall_end = std::chrono::high_resolution_clock::now();
             duration_total = t_overall_end - t_overall_start;
             std::cout << "Total Execution Time: " << duration_total.count() << " ms (for empty file)" << std::endl;
             return 0;
        }
        size_t data_size = h_plaintext.size();
        h_ciphertext.resize(data_size);
        size_t num_total_blocks = (data_size + AES_BLOCK_SIZE_BYTES - 1) / AES_BLOCK_SIZE_BYTES;
        size_t keystream_size = num_total_blocks * AES_BLOCK_SIZE_BYTES;

        // Allocate memory on GPU
        auto t_alloc_start = std::chrono::high_resolution_clock::now();
        CUDA_CHECK(cudaMalloc(&d_plaintext, data_size));
        CUDA_CHECK(cudaMalloc(&d_ciphertext, data_size));
        CUDA_CHECK(cudaMalloc(&d_keystream, keystream_size));
        CUDA_CHECK(cudaMalloc(&d_iv_counter_start, AES_BLOCK_SIZE_BYTES));
        auto t_alloc_end = std::chrono::high_resolution_clock::now();
        duration_alloc = t_alloc_end - t_alloc_start;
        std::cout << "GPU Allocation finished in " << duration_alloc.count() << " ms." << std::endl;

        // Copy data from Host to Device
        auto t_h2d_pt_iv_start = std::chrono::high_resolution_clock::now();
        CUDA_CHECK(cudaMemcpy(d_plaintext, h_plaintext.data(), data_size, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_iv_counter_start, iv_counter_start, AES_BLOCK_SIZE_BYTES, cudaMemcpyHostToDevice));
        auto t_h2d_pt_iv_end = std::chrono::high_resolution_clock::now();
        duration_h2d_plaintext_iv = t_h2d_pt_iv_end - t_h2d_pt_iv_start;
        std::cout << "HtoD Plaintext & IV Copy finished in " << duration_h2d_plaintext_iv.count() << " ms." << std::endl;

        int num_grid_blocks_for_aes_kernel = (num_total_blocks + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
        dim3 gridDimAes(num_grid_blocks_for_aes_kernel);
        dim3 blockDimAes(THREADS_PER_BLOCK);

        CUDA_CHECK(cudaEventRecord(start_event));
        aes_ecb_encrypt_counter_kernel<<<gridDimAes, blockDimAes>>>(d_keystream, d_iv_counter_start, num_total_blocks);
        CUDA_CHECK(cudaEventRecord(stop_event));
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaEventSynchronize(stop_event));
        CUDA_CHECK(cudaEventElapsedTime(&kernel_aes_ms, start_event, stop_event));
        duration_kernel_aes = std::chrono::duration<double, std::milli>(kernel_aes_ms);
        std::cout << "AES Keystream Kernel (" << THREADS_PER_BLOCK << " t/b) finished in " << duration_kernel_aes.count() << " ms." << std::endl;

        int num_grid_blocks_for_xor_kernel = (data_size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
        dim3 gridDimXor(num_grid_blocks_for_xor_kernel);
        dim3 blockDimXor(THREADS_PER_BLOCK);

        CUDA_CHECK(cudaEventRecord(start_event));
        xor_kernel<<<gridDimXor, blockDimXor>>>(d_plaintext, d_ciphertext, d_keystream, data_size);
        CUDA_CHECK(cudaEventRecord(stop_event));
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaEventSynchronize(stop_event));
        CUDA_CHECK(cudaEventElapsedTime(&kernel_xor_ms, start_event, stop_event));
        duration_kernel_xor = std::chrono::duration<double, std::milli>(kernel_xor_ms);
        std::cout << "XOR Kernel (" << THREADS_PER_BLOCK << " t/b) finished in " << duration_kernel_xor.count() << " ms." << std::endl;

        auto t_d2h_start = std::chrono::high_resolution_clock::now();
        CUDA_CHECK(cudaMemcpy(h_ciphertext.data(), d_ciphertext, data_size, cudaMemcpyDeviceToHost));
        auto t_d2h_end = std::chrono::high_resolution_clock::now();
        duration_d2h = t_d2h_end - t_d2h_start;
        std::cout << "DtoH Copy finished in " << duration_d2h.count() << " ms." << std::endl;

        auto t_write_start = std::chrono::high_resolution_clock::now();
        write_file(output_filename, h_ciphertext);
        auto t_write_end = std::chrono::high_resolution_clock::now();
        duration_write = t_write_end - t_write_start;
        std::cout << "Write finished in " << duration_write.count() << " ms." << std::endl;

    } catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << std::endl;
    }

    if (d_plaintext) cudaFree(d_plaintext);
    if (d_ciphertext) cudaFree(d_ciphertext);
    if (d_keystream) cudaFree(d_keystream);
    if (d_iv_counter_start) cudaFree(d_iv_counter_start);
    cudaEventDestroy(start_event);
    cudaEventDestroy(stop_event);

    t_overall_end = std::chrono::high_resolution_clock::now();
    duration_total = t_overall_end - t_overall_start;

    // Presentation
    if (std::current_exception() == nullptr && !h_plaintext.empty()) {
        std::cout << "\n----------------------------------------" << std::endl;
        std::cout << "Mode:                 GPU (CUDA - Full AES on GPU)" << std::endl;
        std::cout << "Input size:           " << h_plaintext.size() << " bytes" << std::endl;
        std::cout << "Output size:          " << h_ciphertext.size() << " bytes" << std::endl;
        std::cout << "Total AES Blocks:     " << (h_plaintext.empty() ? 0 : (h_plaintext.size() + AES_BLOCK_SIZE_BYTES - 1) / AES_BLOCK_SIZE_BYTES) << std::endl;
        std::cout << "Threads Per Block:    " << THREADS_PER_BLOCK << std::endl;
        std::cout << "--- Timing Breakdown (ms) ---" << std::endl;
        std::cout << "File Read:            " << duration_read.count() << std::endl;
        std::cout << "KeyExp & SBox Copy:   " << duration_key_expansion_copy.count() << std::endl;
        std::cout << "GPU Allocation:       " << duration_alloc.count() << std::endl;
        std::cout << "HtoD Plaintext&IV:    " << duration_h2d_plaintext_iv.count() << std::endl;
        std::cout << "AES Kernel Time:      " << duration_kernel_aes.count() << std::endl;
        std::cout << "XOR Kernel Time:      " << duration_kernel_xor.count() << std::endl;
        std::cout << "DtoH Ciphertext:      " << duration_d2h.count() << std::endl;
        std::cout << "File Write:           " << duration_write.count() << std::endl;
        std::cout << "---" << std::endl;
        std::cout << "Total Execution Time: " << duration_total.count() << " ms" << std::endl;
        std::cout << "----------------------------------------" << std::endl;
    } else if (std::current_exception()) {
        return 1;
    }
    return 0;
}

Overwriting aes_ctr_gpu_full.cu


In [26]:
# compile
!nvcc aes_ctr_gpu_full.cu -o aes_ctr_gpu_full -O3 -arch=sm_75 -std=c++11
!ls -l aes_ctr_gpu_full

-rwxr-xr-x 1 root root 1044880 May 12 04:49 aes_ctr_gpu_full


In [27]:
!echo " Running GPU (Full AES) 512KB"
!./aes_ctr_gpu_full plaintext_512KB.bin ciphertext_512KB_gpu_full.bin

!echo -e "\\n Running GPU (Full AES) 10MB"
!./aes_ctr_gpu_full plaintext_10MB.bin ciphertext_10MB_gpu_full.bin

!echo -e "\\n Running GPU (Full AES) 100MB"
!./aes_ctr_gpu_full plaintext_100MB.bin ciphertext_100MB_gpu_full.bin

!echo -e "\\n Running GPU (Full AES) 500MB"
!./aes_ctr_gpu_full plaintext_500MB.bin ciphertext_500MB_gpu_full.bin

!echo -e "\\n Running GPU (Full AES) 1GB"
!./aes_ctr_gpu_full plaintext_1GB.bin ciphertext_1GB_gpu_full.bin

!echo -e "\\n Running GPU (Full AES) 3GB"
!./aes_ctr_gpu_full plaintext_3GB.bin ciphertext_3GB_gpu_full.bin

!echo -e "\\n Check"
!ls -lh ciphertext_*_gpu_full.bin

 Running GPU (Full AES) 512KB
Key Expansion and S-Box copy to GPU finished in 0.235241 ms.
Read 524288 bytes in 6.84549 ms.
GPU Allocation finished in 0.156337 ms.
HtoD Plaintext & IV Copy finished in 0.179076 ms.
AES Keystream Kernel (256 t/b) finished in 0.213472 ms.
XOR Kernel (256 t/b) finished in 0.03904 ms.
DtoH Copy finished in 0.17502 ms.
Write finished in 0.575377 ms.

----------------------------------------
Mode:                 GPU (CUDA - Full AES on GPU)
Input size:           524288 bytes
Output size:          524288 bytes
Total AES Blocks:     32768
Threads Per Block:    256
--- Timing Breakdown (ms) ---
File Read:            6.84549
KeyExp & SBox Copy:   0.235241
GPU Allocation:       0.156337
HtoD Plaintext&IV:    0.179076
AES Kernel Time:      0.213472
XOR Kernel Time:      0.03904
DtoH Ciphertext:      0.17502
File Write:           0.575377
---
Total Execution Time: 9.1303 ms
----------------------------------------

 Running GPU (Full AES) 10MB
Key Expansion and S-B