<a href="https://colab.research.google.com/github/nienhcar/hpcs_cuda/blob/main/cudabitap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive; the CUDA programs written by later
# cells open their input files under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
%%writefile 1_lakh.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime.h>

// Longest pattern accepted by cuda_bitap_search; the search state is held in
// one unsigned long long.
#define MAX_PATTERN_LENGTH 64
// One mask entry per possible unsigned char value of a text byte.
#define ALPHABET_SIZE 256
#define OPTIMIZED_BLOCK_SIZE 512   // Threads per block used for the kernel launch
#define OPTIMIZED_CHUNK_SIZE 64   // Consecutive text bytes assigned to each thread

// Per-character pattern masks, filled from the host with cudaMemcpyToSymbol:
// bit i of d_pattern_mask[c] is cleared when pattern[i] == c, set otherwise.
__constant__ unsigned long long d_pattern_mask[ALPHABET_SIZE];

/*
 * Exact-match Bitap (shift-or) scan of `text` against the pattern encoded in
 * d_pattern_mask.
 *
 * Launch layout: 1-D grid of 1-D blocks. Thread t owns the text bytes
 * [t * OPTIMIZED_CHUNK_SIZE, (t + 1) * OPTIMIZED_CHUNK_SIZE), clipped to
 * text_length, so the grid must supply at least
 * ceil(text_length / OPTIMIZED_CHUNK_SIZE) threads. Surplus threads exit
 * immediately. No shared memory is used.
 *
 * text           - device pointer to text_length bytes.
 * text_length    - number of bytes in text.
 * pattern_length - pattern length in characters, 1..64 (the state is a single
 *                  64-bit word); other values make the kernel do nothing.
 * results        - device array of text_length ints. results[i] is set to 1
 *                  when an occurrence of the pattern ENDS at text index i.
 *                  Other entries are not written, so the caller must zero the
 *                  array beforehand.
 */
__global__ void bitap_search_kernel(const unsigned char *text, size_t text_length, int pattern_length, int *results) {
    // Reject lengths for which the shifts below would be undefined.
    if (pattern_length < 1 || pattern_length > 64) {
        return;
    }

    // Widen before multiplying so the global index cannot wrap in 32-bit math.
    size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t start = tid * OPTIMIZED_CHUNK_SIZE;
    if (start >= text_length) {
        return;
    }
    size_t end = min(start + OPTIMIZED_CHUNK_SIZE, text_length);

    // An occurrence ending inside this chunk may begin up to
    // pattern_length - 1 bytes before it. Replay those bytes first so the
    // state is correct at `start`; otherwise matches that straddle a chunk
    // boundary would be lost.
    size_t overlap = (size_t)(pattern_length - 1);
    size_t scan_from = (start > overlap) ? start - overlap : 0;

    unsigned long long R = ~0ULL;
    unsigned long long match = 1ULL << (pattern_length - 1);

    for (size_t i = scan_from; i < end; ++i) {
        // Bits above pattern_length - 1 never influence the tested bit
        // (information only moves upward through the shift), so no
        // truncating mask is applied.
        R = (R << 1) | d_pattern_mask[text[i]];
        // Only report inside this thread's own chunk; the warm-up bytes
        // belong to the previous thread.
        if (i >= start && (R & match) == 0) {
            results[i] = 1;
        }
    }
}

/*
 * Finds every exact occurrence of `pattern` in `h_text` on the GPU and writes
 * the report to `output_file`.
 *
 * h_text      - host buffer holding text_length bytes.
 * text_length - number of bytes in h_text.
 * pattern     - NUL-terminated pattern of 1..MAX_PATTERN_LENGTH characters.
 * output_file - open stream that receives one
 *               "Pattern found at position: <i>" line per hit, where i is the
 *               index flagged by bitap_search_kernel (the last character of
 *               the occurrence), or "No match found." when there is none, or
 *               the pattern-length complaint.
 *
 * A failed host allocation or CUDA call is reported on stderr and nothing is
 * written to output_file for that run. Every buffer acquired here is released
 * before returning.
 */
void cuda_bitap_search(const unsigned char *h_text, size_t text_length, const char *pattern, FILE *output_file) {
    size_t raw_length = strlen(pattern);
    if (raw_length == 0 || raw_length > MAX_PATTERN_LENGTH) {
        fprintf(output_file, "Pattern is empty or too long!\n");
        return;
    }
    int pattern_length = (int)raw_length;

    // An empty text cannot contain the pattern, and a zero-block launch would
    // be rejected as an invalid configuration, so answer without the GPU.
    if (text_length == 0) {
        fprintf(output_file, "No match found.\n");
        return;
    }

    // Shift-or masks: bit i of h_pattern_mask[c] is 0 exactly when pattern[i] == c.
    unsigned long long h_pattern_mask[ALPHABET_SIZE];
    for (int i = 0; i < ALPHABET_SIZE; ++i)
        h_pattern_mask[i] = ~0ULL;
    for (int i = 0; i < pattern_length; ++i)
        h_pattern_mask[(unsigned char)pattern[i]] &= ~(1ULL << i);

    int *h_results = (int*)malloc(text_length * sizeof(int));
    if (h_results == NULL) {
        fprintf(stderr, "Could not allocate host result buffer\n");
        return;
    }

    unsigned char *d_text = NULL;
    int *d_results = NULL;

    // Device steps run in order; the first failure skips everything after it.
    cudaError_t status = cudaMemcpyToSymbol(d_pattern_mask, h_pattern_mask, ALPHABET_SIZE * sizeof(unsigned long long));
    if (status == cudaSuccess)
        status = cudaMalloc((void**)&d_text, text_length * sizeof(unsigned char));
    if (status == cudaSuccess)
        status = cudaMalloc((void**)&d_results, text_length * sizeof(int));
    if (status == cudaSuccess)
        status = cudaMemcpy(d_text, h_text, text_length * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (status == cudaSuccess)
        status = cudaMemset(d_results, 0, text_length * sizeof(int));
    if (status == cudaSuccess) {
        // One thread per OPTIMIZED_CHUNK_SIZE bytes, rounded up to whole blocks.
        int numBlocks = (text_length + OPTIMIZED_BLOCK_SIZE * OPTIMIZED_CHUNK_SIZE - 1) / (OPTIMIZED_BLOCK_SIZE * OPTIMIZED_CHUNK_SIZE);
        bitap_search_kernel<<<numBlocks, OPTIMIZED_BLOCK_SIZE>>>(d_text, text_length, pattern_length, d_results);
        // A launch does not return a status; configuration errors surface here.
        status = cudaGetLastError();
    }
    if (status == cudaSuccess) {
        // Blocking copy: it completes after the kernel and also returns any
        // error raised while the kernel was executing.
        status = cudaMemcpy(h_results, d_results, text_length * sizeof(int), cudaMemcpyDeviceToHost);
    }

    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in cuda_bitap_search: %s\n", cudaGetErrorString(status));
    } else {
        int found = 0;
        for (size_t i = 0; i < text_length; ++i) {
            if (h_results[i]) {
                fprintf(output_file, "Pattern found at position: %zu\n", i);
                found = 1;
            }
        }

        if (!found) {
            fprintf(output_file, "No match found.\n");
        }
    }

    // cudaFree accepts NULL, so this is safe after a partial setup.
    cudaFree(d_text);
    cudaFree(d_results);
    free(h_results);
}

/*
 * Benchmark driver: loads the whole input file into host memory, runs
 * cuda_bitap_search for a fixed pattern with the report going to an output
 * file, and prints the time between two CUDA events recorded immediately
 * before and after that call.
 *
 * Returns EXIT_FAILURE if the input cannot be opened, sized, buffered or read
 * in full, or if the output file cannot be created; 0 otherwise.
 */
int main() {
    const char *pattern = "AGGA";  // Your desired pattern
    unsigned char *text;
    size_t file_size;

    FILE *file = fopen("/content/drive/MyDrive/input_1L.txt", "rb");
    if (file == NULL) {
        perror("Could not open input.txt");
        return EXIT_FAILURE;
    }

    // Size the file by seeking to its end. ftell signals failure with -1,
    // which must be caught before the value is stored in an unsigned size_t.
    long end_offset = -1;
    if (fseek(file, 0, SEEK_END) == 0) {
        end_offset = ftell(file);
    }
    if (end_offset < 0 || fseek(file, 0, SEEK_SET) != 0) {
        perror("Could not determine input size");
        fclose(file);
        return EXIT_FAILURE;
    }
    file_size = (size_t)end_offset;

    // malloc(0) is allowed to return NULL; request at least one byte so an
    // empty input is not misreported as an allocation failure.
    text = (unsigned char*)malloc(file_size > 0 ? file_size : 1);
    if (text == NULL) {
        perror("Memory allocation failed");
        fclose(file);
        return EXIT_FAILURE;
    }

    size_t bytes_read = fread(text, 1, file_size, file);
    if (bytes_read != file_size) {
        perror("Error reading file");
        free(text);
        fclose(file);
        return EXIT_FAILURE;
    }
    fclose(file);

    FILE *output_file = fopen("output_1L.txt", "w");
    if (output_file == NULL) {
        perror("Could not open output.txt");
        free(text);
        return EXIT_FAILURE;
    }

    // The events bracket the entire cuda_bitap_search call.
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    cuda_bitap_search(text, file_size, pattern, output_file);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("Time taken: %f seconds\n", milliseconds / 1000.0);

    free(text);
    fclose(output_file);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}


Overwriting 1_lakh.cu


In [27]:
!nvcc -o 1_lakh 1_lakh.cu
!./1_lakh

Time taken: 0.001163 seconds


In [13]:
!nvidia-smi


Sun Oct 27 14:32:33 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P8              11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [33]:

%%writefile 10_lakh.cu


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime.h>

// Longest pattern accepted by cuda_bitap_search; the search state is held in
// one unsigned long long.
#define MAX_PATTERN_LENGTH 64
// One mask entry per possible unsigned char value of a text byte.
#define ALPHABET_SIZE 256
#define OPTIMIZED_BLOCK_SIZE 512   // Threads per block used for the kernel launch
#define OPTIMIZED_CHUNK_SIZE 64   // Consecutive text bytes assigned to each thread

// Per-character pattern masks, filled from the host with cudaMemcpyToSymbol:
// bit i of d_pattern_mask[c] is cleared when pattern[i] == c, set otherwise.
__constant__ unsigned long long d_pattern_mask[ALPHABET_SIZE];

/*
 * Exact-match Bitap (shift-or) scan of `text` against the pattern encoded in
 * d_pattern_mask.
 *
 * Launch layout: 1-D grid of 1-D blocks. Thread t owns the text bytes
 * [t * OPTIMIZED_CHUNK_SIZE, (t + 1) * OPTIMIZED_CHUNK_SIZE), clipped to
 * text_length, so the grid must supply at least
 * ceil(text_length / OPTIMIZED_CHUNK_SIZE) threads. Surplus threads exit
 * immediately. No shared memory is used.
 *
 * text           - device pointer to text_length bytes.
 * text_length    - number of bytes in text.
 * pattern_length - pattern length in characters, 1..64 (the state is a single
 *                  64-bit word); other values make the kernel do nothing.
 * results        - device array of text_length ints. results[i] is set to 1
 *                  when an occurrence of the pattern ENDS at text index i.
 *                  Other entries are not written, so the caller must zero the
 *                  array beforehand.
 */
__global__ void bitap_search_kernel(const unsigned char *text, size_t text_length, int pattern_length, int *results) {
    // Reject lengths for which the shifts below would be undefined.
    if (pattern_length < 1 || pattern_length > 64) {
        return;
    }

    // Widen before multiplying so the global index cannot wrap in 32-bit math.
    size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t start = tid * OPTIMIZED_CHUNK_SIZE;
    if (start >= text_length) {
        return;
    }
    size_t end = min(start + OPTIMIZED_CHUNK_SIZE, text_length);

    // An occurrence ending inside this chunk may begin up to
    // pattern_length - 1 bytes before it. Replay those bytes first so the
    // state is correct at `start`; otherwise matches that straddle a chunk
    // boundary would be lost.
    size_t overlap = (size_t)(pattern_length - 1);
    size_t scan_from = (start > overlap) ? start - overlap : 0;

    unsigned long long R = ~0ULL;
    unsigned long long match = 1ULL << (pattern_length - 1);

    for (size_t i = scan_from; i < end; ++i) {
        // Bits above pattern_length - 1 never influence the tested bit
        // (information only moves upward through the shift), so no
        // truncating mask is applied.
        R = (R << 1) | d_pattern_mask[text[i]];
        // Only report inside this thread's own chunk; the warm-up bytes
        // belong to the previous thread.
        if (i >= start && (R & match) == 0) {
            results[i] = 1;
        }
    }
}

/*
 * Finds every exact occurrence of `pattern` in `h_text` on the GPU and writes
 * the report to `output_file`.
 *
 * h_text      - host buffer holding text_length bytes.
 * text_length - number of bytes in h_text.
 * pattern     - NUL-terminated pattern of 1..MAX_PATTERN_LENGTH characters.
 * output_file - open stream that receives one
 *               "Pattern found at position: <i>" line per hit, where i is the
 *               index flagged by bitap_search_kernel (the last character of
 *               the occurrence), or "No match found." when there is none, or
 *               the pattern-length complaint.
 *
 * A failed host allocation or CUDA call is reported on stderr and nothing is
 * written to output_file for that run. Every buffer acquired here is released
 * before returning.
 */
void cuda_bitap_search(const unsigned char *h_text, size_t text_length, const char *pattern, FILE *output_file) {
    size_t raw_length = strlen(pattern);
    if (raw_length == 0 || raw_length > MAX_PATTERN_LENGTH) {
        fprintf(output_file, "Pattern is empty or too long!\n");
        return;
    }
    int pattern_length = (int)raw_length;

    // An empty text cannot contain the pattern, and a zero-block launch would
    // be rejected as an invalid configuration, so answer without the GPU.
    if (text_length == 0) {
        fprintf(output_file, "No match found.\n");
        return;
    }

    // Shift-or masks: bit i of h_pattern_mask[c] is 0 exactly when pattern[i] == c.
    unsigned long long h_pattern_mask[ALPHABET_SIZE];
    for (int i = 0; i < ALPHABET_SIZE; ++i)
        h_pattern_mask[i] = ~0ULL;
    for (int i = 0; i < pattern_length; ++i)
        h_pattern_mask[(unsigned char)pattern[i]] &= ~(1ULL << i);

    int *h_results = (int*)malloc(text_length * sizeof(int));
    if (h_results == NULL) {
        fprintf(stderr, "Could not allocate host result buffer\n");
        return;
    }

    unsigned char *d_text = NULL;
    int *d_results = NULL;

    // Device steps run in order; the first failure skips everything after it.
    cudaError_t status = cudaMemcpyToSymbol(d_pattern_mask, h_pattern_mask, ALPHABET_SIZE * sizeof(unsigned long long));
    if (status == cudaSuccess)
        status = cudaMalloc((void**)&d_text, text_length * sizeof(unsigned char));
    if (status == cudaSuccess)
        status = cudaMalloc((void**)&d_results, text_length * sizeof(int));
    if (status == cudaSuccess)
        status = cudaMemcpy(d_text, h_text, text_length * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (status == cudaSuccess)
        status = cudaMemset(d_results, 0, text_length * sizeof(int));
    if (status == cudaSuccess) {
        // One thread per OPTIMIZED_CHUNK_SIZE bytes, rounded up to whole blocks.
        int numBlocks = (text_length + OPTIMIZED_BLOCK_SIZE * OPTIMIZED_CHUNK_SIZE - 1) / (OPTIMIZED_BLOCK_SIZE * OPTIMIZED_CHUNK_SIZE);
        bitap_search_kernel<<<numBlocks, OPTIMIZED_BLOCK_SIZE>>>(d_text, text_length, pattern_length, d_results);
        // A launch does not return a status; configuration errors surface here.
        status = cudaGetLastError();
    }
    if (status == cudaSuccess) {
        // Blocking copy: it completes after the kernel and also returns any
        // error raised while the kernel was executing.
        status = cudaMemcpy(h_results, d_results, text_length * sizeof(int), cudaMemcpyDeviceToHost);
    }

    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in cuda_bitap_search: %s\n", cudaGetErrorString(status));
    } else {
        int found = 0;
        for (size_t i = 0; i < text_length; ++i) {
            if (h_results[i]) {
                fprintf(output_file, "Pattern found at position: %zu\n", i);
                found = 1;
            }
        }

        if (!found) {
            fprintf(output_file, "No match found.\n");
        }
    }

    // cudaFree accepts NULL, so this is safe after a partial setup.
    cudaFree(d_text);
    cudaFree(d_results);
    free(h_results);
}

/*
 * Benchmark driver: loads the whole input file into host memory, runs
 * cuda_bitap_search for a fixed pattern with the report going to an output
 * file, and prints the time between two CUDA events recorded immediately
 * before and after that call.
 *
 * Returns EXIT_FAILURE if the input cannot be opened, sized, buffered or read
 * in full, or if the output file cannot be created; 0 otherwise.
 */
int main() {
    const char *pattern = "AGGA";  // Your desired pattern
    unsigned char *text;
    size_t file_size;

    FILE *file = fopen("/content/drive/MyDrive/input_10l.txt", "rb");
    if (file == NULL) {
        perror("Could not open input.txt");
        return EXIT_FAILURE;
    }

    // Size the file by seeking to its end. ftell signals failure with -1,
    // which must be caught before the value is stored in an unsigned size_t.
    long end_offset = -1;
    if (fseek(file, 0, SEEK_END) == 0) {
        end_offset = ftell(file);
    }
    if (end_offset < 0 || fseek(file, 0, SEEK_SET) != 0) {
        perror("Could not determine input size");
        fclose(file);
        return EXIT_FAILURE;
    }
    file_size = (size_t)end_offset;

    // malloc(0) is allowed to return NULL; request at least one byte so an
    // empty input is not misreported as an allocation failure.
    text = (unsigned char*)malloc(file_size > 0 ? file_size : 1);
    if (text == NULL) {
        perror("Memory allocation failed");
        fclose(file);
        return EXIT_FAILURE;
    }

    size_t bytes_read = fread(text, 1, file_size, file);
    if (bytes_read != file_size) {
        perror("Error reading file");
        free(text);
        fclose(file);
        return EXIT_FAILURE;
    }
    fclose(file);

    FILE *output_file = fopen("output_10l.txt", "w");
    if (output_file == NULL) {
        perror("Could not open output.txt");
        free(text);
        return EXIT_FAILURE;
    }

    // The events bracket the entire cuda_bitap_search call.
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    cuda_bitap_search(text, file_size, pattern, output_file);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("Time taken: %f seconds\n", milliseconds / 1000.0);

    free(text);
    fclose(output_file);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}


Overwriting 10_lakh.cu


In [34]:
!nvcc -o 10_lakh 10_lakh.cu
!./10_lakh

Time taken: 0.006987 seconds


In [38]:

%%writefile 1_thousand.cu

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime.h>

// Longest pattern accepted by cuda_bitap_search; the search state is held in
// one unsigned long long.
#define MAX_PATTERN_LENGTH 64
// One mask entry per possible unsigned char value of a text byte.
#define ALPHABET_SIZE 256
#define OPTIMIZED_BLOCK_SIZE 512   // Threads per block used for the kernel launch
#define OPTIMIZED_CHUNK_SIZE 64   // Consecutive text bytes assigned to each thread

// Per-character pattern masks, filled from the host with cudaMemcpyToSymbol:
// bit i of d_pattern_mask[c] is cleared when pattern[i] == c, set otherwise.
__constant__ unsigned long long d_pattern_mask[ALPHABET_SIZE];

/*
 * Exact-match Bitap (shift-or) scan of `text` against the pattern encoded in
 * d_pattern_mask.
 *
 * Launch layout: 1-D grid of 1-D blocks. Thread t owns the text bytes
 * [t * OPTIMIZED_CHUNK_SIZE, (t + 1) * OPTIMIZED_CHUNK_SIZE), clipped to
 * text_length, so the grid must supply at least
 * ceil(text_length / OPTIMIZED_CHUNK_SIZE) threads. Surplus threads exit
 * immediately. No shared memory is used.
 *
 * text           - device pointer to text_length bytes.
 * text_length    - number of bytes in text.
 * pattern_length - pattern length in characters, 1..64 (the state is a single
 *                  64-bit word); other values make the kernel do nothing.
 * results        - device array of text_length ints. results[i] is set to 1
 *                  when an occurrence of the pattern ENDS at text index i.
 *                  Other entries are not written, so the caller must zero the
 *                  array beforehand.
 */
__global__ void bitap_search_kernel(const unsigned char *text, size_t text_length, int pattern_length, int *results) {
    // Reject lengths for which the shifts below would be undefined.
    if (pattern_length < 1 || pattern_length > 64) {
        return;
    }

    // Widen before multiplying so the global index cannot wrap in 32-bit math.
    size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t start = tid * OPTIMIZED_CHUNK_SIZE;
    if (start >= text_length) {
        return;
    }
    size_t end = min(start + OPTIMIZED_CHUNK_SIZE, text_length);

    // An occurrence ending inside this chunk may begin up to
    // pattern_length - 1 bytes before it. Replay those bytes first so the
    // state is correct at `start`; otherwise matches that straddle a chunk
    // boundary would be lost.
    size_t overlap = (size_t)(pattern_length - 1);
    size_t scan_from = (start > overlap) ? start - overlap : 0;

    unsigned long long R = ~0ULL;
    unsigned long long match = 1ULL << (pattern_length - 1);

    for (size_t i = scan_from; i < end; ++i) {
        // Bits above pattern_length - 1 never influence the tested bit
        // (information only moves upward through the shift), so no
        // truncating mask is applied.
        R = (R << 1) | d_pattern_mask[text[i]];
        // Only report inside this thread's own chunk; the warm-up bytes
        // belong to the previous thread.
        if (i >= start && (R & match) == 0) {
            results[i] = 1;
        }
    }
}

/*
 * Finds every exact occurrence of `pattern` in `h_text` on the GPU and writes
 * the report to `output_file`.
 *
 * h_text      - host buffer holding text_length bytes.
 * text_length - number of bytes in h_text.
 * pattern     - NUL-terminated pattern of 1..MAX_PATTERN_LENGTH characters.
 * output_file - open stream that receives one
 *               "Pattern found at position: <i>" line per hit, where i is the
 *               index flagged by bitap_search_kernel (the last character of
 *               the occurrence), or "No match found." when there is none, or
 *               the pattern-length complaint.
 *
 * A failed host allocation or CUDA call is reported on stderr and nothing is
 * written to output_file for that run. Every buffer acquired here is released
 * before returning.
 */
void cuda_bitap_search(const unsigned char *h_text, size_t text_length, const char *pattern, FILE *output_file) {
    size_t raw_length = strlen(pattern);
    if (raw_length == 0 || raw_length > MAX_PATTERN_LENGTH) {
        fprintf(output_file, "Pattern is empty or too long!\n");
        return;
    }
    int pattern_length = (int)raw_length;

    // An empty text cannot contain the pattern, and a zero-block launch would
    // be rejected as an invalid configuration, so answer without the GPU.
    if (text_length == 0) {
        fprintf(output_file, "No match found.\n");
        return;
    }

    // Shift-or masks: bit i of h_pattern_mask[c] is 0 exactly when pattern[i] == c.
    unsigned long long h_pattern_mask[ALPHABET_SIZE];
    for (int i = 0; i < ALPHABET_SIZE; ++i)
        h_pattern_mask[i] = ~0ULL;
    for (int i = 0; i < pattern_length; ++i)
        h_pattern_mask[(unsigned char)pattern[i]] &= ~(1ULL << i);

    int *h_results = (int*)malloc(text_length * sizeof(int));
    if (h_results == NULL) {
        fprintf(stderr, "Could not allocate host result buffer\n");
        return;
    }

    unsigned char *d_text = NULL;
    int *d_results = NULL;

    // Device steps run in order; the first failure skips everything after it.
    cudaError_t status = cudaMemcpyToSymbol(d_pattern_mask, h_pattern_mask, ALPHABET_SIZE * sizeof(unsigned long long));
    if (status == cudaSuccess)
        status = cudaMalloc((void**)&d_text, text_length * sizeof(unsigned char));
    if (status == cudaSuccess)
        status = cudaMalloc((void**)&d_results, text_length * sizeof(int));
    if (status == cudaSuccess)
        status = cudaMemcpy(d_text, h_text, text_length * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (status == cudaSuccess)
        status = cudaMemset(d_results, 0, text_length * sizeof(int));
    if (status == cudaSuccess) {
        // One thread per OPTIMIZED_CHUNK_SIZE bytes, rounded up to whole blocks.
        int numBlocks = (text_length + OPTIMIZED_BLOCK_SIZE * OPTIMIZED_CHUNK_SIZE - 1) / (OPTIMIZED_BLOCK_SIZE * OPTIMIZED_CHUNK_SIZE);
        bitap_search_kernel<<<numBlocks, OPTIMIZED_BLOCK_SIZE>>>(d_text, text_length, pattern_length, d_results);
        // A launch does not return a status; configuration errors surface here.
        status = cudaGetLastError();
    }
    if (status == cudaSuccess) {
        // Blocking copy: it completes after the kernel and also returns any
        // error raised while the kernel was executing.
        status = cudaMemcpy(h_results, d_results, text_length * sizeof(int), cudaMemcpyDeviceToHost);
    }

    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in cuda_bitap_search: %s\n", cudaGetErrorString(status));
    } else {
        int found = 0;
        for (size_t i = 0; i < text_length; ++i) {
            if (h_results[i]) {
                fprintf(output_file, "Pattern found at position: %zu\n", i);
                found = 1;
            }
        }

        if (!found) {
            fprintf(output_file, "No match found.\n");
        }
    }

    // cudaFree accepts NULL, so this is safe after a partial setup.
    cudaFree(d_text);
    cudaFree(d_results);
    free(h_results);
}

/*
 * Benchmark driver: loads the whole input file into host memory, runs
 * cuda_bitap_search for a fixed pattern with the report going to an output
 * file, and prints the time between two CUDA events recorded immediately
 * before and after that call.
 *
 * Returns EXIT_FAILURE if the input cannot be opened, sized, buffered or read
 * in full, or if the output file cannot be created; 0 otherwise.
 */
int main() {
    const char *pattern = "AGGA";  // Your desired pattern
    unsigned char *text;
    size_t file_size;

    FILE *file = fopen("/content/drive/MyDrive/input_1k.txt", "rb");
    if (file == NULL) {
        perror("Could not open input.txt");
        return EXIT_FAILURE;
    }

    // Size the file by seeking to its end. ftell signals failure with -1,
    // which must be caught before the value is stored in an unsigned size_t.
    long end_offset = -1;
    if (fseek(file, 0, SEEK_END) == 0) {
        end_offset = ftell(file);
    }
    if (end_offset < 0 || fseek(file, 0, SEEK_SET) != 0) {
        perror("Could not determine input size");
        fclose(file);
        return EXIT_FAILURE;
    }
    file_size = (size_t)end_offset;

    // malloc(0) is allowed to return NULL; request at least one byte so an
    // empty input is not misreported as an allocation failure.
    text = (unsigned char*)malloc(file_size > 0 ? file_size : 1);
    if (text == NULL) {
        perror("Memory allocation failed");
        fclose(file);
        return EXIT_FAILURE;
    }

    size_t bytes_read = fread(text, 1, file_size, file);
    if (bytes_read != file_size) {
        perror("Error reading file");
        free(text);
        fclose(file);
        return EXIT_FAILURE;
    }
    fclose(file);

    FILE *output_file = fopen("output_1k.txt", "w");
    if (output_file == NULL) {
        perror("Could not open output.txt");
        free(text);
        return EXIT_FAILURE;
    }

    // The events bracket the entire cuda_bitap_search call.
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    cuda_bitap_search(text, file_size, pattern, output_file);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("Time taken: %f seconds\n", milliseconds / 1000.0);

    free(text);
    fclose(output_file);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}


Writing 1_thousand.cu


In [39]:
!nvcc -o 1_thousand 1_thousand.cu
!./1_thousand

Time taken: 0.000598 seconds


In [50]:
%%writefile 10thousand.cu

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime.h>

// Longest pattern accepted by cuda_bitap_search; the search state is held in
// one unsigned long long.
#define MAX_PATTERN_LENGTH 64
// One mask entry per possible unsigned char value of a text byte.
#define ALPHABET_SIZE 256
#define OPTIMIZED_BLOCK_SIZE 512   // Threads per block used for the kernel launch
#define OPTIMIZED_CHUNK_SIZE 64   // Consecutive text bytes assigned to each thread

// Per-character pattern masks, filled from the host with cudaMemcpyToSymbol:
// bit i of d_pattern_mask[c] is cleared when pattern[i] == c, set otherwise.
__constant__ unsigned long long d_pattern_mask[ALPHABET_SIZE];

/*
 * Exact-match Bitap (shift-or) scan of `text` against the pattern encoded in
 * d_pattern_mask.
 *
 * Launch layout: 1-D grid of 1-D blocks. Thread t owns the text bytes
 * [t * OPTIMIZED_CHUNK_SIZE, (t + 1) * OPTIMIZED_CHUNK_SIZE), clipped to
 * text_length, so the grid must supply at least
 * ceil(text_length / OPTIMIZED_CHUNK_SIZE) threads. Surplus threads exit
 * immediately. No shared memory is used.
 *
 * text           - device pointer to text_length bytes.
 * text_length    - number of bytes in text.
 * pattern_length - pattern length in characters, 1..64 (the state is a single
 *                  64-bit word); other values make the kernel do nothing.
 * results        - device array of text_length ints. results[i] is set to 1
 *                  when an occurrence of the pattern ENDS at text index i.
 *                  Other entries are not written, so the caller must zero the
 *                  array beforehand.
 */
__global__ void bitap_search_kernel(const unsigned char *text, size_t text_length, int pattern_length, int *results) {
    // Reject lengths for which the shifts below would be undefined.
    if (pattern_length < 1 || pattern_length > 64) {
        return;
    }

    // Widen before multiplying so the global index cannot wrap in 32-bit math.
    size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t start = tid * OPTIMIZED_CHUNK_SIZE;
    if (start >= text_length) {
        return;
    }
    size_t end = min(start + OPTIMIZED_CHUNK_SIZE, text_length);

    // An occurrence ending inside this chunk may begin up to
    // pattern_length - 1 bytes before it. Replay those bytes first so the
    // state is correct at `start`; otherwise matches that straddle a chunk
    // boundary would be lost.
    size_t overlap = (size_t)(pattern_length - 1);
    size_t scan_from = (start > overlap) ? start - overlap : 0;

    unsigned long long R = ~0ULL;
    unsigned long long match = 1ULL << (pattern_length - 1);

    for (size_t i = scan_from; i < end; ++i) {
        // Bits above pattern_length - 1 never influence the tested bit
        // (information only moves upward through the shift), so no
        // truncating mask is applied.
        R = (R << 1) | d_pattern_mask[text[i]];
        // Only report inside this thread's own chunk; the warm-up bytes
        // belong to the previous thread.
        if (i >= start && (R & match) == 0) {
            results[i] = 1;
        }
    }
}

/*
 * Finds every exact occurrence of `pattern` in `h_text` on the GPU and writes
 * the report to `output_file`.
 *
 * h_text      - host buffer holding text_length bytes.
 * text_length - number of bytes in h_text.
 * pattern     - NUL-terminated pattern of 1..MAX_PATTERN_LENGTH characters.
 * output_file - open stream that receives one
 *               "Pattern found at position: <i>" line per hit, where i is the
 *               index flagged by bitap_search_kernel (the last character of
 *               the occurrence), or "No match found." when there is none, or
 *               the pattern-length complaint.
 *
 * A failed host allocation or CUDA call is reported on stderr and nothing is
 * written to output_file for that run. Every buffer acquired here is released
 * before returning.
 */
void cuda_bitap_search(const unsigned char *h_text, size_t text_length, const char *pattern, FILE *output_file) {
    size_t raw_length = strlen(pattern);
    if (raw_length == 0 || raw_length > MAX_PATTERN_LENGTH) {
        fprintf(output_file, "Pattern is empty or too long!\n");
        return;
    }
    int pattern_length = (int)raw_length;

    // An empty text cannot contain the pattern, and a zero-block launch would
    // be rejected as an invalid configuration, so answer without the GPU.
    if (text_length == 0) {
        fprintf(output_file, "No match found.\n");
        return;
    }

    // Shift-or masks: bit i of h_pattern_mask[c] is 0 exactly when pattern[i] == c.
    unsigned long long h_pattern_mask[ALPHABET_SIZE];
    for (int i = 0; i < ALPHABET_SIZE; ++i)
        h_pattern_mask[i] = ~0ULL;
    for (int i = 0; i < pattern_length; ++i)
        h_pattern_mask[(unsigned char)pattern[i]] &= ~(1ULL << i);

    int *h_results = (int*)malloc(text_length * sizeof(int));
    if (h_results == NULL) {
        fprintf(stderr, "Could not allocate host result buffer\n");
        return;
    }

    unsigned char *d_text = NULL;
    int *d_results = NULL;

    // Device steps run in order; the first failure skips everything after it.
    cudaError_t status = cudaMemcpyToSymbol(d_pattern_mask, h_pattern_mask, ALPHABET_SIZE * sizeof(unsigned long long));
    if (status == cudaSuccess)
        status = cudaMalloc((void**)&d_text, text_length * sizeof(unsigned char));
    if (status == cudaSuccess)
        status = cudaMalloc((void**)&d_results, text_length * sizeof(int));
    if (status == cudaSuccess)
        status = cudaMemcpy(d_text, h_text, text_length * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (status == cudaSuccess)
        status = cudaMemset(d_results, 0, text_length * sizeof(int));
    if (status == cudaSuccess) {
        // One thread per OPTIMIZED_CHUNK_SIZE bytes, rounded up to whole blocks.
        int numBlocks = (text_length + OPTIMIZED_BLOCK_SIZE * OPTIMIZED_CHUNK_SIZE - 1) / (OPTIMIZED_BLOCK_SIZE * OPTIMIZED_CHUNK_SIZE);
        bitap_search_kernel<<<numBlocks, OPTIMIZED_BLOCK_SIZE>>>(d_text, text_length, pattern_length, d_results);
        // A launch does not return a status; configuration errors surface here.
        status = cudaGetLastError();
    }
    if (status == cudaSuccess) {
        // Blocking copy: it completes after the kernel and also returns any
        // error raised while the kernel was executing.
        status = cudaMemcpy(h_results, d_results, text_length * sizeof(int), cudaMemcpyDeviceToHost);
    }

    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in cuda_bitap_search: %s\n", cudaGetErrorString(status));
    } else {
        int found = 0;
        for (size_t i = 0; i < text_length; ++i) {
            if (h_results[i]) {
                fprintf(output_file, "Pattern found at position: %zu\n", i);
                found = 1;
            }
        }

        if (!found) {
            fprintf(output_file, "No match found.\n");
        }
    }

    // cudaFree accepts NULL, so this is safe after a partial setup.
    cudaFree(d_text);
    cudaFree(d_results);
    free(h_results);
}

/*
 * Benchmark driver: loads the whole input file into host memory, runs
 * cuda_bitap_search for a fixed pattern with the report going to an output
 * file, and prints the time between two CUDA events recorded immediately
 * before and after that call.
 *
 * Returns EXIT_FAILURE if the input cannot be opened, sized, buffered or read
 * in full, or if the output file cannot be created; 0 otherwise.
 */
int main() {
    const char *pattern = "AGGA";  // Your desired pattern
    unsigned char *text;
    size_t file_size;

    FILE *file = fopen("/content/drive/MyDrive/input_10k.txt", "rb");
    if (file == NULL) {
        perror("Could not open input.txt");
        return EXIT_FAILURE;
    }

    // Size the file by seeking to its end. ftell signals failure with -1,
    // which must be caught before the value is stored in an unsigned size_t.
    long end_offset = -1;
    if (fseek(file, 0, SEEK_END) == 0) {
        end_offset = ftell(file);
    }
    if (end_offset < 0 || fseek(file, 0, SEEK_SET) != 0) {
        perror("Could not determine input size");
        fclose(file);
        return EXIT_FAILURE;
    }
    file_size = (size_t)end_offset;

    // malloc(0) is allowed to return NULL; request at least one byte so an
    // empty input is not misreported as an allocation failure.
    text = (unsigned char*)malloc(file_size > 0 ? file_size : 1);
    if (text == NULL) {
        perror("Memory allocation failed");
        fclose(file);
        return EXIT_FAILURE;
    }

    size_t bytes_read = fread(text, 1, file_size, file);
    if (bytes_read != file_size) {
        perror("Error reading file");
        free(text);
        fclose(file);
        return EXIT_FAILURE;
    }
    fclose(file);

    FILE *output_file = fopen("output_10k.txt", "w");
    if (output_file == NULL) {
        perror("Could not open output.txt");
        free(text);
        return EXIT_FAILURE;
    }

    // The events bracket the entire cuda_bitap_search call.
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    cuda_bitap_search(text, file_size, pattern, output_file);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("Time taken: %f seconds\n", milliseconds / 1000.0);

    free(text);
    fclose(output_file);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}


Overwriting 10thousand.cu


In [51]:
!nvcc -o 10thousand 10thousand.cu
!./10thousand

Time taken: 0.000640 seconds


In [42]:
%%writefile 25k.cu

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime.h>

// Longest pattern accepted by cuda_bitap_search; the search state is held in
// one unsigned long long.
#define MAX_PATTERN_LENGTH 64
// One mask entry per possible unsigned char value of a text byte.
#define ALPHABET_SIZE 256
#define OPTIMIZED_BLOCK_SIZE 512   // Threads per block used for the kernel launch
#define OPTIMIZED_CHUNK_SIZE 64   // Consecutive text bytes assigned to each thread

// Per-character pattern masks, filled from the host with cudaMemcpyToSymbol:
// bit i of d_pattern_mask[c] is cleared when pattern[i] == c, set otherwise.
__constant__ unsigned long long d_pattern_mask[ALPHABET_SIZE];

/*
 * Exact-match Bitap (shift-or) scan of `text` against the pattern encoded in
 * d_pattern_mask.
 *
 * Launch layout: 1-D grid of 1-D blocks. Thread t owns the text bytes
 * [t * OPTIMIZED_CHUNK_SIZE, (t + 1) * OPTIMIZED_CHUNK_SIZE), clipped to
 * text_length, so the grid must supply at least
 * ceil(text_length / OPTIMIZED_CHUNK_SIZE) threads. Surplus threads exit
 * immediately. No shared memory is used.
 *
 * text           - device pointer to text_length bytes.
 * text_length    - number of bytes in text.
 * pattern_length - pattern length in characters, 1..64 (the state is a single
 *                  64-bit word); other values make the kernel do nothing.
 * results        - device array of text_length ints. results[i] is set to 1
 *                  when an occurrence of the pattern ENDS at text index i.
 *                  Other entries are not written, so the caller must zero the
 *                  array beforehand.
 */
__global__ void bitap_search_kernel(const unsigned char *text, size_t text_length, int pattern_length, int *results) {
    // Reject lengths for which the shifts below would be undefined.
    if (pattern_length < 1 || pattern_length > 64) {
        return;
    }

    // Widen before multiplying so the global index cannot wrap in 32-bit math.
    size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t start = tid * OPTIMIZED_CHUNK_SIZE;
    if (start >= text_length) {
        return;
    }
    size_t end = min(start + OPTIMIZED_CHUNK_SIZE, text_length);

    // An occurrence ending inside this chunk may begin up to
    // pattern_length - 1 bytes before it. Replay those bytes first so the
    // state is correct at `start`; otherwise matches that straddle a chunk
    // boundary would be lost.
    size_t overlap = (size_t)(pattern_length - 1);
    size_t scan_from = (start > overlap) ? start - overlap : 0;

    unsigned long long R = ~0ULL;
    unsigned long long match = 1ULL << (pattern_length - 1);

    for (size_t i = scan_from; i < end; ++i) {
        // Bits above pattern_length - 1 never influence the tested bit
        // (information only moves upward through the shift), so no
        // truncating mask is applied.
        R = (R << 1) | d_pattern_mask[text[i]];
        // Only report inside this thread's own chunk; the warm-up bytes
        // belong to the previous thread.
        if (i >= start && (R & match) == 0) {
            results[i] = 1;
        }
    }
}

/*
 * Finds every exact occurrence of `pattern` in `h_text` on the GPU and writes
 * the report to `output_file`.
 *
 * h_text      - host buffer holding text_length bytes.
 * text_length - number of bytes in h_text.
 * pattern     - NUL-terminated pattern of 1..MAX_PATTERN_LENGTH characters.
 * output_file - open stream that receives one
 *               "Pattern found at position: <i>" line per hit, where i is the
 *               index flagged by bitap_search_kernel (the last character of
 *               the occurrence), or "No match found." when there is none, or
 *               the pattern-length complaint.
 *
 * A failed host allocation or CUDA call is reported on stderr and nothing is
 * written to output_file for that run. Every buffer acquired here is released
 * before returning.
 */
void cuda_bitap_search(const unsigned char *h_text, size_t text_length, const char *pattern, FILE *output_file) {
    size_t raw_length = strlen(pattern);
    if (raw_length == 0 || raw_length > MAX_PATTERN_LENGTH) {
        fprintf(output_file, "Pattern is empty or too long!\n");
        return;
    }
    int pattern_length = (int)raw_length;

    // An empty text cannot contain the pattern, and a zero-block launch would
    // be rejected as an invalid configuration, so answer without the GPU.
    if (text_length == 0) {
        fprintf(output_file, "No match found.\n");
        return;
    }

    // Shift-or masks: bit i of h_pattern_mask[c] is 0 exactly when pattern[i] == c.
    unsigned long long h_pattern_mask[ALPHABET_SIZE];
    for (int i = 0; i < ALPHABET_SIZE; ++i)
        h_pattern_mask[i] = ~0ULL;
    for (int i = 0; i < pattern_length; ++i)
        h_pattern_mask[(unsigned char)pattern[i]] &= ~(1ULL << i);

    int *h_results = (int*)malloc(text_length * sizeof(int));
    if (h_results == NULL) {
        fprintf(stderr, "Could not allocate host result buffer\n");
        return;
    }

    unsigned char *d_text = NULL;
    int *d_results = NULL;

    // Device steps run in order; the first failure skips everything after it.
    cudaError_t status = cudaMemcpyToSymbol(d_pattern_mask, h_pattern_mask, ALPHABET_SIZE * sizeof(unsigned long long));
    if (status == cudaSuccess)
        status = cudaMalloc((void**)&d_text, text_length * sizeof(unsigned char));
    if (status == cudaSuccess)
        status = cudaMalloc((void**)&d_results, text_length * sizeof(int));
    if (status == cudaSuccess)
        status = cudaMemcpy(d_text, h_text, text_length * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (status == cudaSuccess)
        status = cudaMemset(d_results, 0, text_length * sizeof(int));
    if (status == cudaSuccess) {
        // One thread per OPTIMIZED_CHUNK_SIZE bytes, rounded up to whole blocks.
        int numBlocks = (text_length + OPTIMIZED_BLOCK_SIZE * OPTIMIZED_CHUNK_SIZE - 1) / (OPTIMIZED_BLOCK_SIZE * OPTIMIZED_CHUNK_SIZE);
        bitap_search_kernel<<<numBlocks, OPTIMIZED_BLOCK_SIZE>>>(d_text, text_length, pattern_length, d_results);
        // A launch does not return a status; configuration errors surface here.
        status = cudaGetLastError();
    }
    if (status == cudaSuccess) {
        // Blocking copy: it completes after the kernel and also returns any
        // error raised while the kernel was executing.
        status = cudaMemcpy(h_results, d_results, text_length * sizeof(int), cudaMemcpyDeviceToHost);
    }

    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in cuda_bitap_search: %s\n", cudaGetErrorString(status));
    } else {
        int found = 0;
        for (size_t i = 0; i < text_length; ++i) {
            if (h_results[i]) {
                fprintf(output_file, "Pattern found at position: %zu\n", i);
                found = 1;
            }
        }

        if (!found) {
            fprintf(output_file, "No match found.\n");
        }
    }

    // cudaFree accepts NULL, so this is safe after a partial setup.
    cudaFree(d_text);
    cudaFree(d_results);
    free(h_results);
}

/*
 * Benchmark driver: loads the whole input file into host memory, runs
 * cuda_bitap_search for a fixed pattern with the report going to an output
 * file, and prints the time between two CUDA events recorded immediately
 * before and after that call.
 *
 * Returns EXIT_FAILURE if the input cannot be opened, sized, buffered or read
 * in full, or if the output file cannot be created; 0 otherwise.
 */
int main() {
    const char *pattern = "AGGA";  // Your desired pattern
    unsigned char *text;
    size_t file_size;

    FILE *file = fopen("/content/drive/MyDrive/input25k.txt", "rb");
    if (file == NULL) {
        perror("Could not open input.txt");
        return EXIT_FAILURE;
    }

    // Size the file by seeking to its end. ftell signals failure with -1,
    // which must be caught before the value is stored in an unsigned size_t.
    long end_offset = -1;
    if (fseek(file, 0, SEEK_END) == 0) {
        end_offset = ftell(file);
    }
    if (end_offset < 0 || fseek(file, 0, SEEK_SET) != 0) {
        perror("Could not determine input size");
        fclose(file);
        return EXIT_FAILURE;
    }
    file_size = (size_t)end_offset;

    // malloc(0) is allowed to return NULL; request at least one byte so an
    // empty input is not misreported as an allocation failure.
    text = (unsigned char*)malloc(file_size > 0 ? file_size : 1);
    if (text == NULL) {
        perror("Memory allocation failed");
        fclose(file);
        return EXIT_FAILURE;
    }

    size_t bytes_read = fread(text, 1, file_size, file);
    if (bytes_read != file_size) {
        perror("Error reading file");
        free(text);
        fclose(file);
        return EXIT_FAILURE;
    }
    fclose(file);

    FILE *output_file = fopen("output25k.txt", "w");
    if (output_file == NULL) {
        perror("Could not open output.txt");
        free(text);
        return EXIT_FAILURE;
    }

    // The events bracket the entire cuda_bitap_search call.
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    cuda_bitap_search(text, file_size, pattern, output_file);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("Time taken: %f seconds\n", milliseconds / 1000.0);

    free(text);
    fclose(output_file);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}


Writing 25k.cu


In [43]:
!nvcc -o 25k 25k.cu
!./25k

Time taken: 0.000720 seconds


In [46]:
%%writefile 50k.cu

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime.h>

// Longest pattern accepted by cuda_bitap_search; the search state is held in
// one unsigned long long.
#define MAX_PATTERN_LENGTH 64
// One mask entry per possible unsigned char value of a text byte.
#define ALPHABET_SIZE 256
#define OPTIMIZED_BLOCK_SIZE 512   // Threads per block used for the kernel launch
#define OPTIMIZED_CHUNK_SIZE 64   // Consecutive text bytes assigned to each thread

// Per-character pattern masks, filled from the host with cudaMemcpyToSymbol:
// bit i of d_pattern_mask[c] is cleared when pattern[i] == c, set otherwise.
__constant__ unsigned long long d_pattern_mask[ALPHABET_SIZE];

/*
 * Exact-match Bitap (shift-or) scan of `text` against the pattern encoded in
 * d_pattern_mask.
 *
 * Launch layout: 1-D grid of 1-D blocks. Thread t owns the text bytes
 * [t * OPTIMIZED_CHUNK_SIZE, (t + 1) * OPTIMIZED_CHUNK_SIZE), clipped to
 * text_length, so the grid must supply at least
 * ceil(text_length / OPTIMIZED_CHUNK_SIZE) threads. Surplus threads exit
 * immediately. No shared memory is used.
 *
 * text           - device pointer to text_length bytes.
 * text_length    - number of bytes in text.
 * pattern_length - pattern length in characters, 1..64 (the state is a single
 *                  64-bit word); other values make the kernel do nothing.
 * results        - device array of text_length ints. results[i] is set to 1
 *                  when an occurrence of the pattern ENDS at text index i.
 *                  Other entries are not written, so the caller must zero the
 *                  array beforehand.
 */
__global__ void bitap_search_kernel(const unsigned char *text, size_t text_length, int pattern_length, int *results) {
    // Reject lengths for which the shifts below would be undefined.
    if (pattern_length < 1 || pattern_length > 64) {
        return;
    }

    // Widen before multiplying so the global index cannot wrap in 32-bit math.
    size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t start = tid * OPTIMIZED_CHUNK_SIZE;
    if (start >= text_length) {
        return;
    }
    size_t end = min(start + OPTIMIZED_CHUNK_SIZE, text_length);

    // An occurrence ending inside this chunk may begin up to
    // pattern_length - 1 bytes before it. Replay those bytes first so the
    // state is correct at `start`; otherwise matches that straddle a chunk
    // boundary would be lost.
    size_t overlap = (size_t)(pattern_length - 1);
    size_t scan_from = (start > overlap) ? start - overlap : 0;

    unsigned long long R = ~0ULL;
    unsigned long long match = 1ULL << (pattern_length - 1);

    for (size_t i = scan_from; i < end; ++i) {
        // Bits above pattern_length - 1 never influence the tested bit
        // (information only moves upward through the shift), so no
        // truncating mask is applied.
        R = (R << 1) | d_pattern_mask[text[i]];
        // Only report inside this thread's own chunk; the warm-up bytes
        // belong to the previous thread.
        if (i >= start && (R & match) == 0) {
            results[i] = 1;
        }
    }
}

/*
 * Finds every exact occurrence of `pattern` in `h_text` on the GPU and writes
 * the report to `output_file`.
 *
 * h_text      - host buffer holding text_length bytes.
 * text_length - number of bytes in h_text.
 * pattern     - NUL-terminated pattern of 1..MAX_PATTERN_LENGTH characters.
 * output_file - open stream that receives one
 *               "Pattern found at position: <i>" line per hit, where i is the
 *               index flagged by bitap_search_kernel (the last character of
 *               the occurrence), or "No match found." when there is none, or
 *               the pattern-length complaint.
 *
 * A failed host allocation or CUDA call is reported on stderr and nothing is
 * written to output_file for that run. Every buffer acquired here is released
 * before returning.
 */
void cuda_bitap_search(const unsigned char *h_text, size_t text_length, const char *pattern, FILE *output_file) {
    size_t raw_length = strlen(pattern);
    if (raw_length == 0 || raw_length > MAX_PATTERN_LENGTH) {
        fprintf(output_file, "Pattern is empty or too long!\n");
        return;
    }
    int pattern_length = (int)raw_length;

    // An empty text cannot contain the pattern, and a zero-block launch would
    // be rejected as an invalid configuration, so answer without the GPU.
    if (text_length == 0) {
        fprintf(output_file, "No match found.\n");
        return;
    }

    // Shift-or masks: bit i of h_pattern_mask[c] is 0 exactly when pattern[i] == c.
    unsigned long long h_pattern_mask[ALPHABET_SIZE];
    for (int i = 0; i < ALPHABET_SIZE; ++i)
        h_pattern_mask[i] = ~0ULL;
    for (int i = 0; i < pattern_length; ++i)
        h_pattern_mask[(unsigned char)pattern[i]] &= ~(1ULL << i);

    int *h_results = (int*)malloc(text_length * sizeof(int));
    if (h_results == NULL) {
        fprintf(stderr, "Could not allocate host result buffer\n");
        return;
    }

    unsigned char *d_text = NULL;
    int *d_results = NULL;

    // Device steps run in order; the first failure skips everything after it.
    cudaError_t status = cudaMemcpyToSymbol(d_pattern_mask, h_pattern_mask, ALPHABET_SIZE * sizeof(unsigned long long));
    if (status == cudaSuccess)
        status = cudaMalloc((void**)&d_text, text_length * sizeof(unsigned char));
    if (status == cudaSuccess)
        status = cudaMalloc((void**)&d_results, text_length * sizeof(int));
    if (status == cudaSuccess)
        status = cudaMemcpy(d_text, h_text, text_length * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (status == cudaSuccess)
        status = cudaMemset(d_results, 0, text_length * sizeof(int));
    if (status == cudaSuccess) {
        // One thread per OPTIMIZED_CHUNK_SIZE bytes, rounded up to whole blocks.
        int numBlocks = (text_length + OPTIMIZED_BLOCK_SIZE * OPTIMIZED_CHUNK_SIZE - 1) / (OPTIMIZED_BLOCK_SIZE * OPTIMIZED_CHUNK_SIZE);
        bitap_search_kernel<<<numBlocks, OPTIMIZED_BLOCK_SIZE>>>(d_text, text_length, pattern_length, d_results);
        // A launch does not return a status; configuration errors surface here.
        status = cudaGetLastError();
    }
    if (status == cudaSuccess) {
        // Blocking copy: it completes after the kernel and also returns any
        // error raised while the kernel was executing.
        status = cudaMemcpy(h_results, d_results, text_length * sizeof(int), cudaMemcpyDeviceToHost);
    }

    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in cuda_bitap_search: %s\n", cudaGetErrorString(status));
    } else {
        int found = 0;
        for (size_t i = 0; i < text_length; ++i) {
            if (h_results[i]) {
                fprintf(output_file, "Pattern found at position: %zu\n", i);
                found = 1;
            }
        }

        if (!found) {
            fprintf(output_file, "No match found.\n");
        }
    }

    // cudaFree accepts NULL, so this is safe after a partial setup.
    cudaFree(d_text);
    cudaFree(d_results);
    free(h_results);
}

/*
 * Benchmark driver: loads the whole input file into host memory, runs
 * cuda_bitap_search for a fixed pattern with the report going to an output
 * file, and prints the time between two CUDA events recorded immediately
 * before and after that call.
 *
 * Returns EXIT_FAILURE if the input cannot be opened, sized, buffered or read
 * in full, or if the output file cannot be created; 0 otherwise.
 */
int main() {
    const char *pattern = "AGGA";  // Your desired pattern
    unsigned char *text;
    size_t file_size;

    FILE *file = fopen("/content/drive/MyDrive/input50k.txt", "rb");
    if (file == NULL) {
        perror("Could not open input.txt");
        return EXIT_FAILURE;
    }

    // Size the file by seeking to its end. ftell signals failure with -1,
    // which must be caught before the value is stored in an unsigned size_t.
    long end_offset = -1;
    if (fseek(file, 0, SEEK_END) == 0) {
        end_offset = ftell(file);
    }
    if (end_offset < 0 || fseek(file, 0, SEEK_SET) != 0) {
        perror("Could not determine input size");
        fclose(file);
        return EXIT_FAILURE;
    }
    file_size = (size_t)end_offset;

    // malloc(0) is allowed to return NULL; request at least one byte so an
    // empty input is not misreported as an allocation failure.
    text = (unsigned char*)malloc(file_size > 0 ? file_size : 1);
    if (text == NULL) {
        perror("Memory allocation failed");
        fclose(file);
        return EXIT_FAILURE;
    }

    size_t bytes_read = fread(text, 1, file_size, file);
    if (bytes_read != file_size) {
        perror("Error reading file");
        free(text);
        fclose(file);
        return EXIT_FAILURE;
    }
    fclose(file);

    FILE *output_file = fopen("output50k.txt", "w");
    if (output_file == NULL) {
        perror("Could not open output.txt");
        free(text);
        return EXIT_FAILURE;
    }

    // The events bracket the entire cuda_bitap_search call.
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    cuda_bitap_search(text, file_size, pattern, output_file);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("Time taken: %f seconds\n", milliseconds / 1000.0);

    free(text);
    fclose(output_file);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}

Writing 50k.cu


In [47]:
!nvcc -o 50k 50k.cu
!./50k

Time taken: 0.000888 seconds


In [48]:

%%writefile 75k.cu

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime.h>

// Longest pattern accepted by cuda_bitap_search; the search state is held in
// one unsigned long long.
#define MAX_PATTERN_LENGTH 64
// One mask entry per possible unsigned char value of a text byte.
#define ALPHABET_SIZE 256
#define OPTIMIZED_BLOCK_SIZE 512   // Threads per block used for the kernel launch
#define OPTIMIZED_CHUNK_SIZE 64   // Consecutive text bytes assigned to each thread

// Per-character pattern masks, filled from the host with cudaMemcpyToSymbol:
// bit i of d_pattern_mask[c] is cleared when pattern[i] == c, set otherwise.
__constant__ unsigned long long d_pattern_mask[ALPHABET_SIZE];

/*
 * Exact-match Bitap (shift-or) scan of `text` against the pattern encoded in
 * d_pattern_mask.
 *
 * Launch layout: 1-D grid of 1-D blocks. Thread t owns the text bytes
 * [t * OPTIMIZED_CHUNK_SIZE, (t + 1) * OPTIMIZED_CHUNK_SIZE), clipped to
 * text_length, so the grid must supply at least
 * ceil(text_length / OPTIMIZED_CHUNK_SIZE) threads. Surplus threads exit
 * immediately. No shared memory is used.
 *
 * text           - device pointer to text_length bytes.
 * text_length    - number of bytes in text.
 * pattern_length - pattern length in characters, 1..64 (the state is a single
 *                  64-bit word); other values make the kernel do nothing.
 * results        - device array of text_length ints. results[i] is set to 1
 *                  when an occurrence of the pattern ENDS at text index i.
 *                  Other entries are not written, so the caller must zero the
 *                  array beforehand.
 */
__global__ void bitap_search_kernel(const unsigned char *text, size_t text_length, int pattern_length, int *results) {
    // Reject lengths for which the shifts below would be undefined.
    if (pattern_length < 1 || pattern_length > 64) {
        return;
    }

    // Widen before multiplying so the global index cannot wrap in 32-bit math.
    size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t start = tid * OPTIMIZED_CHUNK_SIZE;
    if (start >= text_length) {
        return;
    }
    size_t end = min(start + OPTIMIZED_CHUNK_SIZE, text_length);

    // An occurrence ending inside this chunk may begin up to
    // pattern_length - 1 bytes before it. Replay those bytes first so the
    // state is correct at `start`; otherwise matches that straddle a chunk
    // boundary would be lost.
    size_t overlap = (size_t)(pattern_length - 1);
    size_t scan_from = (start > overlap) ? start - overlap : 0;

    unsigned long long R = ~0ULL;
    unsigned long long match = 1ULL << (pattern_length - 1);

    for (size_t i = scan_from; i < end; ++i) {
        // Bits above pattern_length - 1 never influence the tested bit
        // (information only moves upward through the shift), so no
        // truncating mask is applied.
        R = (R << 1) | d_pattern_mask[text[i]];
        // Only report inside this thread's own chunk; the warm-up bytes
        // belong to the previous thread.
        if (i >= start && (R & match) == 0) {
            results[i] = 1;
        }
    }
}

/*
 * Finds every exact occurrence of `pattern` in `h_text` on the GPU and writes
 * the report to `output_file`.
 *
 * h_text      - host buffer holding text_length bytes.
 * text_length - number of bytes in h_text.
 * pattern     - NUL-terminated pattern of 1..MAX_PATTERN_LENGTH characters.
 * output_file - open stream that receives one
 *               "Pattern found at position: <i>" line per hit, where i is the
 *               index flagged by bitap_search_kernel (the last character of
 *               the occurrence), or "No match found." when there is none, or
 *               the pattern-length complaint.
 *
 * A failed host allocation or CUDA call is reported on stderr and nothing is
 * written to output_file for that run. Every buffer acquired here is released
 * before returning.
 */
void cuda_bitap_search(const unsigned char *h_text, size_t text_length, const char *pattern, FILE *output_file) {
    size_t raw_length = strlen(pattern);
    if (raw_length == 0 || raw_length > MAX_PATTERN_LENGTH) {
        fprintf(output_file, "Pattern is empty or too long!\n");
        return;
    }
    int pattern_length = (int)raw_length;

    // An empty text cannot contain the pattern, and a zero-block launch would
    // be rejected as an invalid configuration, so answer without the GPU.
    if (text_length == 0) {
        fprintf(output_file, "No match found.\n");
        return;
    }

    // Shift-or masks: bit i of h_pattern_mask[c] is 0 exactly when pattern[i] == c.
    unsigned long long h_pattern_mask[ALPHABET_SIZE];
    for (int i = 0; i < ALPHABET_SIZE; ++i)
        h_pattern_mask[i] = ~0ULL;
    for (int i = 0; i < pattern_length; ++i)
        h_pattern_mask[(unsigned char)pattern[i]] &= ~(1ULL << i);

    int *h_results = (int*)malloc(text_length * sizeof(int));
    if (h_results == NULL) {
        fprintf(stderr, "Could not allocate host result buffer\n");
        return;
    }

    unsigned char *d_text = NULL;
    int *d_results = NULL;

    // Device steps run in order; the first failure skips everything after it.
    cudaError_t status = cudaMemcpyToSymbol(d_pattern_mask, h_pattern_mask, ALPHABET_SIZE * sizeof(unsigned long long));
    if (status == cudaSuccess)
        status = cudaMalloc((void**)&d_text, text_length * sizeof(unsigned char));
    if (status == cudaSuccess)
        status = cudaMalloc((void**)&d_results, text_length * sizeof(int));
    if (status == cudaSuccess)
        status = cudaMemcpy(d_text, h_text, text_length * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (status == cudaSuccess)
        status = cudaMemset(d_results, 0, text_length * sizeof(int));
    if (status == cudaSuccess) {
        // One thread per OPTIMIZED_CHUNK_SIZE bytes, rounded up to whole blocks.
        int numBlocks = (text_length + OPTIMIZED_BLOCK_SIZE * OPTIMIZED_CHUNK_SIZE - 1) / (OPTIMIZED_BLOCK_SIZE * OPTIMIZED_CHUNK_SIZE);
        bitap_search_kernel<<<numBlocks, OPTIMIZED_BLOCK_SIZE>>>(d_text, text_length, pattern_length, d_results);
        // A launch does not return a status; configuration errors surface here.
        status = cudaGetLastError();
    }
    if (status == cudaSuccess) {
        // Blocking copy: it completes after the kernel and also returns any
        // error raised while the kernel was executing.
        status = cudaMemcpy(h_results, d_results, text_length * sizeof(int), cudaMemcpyDeviceToHost);
    }

    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in cuda_bitap_search: %s\n", cudaGetErrorString(status));
    } else {
        int found = 0;
        for (size_t i = 0; i < text_length; ++i) {
            if (h_results[i]) {
                fprintf(output_file, "Pattern found at position: %zu\n", i);
                found = 1;
            }
        }

        if (!found) {
            fprintf(output_file, "No match found.\n");
        }
    }

    // cudaFree accepts NULL, so this is safe after a partial setup.
    cudaFree(d_text);
    cudaFree(d_results);
    free(h_results);
}

/*
 * Benchmark driver: loads the whole input file into host memory, runs
 * cuda_bitap_search for a fixed pattern with the report going to an output
 * file, and prints the time between two CUDA events recorded immediately
 * before and after that call.
 *
 * Returns EXIT_FAILURE if the input cannot be opened, sized, buffered or read
 * in full, or if the output file cannot be created; 0 otherwise.
 */
int main() {
    const char *pattern = "AGGA";  // Your desired pattern
    unsigned char *text;
    size_t file_size;

    FILE *file = fopen("/content/drive/MyDrive/input75k.txt", "rb");
    if (file == NULL) {
        perror("Could not open input.txt");
        return EXIT_FAILURE;
    }

    // Size the file by seeking to its end. ftell signals failure with -1,
    // which must be caught before the value is stored in an unsigned size_t.
    long end_offset = -1;
    if (fseek(file, 0, SEEK_END) == 0) {
        end_offset = ftell(file);
    }
    if (end_offset < 0 || fseek(file, 0, SEEK_SET) != 0) {
        perror("Could not determine input size");
        fclose(file);
        return EXIT_FAILURE;
    }
    file_size = (size_t)end_offset;

    // malloc(0) is allowed to return NULL; request at least one byte so an
    // empty input is not misreported as an allocation failure.
    text = (unsigned char*)malloc(file_size > 0 ? file_size : 1);
    if (text == NULL) {
        perror("Memory allocation failed");
        fclose(file);
        return EXIT_FAILURE;
    }

    size_t bytes_read = fread(text, 1, file_size, file);
    if (bytes_read != file_size) {
        perror("Error reading file");
        free(text);
        fclose(file);
        return EXIT_FAILURE;
    }
    fclose(file);

    FILE *output_file = fopen("output75k.txt", "w");
    if (output_file == NULL) {
        perror("Could not open output.txt");
        free(text);
        return EXIT_FAILURE;
    }

    // The events bracket the entire cuda_bitap_search call.
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    cuda_bitap_search(text, file_size, pattern, output_file);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("Time taken: %f seconds\n", milliseconds / 1000.0);

    free(text);
    fclose(output_file);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}

Writing 75k.cu


In [49]:
!nvcc -o 75k 75k.cu
!./75k

Time taken: 0.001061 seconds
