In [None]:
# Assignment: Longest Common Subsequence (HPC Project)
#**Student**: Parrella Marco, Matricola: 0622702536, Email: m.parrella21@studenti.unisa.it
#**Lecturer**: Moscato Francesco, fmoscato@unisa.it

#**License**: GPLv3 (see LICENSE file)
#**Requirements**: Implement Parallel LCS (OpenMP, MPI, CUDA)
#**Purpose**: Notebook per la compilazione e l'esecuzione dei benchmark della versione CUDA su ambiente con GPU (Google Colab).

import os
import subprocess

# --- STEP 0: CUDA SOURCE (integer-overflow fix + hardened host code) ---
# The .cu file is regenerated from the string below on every run of this cell.
# Matrix indices are computed as size_t because an (n+1) x (m+1) table larger
# than roughly 46000 x 46000 no longer fits in a signed 32-bit index.
#
# Host-side changes compared with the previous revision of the program:
#   * the host computes each anti-diagonal with the same bounds as the kernel
#     (max(1, k - m) .. min(n, k - 1)), so no surplus thread/block is launched;
#   * file sizes are read as long, checked for errors and for fitting in int;
#   * empty inputs short-circuit to an LCS length of 0;
#   * a usage message is printed when arguments are missing;
#   * the block size is validated against the device limit before launching;
#   * CUDA event calls are checked, so asynchronous kernel failures surface
#     at cudaEventSynchronize with a meaningful label.
# The stdout format (RESULT_LEN / ELAPSED_TIME lines) is unchanged.
cuda_code_fixed = r"""
/*
 * GPLv3
 * CUDA LCS - Anti-diagonal implementation
 * UPDATED: Dynamic block size & Correct Timing & LARGE MATRIX FIX (size_t)
 */

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime.h>

static inline void cuda_check(cudaError_t e, const char *msg) {
    if (e != cudaSuccess) { fprintf(stderr, "%s: %s\n", msg, cudaGetErrorString(e)); exit(1); }
}

__global__ void diag_kernel(int *dmat, const char *A, const char *B, int n, int m, int k) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    int i_start = max(1, k - m);
    int i_end = min(n, k - 1);
    int len = i_end - i_start + 1;

    if (idx >= len) return;

    int i = i_start + idx;
    int j = k - i;

    // FIX: Usiamo size_t per la larghezza e per il calcolo dell'indice
    // Altrimenti 51200*51200 supera il limite dei 32 bit (2 miliardi)
    size_t mcols = (size_t)m + 1;

    // Calcolo indici con size_t per evitare overflow
    size_t idx_cur    = (size_t)i * mcols + j;
    size_t idx_up     = (size_t)(i-1) * mcols + j;
    size_t idx_left   = (size_t)i * mcols + (j-1);
    size_t idx_upleft = (size_t)(i-1) * mcols + (j-1);

    if (A[i-1] == B[j-1]) {
        dmat[idx_cur] = dmat[idx_upleft] + 1;
    } else {
        int up = dmat[idx_up];
        int left = dmat[idx_left];
        dmat[idx_cur] = (up > left ? up : left);
    }
}

/* Read the whole file at `path` into a malloc'ed buffer and store its length in *len_out.
 * Any failure prints a message on stderr and terminates the process with status 1. */
static char *read_file(const char *path, int *len_out) {
    FILE *fp = fopen(path, "rb");
    if (!fp) { perror(path); exit(1); }

    if (fseek(fp, 0, SEEK_END) != 0) { perror("fseek"); exit(1); }
    long size = ftell(fp);
    if (size < 0) { perror("ftell"); exit(1); }
    /* Lengths are passed to the kernel as int and n + m is the last diagonal index,
     * so each length must stay below INT_MAX / 2. */
    if (size >= INT_MAX / 2) { fprintf(stderr, "%s: file too large\n", path); exit(1); }
    if (fseek(fp, 0, SEEK_SET) != 0) { perror("fseek"); exit(1); }

    /* malloc(0) may return NULL, so always request at least one byte. */
    char *buf = (char*)malloc(size > 0 ? (size_t)size : 1);
    if (!buf) { fprintf(stderr, "Host alloc fail\n"); exit(1); }

    if (fread(buf, 1, (size_t)size, fp) != (size_t)size) { fprintf(stderr, "Read error\n"); exit(1); }
    fclose(fp);

    *len_out = (int)size;
    return buf;
}

int main(int argc, char **argv) {
    if (argc < 3) {
        fprintf(stderr, "Usage: lcs_cuda <fileA> <fileB> [blockSize]\n");
        return 1;
    }

    int blockSize = 256;
    if (argc > 3) blockSize = atoi(argv[3]);
    if (blockSize <= 0) blockSize = 256;

    int n = 0, m = 0;
    char *hA = read_file(argv[1], &n);
    char *hB = read_file(argv[2], &m);

    /* The LCS with an empty sequence is empty: answer without touching the GPU. */
    if (n == 0 || m == 0) {
        printf("RESULT_LEN: 0\n");
        printf("ELAPSED_TIME: %.6f\n", 0.0);
        free(hA); free(hB);
        return 0;
    }

    /* Reject an unsupported block size up front instead of failing every launch. */
    int device = 0, maxThreads = 0;
    cuda_check(cudaGetDevice(&device), "cudaGetDevice");
    cuda_check(cudaDeviceGetAttribute(&maxThreads, cudaDevAttrMaxThreadsPerBlock, device), "device query");
    if (blockSize > maxThreads) {
        fprintf(stderr, "Block size %d exceeds the device limit of %d threads per block\n", blockSize, maxThreads);
        free(hA); free(hB);
        return 1;
    }

    size_t mat_elems = (size_t)(n + 1) * (m + 1);
    size_t mat_bytes = mat_elems * sizeof(int);

    char *dA; char *dB; int *dmat;
    cuda_check(cudaMalloc((void**)&dA, n), "cudaMalloc A");
    cuda_check(cudaMalloc((void**)&dB, m), "cudaMalloc B");
    cuda_check(cudaMalloc((void**)&dmat, mat_bytes), "cudaMalloc mat");

    cuda_check(cudaMemcpy(dA, hA, n, cudaMemcpyHostToDevice), "cpy A");
    cuda_check(cudaMemcpy(dB, hB, m, cudaMemcpyHostToDevice), "cpy B");
    cuda_check(cudaMemset(dmat, 0, mat_bytes), "memset mat");

    cudaEvent_t start, stop;
    cuda_check(cudaEventCreate(&start), "event create");
    cuda_check(cudaEventCreate(&stop), "event create");
    cuda_check(cudaEventRecord(start), "event record");

    int maxk = n + m;
    for (int k = 2; k <= maxk; ++k) {
        /* Same bounds as diag_kernel: cells (i, k - i) with 1 <= i <= n and 1 <= k - i <= m. */
        int i_start = (k - m > 1) ? (k - m) : 1;
        int i_end = (k - 1 > n) ? n : k - 1;
        int len = i_end - i_start + 1;

        if (len <= 0) continue;
        int threads = blockSize;
        int blocks = (len + threads - 1) / threads;
        diag_kernel<<<blocks, threads>>>(dmat, dA, dB, n, m, k);
    }
    cuda_check(cudaGetLastError(), "kernel launch");

    cuda_check(cudaEventRecord(stop), "event record");
    /* Errors raised while the kernels run are reported by this synchronisation. */
    cuda_check(cudaEventSynchronize(stop), "kernel execution");

    float milliseconds = 0;
    cuda_check(cudaEventElapsedTime(&milliseconds, start, stop), "event elapsed");

    int res = 0;
    // Fix indice anche qui
    size_t res_idx = (size_t)n * (m + 1) + m;
    cuda_check(cudaMemcpy(&res, &dmat[res_idx], sizeof(int), cudaMemcpyDeviceToHost), "cpy result");

    printf("RESULT_LEN: %d\n", res);
    printf("ELAPSED_TIME: %.6f\n", milliseconds / 1000.0f);

    cudaFree(dA); cudaFree(dB); cudaFree(dmat);
    free(hA); free(hB);
    cudaEventDestroy(start); cudaEventDestroy(stop);
    return 0;
}
"""

# Write the CUDA source to disk so that the compilation step can pick it up.
# The encoding is explicit so the result does not depend on the platform locale.
with open("lcs_cuda.cu", "w", encoding="utf-8") as f:
    f.write(cuda_code_fixed)
print("Codice lcs_cuda.cu aggiornato con supporto per matrici grandi (size_t).\n")


# --- STEP 1: COMPILATION ---
# nvcc is started through subprocess rather than an IPython "!" escape so that
# its exit status can be inspected: a failed build must stop the cell here
# instead of letting the benchmark run against a missing or stale binary.
print("Compilazione in corso...")
try:
    nvcc_run = subprocess.run(
        ["nvcc", "-O3", "lcs_cuda.cu", "-o", "lcs_cuda"],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # interleave diagnostics with normal output
        universal_newlines=True,
    )
except OSError as err:
    # Raised when the nvcc executable itself cannot be started (e.g. not on PATH).
    raise RuntimeError(f"Impossibile eseguire nvcc: {err}") from err

# Echo whatever the compiler printed (warnings or errors) into the cell output.
if nvcc_run.stdout:
    print(nvcc_run.stdout, end="")
if nvcc_run.returncode != 0:
    raise RuntimeError(
        f"Compilazione fallita: nvcc ha restituito il codice {nvcc_run.returncode}"
    )
print("Compilazione completata.\n")


# --- STEP 2: BENCHMARK ---
SIZES = [10240, 51200]  # 10KB, 50KB
BLOCK_SIZES = [128, 256, 512]
REPS = 3


def parse_elapsed_time(output):
    """Return the text that follows ``ELAPSED_TIME:`` in *output*.

    Only the first matching line is considered. Returns ``None`` when no line
    of *output* contains the marker.
    """
    for line in output.splitlines():
        if "ELAPSED_TIME:" in line:
            return line.partition("ELAPSED_TIME:")[2].strip()
    return None


def generate_inputs(size):
    """Invoke generate_input.py to create the ``bench_<size>`` input pair.

    Failures are not raised: if the files are still missing afterwards, the
    benchmark runs for this size report ``ERROR`` rows.
    """
    cmd = [
        "python3", "generate_input.py",
        "--size-per-file", str(size),
        "--prefix", f"bench_{size}",
        "--seed", "42",
        "--alphabet", "ascii",
    ]
    try:
        subprocess.run(cmd)
    except OSError:
        # The interpreter could not be started; the runs below will show ERROR.
        pass


def time_cuda_run(file_a, file_b, block_size):
    """Run ./lcs_cuda once and return its reported time, or ``"ERROR"``.

    ``"ERROR"`` is returned when the binary cannot be started, exits with a
    non-zero status, or prints no ``ELAPSED_TIME:`` value.
    """
    # Argument list (no shell): file names are passed through verbatim.
    cmd = ["./lcs_cuda", file_a, file_b, str(block_size)]
    try:
        output = subprocess.check_output(cmd).decode("utf-8")
    except (OSError, subprocess.CalledProcessError):
        return "ERROR"
    return parse_elapsed_time(output) or "ERROR"


print("impl,size,rep,param,time_s")

for s in SIZES:
    file_a = f"bench_{s}_A.bin"
    file_b = f"bench_{s}_B.bin"

    # Both inputs are required by the binary, so regenerate if either is missing.
    if not (os.path.exists(file_a) and os.path.exists(file_b)):
        generate_inputs(s)

    for r in range(1, REPS + 1):
        for b in BLOCK_SIZES:
            # Exactly one CSV row per (size, repetition, block size).
            print(f"cuda,{s},{r},{b},{time_cuda_run(file_a, file_b, b)}")

Codice lcs_cuda.cu aggiornato con supporto per matrici grandi (size_t).

Compilazione in corso...
Compilazione completata.

impl,size,rep,param,time_s
cuda,10240,1,128,0.109924
cuda,10240,1,256,0.064252
cuda,10240,1,512,0.076628
cuda,10240,2,128,0.060675
cuda,10240,2,256,0.064462
cuda,10240,2,512,0.076243
cuda,10240,3,128,0.065312
cuda,10240,3,256,0.064488
cuda,10240,3,512,0.076185
cuda,51200,1,128,2.992495
cuda,51200,1,256,3.016323
cuda,51200,1,512,3.072156
cuda,51200,2,128,2.995034
cuda,51200,2,256,3.016395
cuda,51200,2,512,3.078468
cuda,51200,3,128,3.002448
cuda,51200,3,256,3.023699
cuda,51200,3,512,3.081815
