In [1]:
from google.colab import drive
import os

# Mount Google Drive so the project checkout stored there is visible to this runtime.
drive.mount('/content/drive')
# Use an absolute path rooted at the mount point above: a relative path only
# resolves on the first run, so re-running this cell after the working
# directory has already changed would raise FileNotFoundError.
os.chdir("/content/drive/MyDrive/VF2pp-in-CUDA/")
# Last expression of the cell: displays the project directory contents.
os.listdir()

Mounted at /content/drive


['README.md',
 'LICENSE',
 '.tips',
 '.imp',
 'temp_ordering.cu',
 '.gitignore',
 'Makefile',
 'test.py',
 'src',
 'data',
 'benchmark',
 '.git',
 'tmp']

CUDA SETUP

In [2]:
!pip install nvcc4jupyter
!pip install pycuda

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Collecting pycuda
  Downloading pycuda-2024.1.2.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pytools>=2011.2 (from pycuda)
  Downloading pytools-2024.1.11-py3-none-any.whl.metadata (3.0 kB)
Collecting mako (from pycuda)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Downloading pytools-2024.1.11-py3-none-any.whl (88 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Mako-1.3.5-py3-no

GPU TYPE

In [3]:
# Show the driver's view of the GPU(s) attached to this runtime.
!nvidia-smi
# Show the version of the CUDA compiler installed in this runtime.
!nvcc --version
# Load the nvcc4jupyter IPython extension; the %%cuda cell magic used further
# down presumably comes from it (confirm against the extension's docs).
%load_ext nvcc4jupyter

Sat Aug  3 18:52:22 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [4]:
import pycuda.driver as drv
import pycuda.autoinit
# Enumerate every CUDA device PyCUDA can see and print a short summary of each.
drv.init()
device_count = drv.Device.count()
bytes_per_gib = 1024 * 1024 * 1024
print("%d device(s) found." % device_count)
for i in range(device_count):
  dev = drv.Device(i)
  print("Device #%d: %s" % (i, dev.name()))
  # compute_capability() yields a (major, minor) pair that fills both %d slots.
  print(" Compute Capability: %d.%d" % dev.compute_capability())
  # Floor division: the figure is truncated to whole GiB.
  print(" Total Memory: %s GB" % (dev.total_memory() // bytes_per_gib))

1 device(s) found.
Device #0: Tesla T4
 Compute Capability: 7.5
 Total Memory: 14 GB


GPU INFO

In [5]:
%%cuda

#include <stdio.h>
#include <stdlib.h>

/*
 * Print the properties of every CUDA device reported by the runtime.
 *
 * For each device the function prints its name, compute capability, clock
 * rates, SM count, per-SM resources, memory sizes, block/grid limits, warp
 * size, constant memory size and the maximum resident blocks per SM.
 *
 * Errors from the CUDA runtime are reported on stdout: a failure to obtain
 * the device count aborts the query, while a failure to read the properties
 * of one device skips that device only.
 */
void deviceQuery()
{
  cudaDeviceProp prop;
  int nDevices = 0;
  cudaError_t ierr;

  ierr = cudaGetDeviceCount(&nDevices);
  if (ierr != cudaSuccess) {
    printf("cudaGetDeviceCount failed: %s\n", cudaGetErrorString(ierr));
    return;
  }

  for (int i = 0; i < nDevices; ++i)
  {
    ierr = cudaGetDeviceProperties(&prop, i);
    if (ierr != cudaSuccess) {
      /* Do not print fields of a struct the runtime failed to fill in. */
      printf("cudaGetDeviceProperties failed for device %d: %s\n", i, cudaGetErrorString(ierr));
      continue;
    }

    printf("Device number: %d\n", i);
    printf("  Device name: %s\n", prop.name);
    printf("  Compute capability: %d.%d\n\n", prop.major, prop.minor);

    printf("  Clock Rate: %d kHz\n", prop.clockRate);
    printf("  Total SMs: %d \n", prop.multiProcessorCount);
    /* size_t fields are printed with %zu; %d / %lu only match on some ABIs. */
    printf("  Shared Memory Per SM: %zu bytes\n", prop.sharedMemPerMultiprocessor);
    printf("  Registers Per SM: %d 32-bit\n", prop.regsPerMultiprocessor);
    printf("  Max threads per SM: %d\n", prop.maxThreadsPerMultiProcessor);
    printf("  L2 Cache Size: %d bytes\n", prop.l2CacheSize);
    printf("  Total Global Memory: %zu bytes\n", prop.totalGlobalMem);
    printf("  Memory Clock Rate: %d kHz\n\n", prop.memoryClockRate);

    printf("  Max threads per block: %d\n", prop.maxThreadsPerBlock);
    printf("  Max threads in X-dimension of block: %d\n", prop.maxThreadsDim[0]);
    printf("  Max threads in Y-dimension of block: %d\n", prop.maxThreadsDim[1]);
    printf("  Max threads in Z-dimension of block: %d\n\n", prop.maxThreadsDim[2]);

    printf("  Max blocks in X-dimension of grid: %d\n", prop.maxGridSize[0]);
    printf("  Max blocks in Y-dimension of grid: %d\n", prop.maxGridSize[1]);
    printf("  Max blocks in Z-dimension of grid: %d\n\n", prop.maxGridSize[2]);

    printf("  Warp size: %d\n\n", prop.warpSize);
    printf(" Constant Memory: %zu\n", prop.totalConstMem);
    printf(" Max resident blocks per SM: %d\n", prop.maxBlocksPerMultiProcessor);
  }
}

/* Program entry point: dump the properties of the visible CUDA devices. */
int main()
{
    deviceQuery();
    return 0;
}

Device number: 0
  Device name: Tesla T4
  Compute capability: 7.5

  Clock Rate: 1590000 kHz
  Total SMs: 40 
  Shared Memory Per SM: 65536 bytes
  Registers Per SM: 65536 32-bit
  Max threads per SM: 1024
  L2 Cache Size: 4194304 bytes
  Total Global Memory: 15835660288 bytes
  Memory Clock Rate: 5001000 kHz

  Max threads per block: 1024
  Max threads in X-dimension of block: 1024
  Max threads in Y-dimension of block: 1024
  Max threads in Z-dimension of block: 64

  Max blocks in X-dimension of grid: 2147483647
  Max blocks in Y-dimension of grid: 65535
  Max blocks in Z-dimension of grid: 65535

  Warp size: 32

 Constant Memory: 65536
 Max resident blocks per SM: 16



In [None]:
# Run the Makefile's "clean" target, then its default target. This presumably
# produces the ./vf2pp_* binaries the benchmark cell runs (confirm in the Makefile).
!make clean
!make

CHOOSE THE BEST OPTIMIZATION LEVEL ON RANDOM GRAPHS OF DIFFERENT SIZES

In [None]:
graph_dimension = [3000, 5000, 8000, 10000]
optimizations = ['O0','O1','O2','O3']

for opt in optimizations:
    for dim in graph_dimension:
        cmd_sequential = f"./vf2pp_sequential_{opt} {dim} 3"
        cmd_parallel = f"./vf2pp_parallel_{opt} {dim} 3"
        !{cmd_sequential}
        !{cmd_parallel}

In [17]:
'''
Each file is named "result_{dim}_3.txt" and contains 4 values, one for each optimization performed.
Each file therefore holds the execution times measured on the graph of size "dim" with the different optimizations.
We begin by running the simulation; then, for each optimization, we calculate the average execution time across all graph sizes.
The best optimization is O1 for both the sequential and the parallel version.
'''

import os

def read_files(directory):
    """Collect whitespace-separated measurements from every file in a directory.

    Each regular file in ``directory`` is read in full and split on
    whitespace; the value at position ``i`` is converted to ``float`` and
    appended to the list stored under key ``i``.

    Args:
        directory: Path of the directory holding the result files.

    Returns:
        dict mapping a value's position within its file to the list of values
        found at that position. Keys 0-3 are always present (possibly with an
        empty list); further keys appear only if a file holds more than four
        values.

    Raises:
        ValueError: if a token in a file cannot be parsed as a float.
        OSError: if the directory or one of its files cannot be read.
    """
    # Pre-seed the four expected slots so callers can always index 0-3.
    measures = {0: [], 1: [], 2: [], 3: []}

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for name in sorted(os.listdir(directory)):
        path = os.path.join(directory, name)
        # Sub-directories (such as a notebook checkpoint folder) cannot be
        # opened as files, so skip anything that is not a regular file.
        if not os.path.isfile(path):
            continue
        with open(path, 'r') as f:
            times = f.read().split()
        for i, time in enumerate(times):
            # setdefault keeps a file with extra values from raising KeyError.
            measures.setdefault(i, []).append(float(time))

    return measures

def calculate_means(measures):
    """Return the arithmetic mean of each list of measurements.

    Args:
        measures: dict whose values are lists of numbers.

    Returns:
        list with one mean per entry of ``measures``, in the dict's iteration
        order. An empty list of measurements yields ``nan`` instead of
        raising ZeroDivisionError, so a missing measurement shows up as NaN
        in the results rather than aborting the whole computation.
    """
    means = []

    for values in measures.values():
        if values:
            means.append(sum(values) / len(values))
        else:
            # No samples for this slot: there is no mean to report.
            means.append(float("nan"))

    return means

import pandas as pd

# Per-position measurement lists and their means for the sequential runs.
seq_measures = read_files("./benchmark/measures/optimizations/sequential/")
seq_means = calculate_means(seq_measures)

# Same for the parallel runs.
par_measures = read_files("./benchmark/measures/optimizations/parallel/")
par_means = calculate_means(par_measures)

# One column per optimization; row 0 is the sequential mean, row 1 the parallel one.
column_labels = ["OPTIMIZATION 0", "OPTIMIZATION 1", "OPTIMIZATION 2", "OPTIMIZATION 3"]
data = {
    label: [seq_means[level], par_means[level]]
    for level, label in enumerate(column_labels)
}

df = pd.DataFrame(data, index=["Sequential Means", "Parallel Means"])

# Write the summary table as CSV into the optimizations measures directory.
output_directory = "./benchmark/measures/optimizations"
output_file = os.path.join(output_directory, "opt_means_results.csv")
df.to_csv(output_file)

print(df)


                  OPTIMIZATION 0  OPTIMIZATION 1  OPTIMIZATION 2  \
Sequential Means     1046.065460      551.812976      567.564568   
Parallel Means         19.479678       19.111196       19.600804   

                  OPTIMIZATION 3  
Sequential Means      678.450359  
Parallel Means         19.885869  
