<a href="https://colab.research.google.com/github/lucas-t-reis/research/blob/master/GPU-placement/annealing_placement.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Setup

In [1]:
!pip install git+git://github.com/canesche/nvcc_colab_icpp2020.git
!git clone https://github.com/canesche/nvcc_colab_icpp2020
!rm -r sample_data
!mkdir nvcc_colab_icpp2020/list_map
!mv nvcc_colab_icpp2020/list_tree/*_r_*.in nvcc_colab_icpp2020/list_map
%load_ext nvcc_plugin

Collecting git+git://github.com/canesche/nvcc_colab_icpp2020.git
  Cloning git://github.com/canesche/nvcc_colab_icpp2020.git to /tmp/pip-req-build-_tlibpzl
  Running command git clone -q git://github.com/canesche/nvcc_colab_icpp2020.git /tmp/pip-req-build-_tlibpzl
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.3-cp36-none-any.whl size=3052 sha256=378ed944bb8fc9b7647f5ab1885988f18b054c853a2ce096c947c0e4959b1702
  Stored in directory: /tmp/pip-ephem-wheel-cache-3p2of9hs/wheels/cf/27/88/a1332c532363fba6de6ee6db6fdf3e5f85d6a711d02aa85de1
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.3
Cloning into 'nvcc_colab_icpp2020'...
remote: Enumerating objects: 37, done.[K
remote: Counting objects: 100% (37/37), done.[K
remote: Compressing objects: 100% (35/35), done.[K
remote: Total 924 (delta 1), reused 34 (del

## Core Libraries

Since the different modules of this solution use the same set of variables and libraries, **core.h** serves as a template header that all the .cpp/.cu files will share in this notebook. 


In [2]:
%%writefile core.h
#ifndef CORE_H
#define CORE_H

// I/O
#include <stdio.h>
#include <fstream>
#include <iostream>
#include <sstream>

// Data types and containers 
#include <unordered_map>
#include <vector>
#include <string>
#include <stdint.h> // - uin8_t discontinued due to bugs in graphs with more than 96 nodes

// Utility functions
#include <math.h>
#include <algorithm>
#include <random>
#include <cstring>
#include <chrono>
#include "nvcc_colab_icpp2020/utils.h"

// CUDA
#include <cuda_runtime.h>
#include <cuda.h>
#include <curand.h>

// GPU parameters
#define MAX_THREADS 128 
#define MAX_BYTES  256
#define MAX_GRIDS 1000
#define MAX_BLOCKS ceil( (1.0*MAX_GRIDS / MAX_THREADS) ) 
#define MAX_RANDS 100000000
using Seconds = std::chrono::duration<double, std::ratio<1>>;


/* Graph Representation */
std::vector <int> A;	// Edges
int nodes, edges;       // Nodes and edges amount
int* v,* v_i;           // Neighbors and neighbors index
int gridSize;	        // Square matrix total elements
int numGrids;	        // Amount of matrices actually read
int dim;		        // Matrix dimension

/* Host variables */
int *h_A;
int *h_edgeA, *h_edgeB;
int *h_results;
int *h_costs;
int *h_table1hop_1d; // Flattened @h_table1hop[17][17]
int *h_tablemesh_1d; // Flattened @h_tablemesh[17][17]
int h_randomPlacements[MAX_BYTES * MAX_GRIDS];

/* Device variables */
int *d_A,* d_v, *d_v_i;
int *d_results, *d_costs; 		// Stores the placement and cost of each grid read 
int *d_tablemesh, *d_table1hop; // Resource used in cost calculation
int *d_grid;					// Grids fed to the Annealing
float *d_seeds, *d_random;		// RNG

/* Aux & Temp */
int *shufflePlacements;

void readInput(const char* fileName) {

    FILE * fptr;
    fptr = fopen(fileName,"r");
    if (fptr == NULL) {
        printf("Error opening file!");
        exit(EXIT_FAILURE);
    }
    
    int c = 0, n1, n2;
    while (fscanf(fptr, "%d %d", & n1, & n2) != EOF) {
        if (c == 0) {
            nodes = n1;
            edges = n2;

            h_edgeA = (int*) malloc(sizeof(int) * edges);
            h_edgeB = (int*) malloc(sizeof(int) * edges);

            v = (int*) malloc(sizeof(int) * nodes);
            v_i = (int*) malloc(sizeof(int) * nodes);

            for (int i = 0; i < nodes; i++) { v[i] = 0; v_i[i] = 0; }
        } 
		else {
            h_edgeA[c - 1] = n1;
            h_edgeB[c - 1] = n2;

            v[n1]++;
            if (n1 != n2) v[n2]++;
        }

        c++;
    }

    dim = ceil(sqrt(nodes));
    gridSize = dim*dim;

    for(int i=1; i<nodes; i++) v_i[i] = v_i[i-1] + v[i-1];

    for(int i=0; i<nodes; i++)
        for(int j=0; j<edges; j++){
            if (h_edgeA[j] != h_edgeB[j]) {
                if(h_edgeA[j]==i) A.push_back(h_edgeB[j]);
                if(h_edgeB[j]==i) A.push_back(h_edgeA[j]);
            } 
			else {
                if(h_edgeA[j]==i) A.push_back(h_edgeB[j]);
            }
        }

    fclose(fptr);
}

void clear() {

// GPU
cudaFree(d_A);
cudaFree(d_v);
cudaFree(d_v_i);
cudaFree(d_results);
cudaFree(d_costs);
cudaFree(d_tablemesh);
cudaFree(d_table1hop);
cudaFree(d_grid);
cudaFree(d_seeds);
cudaFree(d_random);
cudaFree(shufflePlacements);

// CPU
free(v);
free(v_i);
free(h_A);
free(h_edgeA);
free(h_edgeB);
free(h_results);
free(h_costs);
free(h_table1hop_1d);
free(h_tablemesh_1d);

}

#endif

Writing core.h


# Random Shuffles

## Borderless Shuffle

In [3]:
%%writefile shuffle_borderless.cpp
#include "core.h"

void getNshuffledPlacements(int maxShuffles, const char* fileName) {
	
	std::unordered_map<std::string, bool> unique_placement;
	std::vector<int> placement;
	std::string key;
	std::ofstream output;
	
	char buffer[1000];
	strcpy(buffer, fileName);
	strcat(buffer, ".out");

  	shufflePlacements = (int *) malloc(sizeof(int) * (gridSize) * maxShuffles);
	output.open(buffer, std::ofstream::out);

	// Using uniform integer rand generator to seed a seed_seq generator in order to feed the mersenne twister engine	
	std::random_device r; 
	std::seed_seq seed{r(), r(), r(), r(), r(), r(), r(), r()};
	std::mt19937 mersenne_engine(seed);
	
  	// Getting trivial placement
	for(int i=0; i<gridSize; i++)
    	(i<nodes)?placement.push_back(i):placement.push_back(255);
	
	// To debug use to_string(unsigned(placement[i])), since there may be "invisible" ASCII in the current form
	for(int i=0; i<placement.size(); i++){
		key += placement[i]; 
		key += "-";
	}
	unique_placement[key] = true;
	
	// Prevents loops in small graphs, since it may be impossible to avoid repetition.
	int currTries = 0;
    
    // Generating shuffles
	int k = 0;
	while(k<maxShuffles) {
		
		key.clear();
		shuffle(placement.begin(), placement.end(), mersenne_engine);
		for(int i=0; i<placement.size(); i++){
			key += placement[i]; 
			key += "-";
		}
		
		currTries++;

		if(unique_placement.find(key) != unique_placement.end() && currTries < 1000) continue;

		// Reaching here means you've succeeded in generating a random or reached max trials
        // Either way, accept current solution and reset the counter
		currTries = 0;

        // Writing on file
		for(int i=0; i<placement.size(); i++) output << placement[i] << " ";
		output << "\n";
		
        // Keeping track of generated placements
		unique_placement[key] = true;
		memcpy(shufflePlacements + k*(dim*dim), placement.data(), sizeof(int) * (gridSize));
		k++;
	}

	output.close();
}

int main(int argc, const char** argv) {
	
	std::cout << "Reading from " << argv[1] << std::endl;
  	readInput(argv[1]);   	
	getNshuffledPlacements(MAX_GRIDS, argv[2]);

}


Writing shuffle_borderless.cpp


## Getting random shuffles

In [4]:
!echo "Compiling"

!nvcc shuffle_borderless.cpp
!echo "Creating root directory /random_placement "
!mkdir random_placement 2>/dev/null || true


!for file in nvcc_colab_icpp2020/teste/*.in; do ./a.out $file $(basename $file ".in")"_1k_rand"; done
!echo "Moving outputs to random_placement/teste ..."
!mkdir random_placement/teste 2>/dev/null || true
!mv *_1k_rand.out random_placement/teste

!for file in nvcc_colab_icpp2020/list_cgrame/*.in; do ./a.out $file $(basename $file ".in")"_1k_rand"; done
!echo "Moving outputs to random_placement/cgrame ..."
!mkdir random_placement/cgrame 2>/dev/null || true
!mv *_1k_rand.out random_placement/cgrame

!for file in nvcc_colab_icpp2020/list_tree/*.in; do ./a.out $file $(basename $file ".in")"_1k_rand"; done
!echo "Moving outputs to random_placement/tree ..."
!mkdir random_placement/tree 2>/dev/null || true
!mv *1k_rand.out random_placement/tree

!for file in nvcc_colab_icpp2020/list_map/*.in; do ./a.out $file $(basename $file ".in")"_1k_rand"; done
!echo "Moving outputs to random_placement/map ..."
!mkdir random_placement/map 2>/dev/null || true
!mv *1k_rand.out random_placement/map

!for file in nvcc_colab_icpp2020/list_kmeans/*.in; do ./a.out $file $(basename $file ".in")"_1k_rand"; done
!echo "Moving output to random_placement/kmeans ..."
!mkdir random_placement/kmeans 2>/dev/null || true
!mv *1k_rand.out random_placement/kmeans

!for file in nvcc_colab_icpp2020/list_micro/*.in; do ./a.out $file $(basename $file ".in")"_1k_rand"; done
!echo "Moving outputs to random_placement/micro ..."
!mkdir random_placement/micro 2>/dev/null || true
!mv *_1k_rand.out random_placement/micro

!for file in nvcc_colab_icpp2020/list_kmeans_expanded/*.in; do ./a.out $file $(basename $file ".in")"_1k_rand"; done
!echo "Moving outputs to random_placement/kmeans_exp ..."
!mkdir random_placement/kmeans_exp 2>/dev/null || true
!mv *_1k_rand.out random_placement/kmeans_exp

!echo "Removing executables..."
!rm a.out 

Compiling
Creating root directory /random_placement 
Reading from nvcc_colab_icpp2020/teste/*.in
Error opening file!Moving outputs to random_placement/teste ...
mv: cannot stat '*_1k_rand.out': No such file or directory
Reading from nvcc_colab_icpp2020/list_cgrame/cgrame_accumulate_zigzag.in
Reading from nvcc_colab_icpp2020/list_cgrame/cgrame_cap_zigzag.in
Reading from nvcc_colab_icpp2020/list_cgrame/cgrame_conv2_zigzag.in
Reading from nvcc_colab_icpp2020/list_cgrame/cgrame_conv3_zigzag.in
Reading from nvcc_colab_icpp2020/list_cgrame/cgrame_mac2_zigzag.in
Reading from nvcc_colab_icpp2020/list_cgrame/cgrame_mac_zigzag.in
Reading from nvcc_colab_icpp2020/list_cgrame/cgrame_matrixmultiply_zigzag.in
Reading from nvcc_colab_icpp2020/list_cgrame/cgrame_mults1_zigzag.in
Reading from nvcc_colab_icpp2020/list_cgrame/cgrame_mults2_zigzag.in
Reading from nvcc_colab_icpp2020/list_cgrame/cgrame_nomem1_zigzag.in
Reading from nvcc_colab_icpp2020/list_cgrame/cgrame_simple2_zigzag.in
Reading from nvcc_

# Annealing GPU
Using **MESH** cost function


In [5]:
%%writefile GPU_SA.cu
#include "core.h"

int h_table1hop[17][17] = 
{{0,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7},  
 {0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8},  
 {0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8},  
 {1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9},  
 {1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9},  
 {2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10, 10},  
 {2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10, 10},  
 {3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10, 10, 11, 11},  
 {3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10, 10, 11, 11},  
 {4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12},  
 {4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12},  
 {5,  6,  6,  7,  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13},  
 {5,  6,  6,  7,  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13},  
 {6,  7,  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14},  
 {6,  7,  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14},  
 {7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15},  
 {7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15}};

int h_tablemesh[17][17] =
{{0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15},  
 {0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16},  
 {1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17},  
 {2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18},  
 {3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19},  
 {4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20},  
 {5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21},  
 {6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22},  
 {7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},  
 {8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24},  
 {9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},  
 {10,11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26},  
 {11,12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27},  
 {12,13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28},  
 {13,14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29},  
 {14,15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30},  
 {15,16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}};


int gridCost(int *positions){

    int cost_ = 0, increment=0, distManhattanI, distManhattanJ;

    for(int k=0; k<edges; k++){
        int ifrom = positions[h_edgeA[k]]/dim; 
        int jfrom = positions[h_edgeA[k]]%dim; 
        int ito = positions[h_edgeB[k]]/dim; 
        int jto = positions[h_edgeB[k]]%dim;
        
        distManhattanJ = abs(jto - jfrom);
        distManhattanI = abs(ito - ifrom);

        increment = h_tablemesh[distManhattanI][distManhattanJ];
        
        cost_ += increment;
    }

    return cost_;
}

void readPlacements(const char* fileName) {

    std::string name(fileName); // Without this line, the input mac_inout gives segfault..dunno why
    std::ifstream file(name, std::ifstream::in); 
    if(!file.good()) {
        printf("Failed to load random placements. Check file name or it's existence\nAborting");
        exit(EXIT_FAILURE);
    }

    std::string s;
    int* temp = (int*) malloc(sizeof(int) * gridSize);
    int i = 0;
    int start = 0;
    numGrids = 0;
    while(std::getline(file, s) && numGrids < MAX_GRIDS ) {
        
		std::stringstream line(s);
        int n;
        
        // Read placement line and store in @h_randomPlacements
        while(line >> n) h_randomPlacements[i++] = n;

        // Evaluating cost
        int positions[256];
        for(int k=start, l=0; k<i; k++, l++) {
            temp[l] = h_randomPlacements[k];
            positions[temp[l]] = l;
        }

        // Updating cost for 'ith' grid
        h_costs[numGrids] = gridCost(positions);
        start = i;
        numGrids++;
    }

    file.close();
}


__global__ void annealing(int nodes, int dim, int gridSize,              		// Variables
			   			  	int *d_grid,                        			 	// Initializers 
			   				int *v_i, int *v, int *A,                			// Graph 
                            int *d_results, int *d_costs, float* d_random,   	// Output and random states
                            int *table1hop, int *tablemesh,               		// New cost parameters
                            int *localPositions
                         ){		     

    
    // Determinig thread ID and checking if it's within boudaries
    int tId = blockIdx.x * blockDim.x + threadIdx.x;

	if( tId >= MAX_GRIDS ) return; 

    // Which sector of d_grid belongs to the thread
    size_t t_istart = tId*gridSize;
    float cuda_random;

    int currentCost = d_costs[tId], nextCost, increment, distManhattanI, distManhattanJ;
	
    int* localGrid = 		(int*) malloc(sizeof(int) * gridSize);
    
    for(int i=0; i<gridSize; i++) {
        localGrid[i] = d_grid[t_istart + i];
        localPositions[ t_istart + localGrid[i] ] = i; //<-- OPT
    }

    /* Simulated Annealing */
    unsigned int rndIndex = tId; 	// Each thread starts in a different place in @d_random
    bool foundOptimal = false;
    double T = 100;
    while(T>=0.00001 && !foundOptimal){

        for(int i=0; i<gridSize; i++){

                if(foundOptimal) break;

                for(int j=i+1; j<gridSize; j++){

					if(currentCost==0) {
						foundOptimal = true;
						break;
					}

                    // If we're looking at 2 empty spaces, skip                   
                    if(localGrid[i]==255 && localGrid[j]==255) continue;

                    int node1 = localGrid[i];
                    int node2 = localGrid[j];
                    
					nextCost = currentCost;
                    
					// Remove cost from node to it's neighbors
                    if(node1!=255){
                        #pragma unroll 2                // << ---- OPT
                        for(int i=0; i<v[node1]; i++){
                            int ifrom = localPositions[t_istart +node1]/dim; 
                            int jfrom = localPositions[t_istart +node1]%dim; 
                            int ito = localPositions[t_istart +A[v_i[node1]+i]]/dim; 
                            int jto = localPositions[t_istart +A[v_i[node1]+i]]%dim;
                            distManhattanJ = abs(jto - jfrom);
                            distManhattanI = abs(ito - ifrom);
    
                            increment = tablemesh[distManhattanI*17 + distManhattanJ];
                            nextCost -= increment;
                            
                        }
                        
                    }
                    if(node2!=255){
                        #pragma unroll 2
                        for(int i=0; i<v[node2]; i++){
                            int ifrom = localPositions[t_istart +node2]/dim; 
                            int jfrom = localPositions[t_istart +node2]%dim; 
                            int ito = localPositions[t_istart +A[v_i[node2]+i]]/dim; 
                            int jto = localPositions[t_istart +A[v_i[node2]+i]]%dim; 
                            distManhattanJ = abs(jto - jfrom);
                            distManhattanI = abs(ito - ifrom);
    
                            increment = tablemesh[distManhattanI*17 + distManhattanJ];
                            nextCost -= increment;
                        }

                    }

                    // Swap nodes
                    int old1 = i;
                    int old2 = j;

                    if(node1!=255) localPositions[t_istart +node1] = old2;
                    if(node2!=255) localPositions[t_istart +node2] = old1;

                    localGrid[j] = node1;
                    localGrid[i] = node2;

                    // Recalculate cost to each neighbor based on the node's new position
                    if(node1!=255){
                        #pragma unroll 2
                        for(int i=0; i<v[node1]; i++){

                            int ifrom = localPositions[t_istart +node1]/dim; 
                            int jfrom = localPositions[t_istart +node1]%dim; 
                            int ito = localPositions[t_istart +A[v_i[node1]+i]]/dim; 
                            int jto = localPositions[t_istart +A[v_i[node1]+i]]%dim; 
                            distManhattanJ = abs(jto - jfrom);
                            distManhattanI = abs(ito - ifrom);
    
                            increment = tablemesh[distManhattanI*17 + distManhattanJ];
                            nextCost += increment;
                        }

                    }
                    if(node2!=255){
                        #pragma unroll 2
                        for(int i=0; i<v[node2]; i++){
                            int ifrom = localPositions[t_istart +node2]/dim; 
                            int jfrom = localPositions[t_istart +node2]%dim; 
                            int ito = localPositions[t_istart +A[v_i[node2]+i]]/dim; 
                            int jto = localPositions[t_istart +A[v_i[node2]+i]]%dim; 
                            distManhattanJ = abs(jto - jfrom);
                            distManhattanI = abs(ito - ifrom);
    
                            increment = tablemesh[distManhattanI*17 + distManhattanJ];
                            nextCost += increment;
                        }

                    }  

                    // Annealing acceptance criteria
                    double valor = exp(-1*(nextCost - currentCost)/T);
                    rndIndex = (rndIndex>=MAX_RANDS)?tId:rndIndex;
                    cuda_random = d_random[rndIndex];
                    rndIndex++;
  
                    if(nextCost <= currentCost || cuda_random <= valor)
                        currentCost = nextCost;
                    else {
                        if(node1!=255) localPositions[t_istart +node1] = old1;
                        if(node2!=255) localPositions[t_istart +node2] = old2;
                        localGrid[j] = node2;
                        localGrid[i] = node1;
                    }
                }

                T*=0.999;
            }
    }

    d_costs[tId] = currentCost;
	for(int i=0; i<gridSize; i++) d_results[t_istart+i] = localGrid[i];
}


void GPU_results(const char* fileName) {

    char placements[1000]; 
    strcpy(placements, fileName);
    strcat(placements, "_SAmesh_GPU_1k.out");

    std::ofstream output(placements, std::ofstream::out);

    int temp1Best[gridSize] = {0};
    int temp2[gridSize] = {0};
    
    int best = 100000;
    int tid = 0;
    int thread = 0;
    int pos = 0;

    for(int i=0; i<numGrids*gridSize; i++){ 

		if(i%gridSize==0 && i!=0) {

            pos = 0;
            if(h_costs[thread] < best) {
                best = h_costs[thread];
                for(int i=0; i<gridSize; i++) temp1Best[i] = temp2[i];
                tid = thread;
            }

            thread++;
            output << "\n";
        }

            temp2[pos++] = h_results[i];
            output << unsigned(h_results[i]) << " ";
    }

    output.close();

    // Best cost found
    std::cout << placements << "\t" << h_costs[tid] << "\t";
    for(int i=0; i<gridSize; i++){

        if(i%dim==0)
            std::cout << "\n" << temp1Best[i] << "\t";
        else
            std::cout << temp1Best[i] << "\t";

    }
    std::cout << "\n";

}

int main(int argc, const char **argv) {

    double timer = 0;
    cudaDeviceProp device;
    cudaGetDeviceProperties(&device, 0);

    // Pre-calculating random numbers (0,1]
	curandGenerator_t gen;
    cudaMalloc(&d_random, sizeof(float) * MAX_RANDS);
    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
    curandSetPseudoRandomGeneratorSeed(gen, 1234ULL);
    curandGenerateUniform(gen, d_random, MAX_RANDS);
    
    h_costs = (int*) malloc(sizeof(int) * MAX_GRIDS);
    h_table1hop_1d = (int*) malloc(sizeof(int) * 17*17);
    h_tablemesh_1d = (int*) malloc(sizeof(int) * 17*17);

    int idx = 0;
    for(int i=0; i<17; i++)
        for(int j=0; j<17; j++){

            h_table1hop_1d[idx] = h_table1hop[i][j];
            h_tablemesh_1d[idx] = h_tablemesh[i][j];

            idx++;

        }

    // Read Graph
    readInput(argv[1]);
    std::cerr << "FILE: " << argv[3] << " running on " << device.name << std::endl;
    
	// Converting vector A to array
	h_A = (int*) malloc(sizeof(int) * A.size()); 
	memcpy(h_A, A.data(), sizeof(int) * A.size());

    // Read it's random placements (1k shuffles)
    readPlacements(argv[2]);

    // Allocating GPU memory
	cudaMalloc(&d_A,   		 sizeof(int)   * A.size());
	cudaMalloc(&d_v,   		 sizeof(int)   * nodes);
	cudaMalloc(&d_v_i, 		 sizeof(int)   * nodes);
    cudaMalloc(&d_seeds,     sizeof(float) * MAX_RANDS);
    cudaMalloc(&d_table1hop, sizeof(int)   * 17*17);
    cudaMalloc(&d_tablemesh, sizeof(int)   * 17*17);

	// Getting data from host to device
	cudaMemcpy(d_A, h_A,                    sizeof(int) * A.size(), cudaMemcpyHostToDevice); 
	cudaMemcpy(d_v, v, 	                    sizeof(int) * nodes   , cudaMemcpyHostToDevice);
    cudaMemcpy(d_v_i, v_i,                  sizeof(int) * nodes   , cudaMemcpyHostToDevice);
    cudaMemcpy(d_table1hop, h_table1hop_1d, sizeof(int) * 17*17   , cudaMemcpyHostToDevice);
    cudaMemcpy(d_tablemesh, h_tablemesh_1d, sizeof(int) * 17*17   , cudaMemcpyHostToDevice);

	// Device SETUP
	size_t placements_size = gridSize * MAX_GRIDS; //<--OPT
	
    // Host variables memory allocation
	h_results =   (int*) malloc(sizeof(int) * placements_size);
	
	// Allocating device memory
    cudaMalloc(&d_results,	sizeof(int) * placements_size); 
	cudaMalloc(&d_grid, 	sizeof(int) * placements_size);
    cudaMalloc(&d_costs, 	sizeof(int) * MAX_GRIDS);
   
   	// Getting data from host random_placements 
    cudaMemcpy(d_grid, h_randomPlacements,  sizeof(int) * placements_size,  cudaMemcpyHostToDevice);
    cudaMemcpy(d_costs, h_costs,            sizeof(int) * MAX_GRIDS,        cudaMemcpyHostToDevice);

   
    std::chrono::time_point<std::chrono::high_resolution_clock> start_clock, end_clock;
    start_clock = std::chrono::high_resolution_clock::now();

    int *d_localPositions; //<-- OPT
    cudaMalloc(&d_localPositions, placements_size * sizeof(int));
    annealing <<< MAX_BLOCKS, MAX_THREADS >>> ( nodes, dim, gridSize, 
                                                d_grid, d_v_i, d_v, d_A, 
                                                d_results, d_costs, d_random,
                                                d_table1hop, d_tablemesh,
                                                d_localPositions
                                            );

    cudaDeviceSynchronize();

    cudaMemcpy(h_results, d_results,    sizeof(int) * placements_size,  cudaMemcpyDeviceToHost);
    cudaMemcpy(h_costs, d_costs,        sizeof(int) * MAX_GRIDS,        cudaMemcpyDeviceToHost);

    end_clock = std::chrono::high_resolution_clock::now();
    timer = std::chrono::duration_cast<Seconds>(end_clock - start_clock).count();

	GPU_results(argv[3]);
    printf("%.3lf\n", timer);    
}


Writing GPU_SA.cu


## Running GPU placement
Results will be stored in GPU_results folder

In [6]:
!nvcc GPU_SA.cu -lcurand
!mkdir GPU_results 2>/dev/null || true

!echo "Running micro instances"
!for file in nvcc_colab_icpp2020/list_micro/*.in; do ./a.out $file random_placement/micro/$(basename $file ".in")_1k_rand.out $(basename $file "_zigzag.in") >> micro_mesh.txt; done
!mkdir GPU_results/mesh_micro 2>/dev/null || true
!mv *SAmesh_GPU_1k.out GPU_results/mesh_micro

!echo "Running kmeans_exp instances"
!for file in nvcc_colab_icpp2020/list_kmeans_expanded/*.in; do ./a.out $file random_placement/kmeans_exp/$(basename $file ".in")_1k_rand.out $(basename $file "_zigzag.in") >> kmeans_exp_mesh.txt; done
!mkdir GPU_results/mesh_kmeans_exp 2>/dev/null || true
!mv *SAmesh_GPU_1k.out GPU_results/mesh_kmeans_exp

!echo "Running cgrame instances"
!for file in nvcc_colab_icpp2020/list_cgrame/*.in; do ./a.out $file random_placement/cgrame/$(basename $file ".in")_1k_rand.out $(basename $file "_zigzag.in") >> cgrame_mesh.txt; done
!mkdir GPU_results/mesh_cgrame 2>/dev/null || true
!mv *SAmesh_GPU_1k.out GPU_results/mesh_cgrame

!echo "Running kmeans instances"
!for file in nvcc_colab_icpp2020/list_kmeans/*.in; do ./a.out $file random_placement/kmeans/$(basename $file ".in")_1k_rand.out $(basename $file "_zigzag.in") >> kmeans_mesh.txt; done
!mkdir GPU_results/mesh_kmeans 2>/dev/null || true
!mv *SAmesh_GPU_1k.out GPU_results/mesh_kmeans

!echo "Running tree instances"
!for file in nvcc_colab_icpp2020/list_tree/*.in; do ./a.out $file random_placement/tree/$(basename $file ".in")_1k_rand.out $(basename $file "_zigzag.in") >> tree_mesh.txt; done
!mkdir GPU_results/mesh_tree 2>/dev/null || true
!mv *SAmesh_GPU_1k.out GPU_results/mesh_tree

!echo "Running map instances"
!for file in nvcc_colab_icpp2020/list_map/*.in; do ./a.out $file random_placement/map/$(basename $file ".in")_1k_rand.out $(basename $file "_zigzag.in") >> map_mesh.txt; done
!mkdir GPU_results/mesh_map 2>/dev/null || true
!mv *SAmesh_GPU_1k.out GPU_results/mesh_map

Running micro instances
FILE: merge_stream_join running on Tesla T4
FILE: merge_tradicional running on Tesla T4
FILE: resparsify_stream_join running on Tesla T4
FILE: resparsify_tradicional running on Tesla T4
FILE: sparse_vec_mult_stream_join running on Tesla T4
FILE: sparse_vec_mult_tradicional running on Tesla T4
FILE: stream_db_join_stream_join running on Tesla T4
FILE: stream_db_join_tradicional running on Tesla T4
Running kmeans_exp instances
FILE: K4N4_exp running on Tesla T4
FILE: K4N5_exp running on Tesla T4
FILE: K4N6_exp running on Tesla T4
FILE: K4N7_exp running on Tesla T4
FILE: K4N8_exp running on Tesla T4
FILE: K5N4_exp running on Tesla T4
FILE: K5N5_exp running on Tesla T4
FILE: K5N6_exp running on Tesla T4
FILE: K5N7_exp running on Tesla T4
FILE: K5N8_exp running on Tesla T4
FILE: K6N4_exp running on Tesla T4
FILE: K6N5_exp running on Tesla T4
FILE: K6N6_exp running on Tesla T4
FILE: K6N7_exp running on Tesla T4
FILE: K6N8_exp running on Tesla T4
FILE: K7N4_exp running