<a href="https://colab.research.google.com/github/moustafa-7/parallel-project/blob/main/phase_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/parallel_final_project

/content/drive/MyDrive/parallel_final_project


In [3]:
!g++ -fopenmp -o gen filegen.cpp 
!sleep 5
!./gen

File generated


## Load kernel function

In [4]:
%%file load_kernel.h

/*
 * Read the whole file at `path` into a freshly malloc'ed, NUL-terminated
 * buffer.
 *
 * path: name of the file to read.
 * buf:  out-parameter. On success *buf points at the file contents followed
 *       by a terminating '\0'; the caller owns it and must free() it.
 *       On failure *buf is set to NULL.
 *
 * Returns the file size in bytes (not counting the terminator), or -1L if
 * the file could not be opened, measured, read completely, or closed, or if
 * the buffer could not be allocated.
 */
long LoadOpenCLKernel(char const* path, char **buf)
{
    FILE  *fp;
    size_t fsz;
    long   off_end;
    long   result   = -1L;
    char  *contents = NULL;

    if( NULL == path || NULL == buf ) {
        return -1L;
    }
    *buf = NULL;

    /* Binary mode so the size reported by ftell() matches what fread()
       delivers even on platforms that translate line endings. */
    fp = fopen(path, "rb");
    if( NULL == fp ) {
        return -1L;
    }

    /* Seek to the end of the file; the offset there is the file size */
    if( 0 != fseek(fp, 0L, SEEK_END) ) {
        goto done;
    }
    if( 0 > (off_end = ftell(fp)) ) {
        goto done;
    }
    fsz = (size_t)off_end;

    /* Allocate a buffer to hold the whole file plus the terminator */
    contents = (char *) malloc(fsz + 1);
    if( NULL == contents ) {
        goto done;
    }

    /* Rewind and slurp the file into the buffer */
    rewind(fp);
    if( fsz != fread(contents, 1, fsz, fp) ) {
        goto done;
    }

    /* Make sure the buffer is NUL-terminated */
    contents[fsz] = '\0';
    result = (long)fsz;

done:
    /* Single exit path: the stream is always closed and a failed load never
       hands a stale pointer back to the caller. */
    if( EOF == fclose(fp) ) {
        result = -1L;
    }
    if( result < 0L ) {
        free(contents);
        contents = NULL;
    }
    *buf = contents;
    return result;
}


Overwriting load_kernel.h


## Setup

In [5]:
%%file setup.h

printf("Initializing OpenCL device...\n");

   cl_uint dev_cnt = 0;
   clGetPlatformIDs(0, 0, &dev_cnt);

   cl_platform_id platform_ids[dev_cnt];
   clGetPlatformIDs(dev_cnt, platform_ids, NULL);

   // Connect to a compute device
   int gpu = 1;
   err = clGetDeviceIDs(platform_ids[0], gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
   if (err != CL_SUCCESS)
   {
       printf("Error: Failed to create a device group!\n");
       return EXIT_FAILURE;
   }

   // Create a compute context
   context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
   if (!context)
   {
       printf("Error: Failed to create a compute context!\n");
       return EXIT_FAILURE;
   }

   // Create a command commands
   commands = clCreateCommandQueue(context, device_id, 0, &err);
   if (!commands)
   {
       printf("Error: Failed to create a command commands!\n");
       return EXIT_FAILURE;
   }

   // Create the compute program from the source file
   char *KernelSource;
   long lFileSize;



   // ############# change kernel name here ############################## replace "add_kernel.cl" with whatever 
   lFileSize = LoadOpenCLKernel("vec_add.cl", &KernelSource);
   
   if( lFileSize < 0L ) {
       printf("\nhere");
       perror("File read failed");
       return 1;
   }

   program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
   if (!program)
   {
       printf("Error: Failed to create compute program!\n");
       return EXIT_FAILURE;
   }

   // Build the program executable
   err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
   if (err != CL_SUCCESS)
   {
       size_t len;
       char buffer[2048];
       printf("Error: Failed to build program executable!\n");
       clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
       printf("%s\n", buffer);
       exit(1);
   }

Overwriting setup.h


## Kernels

In [6]:
%%file vec_add.cl


// Finds the two largest entries of array[0..n-1] and their indices.
//
//   array    : per-candidate vote counts
//   n        : number of entries to scan
//   largest1 : out, largest value          idx1 : out, its index
//   largest2 : out, second-largest value   idx2 : out, its index
//
// Ties: an entry equal to the current largest becomes the new largest and
// the previous one drops to second place; an entry equal to the current
// second place does not replace it.
// With a single entry both results refer to entry 0.
//
// Returns 0 on success, or -1 (outputs untouched) when n < 1.
int find_max2(__global int* array, int n, int *largest1, int *largest2, int* idx1, int *idx2)
{
    if (n < 1)
    {
        return -1;
    }

    *largest1 = array[0];
    *idx1 = 0;

    if (n == 1)
    {
        // There is no second entry to read; report entry 0 twice.
        *largest2 = *largest1;
        *idx2 = 0;
        return 0;
    }

    *largest2 = array[1];
    *idx2 = 1;

    // Order the first two entries so that largest1 >= largest2.
    if (*largest1 < *largest2)
    {
        int temp = *largest1;
        int tempIdx = *idx1;
        *largest1 = *largest2;
        *largest2 = temp;
        *idx1 = *idx2;
        *idx2 = tempIdx;
    }

    for (int i = 2; i < n; i++)
    {
        int value = array[i];

        if (value >= *largest1)
        {
            // New leader: the old leader becomes the runner-up.
            *largest2 = *largest1;
            *idx2 = *idx1;
            *largest1 = value;
            *idx1 = i;
        }
        else if (value > *largest2)
        {
            // Strictly below the leader here, so only the runner-up changes.
            *largest2 = value;
            *idx2 = i;
        }
    }

    return 0;
}


// Two-round vote count.
//
// `data` holds num_votes ballots of num_candidates ints each; every entry is
// a 1-based candidate number and the first entry of a ballot is its first
// choice. Each work-item handles num_elements consecutive ballots.
//
//   round 1: first choices are tallied per work-group in local_sum and then
//            added into global_sum (one slot per candidate).
//   round 2: if the leading candidate has less than half of num_votes, each
//            ballot is counted for whichever of the two leading candidates
//            appears first in it.
//
// round_2_variables receives: [0] round-2 count of the leader,
// [1] round-2 count of the runner-up, [2] leader index, [3] runner-up index
// (indices are 0-based).
//
// local_sum must provide at least num_candidates ints; global_sum and
// round_2_variables[0..1] are accumulated into and must be zero on entry.
//
// NOTE(review): barrier() only synchronises the work-items of one
// work-group. With more than one work-group, find_max2 may read global_sum
// before the other groups have added their tallies, so groups can disagree
// on the two leaders. Fixing that needs a second kernel launch (or a single
// work-group) -- confirm against the host code.
__kernel void vec_add(                           
    __global int* data,                           
    __local int* local_sum,                           
    __global int* global_sum,
    __global int* round_2_variables,
    const unsigned int num_votes ,                     
    const unsigned int num_candidates,
    const unsigned int num_elements                   
    )                                             
{            
    const int id = get_global_id(0);
    const int local_id = get_local_id(0);
    const int local_size = get_local_size(0);

    // Range of `data` entries owned by this work-item: ballots
    // [id*num_elements, (id+1)*num_elements), num_candidates entries each.
    int begin = id * num_elements;
    int end = begin + (num_elements - 1);

    begin = begin * num_candidates;
    end = end * (num_candidates) + num_candidates;

    // Local memory starts out undefined, so clear the tally (cooperatively)
    // before anyone adds to it.
    for(int j=local_id; j<num_candidates; j+=local_size){
        local_sum[j] = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Round 1: count the first choice of every ballot owned by this work-item.
    for(int i=begin; i<end; i+=num_candidates){
        if(i < num_candidates*num_votes){
            int choice = data[i]-1;
            // Skip candidate numbers outside 1..num_candidates instead of
            // writing outside local_sum.
            if((unsigned int)choice < num_candidates){
                atomic_add(&local_sum[choice] , 1);
            }
        }
        else{
            break;
        }
    }            
    
    barrier(CLK_LOCAL_MEM_FENCE);

    // One work-item per group folds the group's tally into the global one.
    if(local_id == 0){
        for(int j=0; j<num_candidates; j++){
            atomic_add(&global_sum[j] , local_sum[j]);
        }
    }

    // Work-group barrier: keeps the group's tally intact until it has been
    // read above. It does not wait for other work-groups.
    barrier(CLK_GLOBAL_MEM_FENCE);

    int idx_1 = 0;
    int idx_2 = 0;
    int first_count = 0;
    int second = 0;

    find_max2(global_sum, num_candidates, &first_count, &second, &idx_1, &idx_2);

    // Reset the local tally for round 2. No store to local_sum may follow
    // the barrier: it would race with the atomic adds of faster work-items
    // and drop their counts.
    for(int j=local_id; j<num_candidates; j+=local_size){
        local_sum[j] = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Run-off only when the leader has less than half of all votes. The
    // 64-bit integer compare is equivalent to first_count/num_votes < 0.5
    // and needs no double-precision support on the device.
    if(2 * (long)first_count < (long)num_votes){
      for(int i=begin; i<end; i+=num_candidates){
        if(i < num_candidates*num_votes){
          // The ballot counts for whichever leader it lists first.
          for(int j=0; j< num_candidates; j++){
              int choice = data[i+j]-1;
              if(choice == idx_1 || choice == idx_2){
                atomic_add(&local_sum[choice], 1);
                break;
              }
          }
        }
        else{
            break;
        }
      }    
    }

    barrier(CLK_LOCAL_MEM_FENCE);
    if(local_id == 0){
        atomic_add(&round_2_variables[0], local_sum[idx_1]);
        atomic_add(&round_2_variables[1], local_sum[idx_2]);
    }

    barrier(CLK_GLOBAL_MEM_FENCE);
    if(id == 0){
        // Single writer: a plain store does not depend on what the buffer
        // held before the launch.
        round_2_variables[2] = idx_1;
        round_2_variables[3] = idx_2;
    }
}


Overwriting vec_add.cl


In [7]:
%%file TEST.c

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <unistd.h>
#include <time.h>
#include <sys/types.h>
#include <sys/stat.h>
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#include <CL/cl.h>
#include <stdbool.h>
#include "load_kernel.h"



int main(int argc, char *argv[])
{
    
 ////////////////// Host stuff ///////////////////////

  int err;                            // error code returned from api calls

  cl_device_id device_id;             // compute device id
  cl_context context;                 // compute context
  cl_command_queue commands;          // compute command queue
  cl_program program;                 // compute program
  cl_kernel kernel;                   // compute kernel
  clock_t          st, et;                  // Count time
 
//  
/////////////////////////////////////////
  int num_votes; // number of votes in the file
  int num_candidates;
  // reading file and put it in data
  static const char filename[] = "./text.txt";
  FILE *file = fopen(filename, "r");


  fscanf(file, "%d", &num_candidates);

  fscanf(file, "%d", &num_votes);


/////////////////////////////////////////
  
  size_t globalSize, localSize;
  // cl_int err;
  localSize = 64;   // workitems count
  globalSize = ceil(num_votes/(float)localSize)*localSize;  

  // printf("%zd\n", globalSize);
 
  int num_elements = ceil(num_votes/(float)globalSize);  // chuck for each thread
   



  int *data;
  /* int *local_sum; */
  int *global_sum;
  int *round_2_variables;
 
 // Allocate memory for data from file in the host
 data = (int*)malloc(sizeof(int) * num_votes * num_candidates);
 /* local_sum = (int*)malloc(sizeof(int) * num_candidates); */
 global_sum = (int*)malloc(sizeof(int) * num_candidates);
 round_2_variables = (int*)malloc(sizeof(int) * 4); // hightest_idx1, highest_idx2, count1, count2



  for(int i = 0; i < num_votes*num_candidates; i++) {       
              fscanf(file, "%d", &data[i ]); // one vector of data   
      }


    fclose(file);
 
  
   /////////////////// OpenCL stuff /////////////////////

  ///// Buffers /////

    // Device input buffers
      cl_mem d_data;
    // Device output buffer
      /* cl_mem d_local_sum; */
      cl_mem d_global_sum;
      cl_mem d_round_2_variables;




  ////////////////////////////////////////
   
    #include "setup.h"

  ////////////////////////////////////////

    kernel = clCreateKernel(program, "vec_add", &err);
      
    if (!kernel || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        exit(1);
    }
   
     // Create the input and output arrays in device memory for our calculation
   
      d_data = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int) * num_votes * num_candidates , NULL, NULL);
      /* d_local_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int) * num_candidates, NULL, NULL); */
      d_global_sum = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int) * num_candidates, NULL, NULL);
      d_round_2_variables = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int) * 4, NULL, NULL);
   

      err = clEnqueueWriteBuffer(commands, d_data, CL_TRUE, 0, sizeof(int) * num_votes * num_candidates, data, 0, NULL, NULL); // write data into buffer

   
   ///////  Execute  ////////"

       // Set the arguments to our compute kernel
      err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_data);
      err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), NULL);
      err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_global_sum);
      err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_round_2_variables);
      err |= clSetKernelArg(kernel, 4, sizeof(int), &num_votes);
      err |= clSetKernelArg(kernel, 5, sizeof(int), &num_candidates);
      err |= clSetKernelArg(kernel, 6, sizeof(int), &num_elements);

 
 
   
     // Execute the kernel over the entire range of the data set  
      printf("Executing\n");
      err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL);
                                                               
      // Wait for the command queue to get serviced before reading back results
      clFinish(commands);
   
      // Read the results from the device
      clEnqueueReadBuffer( commands, d_global_sum, CL_TRUE, 0, sizeof(int) * num_candidates , global_sum, 0, NULL, NULL );
      clEnqueueReadBuffer( commands, d_round_2_variables, CL_TRUE, 0, sizeof(int) * 4 , round_2_variables, 0, NULL, NULL );
      
      
      printf("\n\nBegin round 1\n================================================\n");
      int winner;
 
      for(int i=0; i<num_candidates; i++){
            printf("Candidate [%d] count: %d\n",i+1, global_sum[i]);
        }

      for(int k=0; k<num_candidates;k++)
          printf("Percentage of candidate [%d] in round 1: %f %%\n", k+1, (double)global_sum[k]*100/(double)num_votes);
      
 
      
      if(round_2_variables[0] != 0 && round_2_variables[1]!=0){
        printf("\n\nBegin round 2\n================================================\n");
        for(int i=0; i<2; i++){
              printf("candidate [%d] count: %d",round_2_variables[i+2]+1, round_2_variables[i]);
              printf("\n");
          }
        for(int k=0; k<2;k++)
            printf("Percentage of candidate [%d] in round 2: %f %%\n", round_2_variables[k+2]+1, (double)round_2_variables[k]*100/(double)num_votes);

        if(round_2_variables[0] != round_2_variables[1]){
          winner = (round_2_variables[0] > round_2_variables[1]) ?  round_2_variables[2]: round_2_variables[3];
          printf("\nCandidate [%d] wins\n", winner+1);
        }
        else
          printf("It's a draw!\n");

      }
 
      else{
          printf("\nCandidate [%d] won in first round\n", round_2_variables[2]+1);
      }



    // release OpenCL resources
 

    // Device input buffers

    clReleaseMemObject(d_data);
    clReleaseMemObject(d_global_sum);
    clReleaseMemObject(d_round_2_variables);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);
 
    //release host memory
    free(data);
    free(global_sum);
    free(round_2_variables);
 
    return 0;
 
}


Overwriting TEST.c


In [8]:
!cc  -I /usr/local/cuda/include TEST.c  /usr/local/cuda/lib64/libOpenCL.so -lm

In [9]:
!./a.out

Initializing OpenCL device...
Executing


Begin round 1
Candidate [1] count: 25
Candidate [2] count: 27
Candidate [3] count: 27
Candidate [4] count: 21
Candidate [5] count: 26
Candidate [6] count: 34
Candidate [7] count: 40
Percentage of candidate [1] in round 1: 12.500000 %
Percentage of candidate [2] in round 1: 13.500000 %
Percentage of candidate [3] in round 1: 13.500000 %
Percentage of candidate [4] in round 1: 10.500000 %
Percentage of candidate [5] in round 1: 13.000000 %
Percentage of candidate [6] in round 1: 17.000000 %
Percentage of candidate [7] in round 1: 20.000000 %


Begin round 2
candidate [7] count: 102
candidate [6] count: 98
Percentage of candidate [7] in round 2: 51.000000 %
Percentage of candidate [6] in round 2: 49.000000 %

Candidate [7] wins
