<a href="https://colab.research.google.com/github/moustafa-7/parallel-project/blob/main/phase_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#setup

In [171]:
# Mount Google Drive at /content/drive so the project folder on Drive is reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [172]:
# Switch to the project folder on Drive; the %%file cells below use relative paths, so they write here.
%cd /content/drive/MyDrive/parallel_final_project

/content/drive/MyDrive/parallel_final_project


## Load kernel function

In [173]:
%%file load_kernel.h

/*
 * Read the whole file at `path` into a newly malloc'ed, NUL-terminated buffer.
 *
 * path: name of the file to read (e.g. an OpenCL kernel source file).
 * buf:  out-parameter. On success it receives the buffer, which the caller
 *       owns and must free(). On failure it is set to NULL.
 *
 * Returns the number of bytes read (not counting the terminating NUL),
 * or -1L if the arguments are NULL or any file/allocation step fails.
 */
long LoadOpenCLKernel(char const* path, char **buf)
{
    FILE  *fp;
    size_t fsz;
    long   off_end;

    if( NULL == path || NULL == buf ) {
        return -1L;
    }
    *buf = NULL;

    /* Open the file */
    fp = fopen(path, "r");
    if( NULL == fp ) {
        return -1L;
    }

    /* Seek to the end and take the byte offset there as the file size.
       The stream must be closed on every failure path from here on. */
    if( 0 != fseek(fp, 0L, SEEK_END) || 0 > (off_end = ftell(fp)) ) {
        fclose(fp);
        return -1L;
    }
    fsz = (size_t)off_end;

    /* Allocate a buffer to hold the whole file plus the terminating NUL */
    *buf = (char *) malloc(fsz + 1);
    if( NULL == *buf ) {
        fclose(fp);
        return -1L;
    }

    /* Rewind file pointer to start of file */
    rewind(fp);

    /* Slurp file into buffer; a short read is treated as failure */
    if( fsz != fread(*buf, 1, fsz, fp) ) {
        free(*buf);
        *buf = NULL;
        fclose(fp);
        return -1L;
    }

    /* Close the file */
    if( EOF == fclose(fp) ) {
        free(*buf);
        *buf = NULL;
        return -1L;
    }

    /* Make sure the buffer is NUL-terminated so it can be used as a C string */
    (*buf)[fsz] = '\0';

    /* Return the file size */
    return (long)fsz;
}


Overwriting load_kernel.h


## Setup

In [174]:
%%file setup.h

printf("Initializing OpenCL device...\n");

   /*
    * OpenCL bootstrap fragment. It is #included inside main() and relies on the
    * including function having already declared: err, device_id, context,
    * commands and program. It leaves `program` built and ready for
    * clCreateKernel(). On failure it returns from (or exits) the including
    * function.
    */

   // Ask how many platforms exist first; the count sizes the array below,
   // so a zero count must be rejected before platform_ids[0] is used.
   cl_uint dev_cnt = 0;
   err = clGetPlatformIDs(0, NULL, &dev_cnt);
   if (err != CL_SUCCESS || dev_cnt == 0)
   {
       printf("Error: Failed to find an OpenCL platform!\n");
       return EXIT_FAILURE;
   }

   cl_platform_id platform_ids[dev_cnt];
   err = clGetPlatformIDs(dev_cnt, platform_ids, NULL);
   if (err != CL_SUCCESS)
   {
       printf("Error: Failed to query the OpenCL platforms!\n");
       return EXIT_FAILURE;
   }

   // Connect to a compute device (first platform only; GPU when gpu != 0)
   int gpu = 1;
   err = clGetDeviceIDs(platform_ids[0], gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
   if (err != CL_SUCCESS)
   {
       printf("Error: Failed to create a device group!\n");
       return EXIT_FAILURE;
   }

   // Create a compute context
   context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
   if (!context)
   {
       printf("Error: Failed to create a compute context!\n");
       return EXIT_FAILURE;
   }

   // Create a command queue
   commands = clCreateCommandQueue(context, device_id, 0, &err);
   if (!commands)
   {
       printf("Error: Failed to create a command commands!\n");
       return EXIT_FAILURE;
   }

   // Create the compute program from the source file
   char *KernelSource;
   long lFileSize;

   // ############# change kernel name here ############################## replace "vec_add.cl" with whatever kernel file should be built
   lFileSize = LoadOpenCLKernel("vec_add.cl", &KernelSource);

   if( lFileSize < 0L ) {
       printf("\nhere");
       perror("File read failed");
       return 1;
   }

   program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
   if (!program)
   {
       printf("Error: Failed to create compute program!\n");
       return EXIT_FAILURE;
   }

   // Build the program executable
   err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
   if (err != CL_SUCCESS)
   {
       size_t len;
       // Start empty so the printf below is well-defined even if the log query itself fails.
       char buffer[2048] = "";
       printf("Error: Failed to build program executable!\n");
       clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
       printf("%s\n", buffer);
       exit(1);
   }

Overwriting setup.h


## Kernels

In [216]:
%%file vec_add.cl



/*
 * Find the two largest entries of array[0..n-1] and where they are.
 *
 * array:    values to scan (global memory).
 * n:        number of entries; at least 2 are needed.
 * largest1: receives the largest value, idx1 its index.
 * largest2: receives the second-largest value, idx2 its index.
 *           Equal values count as separate entries, so largest2 may equal
 *           largest1; on a tie the later index ends up in idx1.
 *
 * Returns 0 on success, or -1 (outputs untouched) when n < 2.
 */
int find_max2(__global int* array, int n, int *largest1, int *largest2, int* idx1, int *idx2)
{
    /* The seeding below reads array[0] and array[1]. */
    if (n < 2)
    {
        return -1;
    }

    /* Seed with the first two entries, ordered so that largest1 >= largest2. */
    if (array[0] < array[1])
    {
        *largest1 = array[1];
        *idx1 = 1;
        *largest2 = array[0];
        *idx2 = 0;
    }
    else
    {
        *largest1 = array[0];
        *idx1 = 0;
        *largest2 = array[1];
        *idx2 = 1;
    }

    for (int i = 2; i < n; i++)
    {
        if (array[i] >= *largest1)
        {
            /* New leader: the previous leader becomes the runner-up. */
            *largest2 = *largest1;
            *largest1 = array[i];
            *idx2 = *idx1;
            *idx1 = i;
        }
        else if (array[i] > *largest2)
        {
            /* Below the leader (guaranteed by the branch above) but beats the runner-up. */
            *largest2 = array[i];
            *idx2 = i;
        }
    }

    return 0;
}

/*
 * Tally votes per candidate.
 *
 * data:           num_votes records of num_candidates ints each. Only the
 *                 first int of each record is read here; it is used as a
 *                 1-based candidate number (presumably the ballot's first
 *                 preference -- confirm against the input format).
 * local_sum:      per-work-group scratch tally; the host must size this
 *                 argument to hold num_candidates ints.
 * global_sum:     num_candidates counters the group tallies are added into;
 *                 the host must zero them before launch.
 * num_votes:      number of records in data.
 * num_candidates: ints per record and number of counters.
 * num_elements:   consecutive records handled by each work-item.
 */
__kernel void vec_add(
    __global int* data,
    __local int* local_sum,
    __global int* global_sum,
    const unsigned int num_votes ,
    const unsigned int num_candidates,
    const unsigned int num_elements
    )
{
    int id = get_global_id(0);
    int local_id = get_local_id(0);
    int local_size = get_local_size(0);

    /* Local memory is uninitialised on entry: clear this group's tally
       (work-items share the job) before anyone increments it. */
    for(int j=local_id; j<(int)num_candidates; j+=local_size){
        local_sum[j] = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Record range [begin, end) of this work-item, converted to int offsets into data. */
    int begin = id * num_elements;
    int end = begin + (num_elements - 1);

    begin = begin * num_candidates;
    end = end * (num_candidates) + num_candidates;

    for(int i=begin; i<end; i+=num_candidates){
        if(i < num_candidates*num_votes){
            /* Skip values that are not a valid 1-based candidate number
               instead of writing outside local_sum. */
            int candidate = data[i]-1;
            if(candidate >= 0 && candidate < (int)num_candidates){
                atomic_add(&local_sum[candidate] , 1);
            }
        }
        else{
            /* Padding work-items past the last record have nothing to count. */
            break;
        }
    }

    /* Every work-item of the group must finish counting before the tally is flushed. */
    barrier(CLK_LOCAL_MEM_FENCE);

    if(local_id == 0){
        printf("%d\n", local_id);   /* debug trace: one line per work-group */
        for(int j=0; j<num_candidates; j++){
            atomic_add(&global_sum[j] , local_sum[j]);
        }
    }

    int idx_1 = 0;
    int idx_2 = 0;
    int first = 1;
    int second =2 ;

    /* NOTE(review): nothing here waits for the other work-groups, so
       global_sum may still be incomplete when work-item 0 scans it. Treat
       these values as a debug printout only; the reliable top two must be
       computed after the kernel has finished (e.g. on the host). */
    if(id == 0){
    find_max2(global_sum, num_candidates, &first, &second, &idx_1, &idx_2);
    printf("%d\n", idx_1);
    printf("%d\n", idx_2);
    printf("%d\n", first);
    printf("%d\n", second);
    }

    /* TODO: runoff round (previously sketched here as commented-out code).
       When the leader holds less than half of num_votes: reset the tallies,
       then for each record count the first entry that names one of the top
       two candidates, and flush the group tallies into global_sum as above. */
}


Overwriting vec_add.cl


In [217]:
%%file TEST.c

// #include <stdio.h>
// #include <stdlib.h>
// #include <math.h>
// #include "load_kernel.h"
// #include <CL/cl.h>                                                        


#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <unistd.h>
#include <time.h>
#include <sys/types.h>
#include <sys/stat.h>
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#include <CL/cl.h>
#include <stdbool.h>
#include "load_kernel.h"


/*
 * Find the two largest counts in array[0..n-1] and their indices.
 *
 * array:    values to scan.
 * n:        number of entries; at least 2 are needed.
 * largest1: receives the largest value, idx1 its index.
 * largest2: receives the second-largest value, idx2 its index.
 *           Equal values count as separate entries, so largest2 may equal
 *           largest1; on a tie the later index ends up in idx1.
 *
 * Returns 0 on success, or -1 (outputs untouched) when array is NULL or n < 2.
 */
int find_max2(int* array, int n, int *largest1, int *largest2, int* idx1, int *idx2)
{
    /* The seeding below reads array[0] and array[1]. */
    if (array == NULL || n < 2)
    {
        return -1;
    }

    /* Seed with the first two entries, ordered so that largest1 >= largest2. */
    if (array[0] < array[1])
    {
        *largest1 = array[1];
        *idx1 = 1;
        *largest2 = array[0];
        *idx2 = 0;
    }
    else
    {
        *largest1 = array[0];
        *idx1 = 0;
        *largest2 = array[1];
        *idx2 = 1;
    }

    for (int i = 2; i < n; i++)
    {
        if (array[i] >= *largest1)
        {
            /* New leader: the previous leader becomes the runner-up. */
            *largest2 = *largest1;
            *largest1 = array[i];
            *idx2 = *idx1;
            *idx1 = i;
        }
        else if (array[i] > *largest2)
        {
            /* Below the leader (guaranteed by the branch above) but beats the runner-up. */
            *largest2 = array[i];
            *idx2 = i;
        }
    }

    return 0;
}
//

/*
 * Host driver: reads the ballots from ./text.txt, runs the "vec_add" kernel
 * to tally votes per candidate and prints one "global: <count>" line per
 * candidate.
 *
 * Input file layout, as read below: number of candidates, number of votes,
 * then num_votes * num_candidates integers.
 *
 * Returns 0 on success and a non-zero status on any failure.
 */
int main(int argc, char *argv[])
{

 ////////////////// Host stuff ///////////////////////

  int err;                            // error code returned from api calls

  cl_device_id device_id;             // compute device id
  cl_context context;                 // compute context
  cl_command_queue commands;          // compute command queue
  cl_program program;                 // compute program
  cl_kernel kernel;                   // compute kernel

/////////////////////////////////////////
  int num_votes; // number of votes in the file
  int num_candidates;
  // reading file and put it in data
  static const char filename[] = "./text.txt";
  FILE *file = fopen(filename, "r");
  if (file == NULL)
  {
      perror(filename);
      return EXIT_FAILURE;
  }

  if (fscanf(file, "%d", &num_candidates) != 1 ||
      fscanf(file, "%d", &num_votes) != 1 ||
      num_candidates < 1 || num_votes < 1)
  {
      printf("Error: Failed to read the candidate and vote counts!\n");
      fclose(file);
      return EXIT_FAILURE;
  }

/////////////////////////////////////////

  size_t globalSize, localSize;
  localSize = 100;
  // Round the number of votes up to a whole number of work-groups
  // (integer arithmetic, so large counts are not rounded by float precision).
  globalSize = (((size_t)num_votes + localSize - 1) / localSize) * localSize;

  printf("%zu\n", globalSize);

  // Votes handled per work-item, rounded up.
  int num_elements = (int)(((size_t)num_votes + globalSize - 1) / globalSize);

  const size_t data_count = (size_t)num_votes * (size_t)num_candidates;
  const size_t data_bytes = sizeof(int) * data_count;
  const size_t sum_bytes  = sizeof(int) * (size_t)num_candidates;

  // Allocate memory for data from file in the host.
  // global_sum is zero-filled because the kernel accumulates into it.
  int *data = (int *)malloc(data_bytes);
  int *global_sum = (int *)calloc((size_t)num_candidates, sizeof(int));
  if (data == NULL || global_sum == NULL)
  {
      printf("Error: Failed to allocate host memory!\n");
      free(data);
      free(global_sum);
      fclose(file);
      return EXIT_FAILURE;
  }

  for (size_t i = 0; i < data_count; i++) {
      if (fscanf(file, "%d", &data[i]) != 1)   // one vector of data
      {
          printf("Error: Vote data in %s is incomplete!\n", filename);
          free(data);
          free(global_sum);
          fclose(file);
          return EXIT_FAILURE;
      }
  }

  fclose(file);

   /////////////////// OpenCL stuff /////////////////////

  ///// Buffers /////

    // Device input buffer
      cl_mem d_data;
    // Device output buffer
      cl_mem d_global_sum;

  ////////////////////////////////////////

    // Creates device_id, context, commands and a built program (also declares KernelSource).
    #include "setup.h"

    kernel = clCreateKernel(program, "vec_add", &err);

    if (!kernel || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        exit(1);
    }

     // Create the input and output arrays in device memory for our calculation.
     // The kernel both reads and updates global_sum, so it must be READ_WRITE.
      d_data = clCreateBuffer(context, CL_MEM_READ_ONLY, data_bytes, NULL, &err);
      if (!d_data || err != CL_SUCCESS)
      {
          printf("Error: Failed to allocate device memory!\n");
          exit(1);
      }
      d_global_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, sum_bytes, NULL, &err);
      if (!d_global_sum || err != CL_SUCCESS)
      {
          printf("Error: Failed to allocate device memory!\n");
          exit(1);
      }

      // Upload the votes and the zeroed counters
      err  = clEnqueueWriteBuffer(commands, d_data, CL_TRUE, 0, data_bytes, data, 0, NULL, NULL);
      err |= clEnqueueWriteBuffer(commands, d_global_sum, CL_TRUE, 0, sum_bytes, global_sum, 0, NULL, NULL);
      if (err != CL_SUCCESS)
      {
          printf("Error: Failed to write to device memory!\n");
          exit(1);
      }

   ///////  Execute  ////////

       // Set the arguments to our compute kernel.
       // Argument 1 is __local: pass NULL plus the byte size of the scratch
       // array the kernel indexes (one int per candidate).
      cl_uint k_num_votes      = (cl_uint)num_votes;
      cl_uint k_num_candidates = (cl_uint)num_candidates;
      cl_uint k_num_elements   = (cl_uint)num_elements;

      err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_data);
      err |= clSetKernelArg(kernel, 1, sum_bytes, NULL);
      err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_global_sum);
      err |= clSetKernelArg(kernel, 3, sizeof(cl_uint), &k_num_votes);
      err |= clSetKernelArg(kernel, 4, sizeof(cl_uint), &k_num_candidates);
      err |= clSetKernelArg(kernel, 5, sizeof(cl_uint), &k_num_elements);
      if (err != CL_SUCCESS)
      {
          printf("Error: Failed to set kernel arguments! %d\n", err);
          exit(1);
      }

     // Execute the kernel over the entire range of the data set
      printf("Executing\n");
      err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL);
      if (err != CL_SUCCESS)
      {
          printf("Error: Failed to execute kernel! %d\n", err);
          exit(1);
      }

      // Wait for the command queue to get serviced before reading back results
      clFinish(commands);

      // Read the results from the device
      err = clEnqueueReadBuffer( commands, d_global_sum, CL_TRUE, 0, sum_bytes, global_sum, 0, NULL, NULL );
      if (err != CL_SUCCESS)
      {
          printf("Error: Failed to read output array! %d\n", err);
          exit(1);
      }

      for(int k = 0; k<num_candidates;k++){
          printf("global: %d\n", global_sum[k]);
      }

    // TODO: runoff round (previously sketched here as commented-out code).
    // Use find_max2() on global_sum; if the leader has more than half of
    // num_votes the election is decided, otherwise launch a second kernel
    // that recounts the ballots between the top two candidates.

    // release OpenCL resources
    clReleaseMemObject(d_data);
    clReleaseMemObject(d_global_sum);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    // release host memory (KernelSource is the source buffer loaded in setup.h)
    free(KernelSource);
    free(data);
    free(global_sum);

    return 0;

}


Overwriting TEST.c


In [218]:
# Compile TEST.c with the OpenCL headers and libOpenCL.so found under /usr/local/cuda; -lm links the math library.
!cc  -I /usr/local/cuda/include TEST.c  /usr/local/cuda/lib64/libOpenCL.so -lm

In [219]:
# Run the binary from the compile cell above (no -o was given, so it is the default a.out).
!./a.out

600
Initializing OpenCL device...
Executing
0
0
0
0
0
0
6
4
54
54
global: 79
global: 64
global: 77
global: 65
global: 83
global: 63
global: 82
global: 66
4
6
83
82


In [207]:
%%file print.c

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

/* Smoke test: writes a fixed marker string to stdout and exits successfully. */
int main(){
    fputs("I am here", stdout);
    return 0;
}

Overwriting print.c


In [None]:
# Build print.c into an executable named p.
!gcc -o p print.c

In [None]:
# Run the executable p built from print.c.
!./p

I am here