# CUDA code definition

In [1]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy
import time

class GloVeFastDistances():
    """Cosine similarity of one query vector against a GloVe vocabulary on the GPU.

    The constructor parses a GloVe text file (one ``token v1 ... v300`` entry per
    line), sorts the tokens, uploads the float32 embedding matrix to the device
    and compiles the ``cosineSimilarity`` kernel.  ``get_similar_word`` then
    scores a query vector against every row of that matrix.

    Attributes set by ``__init__``:
        cosine_similarity: prepared PyCUDA function for the kernel.
        embeddings: float32 array of shape ``(rows, EMBEDDING_DIM)``, rows in
            sorted-token order.
        word_dictionary: token -> row index.
        inverse_word_dictionary: row index -> token.
        rows: number of distinct tokens.
        final_result: float32 host buffer of length ``rows`` that receives the
            similarities (reused by every call to ``get_similar_word``).
        c_model_gpu, a_gpu, distances_gpu: device allocations for the matrix,
            the query vector and the output.
        grid_dot, block_dot: launch configuration.
        pos: uint32 zeros of length ``rows`` (not read anywhere in this class).
    """

    # Number of components per embedding; the kernel's shared buffer and row
    # stride are sized from this.
    EMBEDDING_DIM = 300
    # Threads that cooperate on one row; must be a power of two <= 32 because it
    # is used as the shuffle width.
    LANES_PER_ROW = 8
    # Threads per block; must be a multiple of 32 so every warp is full.
    THREADS_PER_BLOCK = 512

    # EMBEDDING_DIM and LANES_PER_ROW are injected as #defines by _compile_kernel.
    _KERNEL_SOURCE = """
        __global__ void cosineSimilarity
        (const unsigned int limit, const float* A, float* distanceOut,float* C_model, const float normA) {
            __shared__ float fastA[EMBEDDING_DIM];
            const unsigned int id = blockIdx.x * blockDim.x + threadIdx.x;

            // Stage the single query embedding held in A into shared memory.
            // The block-strided loop fills the whole buffer for any block size.
            for (unsigned int i = threadIdx.x; i < EMBEDDING_DIM; i += blockDim.x) {
                fastA[i] = A[i];
            }
            __syncthreads();

            const bool active = id < limit;
            const unsigned int row = id / LANES_PER_ROW;                // Row scored by this thread group
            const unsigned int interiorId = threadIdx.x % LANES_PER_ROW; // Position inside the group

            float acum = 0.0f;   // Partial dot product A . C_model[row]
            float c_norm = 0.0f; // Partial squared norm of C_model[row]
            if (active) {
                for (unsigned int i = interiorId; i < EMBEDDING_DIM; i += LANES_PER_ROW) {
                    const float cvalAux = C_model[row * EMBEDDING_DIM + i];
                    acum += fastA[i] * cvalAux;
                    c_norm += cvalAux * cvalAux;
                }
            }

            // The shuffles use the full-warp mask, so they are executed by every
            // thread, including those past `limit` (which contribute zeros).
            // Only the memory accesses above and below are guarded by `active`.
            for (unsigned int offset = LANES_PER_ROW / 2; offset > 0; offset >>= 1) {
                acum += __shfl_down_sync(0xffffffff, acum, offset, LANES_PER_ROW);
                c_norm += __shfl_down_sync(0xffffffff, c_norm, offset, LANES_PER_ROW);
            }

            if (active && interiorId == 0) { // First lane of the group writes the result
                distanceOut[row] = acum / (normA * sqrtf(c_norm));
            }
        }
        """

    def __init__(self,gloveFile):
        """Compile the kernel, parse ``gloveFile`` and upload the embeddings.

        Args:
            gloveFile: path to a GloVe text file with ``EMBEDDING_DIM`` values
                per line.

        Raises:
            ValueError: if the file holds no embeddings or a line does not
                contain a token followed by ``EMBEDDING_DIM`` numbers.
        """
        self.cosine_similarity = self._compile_kernel()

        words, matrix = self._load_embeddings(gloveFile)
        if not words:
            raise ValueError("no embeddings found in %r" % (gloveFile,))

        self.rows = len(words)
        self.embeddings = matrix
        self.word_dictionary = {word: idx for idx, word in enumerate(words)}
        self.inverse_word_dictionary = dict(enumerate(words))
        self.final_result = numpy.empty(self.rows, dtype=numpy.float32)

        self.c_model_gpu = cuda.mem_alloc(self.embeddings.nbytes)
        cuda.memcpy_htod(self.c_model_gpu, self.embeddings)
        self.pos = numpy.zeros(self.rows, dtype=numpy.uint32)

        # Each block scores THREADS_PER_BLOCK / LANES_PER_ROW rows; round up so
        # the last partial block is still launched.
        rows_per_block = self.THREADS_PER_BLOCK // self.LANES_PER_ROW
        self.grid_dot = ((self.rows + rows_per_block - 1) // rows_per_block, 1)
        self.block_dot = (self.THREADS_PER_BLOCK, 1, 1)
        # unsigned int, three device pointers, then a 4-byte float ("f") to
        # match the kernel's `const float normA`.
        self.cosine_similarity.prepare(("I", "P", "P", "P", "f"))

        itemsize = self.embeddings.itemsize  # bytes per float32
        self.a_gpu = cuda.mem_alloc(self.EMBEDDING_DIM * itemsize)
        self.distances_gpu = cuda.mem_alloc(self.rows * itemsize)

    @classmethod
    def _compile_kernel(cls):
        """Compile the CUDA source and return the ``cosineSimilarity`` function."""
        defines = "#define EMBEDDING_DIM %d\n#define LANES_PER_ROW %d\n" % (
            cls.EMBEDDING_DIM, cls.LANES_PER_ROW)
        module = SourceModule(defines + cls._KERNEL_SOURCE)
        return module.get_function("cosineSimilarity")

    @classmethod
    def _load_embeddings(cls, gloveFile):
        """Parse a GloVe text file into ``(sorted_tokens, float32_matrix)``.

        Splitting from the right keeps tokens that themselves contain spaces
        intact.  Blank lines are skipped; if a token appears more than once the
        last occurrence wins.

        Raises:
            ValueError: if a non-blank line does not consist of a token followed
                by exactly ``EMBEDDING_DIM`` numbers.
        """
        vectors = {}
        with open(gloveFile, 'r', encoding='utf-8') as glove_handle:
            for line_number, line in enumerate(glove_handle, start=1):
                if not line.strip():
                    continue
                fields = line.rstrip('\n').rsplit(' ', cls.EMBEDDING_DIM)
                if len(fields) != cls.EMBEDDING_DIM + 1:
                    raise ValueError(
                        "%s:%d: expected a token followed by %d values"
                        % (gloveFile, line_number, cls.EMBEDDING_DIM))
                # Store as float32 right away: same values as a later cast,
                # half the host memory while the file is being read.
                vectors[fields[0]] = numpy.array(
                    [float(val) for val in fields[1:]], dtype=numpy.float32)

        words = sorted(vectors)
        matrix = numpy.array([vectors[word] for word in words], dtype=numpy.float32)
        return words, matrix

    def get_similar_word(self,word):
        """Return the cosine similarity of ``word`` to every stored embedding.

        Args:
            word: array-like holding ``EMBEDDING_DIM`` numbers (the query
                embedding itself, not a token).

        Returns:
            float32 array of length ``rows``; entry ``i`` is the similarity to
            the token ``inverse_word_dictionary[i]``.  The array is
            ``self.final_result`` and is overwritten by the next call, so copy
            it if it must outlive that call.

        Raises:
            ValueError: if ``word`` does not hold exactly ``EMBEDDING_DIM``
                values (a larger array would overrun the device buffer).
        """
        query = numpy.ascontiguousarray(word, dtype=numpy.float32).ravel()
        if query.size != self.EMBEDDING_DIM:
            raise ValueError(
                "expected a vector with %d components, got %d"
                % (self.EMBEDDING_DIM, query.size))

        norm = float(numpy.linalg.norm(query))
        cuda.memcpy_htod(self.a_gpu, query)
        self.cosine_similarity.prepared_call(
            self.grid_dot, self.block_dot,
            self.rows * self.LANES_PER_ROW,  # total number of working threads
            self.a_gpu, self.distances_gpu, self.c_model_gpu, norm)
        cuda.memcpy_dtoh(self.final_result, self.distances_gpu)
        return self.final_result



We instantiate the class with the GloVe file. This file is available at https://nlp.stanford.edu/projects/glove/; it is the one trained on 840B tokens, with a 2.2M-word vocabulary.

In [2]:
# Build the GPU search engine from the pre-trained GloVe vectors.
glove_path = "glove.840B.300d.txt"
search_engine = GloVeFastDistances(glove_path)

We run 100 iterations of the algorithm defined above and report the mean time and its standard deviation.

In [3]:
# Benchmark the GPU implementation: score the embedding of "?" against the
# whole vocabulary 100 times and record the wall-clock duration of each call.
query_index = search_engine.word_dictionary["?"]
query_vector = search_engine.embeddings[query_index]

time_array = []
for _ in range(100):
    # perf_counter is the clock intended for measuring short durations.
    start_op = time.perf_counter()
    results = search_engine.get_similar_word(query_vector)
    end_op = time.perf_counter()
    time_array.append(end_op - start_op)

# Mean GPU time per call; the CPU comparison cell divides by this value.
time_in_gpu = numpy.mean(time_array)

print("The seconds needed to compute the operation on average : "+str(time_in_gpu))
print("The st dev of the operation on average in seconds : "+str(numpy.std(time_array)))


The seconds needed to compute the operation on average : 0.02777631759643555
The st dev of the operation on average in seconds : 0.00026925900345701787


We run 100 iterations of the scikit-learn implementation of the same computation and report the mean time and its standard deviation.

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

# Benchmark the CPU baseline: the same query ("?") scored against the same
# float32 matrix with scikit-learn, 100 times.
query_index = search_engine.word_dictionary["?"]
vocabulary_matrix = search_engine.embeddings
query_vector = vocabulary_matrix[query_index]

time_array2 = []
for _ in range(100):
    # perf_counter is the clock intended for measuring short durations.
    start_op = time.perf_counter()
    cosine_similarity(vocabulary_matrix, [query_vector])
    end_op = time.perf_counter()
    time_array2.append(end_op - start_op)

# Mean CPU time per call.
time_in_cpu = numpy.mean(time_array2)

print("The seconds needed to compute the operation on average : "+str(time_in_cpu))
print("The st dev of the operation on average in seconds : "+str(numpy.std(time_array2)))

# time_in_gpu is produced by the GPU benchmark cell, which must run first.
print("The speedup from using GPU is of : " +str(time_in_cpu/time_in_gpu))

The seconds needed to compute the operation on average : 2.131491985321045
The st dev of the operation on average in seconds : 0.3029625818037941
The speedup from using GPU is of : 76.7377453084196
