In [394]:
import math

import pandas as pd
import numpy as np
from numba import cuda

df = pd.read_csv('data/CC GENERAL.csv')

print(df.head())
print("======================")
print(df.isnull().sum())
print("======================")

df['MINIMUM_PAYMENTS'] = df['MINIMUM_PAYMENTS'].fillna(df['MINIMUM_PAYMENTS'].median())
df['CREDIT_LIMIT'] = df['CREDIT_LIMIT'].fillna(df['CREDIT_LIMIT'].mean())
print(df.isnull().sum())
print("======================")

df = df.drop('CUST_ID', axis=1)
np_data = df.to_numpy()
print(np_data.shape)
print("======================")

  CUST_ID      BALANCE  BALANCE_FREQUENCY  PURCHASES  ONEOFF_PURCHASES  \
0  C10001    40.900749           0.818182      95.40              0.00   
1  C10002  3202.467416           0.909091       0.00              0.00   
2  C10003  2495.148862           1.000000     773.17            773.17   
3  C10004  1666.670542           0.636364    1499.00           1499.00   
4  C10005   817.714335           1.000000      16.00             16.00   

   INSTALLMENTS_PURCHASES  CASH_ADVANCE  PURCHASES_FREQUENCY  \
0                    95.4      0.000000             0.166667   
1                     0.0   6442.945483             0.000000   
2                     0.0      0.000000             1.000000   
3                     0.0    205.788017             0.083333   
4                     0.0      0.000000             0.083333   

   ONEOFF_PURCHASES_FREQUENCY  PURCHASES_INSTALLMENTS_FREQUENCY  \
0                    0.000000                          0.083333   
1                    0.000000       

In [395]:
k = 7
centroid = np_data[np.random.randint(np_data.shape[0], size=k), :]

In [396]:
def next_power_of_2(x):
    return 1 << (x - 1).bit_length()

In [397]:
def calc_dimension_for_distance(data, data_centroid):
    real_dim_x = data_centroid.shape[0]
    dim_x = next_power_of_2(real_dim_x)
    dim_y = dim_x
    thread_per_blocks = (dim_x, dim_y)
    blocks_per_grid_x = 1
    blocks_per_grid_y = math.ceil(data.shape[0] / thread_per_blocks[0])
    blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
    return thread_per_blocks, blocks_per_grid

In [398]:
dist_tpb, dist_bpg = calc_dimension_for_distance(np_data, centroid)

In [399]:
dist_tpb

(8, 8)

In [400]:
dist_bpg

(1, 1119)

In [401]:
@cuda.jit
def calc_distance_kernel(data, data_centroid, result):
    r = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
    c = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    if r < data.shape[0] and c < data_centroid.shape[0]:
        # TODO(silentcat) : complete euclidian dist calculation of shape (data.shape[0], k)
        result[r, c] =  1

In [402]:
def calc_distance(data, data_centroid):
    result = np.zeros((data.shape[0], data_centroid.shape[0]))
    data_device = cuda.to_device(data)
    centroid_device = cuda.to_device(centroid)
    result_device = cuda.to_device(result)

    # invoke kernel
    calc_distance_kernel[dist_bpg, dist_tpb](data_device, centroid_device, result_device)

    result = result_device.copy_to_host()
    return result


In [403]:
calculated_dist =  calc_distance(np_data, centroid)

In [404]:
calculated_dist

array([[17., 17., 17., ..., 17., 17., 17.],
       [17., 17., 17., ..., 17., 17., 17.],
       [17., 17., 17., ..., 17., 17., 17.],
       ...,
       [17., 17., 17., ..., 17., 17., 17.],
       [17., 17., 17., ..., 17., 17., 17.],
       [17., 17., 17., ..., 17., 17., 17.]])

In [405]:
calculated_dist.shape

(8950, 7)

In [406]:
max(calculated_dist[:,0])

17.0