In [None]:
import numpy as np
import pandas as pd

In [None]:
def target_mean_v1(data, y_name, x_name):
    """Leave-one-out target mean encoding (deliberately naive baseline).

    For every row, compute the mean of ``y_name`` over all *other* rows that
    share the row's ``x_name`` value. One groupby is run per row, so the cost
    grows quadratically with the number of rows; this version exists as the
    slow reference the faster implementations are compared against.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both columns. Any index is accepted; rows are addressed
        by position.
    y_name : hashable
        Label of the target column.
    x_name : hashable
        Label of the categorical column to encode.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(data),)``. Entry ``i`` is NaN when no
        other row shares row ``i``'s category.
    """
    n_rows = data.shape[0]
    result = np.zeros(n_rows)
    positions = np.arange(n_rows)
    categories = data[x_name].to_numpy()
    for i in range(n_rows):
        # Positional mask: drop row i without assuming a 0..n-1 index.
        others = data.iloc[positions != i]
        # Aggregate only the target column; the resulting Series is indexed
        # by category value, so the lookup below is by label, not position.
        group_means = others.groupby(x_name)[y_name].mean()
        # .get() returns NaN when row i was the only member of its category.
        result[i] = group_means.get(categories[i], np.nan)
    return result

In [None]:
def target_mean_v2(data, y_name, x_name):
    """Leave-one-out target mean via per-category running totals.

    A first pass accumulates, for each distinct ``x_name`` value, the sum of
    ``y_name`` and the number of rows. A second pass removes each row's own
    contribution: ``(total - y_i) / (count - 1)``.

    Rows are read with ``data.loc[i, ...]`` for ``i`` in ``range(len(data))``,
    so the frame must carry the labels ``0 .. n-1``. A category that occurs
    only once produces a zero denominator.

    Returns a float ``numpy.ndarray`` with one entry per row.
    """
    n_rows = len(data)
    totals = {}
    occurrences = {}
    # Pass 1: per-category sum of the target and row count.
    for row in range(n_rows):
        if data.loc[row, x_name] in totals:
            totals[data.loc[row, x_name]] += data.loc[row, y_name]
            occurrences[data.loc[row, x_name]] += 1
        else:
            totals[data.loc[row, x_name]] = data.loc[row, y_name]
            occurrences[data.loc[row, x_name]] = 1
    # Pass 2: subtract the row's own target before averaging.
    encoded = np.zeros(n_rows)
    for row in range(n_rows):
        numerator = totals[data.loc[row, x_name]] - data.loc[row, y_name]
        denominator = occurrences[data.loc[row, x_name]] - 1
        encoded[row] = numerator / denominator
    return encoded

In [None]:
# Synthetic benchmark input: 5000 rows with a binary target `y` and a
# 10-level integer category `x`. The global RNG is seeded first so that
# Restart & Run All regenerates exactly the same data.
np.random.seed(42)
y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
# Column order matters downstream: column 0 is the target, column 1 the category.
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

In [None]:
%%timeit
# Benchmark the naive baseline (one groupby per row).
target_mean_v1(data, 'y', 'x')

In [None]:
%%timeit
# Benchmark the two-pass dictionary version with repeated .loc lookups.
target_mean_v2(data, 'y', 'x')

In [None]:
def target_mean_v3(data, y_name, x_name):
    """Leave-one-out target mean with each ``.loc`` lookup cached in a local.

    Pass one builds, per distinct ``x_name`` value, the sum of ``y_name`` and
    the row count. Pass two computes ``(sum - y_i) / (count - 1)`` per row.

    Rows are addressed as ``data.loc[i, ...]`` for ``i`` in ``range(n)``, so
    the frame must be labelled ``0 .. n-1``. A category seen only once yields
    a zero denominator.

    Returns a float ``numpy.ndarray`` with one entry per row.
    """
    n_rows = data.shape[0]
    sums = {}
    counts = {}
    for row in range(n_rows):
        # Read each cell once and reuse the local below.
        key = data.loc[row, x_name]
        target = data.loc[row, y_name]
        if key in sums:
            sums[key] += target
            counts[key] += 1
        else:
            sums[key] = target
            counts[key] = 1
    encoded = np.zeros(n_rows)
    for row in range(n_rows):
        key = data.loc[row, x_name]
        # Remove the row's own target from its category total.
        others_sum = sums[key] - data.loc[row, y_name]
        others_count = counts[key] - 1
        encoded[row] = others_sum / others_count
    return encoded

In [None]:
%%timeit
# Benchmark the dictionary version that caches each .loc lookup in a local.
target_mean_v3(data, 'y', 'x')

In [None]:
# Register the %%cython cell magic used by the compiled versions in later cells.
%load_ext Cython 

In [None]:
%%cython -a
import cython

import numpy as np
cimport numpy as cnp


cpdef cnp.ndarray[double] target_mean_v5(cnp.ndarray[long, ndim=2] data):
    """Leave-one-out target mean using Python dicts inside Cython.

    Parameters
    ----------
    data : 2-D ndarray of C ``long``
        Column 0 is the target, column 1 the category value.

    Returns
    -------
    1-D ndarray of double
        For each row, the mean target of the other rows in the same
        category; NaN when the row is the only member of its category.
    """
    cdef int batch = data.shape[0]
    cdef cnp.ndarray[double] result = np.zeros(batch)
    cdef dict value_dict = {}
    cdef dict count_dict = {}
    cdef long x_value
    # Previously declared as `y_valuez`, which left `y_value` an untyped
    # Python object; the declaration now matches the name actually used.
    cdef long y_value
    cdef long n_others
    cdef unsigned int i
    # Pass 1: per-category sum of the target and row count.
    for i in range(batch):
        x_value = data[i, 1]
        y_value = data[i, 0]
        # Direct dict membership; no need to build a keys() view per row.
        if x_value in value_dict:
            value_dict[x_value] += y_value
            count_dict[x_value] += 1
        else:
            value_dict[x_value] = y_value
            count_dict[x_value] = 1
    # Pass 2: remove each row's own contribution before averaging.
    for i in range(batch):
        x_value = data[i, 1]
        n_others = count_dict[x_value] - 1
        if n_others > 0:
            # The numerator is a Python object, so this is Python-level
            # (true) division exactly as before.
            result[i] = (value_dict[x_value] - data[i, 0]) / n_others
        else:
            # Singleton category: no other rows to average over.
            result[i] = np.nan
    return result

In [None]:
# Reuse the `y` and `x` arrays generated in the earlier data cell so the
# array-based functions see the same values as the DataFrame `data`.
# Layout: column 0 is y (target), column 1 is x (category).
data_ = np.concatenate([y, x], axis=1)

In [None]:
# Sanity check: compare the dict-based Cython version (result_2) against the
# naive pandas baseline (result_1). A norm of ~0 means the outputs agree.
result_1 = target_mean_v1(data, 'y', 'x')
result_2 = target_mean_v5(data_)

diff = np.linalg.norm(result_1 - result_2)
print(diff)

In [None]:
%%timeit

# Benchmark the dict-based Cython version on the plain ndarray input.
target_mean_v5(data_)

In [None]:
%%cython -a
import cython
import numpy as np
cimport numpy as cnp


cpdef cnp.ndarray[double] target_mean_v6(cnp.ndarray[long, ndim=2] data):
    """Leave-one-out target mean using flat integer accumulator arrays.

    Parameters
    ----------
    data : 2-D ndarray of C ``long``
        Column 0 is the integer target, column 1 the integer category code.
        Both columns are narrowed to C ``int`` internally.

    Returns
    -------
    1-D ndarray of double
        For each row, the mean target of the other rows in the same
        category; NaN when the row is the only member of its category.

    Notes
    -----
    The accumulators hold one slot per value in ``[min(x), max(x)]``, so
    memory use follows the spread of the category codes, not the row count.
    """
    cdef int batch = data.shape[0]
    cdef cnp.ndarray[double] result = np.zeros(batch)
    cdef int[:] x
    cdef int[:] y
    cdef int[:] value_dict
    cdef int[:] count_dict
    cdef int n_others
    cdef double nan_value = np.nan
    cdef unsigned int i
    if batch == 0:
        # min()/max() below are undefined for an empty column.
        return result
    x_column = data[:, 1]
    x_min = x_column.min()
    # Size the accumulators by the category range rather than by the row
    # count, and shift codes to start at 0, so codes >= batch or negative
    # codes can no longer index out of bounds or wrap around.
    n_categories = int(x_column.max() - x_min) + 1
    x = (x_column - x_min).astype(np.intc)
    y = data[:, 0].astype(np.intc)
    value_dict = np.zeros(n_categories, dtype=np.intc)
    count_dict = np.zeros(n_categories, dtype=np.intc)
    # Pass 1: per-category sum of the target and row count.
    for i in range(batch):
        value_dict[x[i]] += y[i]
        count_dict[x[i]] += 1
    # Pass 2: remove each row's own contribution before averaging.
    for i in range(batch):
        n_others = count_dict[x[i]] - 1
        if n_others > 0:
            # Explicit cast: floating-point division regardless of the
            # language_level the cell is compiled with.
            result[i] = <double>(value_dict[x[i]] - y[i]) / n_others
        else:
            # Singleton category: no other rows to average over.
            result[i] = nan_value
    return result

In [None]:
%%timeit

# Benchmark the array-accumulator Cython version on the plain ndarray input.
target_mean_v6(data_)

In [None]:
# Sanity check: compare the array-accumulator Cython version against the
# naive pandas baseline. A norm of ~0 means the outputs agree.
# NOTE(review): result_1 is also computed in an earlier comparison cell;
# recomputing the slow baseline here is redundant if that cell has been run.
result_1 = target_mean_v1(data, 'y', 'x')
result_6 = target_mean_v6(data_)

diff = np.linalg.norm(result_1 - result_6)
print(diff)

In [None]:
%%cython -a
import cython
import numpy as np
cimport numpy as cnp
from cython.parallel import prange
import cython

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
cpdef cnp.ndarray[double] target_mean_v7(cnp.ndarray[long, ndim=2] data):
    """Leave-one-out target mean with a parallel per-row division pass.

    Parameters
    ----------
    data : 2-D ndarray of C ``long``
        Column 0 is the integer target, column 1 the integer category code.
        Both columns are narrowed to C ``int`` internally.

    Returns
    -------
    1-D ndarray of double
        For each row, the mean target of the other rows in the same
        category; NaN when the row is the only member of its category.

    Notes
    -----
    Bounds checking and wraparound are disabled, so every index must be
    valid by construction: category codes are shifted to ``0 .. range`` and
    the accumulators are sized to that range. C division is enabled because
    the denominator is checked to be positive before dividing.
    """
    cdef int batch = data.shape[0]
    cdef cnp.ndarray[double] result = np.zeros(batch)
    cdef int[:] x
    cdef int[:] y
    cdef int[:] value_dict
    cdef int[:] count_dict
    cdef int n_others
    cdef double nan_value = np.nan
    cdef int i
    if batch == 0:
        # min()/max() below are undefined for an empty column.
        return result
    x_column = data[:, 1]
    x_min = x_column.min()
    # With boundscheck off, an accumulator sized by row count would be
    # written out of bounds for any code >= batch; size by category range.
    n_categories = int(x_column.max() - x_min) + 1
    x = (x_column - x_min).astype(np.intc)
    y = data[:, 0].astype(np.intc)
    value_dict = np.zeros(n_categories, dtype=np.intc)
    count_dict = np.zeros(n_categories, dtype=np.intc)
    # Pass 1 stays serial: many rows update the same accumulator slot, and
    # `+=` on a shared array element from several prange threads is a data
    # race (only scalar in-place updates are treated as reductions).
    for i in range(batch):
        value_dict[x[i]] += y[i]
        count_dict[x[i]] += 1
    # Pass 2 is independent per row (each iteration writes only result[i]),
    # so it is safe to parallelise.
    for i in prange(batch, nogil=True):
        n_others = count_dict[x[i]] - 1
        if n_others > 0:
            result[i] = <double>(value_dict[x[i]] - y[i]) / n_others
        else:
            # Singleton category: no other rows to average over.
            result[i] = nan_value
    return result

In [None]:
%%timeit
# Benchmark the prange-based Cython version on the plain ndarray input.
target_mean_v7(data_)

In [None]:
# Sanity check: compare the prange-based Cython version against the baseline
# `result_1` computed in an earlier cell. A norm of ~0 means the outputs agree.
result_7 = target_mean_v7(data_)
diff = np.linalg.norm(result_1 - result_7)
print(diff)