In [None]:
import numpy as np
import pandas as pd

In [None]:
def target_mean_v1(data, y_name, x_name):
    """Leave-one-out target mean, brute-force reference implementation.

    For every row, that row is removed, the remaining rows are grouped by
    ``x_name`` and the mean of ``y_name`` for the removed row's category is
    recorded. One groupby is executed per row; this is the slow baseline the
    other variants are compared against.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both columns. Rows are addressed by position, so the
        index labels do not matter.
    y_name : str
        Column whose values are averaged.
    x_name : str
        Column holding the category used for grouping.

    Returns
    -------
    numpy.ndarray
        Float array with one entry per row, in row order. An entry is NaN when
        no other row shares that row's category.
    """
    n_rows = data.shape[0]
    result = np.zeros(n_rows)
    categories = data[x_name].values
    positions = np.arange(n_rows)
    for i in range(n_rows):
        # Exclude row i by position so the result does not depend on index labels.
        others = data.iloc[positions != i]
        # Aggregate only the target column; other columns may be non-numeric.
        group_means = others.groupby(x_name)[y_name].mean()
        # The category is missing from the remaining rows when row i was its only member.
        result[i] = group_means.get(categories[i], np.nan)
    return result

In [None]:
def target_mean_v2(data, y_name, x_name):
    """Leave-one-out target mean using per-category running totals.

    A first pass accumulates, for every category in ``x_name``, the sum of
    ``y_name`` and the number of rows. A second pass subtracts each row's own
    target from its category total and divides by the remaining row count.
    Values are read from the DataFrame one element at a time.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both columns. Rows are addressed by position, so the
        index labels do not matter.
    y_name : str
        Column whose values are averaged.
    x_name : str
        Column holding the category.

    Returns
    -------
    numpy.ndarray
        Float array with one entry per row, in row order. An entry is NaN when
        the row is the only member of its category.
    """
    sample_num = data.shape[0]
    result = np.zeros(sample_num)
    # Resolve the column positions once so rows can be read positionally.
    x_col = data.columns.get_loc(x_name)
    y_col = data.columns.get_loc(y_name)
    value_dict = {}
    count_dict = {}
    for i in range(sample_num):
        key = data.iloc[i, x_col]
        if key in value_dict:
            value_dict[key] += data.iloc[i, y_col]
            count_dict[key] += 1
        else:
            value_dict[key] = data.iloc[i, y_col]
            count_dict[key] = 1
    for i in range(sample_num):
        key = data.iloc[i, x_col]
        others = count_dict[key] - 1
        if others:
            result[i] = (value_dict[key] - data.iloc[i, y_col]) / others
        else:
            # No other row in this category: the leave-one-out mean is undefined.
            result[i] = np.nan
    return result

In [None]:
def target_mean_v3(data, y_name, x_name):
    """Leave-one-out target mean over NumPy arrays extracted from the frame.

    The two columns are copied into flat arrays once; per-category sums and
    counts are accumulated in dictionaries, and each row's result is its
    category total minus its own target, divided by the remaining row count.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both columns.
    y_name : str
        Column whose values are averaged.
    x_name : str
        Column holding the category.

    Returns
    -------
    numpy.ndarray
        Float array with one entry per row, in row order. An entry is NaN when
        the row is the only member of its category.
    """
    sample_num = data.shape[0]
    x = data.loc[:, x_name].values.flatten()
    y = data.loc[:, y_name].values.flatten()
    result = np.zeros(sample_num)
    value_dict = {}
    count_dict = {}
    for key, target in zip(x, y):
        if key in value_dict:
            value_dict[key] += target
            count_dict[key] += 1
        else:
            value_dict[key] = target
            count_dict[key] = 1
    for i in range(sample_num):
        key = x[i]
        others = count_dict[key] - 1
        if others:
            result[i] = (value_dict[key] - y[i]) / others
        else:
            # No other row in this category: the leave-one-out mean is undefined.
            result[i] = np.nan
    return result

In [None]:
def target_mean_v4(data, y_name, x_name):
    """Leave-one-out target mean with a vectorised final step.

    Per-category sums and counts are accumulated in dictionaries from flat
    copies of the two columns. The totals and counts are then broadcast back
    to one value per row and the leave-one-out mean is computed as a single
    array expression.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both columns.
    y_name : str
        Column whose values are averaged.
    x_name : str
        Column holding the category.

    Returns
    -------
    numpy.ndarray
        Array with one entry per row, in row order. An entry is NaN when the
        row is the only member of its category.
    """
    x = data.loc[:, x_name].values.flatten()
    y = data.loc[:, y_name].values.flatten()
    value_dict = {}
    count_dict = {}
    for key, target in zip(x, y):
        if key in value_dict:
            value_dict[key] += target
            count_dict[key] += 1
        else:
            value_dict[key] = target
            count_dict[key] = 1

    value_li = np.array([value_dict[key] for key in x])
    count_li = np.array([count_dict[key] for key in x])
    others = count_li - 1
    has_others = others > 0
    # Divide by 1 where no other row exists to avoid 0/0, then mask those rows to NaN.
    ratios = (value_li - y) / np.where(has_others, others, 1)
    return np.where(has_others, ratios, np.nan)

In [None]:
%load_ext Cython

In [None]:
%%cython -a
import numpy as np
cimport numpy as cnp

cpdef target_mean_v5(cnp.ndarray[double] x, cnp.ndarray[double] y,int sample_num):
    """Leave-one-out target mean over double-typed arrays.

    Per-category sums and counts are accumulated in dictionaries keyed by the
    values of ``x``; each output is the category total minus the element's own
    ``y`` value, divided by the number of other elements in that category.

    Parameters
    ----------
    x : ndarray of double
        Category value per element.
    y : ndarray of double
        Target value per element.
    sample_num : int
        Number of leading elements of ``x`` and ``y`` to process; also the
        length of the returned array.

    Returns
    -------
    ndarray
        Newly allocated array of length ``sample_num``. An entry is NaN when
        the element is the only member of its category.
    """
    cdef cnp.ndarray result = np.zeros(sample_num)
    cdef dict value_dict = {}
    cdef dict count_dict = {}
    # Typed index so both loops over range(sample_num) use a C integer counter.
    cdef Py_ssize_t i

    for i in range(sample_num):
        if x[i] in value_dict:
            value_dict[x[i]] += y[i]
            count_dict[x[i]] += 1
        else:
            value_dict[x[i]] = y[i]
            count_dict[x[i]] = 1

    for i in range(sample_num):
        others = count_dict[x[i]] - 1
        if others:
            result[i] = (value_dict[x[i]] - y[i]) / others
        else:
            # No other element in this category: the leave-one-out mean is undefined.
            result[i] = float('nan')

    return result

In [None]:
%%cython -a
cimport numpy as cnp

cpdef target_mean_v6(cnp.ndarray[long] x, cnp.ndarray[long] y,cnp.ndarray[double] result,int sample_num):
    """Leave-one-out target mean over integer arrays, written into ``result``.

    Per-category sums and counts are accumulated in dictionaries keyed by the
    values of ``x``; each output is the category total minus the element's own
    ``y`` value, divided by the number of other elements in that category.

    Parameters
    ----------
    x : ndarray of long
        Category value per element.
    y : ndarray of long
        Target value per element.
    result : ndarray of double
        Caller-allocated output buffer; its first ``sample_num`` entries are
        overwritten in place.
    sample_num : int
        Number of leading elements of ``x`` and ``y`` to process.

    Returns
    -------
    ndarray
        The same ``result`` array that was passed in. An entry is NaN when the
        element is the only member of its category.

    Notes
    -----
    NOTE(review): the width of C ``long`` is platform-dependent; confirm that
    callers pass arrays whose NumPy dtype matches it on the target platform.
    """
    cdef dict value_dict = {}
    cdef dict count_dict = {}
    # Typed index so both loops over range(sample_num) use a C integer counter.
    cdef Py_ssize_t i

    for i in range(sample_num):
        if x[i] in value_dict:
            value_dict[x[i]] += y[i]
            count_dict[x[i]] += 1
        else:
            value_dict[x[i]] = y[i]
            count_dict[x[i]] = 1

    for i in range(sample_num):
        # Kept as a Python object so the division below stays a Python-level division.
        others = count_dict[x[i]] - 1
        if others:
            result[i] = (value_dict[x[i]] - y[i]) / others
        else:
            # No other element in this category: the leave-one-out mean is undefined.
            result[i] = float('nan')

    return result

In [None]:
# Synthetic benchmark data: a binary target `y` and a feature `x` with ten integer levels.
# The legacy global seed is fixed so a restarted kernel regenerates the same frame;
# np.random.randint is kept so the integer dtype of the columns is unchanged.
np.random.seed(0)
y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

In [None]:
# The four DataFrame-based variants share one call signature; v1 is the reference.
result1, result2, result3, result4 = (
    variant(data, 'y', 'x')
    for variant in (target_mean_v1, target_mean_v2, target_mean_v3, target_mean_v4)
)

# v5 takes double-typed arrays plus an explicit element count.
sample_num = len(data)
x = data['x'].values.flatten().astype('double')
y = data['y'].values.flatten().astype('double')
result5 = target_mean_v5(x, y, sample_num)

# v6 takes the integer arrays as-is and fills a caller-provided output buffer.
x = data['x'].values.flatten()
y = data['y'].values.flatten()
result = np.zeros(sample_num)
result6 = target_mean_v6(x, y, result, sample_num)

# Euclidean distance of every variant from the reference result.
diff_12, diff_13, diff_14, diff_15, diff_16 = (
    np.linalg.norm(result1 - other)
    for other in (result2, result3, result4, result5, result6)
)

print('diff_12',diff_12, 'diff_13',diff_13,'diff_14',diff_14,'diff_15',diff_15,'diff_16',diff_16)

diff_12 0.0 diff_13 0.0 diff_14 0.0 diff_15 0.0 diff_16 0.0


In [None]:
%%timeit
# Baseline timing: the variant that runs a fresh groupby for every row.
target_mean_v1(data, 'y', 'x')

1 loop, best of 3: 18.7 s per loop


In [None]:
%%timeit
# Running-total variant that reads each value from the DataFrame inside Python loops.
target_mean_v2(data, 'y', 'x')

1 loop, best of 3: 219 ms per loop


In [None]:
%%timeit
# Same running-total algorithm, but looping over NumPy arrays extracted from the frame once.
target_mean_v3(data, 'y', 'x')

100 loops, best of 3: 8.61 ms per loop


In [None]:
%%timeit
# Dictionary accumulation as in v3, with the final division done as one array expression.
target_mean_v4(data, 'y', 'x')

100 loops, best of 3: 6.56 ms per loop


In [None]:
%%timeit
# Cython variant on double-typed arrays. The column extraction and the cast to
# double below are part of the timed body, not just the function call.
sample_num = data.shape[0]
x = data.loc[:,'x'].values.flatten().astype('double')
y = data.loc[:,'y'].values.flatten().astype('double')
target_mean_v5(x,y,sample_num)

100 loops, best of 3: 1.96 ms per loop


In [None]:
%%timeit
# Cython variant with integer-typed inputs and a caller-allocated output buffer.
# The column extraction and the np.zeros allocation are part of the timed body.
sample_num = data.shape[0]
x = data.loc[:,'x'].values.flatten()
y = data.loc[:,'y'].values.flatten()
result = np.zeros(sample_num)
target_mean_v6(x,y,result,sample_num)

1000 loops, best of 3: 1 ms per loop
