In [1]:
%load_ext Cython

In [116]:
import numpy as np
import pandas as pd
y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

## 超级慢的内个v1版本

In [3]:
def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

%timeit result_1 = target_mean_v1(data, 'y', 'x')

17.2 s ± 57.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## 大牛上课敲的那个


In [96]:
def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1
    for i in range(data.shape[0]):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

%timeit result_2 = target_mean_v2(data, 'y', 'x')

303 ms ± 17.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## 砍掉if-else

In [6]:
def target_mean_v3(data, y_name, x_name):
    n = data.shape[0]
    result = np.zeros(n)
    value = np.zeros(10)
    count = np.zeros(10)
    for i in range(n):
        value[data.loc[i, x_name]] += data.loc[i, y_name]
        count[data.loc[i, x_name]] += 1
    for i in range(n):
        result[i] = (value[data.loc[i, x_name]] - data.loc[i, y_name]) / (count[data.loc[i, x_name]] - 1)
    return result

%timeit result_3 = target_mean_v3(data, 'y', 'x')

257 ms ± 12.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
%%cython

cimport numpy as cnp
import numpy as np

def target_mean_v4(data, y_name, x_name):
    cdef int n = data.shape[0]
    cdef cnp.ndarray result = np.zeros(n)
    value_dict = dict()
    count_dict = dict()
    for i in range(n):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1
    for i in range(n):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

In [12]:
%%timeit 
result_4 = target_mean_v4(data, 'y', 'x')

304 ms ± 22.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## 然并卵的一堆

In [74]:
%%cython

import numpy as np
cimport numpy as cnp

def target_mean_v5(data, y_name, x_name):
    cdef int n = data.shape[0]
    cdef cnp.ndarray result = np.zeros(n)
    cdef cnp.ndarray value = np.zeros(10)
    cdef cnp.ndarray count = np.zeros(10)
    
    for i in range(n):
        value[data.loc[i, x_name]] += data.loc[i, y_name]
        count[data.loc[i, x_name]] += 1
    for i in range(n):
        result[i] = (value[data.loc[i, x_name]] - data.loc[i, y_name]) / (count[data.loc[i, x_name]] - 1)
    return result

In [75]:
%%timeit 
result_5 = target_mean_v5(data, 'y', 'x')

266 ms ± 7.41 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## 花里胡哨但然并卵的一堆

In [93]:
%%cython

import numpy as np
cimport numpy as cnp
# from libcpp.vector cimport vector

# cdef extern from "ind_cols.h":
#     vector[long] get_ind_cols(double*, const long, const long)

cpdef target_mean_v6(data, y_name, x_name):
    
    cdef int n = data.shape[0]
    cdef cnp.ndarray result = np.zeros(n)
#     cdef cnp.ndarray[double, ndim=2, mode='fortran'] aa = np.asfortranarray(data, dtype=np.float64)
#     cdef int nn = np.unique(aa).shape[0]
#     cdef cnp.ndarray value = np.zeros(nn)
#     cdef cnp.ndarray count = np.zeros(nn)
#     cdef cnp.ndarray[double, ndim=1, mode='fortran'] value = np.asfortranarray(np.zeros(nn), dtype=np.float64)
#     cdef cnp.ndarray[double, ndim=1, mode='fortran'] count = np.asfortranarray(np.zeros(nn), dtype=np.float64)
#     ind_col_vec = list(get_ind_cols(&arg[0, 0], matrix.shape[0], matrix.shape[1]))

    groupby_result = data.groupby([x_name], as_index=False).agg(['sum', 'count'])

#     for i in range(n):
#         value[data.loc[i, x_name]] += data.loc[i, y_name]
#         count[data.loc[i, x_name]] += 1
    for i in range(n):
        result[i] = (groupby_result.loc[data.loc[i, y_name], (y_name, 'sum')] - data.loc[i, y_name]) / (groupby_result.loc[data.loc[i, y_name], (y_name, 'count')] - 1)
    return result

In [94]:
%%timeit 
result_6 = target_mean_v6(data, 'y', 'x')

2.4 s ± 18.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## 用数组直接累加

In [108]:
%%cython

import numpy as np
cimport numpy as cnp

def target_mean_v7(data, y_name, x_name):
    cdef int n = data.shape[0]
    cdef cnp.ndarray result = np.zeros(n)
    
    cdef cnp.ndarray xx = data[x_name].values
    cdef cnp.ndarray yy = data[y_name].values
    
    cdef int nn = np.unique(xx).shape[0]
    cdef cnp.ndarray value = np.zeros(nn)
    cdef cnp.ndarray count = np.zeros(nn)
    
    for i in range(n):
        ii = xx[i]
        value[ii] += yy[i]
        count[ii] += 1
        
    for i in range(n):
        ii = xx[i]
        result[i] = (value[ii] - yy[i]) / (count[ii] - 1)
    
    return result

In [109]:
%%timeit
result_7 = target_mean_v7(data, 'y', 'x')

11.2 ms ± 337 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## 用字典累加

In [118]:
%%cython

import numpy as np
cimport numpy as cnp

def target_mean_v8(data, y_name, x_name):
    cdef int n = data.shape[0]
    cdef cnp.ndarray result = np.zeros(n)
    
    cdef cnp.ndarray xx = data[x_name].values
    cdef cnp.ndarray yy = data[y_name].values
    
    cdef int nn = np.unique(xx).shape[0]
    
    value_dict = dict()
    count_dict = dict()
    
    for i in range(data.shape[0]):
        index = xx[i]
        if index not in value_dict.keys():
            value_dict[index] = yy[i]
            count_dict[index] = 0
        else:
            value_dict[index] += yy[i]
            count_dict[index] += 1
            
    for i in range(n):
        index = xx[i]
        result[i] = (value_dict[index] - yy[i]) / (count_dict[index] - 1)
    
    return result

In [119]:
%%timeit
result_8 = target_mean_v8(data, 'y', 'x')

8.28 ms ± 227 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## 胖大海的python代码

In [120]:
def target_mean_v9(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    xs = data[x_name].values
    ys = data[y_name].values
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        index = xs[i]
        if index not in value_dict.keys():
            value_dict[index] = ys[i]
            count_dict[index] = 0
        else:
            value_dict[index] += ys[i]
            count_dict[index] += 1

    for i in range(data.shape[0]):
        index = xs[i]
        result[i] = (value_dict[index] - ys[i]) / count_dict[index]
    return result

In [122]:
%%timeit
result_v9 = target_mean_v9(data,'y','x')

10.2 ms ± 222 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [102]:
# result_3 = target_mean_v3(data, 'y', 'x')
# result_4 = target_mean_v4(data, 'y', 'x')
# result_5 = target_mean_v5(data, 'y', 'x')
# result_6 = target_mean_v6(data, 'y', 'x')
result_7 = target_mean_v7(data, 'y', 'x')
diff = np.linalg.norm(result_3 - result_7)
diff

TypeError: Cannot convert Series to numpy.ndarray