In [21]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


# Pure Python

In [22]:
import numpy as np
import pandas as pd

def target_mean_v1(data, y_name, x_name):
    """Leave-one-out target mean, computed naively with one groupby per row.

    For every row ``i`` the remaining rows are grouped by ``x_name`` and the
    mean of ``y_name`` inside row ``i``'s own category is written to
    ``result[i]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows are addressed as ``data.loc[i, ...]`` for ``i`` in
        ``range(len(data))``, so the index must hold the labels ``0..n-1``
        (the default ``RangeIndex``).
    y_name : str
        Name of the target column to average.
    x_name : str
        Name of the categorical column to group by.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(data)``. An entry is ``NaN`` when the row
        is the only member of its category, because no other rows are left to
        average.
    """
    n_rows = data.shape[0]
    result = np.zeros(n_rows)
    for i in range(n_rows):
        # Drop row i, then average the target per category over what is left.
        others = data[data.index != i]
        # Keep the category labels as the index so the lookup below is by
        # label, and aggregate only the target column.
        group_means = others.groupby(x_name)[y_name].mean()
        key = data.loc[i, x_name]
        # The category vanishes from the groupby when row i was its only row.
        result[i] = group_means.loc[key] if key in group_means.index else np.nan
    return result


def target_mean_v2(data, y_name, x_name):
    """Leave-one-out target mean using two passes and per-category totals.

    Pass one accumulates, for every value of ``x_name``, the sum of ``y_name``
    and the number of rows. Pass two removes each row's own contribution:
    ``(sum - y_i) / (count - 1)``.

    Rows are read with ``data.loc[i, ...]`` for ``i`` in ``range(len(data))``,
    so ``data`` must be indexed by the labels ``0..n-1``.

    Returns a float ``numpy.ndarray`` with one entry per row.
    """
    n_rows = data.shape[0]
    sums = {}
    counts = {}

    # Pass 1: per-category running sum and row count.
    for row in range(n_rows):
        key = data.loc[row, x_name]
        target = data.loc[row, y_name]
        if key in sums:
            sums[key] += target
            counts[key] += 1
        else:
            sums[key] = target
            counts[key] = 1

    # Pass 2: subtract the row's own target from its category total.
    result = np.zeros(n_rows)
    for row in range(n_rows):
        key = data.loc[row, x_name]
        result[row] = (sums[key] - data.loc[row, y_name]) / (counts[key] - 1)
    return result


# Synthetic benchmark input: 5000 rows with a binary target `y` and a
# categorical feature `x` taking the integer values 0..9.
# A private, seeded generator keeps the data identical across
# "Restart & Run All" without touching NumPy's global random state.
data_rng = np.random.RandomState(42)
y = data_rng.randint(2, size=(5000, 1))
x = data_rng.randint(10, size=(5000, 1))
# Both columns keep NumPy's default integer dtype produced by randint.
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])



In [23]:
%%timeit
target_mean_v2(data, 'y','x')

1 loop, best of 3: 273 ms per loop


# Plain Cython

In [24]:
%%cython -a
import numpy as np
import pandas as pd

def target_mean_v3(data, y_name, x_name):
    """Leave-one-out target mean; untyped Python code compiled by Cython.

    Pass one accumulates, for every value of ``x_name``, the sum of ``y_name``
    and the number of rows. Pass two removes each row's own contribution:
    ``(sum - y_i) / (count - 1)``.

    Rows are read with ``data.loc[i, ...]`` for ``i`` in ``range(len(data))``,
    so ``data`` must be indexed by the labels ``0..n-1``.

    Returns a float ``numpy.ndarray`` with one entry per row.
    """
    n_rows = data.shape[0]
    sums = {}
    counts = {}

    # Pass 1: per-category running sum and row count.
    for row in range(n_rows):
        key = data.loc[row, x_name]
        target = data.loc[row, y_name]
        if key in sums:
            sums[key] += target
            counts[key] += 1
        else:
            sums[key] = target
            counts[key] = 1

    # Pass 2: subtract the row's own target from its category total.
    result = np.zeros(n_rows)
    for row in range(n_rows):
        key = data.loc[row, x_name]
        result[row] = (sums[key] - data.loc[row, y_name]) / (counts[key] - 1)
    return result


In [25]:
%%timeit
target_mean_v3(data,'y','x')

1 loop, best of 3: 270 ms per loop


# Python speed-up



In [26]:
import numpy as np
import pandas as pd

def target_mean_v4(data, y_name, x_name):
  """Leave-one-out target mean working on the raw column arrays.

  The two columns are pulled out once via ``.values`` and the loops then
  index those arrays directly instead of going through ``data.loc``.

  Pass one builds the per-category sum of ``y_name`` and the row count;
  pass two computes ``(sum - y_i) / (count - 1)`` for every row.

  Returns a float ``numpy.ndarray`` with one entry per row.
  """
  keys = data[x_name].values
  targets = data[y_name].values
  n_rows = data.shape[0]

  sums = {}
  counts = {}

  # Pass 1: per-category running sum and row count.
  for key, target in zip(keys, targets):
    if key in sums:
      sums[key] += target
      counts[key] += 1
    else:
      sums[key] = target
      counts[key] = 1

  # Pass 2: every slot is overwritten, so an uninitialised buffer is enough.
  result = np.empty(n_rows)
  for pos, (key, target) in enumerate(zip(keys, targets)):
    result[pos] = (sums[key] - target) / (counts[key] - 1)

  return result


In [27]:
%%timeit
target_mean_v4(data, 'y', 'x')

100 loops, best of 3: 8.35 ms per loop


# Cython Speed-up

In [28]:
%%cython -a
import numpy as np
import pandas as pd
cimport numpy as cnp

cpdef target_mean_v5(data, y_name, x_name):
  """Leave-one-out target mean with Cython-typed buffers and dicts.

  The ``x_name`` and ``y_name`` columns are bound to ``cnp.int_t`` ndarray
  buffers, so both columns must already have that integer dtype. Pass one
  builds the per-category sum of the target and the row count in two dicts;
  pass two computes ``(sum - y_i) / (count - 1)`` for every row.

  Returns a ``float64`` ``numpy.ndarray`` with one entry per row.
  """
  cdef:
    int n_rows = data.shape[0]
    cnp.ndarray[cnp.float64_t] out = np.empty(n_rows, dtype=np.float64)
    dict sums = {}
    dict counts = {}
    cnp.ndarray[cnp.int_t] keys = data[x_name].values
    cnp.ndarray[cnp.int_t] targets = data[y_name].values

  # Pass 1: per-category running sum and row count.
  for i in range(n_rows):
    key = keys[i]
    target = targets[i]

    if key in sums:
      sums[key] += target
      counts[key] += 1
    else:
      sums[key] = target
      counts[key] = 1

  # Pass 2: remove each row's own target from its category total.
  for i in range(n_rows):
    key = keys[i]
    target = targets[i]

    out[i] = (sums[key] - target) / (counts[key] - 1)

  return out


In [29]:
%%timeit
target_mean_v5(data, 'y', 'x')

1000 loops, best of 3: 1.03 ms per loop


# Cython with OpenMP

In [30]:
%%cython -a
import numpy as np
import pandas as pd
import cython
cimport numpy as cnp
from cython.parallel import prange

cpdef target_mean_v6(data, str y_name, str x_name):
  """Leave-one-out target mean using array accumulators and ``prange``.

  The category column is used directly as an index into two accumulator
  arrays (sum of the target and row count per category), so its values must
  be non-negative integers. The accumulators are sized ``max(x) + 1``.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame holding both columns.
  y_name : str
      Target column; converted to ``float64``.
  x_name : str
      Category column; must be safely castable to ``int64`` and >= 0.

  Returns
  -------
  numpy.ndarray
      ``float64`` array with one entry per row: the mean of the target over
      the *other* rows of the same category, or ``NaN`` when the row is the
      only member of its category.

  Raises
  ------
  ValueError
      If the category column contains a negative value.
  TypeError
      If the category column cannot be safely cast to ``int64``.

  Notes
  -----
  ``prange`` only runs in parallel when the cell is compiled with OpenMP
  enabled (e.g. ``%%cython --compile-args=-fopenmp --link-args=-fopenmp``);
  otherwise it behaves like a plain ``range`` loop.
  """
  cdef:
    Py_ssize_t data_num = data.shape[0]
    Py_ssize_t i, key, n_keys
    double nan_value = np.nan
    double[:] result = np.empty(data_num, dtype=np.float64)
    double[:] value
    cnp.int64_t[:] count
    const cnp.int64_t[:] x_val_array
    const double[:] y_val_array

  if data_num == 0:
    return np.asarray(result)

  # Fixed-width dtypes keep the buffers independent of the platform's C long;
  # casting='safe' rejects float/object category columns instead of truncating.
  x_np = np.asarray(data[x_name].values).astype(np.int64, casting='safe', copy=False)
  y_np = np.asarray(data[y_name].values, dtype=np.float64)

  if x_np.min() < 0:
    raise ValueError("values of column %r must be non-negative integers" % (x_name,))

  x_val_array = x_np
  y_val_array = y_np

  # Size the accumulators from the data, and zero them: they are updated
  # with += below, so they must not start from uninitialised memory.
  n_keys = int(x_np.max()) + 1
  value = np.zeros(n_keys, dtype=np.float64)
  count = np.zeros(n_keys, dtype=np.int64)

  # Serial on purpose: several rows update the same accumulator slot, which
  # would be a data race inside prange.
  for i in range(data_num):
    key = x_val_array[i]
    value[key] += y_val_array[i]
    count[key] += 1

  # Each iteration writes only result[i], so this loop is safe to parallelise.
  for i in prange(data_num, nogil=True):
    key = x_val_array[i]
    if count[key] > 1:
      result[i] = (value[key] - y_val_array[i]) / (count[key] - 1)
    else:
      # The row is alone in its category: there is nothing to average.
      result[i] = nan_value

  return np.asarray(result)



In [31]:
%%timeit
target_mean_v6(data, 'y', 'x')

The slowest run took 4.66 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 60.4 µs per loop


Disable boundscheck and wraparound

In [32]:
%%cython -a
import numpy as np
import pandas as pd
import cython
cimport numpy as cnp
from cython.parallel import prange

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v7(data, str y_name, str x_name):
  """Leave-one-out target mean with bounds and wraparound checks disabled.

  The category column is used directly as an index into two accumulator
  arrays (sum of the target and row count per category). Because index
  checking is switched off for this function, the category values are
  validated up front (non-negative) and the accumulators are sized
  ``max(x) + 1`` so every access is in range.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame holding both columns.
  y_name : str
      Target column; converted to ``float64``.
  x_name : str
      Category column; must be safely castable to ``int64`` and >= 0.

  Returns
  -------
  numpy.ndarray
      ``float64`` array with one entry per row: the mean of the target over
      the *other* rows of the same category, or ``NaN`` when the row is the
      only member of its category.

  Raises
  ------
  ValueError
      If the category column contains a negative value.
  TypeError
      If the category column cannot be safely cast to ``int64``.

  Notes
  -----
  ``prange`` only runs in parallel when the cell is compiled with OpenMP
  enabled (e.g. ``%%cython --compile-args=-fopenmp --link-args=-fopenmp``);
  otherwise it behaves like a plain ``range`` loop.
  """
  cdef:
    Py_ssize_t data_num = data.shape[0]
    Py_ssize_t i, key, n_keys
    double nan_value = np.nan
    double[:] result = np.empty(data_num, dtype=np.float64)
    double[:] value
    cnp.int64_t[:] count
    const cnp.int64_t[:] x_val_array
    const double[:] y_val_array

  if data_num == 0:
    return np.asarray(result)

  # Fixed-width dtypes keep the buffers independent of the platform's C long;
  # casting='safe' rejects float/object category columns instead of truncating.
  x_np = np.asarray(data[x_name].values).astype(np.int64, casting='safe', copy=False)
  y_np = np.asarray(data[y_name].values, dtype=np.float64)

  # With boundscheck/wraparound off a negative or oversized index would
  # touch arbitrary memory, so reject negatives and size from the maximum.
  if x_np.min() < 0:
    raise ValueError("values of column %r must be non-negative integers" % (x_name,))

  x_val_array = x_np
  y_val_array = y_np

  # Zero the accumulators: they are updated with += below, so they must not
  # start from uninitialised memory.
  n_keys = int(x_np.max()) + 1
  value = np.zeros(n_keys, dtype=np.float64)
  count = np.zeros(n_keys, dtype=np.int64)

  # Serial on purpose: several rows update the same accumulator slot, which
  # would be a data race inside prange.
  for i in range(data_num):
    key = x_val_array[i]
    value[key] += y_val_array[i]
    count[key] += 1

  # Each iteration writes only result[i], so this loop is safe to parallelise.
  for i in prange(data_num, nogil=True):
    key = x_val_array[i]
    if count[key] > 1:
      result[i] = (value[key] - y_val_array[i]) / (count[key] - 1)
    else:
      # The row is alone in its category: there is nothing to average.
      result[i] = nan_value

  return np.asarray(result)



In [33]:
%%timeit
target_mean_v7(data, 'y', 'x')

The slowest run took 30.26 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 37.3 µs per loop


Reference:

https://mp.weixin.qq.com/s/-Vr0u5dH4cCAOMRf_Dss_Q

https://cython.readthedocs.io/en/latest/src/userguide/language_basics.html

https://pandas.pydata.org/pandas-docs/stable/user_guide/enhancingperf.html

https://www.infoworld.com/article/3252209/cython-tutorial-how-to-speed-up-python.html?nsdr=true

https://www.infoworld.com/article/3329750/how-to-profile-python-code.html