# 作业
把提供的 target encoding 代码改为 cython 代码并比较速度区别（如可以实现并行可加分）

In [72]:
import time
import functools

def log_execution_time(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        res = func(*args, **kwargs)
        end = time.perf_counter()
        print('{} took {} ms'.format(func.__name__, (end - start) * 1000))
        return res
    return wrapper

# Baseline

In [73]:
# coding = 'utf-8'
import numpy as np
import pandas as pd

y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

data

Unnamed: 0,y,x
0,1,1
1,1,0
2,1,1
3,0,0
4,1,9
...,...,...
4995,1,1
4996,0,9
4997,1,9
4998,0,8


In [74]:
# @log_execution_time
def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

In [75]:
result_1 = target_mean_v1(data, 'y', 'x')

In [76]:
%%timeit
target_mean_v1(data, 'y', 'x')

1 loop, best of 3: 24.3 s per loop


# 王然老师优化思路

In [77]:
# @log_execution_time
def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()

    for i in range(data.shape[0]):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1
    
    for i in range(data.shape[0]):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    
    return result

In [78]:
result_2 = target_mean_v2(data, 'y', 'x')

In [79]:
print(np.linalg.norm(result_1 - result_2))

0.0


In [80]:
%%timeit
target_mean_v2(data, 'y', 'x')

1 loop, best of 3: 274 ms per loop


# 优化1-python

In [81]:
import numpy as np
import pandas as pd

x1 = x.reshape(data.shape[0])
y1 = y.reshape(data.shape[0])

# @log_execution_time
def target_mean_v3(x,y):
    result = np.zeros(x.size)
    value_dict = {}
    count_dict = {}

    for i in range(x.size):
        key = x[i]
        val = y[i]
        if key not in value_dict.keys():
            value_dict[key] = val
            count_dict[key] = 1
        else:
            value_dict[key] += val
            count_dict[key] += 1
    
    for i in range(data.shape[0]):
        key = x[i]
        val = y[i]
        result[i] = (value_dict[key] - val) / (count_dict[key] - 1)
    
    return result

In [82]:
result_3 = target_mean_v3(x1,y1)

In [83]:
print(np.linalg.norm(result_2 - result_3))

0.0


In [84]:
%%timeit
target_mean_v3(x1,y1)

100 loops, best of 3: 8.64 ms per loop


# 优化2-Cython

cython语法不熟啊啊啊，c也不熟

In [85]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [86]:
%%cython -a

import numpy as np
cimport numpy as cnp

y3 = np.random.randint(2, size=(5000))
x3 = np.random.randint(10, size=(5000))

In [87]:
%%cython -a

import numpy as np
cimport numpy as cnp

# @cython.boundscheck(False)
# @cython.wraparound(False)
cpdef target_mean_v4(x,y):
    result = np.zeros(x.size)
    cdef dict value_dict = {}
    cdef dict count_dict = {}

    for i in range(x.size):
        key = x[i]
        val = y[i]
        if key not in value_dict.keys():
            value_dict[key] = val
            count_dict[key] = 1
        else:
            value_dict[key] += val
            count_dict[key] += 1
    
    for i in range(x.size):
        key = x[i]
        val = y[i]
        result[i] = (value_dict[key] - val) / (count_dict[key] - 1)
    
    return result

In [88]:
%%timeit
target_mean_v4(x3,y3)

100 loops, best of 3: 7.14 ms per loop
