<a href="https://colab.research.google.com/github/panh1992/ML-000/blob/main/Week02/chap02_school_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# chap02 作业

使用 cython 优化 target_encoding 代码，体验速度差距

## 本次优化过程主要分为几步

1. python 代码逻辑简化
2. 使用 cython 提取多余计算变量
3. 将 pandas `loc` 查找替换为 numpy array 操作，替换后效率提升极大
4. 使用 `cdef` 定义精确类型，进一步提速

<span id = "1"></span>
## 1、import 必要的模块，准备测试数据

In [1]:
import numpy as np
import pandas as pd

# Seed the global NumPy generator so the sample data, and every result array
# derived from it, is the same on each fresh kernel run.
np.random.seed(42)

# 200 rows: a binary target `y` and an integer category `x` drawn from 0..9.
y = np.random.randint(2, size=(200, 1))
x = np.random.randint(10, size=(200, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

# Preview the test data
data.head()

Unnamed: 0,y,x
0,0,6
1,1,8
2,0,7
3,0,5
4,1,5


## 2、 比较两个版本 python 代码差距  

### 版本一

In [2]:
def target_mean_v1(data, y_name, x_name):
  """Leave-one-out target mean encoding (deliberately slow baseline).

  For each row i, computes the mean of ``y_name`` over all *other* rows that
  share row i's ``x_name`` value. A full groupby is run once per row on
  purpose: this is the reference the faster versions are timed against.

  Args:
    data: DataFrame holding both columns.
    y_name: name of the numeric target column.
    x_name: name of the category column to encode.

  Returns:
    1-D float ndarray with one value per row, in row order. A row whose
    category appears in no other row gets NaN.
  """
  n_rows = data.shape[0]
  result = np.zeros(n_rows)
  positions = np.arange(n_rows)
  categories = data[x_name].to_numpy()
  for i in range(n_rows):
    # Drop row i by position, so the result does not depend on index labels.
    others = data.iloc[positions != i]
    # Only the target column's mean is needed; index the result by category.
    group_means = others.groupby(x_name)[y_name].mean()
    category = categories[i]
    # Read a true scalar; a category with no remaining rows has no mean.
    if category in group_means.index:
      result[i] = group_means.loc[category]
    else:
      result[i] = np.nan
  return result

In [3]:
# Run version one on the sample data and display the encoded values
v1_result = target_mean_v1(data, 'y', 'x')

v1_result

array([0.55555556, 0.38888889, 0.47058824, 0.5       , 0.44444444,
       0.42424242, 0.44444444, 0.44444444, 0.46875   , 0.625     ,
       0.33333333, 0.45454545, 0.44444444, 0.55555556, 0.33333333,
       0.6       , 0.41666667, 0.45454545, 0.33333333, 0.38888889,
       0.55      , 0.46875   , 0.45454545, 0.44444444, 0.46875   ,
       0.5       , 0.5       , 0.45454545, 0.41666667, 0.5       ,
       0.46875   , 0.33333333, 0.6875    , 0.47058824, 0.42424242,
       0.45454545, 0.45454545, 0.45454545, 0.55      , 0.42424242,
       0.55      , 0.33333333, 0.44444444, 0.45454545, 0.46875   ,
       0.33333333, 0.4375    , 0.44444444, 0.26666667, 0.55      ,
       0.625     , 0.44444444, 0.46875   , 0.4375    , 0.5       ,
       0.42424242, 0.4375    , 0.42424242, 0.41176471, 0.44444444,
       0.6875    , 0.38888889, 0.5       , 0.5       , 0.41176471,
       0.44444444, 0.6       , 0.6       , 0.625     , 0.46875   ,
       0.625     , 0.42424242, 0.45454545, 0.41176471, 0.5    

In [4]:
%%timeit
target_mean_v1(data, 'y', 'x')

1 loop, best of 3: 730 ms per loop


### 版本二

In [5]:
def target_mean_v2(data, y_name, x_name):
  """Leave-one-out target mean encoding using two passes and two dicts.

  Pass 1 accumulates, per ``x_name`` value, the sum of ``y_name`` and the
  number of rows. Pass 2 removes each row's own target from its category's
  totals and divides by the remaining row count.

  Rows are addressed with ``data.loc[row, ...]`` for row = 0..n-1, and every
  value is looked up again each time it is needed.

  Args:
    data: DataFrame holding both columns.
    y_name: name of the numeric target column.
    x_name: name of the category column to encode.

  Returns:
    1-D float ndarray with one value per row. For a category that occurs
    only once the denominator is 0.
  """
  n_rows = data.shape[0]
  target_sums = {}
  group_sizes = {}

  # Pass 1: per-category running sum of the target and row count.
  for row in range(n_rows):
    if data.loc[row, x_name] in target_sums:
      target_sums[data.loc[row, x_name]] += data.loc[row, y_name]
      group_sizes[data.loc[row, x_name]] += 1
    else:
      target_sums[data.loc[row, x_name]] = data.loc[row, y_name]
      group_sizes[data.loc[row, x_name]] = 1

  # Pass 2: subtract the row's own contribution, then average what is left.
  result = np.zeros(n_rows)
  for row in range(n_rows):
    others_sum = target_sums[data.loc[row, x_name]] - data.loc[row, y_name]
    others_count = group_sizes[data.loc[row, x_name]] - 1
    result[row] = others_sum / others_count
  return result

In [6]:
# Run version two itself, so the displayed values come from target_mean_v2
v2_result = target_mean_v2(data, 'y', 'x')

# The faster implementation must reproduce the baseline values
np.testing.assert_allclose(v2_result, v1_result)

v2_result

array([0.55555556, 0.38888889, 0.47058824, 0.5       , 0.44444444,
       0.42424242, 0.44444444, 0.44444444, 0.46875   , 0.625     ,
       0.33333333, 0.45454545, 0.44444444, 0.55555556, 0.33333333,
       0.6       , 0.41666667, 0.45454545, 0.33333333, 0.38888889,
       0.55      , 0.46875   , 0.45454545, 0.44444444, 0.46875   ,
       0.5       , 0.5       , 0.45454545, 0.41666667, 0.5       ,
       0.46875   , 0.33333333, 0.6875    , 0.47058824, 0.42424242,
       0.45454545, 0.45454545, 0.45454545, 0.55      , 0.42424242,
       0.55      , 0.33333333, 0.44444444, 0.45454545, 0.46875   ,
       0.33333333, 0.4375    , 0.44444444, 0.26666667, 0.55      ,
       0.625     , 0.44444444, 0.46875   , 0.4375    , 0.5       ,
       0.42424242, 0.4375    , 0.42424242, 0.41176471, 0.44444444,
       0.6875    , 0.38888889, 0.5       , 0.5       , 0.41176471,
       0.44444444, 0.6       , 0.6       , 0.625     , 0.46875   ,
       0.625     , 0.42424242, 0.45454545, 0.41176471, 0.5    

In [7]:
%%timeit
target_mean_v2(data, 'y', 'x')

100 loops, best of 3: 9.33 ms per loop


## 3、使用 cython 优化 版本二 代码

In [8]:
%load_ext Cython

### 1）单纯提取多余计算变量

In [9]:
%%cython -a --cplus

import numpy as np
import pandas as pd

cpdef target_mean_v3(data, y_name, x_name):
  """Leave-one-out target mean encoding, compiled with Cython.

  Same two-pass algorithm as the pure-Python version, but each row's
  category (and, in pass 1, its target) is read from ``data.loc`` once per
  iteration and reused. No C types are declared.

  Args:
    data: DataFrame holding both columns, addressed by labels 0..n-1.
    y_name: name of the numeric target column.
    x_name: name of the category column to encode.

  Returns:
    1-D float ndarray with one value per row.
  """
  n_rows = data.shape[0]
  target_sums = {}
  group_sizes = {}

  # Pass 1: per-category sum of the target and row count.
  for row in range(n_rows):
    key = data.loc[row, x_name]
    target = data.loc[row, y_name]
    if key in target_sums:
      target_sums[key] += target
      group_sizes[key] += 1
    else:
      target_sums[key] = target
      group_sizes[key] = 1

  # Pass 2: remove the row's own target from its category totals.
  result = np.zeros(n_rows)
  for row in range(n_rows):
    key = data.loc[row, x_name]
    result[row] = (target_sums[key] - data.loc[row, y_name]) / (group_sizes[key] - 1)
  return result

In [10]:
v3_result = target_mean_v3(data, 'y', 'x')

# The Cython version must reproduce the baseline values
np.testing.assert_allclose(v3_result, v1_result)

v3_result

array([0.55555556, 0.38888889, 0.47058824, 0.5       , 0.44444444,
       0.42424242, 0.44444444, 0.44444444, 0.46875   , 0.625     ,
       0.33333333, 0.45454545, 0.44444444, 0.55555556, 0.33333333,
       0.6       , 0.41666667, 0.45454545, 0.33333333, 0.38888889,
       0.55      , 0.46875   , 0.45454545, 0.44444444, 0.46875   ,
       0.5       , 0.5       , 0.45454545, 0.41666667, 0.5       ,
       0.46875   , 0.33333333, 0.6875    , 0.47058824, 0.42424242,
       0.45454545, 0.45454545, 0.45454545, 0.55      , 0.42424242,
       0.55      , 0.33333333, 0.44444444, 0.45454545, 0.46875   ,
       0.33333333, 0.4375    , 0.44444444, 0.26666667, 0.55      ,
       0.625     , 0.44444444, 0.46875   , 0.4375    , 0.5       ,
       0.42424242, 0.4375    , 0.42424242, 0.41176471, 0.44444444,
       0.6875    , 0.38888889, 0.5       , 0.5       , 0.41176471,
       0.44444444, 0.6       , 0.6       , 0.625     , 0.46875   ,
       0.625     , 0.42424242, 0.45454545, 0.41176471, 0.5    

In [11]:
%%timeit
target_mean_v3(data, 'y', 'x')

100 loops, best of 3: 5.39 ms per loop


### 2）使用 numpy array 代替 pandas loc 查找

In [12]:
%%cython -a --cplus

import numpy as np
import pandas as pd

cpdef target_mean_v4(data, y_name, x_name):
  """Leave-one-out target mean encoding over the frame's NumPy values.

  The column positions of ``x_name`` and ``y_name`` are resolved once, and
  both loops then index ``data.values`` by row and column position instead
  of going through ``data.loc``.

  Args:
    data: DataFrame holding both columns.
    y_name: name of the numeric target column.
    x_name: name of the category column to encode.

  Returns:
    1-D float ndarray with one value per row.
  """
  n_rows = data.shape[0]

  # Resolve the column names to positions in the underlying array once.
  x_col = data.columns.get_loc(x_name)
  y_col = data.columns.get_loc(y_name)
  values = data.values

  target_sums = {}
  group_sizes = {}

  # Pass 1: per-category sum of the target and row count.
  for row in range(n_rows):
    record = values[row]
    key = record[x_col]
    if key in target_sums:
      target_sums[key] += record[y_col]
      group_sizes[key] += 1
    else:
      target_sums[key] = record[y_col]
      group_sizes[key] = 1

  # Pass 2: remove the row's own target from its category totals.
  result = np.zeros(n_rows)
  for row in range(n_rows):
    record = values[row]
    key = record[x_col]
    result[row] = (target_sums[key] - record[y_col]) / (group_sizes[key] - 1)
  return result


In [13]:
v4_result = target_mean_v4(data, 'y', 'x')

# The array-based version must reproduce the baseline values
np.testing.assert_allclose(v4_result, v1_result)

v4_result

array([0.55555556, 0.38888889, 0.47058824, 0.5       , 0.44444444,
       0.42424242, 0.44444444, 0.44444444, 0.46875   , 0.625     ,
       0.33333333, 0.45454545, 0.44444444, 0.55555556, 0.33333333,
       0.6       , 0.41666667, 0.45454545, 0.33333333, 0.38888889,
       0.55      , 0.46875   , 0.45454545, 0.44444444, 0.46875   ,
       0.5       , 0.5       , 0.45454545, 0.41666667, 0.5       ,
       0.46875   , 0.33333333, 0.6875    , 0.47058824, 0.42424242,
       0.45454545, 0.45454545, 0.45454545, 0.55      , 0.42424242,
       0.55      , 0.33333333, 0.44444444, 0.45454545, 0.46875   ,
       0.33333333, 0.4375    , 0.44444444, 0.26666667, 0.55      ,
       0.625     , 0.44444444, 0.46875   , 0.4375    , 0.5       ,
       0.42424242, 0.4375    , 0.42424242, 0.41176471, 0.44444444,
       0.6875    , 0.38888889, 0.5       , 0.5       , 0.41176471,
       0.44444444, 0.6       , 0.6       , 0.625     , 0.46875   ,
       0.625     , 0.42424242, 0.45454545, 0.41176471, 0.5    

In [14]:
%%timeit
target_mean_v4(data, 'y', 'x')

1000 loops, best of 3: 389 µs per loop


### 3）使用 numpy cimport 并用 cdef 声明类型

In [15]:
%%cython -a --cplus

import numpy as np
cimport numpy as cnp
import pandas as pd

cpdef target_mean_v5(data, y_name, x_name):
  """Leave-one-out target mean encoding with some C-typed locals.

  Same array-based two-pass algorithm as the previous version. The row
  count and the two column positions are declared as C integers and the
  output is declared as ``cnp.ndarray``; the dictionaries and the values
  array are left untyped.

  Args:
    data: DataFrame holding both columns.
    y_name: name of the numeric target column.
    x_name: name of the category column to encode.

  Returns:
    1-D float ndarray with one value per row.
  """
  cdef long long n_rows = data.shape[0]
  cdef long x_col = data.columns.get_loc(x_name)
  cdef long y_col = data.columns.get_loc(y_name)
  cdef cnp.ndarray result = np.zeros(n_rows)

  values = data.values
  target_sums = {}
  group_sizes = {}

  # Pass 1: per-category sum of the target and row count.
  for row in range(n_rows):
    record = values[row]
    key = record[x_col]
    if key in target_sums:
      target_sums[key] += record[y_col]
      group_sizes[key] += 1
    else:
      target_sums[key] = record[y_col]
      group_sizes[key] = 1

  # Pass 2: remove the row's own target from its category totals.
  for row in range(n_rows):
    record = values[row]
    key = record[x_col]
    result[row] = (target_sums[key] - record[y_col]) / (group_sizes[key] - 1)
  return result

In [16]:
v5_result = target_mean_v5(data, 'y', 'x')

# The typed Cython version must reproduce the baseline values
np.testing.assert_allclose(v5_result, v1_result)

v5_result

array([0.55555556, 0.38888889, 0.47058824, 0.5       , 0.44444444,
       0.42424242, 0.44444444, 0.44444444, 0.46875   , 0.625     ,
       0.33333333, 0.45454545, 0.44444444, 0.55555556, 0.33333333,
       0.6       , 0.41666667, 0.45454545, 0.33333333, 0.38888889,
       0.55      , 0.46875   , 0.45454545, 0.44444444, 0.46875   ,
       0.5       , 0.5       , 0.45454545, 0.41666667, 0.5       ,
       0.46875   , 0.33333333, 0.6875    , 0.47058824, 0.42424242,
       0.45454545, 0.45454545, 0.45454545, 0.55      , 0.42424242,
       0.55      , 0.33333333, 0.44444444, 0.45454545, 0.46875   ,
       0.33333333, 0.4375    , 0.44444444, 0.26666667, 0.55      ,
       0.625     , 0.44444444, 0.46875   , 0.4375    , 0.5       ,
       0.42424242, 0.4375    , 0.42424242, 0.41176471, 0.44444444,
       0.6875    , 0.38888889, 0.5       , 0.5       , 0.41176471,
       0.44444444, 0.6       , 0.6       , 0.625     , 0.46875   ,
       0.625     , 0.42424242, 0.45454545, 0.41176471, 0.5    

In [17]:
%%timeit
target_mean_v5(data, 'y', 'x')

1000 loops, best of 3: 339 µs per loop
