<a href="https://colab.research.google.com/github/panh1992/ML-000/blob/main/Week02/chap02_school_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# chap02 作业

使用 cython 优化 target_encoding 代码，体验速度差距

## 1、import 必要的模块，准备测试数据

In [97]:
import numpy as np
import pandas as pd

y = np.random.randint(2, size=(100, 1))
x = np.random.randint(10, size=(100, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

In [98]:
# 展示测试数据
data

Unnamed: 0,y,x
0,0,6
1,0,4
2,1,2
3,0,6
4,1,3
...,...,...
95,1,9
96,0,7
97,1,9
98,1,9


## 2、 比较两个版本 python 代码差距  

### 版本一

In [99]:
def target_mean_v1(data, y_name, x_name):
  result = np.zeros(data.shape[0])
  for i in range(data.shape[0]):
    groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
    result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
  return result

In [100]:
v1_result = target_mean_v1(data, 'y', 'x')

v1_result

array([0.41666667, 0.45454545, 0.57142857, 0.41666667, 0.53846154,
       0.4       , 0.57142857, 0.45454545, 0.53846154, 0.61538462,
       0.5       , 0.71428571, 0.53846154, 0.85714286, 0.6       ,
       0.        , 0.36363636, 0.85714286, 0.6       , 0.75      ,
       0.5       , 0.41666667, 0.5       , 0.625     , 0.        ,
       0.45454545, 0.85714286, 0.45454545, 0.85714286, 0.33333333,
       0.71428571, 0.625     , 0.75      , 0.41666667, 0.41666667,
       0.        , 0.71428571, 0.57142857, 0.71428571, 0.57142857,
       0.71428571, 0.625     , 0.        , 0.4       , 0.61538462,
       0.33333333, 0.85714286, 0.85714286, 0.41666667, 0.85714286,
       0.5       , 0.4       , 0.57142857, 0.41666667, 0.53846154,
       0.85714286, 0.53846154, 0.625     , 0.5       , 0.33333333,
       0.45454545, 0.61538462, 0.5       , 0.6       , 0.5       ,
       1.        , 0.36363636, 0.85714286, 0.33333333, 0.33333333,
       0.75      , 0.        , 0.36363636, 0.625     , 0.4    

In [101]:
%%timeit
target_mean_v1(data, 'y', 'x')

1 loop, best of 3: 474 ms per loop


### 版本二

In [102]:
def target_mean_v2(data, y_name, x_name):
  result = np.zeros(data.shape[0])
  value_dict = dict()
  count_dict = dict()
  for i in range(data.shape[0]):
    if data.loc[i, x_name] not in value_dict.keys():
      value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
      count_dict[data.loc[i, x_name]] = 1
    else:
      value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
      count_dict[data.loc[i, x_name]] += 1
  for i in range(data.shape[0]):
    result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
  return result

In [103]:
v2_result = target_mean_v1(data, 'y', 'x')

v2_result

array([0.41666667, 0.45454545, 0.57142857, 0.41666667, 0.53846154,
       0.4       , 0.57142857, 0.45454545, 0.53846154, 0.61538462,
       0.5       , 0.71428571, 0.53846154, 0.85714286, 0.6       ,
       0.        , 0.36363636, 0.85714286, 0.6       , 0.75      ,
       0.5       , 0.41666667, 0.5       , 0.625     , 0.        ,
       0.45454545, 0.85714286, 0.45454545, 0.85714286, 0.33333333,
       0.71428571, 0.625     , 0.75      , 0.41666667, 0.41666667,
       0.        , 0.71428571, 0.57142857, 0.71428571, 0.57142857,
       0.71428571, 0.625     , 0.        , 0.4       , 0.61538462,
       0.33333333, 0.85714286, 0.85714286, 0.41666667, 0.85714286,
       0.5       , 0.4       , 0.57142857, 0.41666667, 0.53846154,
       0.85714286, 0.53846154, 0.625     , 0.5       , 0.33333333,
       0.45454545, 0.61538462, 0.5       , 0.6       , 0.5       ,
       1.        , 0.36363636, 0.85714286, 0.33333333, 0.33333333,
       0.75      , 0.        , 0.36363636, 0.625     , 0.4    

In [104]:
%%timeit
target_mean_v2(data, 'y', 'x')

100 loops, best of 3: 5.3 ms per loop


## 3、使用 cython 优化 版本二 代码

In [105]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


#### 1）单纯提取多余计算变量 比较速度差异

In [112]:
%%cython -a --cplus

import numpy as np
import pandas as pd

cpdef target_mean_v3(data, y_name, x_name):
  length = data.shape[0]
  result = np.zeros(data.shape[0])
  value_dict = dict()
  count_dict = dict()

  for i in range(length):
    x = data.loc[i, x_name]
    if x not in value_dict.keys():
      value_dict[x] = data.loc[i, y_name]
      count_dict[x] = 1
    else:
      value_dict[x] += data.loc[i, y_name]
      count_dict[x] += 1
  
  for i in range(length):
    x = data.loc[i, x_name]
    result[i] = (value_dict[x] - data.loc[i, y_name]) / (count_dict[x] - 1)
  return result

In [113]:
v3_result = target_mean_v3(data, 'y', 'x')

v3_result

array([0.41666667, 0.45454545, 0.57142857, 0.41666667, 0.53846154,
       0.4       , 0.57142857, 0.45454545, 0.53846154, 0.61538462,
       0.5       , 0.71428571, 0.53846154, 0.85714286, 0.6       ,
       0.        , 0.36363636, 0.85714286, 0.6       , 0.75      ,
       0.5       , 0.41666667, 0.5       , 0.625     , 0.        ,
       0.45454545, 0.85714286, 0.45454545, 0.85714286, 0.33333333,
       0.71428571, 0.625     , 0.75      , 0.41666667, 0.41666667,
       0.        , 0.71428571, 0.57142857, 0.71428571, 0.57142857,
       0.71428571, 0.625     , 0.        , 0.4       , 0.61538462,
       0.33333333, 0.85714286, 0.85714286, 0.41666667, 0.85714286,
       0.5       , 0.4       , 0.57142857, 0.41666667, 0.53846154,
       0.85714286, 0.53846154, 0.625     , 0.5       , 0.33333333,
       0.45454545, 0.61538462, 0.5       , 0.6       , 0.5       ,
       1.        , 0.36363636, 0.85714286, 0.33333333, 0.33333333,
       0.75      , 0.        , 0.36363636, 0.625     , 0.4    

In [114]:
%%timeit
target_mean_v3(data, 'y', 'x')

100 loops, best of 3: 3.02 ms per loop


### 2）使用 numpy array 代替 pandas loc 查找 比较速度差异

In [119]:
%%cython -a --cplus

import numpy as np
import pandas as pd

cpdef target_mean_v4(data, y_name, x_name):

  length = data.shape[0]
  result = np.zeros(data.shape[0])
  value_dict = dict()
  count_dict = dict()

  x_index = data.columns.get_loc(x_name)
  y_index = data.columns.get_loc(y_name)
  matrix = data.values

  for i in range(length):
    x = matrix[i][x_index]
    if x not in value_dict.keys():
      value_dict[x] = matrix[i][y_index]
      count_dict[x] = 1
    else:
      value_dict[x] += matrix[i][y_index]
      count_dict[x] += 1
  
  for i in range(length):
    x = matrix[i][x_index]
    result[i] = (value_dict[x] - matrix[i][y_index]) / (count_dict[x] - 1)
  return result


In [120]:
v4_result = target_mean_v4(data, 'y', 'x')

v4_result

array([0.41666667, 0.45454545, 0.57142857, 0.41666667, 0.53846154,
       0.4       , 0.57142857, 0.45454545, 0.53846154, 0.61538462,
       0.5       , 0.71428571, 0.53846154, 0.85714286, 0.6       ,
       0.        , 0.36363636, 0.85714286, 0.6       , 0.75      ,
       0.5       , 0.41666667, 0.5       , 0.625     , 0.        ,
       0.45454545, 0.85714286, 0.45454545, 0.85714286, 0.33333333,
       0.71428571, 0.625     , 0.75      , 0.41666667, 0.41666667,
       0.        , 0.71428571, 0.57142857, 0.71428571, 0.57142857,
       0.71428571, 0.625     , 0.        , 0.4       , 0.61538462,
       0.33333333, 0.85714286, 0.85714286, 0.41666667, 0.85714286,
       0.5       , 0.4       , 0.57142857, 0.41666667, 0.53846154,
       0.85714286, 0.53846154, 0.625     , 0.5       , 0.33333333,
       0.45454545, 0.61538462, 0.5       , 0.6       , 0.5       ,
       1.        , 0.36363636, 0.85714286, 0.33333333, 0.33333333,
       0.75      , 0.        , 0.36363636, 0.625     , 0.4    

In [121]:
%%timeit
target_mean_v4(data, 'y', 'x')

1000 loops, best of 3: 238 µs per loop


### 3）使用 numpy cimport 比较速度差异

In [161]:
%%cython -a --cplus

import numpy as np
cimport numpy as cnp
import pandas as pd

cpdef target_mean_v5(data, y_name, x_name):
  cdef long long length = data.shape[0]
  cdef cnp.ndarray result = np.zeros(data.shape[0])
  value_dict = dict()
  count_dict = dict()

  x_index = data.columns.get_loc(x_name)
  y_index = data.columns.get_loc(y_name)
  matrix = data.values

  for i in range(length):
    x = matrix[i][x_index]
    if x not in value_dict.keys():
      value_dict[x] = matrix[i][y_index]
      count_dict[x] = 1
    else:
      value_dict[x] += matrix[i][y_index]
      count_dict[x] += 1
  
  for i in range(length):
    x = matrix[i][x_index]
    result[i] = (value_dict[x] - matrix[i][y_index]) / (count_dict[x] - 1)
  return result

In [162]:
v5_result = target_mean_v5(data, 'y', 'x')

v5_result

array([0.41666667, 0.45454545, 0.57142857, 0.41666667, 0.53846154,
       0.4       , 0.57142857, 0.45454545, 0.53846154, 0.61538462,
       0.5       , 0.71428571, 0.53846154, 0.85714286, 0.6       ,
       0.        , 0.36363636, 0.85714286, 0.6       , 0.75      ,
       0.5       , 0.41666667, 0.5       , 0.625     , 0.        ,
       0.45454545, 0.85714286, 0.45454545, 0.85714286, 0.33333333,
       0.71428571, 0.625     , 0.75      , 0.41666667, 0.41666667,
       0.        , 0.71428571, 0.57142857, 0.71428571, 0.57142857,
       0.71428571, 0.625     , 0.        , 0.4       , 0.61538462,
       0.33333333, 0.85714286, 0.85714286, 0.41666667, 0.85714286,
       0.5       , 0.4       , 0.57142857, 0.41666667, 0.53846154,
       0.85714286, 0.53846154, 0.625     , 0.5       , 0.33333333,
       0.45454545, 0.61538462, 0.5       , 0.6       , 0.5       ,
       1.        , 0.36363636, 0.85714286, 0.33333333, 0.33333333,
       0.75      , 0.        , 0.36363636, 0.625     , 0.4    

In [163]:
%%timeit
target_mean_v5(data, 'y', 'x')

1000 loops, best of 3: 211 µs per loop


## 总结