<a href="https://colab.research.google.com/github/panh1992/ML-000/blob/main/Week02/chap02_school_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# chap02 作业

使用 cython 优化 target_encoding 代码，体验速度差距

## 本次优化过程主要分为几步

1. python 代码逻辑简化
2. 使用 cython 定义精确类型，提取多余计算变量，进一步提速
3. pandas loc 查找，替换为 numpy array 索引，替换后效率提升极大
4. cython 链接 c++ 实现：将计算过程完全放到 c++ 中，使用 openmp 进行并行处理

## 一、import 必要的模块，准备测试数据

In [1]:
import numpy as np
import pandas as pd

# Seeded local generator so that Restart & Run All regenerates the same test
# data (and therefore the same results in every cell below).
rng = np.random.default_rng(42)
y = rng.integers(2, size=(5000, 1))
x = rng.integers(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

# Show the first rows of the test data
data.head()

Unnamed: 0,y,x
0,0,4
1,1,7
2,1,7
3,0,3
4,1,3


## 二、 比较两个版本 python 代码差距， 提取版本二重复计算操作，优化版本二代码

### 版本一

In [2]:
def target_mean_v1(data, y_name, x_name):
  """Baseline (intentionally slow) leave-one-out target mean encoding.

  For every row ``i`` the frame without that row is grouped by ``x_name`` and
  the mean of ``y_name`` within row ``i``'s own group is stored.

  Args:
    data: DataFrame addressed by the labels ``0 .. len(data) - 1`` (the loop
      uses ``data.loc[i, ...]`` and ``data.index != i``).
    y_name: name of the target column.
    x_name: name of the grouping column.

  Returns:
    1-D float64 array with one value per row; NaN where the row is the only
    member of its group (no other rows are left to average).
  """
  n_rows = data.shape[0]
  result = np.zeros(n_rows)
  for i in range(n_rows):
    key = data.loc[i, x_name]
    others = data[data.index != i]
    # Group keys are kept as the index and looked up by label, so the result
    # does not depend on how ``as_index=False`` interacts with list aggregations.
    group_means = others.groupby(x_name)[y_name].mean()
    # Scalar lookup; a group that vanished after dropping row i yields NaN.
    result[i] = group_means.loc[key] if key in group_means.index else np.nan
  return result

In [3]:
v1_result = target_mean_v1(data, 'y', 'x')

v1_result

array([0.49420849, 0.4838013 , 0.4838013 , ..., 0.51335878, 0.47261663,
       0.5204918 ])

In [4]:
%%timeit
target_mean_v1(data, 'y', 'x')

1 loop, best of 3: 23.5 s per loop


### 版本二
 
提取多余计算变量

In [5]:
def target_mean_v2(data, y_name, x_name):
  """Leave-one-out target mean encoding using running per-group totals.

  A first pass accumulates, per distinct value of ``x_name``, the sum of
  ``y_name`` and the number of rows. A second pass removes each row's own
  contribution: ``(group_sum - y_i) / (group_count - 1)``.

  Args:
    data: DataFrame addressed by the labels ``0 .. len(data) - 1``.
    y_name: name of the target column.
    x_name: name of the grouping column.

  Returns:
    1-D float64 array with one encoded value per row.
  """
  n_rows = data.shape[0]
  sums = {}
  counts = {}

  # Pass 1: per-group sum of the target and row count.
  for row in range(n_rows):
    key = data.loc[row, x_name]
    target = data.loc[row, y_name]
    if key in sums:
      sums[key] += target
      counts[key] += 1
    else:
      sums[key] = target
      counts[key] = 1

  # Pass 2: subtract the row's own target before averaging.
  encoded = np.zeros(n_rows)
  for row in range(n_rows):
    key = data.loc[row, x_name]
    encoded[row] = (sums[key] - data.loc[row, y_name]) / (counts[key] - 1)

  return encoded

In [6]:
v2_result = target_mean_v2(data, 'y', 'x')

In [7]:
%%timeit
target_mean_v2(data, 'y', 'x')

10 loops, best of 3: 149 ms per loop


In [8]:
np.linalg.norm(v2_result - v1_result)

0.0

## 三、使用 cython 优化版本二代码

In [9]:
%load_ext Cython

### 1）使用 cython 定义精确类型，测试时间

In [10]:
%%cython -a --cplus

import numpy as np
cimport numpy as np
import pandas as pd

cpdef target_mean_v3(data, y_name, x_name):
  """Leave-one-out target mean encoding: version 2 with C type declarations.

  Same two-pass algorithm as the pure-Python version (per-group sum and count,
  then ``(group_sum - y_i) / (group_count - 1)``). Values are still read
  through ``data.loc``, so only the bookkeeping variables are typed.

  Args:
    data: DataFrame addressed by the labels ``0 .. len(data) - 1``.
    y_name: name of the target column.
    x_name: name of the grouping column.

  Returns:
    1-D float64 array with one encoded value per row.
  """
  cdef long length = data.shape[0]
  # Typed loop index so that ``result[i]`` uses the typed buffer access.
  cdef long i
  # Locals are declared with ``cdef``; ``cpdef`` is meant for functions.
  cdef np.ndarray[np.float64_t, ndim=1] result = np.zeros(length, dtype=np.float64)
  cdef dict value_dict = dict()
  cdef dict count_dict = dict()

  # Pass 1: per-group sum of the target and row count.
  for i in range(length):
    x = data.loc[i, x_name]
    if x not in value_dict:
      value_dict[x] = data.loc[i, y_name]
      count_dict[x] = 1
    else:
      value_dict[x] += data.loc[i, y_name]
      count_dict[x] += 1

  # Pass 2: remove the row's own contribution before averaging.
  for i in range(length):
    x = data.loc[i, x_name]
    result[i] = (value_dict[x] - data.loc[i, y_name]) / (count_dict[x] - 1)

  return result

In [11]:
v3_result = target_mean_v3(data, 'y', 'x')

In [12]:
%%timeit
target_mean_v3(data, 'y', 'x')

10 loops, best of 3: 148 ms per loop


In [13]:
np.linalg.norm(v3_result - v1_result)

0.0

### 2）pandas loc 查找，替换为 numpy array 索引

In [14]:
%%cython -a --cplus

import numpy as np
cimport numpy as np
import pandas as pd

cpdef target_mean_v4(data, y_name, x_name):
  """Leave-one-out target mean encoding reading from a NumPy array.

  Same two-pass algorithm as version 3, but the values come from
  ``data.values`` by position instead of ``data.loc``.

  Args:
    data: DataFrame containing both columns.
    y_name: name of the target column.
    x_name: name of the grouping column.

  Returns:
    1-D float64 array with one encoded value per row.
  """
  cdef long length = data.shape[0]
  # Typed loop index so that ``result[i]`` uses the typed buffer access.
  cdef long i
  # Locals are declared with ``cdef``; ``cpdef`` is meant for functions.
  cdef np.ndarray[np.float64_t, ndim=1] result = np.zeros(length, dtype=np.float64)
  cdef dict value_dict = dict()
  cdef dict count_dict = dict()

  # Column positions inside ``data.values`` (same order as ``data.columns``).
  cdef long x_index = data.columns.get_loc(x_name)
  cdef long y_index = data.columns.get_loc(y_name)
  cdef np.ndarray matrix = data.values

  # Pass 1: per-group sum of the target and row count.
  for i in range(length):
    # One 2-D lookup instead of ``matrix[i][...]``, which builds a row view first.
    x = matrix[i, x_index]
    if x not in value_dict:
      value_dict[x] = matrix[i, y_index]
      count_dict[x] = 1
    else:
      value_dict[x] += matrix[i, y_index]
      count_dict[x] += 1

  # Pass 2: remove the row's own contribution before averaging.
  for i in range(length):
    x = matrix[i, x_index]
    result[i] = (value_dict[x] - matrix[i, y_index]) / (count_dict[x] - 1)

  return result


In [15]:
v4_result = target_mean_v4(data, 'y', 'x')

In [16]:
%%timeit
target_mean_v4(data, 'y', 'x')

100 loops, best of 3: 9.29 ms per loop


In [17]:
np.linalg.norm(v4_result - v1_result)

0.0

### 3）cython 链接 c++ 实现：将计算过程完全放到 c++ 中，使用 openmp 进行并行处理

In [18]:
# 挂载谷歌云盘
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
# 查看c++实现代码
!cat /content/drive/MyDrive/target_encoding/target_encoding.h

#pragma once
#include <map>
#include <iostream>
#include <omp.h>

using namespace std;

void target_mean(double *matrix, double *result, const long row, const long col, const long x_index, const long y_index) {

    map<double, double> value_dict;
    map<double, double> count_dict;

    #pragma omp parallel for
    for (int i = 0; i < row; i++) {
        double x = matrix[i * col + x_index];
        auto value_iter = value_dict.find(x);
        auto count_iter = count_dict.find(x);
        if(value_iter != value_dict.end()) {
            value_dict[x] = value_iter->second + matrix[i * col + y_index];
            count_dict[x] = count_iter->second + 1;
        } else {
            value_dict[x] = matrix[i * col + y_index];
            count_dict[x] = 1;
        }
    }

    #pragma omp parallel for
    for (int i = 0; i < row; i++) {
        double x = matrix[i * col + x_index];
        result[i] = (value_dict[x] - matrix[i * col + y_index]) / (count_dict[x

In [20]:
# 查看 setup.py
!cat /content/drive/MyDrive/target_encoding/setup.py

from distutils.core import setup, Extension
from Cython.Build import cythonize
import numpy

compile_flags = ['-std=c++11', '-fopenmp']
linker_flags = ['-fopenmp']

module = Extension('target_encoding',
                   ['target_encoding.pyx'],
                   language='c++',
                   include_dirs=[numpy.get_include()],
                   extra_compile_args=compile_flags,
                   extra_link_args=linker_flags)

setup(
    name='cython_test',
    ext_modules=cythonize(module)
)


In [21]:
%%cython -a --cplus
# distutils: language = c++

import numpy as np
cimport numpy as np
import pandas as pd
cimport cython


cdef extern from "/content/drive/MyDrive/target_encoding/target_encoding.h":
  void target_mean(double *matrix, double *result, const long row, const long col, const long x_index, const long y_index) nogil


cpdef target_mean_v5(data, y_name, x_name):
  """Leave-one-out target mean encoding delegated to the C++ ``target_mean``.

  Copies the two columns into a C-contiguous float64 matrix, allocates the
  output array, and calls the extern ``target_mean`` with the GIL released.

  Args:
    data: DataFrame containing both columns (in any position).
    y_name: name of the target column.
    x_name: name of the grouping column.

  Returns:
    1-D float64 array with one value per row, filled in by ``target_mean``.

  NOTE(review): in the header shown in this notebook the first loop updates
  shared ``std::map`` objects inside ``#pragma omp parallel for`` without any
  synchronization; confirm that accumulation is safe (or make it serial).
  """
  cdef long length = data.shape[0]

  # The matrix is built from data[[y_name, x_name]], so y is always column 0
  # and x column 1, whatever their positions in ``data`` are. Using
  # data.columns.get_loc here would only be right for a frame laid out
  # exactly as ['y', 'x'] and could index past the two-column row otherwise.
  cdef long y_index = 0
  cdef long x_index = 1

  # np.zeros already returns a fresh C-contiguous float64 array.
  cdef np.ndarray[np.float64_t, ndim=1, mode = 'c'] result = np.zeros(length, dtype=np.float64)
  cdef np.ndarray[np.float64_t, ndim=2, mode = 'c'] matrix = np.ascontiguousarray(data[[y_name, x_name]].values, dtype=np.float64)

  # Raw pointers handed to C++; both arrays stay referenced for the whole call.
  cdef double* result_buff = <double*> result.data
  cdef double* matrix_buff = <double*> matrix.data

  cdef long row = matrix.shape[0]
  cdef long col = matrix.shape[1]

  with nogil:
    target_mean(matrix_buff, result_buff, row, col, x_index, y_index)

  return result


In [22]:
v5_result = target_mean_v5(data, 'y', 'x')

In [23]:
%%timeit -n 1 -p 1
target_mean_v5(data, 'y', 'x')

1 loop, best of 3: 2 ms per loop


In [24]:
np.linalg.norm(v5_result - v1_result)

0.0

In [25]:
v5_result

array([0.49420849, 0.4838013 , 0.4838013 , ..., 0.51335878, 0.47261663,
       0.5204918 ])