## START MY CHAP02 HOMEWORK

In [47]:
import numpy as np
import pandas as pd

y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

def is_eq(r1, r2):
  if np.linalg.norm(r1 - r2) != 0.0:
    raise AssertionError("两者不匹配")
  else:
    print("It's right.")

#### 未优化的demo


In [4]:
def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        # 排除索引为自己的数字，然后根据x值groupby,统计平均值、总数
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        # 找到x值，对应的groupby的index,看看y列的mean值是多少
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

#### 老师提供的优化思路的demo

In [41]:
def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1
    for i in range(data.shape[0]):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

#### python优化方案
优化思路：
1. 考虑pandas的遍历比较慢，换成原始的数组
2. 考虑循环内有多次遍历，换成先提取
3. 尝试减少一次循环 -> 没成功

In [42]:
def target_mean_v3(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    xs = data[x_name].values
    ys = data[y_name].values
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        index = xs[i]
        if index not in value_dict.keys():
            value_dict[index] = ys[i]
            count_dict[index] = 0
        else:
            value_dict[index] += ys[i]
            count_dict[index] += 1

    for i in range(data.shape[0]):
        index = xs[i]
        result[i] = (value_dict[index] - ys[i]) / count_dict[index]
    return result

#### cython优化方案1
优化思路：
1. 外部调用的方法和参数不能变
2. 增加一个python的方法，将pandas转数组的工作交给python代码完成
3. Cython代码只实现两次循环方案

In [2]:
%load_ext cython

In [25]:
%%cython -a

import numpy as np 
cimport numpy as cnp

cpdef target_mean_v4_cython(cnp.ndarray[long] xs, cnp.ndarray[long] ys, int shape, cnp.ndarray result, str y_name, str x_name):
    value_dict = dict()
    count_dict = dict()
    for i in range(shape):
        index = xs[i]
        if index not in value_dict.keys():
            value_dict[index] = ys[i]
            count_dict[index] = 0
        else:
            value_dict[index] += ys[i]
            count_dict[index] += 1
    for i in range(shape):
        index = xs[i]
        result[i] = (value_dict[index] - ys[i]) / count_dict[index]
    return result
  
def target_mean_v4(data, y_name, x_name):
    xs = data[x_name].values
    ys = data[y_name].values
    shape = data.shape[0]
    result = np.zeros(shape)
    return target_mean_v4_cython(xs, ys, shape, result, y_name, x_name)

#### cython优化方案2
优化思路：
1. 去除所有python部分代码，用cython实现
2. 明确cython申明类型
3. 去除类型检查、包装检查
4. 修改for循环用range改为更cython的写法

然而.....然并软用

In [27]:
%%cython -a

import numpy as np
cimport numpy as cnp
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v5_cython(data, str y_name, str x_name):
    cdef cnp.ndarray[long] xs = data[x_name].values
    cdef cnp.ndarray[long] ys = data[y_name].values
    cdef int n = data.shape[0]
    cdef cnp.ndarray[double] result = np.zeros(n)

    cdef value_dict = dict()
    cdef count_dict = dict()
    cdef int i = 0

    for i from 0<=i<n:
        index = xs[i]
        if index not in value_dict.keys():
            value_dict[index] = ys[i]
            count_dict[index] = 0
        else:
            value_dict[index] += ys[i]
            count_dict[index] += 1

    for i from 0<=i<n:
        index = xs[i]
        result[i] = (value_dict[index] - ys[i]) / count_dict[index]

    
    return result

#### cython优化方案3
优化思路：更底层的优化
1. 考虑用数组代替dict
2. 但是这里有个问题，数组申明的时候必须指定大小。如何可以动态改变数组大小呢？？这个还需要研究，这里这个写法很不雅观，后面再再想想怎么解决。
```python
cdef long[:] value_sum = np.zeros(10).astype(long)
cdef long[:] count = np.zeros(10).astype(long)
```

In [33]:
%%cython -a

import numpy as np
cimport numpy as cnp
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v6_cython(data, str y_name, str x_name):
    cdef long[:] xs = data[x_name].values
    cdef long[:] ys = data[y_name].values
    cdef int n = data.shape[0]
    cdef double[:] result = np.zeros(n)
    cdef long[:] value_sum = np.zeros(10).astype(long)
    cdef long[:] count = np.zeros(10).astype(long)
    for i in range(n):
        x_v, y_v = xs[i], ys[i]
        value_sum[x_v] += y_v
        count[x_v] += 1
    for i in range(n):
        x_v, y_v = xs[i], ys[i]
        result[i] = (value_sum[x_v] - y_v) / (count[x_v] - 1)
    return result

#### cython优化方案4
优化思路：再原有基础上再增加多线程

1. 找到cython中有prange可以实现多线程，先测试一下。

```python
for i in prange(n, nogil=True):
```

2. 这里注意关闭了GIL后，再循环类不能再赋值，否则会报错，具体原因还不知道，需要再研究。

```python
for i in prange(n, nogil=True):
    x_v, y_v = xs[i], ys[i]
    result[i] = (value_sum[x_v] - y_v) / (count[x_v] - 1)
```
Error
```md
Error compiling Cython file:
------------------------------------------------------------
    cdef int n = data.shape[0]
    cdef double[:] result = np.zeros(n)
    cdef long[:] value_sum = np.zeros(10).astype(long)
    cdef long[:] count = np.zeros(10).astype(long)
    for i in prange(n, nogil=True):
        x_v, y_v = xs[i], ys[i]
       ^
------------------------------------------------------------
Assignment of Python object not allowed without gil
```

In [38]:
%%cython -a

import numpy as np
cimport numpy as cnp
from cython.parallel import prange
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v7_cython(data, str y_name, str x_name):
    cdef long[:] xs = data[x_name].values
    cdef long[:] ys = data[y_name].values
    cdef int n = data.shape[0]
    cdef double[:] result = np.zeros(n)
    cdef long[:] value_sum = np.zeros(10).astype(long)
    cdef long[:] count = np.zeros(10).astype(long)
    cdef int i = 0
    for i in prange(n, nogil=True):
        value_sum[xs[i]] += ys[i]
        count[xs[i]] += 1
    for i in prange(n, nogil=True):
        result[i] = (value_sum[xs[i]] - ys[i]) / (count[xs[i]] - 1)
    return result

In [49]:
r1 = target_mean_v1(data, 'y', 'x')
is_eq(r1, target_mean_v2(data, 'y', 'x'))
is_eq(r1, target_mean_v3(data, 'y', 'x'))
is_eq(r1, target_mean_v4(data, 'y', 'x'))
is_eq(r1, target_mean_v5_cython(data, 'y', 'x'))
is_eq(r1, target_mean_v6_cython(data, 'y', 'x'))
is_eq(r1, target_mean_v7_cython(data, 'y', 'x'))

It's right.
It's right.
It's right.
It's right.
It's right.
It's right.


In [51]:
# %%timeit -r 100
%timeit -n 1 target_mean_v1(data, 'y', 'x')
%timeit -n 1 target_mean_v2(data, 'y', 'x')
%timeit -n 100 target_mean_v3(data, 'y', 'x')
%timeit -n 100 target_mean_v4(data, 'y', 'x')
%timeit -n 100 target_mean_v5_cython(data, 'y', 'x')
%timeit -n 100 target_mean_v6_cython(data, 'y', 'x')
%timeit -n 100 target_mean_v7_cython(data, 'y', 'x')

1 loop, best of 3: 24.2 s per loop
1 loop, best of 3: 267 ms per loop
100 loops, best of 3: 8.66 ms per loop
100 loops, best of 3: 1.19 ms per loop
100 loops, best of 3: 1.23 ms per loop
100 loops, best of 3: 550 µs per loop
100 loops, best of 3: 37.1 µs per loop
