# Appendix A: Advanced NumPy

In [5]:
import numpy as np
import pandas as pd

## A.1 ndarray Object Internals

In [6]:
array = np.arange(1, 100)

In [7]:
array[::-1]

array([99, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 87, 86, 85, 84, 83,
       82, 81, 80, 79, 78, 77, 76, 75, 74, 73, 72, 71, 70, 69, 68, 67, 66,
       65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
       48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32,
       31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
       14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1])

In [8]:
arr = np.arange(12).reshape((3, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [9]:
np.ones((3, 4, 5), dtype=np.float64).strides

(160, 40, 8)

In [10]:
ints = np.ones(10, dtype=np.uint16)
floats = np.ones(10, dtype=np.float32)

In [11]:
np.issubdtype(ints.dtype, np.integer)

True

In [12]:
np.issubdtype(floats.dtype, np.floating)

True

In [13]:
np.float64.mro()

[numpy.float64,
 numpy.floating,
 numpy.inexact,
 numpy.number,
 numpy.generic,
 float,
 object]

## A.2 Advanced Array Manipulation

In [14]:
arr = np.arange(8)
arr

array([0, 1, 2, 3, 4, 5, 6, 7])

In [15]:
arr.reshape(2, 4)

array([[0, 1, 2, 3],
       [4, 5, 6, 7]])

One of the passed shape dimensions can be -1, in which case the value used for that
dimension will be inferred from the data:

In [16]:
arr = np.arange(15)
arr.reshape(5, -1)

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

ravel does not produce a copy of the underlying values if the values in the result were contiguous in the original array. The flatten method behaves like ravel except it always returns a copy of the data:

In [17]:
arr.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [18]:
arr.flatten()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

### C Versus Fortran Order
Functions like reshape and ravel accept an order argument indicating the order in which to use the data in the array. This is usually set to 'C' or 'F' (there are also the less commonly used options 'A' and 'K'; see the NumPy documentation).


In [19]:
arr = np.arange(12).reshape((3, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [20]:
arr.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [21]:
arr.ravel('F')

array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

There are some convenience functions, like vstack and hstack, for common kinds of
concatenation. For two-dimensional arrays they are equivalent to np.concatenate along axis 0 and axis 1, respectively:

In [22]:
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])

np.vstack((arr1, arr2))

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [23]:
np.hstack((arr1, arr2))

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [24]:
# Split arr row-wise at indices 1 and 3, i.e. into rows [0:1], [1:3] and [3:].
# The arr defined above has only 3 rows, so the last piece (third) is empty.
first, second, third = np.split(arr, [1, 3])

In [25]:
print(first)
print(second)
print(third)

[[0 1 2 3]]
[[ 4  5  6  7]
 [ 8  9 10 11]]
[]


### Stacking helpers: r_ and c_
There are two special objects in the NumPy namespace, r_ and c_, that make stacking arrays more concise:

In [26]:
arr = np.arange(6)
arr1 = arr.reshape((3, 2))
arr2 = np.random.randn(3, 2)

In [27]:
np.r_[arr1, arr2]

array([[ 0.        ,  1.        ],
       [ 2.        ,  3.        ],
       [ 4.        ,  5.        ],
       [ 1.91780533,  1.08668383],
       [ 0.74433499, -0.53483292],
       [ 0.37413022, -0.62047975]])

In [28]:
np.c_[np.r_[arr1, arr2], arr]

array([[ 0.        ,  1.        ,  0.        ],
       [ 2.        ,  3.        ,  1.        ],
       [ 4.        ,  5.        ,  2.        ],
       [ 1.91780533,  1.08668383,  3.        ],
       [ 0.74433499, -0.53483292,  4.        ],
       [ 0.37413022, -0.62047975,  5.        ]])

In [29]:
np.c_[1:6, -10:-5]

array([[  1, -10],
       [  2,  -9],
       [  3,  -8],
       [  4,  -7],
       [  5,  -6]])

### Repeating arrays

In [30]:
arr = np.arange(3)

In [31]:
arr.repeat(3)

array([0, 0, 0, 1, 1, 1, 2, 2, 2])

In [32]:
arr.repeat([2, 3, 4])

array([0, 0, 1, 1, 1, 2, 2, 2, 2])

tile, on the other hand, is a shortcut for stacking copies of an array along an axis.
Visually you can think of it as being akin to “laying down tiles”:

In [33]:
np.tile(arr, 2)

array([0, 1, 2, 0, 1, 2])

In [34]:
np.tile(arr, (3, 2))

array([[0, 1, 2, 0, 1, 2],
       [0, 1, 2, 0, 1, 2],
       [0, 1, 2, 0, 1, 2]])

### Fancy Indexing Equivalents: take and put

In [35]:
arr = np.arange(10) * 100
inds = [7, 1, 3, 9]
arr[inds]

array([700, 100, 300, 900])

There are alternative ndarray methods that are useful in the special case of only making a selection on a single axis:

In [36]:
arr.take(inds)

array([700, 100, 300, 900])

In [37]:
arr.put(inds, 42)

In [38]:
arr

array([  0,  42, 200,  42, 400, 500, 600,  42, 800,  42])

## A.3 Broadcasting

#### The Broadcasting Rule
Two arrays are compatible for broadcasting if for each trailing dimension (i.e., starting from the end) the axis lengths match or if either of the lengths is 1. Broadcasting is then performed over the missing or length 1 dimensions.
<div style="margin-top:20px;">
<img src="examples/broadcasting.png"/>
</div>

In [39]:
arr = np.random.randn(4, 3)
arr.mean(0)

array([-0.3981219 ,  0.82306612, -0.24570856])

In [40]:
demeaned = arr - arr.mean(0)
demeaned

array([[-0.47507337, -0.38178626, -0.56188449],
       [ 0.16142964,  0.96277993,  1.1240475 ],
       [-0.47310422, -1.01651092, -0.88515836],
       [ 0.78674796,  0.43551725,  0.32299535]])

<hr>

In [41]:
arr

array([[-0.87319527,  0.44127985, -0.80759305],
       [-0.23669226,  1.78584605,  0.87833894],
       [-0.87122613, -0.1934448 , -1.13086693],
       [ 0.38862606,  1.25858336,  0.07728678]])

In [44]:
row_means = arr.mean(1)
row_means.shape

(4,)

In [45]:
row_means.reshape((4,1))

array([[-0.41316949],
       [ 0.80916424],
       [-0.73184595],
       [ 0.57483207]])

In [46]:
demeaned = arr - row_means.reshape((4, 1))
demeaned.mean(1)

array([-3.70074342e-17,  1.11022302e-16,  7.40148683e-17, -3.70074342e-17])

<div style="margin-top:20px;">
<img src="examples/broadcasting_over_axis_1_of_a_2D_array.png"/>
</div>

<div style="margin-top:20px;">
<img src="examples/broadcasting_over_axis_0_of_a_3D_array.png"/>
</div>

<div style="margin-top:20px;">
In the three-dimensional case, broadcasting over any of the three dimensions is only a matter of reshaping the data to be shape-compatible. Figure A-7 nicely visualizes the shapes required to broadcast over each axis of a three-dimensional array.
A common problem, therefore, is needing to add a new axis with length 1 specifically for broadcasting purposes. Using reshape is one option, but inserting an axis requires constructing a tuple indicating the new shape. This can often be a tedious exercise. Thus, NumPy arrays offer a special syntax for inserting new axes by indexing. We use the special np.newaxis attribute along with “full” slices to insert the new axis:
<img src="examples/compatible_2d_array_shapes_for_broadcasting_over_a_3D_array.png"/>
</div>

In [53]:
arr = np.zeros((4, 4))

# np.newaxis inserts a new length-1 axis into a numpy.ndarray (multidimensional array)
arr_3d = arr[:, np.newaxis, :]
arr_3d.shape

(4, 1, 4)

In [48]:
arr_1d = np.random.normal(size=3)
arr_1d[:, np.newaxis]

array([[1.08612151],
       [1.14170547],
       [0.38338622]])

In [49]:
arr_1d[np.newaxis, :]

array([[1.08612151, 1.14170547, 0.38338622]])

Thus, if we had a three-dimensional array and wanted to demean axis 2, say, we would need to write:

In [50]:
arr = np.random.randn(3, 4, 5)
depth_means = arr.mean(2)
depth_means

array([[-0.14209901,  0.35895765,  0.01912123, -0.09925586],
       [-0.14481255, -0.54011509, -0.31125746, -0.51140903],
       [-0.53022946,  0.45617579, -0.04534145, -0.03560022]])

In [51]:
depth_means.shape

(3, 4)

In [52]:
demeaned = arr - depth_means[: ,: ,np.newaxis]
demeaned.mean(2)

array([[ 4.44089210e-17,  0.00000000e+00, -2.22044605e-17,
         0.00000000e+00],
       [ 2.49800181e-17,  4.44089210e-17,  0.00000000e+00,
        -2.22044605e-17],
       [-1.11022302e-17,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00]])

You might be wondering if there’s a way to generalize demeaning over an axis without sacrificing performance. There is, but it requires some indexing gymnastics:

In [56]:
def demean_axis(arr, axis=0):
    """Subtract the mean taken along ``axis`` from ``arr``.

    Parameters
    ----------
    arr : numpy.ndarray
        Array to demean.
    axis : int, default 0
        Axis along which the mean is computed and removed.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``arr`` whose mean along ``axis`` is
        zero up to floating-point error.
    """
    means = arr.mean(axis)
    # This generalizes things like [:, :, np.newaxis] to N dimensions:
    # full slices everywhere, with np.newaxis re-inserting the reduced axis.
    indexer = [slice(None)] * arr.ndim
    indexer[axis] = np.newaxis
    # The index must be a tuple: a list of slices is no longer accepted as a
    # multidimensional index by current NumPy releases and raises an error.
    return arr - means[tuple(indexer)]

### Setting Array Values by Broadcasting
The same broadcasting rule governing arithmetic operations also applies to setting values via indexing. 

In [57]:
arr = np.zeros((4, 3))
arr[:] = 5
arr

array([[5., 5., 5.],
       [5., 5., 5.],
       [5., 5., 5.],
       [5., 5., 5.]])

However, if we had a one-dimensional array of values we wanted to set into the columns of the array, we can do that as long as the shape is compatible:

In [59]:
col = np.array([1.28, -0.42, 0.44, 1.6])
arr[:] = col[:, np.newaxis]
arr

array([[ 1.28,  1.28,  1.28],
       [-0.42, -0.42, -0.42],
       [ 0.44,  0.44,  0.44],
       [ 1.6 ,  1.6 ,  1.6 ]])

In [62]:
arr[:2] = [[-1.37], [0.509]]
arr

array([[-1.37 , -1.37 , -1.37 ],
       [ 0.509,  0.509,  0.509],
       [ 0.44 ,  0.44 ,  0.44 ],
       [ 1.6  ,  1.6  ,  1.6  ]])

## A.4 Advanced ufunc Usage

#### ufunc Instance Methods
Each of NumPy’s binary ufuncs has special methods for performing certain kinds of special vectorized operations.

***reduce*** takes a single array and aggregates its values, optionally along an axis, by performing a sequence of binary operations. For example, an alternative way to sum elements in an array is to use np.add.reduce:

In [63]:
arr = np.arange(10)
np.add.reduce(arr)

45

In [64]:
arr.sum()

45

The starting value (0 for add) depends on the ufunc. If an axis is passed, the reduction is performed along that axis. This allows you to answer certain kinds of questions in a concise way. As a less trivial example, we can use np.logical_and to check whether the values in each row of an array are sorted:

In [65]:
np.random.seed(123456) # for reproducibility
arr = np.random.randn(5, 5)
arr[::2].sort(1) # sort a few rows
arr[:, :-1] < arr[:, 1:]

array([[ True,  True,  True,  True],
       [ True, False,  True, False],
       [ True,  True,  True,  True],
       [False,  True, False, False],
       [ True,  True,  True,  True]])

In [68]:
np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)

array([ True, False,  True, False,  True])

**accumulate** is related to reduce like cumsum is related to sum. It produces an array of
the same size with the intermediate “accumulated” values:

In [69]:
arr = np.arange(15).reshape((3, 5))

np.add.accumulate(arr, axis=1)

array([[ 0,  1,  3,  6, 10],
       [ 5, 11, 18, 26, 35],
       [10, 21, 33, 46, 60]])

**outer** performs a pairwise cross-product between two arrays:

In [70]:
arr = np.arange(3).repeat([1, 2, 3])
arr

array([0, 1, 1, 2, 2, 2])

In [71]:
np.multiply.outer(arr, np.arange(5))

array([[0, 0, 0, 0, 0],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 2, 4, 6, 8],
       [0, 2, 4, 6, 8],
       [0, 2, 4, 6, 8]])

The output of **outer** will have a number of dimensions equal to the sum of the numbers of dimensions of the inputs (its shape is the shapes of the two inputs joined together):

In [73]:
x, y = np.random.randn(3, 4), np.random.randn(5)
result = np.subtract.outer(x, y)
result.shape

(3, 4, 5)

In [74]:
result

array([[[ 1.44696051,  0.502831  , -0.1341728 , -0.53718857,
          0.14465854],
        [ 0.69826429, -0.24586523, -0.88286902, -1.2858848 ,
         -0.60403768],
        [ 1.15633904,  0.21220953, -0.42479426, -0.82781004,
         -0.14596292],
        [ 0.8077558 , -0.13637371, -0.7733775 , -1.17639328,
         -0.49454616]],

       [[ 1.16414523,  0.22001572, -0.41698808, -0.82000386,
         -0.13815674],
        [ 0.24723814, -0.69689137, -1.33389517, -1.73691094,
         -1.05506382],
        [ 2.0660161 ,  1.12188659,  0.48488279,  0.08186702,
          0.76371413],
        [ 1.97554282,  1.03141331,  0.39440951, -0.00860626,
          0.67324086]],

       [[-0.03611298, -0.9802425 , -1.61724629, -2.02026207,
         -1.33841495],
        [ 3.73594474,  2.79181523,  2.15481144,  1.75179566,
          2.43364278],
        [ 2.60155478,  1.65742527,  1.02042148,  0.6174057 ,
          1.29925282],
        [ 2.51060765,  1.56647813,  0.92947434,  0.52645856,
          1