# Appendix A: Advanced NumPy

In [5]:
import numpy as np
import pandas as pd

## A.1 ndarray Object Internals

In [6]:
array = np.arange(1, 100)

In [7]:
array[::-1]

array([99, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 87, 86, 85, 84, 83,
       82, 81, 80, 79, 78, 77, 76, 75, 74, 73, 72, 71, 70, 69, 68, 67, 66,
       65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
       48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32,
       31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
       14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1])

In [8]:
arr = np.arange(12).reshape((3, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [9]:
np.ones((3, 4, 5), dtype=np.float64).strides

(160, 40, 8)

In [10]:
ints = np.ones(10, dtype=np.uint16)
floats = np.ones(10, dtype=np.float32)

In [11]:
np.issubdtype(ints.dtype, np.integer)

True

In [12]:
np.issubdtype(floats.dtype, np.floating)

True

In [13]:
np.float64.mro()

[numpy.float64,
 numpy.floating,
 numpy.inexact,
 numpy.number,
 numpy.generic,
 float,
 object]

## A.2 Advanced Array Manipulation

In [14]:
arr = np.arange(8)
arr

array([0, 1, 2, 3, 4, 5, 6, 7])

In [15]:
arr.reshape(2, 4)

array([[0, 1, 2, 3],
       [4, 5, 6, 7]])

One of the passed shape dimensions can be -1, in which case the value used for that
dimension will be inferred from the data:

In [16]:
arr = np.arange(15)
arr.reshape(5, -1)

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

ravel does not produce a copy of the underlying values if the values in the result were contiguous in the original array. The flatten method behaves like ravel except it always returns a copy of the data:

In [17]:
arr.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [18]:
arr.flatten()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

### C Versus Fortran Order
Functions like reshape and ravel accept an order argument indicating the order in which to use the data in the array. This is usually set to 'C' or 'F' (there are also the less commonly used options 'A' and 'K'; see the NumPy documentation).


In [19]:
arr = np.arange(12).reshape((3, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [20]:
arr.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [21]:
arr.ravel('F')

array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

There are some convenience functions, like vstack and hstack, for common kinds of
concatenation. For two-dimensional arrays, `np.concatenate([arr1, arr2], axis=0)` and `np.concatenate([arr1, arr2], axis=1)` can be expressed more concisely as:

In [22]:
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])

np.vstack((arr1, arr2))

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [23]:
np.hstack((arr1, arr2))

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [24]:
first, second, third = np.split(arr, [1, 3])

In [25]:
print(first)
print(second)
print(third)

[[0 1 2 3]]
[[ 4  5  6  7]
 [ 8  9 10 11]]
[]


### Stacking helpers: r_ and c_
There are two special objects in the NumPy namespace, r_ and c_, that make stacking arrays more concise:

In [26]:
arr = np.arange(6)
arr1 = arr.reshape((3, 2))
arr2 = np.random.randn(3, 2)

In [27]:
np.r_[arr1, arr2]

array([[ 0.        ,  1.        ],
       [ 2.        ,  3.        ],
       [ 4.        ,  5.        ],
       [ 1.91780533,  1.08668383],
       [ 0.74433499, -0.53483292],
       [ 0.37413022, -0.62047975]])

In [28]:
np.c_[np.r_[arr1, arr2], arr]

array([[ 0.        ,  1.        ,  0.        ],
       [ 2.        ,  3.        ,  1.        ],
       [ 4.        ,  5.        ,  2.        ],
       [ 1.91780533,  1.08668383,  3.        ],
       [ 0.74433499, -0.53483292,  4.        ],
       [ 0.37413022, -0.62047975,  5.        ]])

In [29]:
np.c_[1:6, -10:-5]

array([[  1, -10],
       [  2,  -9],
       [  3,  -8],
       [  4,  -7],
       [  5,  -6]])

### Repeating arrays

In [30]:
arr = np.arange(3)

In [31]:
arr.repeat(3)

array([0, 0, 0, 1, 1, 1, 2, 2, 2])

In [32]:
arr.repeat([2, 3, 4])

array([0, 0, 1, 1, 1, 2, 2, 2, 2])

tile, on the other hand, is a shortcut for stacking copies of an array along an axis.
Visually you can think of it as being akin to “laying down tiles”:

In [33]:
np.tile(arr, 2)

array([0, 1, 2, 0, 1, 2])

In [34]:
np.tile(arr, (3, 2))

array([[0, 1, 2, 0, 1, 2],
       [0, 1, 2, 0, 1, 2],
       [0, 1, 2, 0, 1, 2]])

### Fancy Indexing Equivalents: take and put

In [35]:
arr = np.arange(10) * 100
inds = [7, 1, 3, 9]
arr[inds]

array([700, 100, 300, 900])

There are alternative ndarray methods that are useful in the special case of only making a selection on a single axis:

In [36]:
arr.take(inds)

array([700, 100, 300, 900])

In [37]:
arr.put(inds, 42)

In [38]:
arr

array([  0,  42, 200,  42, 400, 500, 600,  42, 800,  42])

## A.3 Broadcasting

#### The Broadcasting Rule
Two arrays are compatible for broadcasting if for each trailing dimension (i.e., starting from the end) the axis lengths match or if either of the lengths is 1. Broadcasting is then performed over the missing or length 1 dimensions.
<div style="margin-top:20px;">
<img src="examples/broadcasting.png"/>
</div>

In [39]:
arr = np.random.randn(4, 3)
arr.mean(0)

array([-0.3981219 ,  0.82306612, -0.24570856])

In [40]:
demeaned = arr - arr.mean(0)
demeaned

array([[-0.47507337, -0.38178626, -0.56188449],
       [ 0.16142964,  0.96277993,  1.1240475 ],
       [-0.47310422, -1.01651092, -0.88515836],
       [ 0.78674796,  0.43551725,  0.32299535]])

<hr>

In [41]:
arr

array([[-0.87319527,  0.44127985, -0.80759305],
       [-0.23669226,  1.78584605,  0.87833894],
       [-0.87122613, -0.1934448 , -1.13086693],
       [ 0.38862606,  1.25858336,  0.07728678]])

In [44]:
row_means = arr.mean(1)
row_means.shape

(4,)

In [45]:
row_means.reshape((4,1))

array([[-0.41316949],
       [ 0.80916424],
       [-0.73184595],
       [ 0.57483207]])

In [46]:
demeaned = arr - row_means.reshape((4, 1))
demeaned.mean(1)

array([-3.70074342e-17,  1.11022302e-16,  7.40148683e-17, -3.70074342e-17])

<div style="margin-top:20px;">
<img src="examples/broadcasting_over_axis_1_of_a_2D_array.png"/>
</div>

<div style="margin-top:20px;">
<img src="examples/broadcasting_over_axis_0_of_a_3D_array.png"/>
</div>

<div style="margin-top:20px;">
In the three-dimensional case, broadcasting over any of the three dimensions is only a matter of reshaping the data to be shape-compatible. Figure A-7 nicely visualizes the shapes required to broadcast over each axis of a three-dimensional array.
A common problem, therefore, is needing to add a new axis with length 1 specifically for broadcasting purposes. Using reshape is one option, but inserting an axis requires constructing a tuple indicating the new shape. This can often be a tedious exercise. Thus, NumPy arrays offer a special syntax for inserting new axes by indexing. We use the special np.newaxis attribute along with “full” slices to insert the new axis:
<img src="examples/compatible_2d_array_shapes_for_broadcasting_over_a_3D_array.png"/>
</div>

In [53]:
arr = np.zeros((4, 4))

# np.newaxis inserts a new length-1 axis into a numpy.ndarray (multidimensional array)
arr_3d = arr[:, np.newaxis, :]
arr_3d.shape

(4, 1, 4)

In [48]:
arr_1d = np.random.normal(size=3)
arr_1d[:, np.newaxis]

array([[1.08612151],
       [1.14170547],
       [0.38338622]])

In [49]:
arr_1d[np.newaxis, :]

array([[1.08612151, 1.14170547, 0.38338622]])

Thus, if we had a three-dimensional array and wanted to demean axis 2, say, we would need to write:

In [50]:
arr = np.random.randn(3, 4, 5)
depth_means = arr.mean(2)
depth_means

array([[-0.14209901,  0.35895765,  0.01912123, -0.09925586],
       [-0.14481255, -0.54011509, -0.31125746, -0.51140903],
       [-0.53022946,  0.45617579, -0.04534145, -0.03560022]])

In [51]:
depth_means.shape

(3, 4)

In [52]:
demeaned = arr - depth_means[: ,: ,np.newaxis]
demeaned.mean(2)

array([[ 4.44089210e-17,  0.00000000e+00, -2.22044605e-17,
         0.00000000e+00],
       [ 2.49800181e-17,  4.44089210e-17,  0.00000000e+00,
        -2.22044605e-17],
       [-1.11022302e-17,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00]])

You might be wondering if there’s a way to generalize demeaning over an axis without sacrificing performance. There is, but it requires some indexing gymnastics:

In [56]:
def demean_axis(arr, axis=0):
    """Subtract from ``arr`` its mean taken along ``axis``.

    The means lose the reduced axis, so a length-1 axis is re-inserted at
    that position to make them broadcast against ``arr``.

    Parameters
    ----------
    arr : ndarray
        Array to demean.
    axis : int, default 0
        Axis along which the mean is computed and removed.

    Returns
    -------
    ndarray
        Array with the same shape as ``arr`` whose mean along ``axis`` is
        zero up to floating-point error.
    """
    means = arr.mean(axis)
    # This generalizes things like [:, :, np.newaxis] to N dimensions
    indexer = [slice(None)] * arr.ndim
    indexer[axis] = np.newaxis
    # Multidimensional indexing needs a tuple: a list of slices/None is
    # treated as a fancy index and raises IndexError on modern NumPy.
    return arr - means[tuple(indexer)]

### Setting Array Values by Broadcasting
The same broadcasting rule governing arithmetic operations also applies to setting values via indexing. 

In [57]:
arr = np.zeros((4, 3))
arr[:] = 5
arr

array([[5., 5., 5.],
       [5., 5., 5.],
       [5., 5., 5.],
       [5., 5., 5.]])

However, if we had a one-dimensional array of values we wanted to set into the columns of the array, we can do that as long as the shape is compatible:

In [59]:
col = np.array([1.28, -0.42, 0.44, 1.6])
arr[:] = col[:, np.newaxis]
arr

array([[ 1.28,  1.28,  1.28],
       [-0.42, -0.42, -0.42],
       [ 0.44,  0.44,  0.44],
       [ 1.6 ,  1.6 ,  1.6 ]])

In [62]:
arr[:2] = [[-1.37], [0.509]]
arr

array([[-1.37 , -1.37 , -1.37 ],
       [ 0.509,  0.509,  0.509],
       [ 0.44 ,  0.44 ,  0.44 ],
       [ 1.6  ,  1.6  ,  1.6  ]])

## A.4 Advanced ufunc Usage

#### ufunc Instance Methods
Each of NumPy’s binary ufuncs has special methods for performing certain kinds of special vectorized operations

***reduce*** takes a single array and aggregates its values, optionally along an axis, by performing a sequence of binary operations. For example, an alternative way to sum elements in an array is to use np.add.reduce:

In [63]:
arr = np.arange(10)
np.add.reduce(arr)

45

In [64]:
arr.sum()

45

The starting value (0 for add) depends on the ufunc. If an axis is passed, the reduction is performed along that axis. This allows you to answer certain kinds of questions in a concise way. As a less trivial example, we can use np.logical_and to check whether the values in each row of an array are sorted:

In [65]:
np.random.seed(123456) # for reproducibility
arr = np.random.randn(5, 5)
arr[::2].sort(1) # sort a few rows
arr[:, :-1] < arr[:, 1:]

array([[ True,  True,  True,  True],
       [ True, False,  True, False],
       [ True,  True,  True,  True],
       [False,  True, False, False],
       [ True,  True,  True,  True]])

In [68]:
np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)

array([ True, False,  True, False,  True])

**accumulate** is related to reduce like cumsum is related to sum. It produces an array of
the same size with the intermediate “accumulated” values:

In [69]:
arr = np.arange(15).reshape((3, 5))

np.add.accumulate(arr, axis=1)

array([[ 0,  1,  3,  6, 10],
       [ 5, 11, 18, 26, 35],
       [10, 21, 33, 46, 60]])

**outer** performs a pairwise cross-product between two arrays:

In [70]:
arr = np.arange(3).repeat([1, 2, 3])
arr

array([0, 1, 1, 2, 2, 2])

In [71]:
np.multiply.outer(arr, np.arange(5))

array([[0, 0, 0, 0, 0],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 2, 4, 6, 8],
       [0, 2, 4, 6, 8],
       [0, 2, 4, 6, 8]])

The output of **outer** will have a number of dimensions equal to the sum of the numbers of dimensions of the inputs:

In [73]:
x, y = np.random.randn(3, 4), np.random.randn(5)
result = np.subtract.outer(x, y)
result.shape

(3, 4, 5)

In [74]:
result

array([[[ 1.44696051,  0.502831  , -0.1341728 , -0.53718857,
          0.14465854],
        [ 0.69826429, -0.24586523, -0.88286902, -1.2858848 ,
         -0.60403768],
        [ 1.15633904,  0.21220953, -0.42479426, -0.82781004,
         -0.14596292],
        [ 0.8077558 , -0.13637371, -0.7733775 , -1.17639328,
         -0.49454616]],

       [[ 1.16414523,  0.22001572, -0.41698808, -0.82000386,
         -0.13815674],
        [ 0.24723814, -0.69689137, -1.33389517, -1.73691094,
         -1.05506382],
        [ 2.0660161 ,  1.12188659,  0.48488279,  0.08186702,
          0.76371413],
        [ 1.97554282,  1.03141331,  0.39440951, -0.00860626,
          0.67324086]],

       [[-0.03611298, -0.9802425 , -1.61724629, -2.02026207,
         -1.33841495],
        [ 3.73594474,  2.79181523,  2.15481144,  1.75179566,
          2.43364278],
        [ 2.60155478,  1.65742527,  1.02042148,  0.6174057 ,
          1.29925282],
        [ 2.51060765,  1.56647813,  0.92947434,  0.52645856,
          1

The last method, **reduceat**, performs a “local reduce,” in essence an array groupby operation in which slices of the array are aggregated together. It accepts a sequence of “bin edges” that indicate how to split and aggregate the values:

In [77]:
arr = np.arange(10)
np.add.reduceat(arr, [0, 5, 8])

array([10, 18, 17])

#### Writing New ufuncs in Python
**numpy.frompyfunc** accepts a Python function along with a specification for the number of inputs and outputs. For example, a simple function that adds element-wise would be specified as:

In [87]:
def add_elements(x, y):
    """Return the sum of the two operands ``x`` and ``y``."""
    total = x + y
    return total

In [88]:
add_them = np.frompyfunc(add_elements, 2, 1)
add_them(np.arange(8), np.arange(8))

array([0, 2, 4, 6, 8, 10, 12, 14], dtype=object)

Functions created using frompyfunc always return arrays of Python objects, which can be inconvenient. Fortunately, there is an alternative (but slightly less featureful) function, numpy.vectorize, that allows you to specify the output type:

In [89]:
add_them = np.vectorize(add_elements, otypes=[np.float64])
add_them(np.arange(8), np.arange(8))

array([ 0.,  2.,  4.,  6.,  8., 10., 12., 14.])

These functions provide a way to create ufunc-like functions, but they are very slow because they require a Python function call to compute each element, which is a lot slower than NumPy’s C-based ufunc loops:

In [90]:
arr = np.random.randn(1000)

In [91]:
%timeit add_them(arr, arr)

340 µs ± 13.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [92]:
%timeit np.add(arr, arr)

1.39 µs ± 97.9 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


## A.5 Structured and Record Arrays
You may have noticed up until now that ndarray is a homogeneous data container; that is, it represents a block of memory in which each element takes up the same number of bytes, determined by the dtype. On the surface, this would appear to not allow you to represent heterogeneous or tabular-like data. A structured array is an ndarray in which each element can be thought of as representing a struct in C (hence the “structured” name) or a row in a SQL table with multiple named fields:

In [93]:
dtype = [('x', np.float64), ('y', np.int32)]
sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)
sarr

array([(1.5       ,  6), (3.14159265, -2)],
      dtype=[('x', '<f8'), ('y', '<i4')])

In [94]:
sarr[0]

(1.5, 6)

In [95]:
sarr[0]['y']

6

In [96]:
sarr['x']

array([1.5       , 3.14159265])

#### Nested dtypes and Multidimensional Fields
When specifying a structured dtype, you can additionally pass a **shape** (as an int or tuple):

In [98]:
dtype = [('x', np.int64, 3), ('y', np.int32)]
arr = np.zeros(4, dtype=dtype)

# In this case, the x field now refers to an array of length 3 for each record:
arr

array([([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0)],
      dtype=[('x', '<i8', (3,)), ('y', '<i4')])

This enables you to express more complicated, nested structures as a single block of memory in an array. You can also nest dtypes to make more complex structures. Here is an example:

In [99]:
dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]
data = np.array([((1, 2), 5), ((3, 4), 6)], dtype=dtype)

In [100]:
data['x']

array([(1., 2.), (3., 4.)], dtype=[('a', '<f8'), ('b', '<f4')])

In [101]:
data['y']

array([5, 6], dtype=int32)

In [102]:
data['x']['a']

array([1., 3.])

## A.6 More About Sorting
Like Python’s built-in list, the ndarray sort instance method is an in-place sort, meaning that the array contents are rearranged without producing a new array:

In [103]:
arr = np.random.randn(6)
arr.sort()
arr

array([-0.45376078, -0.44029844, -0.16796669, -0.01522018,  0.79142578,
        1.42015508])

When sorting arrays in-place, remember that if the array is a view on a different ndarray, the original array will be modified:

In [104]:
arr = np.random.randn(3, 5)
arr

array([[ 0.8957934 , -0.19616553,  0.38073321, -0.27587428,  1.40080981],
       [-0.35647001,  0.79726805, -1.64304137,  1.04591122,  0.92451513],
       [ 1.55369306,  0.39502294, -0.00709019, -1.6708305 , -0.48200159]])

In [105]:
arr[:, 0].sort()  # Sort first column values in-place
arr

array([[-0.35647001, -0.19616553,  0.38073321, -0.27587428,  1.40080981],
       [ 0.8957934 ,  0.79726805, -1.64304137,  1.04591122,  0.92451513],
       [ 1.55369306,  0.39502294, -0.00709019, -1.6708305 , -0.48200159]])

On the other hand, numpy.sort creates a new, sorted copy of an array. Otherwise, it accepts the same arguments (such as kind) as ndarray.sort:

In [106]:
arr = np.random.randn(5)
arr

array([-0.33561284,  0.94898468,  0.59999679, -0.33702297, -0.59535931])

In [107]:
np.sort(arr)

array([-0.59535931, -0.33702297, -0.33561284,  0.59999679,  0.94898468])

In [108]:
arr

array([-0.33561284,  0.94898468,  0.59999679, -0.33702297, -0.59535931])

All of these sort methods take an axis argument for sorting the sections of data along the passed axis independently:

In [109]:
arr = np.random.randn(3, 5)
arr

array([[-0.87533803,  1.22250308,  0.8731084 , -0.58981007,  2.2619356 ],
       [-0.38831248, -0.1201323 ,  0.45218062, -0.30334351,  0.64837786],
       [-1.00423862,  1.19794563,  0.76554348, -0.42380615,  1.04157099]])

In [110]:
arr.sort(axis=1)
arr

array([[-0.87533803, -0.58981007,  0.8731084 ,  1.22250308,  2.2619356 ],
       [-0.38831248, -0.30334351, -0.1201323 ,  0.45218062,  0.64837786],
       [-1.00423862, -0.42380615,  0.76554348,  1.04157099,  1.19794563]])

You may notice that none of the sort methods have an option to sort in descending order. This is not a problem in practice because array slicing produces views, thus not producing a copy or requiring any computational work. Many Python users are familiar with the “trick” that for a list values, values[::-1] returns a list in reverse order. The same is true for ndarrays:

In [111]:
arr[:, ::-1]

array([[ 2.2619356 ,  1.22250308,  0.8731084 , -0.58981007, -0.87533803],
       [ 0.64837786,  0.45218062, -0.1201323 , -0.30334351, -0.38831248],
       [ 1.19794563,  1.04157099,  0.76554348, -0.42380615, -1.00423862]])

#### Indirect Sorts: argsort and lexsort
In data analysis you may need to reorder datasets by one or more keys. For example, a table of data about some students might need to be sorted by last name, then by first name. This is an example of an indirect sort, and if you’ve read the pandas-related chapters you have already seen many higher-level examples. Given a key or keys (an array of values or multiple arrays of values), you wish to obtain an array of integer indices (I refer to them colloquially as indexers) that tells you how to reorder the data to be in sorted order. Two methods for this are argsort and numpy.lexsort. As an example:

In [112]:
values = np.array([5, 0, 1, 3, 2])
indexer = values.argsort()

In [113]:
indexer

array([1, 2, 4, 3, 0])

In [114]:
values[indexer]

array([0, 1, 2, 3, 5])

As a more complicated example, this code reorders a two-dimensional array by its first row:

In [115]:
arr = np.random.randn(3, 5)
arr[0] = values
arr

array([[ 5.        ,  0.        ,  1.        ,  3.        ,  2.        ],
       [ 0.82074955, -1.25653022, -0.642246  ,  0.23689196, -0.46686749],
       [-1.38265331, -0.36942223,  0.85864385, -0.85123567,  1.0580063 ]])

In [116]:
arr[:, arr[0].argsort()]

array([[ 0.        ,  1.        ,  2.        ,  3.        ,  5.        ],
       [-1.25653022, -0.642246  , -0.46686749,  0.23689196,  0.82074955],
       [-0.36942223,  0.85864385,  1.0580063 , -0.85123567, -1.38265331]])

lexsort is similar to argsort, but it performs an indirect lexicographical sort on multiple key arrays. Suppose we wanted to sort some data identified by first and last names:

In [117]:
first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])
last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])

In [120]:
sorter = np.lexsort((first_name, last_name))
sorter

array([1, 2, 3, 0, 4])

In [126]:
zip_object = zip(last_name[sorter], first_name[sorter])
zip_object

<zip at 0x10e5bf5c8>

In [127]:
list(zip_object)

[('Arnold', 'Jane'),
 ('Arnold', 'Steve'),
 ('Jones', 'Bill'),
 ('Jones', 'Bob'),
 ('Walters', 'Barbara')]

lexsort can be a bit confusing the first time you use it because the order in which the keys are used to order the data starts with the last array passed. Here, last_name was used before first_name.

#### Alternative Sort Algorithms
A stable sorting algorithm preserves the relative position of equal elements. This can be especially important in indirect sorts where the relative ordering is meaningful:

In [128]:
values = np.array(['2:first', '2:second', '1:first', '1:second', '1:third'])
key = np.array([2, 2, 1, 1, 1])
indexer = key.argsort(kind='mergesort')
indexer

array([2, 3, 4, 0, 1])

In [130]:
values.take(indexer)

array(['1:first', '1:second', '1:third', '2:first', '2:second'],
      dtype='<U8')

The only stable sort available is mergesort, which has guaranteed O(n log n) performance (for complexity buffs), but its performance is on average worse than the default quicksort method.

#### Partially Sorting Arrays
One of the goals of sorting can be to determine the largest or smallest elements in an array. NumPy has optimized methods, numpy.partition and np.argpartition, for partitioning an array around the k-th smallest element:

In [131]:
np.random.seed(12345)
arr = np.random.randn(20)
arr

array([-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057,
        1.39340583,  0.09290788,  0.28174615,  0.76902257,  1.24643474,
        1.00718936, -1.29622111,  0.27499163,  0.22891288,  1.35291684,
        0.88642934, -2.00163731, -0.37184254,  1.66902531, -0.43856974])

In [132]:
np.partition(arr, 3)

array([-2.00163731, -1.29622111, -0.5557303 , -0.51943872, -0.37184254,
       -0.43856974, -0.20470766,  0.28174615,  0.76902257,  0.47894334,
        1.00718936,  0.09290788,  0.27499163,  0.22891288,  1.35291684,
        0.88642934,  1.39340583,  1.96578057,  1.66902531,  1.24643474])

After you call **partition(arr, 3)**, the first three elements in the result are the smallest three values in no particular order. **numpy.argpartition**, similar to **numpy.argsort**, returns the indices that rearrange the data into the equivalent order:

In [133]:
indices = np.argpartition(arr, 3)
indices

array([16, 11,  3,  2, 17, 19,  0,  7,  8,  1, 10,  6, 12, 13, 14, 15,  5,
        4, 18,  9])

In [134]:
arr.take(indices)

array([-2.00163731, -1.29622111, -0.5557303 , -0.51943872, -0.37184254,
       -0.43856974, -0.20470766,  0.28174615,  0.76902257,  0.47894334,
        1.00718936,  0.09290788,  0.27499163,  0.22891288,  1.35291684,
        0.88642934,  1.39340583,  1.96578057,  1.66902531,  1.24643474])

#### numpy.searchsorted: Finding Elements in a Sorted Array
searchsorted is an array method that performs a binary search on a sorted array, returning the location in the array where the value would need to be inserted to maintain sortedness:

In [135]:
arr = np.array([0, 1, 7, 12, 15])
arr.searchsorted(9)

3

You can also pass an array of values to get an array of indices back:

In [136]:
arr.searchsorted([0, 8, 11, 16])

array([0, 3, 3, 5])

You might have noticed that searchsorted returned 0 for the 0 element. This is because the default behavior is to return the index at the left side of a group of equal values:

In [137]:
arr = np.array([0, 0, 0, 1, 1, 1, 1])
arr.searchsorted([0, 1])

array([0, 3])

In [138]:
arr.searchsorted([0, 1], side='right')

array([3, 7])

As another application of searchsorted, suppose we had an array of values between 0 and 10,000, and a separate array of “bucket edges” that we wanted to use to bin the data:

In [139]:
data = np.floor(np.random.uniform(0, 10000, size=50))
bins = np.array([0, 100, 1000, 5000, 10000])
data

array([9940., 6768., 7908., 1709.,  268., 8003., 9037.,  246., 4917.,
       5262., 5963.,  519., 8950., 7282., 8183., 5002., 8101.,  959.,
       2189., 2587., 4681., 4593., 7095., 1780., 5314., 1677., 7688.,
       9281., 6094., 1501., 4896., 3773., 8486., 9110., 3838., 3154.,
       5683., 1878., 1258., 6875., 7996., 5735., 9732., 6340., 8884.,
       4954., 3516., 7142., 5039., 2256.])

To then get a labeling of which interval each data point belongs to (where 1 would mean the bucket (0, 100], because searchsorted uses side='left' by default), we can simply use searchsorted:

In [140]:
labels = bins.searchsorted(data)
labels

array([4, 4, 4, 3, 2, 4, 4, 2, 3, 4, 4, 2, 4, 4, 4, 4, 4, 2, 3, 3, 3, 3,
       4, 3, 4, 3, 4, 4, 4, 3, 3, 3, 4, 4, 3, 3, 4, 3, 3, 4, 4, 4, 4, 4,
       4, 3, 3, 4, 4, 3])

This, combined with pandas’s groupby, can be used to bin data:

In [142]:
pd.Series(data).groupby(labels).mean()

2     498.000000
3    3064.277778
4    7389.035714
dtype: float64

## A.7 Writing Fast NumPy Functions with Numba
Numba is an open source project that creates fast functions for NumPy-like data using CPUs, GPUs, or other hardware. It uses the LLVM Project to translate Python code into compiled machine code.

To introduce Numba, let’s consider a pure Python function that computes the expression (x - y).mean() using a for loop:

In [143]:
def mean_distance(x, y):
    """Return the mean of ``x[i] - y[i]`` using an explicit Python loop.

    Iterates over the first ``len(x)`` positions, accumulating the
    differences in order and dividing by the number of positions visited.
    """
    total = 0.0
    seen = 0
    for idx in range(len(x)):
        diff = x[idx] - y[idx]
        total += diff
        seen += 1
    return total / seen

In [144]:
x = np.random.randn(10000000)
y = np.random.randn(10000000)

In [145]:
%timeit mean_distance(x, y)

5.43 s ± 119 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [146]:
%timeit (x - y).mean()

48.8 ms ± 1.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


The NumPy version is over 100 times faster. We can turn this function into a compiled Numba function using the **numba.jit** function:

In [147]:
import numba as nb
numba_mean_distance = nb.jit(mean_distance)

In [148]:
@nb.jit
def mean_distance(x, y):
    """Numba-compiled loop returning the mean of ``x[i] - y[i]`` over ``len(x)`` items."""
    total, seen = 0.0, 0
    for idx in range(len(x)):
        total += x[idx] - y[idx]
        seen += 1
    return total / seen

In [149]:
%timeit numba_mean_distance(x, y)

16.2 ms ± 1.83 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


Numba cannot compile arbitrary Python code, but it supports a significant subset of pure Python that is most useful for writing numerical algorithms.

Numba is a deep library, supporting different kinds of hardware, modes of compilation, and user extensions. It is also able to compile a substantial subset of the NumPy Python API without explicit for loops. Numba is able to recognize constructs that can be compiled to machine code, while substituting calls to the CPython API for functions that it does not know how to compile. Numba’s jit function has an option, nopython=True, which restricts allowed code to Python code that can be compiled to **LLVM** without any *Python C API calls*. jit(nopython=True) has a shorter alias **numba.njit**.

In [150]:
# From previous example
from numba import float64, njit

@njit(float64(float64[:], float64[:]))
def mean_distance(x, y):
    """Mean of the element-wise difference of two 1-D float64 arrays."""
    diff = x - y
    return diff.mean()

#### Creating Custom numpy.ufunc Objects with Numba
The numba.vectorize function creates compiled NumPy ufuncs, which behave like built-in ufuncs. Let’s consider a Python implementation of numpy.add:

In [151]:
from numba import vectorize

@vectorize
def nb_add(x, y):
    """Add two scalars; ``numba.vectorize`` turns this into a compiled ufunc."""
    total = x + y
    return total

In [152]:
x = np.arange(10)

In [155]:
nb_add(x, x)

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18])

In [156]:
nb_add.accumulate(x, 0)

array([ 0,  1,  3,  6, 10, 15, 21, 28, 36, 45])

## A.8 Advanced Array Input and Output
In Chapter 4, we became acquainted with np.save and np.load for storing arrays in binary format on disk. There are a number of additional options to consider for more sophisticated use. In particular, memory maps have the additional benefit of enabling you to work with datasets that do not fit into RAM.

#### Memory-Mapped Files
A **memory-mapped** file is a method for interacting with binary data on disk as though it is stored in an in-memory array. NumPy implements a **memmap** object that is ndarray-like, enabling small segments of a large file to be read and written without reading the whole array into memory. Additionally, a memmap has the same methods as an in-memory array and thus can be substituted into many algorithms where an ndarray would be expected.

In [161]:
mmap = np.memmap('mymmap', dtype='float64', mode='w+', shape=(10000, 10000))
mmap.shape, mmap

((10000, 10000), memmap([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]))

In [162]:
# Slicing a memmap returns views on the data on disk:
section = mmap[:5]

If you assign data to these, it will be buffered in memory (like a Python file object), but you can write it to disk by calling **flush**:

In [163]:
section[:] = np.random.randn(5, 10000)
mmap.flush()
mmap

memmap([[ 1.37140985,  0.93127837,  0.60573747, ..., -0.62115557,
         -0.46780136,  0.47874865],
        [ 0.42296545,  0.83060431,  0.69976547, ...,  1.28831447,
          0.58858679, -1.42755372],
        [ 2.16005954, -1.24616489,  2.44470054, ...,  0.86866129,
          0.28019716,  2.13008671],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]])

In [164]:
# Delete mmap allocated memory
del mmap

Whenever a memory map falls out of scope and is garbage-collected, any changes will be flushed to disk also. When opening an existing memory map, you still have to specify the dtype and shape, as the file is only a block of binary data with no metadata on disk:

In [165]:
mmap = np.memmap('mymmap', dtype='float64', shape=(10000, 10000))
mmap

memmap([[ 1.37140985,  0.93127837,  0.60573747, ..., -0.62115557,
         -0.46780136,  0.47874865],
        [ 0.42296545,  0.83060431,  0.69976547, ...,  1.28831447,
          0.58858679, -1.42755372],
        [ 2.16005954, -1.24616489,  2.44470054, ...,  0.86866129,
          0.28019716,  2.13008671],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]])

## A.9 Performance Tips
* Convert Python loops and conditional logic to array operations and boolean array operations
* Use broadcasting whenever possible
* Use array views (slicing) to avoid copying data
* Utilize ufuncs and ufunc methods

#### The Importance of Contiguous Memory
While the full extent of this topic is a bit outside the scope of this book, in some applications the memory layout of an array can significantly affect the speed of computations. This is based partly on performance differences having to do with the cache hierarchy of the CPU; operations accessing contiguous blocks of memory (e.g., summing the rows of a C order array) will generally be the fastest because the memory subsystem will buffer the appropriate blocks of memory into the ultrafast L1 or L2 CPU cache. Also, certain code paths inside NumPy’s C codebase have been optimized for the contiguous case in which generic strided memory access can be avoided.

<hr>