## ndarray内部机理

In [1]:
np.ones((10,5)).shape

(10, 5)

In [2]:
np.ones((3,4,5),dtype=np.float64).strides

(160, 40, 8)

### Numpy数据类型体系

In [3]:
ints = np.ones(10,dtype=np.uint16)

floats = np.ones(10,dtype=np.float32)

np.issubdtype(ints.dtype,np.integer)

True

In [6]:
np.issubdtype(floats.dtype,np.floating)

True

In [8]:
np.float64.mro()  #查看所有的父类

[numpy.float64,
 numpy.floating,
 numpy.inexact,
 numpy.number,
 numpy.generic,
 float,
 object]

In [9]:
np.issubdtype(ints.dtype,np.number)

True

## 高级数组操作

In [10]:
arr = np.arange(8)

In [11]:
arr

array([0, 1, 2, 3, 4, 5, 6, 7])

In [12]:
arr.reshape((4,2))

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

In [2]:
arr = np.arange(15)
arr.reshape((5,-1))

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [3]:
arr = np.arange(15).reshape((5,3))
arr

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [4]:
arr.ravel()  #散开 raveling

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [5]:
arr

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [10]:
arr.flatten()  

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [7]:
arr

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

### C和Fortran顺序

In [11]:
arr = np.arange(12).reshape((3,4))

In [12]:
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [13]:
arr.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [14]:
arr.ravel('F')

array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

In [15]:
#C/行优先顺序：先经过更高的维度（例如，轴1会先于轴0被处理）
#Fortran/列优先顺序：后经过更高的维度（例如，轴0会先于轴1被处理）

### 数组的合并和拆分

In [17]:
arr1 = np.array([[1,2,3],[4,5,6]])
arr2 = np.array([[7,8,9],[10,11,12]])

np.concatenate([arr1,arr2],axis=0)

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [18]:
np.concatenate([arr1,arr2],axis=1)

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [19]:
np.vstack((arr1,arr2))

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [20]:
np.hstack((arr1,arr2))

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [21]:
arr = np.random.randn(5,2)
arr

array([[ 2.22570979,  2.17152384],
       [-0.74264074, -0.85237851],
       [ 1.33701901,  0.66299729],
       [ 1.17827829, -1.08058035],
       [ 0.12047471, -1.63129297]])

In [22]:
first,second,third = np.split(arr,[1,3])   #[1,3]指示在哪个索引处分割数组

In [23]:
first

array([[2.22570979, 2.17152384]])

In [24]:
second

array([[-0.74264074, -0.85237851],
       [ 1.33701901,  0.66299729]])

In [25]:
third

array([[ 1.17827829, -1.08058035],
       [ 0.12047471, -1.63129297]])

### 堆叠辅助类：r_和c_

In [26]:
arr = np.arange(6)

In [27]:
arr1 = arr.reshape((3,2))

In [28]:
arr2= np.random.randn(3,2)

In [29]:
arr1

array([[0, 1],
       [2, 3],
       [4, 5]])

In [30]:
arr2

array([[-1.79803088,  1.17294027],
       [ 0.30164528,  0.87094523],
       [-1.0783696 , -0.6636792 ]])

In [31]:
np.r_[arr1,arr2]

array([[ 0.        ,  1.        ],
       [ 2.        ,  3.        ],
       [ 4.        ,  5.        ],
       [-1.79803088,  1.17294027],
       [ 0.30164528,  0.87094523],
       [-1.0783696 , -0.6636792 ]])

In [32]:
np.c_[np.r_[arr1,arr2],arr]

array([[ 0.        ,  1.        ,  0.        ],
       [ 2.        ,  3.        ,  1.        ],
       [ 4.        ,  5.        ,  2.        ],
       [-1.79803088,  1.17294027,  3.        ],
       [ 0.30164528,  0.87094523,  4.        ],
       [-1.0783696 , -0.6636792 ,  5.        ]])

In [33]:
np.c_[1:6,-10:-5]  #将切片转换成成数组

array([[  1, -10],
       [  2,  -9],
       [  3,  -8],
       [  4,  -7],
       [  5,  -6]])

### 元素的重复操作：tile和repeat

In [34]:
#repeat 会将数组中的各个元素重复一定次数，从而产生一个更大的数据
arr = np.arange(3)

In [35]:
arr

array([0, 1, 2])

In [36]:
arr.repeat(3)

array([0, 0, 0, 1, 1, 1, 2, 2, 2])

In [37]:
#如果传入的是一组整数，则各元素就可以重复不同的次数
arr.repeat([2,3,4])

array([0, 0, 1, 1, 1, 2, 2, 2, 2])

In [39]:
#对于多维数组，还可以让他们的元素沿着指定轴重复
arr = np.random.randn(2,2)
arr

array([[-0.11189287,  1.29509116],
       [-0.61685098,  1.09498909]])

In [40]:
arr.repeat(2,axis=0)   #注意没有设置轴向，则数组会被扁平化

array([[-0.11189287,  1.29509116],
       [-0.11189287,  1.29509116],
       [-0.61685098,  1.09498909],
       [-0.61685098,  1.09498909]])

In [41]:
arr.repeat([2,3],axis=0) 

array([[-0.11189287,  1.29509116],
       [-0.11189287,  1.29509116],
       [-0.61685098,  1.09498909],
       [-0.61685098,  1.09498909],
       [-0.61685098,  1.09498909]])

In [42]:
arr.repeat([2,3],axis=1) 

array([[-0.11189287, -0.11189287,  1.29509116,  1.29509116,  1.29509116],
       [-0.61685098, -0.61685098,  1.09498909,  1.09498909,  1.09498909]])

In [43]:
#tile的功能是沿指定轴向堆叠数组的副本，可以将其想象成“铺瓷砖”
arr

array([[-0.11189287,  1.29509116],
       [-0.61685098,  1.09498909]])

In [45]:
np.tile(arr,2)   #第二个参数是瓷砖的数量，对于标量，瓷砖是水平铺设的，而不是垂直铺设，表示“铺设”布局的元组

array([[-0.11189287,  1.29509116, -0.11189287,  1.29509116],
       [-0.61685098,  1.09498909, -0.61685098,  1.09498909]])

In [46]:
arr

array([[-0.11189287,  1.29509116],
       [-0.61685098,  1.09498909]])

In [48]:
np.tile(arr,(2,1))

array([[-0.11189287,  1.29509116],
       [-0.61685098,  1.09498909],
       [-0.11189287,  1.29509116],
       [-0.61685098,  1.09498909]])

In [49]:
np.tile(arr,(3,2))

array([[-0.11189287,  1.29509116, -0.11189287,  1.29509116],
       [-0.61685098,  1.09498909, -0.61685098,  1.09498909],
       [-0.11189287,  1.29509116, -0.11189287,  1.29509116],
       [-0.61685098,  1.09498909, -0.61685098,  1.09498909],
       [-0.11189287,  1.29509116, -0.11189287,  1.29509116],
       [-0.61685098,  1.09498909, -0.61685098,  1.09498909]])

### 花式索引的等价函数：take和put

In [50]:
arr = np.arange(10)*100

In [51]:
arr

array([  0, 100, 200, 300, 400, 500, 600, 700, 800, 900])

In [52]:
inds = [7,1,2,6]

In [53]:
arr[inds]

array([700, 100, 200, 600])

In [55]:
arr.take(inds)

array([700, 100, 200, 600])

In [56]:
arr.put(inds, 42)

In [57]:
arr

array([  0,  42,  42, 300, 400, 500,  42,  42, 800, 900])

In [58]:
arr.put(inds, [40, 41, 42, 43])

In [59]:
arr

array([  0,  41,  42, 300, 400, 500,  43,  40, 800, 900])

In [60]:
#其他轴上使用take，只需传入axis关键字即可
inds = [2, 0, 2, 1]
arr = np.random.randn(2, 4)
arr


array([[ 3.92017409e-02, -2.18542445e+00,  5.05865074e-01,
         3.87978394e-01],
       [-1.49066314e-01, -1.53625166e-04, -1.51778780e+00,
         6.69956159e-01]])

In [61]:
arr.take(inds, axis=1)

array([[ 5.05865074e-01,  3.92017409e-02,  5.05865074e-01,
        -2.18542445e+00],
       [-1.51778780e+00, -1.49066314e-01, -1.51778780e+00,
        -1.53625166e-04]])

In [62]:
#put 不接受axis参数，它只会在数组的扁平化版本（一维，C顺序）上进行索引，因此，在需要使用其他轴向的索引设置元素时，最好还是使用花式索引

## 广播

标量值和数组合并时就会发生最简单的广播

广播的原则：如果两个数组的后缘维度（即从末尾开始算起的维度）的轴长度相符或者其中一方的长度为1，则认为它们是广播兼容的
广播会在缺失和（或）长度为1的维度上进行


In [64]:
arr = np.arange(5)
arr

array([0, 1, 2, 3, 4])

In [65]:
arr * 4

array([ 0,  4,  8, 12, 16])

In [66]:
#减去列平均值的方式，对数组的每一列进行距平化处理
arr = np.random.randn(4, 3)
arr.mean(0)

array([ 0.44877674, -0.01278826,  0.20120386])

In [67]:
arr

array([[ 0.82131943,  1.15118262,  0.66244413],
       [ 2.11278274, -1.53330169, -1.35127979],
       [-0.46286126,  0.8492125 ,  1.1259237 ],
       [-0.67613395, -0.51824647,  0.36772743]])

In [68]:
demeaned = arr - arr.mean(0)
demeaned

array([[ 0.37254269,  1.16397088,  0.46124026],
       [ 1.664006  , -1.52051343, -1.55248366],
       [-0.911638  ,  0.86200076,  0.92471983],
       [-1.12491069, -0.50545821,  0.16652356]])

In [70]:
#在1轴向上做减法（即各行减去行平均值），较小的那个数组的形状必须是（4，1）

In [71]:
arr

array([[ 0.82131943,  1.15118262,  0.66244413],
       [ 2.11278274, -1.53330169, -1.35127979],
       [-0.46286126,  0.8492125 ,  1.1259237 ],
       [-0.67613395, -0.51824647,  0.36772743]])

In [72]:
row_means = arr.mean(1)

In [73]:
row_means.shape

(4,)

In [74]:
row_means

array([ 0.87831539, -0.25726625,  0.50409164, -0.275551  ])

In [75]:
row_means.reshape((4,1))

array([[ 0.87831539],
       [-0.25726625],
       [ 0.50409164],
       [-0.275551  ]])

In [76]:
row_means

array([ 0.87831539, -0.25726625,  0.50409164, -0.275551  ])

In [77]:
demeaned = arr - row_means.reshape((4, 1))
demeaned

array([[-0.05699596,  0.27286723, -0.21587127],
       [ 2.37004899, -1.27603544, -1.09401355],
       [-0.96695291,  0.34512086,  0.62183205],
       [-0.40058295, -0.24269547,  0.64327842]])

In [78]:
demeaned.mean(1)

array([3.70074342e-17, 7.40148683e-17, 0.00000000e+00, 0.00000000e+00])

根据广播的原则，较小数组的"广播维"必须为1

In [79]:
arr - arr.mean(1).reshape((4,1))

array([[-0.05699596,  0.27286723, -0.21587127],
       [ 2.37004899, -1.27603544, -1.09401355],
       [-0.96695291,  0.34512086,  0.62183205],
       [-0.40058295, -0.24269547,  0.64327842]])

Numpy数组提供了一种通过索引机制插入轴的特殊语法

In [80]:
#通过特殊的np.newaxis属性以及“全”切片来插入新轴
arr = np.zeros((4, 4))
arr

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [81]:
arr_3d = arr[:, np.newaxis, :]
arr_3d

array([[[0., 0., 0., 0.]],

       [[0., 0., 0., 0.]],

       [[0., 0., 0., 0.]],

       [[0., 0., 0., 0.]]])

In [82]:
arr_3d.shape

(4, 1, 4)

In [86]:
arr_1d = np.random.normal(size=3)
arr_1d

array([ 0.15312531,  0.92034292, -0.35838709])

In [87]:
arr_1d.shape

(3,)

In [88]:
arr_1d[:, np.newaxis]

array([[ 0.15312531],
       [ 0.92034292],
       [-0.35838709]])

In [89]:
arr_1d[np.newaxis, :]

array([[ 0.15312531,  0.92034292, -0.35838709]])

In [90]:
arr_1d[np.newaxis, :].shape

(1, 3)

三维数组，并希望对轴2进行距平化

In [93]:
arr = np.random.randn(3, 4, 5)
arr

array([[[ 1.44735677,  1.66971827,  1.1451873 ,  1.74555421,
         -1.04305564],
        [-0.99046117,  1.43539894,  0.10329701,  0.4826856 ,
          0.01515171],
        [-1.30508287,  0.17539415,  0.40686923, -2.35564012,
          1.3534118 ],
        [ 0.41505276, -0.93064307,  1.08552809, -0.71653113,
          0.03245309]],

       [[-0.57881409, -0.70857865, -0.42590904, -0.59948278,
         -1.11968584],
        [ 1.1159756 , -0.45285479, -1.51766187,  0.15710985,
          0.92554756],
        [-0.8197847 ,  0.97031944, -0.90346702,  0.81532543,
         -0.54766201],
        [-1.16413852,  1.06331041, -1.73810712, -0.45497291,
          0.57647246]],

       [[ 0.65070079,  0.4263271 ,  0.24510734, -1.16847704,
          1.17612933],
        [ 1.03459687,  0.2697482 ,  0.31472519, -0.96222323,
         -0.44972002],
        [-0.01197708, -1.00086724, -0.59666432, -3.01356819,
         -2.57860475],
        [ 0.06474185,  1.08915969, -0.45844465, -1.21041561,
         -0

In [92]:
# Mean over axis 2 (the last axis); a (3, 4, 5) input yields a (3, 4) result.
# NOTE(review): this cell carries execution count 92, i.e. it was run before
# the cell (count 93) that regenerates `arr`, so the recorded outputs use a
# stale depth_means -- which is why demeaned.mean(2) further down is not ~0.
# Re-run the cells in order on a fresh kernel.
depth_means = arr.mean(2) 
depth_means

array([[ 0.32728615, -0.49976644,  0.0443139 , -0.32627028],
       [-0.46060588,  0.00256801,  0.18623865, -0.33243366],
       [ 0.27739838,  0.058195  ,  0.22587214,  0.68211279]])

In [94]:
depth_means.shape

(3, 4)

In [95]:
demeaned = arr - depth_means[:, :, np.newaxis]

In [96]:
demeaned

array([[[ 1.12007061,  1.34243212,  0.81790115,  1.41826806,
         -1.37034179],
        [-0.49069473,  1.93516538,  0.60306346,  0.98245204,
          0.51491815],
        [-1.34939677,  0.13108025,  0.36255533, -2.39995402,
          1.3090979 ],
        [ 0.74132304, -0.60437279,  1.41179837, -0.39026085,
          0.35872338]],

       [[-0.11820821, -0.24797277,  0.03469684, -0.13887689,
         -0.65907996],
        [ 1.11340759, -0.4554228 , -1.52022988,  0.15454184,
          0.92297955],
        [-1.00602335,  0.7840808 , -1.08970567,  0.62908678,
         -0.73390066],
        [-0.83170486,  1.39574407, -1.40567345, -0.12253924,
          0.90890612]],

       [[ 0.37330241,  0.14892873, -0.03229103, -1.44587542,
          0.89873095],
        [ 0.97640187,  0.2115532 ,  0.25653019, -1.02041823,
         -0.50791502],
        [-0.23784922, -1.22673938, -0.82253646, -3.23944033,
         -2.80447689],
        [-0.61737094,  0.40704689, -1.14055745, -1.8925284 ,
         -0

In [97]:
demeaned.mean(2)

array([[ 0.66566603,  0.70898086, -0.38932346,  0.30344223],
       [-0.2258882 ,  0.04305526, -0.28329242, -0.01105347],
       [-0.01144087, -0.0167696 , -1.66620846, -0.81719044]])

存在一种既通用又不牺牲性能的方法，但是需要索引方面的技巧

In [98]:
def demean_axis(arr, axis=0):
    """Subtract the mean along ``axis`` from ``arr`` using broadcasting.

    Parameters
    ----------
    arr : numpy.ndarray
        Input array of any dimensionality.
    axis : int, optional
        Axis along which the mean is computed and removed (default 0).

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``arr`` whose mean along ``axis`` is
        (numerically) zero.
    """
    means = arr.mean(axis)

    # Generalizes expressions like means[:, :, np.newaxis]: keep every
    # remaining axis as a full slice and re-insert the reduced axis with
    # length 1 so the subtraction broadcasts.
    indexer = [slice(None)] * arr.ndim
    indexer[axis] = np.newaxis
    # The index must be a tuple: NumPy interprets a list as a fancy index
    # instead of one entry per dimension.
    return arr - means[tuple(indexer)]

### 通过广播设置数组的值

In [104]:
arr = np.zeros((4,3))
arr

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [105]:
arr[:]

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [101]:
arr[:] = 5

In [102]:
arr

array([[5., 5., 5.],
       [5., 5., 5.],
       [5., 5., 5.],
       [5., 5., 5.]])

想用一个一维数组来设置目标数组的各列

In [106]:
col = np.array([1.28, -0.42, 0.44, 1.6])
arr[:] = col[:, np.newaxis]

In [107]:
arr

array([[ 1.28,  1.28,  1.28],
       [-0.42, -0.42, -0.42],
       [ 0.44,  0.44,  0.44],
       [ 1.6 ,  1.6 ,  1.6 ]])

In [108]:
arr[:2] = [[-1.37], [0.509]]
arr

array([[-1.37 , -1.37 , -1.37 ],
       [ 0.509,  0.509,  0.509],
       [ 0.44 ,  0.44 ,  0.44 ],
       [ 1.6  ,  1.6  ,  1.6  ]])

## ufunc高级应用

### ufunc实例

用np.add.reduce对数组中各个元素进行求和   
reduce 接收一个数组参数，并通过一系列的二元运算对其值进行聚合（可指明轴向）

In [109]:
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [110]:
np.add.reduce(arr)

45

In [111]:
arr.sum()

45

用np.logical_and 检查数组各行中的值是否有序的

In [112]:
np.random.seed(123456)

In [113]:
arr = np.random.randn(5,5)
arr

array([[ 0.4691123 , -0.28286334, -1.5090585 , -1.13563237,  1.21211203],
       [-0.17321465,  0.11920871, -1.04423597, -0.86184896, -2.10456922],
       [-0.49492927,  1.07180381,  0.72155516, -0.70677113, -1.03957499],
       [ 0.27185989, -0.42497233,  0.56702035,  0.27623202, -1.08740069],
       [-0.67368971,  0.11364841, -1.47842655,  0.52498767,  0.40470522]])

In [114]:
arr[::2]

array([[ 0.4691123 , -0.28286334, -1.5090585 , -1.13563237,  1.21211203],
       [-0.49492927,  1.07180381,  0.72155516, -0.70677113, -1.03957499],
       [-0.67368971,  0.11364841, -1.47842655,  0.52498767,  0.40470522]])

In [115]:
arr[::2].sort(1)

In [117]:
arr[:, :-1] < arr[:, 1:]

array([[ True,  True,  True,  True],
       [ True, False,  True, False],
       [ True,  True,  True,  True],
       [False,  True, False, False],
       [ True,  True,  True,  True]])

In [120]:
np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)  #

array([ True, False,  True, False,  True])

accumulate跟reduce的关系，就像cumsum跟sum的关系那样。它产生一个跟原数组大小相同的中间“累计”值数组

In [121]:
arr = np.arange(15).reshape((3, 5))
print(arr)
np.add.accumulate(arr, axis=1)

[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]]


array([[ 0,  1,  3,  6, 10],
       [ 5, 11, 18, 26, 35],
       [10, 21, 33, 46, 60]], dtype=int32)

outer用于计算两个数组的外积（对两个数组中所有元素对逐一应用该运算）

In [122]:
arr = np.arange(3).repeat([1,2,2])
arr

array([0, 1, 1, 2, 2])

In [123]:
np.multiply.outer(arr, np.arange(5))

array([[0, 0, 0, 0, 0],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 2, 4, 6, 8],
       [0, 2, 4, 6, 8]])

outer输出结果的维度是两个输入数据的维度之和

In [124]:
x, y = np.random.randn(3, 4), np.random.randn(5)
print(x)
print(y)

[[ 0.57704599 -1.71500202 -1.03926848 -0.37064686]
 [-1.15789225 -1.34431181  0.84488514  1.07576978]
 [-0.10904998  1.64356307 -1.46938796  0.35702056]]
[-0.6746001  -1.77690372 -0.96891381 -1.29452359  0.41373811]


In [125]:
result = np.subtract.outer(x, y)

In [127]:
result.shape

(3, 4, 5)

In [128]:
arr = np.arange(10)
np.add.reduceat(arr, [0, 5, 8])

array([10, 18, 17], dtype=int32)

In [129]:
arr = np.multiply.outer(np.arange(4), np.arange(5))
arr

array([[ 0,  0,  0,  0,  0],
       [ 0,  1,  2,  3,  4],
       [ 0,  2,  4,  6,  8],
       [ 0,  3,  6,  9, 12]])

reduceat用于"局部约简"，其实就是一个对数据各切片进行聚合的groupby运算。它接受一组用于指示如何对值进行拆分和聚合的“面元边界”

In [130]:
np.add.reduceat(arr, [0, 2, 4],axis=1)

array([[ 0,  0,  0],
       [ 1,  5,  4],
       [ 2, 10,  8],
       [ 3, 15, 12]], dtype=int32)

![image.png](attachment:image.png)

### 编写新的ufunc

In [132]:
def add_elements(x, y):
    """Combine ``x`` and ``y`` with the ``+`` operator and return the result."""
    total = x + y
    return total

In [133]:
add_them = np.frompyfunc(add_elements, 2, 1)

In [134]:
add_them(np.arange(8), np.arange(8))

array([0, 2, 4, 6, 8, 10, 12, 14], dtype=object)

In [135]:
# np.float was only an alias of the builtin float; it was deprecated in
# NumPy 1.20 and removed in 1.24, so name the concrete output dtype instead.
add_them = np.vectorize(add_elements, otypes=[np.float64])
add_them(np.arange(8), np.arange(8))

array([ 0.,  2.,  4.,  6.,  8., 10., 12., 14.])

In [136]:
arr = np.random.randn(10000)
%timeit add_them(arr, arr)
%timeit np.add(arr, arr)

1.37 ms ± 1.09 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.53 µs ± 29.6 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


## 排序

In [138]:
arr = np.random.randn(6)
arr

array([-0.2625154 ,  0.02812367,  0.87458306,  0.01972901,  1.10790981,
        0.3825507 ])

In [139]:
arr.sort()  #原数组会被修改
arr

array([-0.2625154 ,  0.01972901,  0.02812367,  0.3825507 ,  0.87458306,
        1.10790981])

In [140]:
arr = np.random.randn(3, 5)
arr

array([[-0.5565042 ,  0.78636179, -0.38964038, -0.56135084, -0.1724924 ],
       [ 0.24018256, -1.77194708,  0.82789246, -0.77915441, -1.00592983],
       [ 1.24347796,  0.70024873,  0.64729809,  0.31218427, -3.05614302]])

In [145]:
# Sorts only the values of column 0, in place (the slice is a view of arr);
# the other columns are untouched, so whole rows are NOT reordered.
# NOTE(review): the recorded output below shows column 1 sorted instead, so it
# presumably came from an earlier edit of this cell -- re-run to confirm.
arr[:, 0].sort()   # sort the values of column 0 in place

In [146]:
arr

array([[-0.5565042 , -1.77194708, -0.38964038, -0.56135084, -0.1724924 ],
       [ 0.24018256,  0.70024873,  0.82789246, -0.77915441, -1.00592983],
       [ 1.24347796,  0.78636179,  0.64729809,  0.31218427, -3.05614302]])

相反，numpy.sort会为原数组创建一个已排序副本，且接受的参数（如kind）跟ndarray.sort一样

In [147]:
arr = np.random.randn(5)
arr

array([ 0.1113004 , -0.01858619,  2.01359289,  0.81407983, -1.23442883])

In [148]:
np.sort(arr)

array([-1.23442883, -0.01858619,  0.1113004 ,  0.81407983,  2.01359289])

In [149]:
arr

array([ 0.1113004 , -0.01858619,  2.01359289,  0.81407983, -1.23442883])

这两个排序方法都可以接受一个axis参数，以便沿指定轴向对各块数据进行单独排序

In [150]:
arr = np.random.randn(3, 5)
arr

array([[ 0.91624336,  0.3461365 , -0.92729173,  0.38947689, -0.99011385],
       [-1.6393558 ,  0.38023146,  0.51697934, -1.13637738, -0.08944073],
       [ 0.68565292, -0.88063003,  1.52901185, -0.43166578,  0.27258011]])

In [151]:
arr.sort(axis=1)
arr

array([[-0.99011385, -0.92729173,  0.3461365 ,  0.38947689,  0.91624336],
       [-1.6393558 , -1.13637738, -0.08944073,  0.38023146,  0.51697934],
       [-0.88063003, -0.43166578,  0.27258011,  0.68565292,  1.52901185]])

In [153]:
#倒序
arr[:, ::-1]

array([[ 0.91624336,  0.38947689,  0.3461365 , -0.92729173, -0.99011385],
       [ 0.51697934,  0.38023146, -0.08944073, -1.13637738, -1.6393558 ],
       [ 1.52901185,  0.68565292,  0.27258011, -0.43166578, -0.88063003]])

### 间接排序： argsort和lexsort

In [154]:
values = np.array([5, 0, 1, 3, 2])
indexer = values.argsort()

In [155]:
indexer  #返回的是从小到大的索引值

array([1, 2, 4, 3, 0], dtype=int64)

In [156]:
values[indexer]

array([0, 1, 2, 3, 5])

根据数组的第一行对其进行排序

In [157]:
arr = np.random.randn(3, 5)
arr[0] = values

In [158]:
arr

array([[ 5.        ,  0.        ,  1.        ,  3.        ,  2.        ],
       [ 1.02838486, -0.94579266, -1.74274053,  0.03354246,  0.80127785],
       [ 0.86776374,  1.39014946, -0.12561686, -0.17029985, -0.06390751]])

In [159]:
arr[:, arr[0].argsort()]

array([[ 0.        ,  1.        ,  2.        ,  3.        ,  5.        ],
       [-0.94579266, -1.74274053,  0.80127785,  0.03354246,  1.02838486],
       [ 1.39014946, -0.12561686, -0.06390751, -0.17029985,  0.86776374]])

lexsort跟argsort差不多，只不过它可以一次性对多个键数组进行间接排序（字典序）

In [160]:
first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])
last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])
sorter = np.lexsort([first_name, last_name])    

In [161]:
sorter

array([1, 2, 3, 0, 4], dtype=int64)

In [162]:
# NOTE: in Python 3, zip() returns a lazy iterator, so this cell only displays
# its repr; wrap the call in list(...) to see the (last_name, first_name) pairs.
zip(last_name[sorter], first_name[sorter])

<zip at 0x1e7af394408>

### 其他排序算法

稳定的排序算法会保持等价元素的相对位置。对于相对位置具有实际意义的那些间接排序而言，这一点非常重要

In [163]:
values = np.array(['2:first', '2:second', '1:first', '1:second',
                   '1:third'])
key = np.array([2, 2, 1, 1, 1])
indexer = key.argsort(kind='mergesort')

In [164]:
indexer

array([2, 3, 4, 0, 1], dtype=int64)

In [165]:
values.take(indexer)

array(['1:first', '1:second', '1:third', '2:first', '2:second'],
      dtype='<U8')

In [166]:
values[indexer]

array(['1:first', '1:second', '1:third', '2:first', '2:second'],
      dtype='<U8')

![image.png](attachment:image.png)

### 部分排序数组

numpy.partition和np.argpartition可以围绕第k个最小元素对数组进行划分

In [169]:
np.random.seed(12345)

In [170]:
arr = np.random.randn(20)
arr

array([-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057,
        1.39340583,  0.09290788,  0.28174615,  0.76902257,  1.24643474,
        1.00718936, -1.29622111,  0.27499163,  0.22891288,  1.35291684,
        0.88642934, -2.00163731, -0.37184254,  1.66902531, -0.43856974])

In [171]:
np.partition(arr, 3)  #结果中的头3个元素是最小的3个，没有特定的顺序

array([-2.00163731, -1.29622111, -0.5557303 , -0.51943872, -0.37184254,
       -0.43856974, -0.20470766,  0.28174615,  0.76902257,  0.47894334,
        1.00718936,  0.09290788,  0.27499163,  0.22891288,  1.35291684,
        0.88642934,  1.39340583,  1.96578057,  1.66902531,  1.24643474])

numpy.argpartition与numpy.argsort相似，会返回索引，重排数据为等价的顺序

In [173]:
indices = np.argpartition(arr, 3)
indices

array([16, 11,  3,  2, 17, 19,  0,  7,  8,  1, 10,  6, 12, 13, 14, 15,  5,
        4, 18,  9], dtype=int64)

In [174]:
arr.take(indices)

array([-2.00163731, -1.29622111, -0.5557303 , -0.51943872, -0.37184254,
       -0.43856974, -0.20470766,  0.28174615,  0.76902257,  0.47894334,
        1.00718936,  0.09290788,  0.27499163,  0.22891288,  1.35291684,
        0.88642934,  1.39340583,  1.96578057,  1.66902531,  1.24643474])

numpy.searchsorted: 在有序数组中查找元素

In [175]:
arr = np.array([0, 1, 7, 12, 15])
arr

array([ 0,  1,  7, 12, 15])

In [176]:
arr.searchsorted(8)

3

In [177]:
arr.searchsorted([0, 3, 3, 5])  #默认返回相等值的左侧索引

array([0, 2, 2, 2], dtype=int64)

In [178]:
arr = np.array([0, 0, 0, 1, 1, 1, 1])
arr.searchsorted([0, 1])


array([0, 3], dtype=int64)

In [179]:
arr.searchsorted([0, 1], side='right')

array([3, 7], dtype=int64)

In [182]:
#有个表示“面元边界”的数组，希望将数据组拆分开
data = np.floor(np.random.uniform(0, 1000, size=50))  #np.floor 向下取整,uniform均匀分布
bins = np.array([0, 100, 1000, 5000, 10000])
data

array([994., 676., 790., 170.,  26., 800., 903.,  24., 491., 526., 596.,
        51., 895., 728., 818., 500., 810.,  95., 218., 258., 468., 459.,
       709., 178., 531., 167., 768., 928., 609., 150., 489., 377., 848.,
       911., 383., 315., 568., 187., 125., 687., 799., 573., 973., 634.,
       888., 495., 351., 714., 503., 225.])

In [184]:
labels = bins.searchsorted(data)
labels

array([2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2], dtype=int64)

In [185]:
labels.shape

(50,)

In [187]:
import pandas as pd
pd.Series(data).groupby(labels).mean()

1     49.00000
2    569.23913
dtype: float64

## 高级数组输入输出

### 内存映像

内存映像文件是一种将磁盘上的非常大的二进制数据文件当做内存中的数组进行处理的方式。NumPy实现了一个类似于ndarray的memmap对象，它允许将大文件分成小段进行读写，而不是一次性将整个数组读入内存。另外，memmap也拥有跟普通数组一样的方法，因此，基本上只要是能用于ndarray的算法就也能用于memmap。

要创建一个内存映像，可以使用函数np.memmap并传入一个文件路径、数据类型、形状以及文件模式：


In [1]:
# Create a memory map: np.memmap takes a file path, dtype, shape and file mode.
# A (10000, 10000) float64 array maps 10000 * 10000 * 8 bytes = 800 MB of data.
mmap = np.memmap('mymmap', dtype='float64', mode='w+',
                 shape=(10000, 10000))
mmap

memmap([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [2]:
#视图
section = mmap[:5]
section

memmap([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [4]:
#如果将数据赋值给这些视图：数据会先被缓存到内存中（就像是python的文件对象），调用flush即可将其写入磁盘
section[:] = np.random.randn(5, 10000)
mmap.flush()
mmap

memmap([[ 0.44277119,  0.95555952,  0.06595023, ..., -1.31859418,
         -1.04480488,  1.39142419],
        [ 2.03149551,  0.19072266,  0.28732816, ..., -0.22721905,
          2.70826669,  0.49936937],
        [-1.40799138,  0.31344319, -0.62464644, ...,  0.21426496,
         -0.56149702, -1.10826565],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]])

In [5]:
del mmap

只要某个内存映像超出了作用域，它就会被垃圾回收器回收，之前对其所做的任何修改都会被写入磁盘。当打开一个已经存在的内存映像时，仍然需要指明数据类型和形状，因为磁盘上的那个文件只是一块二进制数据而已，没有任何元数据：


In [6]:
mmap = np.memmap('mymmap', dtype='float64', shape=(10000, 10000))
mmap

memmap([[ 0.44277119,  0.95555952,  0.06595023, ..., -1.31859418,
         -1.04480488,  1.39142419],
        [ 2.03149551,  0.19072266,  0.28732816, ..., -0.22721905,
          2.70826669,  0.49936937],
        [-1.40799138,  0.31344319, -0.62464644, ...,  0.21426496,
         -0.56149702, -1.10826565],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]])

In [8]:
#%xdel mmap
#!rm mymmap

NameError: name 'mmap' is not defined


### HDF5及其他数组存储方式

PyTables和h5py这两个Python项目可以将NumPy的数组数据存储为高效且可压缩的HDF5格式（HDF意思是“层次化数据格式”）。你可以安全地将好几百GB甚至TB的数据存储为HDF5格式。要学习Python使用HDF5，请参考pandas线上文档。

## 性能建议

- 将Python循环和条件逻辑转换为数组运算和布尔数组运算。
- 尽量使用广播。
- 避免复制数据，尽量使用数组视图（即切片）。
- 利用ufunc及其各种方法

### 连续内存的重要性

数组的内存布局可以对计算速度造成极大的影响。这是因为性能差别在一定程度上跟CPU的高速缓存（cache）体系有关。运算过程中访问连续内存块（例如，对以C顺序存储的数组的行求和）一般是最快的，因为内存子系统会将适当的内存块缓存到超高速的L1或L2 CPU Cache中。此外，NumPy的C语言基础代码（某些）对连续存储的情况进行了优化处理，这样就能避免一些跨越式的内存访问。


一个数组的内存布局是连续的，就是说元素是以它们在数组中出现的顺序（即Fortran型（列优先）或C型（行优先））存储在内存中的。默认情况下，NumPy数组是以C型连续的方式创建的。列优先的数组（比如C型连续数组的转置）也被称为Fortran型连续。通过ndarray的flags属性即可查看这些信息：

In [9]:
arr_c = np.ones((1000, 1000), order='C')
arr_f = np.ones((1000, 1000), order='F')
arr_c.flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [10]:
arr_f.flags

  C_CONTIGUOUS : False
  F_CONTIGUOUS : True
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [11]:
arr_f.flags.f_contiguous

True

In [12]:
# Sum over the rows of both arrays. In theory arr_c should be faster than
# arr_f, because the rows of arr_c are contiguous in memory.
%timeit arr_c.sum(1)

479 µs ± 5.96 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [13]:
%timeit arr_f.sum(1)

576 µs ± 18.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [14]:
arr_f.copy('C').flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [15]:
#注意在构建视图时，其结果不一定是连续的
arr_c[:50].flags.contiguous

True

In [16]:
arr_c[:, :50].flags

  C_CONTIGUOUS : False
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False