https://github.com/wesm/pydata-book

In [2]:
import numpy as np
import pandas as pd
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
np.set_printoptions(precision=4, suppress=True)

# A1. ndarray Object Internals

In [2]:
np.ones((10, 5)).shape

(10, 5)

In [3]:
np.ones((3, 4, 5), dtype=np.float64).strides

(160, 40, 8)

### NumPy dtype Hierarchy

### Setting Array Values by Broadcasting

The same broadcasting rule governing arithmetic operations also applies to setting values via array indexing

In [4]:
arr = np.zeros((4, 3))
print(arr)
arr[:] = 5 #arr[:,:] = 5
print(arr) 
print(arr.shape)

[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
[[5. 5. 5.]
 [5. 5. 5.]
 [5. 5. 5.]
 [5. 5. 5.]]
(4, 3)


if *we had a one-dimensional array of values we wanted to set into the columns of the array*, 
we can do that as long as the shape is compatible

In [5]:
col = np.array([1.28, -0.42, 0.44, 1.6])
print(col)
print(col.shape)

[ 1.28 -0.42  0.44  1.6 ]
(4,)


In [6]:
col = col[:, np.newaxis] #col[para todas las filas,añade un eje]
print(col.shape)
arr[:,:]=col
arr

(4, 1)


array([[ 1.28,  1.28,  1.28],
       [-0.42, -0.42, -0.42],
       [ 0.44,  0.44,  0.44],
       [ 1.6 ,  1.6 ,  1.6 ]])

queremos asignar valor pero solo a una parte del array

In [7]:
arr[:2] = [[-1.37], [0.509]] #hasta la fila 2
arr

array([[-1.37 , -1.37 , -1.37 ],
       [ 0.509,  0.509,  0.509],
       [ 0.44 ,  0.44 ,  0.44 ],
       [ 1.6  ,  1.6  ,  1.6  ]])

# Advanced ufunc Usage
Apart from fast element-wise operations provided
by the universal functions, 
there are a number of additional features that occasionally
can help you write more concise code without loops

### ufunc Instance Methods

*reduce*: 
takes a single array and aggregates its values, optionally along an axis, by performing
a sequence of binary operations

In [8]:
arr = np.arange(1,10)
print(arr)
#np.reduce(arr)-->ERROR
print(np.multiply.reduce(arr))
print(1*2*3*4*5*6*7*8*9)
print(np.floor_divide.reduce(arr))
print(1/2/3/4/5/6/7/8/9)
print(np.power.reduce(arr))
#print(1**2**3**4**5**6**7**8**9)
print(np.subtract.reduce(arr))
print(1-2-3-4-5-6-7-8-9)
print(np.add.reduce(arr))
print(1+2+3+4+5+6+7+8+9)
print(arr.sum())

[1 2 3 4 5 6 7 8 9]
362880
362880
0
2.7557319223985893e-06
1
-43
-43
45
45
45


If an axis is passed, the reduction is performed along that axis.

example, we can use np.logical_and to check whether
the values in each row of an array are sorted:

In [9]:
np.random.seed(12346)  # for reproducibility
arr = np.random.randn(5, 5)
print(arr)
print("----------------------")
arr[::2].sort(1) # sort a few rows
#arr[todas filas, todas columnas, step2]
print(arr)
print("----------------------")
print(arr[:, :-1]) #hasta la penúltima columna
print("----------------------")
print(arr[:, 1:]) #desde la segunda columna
print("----------------------")
#solo estará, ordenadas las que tengan todo 'Trues'
cond = arr[:, :-1] < arr[:, 1:]
print(cond) 
cond.all(axis=1)

[[-0.09    0.7594  0.7483 -0.9815  0.3658]
 [-0.3154 -0.8661  0.0279 -0.4556 -1.6019]
 [ 0.2483 -0.3215 -0.8487  0.0005 -0.5465]
 [ 0.2539  1.9368 -0.7995 -0.5692  0.0489]
 [-0.6491 -0.4795 -0.9535  1.4225  0.1754]]
----------------------
[[-0.9815 -0.09    0.3658  0.7483  0.7594]
 [-0.3154 -0.8661  0.0279 -0.4556 -1.6019]
 [-0.8487 -0.5465 -0.3215  0.0005  0.2483]
 [ 0.2539  1.9368 -0.7995 -0.5692  0.0489]
 [-0.9535 -0.6491 -0.4795  0.1754  1.4225]]
----------------------
[[-0.9815 -0.09    0.3658  0.7483]
 [-0.3154 -0.8661  0.0279 -0.4556]
 [-0.8487 -0.5465 -0.3215  0.0005]
 [ 0.2539  1.9368 -0.7995 -0.5692]
 [-0.9535 -0.6491 -0.4795  0.1754]]
----------------------
[[-0.09    0.3658  0.7483  0.7594]
 [-0.8661  0.0279 -0.4556 -1.6019]
 [-0.5465 -0.3215  0.0005  0.2483]
 [ 1.9368 -0.7995 -0.5692  0.0489]
 [-0.6491 -0.4795  0.1754  1.4225]]
----------------------
[[ True  True  True  True]
 [False  True False False]
 [ True  True  True  True]
 [ True False  True  True]
 [ True  True  T

array([ True, False,  True, False,  True])

In [10]:
#esto mismo lo podemos hacer con:
print(np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)) 
#LO QUE ESTÁ HACIENDO
print(True&True&True&True)
print(False&True&False&False)
print(True&True&True&True)
print(True&False&True&True)
print(True&True&True&True)

[ True False  True False  True]
True
False
True
False
True


Note that *logical_and.reduce* = *all method*

*accumulate*:
is related to reduce like cumsum is related to sum. 
It produces an array ofthe same size with the intermediate “accumulated” values:

In [11]:
arr = np.arange(100)
print(arr.sum())
print(arr.cumsum())

4950
[   0    1    3    6   10   15   21   28   36   45   55   66   78   91
  105  120  136  153  171  190  210  231  253  276  300  325  351  378
  406  435  465  496  528  561  595  630  666  703  741  780  820  861
  903  946  990 1035 1081 1128 1176 1225 1275 1326 1378 1431 1485 1540
 1596 1653 1711 1770 1830 1891 1953 2016 2080 2145 2211 2278 2346 2415
 2485 2556 2628 2701 2775 2850 2926 3003 3081 3160 3240 3321 3403 3486
 3570 3655 3741 3828 3916 4005 4095 4186 4278 4371 4465 4560 4656 4753
 4851 4950]


In [12]:
arr = np.arange(15).reshape((3, 5))
print(arr)
print("--------------")
print(np.add.reduce(arr,axis=0)) #by default
print("--------------")
print(np.add.reduce(arr,axis=1))
print("--------------")
print(np.add.accumulate(arr, axis=1))
#add / substract / multiply / divide / power / mod / fmax,fmin

[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]]
--------------
[15 18 21 24 27]
--------------
[10 35 60]
--------------
[[ 0  1  3  6 10]
 [ 5 11 18 26 35]
 [10 21 33 46 60]]


*outer*:
performs a pairwise cross-product between two arrays:

In [13]:
#Apply the ufunc `op` to all pairs (a, b)
print(np.multiply.outer([1, 2, 3], [4, 5, 6]))
print("--------------")
A = np.array([[1, 2, 3], [4, 5, 6]])
print(A)
print(A.shape)
print("--------------")
B = np.array([[1, 2, 3, 4]])
print(B)
print(B.shape)
print("--------------")
C = np.multiply.outer(A, B)
print(C)
print(C.shape)

[[ 4  5  6]
 [ 8 10 12]
 [12 15 18]]
--------------
[[1 2 3]
 [4 5 6]]
(2, 3)
--------------
[[1 2 3 4]]
(1, 4)
--------------
[[[[ 1  2  3  4]]

  [[ 2  4  6  8]]

  [[ 3  6  9 12]]]


 [[[ 4  8 12 16]]

  [[ 5 10 15 20]]

  [[ 6 12 18 24]]]]
(2, 3, 1, 4)


In [14]:
arr = np.arange(3).repeat([1, 2, 2])
print(arr)
print("--------------")
arr2 = np.arange(5)
print(arr2)
print("--------------")
print(np.multiply.outer(arr,arr2))
#Apply the ufunc `op` to all pairs (a, b) 
print("--------------")
#LO QUE ESTÁ HACIENDO
print(arr[0]*arr2)
print(arr[1]*arr2)
print(arr[2]*arr2)
print(arr[3]*arr2)
print(arr[4]*arr2)
print("--------------")
print(arr*arr2)
print((arr*arr2).sum())
print(arr.dot(arr2))

[0 1 1 2 2]
--------------
[0 1 2 3 4]
--------------
[[0 0 0 0 0]
 [0 1 2 3 4]
 [0 1 2 3 4]
 [0 2 4 6 8]
 [0 2 4 6 8]]
--------------
[0 0 0 0 0]
[0 1 2 3 4]
[0 1 2 3 4]
[0 2 4 6 8]
[0 2 4 6 8]
--------------
[0 1 2 6 8]
17
17


The output of outer will have a dimension that is the sum of the dimensions of the inputs:

In [15]:
x, y = np.random.randn(3, 4), np.random.randn(5)
print(x)
print(y)
print(x.shape)
print(y.shape)
result = np.subtract.outer(x, y)
print(result)
print(result.shape)

[[ 0.5224  0.1064  0.1027 -0.1082]
 [ 0.0549  0.1964 -0.1939 -1.4566]
 [ 0.8574 -0.7416 -0.7804 -0.1064]]
[ 0.5937 -1.2835  0.478   1.2924  0.1516]
(3, 4)
(5,)
[[[-0.0713  1.8059  0.0445 -0.77    0.3708]
  [-0.4873  1.3899 -0.3715 -1.186  -0.0452]
  [-0.491   1.3862 -0.3752 -1.1897 -0.0489]
  [-0.7019  1.1752 -0.5862 -1.4007 -0.2599]]

 [[-0.5389  1.3383 -0.4231 -1.2376 -0.0968]
  [-0.3973  1.4798 -0.2816 -1.0961  0.0447]
  [-0.7876  1.0896 -0.6718 -1.4863 -0.3455]
  [-2.0503 -0.1731 -1.9345 -2.749  -1.6082]]

 [[ 0.2637  2.1409  0.3795 -0.435   0.7058]
  [-1.3353  0.5419 -1.2195 -2.034  -0.8932]
  [-1.3741  0.5031 -1.2583 -2.0728 -0.932 ]
  [-0.7001  1.177  -0.5844 -1.3989 -0.2581]]]
(3, 4, 5)


*reduceat*:

reduceat(a, indices, axis=0, dtype=None, out=None)

Performs a (local) reduce with specified slices over a single axis.
indices: Paired indices, comma separated (not colon), specifying slices to reduce

In [16]:
arr = np.arange(10)
print(arr)
print("--------------")
print(np.add.reduceat(arr, [0, 5, 8]))
print("--------------")
#LO QUE ESTÁ HACIENDO
print(arr[0:5].sum())
print(arr[5:8].sum())
print(arr[8:].sum())

[0 1 2 3 4 5 6 7 8 9]
--------------
[10 18 17]
--------------
10
18
17


In [17]:
print(np.arange(8))
print(np.add.reduceat(np.arange(8),[0,4, 1,5, 2,6, 3,7])[::2]) #step2
print(arr[0:4].sum())
print(arr[1:5].sum())
print(arr[2:6].sum())
print(arr[3:7].sum())

[0 1 2 3 4 5 6 7]
[ 6 10 14 18]
6
10
14
18


In [18]:
x = np.linspace(0, 15, 16).reshape(4,4)
print(x)
print("--------------")
 # reduce such that the result has the following five rows:
 # [row1 + row2 + row3]
 # [row4]
 # [row2]
 # [row3]
 # [row1 + row2 + row3 + row4]
print(np.add.reduceat(x,[0,3,1,2,0],axis = 0))
print("--------------")
#LO QUE ESTÁ HACIENDO
x = np.linspace(0, 15, 16).reshape(4,4)
print(x[0:3].sum(0))
#deja la fila 4, la fila 2 y la fila 3
print(x[3])
print(x[1])
print(x[2])
print(x[0:].sum(0))
print("--------------")
 # reduce such that result has the following two columns:
 # [col1 * col2 * col3, col4]
print(np.multiply.reduceat(x, [0, 3], 1)) #axis = 1 -->
#LO QUE ESTÁ HACIENDO
print(x[0,0]*x[0,1]*x[0,2])
print(x[1,0]*x[1,1]*x[1,2])
print(x[2,0]*x[2,1]*x[2,2])
print(x[3,0]*x[3,1]*x[3,2])
#y deja la última columna

[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]
 [12. 13. 14. 15.]]
--------------
[[12. 15. 18. 21.]
 [12. 13. 14. 15.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]
 [24. 28. 32. 36.]]
--------------
[12. 15. 18. 21.]
[12. 13. 14. 15.]
[4. 5. 6. 7.]
[ 8.  9. 10. 11.]
[24. 28. 32. 36.]
--------------
[[   0.    3.]
 [ 120.    7.]
 [ 720.   11.]
 [2184.   15.]]
0.0
120.0
720.0
2184.0


A descriptive example:

If `a` is 1-D, the function `ufunc.accumulate(a)` is the same as
``ufunc.reduceat(a, indices)[::2]`` where `indices` is
``range(len(array) - 1)`` with a zero placed
in every other element:
``indices = zeros(2 * len(a) - 1)``, ``indices[1::2] = range(1, len(a))``.

Don't be fooled by this attribute's name: `reduceat(a)` is not
necessarily smaller than `a`.

In [19]:
a = np.arange(1,10)
print(a)
print(np.add.accumulate(a))
indices = np.zeros((2 * len(a) - 1))
indices[1::2] = range(1, len(a))
print(np.add.reduceat(a,[indices])[::2])

[1 2 3 4 5 6 7 8 9]
[ 1  3  6 10 15 21 28 36 45]


ValueError: object too deep for desired array

### Writing New ufuncs in Python
The most general is to use the NumPy C API, but that is beyond the scope of this book. 
In this section, we will look at *pure Python ufuncs*

*numpy.frompyfunc*
accepts a Python function along with a specification for the number of inputs and outputs.
Takes an arbitrary Python function and returns a NumPy ufunc.
frompyfunc(func, nin, nout, *[, identity])
nin : int
    The number of input arguments.
nout : int
    The number of objects returned by `func`.

EJEMPLO
Use frompyfunc to add broadcasting to the Python function ``oct``:

The oct() function converts an integer into an octal string. Octal strings in Python are prefixed with 0o .

In [20]:
oct(3)

'0o3'

In [21]:
oct_array = np.frompyfunc(oct, 1, 1)
print(oct_array)
arr = np.array((10, 30, 100))
print(arr)
print(oct_array(arr))
#comparamos
print(np.array((oct(10), oct(30), oct(100))))

<ufunc '? (vectorized)'>
[ 10  30 100]
['0o12' '0o36' '0o144']
['0o12' '0o36' '0o144']


In [22]:
def add_elements(x, y):
    return x + y

In [23]:
add_elements(4,5)

9

In [30]:
add_them = np.frompyfunc(add_elements, 2, 1)
print(add_them)

<ufunc '? (vectorized)'>


In [31]:
add_them(np.arange(8), np.arange(8))

array([0, 2, 4, 6, 8, 10, 12, 14], dtype=object)

In [34]:
arr = add_them(np.arange(8), np.arange(8))
print(arr)
print(arr.dtype)
#arr.dtype = float -->ERROR: Cannot change data-type for object array.

[0 2 4 6 8 10 12 14]
object


Functions created using frompyfunc always return arrays of Python objects --> THIS CAN BE INCONVENIENT

there is an alternative (but slightly less featureful)
function, 
*numpy.vectorize*, 
that allows you to specify the output type:

In [35]:
add_them = np.vectorize(add_elements, otypes=[np.float64])
arr = add_them(np.arange(8), np.arange(8))
print(arr)
print(arr.dtype)

[ 0.  2.  4.  6.  8. 10. 12. 14.]
float64


These functions provide a way to create ufunc-like functions, but they are very slow compare with ufuncs

Later in this chapter we’ll show how to create fast ufuncs in Python using the 
*Numba projec*

In [36]:
arr = np.random.randn(10000)
%timeit add_them(arr, arr)
%timeit np.add(arr, arr)

1.52 ms ± 42.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.06 µs ± 49.3 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


# A5. Structured and Record Arrays

Un Ndarray es un 'homogeneous data container'

Pero podemos formar *structured array* similar a las filas en una tabla de SQL, de forma que podamos 'mezclar tipos'

ESTRUCTURA TIPICA PARA FORMAR UN *structured array*:

dtype = [('x', npfloat64),('y', np.int32)]

tupla(field_name, field_data_type)

In [3]:
dtype = [('x', np.float64), ('y', np.int32)]
sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)
sarr

array([(1.5   ,  6), (3.1416, -2)], dtype=[('x', '<f8'), ('y', '<i4')])

ahora los elementos del array son objetos con forma de tupla (tuple-like objects)

In [4]:
sarr[0]

(1.5, 6)

In [5]:
sarr[0]['y']

6

In [6]:
sarr['x']

array([1.5   , 3.1416])

In [None]:
#sarr-->array([(sarr[0]),(sarr[1])])
#sarr[0]-->(x dtype,y dtype)

### Nested dtypes and Multidimensional Fields
Cuando especificas un dtype de un *structured array*, adicionalmente le puedes pasar un shape(como entero o tupla)

In [7]:
dtype = [('x', np.int64, 3), ('y', np.int32)]
arr = np.zeros(4, dtype=dtype)
arr


array([([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0)],
      dtype=[('x', '<i8', (3,)), ('y', '<i4')])

In [13]:
print(arr[0])
print("---------")
print(arr[1])
print("---------")
print(arr[2])
print("---------")
print(arr[0]['x'])
arr['x']

([0, 0, 0], 0)
---------
([0, 0, 0], 0)
---------
([0, 0, 0], 0)
---------
[0 0 0]


array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]], dtype=int64)

In [12]:
dtype = [('x', np.int64, 3), ('y', np.int32)]
arr = np.arange(4, dtype = dtype)
arr

ValueError: no fill-function for data-type.

In [14]:
#lo que yo quería hacer
dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]
data = np.array([((1, 2), 5), ((3, 4), 6)], dtype=dtype)
data

array([((1., 2.), 5), ((3., 4.), 6)],
      dtype=[('x', [('a', '<f8'), ('b', '<f4')]), ('y', '<i4')])

In [17]:
print(data['x'])
print(data['x'].dtype)
print(data['y'])
print(data['y'].dtype)
print(data['x']['a'])

[(1., 2.) (3., 4.)]
[('a', '<f8'), ('b', '<f4')]
[5 6]
int32
[1. 3.]
