In [55]:
import numpy as np
from numpy.random import randn

### Arrays

In [56]:
# Build a 2x3 array from nested lists; mixing 0.1 with integers makes every
# element a float64.
arr = np.array([[0.1, 1, 3], [1, 2, 3]])
print(arr)
print(arr.shape)  # (rows, columns)
print(arr.dtype)

[[ 0.1  1.   3. ]
 [ 1.   2.   3. ]]
(2, 3)
float64


In [57]:
# np.zeros takes the desired shape; a single dimension of 4 gives a 1-D
# array of float64 zeros.
arr2 = np.zeros([4])
print(arr2)
print(arr2.shape)
print(arr2.dtype)

[ 0.  0.  0.  0.]
(4,)
float64


#### Array operations

In [58]:
# Arithmetic operators act element-wise: each entry is multiplied by itself
# (this is not a matrix product).
arr * arr

array([[ 0.01,  1.  ,  9.  ],
       [ 1.  ,  4.  ,  9.  ]])

In [59]:
# Element-wise subtraction: every entry minus itself gives 0.
arr - arr

array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])

In [60]:
# A scalar is combined with every element: this prints the reciprocal of each entry.
print(1 / arr)

[[ 10.           1.           0.33333333]
 [  1.           0.5          0.33333333]]


In [61]:
# Element-wise power: raising to 0.5 takes the square root of each entry.
arr ** 0.5

array([[ 0.31622777,  1.        ,  1.73205081],
       [ 1.        ,  1.41421356,  1.73205081]])

#### Array indexes and slicing

Array slices use the same syntax as Python lists, but the result is a view of the original array rather than a new copy. This means that any modification made through the slice also changes the original array.

In [62]:
# arr[1:] is a view onto arr (every row from index 1 on), not a copy.
arr_slice = arr[1:]
# Assigning through the view writes into arr itself, as the printed array shows.
arr_slice[:] = 7, 8, 9
print(arr)

[[ 0.1  1.   3. ]
 [ 7.   8.   9. ]]


In a 2-dim array, elements can also be accessed with the following syntax.

In [63]:
arr[0, 1]  # row 0, column 1; the comma-separated index is passed as a tuple

1.0

In [64]:
arr[0:, :-1] # all rows, every column but the last; the index is a tuple of two slice objects

array([[ 0.1,  1. ],
       [ 7. ,  8. ]])

##### Boolean indexing

Suppose we have a 2-dim array (_data_) in which each row corresponds to a name (_names_), and some of the names are repeated.
We can use Boolean indexing to select the rows we want.

In [65]:
# 6x4 matrix of standard-normal samples; the values differ on every run
# because no seed is set.
data = randn(6, 4)
# One label per row of data; labels may repeat.
names = np.array(['a', 'b', 'c', 'a', 'b', 'a'])
print(data)

[[ 0.7821515  -0.40747181  1.467273   -0.19841888]
 [-0.73327469  1.76989469 -1.22568423 -0.3933416 ]
 [-0.07304037 -0.84896324 -1.22532353  0.93252173]
 [-0.92663304 -0.5271856   0.1444864  -0.37137158]
 [-0.03544821 -1.03232282 -0.79097154 -0.5432489 ]
 [ 0.64473951  0.62685921 -0.62634332  0.81091144]]


In [66]:
# Comparing the array with a scalar gives a boolean array, one entry per name.
names == 'a'

array([ True, False, False,  True, False,  True], dtype=bool)

In [67]:
# The boolean array selects the rows of data whose name is 'a'.
data[names == 'a']

array([[ 0.7821515 , -0.40747181,  1.467273  , -0.19841888],
       [-0.92663304, -0.5271856 ,  0.1444864 , -0.37137158],
       [ 0.64473951,  0.62685921, -0.62634332,  0.81091144]])

In [68]:
# Combine conditions element-wise with |; each comparison needs its own
# parentheses because | binds tighter than ==.
mask = (names == 'c') | (names == 'b')
data[mask, :3]  # rows named 'b' or 'c', first three columns only

array([[-0.73327469,  1.76989469, -1.22568423],
       [-0.07304037, -0.84896324, -1.22532353],
       [-0.03544821, -1.03232282, -0.79097154]])

The Python keywords **and** and **or** will not work with boolean arrays; use the element-wise operators **&** and **|** instead.

We can use it on 2-dim arrays too.

In [69]:
# Element-wise comparison gives a boolean array with the same shape as data.
data < 0

array([[False,  True, False,  True],
       [ True, False,  True,  True],
       [ True,  True,  True, False],
       [ True,  True, False,  True],
       [ True,  True,  True,  True],
       [False, False,  True, False]], dtype=bool)

In [70]:
# Assigning through a boolean mask modifies data in place: every negative
# entry becomes 0.
data[data < 0] = 0
print(data)

[[ 0.7821515   0.          1.467273    0.        ]
 [ 0.          1.76989469  0.          0.        ]
 [ 0.          0.          0.          0.93252173]
 [ 0.          0.          0.1444864   0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.64473951  0.62685921  0.          0.81091144]]


##### Fancy indexing

Fancy indexing selects a subset of an array by passing lists of integer indexes for the rows and columns we want.

In [71]:
# np.empty allocates without initialising, so every row is filled before use.
arr = np.empty((8, 4))
for i in range(8):
    arr[i] = i  # the scalar is broadcast across the whole row

print(arr)

[[ 0.  0.  0.  0.]
 [ 1.  1.  1.  1.]
 [ 2.  2.  2.  2.]
 [ 3.  3.  3.  3.]
 [ 4.  4.  4.  4.]
 [ 5.  5.  5.  5.]
 [ 6.  6.  6.  6.]
 [ 7.  7.  7.  7.]]


In [72]:
# A list of row indices selects those rows, in the order listed.
arr[[1,3,0]]

array([[ 1.,  1.,  1.,  1.],
       [ 3.,  3.,  3.,  3.],
       [ 0.,  0.,  0.,  0.]])

In [73]:
arr[[4,5,6], [1,2,3]] # the lists are paired up: returns elements (4,1), (5,2) and (6,3), not a 3x3 block

array([ 4.,  5.,  6.])

The previous example returned the elements (4,1), (5,2) and (6,3): the two lists are paired up position by position, so they must have the same length. This result is not what one might intuitively expect.

It would make more sense if it returned the rectangular region formed by selecting the subset of the matrix's rows and columns.
We can get this by using **np.ix_**

In [74]:
arr[np.ix_([4,5,6], [1,2])] # rows 4-6 crossed with columns 1-2; the two lists may differ in length

array([[ 4.,  4.],
       [ 5.,  5.],
       [ 6.,  6.]])

##### Transpose of a matrix

In [75]:
# Lay the integers 0..14 out as a 3x5 matrix, filled row by row.
arr = np.arange(15).reshape((3, 5))
arr

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

Transpose can be done with **arr.T** or **arr.transpose()**

In [76]:
# .T gives the transpose: the 3x5 array becomes 5x3.
arr.T

array([[ 0,  5, 10],
       [ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14]])

This can be used to compute the matrix product $X^T X$.

In [77]:
# Matrix product of the transpose (5x3) with arr (3x5), giving a 5x5 result.
np.dot(arr.T, arr)

array([[125, 140, 155, 170, 185],
       [140, 158, 176, 194, 212],
       [155, 176, 197, 218, 239],
       [170, 194, 218, 242, 266],
       [185, 212, 239, 266, 293]])

We can get the transpose with **swapaxes** method, which is a more generic method:

In [78]:
# Swapping axes 0 and 1 of a 2-dim array gives the same result as arr.T.
arr.swapaxes(0,1)

array([[ 0,  5, 10],
       [ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14]])

### Universal Functions

A ufunc is a “vectorized” wrapper for a function that takes a fixed number of scalar inputs and produces a fixed number of scalar outputs.
We often write loops for these operations; with ufuncs you avoid writing them. This practice of replacing explicit loops with array expressions is called **vectorization**.

They can be **unary** (takes only one argument) like *sqrt*, *floor*, *rint*; or **binary** (takes 2 arguments) like *add*, *multiply*, *maximum*.
The full list is available [here](http://docs.scipy.org/doc/numpy/reference/ufuncs.html#available-ufuncs).

In [79]:
# Unary ufunc: square root of each element; a plain list is accepted as input.
np.sqrt([81 , 64, 4])

array([ 9.,  8.,  2.])

In [80]:
# Binary ufunc: element-wise maximum of the two inputs.
np.maximum([-1, 8, 0.3], [1, 7, 10.1])

array([  1. ,   8. ,  10.1])

#### Conditional logic

**np.where(condition, x, y)** is the vectorized equivalent of *x if condition else y*, applied element by element.

In [81]:
xarr = np.array([1, 2, 3])
yarr = np.array([4, 5, 6])
# For each position: take the value from xarr where the condition holds,
# otherwise the value from yarr.
print(np.where(xarr < 2, xarr, yarr))

[1 5 6]


#### Mathematical and Statistical Methods

In [82]:
# 2x2 array of random samples drawn uniformly from [0, 1); values differ on
# every run because no seed is set.
arr = np.random.rand(2, 2)
arr

array([[ 0.58648172,  0.11376387],
       [ 0.77845208,  0.42236568]])

In [83]:
# Without an axis the reduction runs over every element and gives a scalar.
print(arr.mean())
print(arr.mean(axis=0))  # axis=0 collapses the rows: one mean per column
print(arr.sum())
print(arr.sum(axis=1))  # axis=1 collapses the columns: one sum per row

0.475265838052
[ 0.6824669   0.26806477]
1.90106335221
[ 0.70024559  1.20081776]


#### Boolean arrays

In [84]:
# Draw 100 standard-normal samples and flag the strictly positive ones.
arr = randn(100)
bools = arr > 0

In [85]:
# True only if every element is True.
bools.all()

False

In [86]:
# True if at least one element is True.
bools.any()

True

In [87]:
# True counts as 1, so the sum is the number of positive values in arr.
bools.sum()

46

#### Other important functions

In [88]:
# Ten standard-normal samples, used below to demonstrate sorting.
arr = randn(10)

In [89]:
# ndarray.sort() sorts in place (ascending) and returns None, so the array
# is printed afterwards.
arr.sort()
print(arr)

[-1.33969565 -1.30267031 -0.88430822 -0.84811827 -0.56013842 -0.29791448
  0.27589468  0.61020155  1.07601581  1.36789246]


In [90]:
# 'x' appears twice in the input.
x = np.array(['x', 'y', 'z', 'x'])
# np.unique returns the distinct values in sorted order.
np.unique(x)

array(['x', 'y', 'z'], 
      dtype='|S1')