#  this tutorial refers 'python data science handbook'

In [2]:
import numpy as np

# to check which version we are using
np.version.version

In [2]:
 np.version.version

'1.14.3'

# to create a array in numpy

In [3]:
np.array([1,2,3,4,5])

array([1, 2, 3, 4, 5])

In [4]:
# type np. and then press tab to see all components
np

<module 'numpy' from 'C:\\Users\\Mayank\\Anaconda3\\lib\\site-packages\\numpy\\__init__.py'>

In [5]:
# to view numpy documentation
np?

A Python Integer Is More Than Just an Integer
The standard Python implementation is written in C. This means that every Python
object is simply a cleverly disguised C structure, which contains not only its value, but
other information as well. For example, when we define an integer in Python, such as
x = 10000, x is not just a “raw” integer. It’s actually a pointer to a compound C structure,
which contains several values. Looking through the Python 3.4 source code, we
find that the integer (long) type definition effectively looks like this (once the C macros
are expanded):

struct _longobject {
long ob_refcnt;
PyTypeObject *ob_type;
size_t ob_size;
long ob_digit[1];
};
A single integer in Python 3.4 actually contains four pieces:
• ob_refcnt, a reference count that helps Python silently handle memory allocation
and deallocation
• ob_type, which encodes the type of the variable
• ob_size, which specifies the size of the following data members
• ob_digit, which contains the actual integer value that we expect the Python variable
to represent


# difference between a python list and numpy array

In [6]:
# a list in python
l=list(range(10))
l

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [7]:
type(l)

list

In [8]:
type(l[0])

int

In [9]:
l1=[str(c) for c in l]
l1

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

In [10]:
type(l1)

list

In [11]:
type(l1[0])

str

In [12]:
l2=[2,1,'1',bool,True]
p2=[type(c) for c in l2]

In [13]:
p2

[int, int, str, type, bool]

But this flexibility comes at a cost: to allow these flexible types, each item in the list
must contain its own type info, reference count, and other information—that is, each
item is a complete Python object. In the special case that all variables are of the same
type, much of this information is redundant: it can be much more efficient to store
data in a fixed-type array. The difference between a dynamic-type list and a fixed-type
(NumPy-style) array is illustrated in Figure 2-2.
At the implementation level, the array essentially contains a single pointer to one contiguous
block of data. The Python list, on the other hand, contains a pointer to a
block of pointers, each of which in turn points to a full Python object like the Python
integer we saw earlier. Again, the advantage of the list is flexibility: because each list
element is a full structure containing both data and type information, the list can be
filled with data of any desired type. Fixed-type NumPy-style arrays lack this flexibility,
but are much more efficient for storing and manipulating data.

# in built in array module for fixed type ,efficient data

In [14]:
import array

In [15]:
l=list(range(10))

In [16]:
Array=array.array('i',l)
Array
# here 'i' represents it is a array of type integer

array('i', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

# numpy arrays

In [17]:
import numpy as np

In [18]:
# using a list to create an array
np.array([1,2,3,4,5])

array([1, 2, 3, 4, 5])

In [19]:
# all the types in numpy array should be same if not values are upcasted if possible
np.array([1.0,2,3,4,5])
# converts everyone integer to floating point

array([1., 2., 3., 4., 5.])

In [20]:
np.array([1,2,3,4,'5'])

array(['1', '2', '3', '4', '5'], dtype='<U11')

In [21]:
# if we want to set data type of a array explicitly
np.array([1,2,3,4,5],dtype='float32')

array([1., 2., 3., 4., 5.], dtype=float32)

In [22]:
# creating a multidimensional array in numpy
x=np.array([range(3)])
print(x)
np.array([range(i,i+3) for i in [2,4,6]])

[[0 1 2]]


array([[2, 3, 4],
       [4, 5, 6],
       [6, 7, 8]])

#  create a numpy array from scratch

In [23]:
# create a size specified array of only 0, np.zeros()
np.zeros(10)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [24]:
# mention data type of array explicitly
np.zeros(10,dtype=int)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [25]:
# create a 3x5 array filled with one,np.ones()
np.ones((3,5),dtype=float)

array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])

In [26]:
# a 3x5 matrix filled with zeros
np.zeros((3,5),dtype=int)

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [27]:
# create a np array filled with 3.14,np.full()
np.full(10,3.14)

array([3.14, 3.14, 3.14, 3.14, 3.14, 3.14, 3.14, 3.14, 3.14, 3.14])

In [28]:
np.full((3,5),3.14)

array([[3.14, 3.14, 3.14, 3.14, 3.14],
       [3.14, 3.14, 3.14, 3.14, 3.14],
       [3.14, 3.14, 3.14, 3.14, 3.14]])

In [29]:
# Create an array filled with a linear sequence
# Starting at 0, ending at 20, stepping by 2
np.arange(0,20,2)

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18])

In [30]:
# create a equi spaced array from 0 to 1 with 5 elements
np.linspace(0,1,5)

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [31]:
# create a 3x3 array filled with random value between 0 and 1
np.random.random((3,5))

array([[0.582304  , 0.05535802, 0.85467676, 0.77138021, 0.39423393],
       [0.62210639, 0.91547998, 0.41522294, 0.78414029, 0.88678171],
       [0.00606036, 0.24952487, 0.9159058 , 0.25597481, 0.1492066 ]])

In [32]:
# create a 3x3e array with mean 0 and standard deviation 1
np.random.normal(0,1,(3,3))

array([[ 1.02562985,  1.44212472,  0.26869509],
       [ 1.43089977, -0.28481875, -0.57813688],
       [ 0.73498751,  0.31881325, -0.78513676]])

In [33]:
# mean 2 and standard deviation 2
np.random.normal(2,2,(3,3))

array([[ 4.59974333,  1.8975902 , -0.48358065],
       [ 6.40560116,  1.73393077,  2.06022491],
       [-0.7446708 ,  3.0775509 ,  1.33461257]])

In [34]:
# create a 3x3 array fileed with random values from 0 to 10
np.random.randint(0,10,(3,3))

array([[1, 0, 9],
       [5, 4, 3],
       [2, 6, 2]])

In [35]:
# create a 3x3 identity matrix
np.eye(3,dtype=int)

array([[1, 0, 0],
       [0, 1, 0],
       [0, 0, 1]])

In [36]:
# create a empty array of size 3 ,gets filled with whatever values are already exist
np.empty((3,3))

array([[4.59974333, 1.8975902 , 0.48358065],
       [6.40560116, 1.73393077, 2.06022491],
       [0.7446708 , 3.0775509 , 1.33461257]])

In [37]:
np.empty(3)

array([1.00221861e-310, 2.79165906e+207, 4.26547482e+202])

#  numpy standard data types

In [38]:
np.zeros(10,dtype=int)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [39]:
np.zeros(10,dtype=float)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [40]:
np.zeros(10,dtype='int16')

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int16)

In [41]:
np.zeros(10,dtype=np.int16)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int16)

In [42]:
# refer page 41 of the book to view more supported dtypes

#  basics of numpy array

This section
will present several examples using NumPy array manipulation to access data
and subarrays, and to split, reshape, and join the arrays.

## array attributes

In [43]:
np.random.seed(0)
x1=np.random.randint(10,size=6)

In [44]:
# every np array has attributes ndim,shape,size for its dimension,shape, and total size
x1.ndim

1

In [45]:
x1.shape

(6,)

In [46]:
x1.size

6

In [47]:
x2=np.random.randint(10,size=(2,3))

In [48]:
x2.ndim

2

In [49]:
x2.shape

(2, 3)

In [50]:
x2.size

6

In [51]:
# we also have dtype to see data type of array
x2.dtype

dtype('int32')

In [52]:
x2.itemsize

4

In [53]:
x2.nbytes

24

In [54]:
# accessing array elememts
x1

array([5, 0, 3, 3, 7, 9])

In [55]:
x1[0]

5

In [56]:
# to index from the end use -1 and so on
x1[-1]

9

In [57]:
x1[-2]

7

In [58]:
x2

array([[3, 5, 2],
       [4, 7, 6]])

In [59]:
# to access elements in a multidim array
print(x2[0,0])
print(x2[(0,0)])

3
3


In [60]:
print(x2[0,1])

5


In [61]:
## modify values using abov notation
x2[0,1]=10

In [62]:
x2

array([[ 3, 10,  2],
       [ 4,  7,  6]])

In [63]:
# if we assign a float value it will be truncated because its a int array
x2[0,0]=99.15
x2

array([[99, 10,  2],
       [ 4,  7,  6]])

## array slicing,sub arrays

Just as we can use square brackets to access individual array elements, we can also use
them to access subarrays with the slice notation, marked by the colon (:) character.
The NumPy slicing syntax follows that of the standard Python list; to access a slice of
an array x, use this:
x[start:stop:step]
If any of these are unspecified, they default to the values start=0, stop=size of
dimension, step=1.

In [64]:
x=np.arange(10)
x

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [65]:
# get first 5 element subarray
x[:5]

array([0, 1, 2, 3, 4])

In [66]:
x[2:5]

array([2, 3, 4])

In [67]:
x[5:]

array([5, 6, 7, 8, 9])

In [68]:
# steps of 2
x[::2]

array([0, 2, 4, 6, 8])

In [69]:
x[1::2]

array([1, 3, 5, 7, 9])

In [70]:
# if step value is negative , start and stop are swapped, easy way to reverse an array
x[::-1]

array([9, 8, 7, 6, 5, 4, 3, 2, 1, 0])

In [71]:
x[5::-2]

array([5, 3, 1])

### multidimensional array slicing

In [72]:
x2=np.random.randint(20,size=(3,4))

In [73]:
x2

array([[12,  1,  6,  7],
       [14, 17,  5, 13],
       [ 8,  9, 19, 16]])

In [74]:
x2[:,:]

array([[12,  1,  6,  7],
       [14, 17,  5, 13],
       [ 8,  9, 19, 16]])

In [75]:
# upto second row and second column
x2[:2,:2]

array([[12,  1],
       [14, 17]])

In [76]:
# all rows alternate columns
x2[:,::2]

array([[12,  6],
       [14,  5],
       [ 8, 19]])

In [77]:
# reversin array altogether
x2[::-1,::-1]

array([[16, 19,  9,  8],
       [13,  5, 17, 14],
       [ 7,  6,  1, 12]])

In [78]:
# accessing 1st column of an array
print(x2)
print(x2[:,0])

[[12  1  6  7]
 [14 17  5 13]
 [ 8  9 19 16]]
[12 14  8]


In [79]:
# print first row of x2
print(x2[0,:])

[12  1  6  7]


In [80]:
# also can be used
print(x2[0])

[12  1  6  7]


One important—and extremely useful—thing to know about array slices is that they
return views rather than copies of the array data. This is one area in which NumPy
array slicing differs from Python list slicing: in lists, slices will be copies.

In [81]:
print(x2)

[[12  1  6  7]
 [14 17  5 13]
 [ 8  9 19 16]]


In [82]:
x2_sub=x2[:2,:2]

In [83]:
x2_sub

array([[12,  1],
       [14, 17]])

In [84]:
x2_sub[0,0]=45

In [85]:
# we see that x2 is changed as well despite making changes only in x2_sub, 
# this is because np array slicing return views instead of copy of array data
print(x2)

[[45  1  6  7]
 [14 17  5 13]
 [ 8  9 19 16]]


### create copies

In [86]:
# to create copy we use copy()
x2_sub_copy=x2[:2,:2].copy()

In [87]:
x2_sub_copy

array([[45,  1],
       [14, 17]])

In [88]:
x2_sub_copy[0,0]=100

In [89]:
# x2 has no change in this case
x2

array([[45,  1,  6,  7],
       [14, 17,  5, 13],
       [ 8,  9, 19, 16]])

## reshaping arrays

In [90]:
np.arange(9)

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [91]:
# to reshape easiest way is to use reshape()
np.arange(9).reshape((3,3))

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [92]:
# note that for this to work size of original array should match the size of the new array
np.arange(9).reshape((3,4))

ValueError: cannot reshape array of size 9 into shape (3,4)

In [93]:
#Another common reshaping pattern is the conversion of a one-dimensional array
#into a two-dimensional row or column matrix. You can do this with the reshape
#method, or more easily by making use of the newaxis keyword within a slice operation:
x=np.array([1,2,3])
x.reshape((1,3))

array([[1, 2, 3]])

In [None]:
#can also use np.newaxis in slicing mode
x[np.newaxis,:]

In [None]:
x.reshape((3,1))

In [None]:
x[:,np.newaxis]

## array concatination and splitting

All of the preceding routines worked on single arrays. It’s also possible to combine
multiple arrays into one, and to conversely split a single array into multiple arrays.
We’ll take a look at those operations here.

### concatenation of arrays

Concatenation, or joining of two arrays in NumPy, is primarily accomplished
through the routines np.concatenate, np.vstack, and np.hstack. np.concatenate
takes a tuple or list of arrays as its first argument, as we can see here:

In [None]:
x=np.array([1,2,3])
y=np.array([4,5,6])
np.concatenate([x,y])

In [None]:
z=np.array([7,8,9])
np.concatenate([x,y,z])

In [None]:
p=np.array([[1,2,3],[4,5,6]])
q=np.array([[7,8,9],[10,11,12]])
np.concatenate([p,q])

In [None]:
np.concatenate([p,q],axis=1)

In [None]:
# vertical stack
x=np.array([1,2,3])
y=np.array([[4,5,6],[7,8,9]])
np.vstack([x,y])

In [None]:
x=np.array([[99],[99]])
np.hstack([x,y])

### splitting array

The opposite of concatenation is splitting, which is implemented by the functions
np.split, np.hsplit, and np.vsplit. For each of these, we can pass a list of indices
giving the split points:

In [None]:
x=np.array([1,2,3,4,5,6,7,8,9])

In [None]:
#Notice that N split points lead to N + 1 subarrays.
y1,y2,y3=np.split(x,[3,5])

In [None]:
y1

In [None]:
y2

In [None]:
y3

In [None]:
y1,y2,y3,y4=np.split(x,[3,5,8])
print(y1)
print(y2)
print(y3)
print(y4)

In [None]:
grid=np.arange(16).reshape([4,4])

In [None]:
grid

In [None]:
upper,lower=np.vsplit(grid,[2])
# try changing value of 2 to see result change

In [None]:
print(upper)
print(lower)

In [None]:
left,right=np.hsplit(grid,[2])
grid

In [None]:
print(left)
print(right)

## computation on numpy arrays

Up until now, we have been discussing some of the basic nuts and bolts of NumPy; in
the next few sections, we will dive into the reasons that NumPy is so important in the
Python data science world. Namely, it provides an easy and flexible interface to optimized
computation with arrays of data.
Computation on NumPy arrays can be very fast, or it can be very slow. The key to
making it fast is to use vectorized operations, generally implemented through Num‐
Py’s universal functions (ufuncs). This section motivates the need for NumPy’s ufuncs,
which can be used to make repeated calculations on array elements much more efficient.
It then introduces many of the most common and useful arithmetic ufuncs
available in the NumPy package.

In [None]:
# lets calculate reciprocal of an array elements
l=np.random.randint(1,10,size=5)
l

In [None]:
def reciprocal(l):
    output=np.empty(len(l))
    for i in range(len(l)):
        output[i]=1/l[i]
    (output)

In [None]:
# lets calculate time taken for this loop for 5 elements by using timeit
%timeit reciprocal(l)

In [None]:
# lets calculate for 1000000 elements
l=np.arange(1,1000001)
print(len(l))

In [None]:
# this takes lot of time to compute
%timeit reciprocal(l)

It takes several seconds to compute these million operations and to store the result!
When even cell phones have processing speeds measured in Giga-FLOPS (i.e., billions
of numerical operations per second), this seems almost absurdly slow. It turns
out that the bottleneck here is not the operations themselves, but the type-checking
and function dispatches that CPython must do at each cycle of the loop. Each time
the reciprocal is computed, Python first examines the object’s type and does a
dynamic lookup of the correct function to use for that type. If we were working in
compiled code instead, this type specification would be known before the code executes
and the result could be computed much more efficiently.

In [None]:
%timeit reciprocal(l)

In [None]:
# directly performing operation on array
%timeit (1.0/l)

Looking at the execution time for our big array, we see that it completes orders of
magnitude faster than the Python loop:
For many types of operations, NumPy provides a convenient interface into just this
kind of statically typed, compiled routine. This is known as a vectorized operation.
You can accomplish this by simply performing an operation on the array, which will
then be applied to each element. This vectorized approach is designed to push the
loop into the compiled layer that underlies NumPy, leading to much faster execution.

Vectorized operations in NumPy are implemented via ufuncs, whose main purpose is
to quickly execute repeated operations on values in NumPy arrays. Ufuncs are
extremely flexible—before we saw an operation between a scalar and an array, but we
can also operate between two arrays:

In [None]:
np.arange(5)/np.arange(1,6)

And ufunc operations are not limited to one-dimensional arrays—they can act on
multidimensional arrays as well:

In [None]:
x=np.arange(9).reshape((3,3))
x

In [None]:
#each element replaced by 2^that element
2**x

In [None]:
# each element^2
x**2

## exploring numpy's ufuncs 

Ufuncs exist in two flavors: unary ufuncs, which operate on a single input, and binary
ufuncs, which operate on two inputs. We’ll see examples of both these types of functions
here.

### array arithmetic

In [None]:
x=np.arange(1,5)
x

In [None]:
print("x+5=",x+5)

In [None]:
print("x-5=",x-5)

In [None]:
print("x/2=",x/2)

In [None]:
print("x*2=",x*2)

In [None]:
print("x//2=",x//2)

In [None]:
# ** for exponent
# % for modulus
print(x**2)
print(x%2)

In [None]:
# In addition, these can be strung together however you wish, and the standard order
# of operations is respected:
-(.5*x+1)**2

In [None]:
# arithmetic operations implemented in numpy
print(x)
print(np.add(x,2)) # x+2
print(np.subtract(x,2)) #x-2
print(np.negative(x)) #-x
print(np.multiply(x,2)) #x*2
print(np.divide(x,2)) #x/2
print(np.floor_divide(x,2)) #x//2
print(np.power(x,2)) #x^2
print(np.mod(x,2)) #x%2
# we will also see boolean and bitwise operations later

In [None]:
# inbuilt absolute function
y=np.array([-1,-2,-3,-4])
print(abs(y))
print(np.absolute(y))
print(np.abs(y))

### trigonometric functions

In [None]:
theta=np.linspace(0,np.pi,3)
theta

In [None]:
x=np.sin(theta)
x

In [None]:
np.cos(theta)

In [None]:
np.tan(theta)

In [None]:
#The values are computed to within machine precision, which is why values that
#should be zero do not always hit exactly zero. Inverse trigonometric functions are also
#available:

np.arcsin(x)

In [None]:
# exponential  
np.exp(x) #e^x

In [None]:
x=np.array([1,0,2]) #2^x
np.exp2(x)

In [None]:
np.power(3,x)
#3^x

In [None]:
# log functions
x=np.array([1,2,3,4])
np.log(x) #ln(x)

In [None]:
np.log2(x) #log2(x)

In [None]:
np.log10(x) #log10(x)

In [None]:
# when x is very small ,use this function istead of above discussed..these are expm1 and log1p as they give more accurate result for smaller value
x=np.array([0,0.1,0.01,0.001])

In [None]:
np.expm1(x) #exp(x)-1

In [None]:
np.log1p(x) # log(1+x)

# advanced ufuncs features

### specifying output

For large calculations, it is sometimes useful to be able to specify the array where the
result of the calculation will be stored. Rather than creating a temporary array, you
can use this to write computation results directly to the memory location where you’d
like them to be. For all ufuncs, you can do this using the out argument of the
function:

In [None]:
x=np.arange(5)
print("value in x =",x)
y=np.empty(5,dtype=int)
np.multiply(x,2,out=y)
print("value of y =",y)

In [None]:
#This can even be used with array views. For example, we can write the results of a
#computation to every other element of a specified array:

y=np.zeros(10)
np.power(2,x,out=y[::2])
print(y)

### aggregates

For binary ufuncs, there are some interesting aggregates that can be computed
directly from the object. For example, if we’d like to reduce an array with a particular
operation, we can use the reduce method of any ufunc. A reduce repeatedly applies a
given operation to the elements of an array until only a single result remains.


For example, calling reduce on the add ufunc returns the sum of all elements in the
array:

In [None]:
x=np.arange(1,6)
print(x)
np.add.reduce(x)

In [None]:
np.multiply.reduce(x)

In [None]:
# if we like to store all the intermediate values , we can use accumulate
np.add.accumulate(x)

In [None]:
np.multiply.accumulate(x)

### outer product

Finally, any ufunc can compute the output of all pairs of two different inputs using
the outer method. This allows you, in one line, to do things like create a multiplication
table:

In [None]:
x=np.arange(1,6)
np.multiply.outer(x,x)

In [None]:
np.add.outer(x,x)

## aggregations: min, max and everything in between

Often when you are faced with a large amount of data, a first step is to compute summary
statistics for the data in question. Perhaps the most common summary statistics
are the mean and standard deviation, which allow you to summarize the “typical” values
in a dataset, but other aggregates are useful as well (the sum, product, median,
minimum and maximum, quantiles, etc.).

NumPy has fast built-in aggregation functions for working on arrays; we’ll discuss
and demonstrate some of them here.

### summing values in array

In [None]:
#As a quick example, consider computing the sum of all values in an array. Python
#itself can do this using the built-in sum function:
x=np.random.randint(1,100,size=10000)
sum(x)

In [None]:
# numpy sum function
np.sum(x)

In [None]:
# However, because it executes the operation in compiled code, NumPy’s version of the
#operation is computed much more quickly:
%timeit sum(x)

In [None]:
%timeit np.sum(x)

Be careful, though: the sum function and the np.sum function are not identical, which
can sometimes lead to confusion! In particular, their optional arguments have different
meanings, and np.sum is aware of multiple array dimensions, as we will see in the
following section.

### minimum and maximum

Similarly, Python has built-in min and max functions, used to find the minimum value
and maximum value of any given array:

In [None]:
x=np.arange(50,100000)

In [None]:
# python inbuilt min and numpy min ,again where numpy implementation is much fatser
print(min(x))
np.min(x)

In [None]:
print(max(x))
np.max(x)

In [None]:
%timeit min(x)

In [None]:
%timeit np.min(x)

In [None]:
# we can also use methods of array objects
x.min(),x.max()

Whenever possible, make sure that you are using the NumPy version of these aggregates
when operating on NumPy arrays!

### multidimensional aggregates

One common type of aggregation operation is an aggregate along a row or column.
Say you have some data stored in a two-dimensional array:

In [None]:
x=np.random.random((3,4))
x

In [None]:
# by default over entire table
x.sum()

Aggregation functions take an additional argument specifying the axis along which
the aggregate is computed. For example, we can find the minimum value within each
column by specifying axis=0:

In [None]:
# The function returns four values, corresponding to the four columns of numbers.
x.sum(axis=0)

In [None]:
x.sum(axis=1)

In [None]:
# to find max value within each row
x.max(axis=1)

The way the axis is specified here can be confusing to users coming from other languages.
The axis keyword specifies the dimension of the array that will be collapsed,
rather than the dimension that will be returned. So specifying axis=0 means that the
first axis will be collapsed: for two-dimensional arrays, this means that values within
each column will be aggregated.

### other aggregate functions

NumPy provides many other aggregation functions, but we won’t discuss them in
detail here. Additionally, most aggregates have a NaN-safe counterpart that computes
the result while ignoring missing values, which are marked by the special IEEE
floating-point NaN value
Some of these NaN-safe functions were not added until
NumPy 1.8, so they will not be available in older NumPy versions.

In [None]:
np.sum(x)

In [None]:
# nan support sum aggregate
np.nansum(x)

np.sum np.nansum Compute sum of elements

np.prod np.nanprod Compute product of elements

np.mean np.nanmean Compute median of elements

np.std np.nanstd Compute standard deviation

np.var np.nanvar Compute variance

np.min np.nanmin Find minimum value

np.max np.nanmax Find maximum value

np.argmin np.nanargmin Find index of minimum value

np.argmax np.nanargmax Find index of maximum value

np.median np.nanmedian Compute median of elements

np.percentile np.nanpercentile Compute rank-based statistics of elements

np.any N/A Evaluate whether any elements are true

np.all N/A Evaluate whether all elements are true

## computation on arrays : broadcasting

We saw in the previous section how NumPy’s universal functions can be used to vectorize
operations and thereby remove slow Python loops. Another means of vectorizing
operations is to use NumPy’s broadcasting functionality. Broadcasting is simply a
set of rules for applying binary ufuncs (addition, subtraction, multiplication, etc.) on
arrays of different sizes.

### introducing broadcasting

In [None]:
x=np.array([1,2,3,4])
y=np.array([5,6,7,8])

In [None]:
x+y

Broadcasting allows these types of binary operations to be performed on arrays of different
sizes—for example, we can just as easily add a scalar (think of it as a zerodimensional
array) to an array:

In [None]:
x+5

We can think of this as an operation that stretches or duplicates the value 5 into the
array [5, 5, 5, 5], and adds the results. The advantage of NumPy’s broadcasting is that
this duplication of values does not actually take place, but it is a useful mental model
as we think about broadcasting.

We can similarly extend this to arrays of higher dimension. Observe the result when
we add a one-dimensional array to a two-dimensional array:

In [None]:
arr=np.arange(9).reshape((3,3))

In [None]:
arr

In [None]:
z=[1,2,3]
arr+z

Here the one-dimensional array a is stretched, or broadcast, across the second
dimension in order to match the shape of M.

While these examples are relatively easy to understand, more complicated cases can
involve broadcasting of both arrays. Consider the following example:

In [None]:
a=np.arange(3)
a

In [None]:
b=np.arange(3)[:,np.newaxis]
b

In [None]:
a+b


Rules of Broadcasting
Broadcasting in NumPy follows a strict set of rules to determine the interaction
between the two arrays:

• Rule 1: If the two arrays differ in their number of dimensions, the shape of the
one with fewer dimensions is padded with ones on its leading (left) side.

• Rule 2: If the shape of the two arrays does not match in any dimension, the array
with shape equal to 1 in that dimension is stretched to match the other shape.

• Rule 3: If in any dimension the sizes disagree and neither is equal to 1, an error is
raised.

In [None]:
# example where one array is broadcasted
m=np.ones((3,3))

In [None]:
a=np.arange(3)
# a is broadcasted

In [None]:
m+a

In [None]:
# example where both arrays are broadcasted

In [None]:
m=np.array([0,1,2])

In [None]:
n=np.array([0,1,2]).reshape((3,1))

In [None]:
m+n

In [None]:
# example where 2 arrays are not compatible

In [None]:
m=np.ones((3,2))
n=np.arange(3)

In [None]:
m+n

We saw that using +, -, *, /,
and others on arrays leads to element-wise operations. NumPy also implements comparison
operators such as < (less than) and > (greater than) as element-wise ufuncs.
The result of these comparison operators is always an array with a Boolean data type.
All six of the standard comparison operations are available:

In [None]:
x=np.array([1,2,3,4])
x<3

In [None]:
x>3

In [None]:
x<=3

In [None]:
x>=3

In [None]:
x!=3

In [None]:
x==3

It is also possible to do an element-by-element comparison of two arrays, and to
include compound expressions:

In [None]:
x*2==x**2

As in the case of arithmetic operators, the comparison operators are implemented as
ufuncs in NumPy; for example, when you write x < 3, internally NumPy uses
np.less(x, 3). A summary of the comparison operators and their equivalent ufunc
is shown here:
Operator    Equivalent ufunc

== np.equal

!= np.not_equal

< np.less

<= np.less_equal

, > np.greater

, >= np.greater_equal

In [None]:
# works on 2d array
x=np.random.randint(0,10,(3,3))

In [None]:
x

In [None]:
x<4

In [None]:
# To count the number of True entries in a Boolean array, np.count_nonzero is useful
np.count_nonzero(x<4)

Another way to get at this
information is to use np.sum; in this case, False is interpreted as 0, and True is interpreted
as 1:

In [None]:
np.sum(x<4)

In [None]:
#The benefit of sum() is that like with other NumPy aggregation functions, this summation
# can be done along rows or columns as well:

## how many values less than 4 in each row?
np.sum(x<4,axis=1)
# This counts the number of values less than 4 in each row of the matrix.

In [None]:
# If we’re interested in quickly checking whether any or all the values are true, we can
# use (you guessed it) np.any() or np.all():
np.any(x<6)

In [None]:
np.all(x<6)

In [None]:
np.any(x<6,axis=1)

## boolean operation

In [None]:
x=np.arange(0,10,2)

In [None]:
np.sum((x>3) & (x<6))

& np.bitwise_and

| np.bitwise_or

^ np.bitwise_xor

~ np.bitwise_not

In [None]:
np.bitwise_and(x,1)

## boolean array as masks

In the preceding section, we looked at aggregates computed directly on Boolean
arrays. A more powerful pattern is to use Boolean arrays as masks, to select particular
subsets of the data themselves. Returning to our x array from before, suppose we
want an array of all values in the array that are less than, say, 5:

In [None]:
x=np.arange(9).reshape((3,3))

In [None]:
x<5

In [None]:
x[x<5]

## Using the Keywords and/or Versus the Operators &/|

One common point of confusion is the difference between the keywords and and or
on one hand, and the operators & and | on the other hand. When would you use one
versus the other?
The difference is this: and and or gauge the truth or falsehood of entire object, while &
and | refer to bits within each object.
When you use and or or, it’s equivalent to asking Python to treat the object as a single
Boolean entity. In Python, all nonzero integers will evaluate as True. Thus:

In [None]:
bool(42),bool(0)

In [None]:
bool(42 and 0)

In [None]:
42 and 0

In [None]:
42 and 11

In [None]:
1 and 2

In [None]:
2 and 1

When you use & and | on integers, the expression operates on the bits of the element,
applying the and or the or to the individual bits making up the number:

In [None]:
42 & 20

In [None]:
3 & 1

In [None]:
1 & 3

In [None]:
3 | 2

In [None]:
4|2

In [None]:
bin(2)

In [None]:
bin(4)

In [None]:
bin(6)

In [None]:
bin(4|2)

In [None]:
 #When you have an array of Boolean values in NumPy, this can be thought of as a
#string of bits where 1 = True and 0 = False, and the result of & and | operates in a
#similar manner as before:

a=np.array([1,0,1,0,1])
b=np.array([0,1,1,1,0])
a|b

In [None]:
a=np.array([1,0,1,0,1],dtype=bool)
b=np.array([0,1,1,1,0],dtype=bool)
a|b

In [None]:
a=np.array([1,0,1,0,1],dtype=bool)
b=np.array([0,1,1,1,0],dtype=bool)
a&b

In [None]:
a |b

So remember this: and and or perform a single Boolean evaluation on an entire
object, while & and | perform multiple Boolean evaluations on the content (the individual
bits or bytes) of an object. For Boolean NumPy arrays, the latter is nearly
always the desired operation.

## fancy indexing

In the previous sections, we saw how to access and modify portions of arrays using
simple indices (e.g., arr[0]), slices (e.g., arr[:5]), and Boolean masks (e.g., arr[arr> 0] )

In this section, we’ll look at another style of array indexing, known as fancy
indexing. Fancy indexing is like the simple indexing we’ve already seen, but we pass
arrays of indices in place of single scalars. This allows us to very quickly access and
modify complicated subsets of an array’s values.

Fancy indexing is conceptually simple: it means passing an array of indices to access
multiple array elements at once. For example, consider the following array:

In [None]:
x=np.random.randint(100,size=20)

In [None]:
x

In [None]:
# suppose we want to access 3 different element 
[x[2],x[9],x[17]]

In [None]:
ind=[2,9,17]
x[ind]

In [None]:
ind=np.array([[1,2],[3,4]])
x[ind]

In [None]:
# fancy indexing also works on multiple dimension
x=np.arange(12).reshape((3,4))
x

In [None]:
row=np.array([0,1,2])
col=np.array([1,2,3])
x[row,col]
# first value is x[0,1] then x[1,2] then x[2,3]

In [None]:
x[row[:,np.newaxis],col]

### combined indexing

In [None]:
x

In [None]:
x[2,[0,1,2]]

In [None]:
x[1:,[2,0,1]]

In [None]:
# we can combine fancy indexing with masking
mask=np.array([1,0,1,0],dtype=bool)
x[row[:,np.newaxis],mask]

### modifying values with fancy indexing

Just as fancy indexing can be used to access parts of an array, it can also be used to
modify parts of an array. For example, imagine we have an array of indices and we’d
like to set the corresponding items in an array to some value:

In [3]:
x=np.arange(10)
y=np.array([1,2,3,4])
x[y]=97
x

array([ 0, 97, 97, 97, 97,  5,  6,  7,  8,  9])

Notice, though, that repeated indices with these operations can cause some potentially
unexpected results. Consider the following:

In [6]:
x[[0,0]]=[4,6]
x
# x[0] first changes to 4 then to 6

array([ 6, 97, 97, 97, 97,  5,  6,  7,  8,  9])

In [8]:
# but....
x=np.zeros(10)
x[[0,0]]-=10
x
# x[0] should be changed to -10 and then -20 but as we see it is still -10

array([-10.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.])

Conceptually, this is because x[i] += 1 is meant as a shorthand of x[i] = x[i] + 1.
x[i] + 1 is evaluated, and then the result is assigned to the indices in x. With this in
mind, it is not the augmentation that happens multiple times, but the assignment,
which leads to the rather nonintuitive results.

So what if you want the other behavior where the operation is repeated? For this, you
can use the at() method of ufuncs (available since NumPy 1.8), and do the following:

In [9]:
i=[2,3,3,4,4,4]
x=np.zeros(10)
np.add.at(x,i,1)
x

array([0., 0., 1., 2., 3., 0., 0., 0., 0., 0.])

The at() method does an in-place application of the given operator at the specified
indices (here, i) with the specified value (here, 1). Another method that is similar in
spirit is the reduceat() method of ufuncs, which you can read about in the NumPy
documentation.

## sorting arrays

Up to this point we have been concerned mainly with tools to access and operate on
array data with NumPy. This section covers algorithms related to sorting values in
NumPy arrays. These algorithms are a favorite topic in introductory computer science
courses: if you’ve ever taken one, you probably have had dreams (or, depending
on your temperament, nightmares) about insertion sorts, selection sorts, merge sorts,
quick sorts, bubble sorts, and many, many more. All are means of accomplishing a
similar task: sorting the values in a list or array.

For example, a simple selection sort repeatedly finds the minimum value from a list,
and makes swaps until the list is sorted. We can code this in just a few lines of Python

In [13]:
def selectionsort(x):
    for i in range(len(x)):
        swap=i+np.argmin(x[i:])
        (x[swap],x[i])=(x[i],x[swap])
    return x
x=np.array([2,3,1,6,4,0])
selectionsort(x)
# o(n^2) complexity

array([0, 1, 2, 3, 4, 6])

In [16]:
def bogosort(x):
    while np.any(x[:-1]>x[1:]):
        np.random.shuffle(x)
    print(x)
bogosort([9,8,6,5,7,3])
# algo that sorts on chance basis

[5, 8, 6, 9, 3, 7]


Fortunately, Python contains built-in sorting algorithms that are much more efficient
than either of the simplistic algorithms just shown. We’ll start by looking at the
Python built-ins, and then take a look at the routines included in NumPy and optimized
for NumPy arrays.

## fast sorting in numpy: np.sort() and np.argsort()

Although Python has built-in sort and sorted functions to work with lists, we won’t
discuss them here because NumPy’s np.sort function turns out to be much more
efficient and useful for our purposes. By default np.sort uses an 0 (N log N ), quicksort
algorithm, though mergesort and heapsort are also available. For most applications,
the default quicksort is more than sufficient.

In [22]:
# does not modify original input array
x=np.array([1,5,3,8,5,2])
print(np.sort(x))
print(x)

[1 2 3 5 5 8]
[1 5 3 8 5 2]


In [23]:
# sort in- place
x.sort()
x

array([1, 2, 3, 5, 5, 8])

In [26]:
# A related function is argsort, which instead returns the indices of the sorted elements:
x=np.array([1,5,2,3,9,0,4])
y=np.argsort(x)
y
# 5th index element places first
# 0th index element places next and so on

array([5, 0, 2, 3, 6, 1, 4], dtype=int64)

In [27]:
# These indices can then be used (via fancy indexing) to construct the sorted array if desired:
x[y]

array([0, 1, 2, 3, 4, 5, 9])

### sorting along rows or columns

In [32]:
np.random.seed(0)
x=np.random.randint(0,10,(4,6))
x

array([[5, 0, 3, 3, 7, 9],
       [3, 5, 2, 4, 7, 6],
       [8, 8, 1, 6, 7, 7],
       [8, 1, 5, 9, 8, 9]])

In [33]:
# sort each column of x
np.sort(x,axis=0)

array([[3, 0, 1, 3, 7, 6],
       [5, 1, 2, 4, 7, 7],
       [8, 5, 3, 6, 7, 9],
       [8, 8, 5, 9, 8, 9]])

In [34]:
# sort each row of x
np.sort(x,axis=1)

array([[0, 3, 3, 5, 7, 9],
       [2, 3, 4, 5, 6, 7],
       [1, 6, 7, 7, 8, 8],
       [1, 5, 8, 8, 9, 9]])

## partial sorting : partitioning

Sometimes we’re not interested in sorting the entire array, but simply want to find the
K smallest values in the array. NumPy provides this in the np.partition function.
np.partition takes an array and a number K; the result is a new array with the smallest
K values to the left of the partition, and the remaining values to the right, in arbitrary
order:

In [36]:
x=np.array([4,7,2,7,5,9,0,5,3])
np.partition(x,3)
#Note that the first three values in the resulting array are the three smallest in the
# array, and the remaining array positions contain the remaining values. Within the
# two partitions, the elements have arbitrary order.


array([0, 3, 2, 4, 7, 9, 7, 5, 5])

In [40]:
# we can alos apply this on multidim array
np.random.seed(0)
x=np.random.randint(0,10,(4,6))
x

array([[5, 0, 3, 3, 7, 9],
       [3, 5, 2, 4, 7, 6],
       [8, 8, 1, 6, 7, 7],
       [8, 1, 5, 9, 8, 9]])

In [41]:
np.partition(x,2,axis=1)
# The result is an array where the first two slots in each row contain the smallest values
# from that row, with the remaining values filling the remaining slots.

array([[0, 3, 3, 5, 7, 9],
       [2, 3, 4, 5, 7, 6],
       [1, 6, 7, 8, 8, 7],
       [1, 5, 8, 9, 8, 9]])

Finally, just as there is a np.argsort that computes indices of the sort, there is a
np.argpartition that computes indices of the partition.

## numpy structured data , structured array

While often our data can be well represented by a homogeneous array of values,
sometimes this is not the case. This section demonstrates the use of NumPy’s structured
arrays and record arrays, which provide efficient storage for compound, hetero‐
geneous data. While the patterns shown here are useful for simple operations,
scenarios like this often lend themselves to the use of Pandas DataFrames, which we’ll
explore

Imagine that we have several categories of data on a number of people (say, name,
age, and weight), and we’d like to store these values for use in a Python program. It
would be possible to store these in three separate arrays:

In [42]:
name=['alice','bob','cathy','doug']
age=[25,47,37,19]
weight=[55.0,85.5,68.0,61.5]

But this is a bit clumsy. There’s nothing here that tells us that the three arrays are
related; it would be more natural if we could use a single structure to store all of this
data. NumPy can handle this through structured arrays, which are arrays with compound
data types.

In [45]:
# We can similarly create a structured array using a compound data type specification:
data=np.zeros(4,dtype={'names':('name','age','weight'),'formats':('U10','i4','f8')})
print(data)

[('', 0, 0.) ('', 0, 0.) ('', 0, 0.) ('', 0, 0.)]


In [47]:
print(data.dtype)

[('name', '<U10'), ('age', '<i4'), ('weight', '<f8')]


Here 'U10' translates to “Unicode string of maximum length 10,” 'i4' translates to
“4-byte (i.e., 32 bit) integer,” and 'f8' translates to “8-byte (i.e., 64 bit) float.” We’ll
discuss other options for these type codes in the following section.
Now that we’ve created an empty container array, we can fill the array with our lists of
values:

In [48]:
data['name']=name
data['age']=age
data['weight']=weight

In [49]:
print(data)

[('alice', 25, 55. ) ('bob', 47, 85.5) ('cathy', 37, 68. )
 ('doug', 19, 61.5)]


In [50]:
data['name']

array(['alice', 'bob', 'cathy', 'doug'], dtype='<U10')

As we had hoped, the data is now arranged together in one convenient block of
memory.
The handy thing with structured arrays is that you can now refer to values either by
index or by name:

In [52]:
# get all names
data['name']

array(['alice', 'bob', 'cathy', 'doug'], dtype='<U10')

In [54]:
# get firts row of data
data[0]

('alice', 25, 55.)

In [55]:
data[0]['name']

'alice'

In [56]:
data[0][0]

'alice'

In [58]:
# get name where age is less than 30
data[data['age']<30]['name']

array(['alice', 'doug'], dtype='<U10')

In [59]:
data['name'][data['age']<30]

array(['alice', 'doug'], dtype='<U10')

Note that if you’d like to do any operations that are any more complicated than these,
you should probably consider the Pandas package, covered in the next chapter. As
we’ll see, Pandas provides a DataFrame object, which is a structure built on NumPy
arrays that offers a variety of useful data manipulation functionality similar to what
we’ve shown here, as well as much, much more.

# This ends numpy tutorial. Refer numpy documentation for more.