#  this tutorial follows the 'Python Data Science Handbook'

In [1]:
import numpy as np

# to check which version we are using
np.version.version

In [2]:
 np.version.version

'1.14.3'

# to create a array in numpy

In [3]:
np.array([1,2,3,4,5])

array([1, 2, 3, 4, 5])

In [4]:
# type np. and then press tab to see all components
np

<module 'numpy' from 'C:\\Users\\Mayank\\Anaconda3\\lib\\site-packages\\numpy\\__init__.py'>

In [5]:
# to view numpy documentation
np?

A Python Integer Is More Than Just an Integer
The standard Python implementation is written in C. This means that every Python
object is simply a cleverly disguised C structure, which contains not only its value, but
other information as well. For example, when we define an integer in Python, such as
x = 10000, x is not just a “raw” integer. It’s actually a pointer to a compound C structure,
which contains several values. Looking through the Python 3.4 source code, we
find that the integer (long) type definition effectively looks like this (once the C macros
are expanded):

struct _longobject {
long ob_refcnt;
PyTypeObject *ob_type;
size_t ob_size;
long ob_digit[1];
};
A single integer in Python 3.4 actually contains four pieces:
• ob_refcnt, a reference count that helps Python silently handle memory allocation
and deallocation
• ob_type, which encodes the type of the variable
• ob_size, which specifies the size of the following data members
• ob_digit, which contains the actual integer value that we expect the Python variable
to represent


# difference between a python list and numpy array

In [6]:
# a list in python
l=list(range(10))
l

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [7]:
type(l)

list

In [8]:
type(l[0])

int

In [9]:
l1=[str(c) for c in l]
l1

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

In [10]:
type(l1)

list

In [11]:
type(l1[0])

str

In [12]:
l2=[2,1,'1',bool,True]
p2=[type(c) for c in l2]

In [13]:
p2

[int, int, str, type, bool]

But this flexibility comes at a cost: to allow these flexible types, each item in the list
must contain its own type info, reference count, and other information—that is, each
item is a complete Python object. In the special case that all variables are of the same
type, much of this information is redundant: it can be much more efficient to store
data in a fixed-type array. The difference between a dynamic-type list and a fixed-type
(NumPy-style) array is illustrated in Figure 2-2.
At the implementation level, the array essentially contains a single pointer to one contiguous
block of data. The Python list, on the other hand, contains a pointer to a
block of pointers, each of which in turn points to a full Python object like the Python
integer we saw earlier. Again, the advantage of the list is flexibility: because each list
element is a full structure containing both data and type information, the list can be
filled with data of any desired type. Fixed-type NumPy-style arrays lack this flexibility,
but are much more efficient for storing and manipulating data.

# the built-in array module for fixed-type, efficient data storage

In [14]:
import array

In [15]:
l=list(range(10))

In [16]:
Array=array.array('i',l)
Array
# here 'i' is the type code indicating that the array holds integers

array('i', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

# numpy arrays

In [17]:
import numpy as np

In [18]:
# using a list to create an array
np.array([1,2,3,4,5])

array([1, 2, 3, 4, 5])

In [19]:
# all elements of a numpy array must share one type; if they differ, values are upcast where possible
np.array([1.0,2,3,4,5])
# every integer is converted to floating point

array([1., 2., 3., 4., 5.])

In [20]:
np.array([1,2,3,4,'5'])

array(['1', '2', '3', '4', '5'], dtype='<U11')

In [21]:
# if we want to set data type of a array explicitly
np.array([1,2,3,4,5],dtype='float32')

array([1., 2., 3., 4., 5.], dtype=float32)

In [22]:
# creating a multidimensional array in numpy
x=np.array([range(3)])
print(x)
np.array([range(i,i+3) for i in [2,4,6]])

[[0 1 2]]


array([[2, 3, 4],
       [4, 5, 6],
       [6, 7, 8]])

#  create a numpy array from scratch

In [23]:
# create a size specified array of only 0, np.zeros()
np.zeros(10)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [24]:
# mention data type of array explicitly
np.zeros(10,dtype=int)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [25]:
# create a 3x5 array filled with one,np.ones()
np.ones((3,5),dtype=float)

array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])

In [26]:
# a 3x5 matrix filled with zeros
np.zeros((3,5),dtype=int)

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [27]:
# create a np array filled with 3.14,np.full()
np.full(10,3.14)

array([3.14, 3.14, 3.14, 3.14, 3.14, 3.14, 3.14, 3.14, 3.14, 3.14])

In [28]:
np.full((3,5),3.14)

array([[3.14, 3.14, 3.14, 3.14, 3.14],
       [3.14, 3.14, 3.14, 3.14, 3.14],
       [3.14, 3.14, 3.14, 3.14, 3.14]])

In [29]:
# Create an array filled with a linear sequence
# Starting at 0, ending at 20, stepping by 2
np.arange(0,20,2)

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18])

In [30]:
# create an array of 5 evenly spaced values from 0 to 1
np.linspace(0,1,5)

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [31]:
# create a 3x5 array filled with random values between 0 and 1
np.random.random((3,5))

array([[0.13604418, 0.19495359, 0.97097911, 0.81342699, 0.36382979],
       [0.26037483, 0.20992484, 0.64325431, 0.60785433, 0.3937888 ],
       [0.71626687, 0.78200178, 0.92975041, 0.52324586, 0.50512568]])

In [32]:
# create a 3x3 array of normally distributed random values with mean 0 and standard deviation 1
np.random.normal(0,1,(3,3))

array([[ 0.50401848, -0.91141391,  2.19619312],
       [-1.46229609, -0.17700882, -0.78642062],
       [-0.9390557 , -1.11165723, -1.30415669]])

In [33]:
# mean 2 and standard deviation 2
np.random.normal(2,2,(3,3))

array([[-0.22551496,  1.38874871,  2.17338875],
       [-1.2935523 ,  0.45328552,  1.75591627],
       [ 1.45329033,  0.99574794,  3.10991091]])

In [34]:
# create a 3x3 array filled with random integers in the half-open interval [0, 10)
np.random.randint(0,10,(3,3))

array([[9, 4, 7],
       [7, 2, 1],
       [5, 6, 9]])

In [35]:
# create a 3x3 identity matrix
np.eye(3,dtype=int)

array([[1, 0, 0],
       [0, 1, 0],
       [0, 0, 1]])

In [36]:
# create an uninitialized 3x3 array; its values are whatever already exists at that memory location
np.empty((3,3))

array([[0.22551496, 1.38874871, 2.17338875],
       [1.2935523 , 0.45328552, 1.75591627],
       [1.45329033, 0.99574794, 3.10991091]])

In [37]:
np.empty(3)

array([0., 0., 0.])

#  numpy standard data types

In [38]:
np.zeros(10,dtype=int)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [39]:
np.zeros(10,dtype=float)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [40]:
np.zeros(10,dtype='int16')

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int16)

In [41]:
np.zeros(10,dtype=np.int16)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int16)

In [42]:
# refer page 41 of the book to view more supported dtypes

#  basics of numpy array

This section
will present several examples using NumPy array manipulation to access data
and subarrays, and to split, reshape, and join the arrays.

## array attributes

In [43]:
np.random.seed(0)
x1=np.random.randint(10,size=6)

In [44]:
# every np array has attributes ndim,shape,size for its dimension,shape, and total size
x1.ndim

1

In [45]:
x1.shape

(6,)

In [46]:
x1.size

6

In [47]:
x2=np.random.randint(10,size=(2,3))

In [48]:
x2.ndim

2

In [49]:
x2.shape

(2, 3)

In [50]:
x2.size

6

In [51]:
# we also have dtype to see data type of array
x2.dtype

dtype('int32')

In [52]:
x2.itemsize

4

In [53]:
x2.nbytes

24

In [54]:
# accessing array elements
x1

array([5, 0, 3, 3, 7, 9])

In [55]:
x1[0]

5

In [56]:
# to index from the end use -1 and so on
x1[-1]

9

In [57]:
x1[-2]

7

In [58]:
x2

array([[3, 5, 2],
       [4, 7, 6]])

In [59]:
# to access elements in a multidim array
print(x2[0,0])
print(x2[(0,0)])

3
3


In [60]:
print(x2[0,1])

5


In [61]:
## modify values using the above notation
x2[0,1]=10

In [62]:
x2

array([[ 3, 10,  2],
       [ 4,  7,  6]])

In [63]:
# if we assign a float value it will be silently truncated because it's an int array
x2[0,0]=99.15
x2

array([[99, 10,  2],
       [ 4,  7,  6]])

## array slicing,sub arrays

Just as we can use square brackets to access individual array elements, we can also use
them to access subarrays with the slice notation, marked by the colon (:) character.
The NumPy slicing syntax follows that of the standard Python list; to access a slice of
an array x, use this:
x[start:stop:step]
If any of these are unspecified, they default to the values start=0, stop=size of
dimension, step=1.

In [64]:
x=np.arange(10)
x

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [65]:
# get first 5 element subarray
x[:5]

array([0, 1, 2, 3, 4])

In [66]:
x[2:5]

array([2, 3, 4])

In [67]:
x[5:]

array([5, 6, 7, 8, 9])

In [68]:
# steps of 2
x[::2]

array([0, 2, 4, 6, 8])

In [69]:
x[1::2]

array([1, 3, 5, 7, 9])

In [70]:
# if the step value is negative, the defaults for start and stop are swapped; this is an easy way to reverse an array
x[::-1]

array([9, 8, 7, 6, 5, 4, 3, 2, 1, 0])

In [71]:
x[5::-2]

array([5, 3, 1])

### multidimensional array slicing

In [72]:
x2=np.random.randint(20,size=(3,4))

In [73]:
x2

array([[12,  1,  6,  7],
       [14, 17,  5, 13],
       [ 8,  9, 19, 16]])

In [74]:
x2[:,:]

array([[12,  1,  6,  7],
       [14, 17,  5, 13],
       [ 8,  9, 19, 16]])

In [75]:
# upto second row and second column
x2[:2,:2]

array([[12,  1],
       [14, 17]])

In [76]:
# all rows alternate columns
x2[:,::2]

array([[12,  6],
       [14,  5],
       [ 8, 19]])

In [77]:
# reversing the array along both dimensions
x2[::-1,::-1]

array([[16, 19,  9,  8],
       [13,  5, 17, 14],
       [ 7,  6,  1, 12]])

In [78]:
# accessing 1st column of an array
print(x2)
print(x2[:,0])

[[12  1  6  7]
 [14 17  5 13]
 [ 8  9 19 16]]
[12 14  8]


In [79]:
# print first row of x2
print(x2[0,:])

[12  1  6  7]


In [80]:
# also can be used
print(x2[0])

[12  1  6  7]


One important—and extremely useful—thing to know about array slices is that they
return views rather than copies of the array data. This is one area in which NumPy
array slicing differs from Python list slicing: in lists, slices will be copies.

In [81]:
print(x2)

[[12  1  6  7]
 [14 17  5 13]
 [ 8  9 19 16]]


In [82]:
x2_sub=x2[:2,:2]

In [83]:
x2_sub

array([[12,  1],
       [14, 17]])

In [84]:
x2_sub[0,0]=45

In [85]:
# we see that x2 is changed as well despite making changes only in x2_sub, 
# this is because np array slicing return views instead of copy of array data
print(x2)

[[45  1  6  7]
 [14 17  5 13]
 [ 8  9 19 16]]


### create copies

In [86]:
# to create copy we use copy()
x2_sub_copy=x2[:2,:2].copy()

In [87]:
x2_sub_copy

array([[45,  1],
       [14, 17]])

In [88]:
x2_sub_copy[0,0]=100

In [89]:
# x2 has no change in this case
x2

array([[45,  1,  6,  7],
       [14, 17,  5, 13],
       [ 8,  9, 19, 16]])

## reshaping arrays

In [90]:
np.arange(9)

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [91]:
# to reshape easiest way is to use reshape()
np.arange(9).reshape((3,3))

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [92]:
# note that for this to work size of original array should match the size of the new array
np.arange(9).reshape((3,4))

ValueError: cannot reshape array of size 9 into shape (3,4)

In [None]:
#Another common reshaping pattern is the conversion of a one-dimensional array
#into a two-dimensional row or column matrix. You can do this with the reshape
#method, or more easily by making use of the newaxis keyword within a slice operation:
x=np.array([1,2,3])
x.reshape((1,3))

In [None]:
#can also use np.newaxis in slicing mode
x[np.newaxis,:]

In [None]:
x.reshape((3,1))

In [None]:
x[:,np.newaxis]

## array concatenation and splitting

All of the preceding routines worked on single arrays. It’s also possible to combine
multiple arrays into one, and to conversely split a single array into multiple arrays.
We’ll take a look at those operations here.

### concatenation of arrays

Concatenation, or joining of two arrays in NumPy, is primarily accomplished
through the routines np.concatenate, np.vstack, and np.hstack. np.concatenate
takes a tuple or list of arrays as its first argument, as we can see here:

In [None]:
x=np.array([1,2,3])
y=np.array([4,5,6])
np.concatenate([x,y])

In [None]:
z=np.array([7,8,9])
np.concatenate([x,y,z])

In [None]:
p=np.array([[1,2,3],[4,5,6]])
q=np.array([[7,8,9],[10,11,12]])
np.concatenate([p,q])

In [None]:
np.concatenate([p,q],axis=1)

In [None]:
# vertical stack
x=np.array([1,2,3])
y=np.array([[4,5,6],[7,8,9]])
np.vstack([x,y])

In [None]:
x=np.array([[99],[99]])
np.hstack([x,y])

### splitting array

The opposite of concatenation is splitting, which is implemented by the functions
np.split, np.hsplit, and np.vsplit. For each of these, we can pass a list of indices
giving the split points:

In [None]:
x=np.array([1,2,3,4,5,6,7,8,9])

In [None]:
#Notice that N split points lead to N + 1 subarrays.
y1,y2,y3=np.split(x,[3,5])

In [None]:
y1

In [None]:
y2

In [None]:
y3

In [None]:
y1,y2,y3,y4=np.split(x,[3,5,8])
print(y1)
print(y2)
print(y3)
print(y4)

In [None]:
grid=np.arange(16).reshape([4,4])

In [None]:
grid

In [None]:
upper,lower=np.vsplit(grid,[2])
# try changing value of 2 to see result change

In [None]:
print(upper)
print(lower)

In [None]:
left,right=np.hsplit(grid,[2])
grid

In [None]:
print(left)
print(right)

## computation on numpy arrays

Up until now, we have been discussing some of the basic nuts and bolts of NumPy; in
the next few sections, we will dive into the reasons that NumPy is so important in the
Python data science world. Namely, it provides an easy and flexible interface to optimized
computation with arrays of data.
Computation on NumPy arrays can be very fast, or it can be very slow. The key to
making it fast is to use vectorized operations, generally implemented through NumPy’s
universal functions (ufuncs). This section motivates the need for NumPy’s ufuncs,
which can be used to make repeated calculations on array elements much more efficient.
It then introduces many of the most common and useful arithmetic ufuncs
available in the NumPy package.

In [94]:
# lets calculate reciprocal of an array elements
l=np.random.randint(1,10,size=5)
l

array([9, 2, 4, 4, 4])

In [130]:
def reciprocal(l):
    """Return the element-wise reciprocal of ``l`` using an explicit Python loop.

    This is the deliberately loop-based implementation that the tutorial
    times against the vectorized expression ``1.0 / l``.

    Parameters
    ----------
    l : sequence of numbers
        Values to invert; must support ``len()`` and iteration.

    Returns
    -------
    numpy.ndarray
        Float array with ``len(l)`` entries, where entry ``i`` is ``1 / l[i]``.
    """
    output = np.empty(len(l))
    # Keep the per-element Python loop on purpose: it is the slow path being demonstrated.
    for i, value in enumerate(l):
        output[i] = 1 / value
    # Hand the filled array back to the caller instead of discarding it.
    return output

In [133]:
# lets calculate time taken for this loop for 5 elements by using timeit
%timeit reciprocal(l)

496 ms ± 94.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [120]:
# lets calculate for 1000000 elements
l=np.arange(1,1000001)
print(len(l))

1000000


In [114]:
# this takes lot of time to compute
%timeit reciprocal(l)

398 ms ± 36 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


It takes several seconds to compute these million operations and to store the result!
When even cell phones have processing speeds measured in Giga-FLOPS (i.e., billions
of numerical operations per second), this seems almost absurdly slow. It turns
out that the bottleneck here is not the operations themselves, but the type-checking
and function dispatches that CPython must do at each cycle of the loop. Each time
the reciprocal is computed, Python first examines the object’s type and does a
dynamic lookup of the correct function to use for that type. If we were working in
compiled code instead, this type specification would be known before the code executes
and the result could be computed much more efficiently.

In [131]:
%timeit reciprocal(l)

387 ms ± 34 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [132]:
# directly performing operation on array
%timeit (1.0/l)

4.85 ms ± 188 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


Looking at the execution time for our big array, we see that it completes orders of
magnitude faster than the Python loop:
For many types of operations, NumPy provides a convenient interface into just this
kind of statically typed, compiled routine. This is known as a vectorized operation.
You can accomplish this by simply performing an operation on the array, which will
then be applied to each element. This vectorized approach is designed to push the
loop into the compiled layer that underlies NumPy, leading to much faster execution.

Vectorized operations in NumPy are implemented via ufuncs, whose main purpose is
to quickly execute repeated operations on values in NumPy arrays. Ufuncs are
extremely flexible—before we saw an operation between a scalar and an array, but we
can also operate between two arrays:

In [134]:
np.arange(5)/np.arange(1,6)

array([0.        , 0.5       , 0.66666667, 0.75      , 0.8       ])

And ufunc operations are not limited to one-dimensional arrays—they can act on
multidimensional arrays as well:

In [140]:
x=np.arange(9).reshape((3,3))
x

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [138]:
#each element replaced by 2^that element
2**x

array([[  1,   2,   4],
       [  8,  16,  32],
       [ 64, 128, 256]], dtype=int32)

In [142]:
# each element^2
x**2

array([[ 0,  1,  4],
       [ 9, 16, 25],
       [36, 49, 64]], dtype=int32)

## exploring numpy's ufuncs 

Ufuncs exist in two flavors: unary ufuncs, which operate on a single input, and binary
ufuncs, which operate on two inputs. We’ll see examples of both these types of functions
here.

### array arithmetic

In [144]:
x=np.arange(1,5)
x

array([1, 2, 3, 4])

In [145]:
print("x+5=",x+5)

x+5= [6 7 8 9]


In [146]:
print("x-5=",x-5)

x-5= [-4 -3 -2 -1]


In [147]:
print("x/2=",x/2)

x/2= [0.5 1.  1.5 2. ]


In [150]:
print("x*2=",x*2)

x*2= [2 4 6 8]


In [151]:
print("x//2=",x//2)

x//2= [0 1 1 2]


In [153]:
# ** for exponent
# % for modulus
print(x**2)
print(x%2)

[ 1  4  9 16]
[1 0 1 0]


In [156]:
# In addition, these can be strung together however you wish, and the standard order
# of operations is respected:
-(.5*x+1)**2

array([-2.25, -4.  , -6.25, -9.  ])

In [165]:
# arithmetic operations implemented in numpy
print(x)
print(np.add(x,2)) # x+2
print(np.subtract(x,2)) #x-2
print(np.negative(x)) #-x
print(np.multiply(x,2)) #x*2
print(np.divide(x,2)) #x/2
print(np.floor_divide(x,2)) #x//2
print(np.power(x,2)) #x^2
print(np.mod(x,2)) #x%2
# we will also see boolean and bitwise operations later

[1 2 3 4]
[3 4 5 6]
[-1  0  1  2]
[-1 -2 -3 -4]
[2 4 6 8]
[0.5 1.  1.5 2. ]
[0 1 1 2]
[ 1  4  9 16]
[1 0 1 0]


In [170]:
# inbuilt absolute function
y=np.array([-1,-2,-3,-4])
print(abs(y))
print(np.absolute(y))
print(np.abs(y))

[1 2 3 4]
[1 2 3 4]
[1 2 3 4]


### trigonometric functions

In [175]:
theta=np.linspace(0,np.pi,3)
theta

array([0.        , 0.00872665, 0.01745329])

In [173]:
x=np.sin(theta)
x

array([0.0000000e+00, 1.0000000e+00, 1.2246468e-16])

In [176]:
np.cos(theta)

array([1.        , 0.99996192, 0.9998477 ])

In [177]:
np.tan(theta)

array([0.        , 0.00872687, 0.01745506])

In [178]:
#The values are computed to within machine precision, which is why values that
#should be zero do not always hit exactly zero. Inverse trigonometric functions are also
#available:

np.arcsin(x)

array([0.00000000e+00, 1.57079633e+00, 1.22464680e-16])

In [179]:
# exponential  
np.exp(x) #e^x

array([1.        , 2.71828183, 1.        ])

In [180]:
# base-2 exponential of each element
x=np.array([1,0,2])
np.exp2(x) #2^x

array([2., 1., 4.])

In [181]:
np.power(3,x)
#3^x

array([3, 1, 9], dtype=int32)

In [184]:
# log functions
x=np.array([1,2,3,4])
np.log(x) #ln(x)

array([0.        , 0.69314718, 1.09861229, 1.38629436])

In [185]:
np.log2(x) #log2(x)

array([0.       , 1.       , 1.5849625, 2.       ])

In [186]:
np.log10(x) #log10(x)

array([0.        , 0.30103   , 0.47712125, 0.60205999])

In [191]:
# when x is very small, use expm1 and log1p instead of the functions discussed above, as they give more accurate results for small values
x=np.array([0,0.1,0.01,0.001])

In [192]:
np.expm1(x) #exp(x)-1

array([0.        , 0.10517092, 0.01005017, 0.0010005 ])

In [193]:
np.log1p(x) # log(1+x)

array([0.        , 0.09531018, 0.00995033, 0.0009995 ])

# advanced ufuncs features

### specifying output

For large calculations, it is sometimes useful to be able to specify the array where the
result of the calculation will be stored. Rather than creating a temporary array, you
can use this to write computation results directly to the memory location where you’d
like them to be. For all ufuncs, you can do this using the out argument of the
function:

In [197]:
x=np.arange(5)
print("value in x =",x)
y=np.empty(5,dtype=int)
np.multiply(x,2,out=y)
print("value of y =",y)

value in x = [0 1 2 3 4]
value of y = [0 2 4 6 8]


In [199]:
#This can even be used with array views. For example, we can write the results of a
#computation to every other element of a specified array:

y=np.zeros(10)
np.power(2,x,out=y[::2])
print(y)

[ 1.  0.  2.  0.  4.  0.  8.  0. 16.  0.]


### aggregates

For binary ufuncs, there are some interesting aggregates that can be computed
directly from the object. For example, if we’d like to reduce an array with a particular
operation, we can use the reduce method of any ufunc. A reduce repeatedly applies a
given operation to the elements of an array until only a single result remains.


For example, calling reduce on the add ufunc returns the sum of all elements in the
array:

In [202]:
x=np.arange(1,6)
print(x)
np.add.reduce(x)

[1 2 3 4 5]


15

In [201]:
np.multiply.reduce(x)

120

In [203]:
# if we like to store all the intermediate values , we can use accumulate
np.add.accumulate(x)

array([ 1,  3,  6, 10, 15], dtype=int32)

In [204]:
np.multiply.accumulate(x)

array([  1,   2,   6,  24, 120], dtype=int32)

### outer product

Finally, any ufunc can compute the output of all pairs of two different inputs using
the outer method. This allows you, in one line, to do things like create a multiplication
table:

In [205]:
x=np.arange(1,6)
np.multiply.outer(x,x)

array([[ 1,  2,  3,  4,  5],
       [ 2,  4,  6,  8, 10],
       [ 3,  6,  9, 12, 15],
       [ 4,  8, 12, 16, 20],
       [ 5, 10, 15, 20, 25]])

In [207]:
np.add.outer(x,x)

array([[ 2,  3,  4,  5,  6],
       [ 3,  4,  5,  6,  7],
       [ 4,  5,  6,  7,  8],
       [ 5,  6,  7,  8,  9],
       [ 6,  7,  8,  9, 10]])

## aggregations: min, max and everything in between

Often when you are faced with a large amount of data, a first step is to compute summary
statistics for the data in question. Perhaps the most common summary statistics
are the mean and standard deviation, which allow you to summarize the “typical” values
in a dataset, but other aggregates are useful as well (the sum, product, median,
minimum and maximum, quantiles, etc.).

NumPy has fast built-in aggregation functions for working on arrays; we’ll discuss
and demonstrate some of them here.

### summing values in array

In [214]:
#As a quick example, consider computing the sum of all values in an array. Python
#itself can do this using the built-in sum function:
x=np.random.randint(1,100,size=10000)
sum(x)

495843

In [215]:
# numpy sum function
np.sum(x)

495843

In [216]:
# However, because it executes the operation in compiled code, NumPy’s version of the
#operation is computed much more quickly:
%timeit sum(x)

1.97 ms ± 134 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [217]:
%timeit np.sum(x)

16.5 µs ± 1.11 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)


Be careful, though: the sum function and the np.sum function are not identical, which
can sometimes lead to confusion! In particular, their optional arguments have different
meanings, and np.sum is aware of multiple array dimensions, as we will see in the
following section.

### minimum and maximum

Similarly, Python has built-in min and max functions, used to find the minimum value
and maximum value of any given array:

In [226]:
x=np.arange(50,100000)

In [231]:
# python built-in min and numpy min; again the numpy implementation is much faster
print(min(x))
np.min(x)

50


50

In [228]:
print(max(x))
np.max(x)

99999


99999

In [229]:
%timeit min(x)

13.9 ms ± 1.08 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [230]:
%timeit np.min(x)

174 µs ± 14.1 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [233]:
# we can also use methods of array objects
x.min(),x.max()

(50, 99999)

Whenever possible, make sure that you are using the NumPy version of these aggregates
when operating on NumPy arrays!

### multidimensional aggregates

One common type of aggregation operation is an aggregate along a row or column.
Say you have some data stored in a two-dimensional array:

In [235]:
x=np.random.random((3,4))
x

array([[0.74203788, 0.52694237, 0.49998955, 0.69659731],
       [0.58736027, 0.18849041, 0.1257693 , 0.40584001],
       [0.49408119, 0.08220491, 0.53566323, 0.96771264]])

In [237]:
# by default over entire table
x.sum()

5.852689072139742

Aggregation functions take an additional argument specifying the axis along which
the aggregate is computed. For example, we can find the sum of the values within each
column by specifying axis=0:

In [240]:
# The function returns four values, corresponding to the four columns of numbers.
x.sum(axis=0)

array([1.82347934, 0.7976377 , 1.16142208, 2.07014996])

In [241]:
x.sum(axis=1)

array([2.46556711, 1.30745999, 2.07966197])

In [242]:
# to find max value within each row
x.max(axis=1)

array([0.74203788, 0.58736027, 0.96771264])

The way the axis is specified here can be confusing to users coming from other languages.
The axis keyword specifies the dimension of the array that will be collapsed,
rather than the dimension that will be returned. So specifying axis=0 means that the
first axis will be collapsed: for two-dimensional arrays, this means that values within
each column will be aggregated.

### other aggregate functions

NumPy provides many other aggregation functions, but we won’t discuss them in
detail here. Additionally, most aggregates have a NaN-safe counterpart that computes
the result while ignoring missing values, which are marked by the special IEEE
floating-point NaN value.
Some of these NaN-safe functions were not added until
NumPy 1.8, so they will not be available in older NumPy versions.

In [243]:
np.sum(x)

5.852689072139742

In [245]:
# nan support sum aggregate
np.nansum(x)

5.852689072139742

np.sum np.nansum Compute sum of elements

np.prod np.nanprod Compute product of elements

np.mean np.nanmean Compute mean of elements

np.std np.nanstd Compute standard deviation

np.var np.nanvar Compute variance

np.min np.nanmin Find minimum value

np.max np.nanmax Find maximum value

np.argmin np.nanargmin Find index of minimum value

np.argmax np.nanargmax Find index of maximum value

np.median np.nanmedian Compute median of elements

np.percentile np.nanpercentile Compute rank-based statistics of elements

np.any N/A Evaluate whether any elements are true

np.all N/A Evaluate whether all elements are true