### NumPy Essentials by Tanmay Dutta; Leo Chin

In [1]:
import numpy as np

In [2]:
x = np.array([[1,2,3], [4,5,6]])

In [15]:
x.flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [16]:
c_array = np.random.rand(10000, 10000)

In [17]:
f_array = np.asfortranarray(c_array)

In [18]:
c_array.flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [19]:
f_array.flags

  C_CONTIGUOUS : False
  F_CONTIGUOUS : True
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [20]:
%timeit np.sum(c_array[0, :]) #summing 0th row for c_array

35 µs ± 3.98 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [21]:
%timeit np.sum(f_array[0, :]) #summing 0th row for f_array

373 µs ± 27.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [22]:
%timeit np.sum(c_array[:, 0]) #summing 0th column for c_array

347 µs ± 21.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [23]:
%timeit np.sum(f_array[:, 0]) #summing 0th column for f_array

29.9 µs ± 1.65 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [24]:
'''
The above 4 operations show that C-style (C language) arrays are faster for row-wise operations.
And F-style (Fortran) arrays are faster for column-wise operations.
'''

'\nThe above 4 operations show that C-style (C language) arrays are faster for row-wise operations.\nAnd F-style (Fortran) arrays are faster for column-wise operations.\n'

In [30]:
'''This is because, in a C array, elements in a row are laid out in successive memory locations.
The opposite is true for a Fortran array, where the elements of a column are laid out
in consecutive memory locations.'''

'This is because, in a C array, elements in a row are laid out in successive memory locations.\nThe opposite is true for a Fortran array, where the elements of a column are laid out\nin consecutive memory locations.'

In [31]:
x = np.random.rand(10, 10)

# Basic slicing returns a view rather than a copy: `y` refers to the first
# five rows of `x`, so writing through `y` also changes `x`.
y = x[:5]

np.may_share_memory(x, y)

True

In [32]:
'''The may_share_memory function in NumPy miscellaneous routines can be used to check whether two arrays might share memory,
i.e. whether one may be a view of the other. While this method does the job in most cases, it is not always reliable, since it uses heuristics
(it only compares memory bounds) and may return True for arrays that do not actually overlap; np.shares_memory performs an exact check.
For introductory purposes, however, we shall take it for granted.'''

'The may_share_memory function in NumPy miscellaneous routines can be used to check whether two arrays might share memory,\ni.e. whether one may be a view of the other. While this method does the job in most cases, it is not always reliable, since it uses heuristics\n(it only compares memory bounds) and may return True for arrays that do not actually overlap; np.shares_memory performs an exact check.\nFor introductory purposes, however, we shall take it for granted.'

In [34]:
y[:] = 0

# Now check elements of x

x

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.01369553, 0.74449898, 0.50824603, 0.83819715, 0.07781696,
        0.20485961, 0.60296608, 0.60276955, 0.82298139, 0.34480932],
       [0.83643777, 0.9607265 , 0.91440298, 0.70411156, 0.67829876,
        0.25572621, 0.33493284, 0.9550057 , 0.93820347, 0.01740083],
       [0.08227568, 0.69059438, 0.6201081

In [35]:
# Let's try again with a copy and not a view

x = np.random.rand(10,10)

# .copy() allocates a fresh buffer and rebinds `y` to it. Writing
# `y[:] = x[:5, :]` instead would only copy the values into whatever array
# `y` already refers to, leaving `y` attached to that older buffer.
y = x[:5, :].copy() # this is a copy

In [37]:
y[:] = 0

# Now check elements of x

x

array([[0.11777719, 0.17904195, 0.94989867, 0.38917789, 0.93359187,
        0.884827  , 0.58365713, 0.47753229, 0.04214265, 0.85287812],
       [0.2698905 , 0.20760531, 0.93294485, 0.22842301, 0.11672957,
        0.60568595, 0.93875626, 0.13736395, 0.43786093, 0.37835104],
       [0.05782976, 0.81489142, 0.46995933, 0.66980721, 0.92645448,
        0.55479517, 0.16014735, 0.4405296 , 0.3044386 , 0.30566531],
       [0.47109452, 0.5253076 , 0.3159031 , 0.11847112, 0.68778373,
        0.58726745, 0.31804931, 0.13462629, 0.51856434, 0.82144245],
       [0.70665438, 0.48028079, 0.44165838, 0.48088404, 0.38455961,
        0.56821907, 0.66594792, 0.32613477, 0.61608297, 0.12193172],
       [0.36069685, 0.2484328 , 0.78737387, 0.8623491 , 0.84097917,
        0.68858218, 0.21870698, 0.73834343, 0.63905077, 0.49066101],
       [0.0756877 , 0.80345764, 0.6562534 , 0.20698442, 0.93212547,
        0.84896931, 0.68377068, 0.61483339, 0.85870505, 0.965301  ],
       [0.92436568, 0.76060896, 0.7220456

In [38]:
int(0.73848)

0

In [39]:
int(1.233)

1

In [40]:
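'''Python's built-in int function truncates a float toward zero, i.e. it drops the fractional part (this matches the floor function only for non-negative values; for example, int(-1.5) is -1 while floor(-1.5) is -2)'''

"Python's built-in int function truncates a float toward zero, i.e. it drops the fractional part (this matches the floor function only for non-negative values; for example, int(-1.5) is -1 while floor(-1.5) is -2)"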

In [45]:
np.eye(4, 3) # returns a 4x3 array with ones on the main diagonal and zeros elsewhere (a true identity matrix only when the shape is square)

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 0.]])

In [46]:
z = np.array(['nikhil', 1])

In [47]:
z.dtype

dtype('<U6')

In [49]:
z # 1 is converted to string as you can see

array(['nikhil', '1'], dtype='<U6')

In [50]:
x = np.array([1,2,3])

y = np.array([[1,2,3], [4,5,6]])

x == y # x (shape (3,)) is broadcast against each row of y (shape (2, 3)), giving an element-wise boolean array

array([[ True,  True,  True],
       [False, False, False]])

In [51]:
a = np.arange(12)

In [52]:
a

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [56]:
a.reshape(2, 3, -1) # -1 tells NumPy to infer that dimension from the array size and the other dimensions

array([[[ 0,  1],
        [ 2,  3],
        [ 4,  5]],

       [[ 6,  7],
        [ 8,  9],
        [10, 11]]])

In [58]:
a.reshape(2, 3, -1).shape # the last dimension was inferred automatically: 12 elements / (2 * 3) = 2

(2, 3, 2)

In [60]:
a = np.arange(1000).reshape(2, 5, -1)

In [62]:
%timeit a.flatten()

3.58 µs ± 157 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [63]:
%timeit a.ravel()

777 ns ± 39 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [65]:
''' Both the FLATTEN and RAVEL methods flatten out the dimensions.
ndarray.flatten() always creates a copy of the original array, while ndarray.ravel() returns a view whenever possible
(and copies only when a view cannot be made). Hence, flatten takes longer.
'''

' Both the FLATTEN and RAVEL methods flatten out the dimensions.\nndarray.flatten() always creates a copy of the original array, while ndarray.ravel() returns a view whenever possible\n(and copies only when a view cannot be made). Hence, flatten takes longer.\n'

#### Strides

In [66]:
x = np.arange(8)

x

array([0, 1, 2, 3, 4, 5, 6, 7])

In [69]:
x.dtype

dtype('int32')

In [68]:
x.strides

(4,)

In [74]:
str(x.data)

'<memory at 0x000002638EC81C48>'

In [76]:
x.resize(2,4) # ndarray.resize changes the shape of x in place (unlike reshape, which returns a new array object)

x

array([[0, 1, 2, 3],
       [4, 5, 6, 7]])

In [77]:
x.strides

(16, 4)

In [88]:
x = np.ones(1000)

y = np.ones(1000 * 10)[::10] # take only every 10th element

x.size == y.size

True

In [89]:
x.flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : True
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [90]:
y.flags

  C_CONTIGUOUS : False
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [95]:
'''The above indicates that for x, the memory layout is contiguous, while for y it is not.
Hence operations on y tend to take longer.
The reason for the difference in performance is that the CPU pulls data from main memory into its cache in blocks,
and a smaller stride means fewer transfers are needed.
'''

'The above indicates that for x, the memory layout is contiguous, while for y it is not.\nHence operations on y tend to take longer.\nThe reason for the difference in performance is that the CPU pulls data from main memory into its cache in blocks,\nand a smaller stride means fewer transfers are needed.\n'

In [91]:
x.strides

(8,)

In [92]:
y.strides # shows how strides for y are larger

(80,)

In [93]:
%timeit np.sum(x)

8.36 µs ± 2.71 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [94]:
%timeit np.sum(y)

19.9 µs ± 3.37 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


#### Matrix

In [100]:
x = np.arange(9).reshape(3,3)

print(x)

[[0 1 2]
 [3 4 5]
 [6 7 8]]


In [101]:
x_m = np.matrix(x)

x_m

matrix([[0, 1, 2],
        [3, 4, 5],
        [6, 7, 8]])

In [102]:
type(x_m)

numpy.matrix

In [103]:
# np.asmatrix is the canonical name; the np.mat alias was removed in NumPy 2.0.
# It wraps the existing ndarray without copying its data.
y_m = np.asmatrix(x)

y_m

matrix([[0, 1, 2],
        [3, 4, 5],
        [6, 7, 8]])

In [104]:
type(y_m)

numpy.matrix

In [105]:
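'''numpy.matrix() creates a copy by default, while numpy.mat() (an alias of numpy.asmatrix(), removed in NumPy 2.0) returns a view of the input without copying when the input is already an ndarray or matrix'''

'numpy.matrix() creates a copy by default, while numpy.mat() (an alias of numpy.asmatrix(), removed in NumPy 2.0) returns a view of the input without copying when the input is already an ndarray or matrix'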

In [109]:
x_m.H # Hermitian (conjugate) transpose; for this real-valued matrix it equals the plain transpose. x_m.I gives the inverse, but this particular matrix is singular, so .I is not meaningful here.

matrix([[0, 3, 6],
        [1, 4, 7],
        [2, 5, 8]])

In [110]:
x = np.arange(25000000).reshape(5000,5000)

In [111]:
%timeit x.T 

768 ns ± 88.4 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [112]:
y = np.mat(x)

%timeit y.T

3.11 µs ± 202 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [114]:
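# The above shows that transposing a matrix is slower than transposing an array. Neither call copies data (both return views), so the gap is per-call overhead of the matrix class rather than data movement.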

In [115]:
'''Therefore, ndarray is preferred when doing linear algebra especially for large sets of data considering its performance.
Use matrix only when necessary.'''

'Therefore, ndarray is preferred when doing linear algebra especially for large sets of data considering its performance.\nUse matrix only when necessary.'