Watch on youtube : https://youtu.be/1NfIZD1xBxQ

# Numpy Tricks while dealing with large dataset

In [None]:
#Import the libraries
import numpy as np
import time

## Tip 1 : Dont iterate over each entry , instead create an array of requried shape

In [None]:
start_time = time.time()

epochs = range(1000000)
result_array = np.empty((0)) 

for e in epochs:
  value = e * 5 
  np.append(result_array, [value])
    
elapsed_time = time.time() - start_time
elapsed_time

5.848477125167847

In [None]:
start_time = time.time()

epochs = range(1000000) 
result_array = np.zeros((len(epochs),)) 

for index, e in enumerate(epochs):
  value = e * 5 # do something
  result_array[index] = value
    
elapsed_time = time.time() - start_time
elapsed_time

0.29676342010498047

## Tip 2: Avoid using RAM for Large Arrays - instead use Disk for storage

In [None]:
results = np.ones((600,600,600,6))
results[2,4,5,1] = 100

In [None]:
import h5py

hdf5_store = h5py.File("./cache.hdf5", "a")
results = hdf5_store.create_dataset("results", (600,600,600,6), compression="gzip")
# do something...
results[2,4,5,1] = 100

## Tip 3 : Dont Access Array more than necessary

In [None]:
import numpy as np
import time

In [None]:
start = time.time()

ex_array = np.ones((200, 200, 200))

for _ in range(100000000):
    ex_array[50, 50, 100]

runtime = time.time() - start
runtime

20.55499792098999

In [None]:
start = time.time()

ex_array = np.ones((200, 200, 200))

x = ex_array[50, 50, 100] 

for _ in range(100000000):
    x
    
runtime = time.time() - start
runtime

5.534504652023315

## Tip 4 : Returning Index Location of an element - `np.where()`

In [None]:
arr_x = np.array([6, 6, 3, 5, 5, 0, 3, 2, 5, 1])
new_index = np.where(arr_x > 3)
print("The index of arrays where value > 3 is ", new_index)

The index of arrays where value > 3 is  (array([0, 1, 3, 4, 8]),)


In [None]:
arr_x.take(new_index)

array([[6, 6, 5, 5, 5]])

In [None]:
np.where(arr_x > 3, 'greater than 3', 'less than 3')

array(['greater than 3', 'greater than 3', 'less than 3',
       'greater than 3', 'greater than 3', 'less than 3', 'less than 3',
       'less than 3', 'greater than 3', 'less than 3'], dtype='<U14')

In [None]:
print('Index of max value: ', np.argmax(arr_x))  

#in the same way, we can return the index position of minimumm value of an array by writing as:
print('index of min value: ', np.argmin(arr_x))  

Index of max value:  0
index of min value:  5


## Tip 5 : Adding new Axis to a numpy array

In [None]:
x = np.arange(10)
print(x)
print(x.shape)

[0 1 2 3 4 5 6 7 8 9]
(10,)


In [None]:
x_col = x[:,np.newaxis,np.newaxis]
print("shape of new array is:",x_col.shape)
print(x_col)

shape of new array is: (10, 1, 1)
[[[0]]

 [[1]]

 [[2]]

 [[3]]

 [[4]]

 [[5]]

 [[6]]

 [[7]]

 [[8]]

 [[9]]]


In [None]:
x_row = x[np.newaxis,np.newaxis,:]
print("shape of new array is:",x_row.shape)
print(x_row)

shape of new array is: (1, 1, 10)
[[[0 1 2 3 4 5 6 7 8 9]]]
