In [1]:
import numpy as np

#### Comparing numpy with python data structures like list

In [2]:
arr1 = list(range(1000000))
arr2 = list(range(1000000, 2000000))

In [3]:
def dot_product(arr1, arr2):
    """Return the dot product of two sequences of numbers.

    The sequences are paired element by element with ``zip``, each pair is
    multiplied, and the products are added together in pure Python.

    Args:
        arr1: An iterable of numbers.
        arr2: An iterable of numbers.

    Returns:
        The sum of the pairwise products; 0 when either input is empty.
        Because ``zip`` stops at the shorter input, surplus elements of the
        longer input are ignored.
    """
    # Use the built-in sum() instead of a local variable called `sum`,
    # which would shadow the built-in inside this function.
    return sum(a * b for a, b in zip(arr1, arr2))

In [4]:
%%time
dot_product(arr1, arr2)

Wall time: 353 ms


833332333333500000

In [5]:
# Below we create the numpy arrays with an explicit dtype of int64. On some platforms (for example Windows with
# NumPy releases before 2.0) the default integer dtype is 32-bit even on a 64-bit machine. numpy does not guard
# against integer overflow, so with 32-bit integers large results would silently come out wrong.

# SO link - https://stackoverflow.com/questions/50671172/numpy-dot-giving-incorrect-answer-for-large-integers

# To confirm, execute the statement below -
# np.array([1, 2, 3]).dtype
# If it returns dtype('int32') then specifying dtype is required, otherwise it is not needed

np_arr1 = np.array(arr1, dtype=np.int64)
np_arr2 = np.array(arr2, dtype=np.int64)

### `np.dot` computes the dot product of two 1-D arrays (the sum of their element-wise products)

In [6]:
%%time
np.dot(np_arr1, np_arr2)

Wall time: 3.95 ms


833332333333500000

#### We can see that the pure Python list version takes far longer than the numpy array version

In [7]:
%%time
(np_arr1 * np_arr2).sum()

Wall time: 96.2 ms


833332333333500000

### The numpy array can store only a single type of data to make computation faster

Suppose we have to calculate the final temperature of different cities and let us assume that it depends on 3 factors f1, f2, f3. We have 3 cities Pune, Mumbai and Bangalore, also these 3 factors have pre-defined weights w1, w2 and w3 which have fixed values.

In [8]:
w1, w2, w3 = 0.1, 0.2, 0.3

In [9]:
pune = [10, 20, 30]
mumbai = [11, 12, 13]
bangalore = [12, 13, 14]

In [10]:
pune_temp = pune[0] * w1 + pune[1] * w2 + pune[2] * w3 # similarly we calculate the rest
pune_temp

14.0

We can simplify this by using numpy arrays

In [11]:
w = [0.1, 0.2, 0.3]
city_data = [pune, mumbai, bangalore]

# Creating numpy arrays
np_w = np.array(w)
np_city_data = np.array(city_data)

#### We will use matrix multiplication to simplify the process; using `np.matmul` reduces it to one line of code. We can also use the `@` operator to perform matrix multiplication

In [12]:
np.matmul(np_city_data, np_w)

array([14. ,  7.4,  8. ])

In [13]:
np_city_data @ np_w

array([14. ,  7.4,  8. ])

## Reading data from files

In [14]:
from urllib.request import urlretrieve

# The file will be downloaded to the present working directory (pwd) and saved as 'climate.txt'
urlretrieve('https://hub.jovian.ml/wp-content/uploads/2020/08/climate.csv', 'climate.txt')

('climate.txt', <http.client.HTTPMessage at 0x214eac3a248>)

In [15]:
np_climate_data = np.genfromtxt('climate.txt', delimiter=',', skip_header=1)

In [16]:
np_climate_data.shape

(10000, 3)

In [17]:
np_climate_data[:5]

array([[25., 76., 99.],
       [39., 65., 70.],
       [59., 45., 77.],
       [84., 63., 38.],
       [66., 50., 52.]])

In [18]:
np_temp_data = np.matmul(np_climate_data, np_w)
np_temp_data.shape

(10000,)

Now we want to add temperature to the climate array and then finally save it to a file

### `np.concatenate` joins multiple arrays, but they must have the same number of dimensions and the same shape along every axis except the one being concatenated.
In the example below we reshape the temperature array so that it also has 2 dimensions and the same number of rows as the climate array.

In [19]:
# We do reshape(-1, 1) as we don't care about how many rows we have but we want to have just 1 column (a column array).
# Here we need not specify the number of rows, it picks it up on its own.

climate_val = np.concatenate((np_climate_data, np_temp_data.reshape(-1, 1)), axis=1)
# np.concatenate((np_climate_data, np_temp_data.reshape(10000, 1)), axis=1)

In [20]:
climate_val.shape

(10000, 4)

In [21]:
# np.savetxt separates columns with a single space unless `delimiter` is given.
# Write the values comma-separated so the data rows use the same separator as the header line.
np.savetxt('climate_value.txt', climate_val, fmt='%.2f', delimiter=',',
           header='factor1, factor2, factor3, temperature')

## Arithmetic operations

In [22]:
arith_arr1 = np.array([[1, 2, 3],
                      [4, 5, 6],
                      [6, 7, 8]])

In [23]:
arith_arr1 + 3

array([[ 4,  5,  6],
       [ 7,  8,  9],
       [ 9, 10, 11]])

In [24]:
arith_arr2 = np.array([[10, 11, 12],
                       [13, 14, 15],
                       [16, 17, 18]])

In [25]:
arith_arr1 + arith_arr2

array([[11, 13, 15],
       [17, 19, 21],
       [22, 24, 26]])

Similarly we can do subtraction, multiplication, division, modulo etc

In [26]:
x = np.arange(3)

In [27]:
x

array([0, 1, 2])

In [28]:
x.shape

(3,)

### `np.newaxis` is a pseudo-index that allows the temporary addition of a new axis into an array;
### it is a convenient alias for `None`



In [29]:
x[:, np.newaxis] # new column axis is added and hence shape of array is changed

array([[0],
       [1],
       [2]])

In [30]:
# Two new axes of length 1 are appended, so the result has shape (3, 1, 1).
# (Only the last expression of a cell is displayed, so a separate `.shape` line here would show nothing.)
x[:, np.newaxis, np.newaxis]

array([[[0]],

       [[1]],

       [[2]]])

In [31]:
x[:, None].shape # None can also be used in place of newaxis

(3, 1)

### `np.newaxis` vs `np.reshape`
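`np.newaxis` inserts a new axis of length 1 without needing to know the size of any other dimension, and it can be used several times in one index expression. `reshape`, on the other hand, allows only one unknown (`-1`) dimension.
`np.arange(5)[:, np.newaxis, np.newaxis]` will work but `np.arange(5).reshape(-1, 1, -1)` will throw an error - `ValueError: can only specify one unknown dimension` (the equivalent reshape call is `np.arange(5).reshape(-1, 1, 1)`).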

In [32]:
np.arange(5)[:, np.newaxis, np.newaxis]

array([[[0]],

       [[1]],

       [[2]],

       [[3]],

       [[4]]])

In [33]:
# `reshape` accepts at most one unknown (-1) dimension, so this call raises a ValueError.
# The error is caught and printed so the notebook still survives "Restart Kernel -> Run All".
try:
    np.arange(5).reshape(-1, 1, -1)
except ValueError as error:
    reshape_error = f"ValueError: {error}"
    print(reshape_error)

ValueError: can only specify one unknown dimension

### Broadcasting

Numpy arrays support broadcasting which allows arithmetic operations to be performed on arrays with different but compatible dimensions.

In broadcasting, the arrays are conceptually stretched when performing arithmetic operations, but in reality they still occupy the same amount of memory.

In [35]:
arr1 = np.arange(3)
arr1

array([0, 1, 2])

In [36]:
arr1 + 5 # here the scalar is broadcasted so that it matches the shape of arr1, then the arithmetic operation is performed

array([5, 6, 7])

### Rules of Broadcasting
Broadcasting in NumPy follows a strict set of rules to determine the interaction between the two arrays:

**Rule 1**: If the two arrays differ in their number of dimensions, the shape of the one with fewer dimensions is padded with ones on its leading (left) side.

**Rule 2**: If the shape of the two arrays does not match in any dimension, the array with shape equal to 1 in that dimension is stretched to match the other shape.

**Rule 3**: If in any dimension the sizes disagree and neither is equal to 1, an error is raised.

Ref. - https://jakevdp.github.io/PythonDataScienceHandbook/02.05-computation-on-arrays-broadcasting.html

In [37]:
arr1.shape

(3,)

In [42]:
arr2 = np.ones((1, 3), dtype=np.int64)
arr2.shape

(1, 3)

In [43]:
arr2

array([[1, 1, 1]], dtype=int64)

#### As per Rule 1, the numbers of dimensions do not match, hence a new dimension is added (a 1 is padded) to the left of the shape of the array with fewer dimensions (`arr1`). So its shape becomes (1, 3).

#### Now the shapes match and hence they can be added

In [44]:
arr1 + arr2

array([[1, 2, 3]], dtype=int64)

In [47]:
arr3 = np.ones((4, 3), dtype=np.int8)

`arr1` has shape (3,) so as per Rule 1, an extra 1 is padded to the left of its shape, which becomes (1, 3).
As per Rule 2 the shapes (1, 3) and (4, 3) do not match in the first dimension, so we stretch `arr1` along that dimension, where its size is 1, and check whether the shapes of the two arrays match after stretching. On stretching, the shape of `arr1` becomes (4, 3) and the shapes match, hence they can be added.

In [48]:
arr3 + arr1

array([[1, 2, 3],
       [1, 2, 3],
       [1, 2, 3],
       [1, 2, 3]])

In [49]:
arr4 = np.ones((3, 4), dtype=np.int8)

Here `arr1` is first padded to shape (1, 3) and then stretched to (3, 3). This still does not match the shape (3, 4) of `arr4` in the last dimension (3 vs 4, and neither is 1), so as per Rule 3 an error is thrown when the arithmetic operation is performed.

In [50]:
# The last dimensions (4 vs 3) disagree and neither is 1, so broadcasting fails (Rule 3).
# The error is caught and printed so that the remaining cells can still run.
try:
    arr4 + arr1
except ValueError as error:
    print(f"ValueError: {error}")

ValueError: operands could not be broadcast together with shapes (3,4) (3,) 

In [51]:
arr1 = np.array([1, 2, 3])
arr2 = np.array([1, 5, 3])

In [52]:
arr1 != arr2

array([False,  True, False])

In [53]:
arr1 <= arr2

array([ True,  True,  True])

In [54]:
arr1 == arr2

array([ True, False,  True])

**Finding the count of elements that match in `arr1` and `arr2` (located at same position in both the arrays).**

In [56]:
(arr1 == arr2).sum()

2

## Indexing Numpy arrays

In [57]:
arr = np.array([
    [[11, 12, 13, 14], 
     [13, 14, 15, 19]], 
    
    [[15, 16, 17, 21], 
     [63, 92, 36, 18]], 
    
    [[98, 32, 81, 23],      
     [17, 18, 19.5, 43]]
])

In [58]:
arr.shape

(3, 2, 4)

In [60]:
arr[0, 0, 1] # returns 12.0 rather than 12: the array holds floats because one of its elements is 19.5

12.0

In [61]:
arr[1:, 0:1, :2]

array([[[15., 16.]],

       [[98., 32.]]])

In [62]:
arr[:2, 1]

array([[13., 14., 15., 19.],
       [63., 92., 36., 18.]])