In [1]:
import numpy as np

#### Comparing NumPy arrays with built-in Python data structures such as lists

In [2]:
# Two plain Python lists of one million consecutive integers each:
# arr1 covers 0..999999 and arr2 covers 1000000..1999999.
arr1 = [*range(1_000_000)]
arr2 = [*range(1_000_000, 2_000_000)]

In [3]:
def dot_product(arr1, arr2):
    """Return the dot product of two sequences of numbers using pure Python.

    Corresponding elements are multiplied pairwise and the products are summed.

    Args:
        arr1: First iterable of numbers.
        arr2: Second iterable of numbers.

    Returns:
        The sum of ``a * b`` over the paired elements, or ``0`` for empty input.
        Pairing is done with ``zip``, so it stops at the end of the shorter input.
    """
    # Use the built-in sum() instead of a local accumulator named `sum`,
    # which would shadow the built-in inside this function.
    return sum(a * b for a, b in zip(arr1, arr2))

In [4]:
%%time
dot_product(arr1, arr2)

Wall time: 305 ms


833332333333500000

In [5]:
# The arrays below are created with an explicit dtype of int64. NumPy's default integer dtype is
# platform-dependent: on some platforms (notably Windows with NumPy versions before 2.0) it is int32
# even on a 64-bit machine. NumPy integer arithmetic does not check for overflow, so with 32-bit
# integers the large products computed here would silently wrap around and give an incorrect answer.

# SO link - https://stackoverflow.com/questions/50671172/numpy-dot-giving-incorrect-answer-for-large-integers

# To check the default on your machine, execute the statement below -
# np.array([1, 2, 3]).dtype
# If it returns dtype('int32') then specifying dtype is required, otherwise it is not needed

np_arr1 = np.array(arr1, dtype=np.int64)
np_arr2 = np.array(arr2, dtype=np.int64)

### `np.dot` computes the dot product of two 1-D arrays: element-wise multiplication followed by a sum of the products

In [6]:
%%time
np.dot(np_arr1, np_arr2)

Wall time: 1.96 ms


833332333333500000

#### We can see that the pure-Python list version takes far longer than the NumPy array version

In [7]:
%%time
(np_arr1 * np_arr2).sum()

Wall time: 8.99 ms


833332333333500000

### The numpy array can store only a single type of data to make computation faster

Suppose we have to calculate the final temperature of different cities and let us assume that it depends on 3 factors f1, f2, f3. We have 3 cities Pune, Mumbai and Bangalore, also these 3 factors have pre-defined weights w1, w2 and w3 which have fixed values.

In [8]:
# Fixed, pre-defined weights of the three factors f1, f2 and f3.
w1 = 0.1
w2 = 0.2
w3 = 0.3

In [9]:
# Factor values (f1, f2, f3) for each of the three cities.
pune, mumbai, bangalore = [10, 20, 30], [11, 12, 13], [12, 13, 14]

In [10]:
# Weighted sum of Pune's three factors; the other cities are calculated the same way.
pune_temp = sum(factor * weight for factor, weight in zip(pune, (w1, w2, w3)))
pune_temp

14.0

We can simplify this by using numpy arrays

In [11]:
# Collect the per-city factor rows and the factor weights as plain Python lists.
city_data = [pune, mumbai, bangalore]
w = [0.1, 0.2, 0.3]

# Convert both lists to NumPy arrays: one row per city, one weight per factor.
np_city_data = np.array(city_data)
np_w = np.array(w)

#### We will use matrix multiplication to simplify the process; using `np.matmul` reduces it to one line of code. We can also use the `@` operator to perform matrix multiplication

In [12]:
np.matmul(np_city_data, np_w)

array([14. ,  7.4,  8. ])

In [13]:
np_city_data @ np_w

array([14. ,  7.4,  8. ])

## Reading data from files

In [14]:
from urllib.request import urlretrieve

# Download the CSV and save it as 'climate.txt' in the present working directory.
# urlretrieve returns a (local filename, HTTP headers) tuple, which the notebook displays.
urlretrieve('https://hub.jovian.ml/wp-content/uploads/2020/08/climate.csv', 'climate.txt')

('climate.txt', <http.client.HTTPMessage at 0x2429b164a08>)

In [15]:
# Parse the downloaded file as comma-separated values into a float array.
# skip_header=1 drops the first line (presumably the column-name row — verify against the file).
np_climate_data = np.genfromtxt('climate.txt', delimiter=',', skip_header=1)

In [16]:
np_climate_data.shape

(10000, 3)

In [17]:
np_climate_data[:5]

array([[25., 76., 99.],
       [39., 65., 70.],
       [59., 45., 77.],
       [84., 63., 38.],
       [66., 50., 52.]])

In [18]:
# Weighted sum of the three factor columns for every row: (10000, 3) @ (3,) -> (10000,)
np_temp_data = np_climate_data @ np_w
np_temp_data.shape

(10000,)

Now we want to add temperature to the climate array and then finally save it to a file

### `np.concatenate` joins multiple arrays, but they must have the same number of dimensions and the same length along every axis except the one being concatenated.
In the example below we reshape the temperature array so that it also has 2 dimensions and the same number of rows as the climate array.

In [19]:
# reshape(-1, 1) turns the 1-D temperature array into a column array: exactly 1 column, with -1
# telling NumPy to work out the number of rows on its own from the array's length.
# axis=1 then appends that column to the right of the climate data.

climate_val = np.concatenate((np_climate_data, np_temp_data.reshape(-1, 1)), axis=1)
# Equivalent call with the number of rows written out explicitly:
# np.concatenate((np_climate_data, np_temp_data.reshape(10000, 1)), axis=1)

In [20]:
climate_val.shape

(10000, 4)

In [21]:
# Write the combined array to 'climate_value.txt' with every value formatted to 2 decimal places.
# NOTE(review): no delimiter is passed, so np.savetxt separates values with its default (a single
# space), while the header text is comma-separated — confirm that this mismatch is intended.
np.savetxt('climate_value.txt', climate_val, fmt='%.2f', header='factor1, factor2, factor3, temperature')

## Arithmetic operations

In [25]:
# 3x3 integer matrix used as the first operand in the arithmetic examples.
arith_arr1 = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [6, 7, 8],
])

In [26]:
arith_arr1 + 3

array([[ 4,  5,  6],
       [ 7,  8,  9],
       [ 9, 10, 11]])

In [27]:
# 3x3 integer matrix holding the consecutive values 10..18, filled row by row.
arith_arr2 = np.arange(10, 19).reshape(3, 3)

In [28]:
arith_arr1 + arith_arr2

array([[11, 13, 15],
       [17, 19, 21],
       [22, 24, 26]])

Similarly we can do subtraction, multiplication, division, modulo etc

### Broadcasting

Numpy arrays support broadcasting which allows arithmetic operations to be performed on arrays with different but compatible dimensions.

In [30]:
# 1-D array [0, 1, 2] used by the broadcasting and np.newaxis examples.
x = np.arange(0, 3)

In [31]:
x

array([0, 1, 2])

In [32]:
x.shape

(3,)

### `np.newaxis` is a pseudo-index that allows the temporary addition of a new axis of length 1 to an array; 
### it is a convenient alias for `None`



In [38]:
x[:, np.newaxis] # the result has a new column axis, i.e. shape (3, 1); x itself still has shape (3,)

array([[0],
       [1],
       [2]])

In [37]:
# Each np.newaxis inserts one axis of length 1, so two of them turn the
# shape (3,) array into one of shape (3, 1, 1).
x[:, np.newaxis, np.newaxis]

array([[[0]],

       [[1]],

       [[2]]])

In [46]:
x[:, None].shape # None can also be used in place of newaxis

(3, 1)

### `np.newaxis` vs `np.reshape`
`np.newaxis` can be used several times in a single indexing expression, because each occurrence simply inserts an axis of length 1, whereas `reshape` accepts at most one unknown (`-1`) dimension.
`np.arange(5)[:, np.newaxis, np.newaxis]` will work but `np.arange(5).reshape(-1, 1, -1)` will throw an error - `ValueError: can only specify one unknown dimension`

In [48]:
np.arange(5)[:, np.newaxis, np.newaxis]

array([[[0]],

       [[1]],

       [[2]],

       [[3]],

       [[4]]])

In [53]:
# reshape() can infer at most one dimension, so passing -1 twice raises a ValueError.
# The error is caught and printed so that the demonstration does not stop a
# Restart & Run All of the notebook.
try:
    np.arange(5).reshape(-1, 1, -1)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: can only specify one unknown dimension