In [1]:
import numpy as np

In [2]:
np.linspace(1, 20, 4)
# Generates 4 numbers between 1 and 20

array([ 1.        ,  7.33333333, 13.66666667, 20.        ])

In [3]:
np.zeros((3, 4))
# np.zeros and np.ones get a TUPLE.

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [4]:
np.diag([1, 3, 10])
# The `np.diag()` function is used to create a 2-dimensional diagonal matrix. A diagonal matrix is a matrix with non-zero elements only on the main diagonal. 
# It takes a 1-dimensional array or a list as input, representing the diagonal elements.

array([[ 1,  0,  0],
       [ 0,  3,  0],
       [ 0,  0, 10]])

In [5]:
arr = np.array([
    [[1, 2], [3, 4]],
    [[5, 6], [7, 8]],
    [[9, 10], [11, 12]]
])
arr

array([[[ 1,  2],
        [ 3,  4]],

       [[ 5,  6],
        [ 7,  8]],

       [[ 9, 10],
        [11, 12]]])

## np.random:

In [6]:
np.random.rand(2, 3)
# Generates an array with random numbers.
# Args: axis

array([[0.62097604, 0.85695333, 0.04583894],
       [0.70499283, 0.16378797, 0.0772364 ]])

In [7]:
arr_1 = np.random.randint(3, 10, 3)
arr_2 = np.random.randint(3, 10, size=(4, 3))
# Frist and second args: range
# Third arg: axis

In [8]:
arr_1

array([5, 6, 7])

In [9]:
arr_2

array([[6, 8, 7],
       [3, 3, 7],
       [5, 7, 5],
       [4, 6, 8]])

In [10]:
# Boolean Indexing
# arr[Boolean Condition -> a condition that return a boolean array]
arr = np.array([1, 2, 3, 4, 5])
mask = np.array([True, False, False, False, True])
result = arr[mask]
result

array([1, 5])

In [11]:
arr_1 = np.random.randint(1, 100000, 100)
arr_1[(arr_1 % 2 == 0) & (arr_1 < 10000)]

array([4140, 9038, 1932,  420])

In [12]:
# Fancy Indexing
# arr[[Indices]]
arr = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90])
arr[[2, 0, 4, 6]]

array([30, 10, 50, 70])

In [13]:
arr_2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
row_indices = np.array([0, 1, 2])
col_indices = np.array([1, 2, 0])
result = arr_2d[row_indices, col_indices]
result

array([2, 6, 7])

In [14]:
# View vs. Copy
list_1 = [1, 2, 3, 4, 5]
arr_1 = np.array([1, 2, 3, 4, 5])

temp_list = list_1[1:4] # copy
temp_arr = arr_1[1:4] # view

temp_list[1] = 0 # Modifies the copy version of the list.
temp_arr[1] = 0 # Modifies the main array because the slicing and temp_arr is just a view.

print(list_1, arr_1)

# Note: If you really need to avoid modifying the original array, use the .copy() method before making any changes.

[1, 2, 3, 4, 5] [1 2 0 4 5]


In [15]:
print(np.shares_memory(temp_list, list_1), np.shares_memory(temp_arr, arr_1))

False True


# NumPy Data Types

<img src="./images/np-dtypes.webp" width="800">

Below is a list of all data types in NumPy and the characters used to represent them.

- `i` - `integer`
- `b` - `boolean`
- `u` - `unsigned integer`
- `f` - `float`
- `c` - `complex float`
- `m` - `timedelta`
- `M` - `datetime`
- `O` - `object`
- `S` - `string`
- `U` - `unicode string`
- `V` - fixed chunk of memory for other type (`void`)

Here, `'i4'` represents a 32-bit integer data type.

## Integer Types

NumPy offers several integer data types with different sizes and ranges. The most commonly used integer types are:

- `int8`: 8-bit signed integer (-128 to 127)
- `int16`: 16-bit signed integer (-32,768 to 32,767)
- `int32`: 32-bit signed integer (-2,147,483,648 to 2,147,483,647)
- `int64`: 64-bit signed integer (-9,223,372,036,854,775,808 to 9,223,372,036,854,775,807)


NumPy also provides unsigned integer types, which have the same sizes as their signed counterparts but can only represent non-negative values:

- `uint8`: 8-bit unsigned integer (0 to 255)
- `uint16`: 16-bit unsigned integer (0 to 65,535)
- `uint32`: 32-bit unsigned integer (0 to 4,294,967,295)
- `uint64`: 64-bit unsigned integer (0 to 18,446,744,073,709,551,615)

# Floating-Point Types

Floating-point types are used to represent real numbers with decimal points. NumPy provides two main floating-point types:

- `float32`: 32-bit single-precision floating-point number
- `float64`: 64-bit double-precision floating-point number (default)


The `float32` type has a precision of about 7 decimal digits, while `float64` has a precision of about 15 decimal digits. The choice between `float32` and `float64` depends on the required precision and the memory constraints of your application.


In [16]:
# Note: Boolean and String data types in numpy are represented as bool_ and str_

## For string data type in NumPy

The main differences between 'S' and 'U' data types are:
- 'S' represents byte strings, while 'U' represents Unicode strings.
- 'S' is specified in terms of bytes, while 'U' is specified in terms of characters.
- 'S' can contain arbitrary binary data, while 'U' is designed to store text data.
- 'S' is more memory-efficient, while 'U' can handle a wider range of characters.

When choosing between 'S' and 'U', consider the following:
- If you are working with ASCII text or raw binary data, use the 'S' data type.
- If you need to handle non-ASCII characters or multilingual text, use the 'U' data type.
- Be aware of the memory implications, as Unicode strings require more memory than byte strings.


In [17]:
arr = np.array(['apple', 'banana', 'cherry'], dtype='S10')
arr

array([b'apple', b'banana', b'cherry'], dtype='|S10')

In [18]:
# Object data type, is a NumPy data type that allow arrays to hold objects of arbitrary Python types.
# This data type is very flexible.
obj_array = np.array([1, "apple", [3, 4, 5], {"key": "value"}], dtype=np.object_)
obj_array

array([1, 'apple', list([3, 4, 5]), {'key': 'value'}], dtype=object)

In [19]:
# To cast an array to a different data type using the `astype()` method, you can pass the desired data type as an argument:
arr_float = np.array([1.5, 2.5, 3.5])
arr_float.astype(int)

array([1, 2, 3])

In [20]:
arr_float.astype(str)

array(['1.5', '2.5', '3.5'], dtype='<U32')

In [21]:
np.array([1.5, 2.5, 3.5], dtype=int)

array([1, 2, 3])

# Memory Usage of Different Data Types

Here are the sizes of some common NumPy data types:

- `bool`: 1 byte
- `int8`, `uint8`: 1 byte
- `int16`, `uint16`: 2 bytes
- `int32`, `uint32`, `float32`: 4 bytes
- `int64`, `uint64`, `float64` (default): 8 bytes
- `complex64`: 8 bytes (4 bytes for real part, 4 bytes for imaginary part)
- `complex128`: 16 bytes (8 bytes for real part, 8 bytes for imaginary part)

In [22]:
arr = np.zeros((1000, 1000), dtype=np.float64)  # 8000000 (8 MB)
arr.nbytes

8000000

# Broadcasting rules

In NumPy, broadcasting between two arrays is possible if their shapes are compatible. The rule is:

- When comparing dimensions, we start from the trailing (rightmost) dimensions and move left.
- For each pair of dimensions, they are compatible if either:

    1- The dimensions are equal,

    or

    2- At least one of them is 1.

If all compared dimensions follow this rule, broadcasting can occur.

In [23]:
# For example:
arr1 = np.array([[1, 2, 3],
                 [4, 5, 6]])

arr2 = np.array([[10],
                 [20]])

arr3 = np.array([100, 200, 300])

# (2, 3) (1, 1) (1, 3) -> Shapes are compatible.

arr1 + arr2 + arr3 # Expecting output

array([[111, 212, 313],
       [124, 225, 326]])

In [24]:
# Example:
arr1 = np.array([[1, 2, 3],
                 [4, 5, 6]])

arr2 = np.array([[10, 20],
                 [30, 40]])

# (2, 3) (2, 2) -> Shapes are NOT compatible.

arr1 * arr_2 # Expecting Error

ValueError: operands could not be broadcast together with shapes (2,3) (4,3) 

Example:
(2, 3, 3) (1, 3) -> 3 and 3 matches, 1 and 3 matches, 1 and 2 matches -> Shapes are compatible.

- Note: If one of the arrays has fewer dimensions, we treat the missing dimensions as 1 for the purpose of broadcasting.

For exmaple here, 2 matches with 1 (originally no dimension.)

# Vectorization

### What is Vectorization?

Vectorization in Python (especially with NumPy) means performing operations on arrays or datasets all at once, instead of using for loops.
Instead of calculating element by element, the entire array is processed in one go.

- Note: Using Python, many mathematical operations on numbers require a for loop to be applied to each element of a list.
However, when using NumPy functions on NumPy arrays, a for loop is not needed. You can call the function on the entire array, and NumPy will automatically apply it to each element.

In [None]:
# Example:

import time



a = [1, 2, 3, 4, 5]
b = [10, 20, 30, 40, 50]
c = []

start_time = time.time()
for i in range(len(a)):
    c.append(a[i] + b[i])
end_time = time.time()

loop_time = end_time - start_time

a = np.array([1, 2, 3, 4, 5])
b = np.array([10, 20, 30, 40, 50])

start_time = time.time()
c = a + b
end_time = time.time()

vectorized_time = end_time - start_time

print(f'loop_time = {loop_time:.7f} seconds\nvectorized_time = {vectorized_time:.7f} seconds')


loop_time = 0.0001035 seconds
vectorized_time = 0.0000691 seconds


In [30]:
# Use .reshape(tuple of the new shape or the dimention of the new array) to change the dimentions of the array.
# Use .ravel() or .flatten() to change an array to a 1D array.
# Note: .ravel() returns a view of the array, while .flatten() returns a copy of the array.
# Example:
arr_1 = np.array([[1, 2], [3, 4], [5, 6]])
new_arr_1 = arr_1.ravel() # Returns a view of arr_1
new_arr_1[0] *= 10 # Changes will also affect arr_1

arr_2 = np.array([[1, 2], [3, 4], [5, 6]])
new_arr_2 = arr_2.flatten() # Returns a copy of arr_2
new_arr_2[0] *= 10 # Changes will NOT affect arr_2

print(f'{arr_1}\n\n{arr_2}')

[[10  2]
 [ 3  4]
 [ 5  6]]

[[1 2]
 [3 4]
 [5 6]]


In [31]:
# .ravel() and .flatten() can be called both with "C" order or "F" order.
arr = np.array([[1, 2], [3, 4], [5, 6]])
c_arr = arr.ravel('C')
f_arr = arr.ravel('F')
print(f'{c_arr}\n\n{f_arr}')

[1 2 3 4 5 6]

[1 3 5 2 4 6]


In [41]:
# Note: arr.ravel() does the job inplace, while np.ravel(arr) should be assigned to a varb and does NOT change the main arr.
# So if you want to flatten the array, use arr.ravel() or arr.flatten().
# And if you want to keep the main array and assign the flattened array to another varb, use flattened_arr = np.ravel(arr) or flattened_arr =  np.flatten(arr)

In [32]:
# The `np.vstack()` function stacks arrays vertically, meaning it combines them along the first axis (rows). The arrays to be stacked must have the same number of columns.
arr1 = np.array([[1, 2], [3, 4]])
arr2 = np.array([[5, 6], [7, 8]])
stacked_arr = np.vstack((arr1, arr2))
stacked_arr

array([[1, 2],
       [3, 4],
       [5, 6],
       [7, 8]])

In [33]:
# The `np.hstack()` function stacks arrays horizontally, meaning it combines them along the second axis (columns). The arrays to be stacked must have the same number of rows.
arr1 = np.array([[1, 2], [3, 4]])
arr2 = np.array([[5, 6], [7, 8]])
stacked_arr = np.hstack((arr1, arr2))
stacked_arr

array([[1, 2, 5, 6],
       [3, 4, 7, 8]])

In [34]:
#The `np.concatenate()` function takes a sequence of arrays as input and concatenates them along a specified axis. By default, the concatenation is performed along the first axis (axis 0).
arr1 = np.array([[1, 2], [3, 4]])
arr2 = np.array([[5, 6], [7, 8]])
np.concatenate((arr1, arr2))

array([[1, 2],
       [3, 4],
       [5, 6],
       [7, 8]])

In [37]:
arr1 = np.array([[1, 2], [3, 4]])
arr2 = np.array([[5, 6], [7, 8]])
np.concatenate((arr1, arr2), axis=1)

array([[1, 2, 5, 6],
       [3, 4, 7, 8]])

In [39]:
# .vsplit(arr, rows)
# .hsplit(arr, rows)
arr = np.array([[1, 2, 3, 4],
                [5, 6, 7, 8],
                [9, 10, 11, 12]])

# Splitting into 3 equal-sized sub-arrays
split_arr = np.vsplit(arr, 3)
split_arr

[array([[1, 2, 3, 4]]), array([[5, 6, 7, 8]]), array([[ 9, 10, 11, 12]])]

In [42]:
# The `np.tile()` function creates a new array by repeating an input array a specified number of times along each axis. It takes two arguments: the input array and a tuple or array specifying the number of repetitions along each axis.
arr = np.array([[1, 2], [3, 4]])
tiled_arr = np.tile(arr, (2, 3))
tiled_arr

array([[1, 2, 1, 2, 1, 2],
       [3, 4, 3, 4, 3, 4],
       [1, 2, 1, 2, 1, 2],
       [3, 4, 3, 4, 3, 4]])