# Dependencies

In [1]:
import sys

import numpy as np

In [2]:
# shared, seeded random generator so the random data used later is reproducible
rng = np.random.default_rng(42)

# NumPy - Input and output
Doc:
   - [numpy.org/doc/stable/reference/routines.io.html](https://numpy.org/doc/stable/reference/routines.io.html)

## NumPy binary files (npy, npz)
- npy
   - Stores a single NumPy array in binary format
   - Efficient for loading and saving large arrays
   - Use `np.save` to save, `np.load` to load
- npz
   - Stores multiple NumPy arrays in a single zip archive
   - Use `np.savez` (uncompressed) or `np.savez_compressed` (compressed) to save, `np.load` to load.
   - `np.savez_compressed` offers compression for smaller file size

In [3]:
# sample data: the integers 1..5
arr_1d_1 = np.arange(1, 6)

# save: write the single array to a .npy binary file
np.save(file="../resources/binaries/binary_1.npy", arr=arr_1d_1)

In [4]:
# sample data: 1..5 ascending and 5..1 descending
arr_1d_2 = np.arange(1, 6)
arr_1d_3 = np.arange(5, 0, -1)

# savez: bundle both arrays into one .npz archive; they are passed positionally
# and read back later under the keys 'arr_0' and 'arr_1'
np.savez("../resources/binaries/binary_2.npz", arr_1d_2, arr_1d_3)

In [5]:
# sample data: 4..8 ascending and 8..4 descending
arr_1d_4 = np.arange(4, 9)
arr_1d_5 = np.arange(8, 3, -1)

# savez_compressed: same call shape as np.savez, written to a compressed .npz archive;
# the positional arrays are read back later under the keys 'arr_0' and 'arr_1'
np.savez_compressed("../resources/binaries/binary_3.npz", arr_1d_4, arr_1d_5)

In [6]:
# load .npy -> np.load returns the stored array directly
arr_1d_6 = np.load("../resources/binaries/binary_1.npy")

# load .npz -> np.load returns a dict-like NpzFile that keeps the archive open;
# use it as a context manager so the file handle is closed once the arrays are read
with np.load("../resources/binaries/binary_2.npz") as load_1:
    arr_1d_7, arr_1d_8 = load_1['arr_0'], load_1['arr_1']

# load .npz [compressed file] -> same API, decompression is handled by np.load
with np.load("../resources/binaries/binary_3.npz") as load_2:
    arr_1d_9, arr_1d_10 = load_2['arr_0'], load_2['arr_1']

# log
print(f"arr_1d_6 : {arr_1d_6}")
print(f"arr_1d_7 : {arr_1d_7}")
print(f"arr_1d_8 : {arr_1d_8}")
print(f"arr_1d_9 : {arr_1d_9}")
print(f"arr_1d_10: {arr_1d_10}")

arr_1d_6 : [1 2 3 4 5]
arr_1d_7 : [1 2 3 4 5]
arr_1d_8 : [5 4 3 2 1]
arr_1d_9 : [4 5 6 7 8]
arr_1d_10: [8 7 6 5 4]


## Text files

In [7]:
# sample data: a 4x3 float matrix holding 1.0 .. 12.0 row by row
arr_2d_1 = np.arange(1.0, 13.0).reshape(4, 3)

# savetxt: write the values as integers ('%i') in comma-separated form;
# comments="" keeps the header line free of the default comment prefix
np.savetxt(
    fname="../resources/txtfiles/file_1.csv",
    X=arr_2d_1,
    fmt='%i',
    delimiter=',',
    header="A, B, C",
    comments="",
)

# load txt file: skip the header row and parse every field as int64
arr_2d_2 = np.loadtxt(
    "../resources/txtfiles/file_1.csv",
    dtype=np.int64,
    delimiter=',',
    skiprows=1,
)

# log
print(f"arr_2d_2:\n{arr_2d_2}")

arr_2d_2:
[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]


In [8]:
# sample data with a missing value; using np.nan directly keeps the array a real
# float64 array (a `None` element would force object dtype and need an `== None` patch-up)
arr_2d_2 = np.array(
    [[1.0, 2.0, 3.0], [4.0, np.nan, 6.0], [7.0, 8.0, 9.0], [10.0, 11.0, 12.0]],
    dtype=np.float64,
)

# savetxt [contains null values]
np.savetxt("../resources/txtfiles/file_2.csv", X=arr_2d_2, fmt='%.1f', delimiter=',', header="A, B, C", comments="")

# load txt file [advanced]: names=True takes the field names from the header row,
# so the result is a structured array with one named field per column
arr_2d_3 = np.genfromtxt(
    fname="../resources/txtfiles/file_2.csv",
    dtype=np.float64,
    delimiter=',',
    names=True,
)

# log
print(f"arr_2d_3       : {arr_2d_3}")
print(f"arr_2d_3.dtype : {arr_2d_3.dtype}")
print(f"arr_2d_3.dtype : {arr_2d_3.dtype}")

arr_2d_3       : [( 1.,  2.,  3.) ( 4., nan,  6.) ( 7.,  8.,  9.) (10., 11., 12.)]
arr_2d_3.dtype : [('A', '<f8'), ('B', '<f8'), ('C', '<f8')]


In [9]:
# comma-separated numbers held in a plain Python string
str_1 = "1.0,2.0,3.0,4.0,5.0"

# fromstring: text mode (non-empty `sep`) parses the string into a 1-D float array
fromstring_1 = np.fromstring(str_1, dtype=float, sep=',')

# log
print(f"fromstring_1: {fromstring_1}")

fromstring_1: [1. 2. 3. 4. 5.]


## String formatting

In [10]:
# sample data: a 3x2 integer matrix holding 1..6 row by row
arr_2d_4 = np.arange(1, 7).reshape(3, 2)

# array2string: render the array as a plain Python string
array2string_1 = np.array2string(a=arr_2d_4)

# log
print(f"type(array2string_1) : {type(array2string_1)}")
print(f"array2string_1:\n{array2string_1}")

type(array2string_1) : <class 'str'>
array2string_1:
[[1 2]
 [3 4]
 [5 6]]


## Memory mapping files
   - It's used when datasets are too large to fit into RAM
   - `np.memmap` allows you to work with them as if they were in memory, but without loading the entire dataset into RAM
   - You can efficiently read and process only the parts of the data you need

In [11]:
# in-memory data that will be written through the memory map
random_data_1 = rng.random((10, 10))

# create an empty memmap array ('w+' creates or overwrites the backing file)
memmap_arr_1 = np.memmap(
    "../resources/memmaps/memmap_1.dat",
    dtype=np.float32,
    mode='w+',
    shape=(10, 10),
)

# fill the array with data (values are stored with the memmap's float32 dtype)
memmap_arr_1[:] = random_data_1

# flush changes to disk before the file is opened a second time
memmap_arr_1.flush()

# open the memory-mapped file in read mode
memmap_arr_2 = np.memmap(
    "../resources/memmaps/memmap_1.dat",
    dtype=np.float32,
    mode='r',
    shape=(10, 10),
)

# log: compare what sys.getsizeof reports for the two memmap views and the in-memory array
print(f"memmap_arr_1  size : {sys.getsizeof(memmap_arr_1)} bytes")
print(f"memmap_arr_2  size : {sys.getsizeof(memmap_arr_2)} bytes")
print(f"random_data_1 size : {sys.getsizeof(random_data_1)} bytes")

# when done, delete the references to the memmap arrays
del memmap_arr_1, memmap_arr_2

memmap_arr_1  size : 160 bytes
memmap_arr_2  size : 160 bytes
random_data_1 size : 928 bytes


## Text formatting options

In [12]:
# sample data: floats with more digits than the default print precision shows
arr_2d_5 = np.asarray(
    [
        [1.123456789, 2.987654321, 3.2353278765],
        [3.141592653, 4.567890123, 1.2461287468175],
    ]
)

# log
print(f"arr_2d_5:\n{arr_2d_5}")

arr_2d_5:
[[1.12345679 2.98765432 3.23532788]
 [3.14159265 4.56789012 1.24612875]]


In [13]:
# snapshot of the print options currently in effect (a plain dict of option name -> value)
print_options_1 = dict(np.get_printoptions())

# log
print(f"print_options_1:\n{print_options_1}")

print_options_1:
{'edgeitems': 3, 'threshold': 1000, 'floatmode': 'maxprec', 'precision': 8, 'suppress': False, 'linewidth': 75, 'nanstr': 'nan', 'infstr': 'inf', 'sign': '-', 'formatter': None, 'legacy': False}


In [14]:
# np.printoptions is a context manager: the options below apply only inside the `with` block
with np.printoptions(linewidth=20, suppress=True, precision=2):
    print(f"arr_2d_5:\n{arr_2d_5}")

# log: printed again outside the block, where the previous options are back in effect
print(f"arr_2d_5:\n{arr_2d_5}")

arr_2d_5:
[[1.12 2.99 3.24]
 [3.14 4.57 1.25]]
arr_2d_5:
[[1.12345679 2.98765432 3.23532788]
 [3.14159265 4.56789012 1.24612875]]


In [15]:
# remember the options currently in effect so they can be restored afterwards
saved_print_options = np.get_printoptions()

try:
    # set global print options for NumPy arrays
    np.set_printoptions(precision=2, suppress=True, linewidth=20)

    # log
    print(f"arr_2d_5:\n{arr_2d_5}")
finally:
    # restore the previous settings: calling `np.set_printoptions()` with no arguments
    # leaves the current options unchanged [tested on v2.0], so pass the saved values back
    np.set_printoptions(**saved_print_options)

arr_2d_5:
[[1.12 2.99 3.24]
 [3.14 4.57 1.25]]
