We will look at the following file formats

- csv

- pickle

- npy

Note: Not covered here, but worth mentioning:
- protobuffers (Google) and arrow (Apache): Binary, record-oriented formats. Very portable.
- Parquet: Column-based format used in big data. Fast for searching.


In [29]:
import numpy as np
import inspect

def print_name_value(variable):
    """Print the caller's argument expression, then the value itself.

    The label is recovered by reading the source line of the call site, so
    ``print_name_value(d)`` prints ``d:`` followed by the value of ``d`` on
    the next line.  Only the text up to the first comma is used, and the
    call is expected to sit alone on a single line.

    Args:
        variable: Any object; it is rendered with its normal ``str`` form.

    When the caller's source text cannot be found (for example code run
    through ``exec``), the label falls back to ``<unknown>`` instead of
    raising ``TypeError``.
    """
    caller = inspect.currentframe().f_back
    try:
        context = inspect.getframeinfo(caller).code_context
    finally:
        # Frame objects take part in reference cycles; drop ours promptly.
        del caller
    if context:
        ctx = context[0].strip()
        # Text between the first "(" and the final character of the line.
        single_arg = ctx[ctx.find('(') + 1:-1].split(',')[0]
    else:
        single_arg = '<unknown>'
    print(f'{single_arg}:\n{variable}')

In [30]:
a = 10
# Old-style "%" formatting: the template is applied to the value with "%".
# "%f" prints six decimal places by default; "%0.2f" keeps two, and "%.2f"
# produces the same result.
for template in ("a as a float:%f", "a as a float:%0.2f"):
    print(template % a)

a as a float:10.000000
a as a float:10.00


In [31]:
# Three rows of three values. An array has a single dtype, so the integer
# rows are stored as floats alongside the first row.
d = np.array([
    [10.77489, 15.78901, 12.001],
    [7, 16, 30],
    [19, 45, 70],
])
print_name_value(d)

d:
[[10.77489 15.78901 12.001  ]
 [ 7.      16.      30.     ]
 [19.      45.      70.     ]]


In [32]:
# Write d to "data.csv" as comma-separated text with three decimal places.
# tofile() emits the elements as one flat sequence, so the 3x3 layout is
# not kept in the file.
d.tofile("data.csv", sep=",", format="%0.3f")

In [33]:
# genfromtxt() parses delimited text into an array; with delimiter=","
# each comma-separated field becomes one value.
rd = np.genfromtxt(fname="data.csv", delimiter=",")
print_name_value(rd)

rd:
[10.775 15.789 12.001  7.    16.    30.    19.    45.    70.   ]


#### Pickle file

A pickle file stores serialized Python objects in a binary format. When
several objects are written to one file, they are read back in the same
order they were written (first in, first out).

In [73]:
PICKLE_F_NAME = 'data.pickle'
print_name_value(d)
# ndarray.dump() pickles the array into the named file.
d.dump(PICKLE_F_NAME)

d:
[[10.77489 15.78901 12.001  ]
 [ 7.      16.      30.     ]
 [19.      45.      70.     ]]


In [74]:
# Load the pickled object back from disk.
# NOTE: unpickling can run arbitrary code, so only load files you trust.
import pickle
with open(PICKLE_F_NAME, "rb") as in_file:
    data = pickle.load(in_file)
# The object is fully loaded, so it can be shown after the file is closed.
print(data)

[[10.77489 15.78901 12.001  ]
 [ 7.      16.      30.     ]
 [19.      45.      70.     ]]


#### npy file

npy is a binary file format for storing NumPy arrays.

In [35]:
# A 2x3 integer array for the .npy examples.
a = np.array([
    [2, 3, 11],
    [-5, 9, 19],
])
print_name_value(a)

a:
[[ 2  3 11]
 [-5  9 19]]


In [36]:
# np.save(file, arr) writes a single array to the given .npy file.
np.save(file='d1.npy', arr=a)

In [15]:
# np.load(file) reads back the array that np.save() stored in the file.
a = np.load(file='d1.npy')
print_name_value(a)


a:
[[ 2  3 11]
 [-5  9 19]]


In [20]:
# Saving multiple objects in one .npy file: first build a second array,
# this one holding Python strings as objects.
b = np.array(["JJ", "AX", "BY"], dtype=object)

In [21]:
# Pack both arrays into an explicit 1-D object array before saving.
# Handing np.save() the ragged list [2x3 array, length-3 array] relies on
# implicit ragged-array creation, which NumPy 1.24+ rejects with ValueError.
bundle = np.empty(2, dtype=object)
bundle[0] = np.asanyarray(a, dtype=object)
bundle[1] = b
np.save('d2.npy', bundle)

In [22]:
# Object arrays are stored with pickle, so loading them has to be allowed
# explicitly.
x = np.load(file="d2.npy", allow_pickle=True)

In [23]:
# A bare expression on the last line of a cell makes the notebook show it.
x

array([array([[2, 3, 11],
              [-5, 9, 19]], dtype=object),
       array(['JJ', 'AX', 'BY'], dtype=object)], dtype=object)

#### More operations using genfromtxt()

#### Python io module 

io module is used to deal with various I/O types. 

There are 3 main i/o types:

    1) text
    
    2) binary
    
    3) raw
    
Modes are:

    1) read-only
    
    2) write-only
    
    3) read-write
    
We will look at StringIO, an in-memory text buffer that is an efficient way to build large strings.

In [26]:
from io import StringIO
# An empty in-memory text buffer that can be written to like a file.
out = StringIO()

In [27]:
# Send the text to the buffer instead of the screen; end="" keeps print
# from adding a trailing newline.
print("Watermelon", end="", file=out)

In [28]:
# getvalue() returns the text written to the buffer.
print(out.getvalue())

Watermelon


In [31]:
# Writing to memory files

# Build the text piece by piece in an in-memory buffer: each number is
# followed by a single space.
mem_file = StringIO()
for i in range(5):
    mem_file.write(f"{i} ")

# Take the accumulated text out before the buffer is closed.
data = mem_file.getvalue()
print_name_value(data)
mem_file.close()

data:
0 1 2 3 4 


#### Text files

In [60]:
# Write ten records: an index, then a quoted field that holds twice the
# index and itself contains a comma.
with open('file1.txt', 'w') as out_file:
    for i, j in zip(range(10), range(0, 20, 2)):
        out_file.write(f'{i},"{j}, text"\n')

In [53]:
# Echo the file line by line. Iterating the file object streams one line at
# a time instead of first loading every line into a list, as readlines() does.
with open('file1.txt', 'r') as in_file:
    for line in in_file:
        # Each line already ends with "\n", so suppress print's own newline.
        print(line, end='')


0,"0, text"
1,"2, text"
2,"4, text"
3,"6, text"
4,"8, text"
5,"10, text"
6,"12, text"
7,"14, text"
8,"16, text"
9,"18, text"


#### Binary files

In [64]:
# Reading a binary file: show at most the first 20 two-byte chunks.
with open('sample_image.png', 'rb') as in_file:
    for _ in range(20):
        # Named "chunk" rather than "bytes" so the built-in bytes type is
        # not shadowed for the rest of the session.
        chunk = in_file.read(2)
        if not chunk:
            # read() returns b"" once the end of the file is reached.
            break
        print(chunk)


b'\x89P'
b'NG'
b'\r\n'
b'\x1a\n'
b'\x00\x00'
b'\x00\r'
b'IH'
b'DR'
b'\x00\x00'
b'\x06\x0e'
b'\x00\x00'
b'\x01\xf6'
b'\x08\x06'
b'\x00\x00'
b'\x00\xa8'
b'Kw'
b'R\x00'
b'\x00\x04'
b'\x0ei'
b'CC'


In [72]:
# Writing and then reading a binary file.
# Write the digits 0-8, each encoded to bytes.
with open('bin_file.bin', 'wb') as out_file:
    for digit in map(str, range(9)):
        out_file.write(digit.encode())

# Read the file back a single byte at a time. The b"" that read() returns
# at end of file is printed too, and then ends the loop.
with open('bin_file.bin', 'rb') as in_file:
    while True:
        byte = in_file.read(1)
        print(byte)
        if byte == b"":
            break


b'0'
b'1'
b'2'
b'3'
b'4'
b'5'
b'6'
b'7'
b'8'
b''
