In [1]:
import numpy as np
import time
from multiprocessing import Pool

## with open

In [2]:
# Read the reference string (first line) and the five rows of integers that
# follow it, timing the read + parse together.
with open('./e-coli.txt') as f:
    start_time = time.time()
    reference = f.readline().strip()
    # The generator is consumed by the tuple unpacking, so the five rows are
    # read and parsed strictly in file order.
    num_A, num_C, num_G, num_T, num_D = (
        [int(v) for v in f.readline().strip().split()] for _ in range(5)
    )
    end_time = time.time()

    print('elapsed-time:', end_time - start_time)

elapsed-time: 13.536487817764282


In [14]:
# Peek at the first ten parsed values instead of printing the whole list.
print(num_A[:10])

[37, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [24]:
# Print the length of each of the five parsed rows, in A, C, G, T, D order.
for counts in (num_A, num_C, num_G, num_T, num_D):
    print(len(counts))

8429809
8429809
8429809
8429809
8429809


In [16]:
# Length of the reference string, for comparison with the row lengths.
print(len(reference))

8429809


In [28]:
# Wrap each row in an outer list so every resulting array is 2-D with a
# single row, ready to be stacked along axis 0.
np_num_A, np_num_C, np_num_G, np_num_T, np_num_D = (
    np.array([row]) for row in (num_A, num_C, num_G, num_T, num_D)
)

In [29]:
# Stack the five single-row arrays on top of each other (row-wise).
nums = np.vstack((np_num_A, np_num_C, np_num_G, np_num_T, np_num_D))

In [30]:
# One row is expected per concatenated array.
print(nums.shape)

(5, 8429809)


## investigate speed of processing

In [38]:
# Measure raw read speed: print the length of every line without parsing it.
# Iterating over the file object yields one line at a time, so only the
# current line is held in memory (readlines() would build a list of all of
# them first). Each line still includes its trailing newline, if any.
with open('./e-coli.txt') as f:
    for line in f:
        print(len(line))

8429810
19241092
19186609
19184375
19236501
24158043


Printing the line lengths is very fast, so most of the time is spent processing (parsing) the lines, not in I/O operations.

## use multiprocessing

In [2]:
def parse_line(line):
    """Parse one whitespace-separated line of integers into a list of ints.

    Leading and trailing whitespace (including the newline) is ignored,
    because ``str.split()`` with no arguments discards it. An empty or
    all-whitespace line yields an empty list. Raises ``ValueError`` if any
    token is not a valid integer literal.
    """
    return list(map(int, line.split()))

parsed_lines = None
# Five worker processes: the reference line is read in this process and each
# remaining line is handed to the pool as one task for parse_line.
# The pool is entered first and the file second, so the file is closed before
# the pool is torn down.
with Pool(5) as pool, open('./e-coli.txt') as f:
    start_time = time.time()
    reference = f.readline().strip()

    lines = f.readlines()
    parsed_lines = pool.map(parse_line, lines)
    end_time = time.time()

    # The timing covers reading the file and the parallel parse, but not
    # pool start-up.
    print('elapsed-time:', end_time - start_time)
print(len(parsed_lines))

elapsed-time: 4.771244525909424
5


In [4]:
# Convert the list of parsed rows into a single array and check its shape.
parsed_lines_np = np.array(parsed_lines)
print(parsed_lines_np.shape)

(5, 8429809)


# investigate numpy array contents

In [9]:
# Small experiment: build an array from one row of strings and one row of
# ints to see how NumPy handles mixed element types. The printed result shows
# the ints stored as strings, i.e. the whole array gets one common dtype.
a = np.array([['a', 'b', 'c'], [1, 2, 3]])
print(a)
print(type(a))
print(a.shape)

[['a' 'b' 'c']
 ['1' '2' '3']]
<class 'numpy.ndarray'>
(2, 3)


# investigate numpy I/O speed with the reference string stored separately

In [5]:
print(len(parsed_lines))
# Wrap every parsed row in its own one-element list so each piece is 2-D with
# a single row, then join the pieces along axis 0.
row_blocks = [[row] for row in parsed_lines]
parsed_lines_np = np.concatenate(row_blocks, axis=0)
print(parsed_lines_np.shape)

5
(5, 8429809)


In [17]:
# Persist the two pieces separately: the numeric rows as a .npy file and the
# reference string as plain text (written without a trailing newline).
np.save('./parsed_lines.npy', parsed_lines_np)
with open('./ref.txt', 'w') as f:
    f.write('{}'.format(reference))

In [18]:
# Read both artifacts back: the array from the .npy file and the reference
# string from the first line of its text file.
read_array = np.load('./parsed_lines.npy')
with open('./ref.txt') as f:
    read_ref = f.readline()

This turns out to be reasonably fast.

## investigate numpy I/O speed with the reference string merged into the array

In [27]:
# Sanity check: splitting the reference into characters keeps its length.
print(len(reference))
print(len(list(reference)))

# One-row 2-D array holding the reference as individual characters.
ref_np = np.array([list(reference.strip())])
print(ref_np.shape)

# Put the character row on top of the integer rows. The two inputs have
# different element types (characters vs. ints), so NumPy has to convert
# them to one common dtype for the result.
merged_np = np.concatenate([ref_np, parsed_lines_np], axis=0)

8429809
8429809
(1, 8429809)


This merging of ints and strings is not fast.

## convert reference to int

In [28]:
# Integer code for each reference symbol: A=0, C=1, G=2, T=3, '-'=4.
# The code is simply the symbol's position in the string below.
mapping = {symbol: code for code, symbol in enumerate('ACGT-')}

# Translate every reference character to its integer code; a character that
# is missing from `mapping` raises KeyError.
ref_converted = list(map(mapping.__getitem__, reference))

In [29]:
# The integer-coded reference becomes a one-row 2-D array placed on top of
# the parsed rows; both inputs hold ints.
ref_row = np.array([ref_converted])
merged_converted_np = np.concatenate([ref_row, parsed_lines_np], axis=0)

In [30]:
# Persist the merged array (integer-coded reference row plus parsed rows).
np.save('merged_converted.npy', merged_converted_np)

In [31]:
# Load the merged array back from the .npy file in the working directory.
read_merged_converted_np = np.load('./merged_converted.npy')