In [1]:
import h5py
from pod5 import DatasetReader
import pyslow5

from src.Containers.Fasta import Fasta
from src.Containers.FastQ import FastQ
from src.Containers.Pod5 import Pod5
from src.Containers.Fast5 import Fast5
from src.Containers.Blow5 import Blow5
from src.Containers.Slow5 import Slow5

from src.Containers.ClassifContainer import ClassifContainer
from src.WriterSystem.WriterWrapper import WriterWrapper

from src.ReaderSystem.ReaderWrapper import ReaderWrapper

In [3]:
# Build four classified FASTA records: three labelled 'E.coli' and one labelled "Bacillus".
fasta_0 = ClassifContainer(Fasta('Coli_0', 'AAAAA'), 'E.coli')
fasta_1 = ClassifContainer(Fasta('Coli_1', 'ATAAA'), 'E.coli')
fasta_2 = ClassifContainer(Fasta('Coli_2', 'AAGAA'), 'E.coli')

# Fasta takes (header, sequence), in the same order as the E.coli records above;
# the header and the sequence used to be swapped for this record.
fasta_3 = ClassifContainer(Fasta('Bacillus_0', 'GGAATTT'), "Bacillus")

fasta_coll = [fasta_0, fasta_1, fasta_2, fasta_3]

# Hand the whole collection to the writer in a single call.
# NOTE(review): the meaning of the positional arguments (False, 2) is defined
# by WriterWrapper, which is not visible in this notebook.
with WriterWrapper(False, 2, 'fasta') as oww:
    oww.write(fasta_coll)
# end with

In [4]:
# (header, sequence, quality) for the three reads labelled "E.coli".
ecoli_fastq_fields = [
    ("Coli_0", "AAAAA", "!!!!!"),
    ("Coli_1", "AAATA", "!!!!!"),
    ("Coli_2", "AAATTT", "!!!!!!"),
]

# Every record uses a bare "+" as its plus line.
fastq_0, fastq_1, fastq_2 = [
    ClassifContainer(FastQ(header, seq, "+", qual), "E.coli")
    for header, seq, qual in ecoli_fastq_fields
]

# One extra read filed under a different label.
fastq_3 = ClassifContainer(
    FastQ("Bacillus_0", "GGAATTT", "+", "!!!!!!!"),
    'B.subtilis'
)

fastq_coll = [fastq_0, fastq_1, fastq_2, fastq_3]

# Write the whole collection in one call.
with WriterWrapper(False, 2, 'fastq') as oww:
    oww.write(fastq_coll)
# end with



In [5]:
# FastQ is already imported in the notebook's import cell, so it is not re-imported here.
#
# NOTE(review): the saved output of this cell (9.9235...) equals the
# error-probability-averaged Phred score of "*+," decoded with offset 33
# (Q = 9, 10, 11). Decoded with offset 34 (Q = 8, 9, 10) the same formula
# gives ~8.92. Either the saved output is stale or `offset` is applied
# differently than assumed here -- confirm against FastQ.average_quality.
fastq = FastQ("Coli_0", "AAA", "+", "*+,", offset=34)

# Last expression of the cell, so the notebook displays the value.
fastq.average_quality()



9.923583702678474

In [6]:
# Open the POD5 dataset and obtain a read iterator from it.
with DatasetReader("/home/deynonih/cager/NGS-test-data/for_barapost/FAY62206_04e2939b_50079bd0_18.pod5") as dataset:
    reads = dataset.reads()

# Pull the first three read records; later cells wrap them in Pod5 containers.
# NOTE(review): `reads` is consumed here, after the `with` block has already
# exited. Whether the dataset is still safely readable at this point depends on
# pod5's DatasetReader -- confirm, or keep the reader open for as long as
# the records are in use.
first = next(reads)
second = next(reads)
third = next(reads)

In [10]:
# Echo `second` so the notebook displays its repr.
second

<pod5.reader.ReadRecord at 0x7fef14d64af0>

In [11]:
# Wrap the three POD5 reads as 'E.coli' records.
pod5_0 = ClassifContainer(Pod5(first), 'E.coli')
pod5_1 = ClassifContainer(Pod5(second), 'E.coli')
pod5_2 = ClassifContainer(Pod5(third), 'E.coli')

# The first read is deliberately wrapped a second time under another label.
pod5_3 = ClassifContainer(Pod5(first), "Bacillus")

# Named after its content (was `fasta_coll`), matching fastq_coll / blow5_coll / slow5_coll.
pod5_coll = [pod5_0, pod5_1, pod5_2, pod5_3]

# Hand the whole collection to the writer in a single call.
with WriterWrapper(False, 2, 'pod5') as oww:
    oww.write(pod5_coll)
# end with

In [12]:
# Open the source multi-FAST5 file read-only. The `with` block guarantees the
# HDF5 handle is closed even if building or writing the containers raises.
# The handle was called `out_file_handle`, although it is opened in 'r' mode.
with h5py.File('/home/deynonih/cager/NGS-test-data/for_barapost/FAL91692_72e297f8_0.fast5', 'r') as fast5_file_handle:
    fast5_0 = ClassifContainer(Fast5(fast5_file_handle, 'read_00001da1-4ebb-4ce6-9128-d050d3696fcc'), 'E.coli')
    fast5_1 = ClassifContainer(Fast5(fast5_file_handle, 'read_000ab04f-2712-4604-87d5-2f46b6a87895'), 'E.coli')
    fast5_2 = ClassifContainer(Fast5(fast5_file_handle, 'read_000eb25f-b049-435f-82f6-1fee4796f0c4'), 'E.coli')

    # The first read is deliberately wrapped a second time under another label.
    fast5_3 = ClassifContainer(Fast5(fast5_file_handle, 'read_00001da1-4ebb-4ce6-9128-d050d3696fcc'), "Bacillus")

    # Named after its content (was `fasta_coll`).
    fast5_coll = [fast5_0, fast5_1, fast5_2, fast5_3]

    # NOTE(review): this is the only writer cell that passes True as the first
    # WriterWrapper argument (the others pass False) -- confirm this is intended.
    with WriterWrapper(True, 2, 'fast5') as oww:
        oww.write(fast5_coll)
    # end with
# end with



In [3]:
# Open the BLOW5 file for reading; the try/finally below makes sure it is
# closed even if fetching, wrapping or writing the reads raises.
s5 = pyslow5.Open('/home/deynonih/cager/NGS-test-data/for_barapost/FAY62206_04e2939b_50079bd0_18.blow5', 'r')

try:
    # Fetch three reads by their IDs.
    first = s5.get_read('c018e0f4-2cf9-4b36-8961-751cc03d8dd5')
    second = s5.get_read('0082f3c8-8b36-443b-a50a-064918f7e871')
    third = s5.get_read('34f2315d-1dfc-43cb-a772-c348ab46aa59')

    blow5_0 = ClassifContainer(Blow5(first), 'E.coli')
    blow5_1 = ClassifContainer(Blow5(second), 'E.coli')
    blow5_2 = ClassifContainer(Blow5(third), 'E.coli')

    # The third read is deliberately wrapped a second time under another label.
    blow5_3 = ClassifContainer(Blow5(third), "Bacillus")

    blow5_coll = [blow5_0, blow5_1, blow5_2, blow5_3]

    # Hand the whole collection to the writer in a single call.
    with WriterWrapper(False, 2, 'blow5') as oww:
        oww.write(blow5_coll)
    # end with
finally:
    s5.close()
# end try

In [2]:
# Open the SLOW5 file for reading; the try/finally below makes sure it is
# closed even if fetching, wrapping or writing the reads raises.
s5 = pyslow5.Open('/home/deynonih/cager/NGS-test-data/for_barapost/FAY62206_04e2939b_50079bd0_18.slow5', 'r')

try:
    # Fetch three reads by their IDs.
    first = s5.get_read('c018e0f4-2cf9-4b36-8961-751cc03d8dd5')
    second = s5.get_read('0082f3c8-8b36-443b-a50a-064918f7e871')
    third = s5.get_read('34f2315d-1dfc-43cb-a772-c348ab46aa59')

    slow5_0 = ClassifContainer(Slow5(first), 'E.coli')
    slow5_1 = ClassifContainer(Slow5(second), 'E.coli')
    slow5_2 = ClassifContainer(Slow5(third), 'E.coli')

    # The third read is deliberately wrapped a second time under another label.
    slow5_3 = ClassifContainer(Slow5(third), "Bacillus")

    slow5_coll = [slow5_0, slow5_1, slow5_2, slow5_3]

    # Hand the whole collection to the writer in a single call.
    with WriterWrapper(False, 2, 'slow5') as oww:
        oww.write(slow5_coll)
    # end with
finally:
    s5.close()
# end try

In [3]:
# Drain a BLOW5 reader by calling next() on it until StopIteration is raised.
# NOTE(review): mode='BOBER' is not used by any other cell in this notebook --
# confirm that it is a mode ReaderWrapper actually supports.
try:
    with ReaderWrapper(
        '/home/deynonih/cager/NGS-test-data/for_barapost/FAY62206_04e2939b_50079bd0_18.blow5',
        packet_size=1,
        probing_batch_size=-1,
        mode='BOBER',
    ) as f:
        # The returned packets are discarded; only the iteration itself is exercised.
        while True:
            next(f)
        # end while
    # end with
except StopIteration:
    # Any StopIteration escaping the `with` block ends the cell quietly.
    pass
# end try



In [4]:
fasta_test_fpath = '/home/deynonih/cager/NGS-test-data/for_barapost/contigs.fasta.gz'

# Iterate over the gzipped FASTA file batch by batch; for each batch print the
# records, then the summed length of their `seq` attributes, then a blank line.
with ReaderWrapper(
    fasta_test_fpath,
    mode='sum_seq_len',
    packet_size=40000,
    probing_batch_size=20,
    max_seq_len=10000,
) as ifh:
    for seq_batch in ifh:
        print(seq_batch)
        print(sum(len(sr.seq) for sr in seq_batch))
        print()
    # end for
# end with


[Fasta(
    header='>NODE_1_length_26183_cov_2.977840',
    sequence='GACCCGCAGGTCGCGGTCGGCGAACCCGGC../26,123bp/..GTGAAGCTGCTAAAGGTTGTTAGAGATTTG'
), Fasta(
    header='>NODE_2_length_11992_cov_2.917567',
    sequence='TGTAACCTTTAAACTTAAACTCACGTACCA../11,932bp/..AAAGATATCTGGTTTATTGAGGGCAAGGGC'
), Fasta(
    header='>NODE_3_length_11322_cov_3.701606',
    sequence='GCTCGGCATCCGCGGCATGGTGGATGAGGC../11,262bp/..ACCAAGTGCCGCTTCCCTCCGAATGAGCGA'
), Fasta(
    header='>NODE_4_length_10922_cov_4.052544',
    sequence='GCGAGAAACTTGCCAGGATGATGCTTGGAG../10,862bp/..CCCTTTGTTTTCTCATAGACTTGCTGGCAT'
)]
60419

[Fasta(
    header='>NODE_5_length_10759_cov_4.271020',
    sequence='GCCGGGTCATCGATGCAGTCCTCGGAACCG../10,699bp/..CCGGCGGGAGACGGCGCCGGCCAGGAGCCC'
), Fasta(
    header='>NODE_6_length_10324_cov_3.083552',
    sequence='ATTGCCGGCAGATGCTGCAATGACTCCATG../10,264bp/..GACAAGAGTCAAAGAAAATCTTAAGGAAGT'
), Fasta(
    header='>NODE_7_length_10236_cov_4.014832',
    sequence='GGGGGGGGGCGCCCCCCCCCCCCCGGGGGG../1

In [2]:
fastq_test_fpath = '/home/deynonih/cager/NGS-test-data/for_barapost/fastq_runid_974c5012fcdcf71c2b20c023400847a3f30c2519_118_0.fastq.gz'

# Print every batch yielded by the reader, each followed by a blank line.
with ReaderWrapper(fastq_test_fpath, packet_size=2, probing_batch_size=10) as ifh:
    for seq_batch in ifh:
        print(seq_batch, end='\n\n')
    # end for
# end with


[FastQ(
    header='@711bf04d-20f5-4418-9712-16168ef98999 runid=974c5012fcdcf71c2b20c023400847a3f30c2519 sampleid=no_sample read=6631 ch=90 start_time=2020-12-29T13:37:03Z', 
    sequence='GTTGTATTAATTCGTGTAATATATGCCTTA../466chars/..ATTGCGTCATAATGCGAATGTCAATGAGCG', 
    plus_line='+', 
    quality="$$&##$$#$')+*%)$$&&$%#&%%$%%''../466chars/..&%'(-%$$#'$$($#%&''%+(&&',+(*$"
), FastQ(
    header='@52a41318-f8b9-4d61-a21f-4a907f107973 runid=974c5012fcdcf71c2b20c023400847a3f30c2519 sampleid=no_sample read=6655 ch=431 start_time=2020-12-29T13:36:59Z', 
    sequence='GTTGTACTTCGTTCAGTTGCGTATTGCTGC../1,116chars/..CATGGGCGCATATGCGAACAGCCATGTGTG', 
    plus_line='+', 
    quality="%,/363;<CCCBD;8;=?%('60<@=E::D../1,116chars/..CEA:<B=(7CFDC:<,'%'3%%'---,,),"
)]

[FastQ(
    header='@62b9dc2d-5462-4c2e-97f2-b8e70c90de50 runid=974c5012fcdcf71c2b20c023400847a3f30c2519 sampleid=no_sample read=6912 ch=247 start_time=2020-12-29T13:36:35Z', 
    sequence='GTGTACTTCGTTCAGTTACGTATTGCTTAT../6,219chars/..C

In [5]:
fast5_test_fpath = '/home/deynonih/cager/NGS-test-data/for_barapost/FAL91692_72e297f8_0.fast5'

# Print the first five batches yielded by the reader, each followed by a blank line.
with ReaderWrapper(fast5_test_fpath) as ifh:
    for batch_no, seq_batch in enumerate(ifh, start=1):
        print(seq_batch, end='\n\n')
        # Stop right after the fifth batch, without asking the reader for a sixth.
        if batch_no >= 5:
            break
        # end if
    # end for
# end with


[Fast5(
    file_handle=<HDF5 file "FAL91692_72e297f8_0.fast5" (mode r)>,
    read_id='read_00001da1-4ebb-4ce6-9128-d050d3696fcc'
)]

[Fast5(
    file_handle=<HDF5 file "FAL91692_72e297f8_0.fast5" (mode r)>,
    read_id='read_000ab04f-2712-4604-87d5-2f46b6a87895'
)]

[Fast5(
    file_handle=<HDF5 file "FAL91692_72e297f8_0.fast5" (mode r)>,
    read_id='read_000eb25f-b049-435f-82f6-1fee4796f0c4'
)]

[Fast5(
    file_handle=<HDF5 file "FAL91692_72e297f8_0.fast5" (mode r)>,
    read_id='read_00275507-50ed-4f59-97e3-a95fecb2fd0b'
)]

[Fast5(
    file_handle=<HDF5 file "FAL91692_72e297f8_0.fast5" (mode r)>,
    read_id='read_00496e0f-8e9d-44c9-b612-ddd6ec792b4f'
)]



In [6]:
fast5_test_fpath = '/home/deynonih/cager/NGS-test-data/for_barapost/FAL91692_72e297f8_0.fast5'

# Read the multi-FAST5 file batch by batch and write every record back out
# under the single label 'test_seq'.
with ReaderWrapper(fast5_test_fpath) as ifh:
    with WriterWrapper(False, 2, 'fast5') as ofh:
        for seq_batch in ifh:
            classif_containers = [ClassifContainer(sr, 'test_seq') for sr in seq_batch]
            ofh.write(classif_containers)
        # end for
    # end with
# end with

KeyboardInterrupt: 

In [16]:
fast5_test_fpath = '/home/deynonih/cager/NGS-test-data/for_barapost/FAL91692_72e297f8_0_single/0/00001da1-4ebb-4ce6-9128-d050d3696fcc.fast5'

# Same round trip as for the multi-FAST5 file, but starting from a single-read
# FAST5 file: every record is written back out under the label 'test_seq'.
with ReaderWrapper(fast5_test_fpath) as ifh:
    with WriterWrapper(False, 2, 'fast5') as ofh:
        for seq_batch in ifh:
            classif_containers = [
                ClassifContainer(seq_record, 'test_seq')
                for seq_record in seq_batch
            ]
            ofh.write(classif_containers)
        # end for
    # end with
# end with

In [11]:
# Display the record wrapped by the first container of the most recently built batch.
# NOTE(review): this cell's execution count (11) is lower than that of the cell
# above it (16), so the saved output may come from an earlier run.
classif_containers[0].record

Fast5(
    file_handle=<HDF5 file "00001da1-4ebb-4ce6-9128-d050d3696fcc.fast5" (mode r)>,
    read_uuid='read_00001da1-4ebb-4ce6-9128-d050d3696fcc'
)

In [10]:
blow5_test_fpath = '/home/deynonih/cager/NGS-test-data/for_barapost/FAY62206_04e2939b_50079bd0_18.blow5'

# Print the first five batches yielded by the reader, each followed by a blank line.
with ReaderWrapper(blow5_test_fpath) as ifh:
    for batch_no, seq_batch in enumerate(ifh, start=1):
        print(seq_batch, end='\n\n')
        # Stop right after the fifth batch, without asking the reader for a sixth.
        if batch_no >= 5:
            break
        # end if
    # end for
# end with


[Blow5(record={'read_id': 'c018e0f4-2cf9-4b36-8961-751cc03d8dd5', 'read_group': 0, 'digitisation': 8192.0, 'offset': 4.0, 'range': 1787.8380126953125, 'sampling_rate': 5000.0, 'len_raw_signal': 139005, 'signal': array([511, 486, 496, ..., 318, 240, 264], dtype=int16)})]

[Blow5(record={'read_id': '0082f3c8-8b36-443b-a50a-064918f7e871', 'read_group': 0, 'digitisation': 8192.0, 'offset': 4.0, 'range': 1787.8380126953125, 'sampling_rate': 5000.0, 'len_raw_signal': 11877, 'signal': array([534, 523, 526, ..., 411, 503, 633], dtype=int16)})]

[Blow5(record={'read_id': '34f2315d-1dfc-43cb-a772-c348ab46aa59', 'read_group': 0, 'digitisation': 8192.0, 'offset': 4.0, 'range': 1787.8380126953125, 'sampling_rate': 5000.0, 'len_raw_signal': 22636, 'signal': array([501, 506, 500, ..., 443, 463, 472], dtype=int16)})]

[Blow5(record={'read_id': '20f9f51c-56f9-4646-9fca-dd441b8dcc66', 'read_group': 0, 'digitisation': 8192.0, 'offset': 21.0, 'range': 1787.8380126953125, 'sampling_rate': 5000.0, 'len_raw_

In [11]:
slow5_test_fpath = '/home/deynonih/cager/NGS-test-data/for_barapost/FAY62206_04e2939b_50079bd0_18.slow5'

# Print the first five batches yielded by the reader, each followed by a blank line.
with ReaderWrapper(slow5_test_fpath) as ifh:
    for batch_no, seq_batch in enumerate(ifh, start=1):
        print(seq_batch, end='\n\n')
        # Stop right after the fifth batch, without asking the reader for a sixth.
        if batch_no >= 5:
            break
        # end if
    # end for
# end with


[Slow5(
    record={'read_id': 'c018e0f4-2cf9-4b36-8961-751cc03d8dd5', 'read_group': 0, 'digitisation': 8192.0, 'offset': 4.0, 'range': 1787.838013, 'sampling_rate': 5000.0, 'len_raw_signal': 139005, 'signal': array([511, 486, 496, ..., 318, 240, 264], dtype=int16)}
)]

[Slow5(
    record={'read_id': '0082f3c8-8b36-443b-a50a-064918f7e871', 'read_group': 0, 'digitisation': 8192.0, 'offset': 4.0, 'range': 1787.838013, 'sampling_rate': 5000.0, 'len_raw_signal': 11877, 'signal': array([534, 523, 526, ..., 411, 503, 633], dtype=int16)}
)]

[Slow5(
    record={'read_id': '34f2315d-1dfc-43cb-a772-c348ab46aa59', 'read_group': 0, 'digitisation': 8192.0, 'offset': 4.0, 'range': 1787.838013, 'sampling_rate': 5000.0, 'len_raw_signal': 22636, 'signal': array([501, 506, 500, ..., 443, 463, 472], dtype=int16)}
)]

[Slow5(
    record={'read_id': '20f9f51c-56f9-4646-9fca-dd441b8dcc66', 'read_group': 0, 'digitisation': 8192.0, 'offset': 21.0, 'range': 1787.838013, 'sampling_rate': 5000.0, 'len_raw_signa

In [17]:
pod5_test_fpath = '/home/deynonih/cager/NGS-test-data/for_barapost/FAY62206_04e2939b_50079bd0_18.pod5'

# For the first five batches, print the read ID of the batch's first record,
# each followed by a blank line.
with ReaderWrapper(pod5_test_fpath) as ifh:
    for batch_no, seq_batch in enumerate(ifh, start=1):
        print(seq_batch[0].record.read_id, end='\n\n')
        # Stop right after the fifth batch, without asking the reader for a sixth.
        if batch_no >= 5:
            break
        # end if
    # end for
# end with


c018e0f4-2cf9-4b36-8961-751cc03d8dd5

0082f3c8-8b36-443b-a50a-064918f7e871

34f2315d-1dfc-43cb-a772-c348ab46aa59

20f9f51c-56f9-4646-9fca-dd441b8dcc66

57d94d8a-a448-47ef-8037-e1d83081c2fe

