In [1]:
import pysam
from Bio import SeqIO

## Helper functions

In [2]:
def _write_genome_to_fasta(contigs, fasta_file_path, contig_names):
    """Write sequences to a FASTA file, one record per contig.

    Every record is a header line of the form ``>{name} LN:{length}``
    followed by the complete sequence on a single, unwrapped line.

    Args:
        contigs: Iterable of sequences. Each item must support ``len()`` and
            string formatting (for example ``str``).
        fasta_file_path: Destination path. The file is opened in ``'w'`` mode,
            so an existing file is overwritten.
        contig_names: Iterable of header names, one per sequence and in the
            same order as ``contigs``.

    Raises:
        ValueError: If ``contigs`` and ``contig_names`` do not contain the
            same number of items.
    """
    # Materialise both inputs so that they can be counted up front.
    contigs = list(contigs)
    contig_names = list(contig_names)

    # zip() stops at the shorter input, which would silently drop records.
    # Checking before the file is opened also avoids leaving a partial file.
    if len(contigs) != len(contig_names):
        raise ValueError(
            'Got {} contigs but {} contig names'.format(len(contigs), len(contig_names))
        )

    with open(fasta_file_path, 'w') as f:
        for contig, contig_name in zip(contigs, contig_names):
            f.write('>{} LN:{}\n'.format(contig_name, len(contig)))
            f.write('{}\n'.format(contig))

## Read data

In [3]:
# Print what Biopython exposes for every record in the racon consensus FASTA.
consensus_fasta = './f-bact-varium-n15-model-23-racon-hax/consensus.fasta'
for record in SeqIO.parse(consensus_fasta, 'fasta'):
    print('id: ' + str(record.id))
    print('name: ' + str(record.name))
    # The text form of the record returned by reverse_complement() is printed
    # here, not just a sequence.
    print('reverse_complement: ' + str(record.reverse_complement()))
    # Only the first ten bases are shown; the full length follows.
    print('seq: ' + str(record.seq[:10]))
    print('len: ' + str(len(record.seq)))
    print('description: ' + str(record.description))
    print('features: ' + str(record.features))
    print('record:\n ' + str(record))
    print('')

id: contig_0
name: contig_0
reverse_complement: ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('TCCAATTTCTCTATATATTCTCGAGCACTTTCAACAACATCACACAAAATCATC...TTA', SingleLetterAlphabet())
seq: TAAAACTCTT
len: 2542381
description: contig_0 LN:2542381
features: []
record:
 ID: contig_0
Name: contig_0
Description: contig_0 LN:2542381
Number of features: 0
Seq('TAAAACTCTTGTTGACCCATTCATTGGAAAAATATCATTGTTTAAAATTAATTC...GGA', SingleLetterAlphabet())

id: contig_1
name: contig_1
reverse_complement: ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('TGTAACTTCTCCACCTAATCCATTTTCAATATTAGTTATATTCCCAATTGAATC...TCA', SingleLetterAlphabet())
seq: TGACTATTTT
len: 670332
description: contig_1 LN:670332
features: []
record:
 ID: contig_1
Name: contig_1
Description: contig_1 LN:670332
Number of features: 0
Seq('TGACTATTTTGGTACTAAAGAAAGACGAGTAATAGATGAAGAAAGAGAATTTAA...ACA', SingleLetterAlphabet())

id: contig_2


In [4]:
# Print the same kind of summary for the reference FASTA.
reference_fasta = '/home/data/oxford_nanopore/bacteria/fusobacterium/varium/f-varium.fasta'
for record in SeqIO.parse(reference_fasta, 'fasta'):
    print('id: ' + str(record.id))
    print('name: ' + str(record.name))
    # The text form of the record returned by reverse_complement() is printed
    # here, not just a sequence.
    print('reverse_complement: ' + str(record.reverse_complement()))
    # Only the first ten bases are shown; the full length follows.
    print('seq: ' + str(record.seq[:10]))
    print('len: ' + str(len(record.seq)))
    print('')

id: CP028103.1
name: CP028103.1
reverse_complement: ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('AAAAGTGGATTTTAACAGTTTTTCTTTTGATATTTTTTACAGCTTTTTCATCAG...TGA', SingleLetterAlphabet())
seq: TCATTCTTTC
len: 3303644



## Merge the contigs in the correct order and write the result

In [5]:
# Load every consensus record so that contigs can be addressed by index.
contigs = []
for record in SeqIO.parse('./f-bact-varium-n15-model-23-racon-hax/consensus.fasta', 'fasta'):
    contigs += [record]

# good_order: contig indices in the order they are concatenated.
# is_foorward: looked up by contig index; False selects the reverse complement.
good_order = [0, 1, 2]
is_foorward = [True, True, False]

# Concatenate the oriented contigs into one sequence.
seq = ''
for order_index in good_order:
    if is_foorward[order_index]:
        oriented_seq = contigs[order_index].seq
    else:
        oriented_seq = contigs[order_index].reverse_complement().seq
    seq += oriented_seq

# The merged sequence is written as a single record named 'contig_0'.
_write_genome_to_fasta(
    [seq],
    './f-bact-varium-n15-model-23-racon-hax/consensus-merged-contigs.fasta',
    ['contig_0'],
)

## Test

In [21]:
# Read the consensus FASTA by hand (no Biopython): echo every header line and
# collect each record's sequence lines into one string.
new_contigs = []
current_contig = ''
with open('./f-bact-varium-n15-model-23-racon-hax/consensus.fasta') as fasta_handle:
    for raw_line in fasta_handle:
        stripped = raw_line.strip()
        if not stripped.startswith('>'):
            # Sequence line: extend the contig currently being assembled.
            current_contig += stripped
            continue
        # Header line: show it, then close off the previous contig, if any.
        print(stripped)
        if current_contig:
            new_contigs.append(current_contig)
            current_contig = ''
    # The last contig has no following header to flush it.
    if current_contig:
        new_contigs.append(current_contig)
        current_contig = ''

# Actual sequence lengths, for comparison with the LN: values echoed above.
for contig_length in map(len, new_contigs):
    print(contig_length)

_write_genome_to_fasta(
    new_contigs,
    './f-bact-varium-n15-model-23-racon-hax/consensus-test.fasta',
    ['contig_0', 'contig_1', 'contig_2'],
)

>contig_0 LN:3310449
>contig_1 LN:870763
>contig_2 LN:98150
2542381
670332
77317


In [28]:
# Scratch check: an empty string still counts as a list element but adds
# nothing to the joined result.
t = list(('a', 'b', 'c', '', 'd'))
print(len(t), ''.join(t), sep='\n')

5
abcd


In [29]:
# Read the periodonticum consensus FASTA by hand: echo every header line and
# collect each record's sequence lines into one string.
new_contigs = []
pending_contig = ''
with open('./f-bact-periodonticum-n15-model-11-racon-hax/consensus.fasta') as fasta_handle:
    for stripped in map(str.strip, fasta_handle):
        is_header = stripped.startswith('>')
        if is_header:
            print(stripped)
            # A new header ends the contig collected so far, if there is one.
            if pending_contig:
                new_contigs.append(pending_contig)
                pending_contig = ''
        else:
            pending_contig += stripped
    # Flush the final contig, which is not followed by another header.
    if pending_contig:
        new_contigs.append(pending_contig)
        pending_contig = ''

# Actual sequence lengths, for comparison with the LN: values echoed above.
for contig_length in map(len, new_contigs):
    print(contig_length)

# _write_genome_to_fasta(new_contigs, './f-bact-varium-n15-model-23-racon-hax/consensus-test.fasta', ['contig_0', 'contig_1', 'contig_2'])

>contig_0 LN:3742712
2536109
