In [1]:
from Bio import SeqIO


In [2]:
# Qualifier names considered acceptable when copying GenBank qualifiers into the feature table.
# GenBank spells multi-word qualifiers with underscores, so the keys parsed from a GenBank file
# are e.g. 'collection_date' and 'isolation_source' -- never the hyphenated forms.
allowed_qualifiers = [
    # feature-level qualifiers
    'locus_tag', 'gene', 'product', 'pseudo', 'protein_id', 'gene_desc', 'old_locus_tag', 'note', 'inference',
    # source-level qualifiers
    'organism', 'mol_type', 'strain', 'sub_species',
    'isolation-source',  # legacy hyphenated spelling; never matches a parsed key, kept for backward compatibility
    'country', 'collection_date',
    'isolation_source',  # the spelling actually used in GenBank files
]

In [3]:
# Parse the curated GenBank file from disk and keep all of its records in memory
# as a list of SeqRecord objects, so they can be iterated more than once.
records = list(
    SeqIO.parse('CuratedAnnotationV2/Erdman.curated.V2.gb', 'genbank')
)


In [4]:
# Convert every GenBank record into a FASTA file (.fsa) holding the sequences and a
# tab-delimited feature table (.tbl) holding the annotation.
# The `with` block guarantees both files are flushed and closed even if a record raises.
with open('CuratedAnnotationV2/Erdman.V2.fsa', 'w') as fasta_fh, \
        open('CuratedAnnotationV2/Erdman.V2.tbl', 'w') as feature_fh:
    for rec in records:
        # Append this record's sequence; the FASTA header is built from rec.id and rec.description.
        SeqIO.write([rec], fasta_fh, 'fasta')

        # First line of this record's section in the feature table: the LOCUS name.
        print('>Feature %s' % (rec.name), file=feature_fh)
        for f in rec.features:
            # Biopython locations are 0-based and end-exclusive; the table is 1-based and inclusive.
            start = int(f.location.start) + 1
            end = int(f.location.end)
            # Only features explicitly on the reverse strand are written end-first.
            # Unstranded features (strand None or 0) are written in forward orientation.
            if f.location.strand == -1:
                start, end = end, start
            print('%d\t%d\t%s' % (start, end, f.type), file=feature_fh)

            # Every CDS needs a product. Qualifier values are lists, so the default must be
            # a list too -- a bare string would be iterated character by character below.
            if (f.type == 'CDS') and ('product' not in f.qualifiers):
                f.qualifiers['product'] = ['hypothetical protein']

            # Write every qualifier, one line per value. Filtering by `allowed_qualifiers` is
            # currently disabled; to re-enable it, skip keys with
            # `if key not in allowed_qualifiers: continue`.
            for (key, values) in f.qualifiers.items():
                if isinstance(values, str):
                    # Tolerate a qualifier stored as a single string instead of a list.
                    values = [values]
                for v in values:
                    print('\t\t\t%s\t%s' % (key, v), file=feature_fh)



In [5]:
# Print table2asn's usage summary to check which command-line flags are available.
!table2asn --help

USAGE
  table2asn [-h] [-help] [-help-full] [-xmlhelp] [-indir Directory]
    [-outdir Directory] [-E] [-x String] [-i InFile] [-aln-file InFile]
    [-aln-gapchar STRING] [-aln-missing STRING] [-aln-alphabet STRING]
    [-o OutFile] [-out-suffix String] [-binary] [-t InFile] [-a String] [-J]
    [-A String] [-C String] [-j String] [-src-file InFile] [-accum-mods]
    [-y String] [-Y InFile] [-D InFile] [-f InFile] [-V String] [-q] [-U] [-T]
    [-P] [-W] [-K] [-H String] [-Z] [-split-dr] [-c String] [-z OutFile]
    [-N String] [-w InFile] [-M String] [-l String]
    [-linkage-evidence-file InFile] [-gap-type String] [-m String]
    [-ft-url String] [-ft-url-mod String] [-gaps-min Integer]
    [-gaps-unknown Integer] [-postprocess-pubs] [-locus-tag-prefix String]
    [-no-locus-tags-needed] [-euk] [-suspect-rules String] [-allow-acc]
    [-intronless] [-refine-prt-alignments]
    [-prt-alignment-filter-query String] [-logfile LogFile] [-logxml LogFile]
    [-split-logs] [-verbose] [-h

In [6]:
# List the contents of CuratedAnnotationV2/, one entry per line, to confirm which files are present.
!ls -1 CuratedAnnotationV2/

Erdman.V2.fsa
Erdman.V2.sbt
Erdman.V2.tbl
Erdman.curated.V2.gb
Erdman.curated.V2.tsv


## Run `table2asn` to create a `.sqn` file 

In [7]:
# Run table2asn over the CuratedAnnotationV2/ directory: -t is given the .sbt file,
# -indir the directory to read from and -outdir the directory to write to.
# NOTE(review): the run reports "Unrecognized qualifier name" for 'mol_type' and 'db_xref'
# on the 'source' feature of the .tbl file -- confirm whether source qualifiers should be
# written to the feature table at all.
!table2asn -t CuratedAnnotationV2/Erdman.V2.sbt -outdir CuratedAnnotationV2/ -indir CuratedAnnotationV2/

Will be using one threads
Recognized annotation format: five-column feature table
Problem:        Unrecognized qualifier name
SeqId:          lcl|ErdmanSF2024
Line:           4
FeatureName:    source
QualifierName:  mol_type
QualifierValue: genomic DNA


Problem:        Unrecognized qualifier name
SeqId:          lcl|ErdmanSF2024
Line:           5
FeatureName:    source
QualifierName:  db_xref
QualifierValue: taxon:652616


