In [14]:
from antismash.common import gff_parser
from antismash.common import record_processing

import sys
import os
from pathlib import Path

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

def extract_aa(records):
    faa = []
    for record in records:
        for feat in record.features:
            if feat.type=="CDS":
                assert len(feat.qualifiers['translation'])==1
                gene_id = feat.qualifiers['gene'][0]
                locus_tag = feat.qualifiers['locus_tag']
                aa = feat.qualifiers['translation'][0]

                faa_record = SeqRecord(
                    Seq(aa),
                    id=locus_tag,
                    name=locus_tag,
                    description=gene_id,
                )

                faa.append(faa_record)
    return faa

def jgi_gff_to_gbk(fasta, taxon, gff, metadata, genome_id):
    records = record_processing.parse_input_sequence(fasta, 
                                                     taxon=taxon,
                                                     gff_file=gff)
    records_biopython = [r.to_biopython() for r in records]
    
    with open(metadata, 'r') as f:
        taxonomy = f.read().split(",")
        definition = " ".join(taxonomy)
        source = " ".join(taxonomy[0:2])
    
    for record in records_biopython:
        old_name = record.name
        new_name = f"{genome_id}_{old_name.split('_')[-1]}"
        record.name = new_name
        record.id = new_name
        record.annotations['organism'] = source
        record.annotations['source'] = source
        record.description = definition
        for feat in record.features:
            if feat.type=="CDS":
                assert len(feat.qualifiers['translation'])==1
                feat.qualifiers['locus_tag'] = f"{record.name}-{feat.qualifiers['ID'][0]}"
    
    #SeqIO.write(records_biopython, out_file, "genbank")   
    return records_biopython

def jgi_convert_gff(fasta, taxon, gff, metadata, outdir, genome_id):
    records = jgi_gff_to_gbk(fasta, 'fungi', gff, outfile, metadata, genome_id)
    faa = extract_aa(records)
    
    # generate outdir
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    
    # write files
    SeqIO.write(records[0], outdir / f"{genome_id}.gbk", "genbank")
    SeqIO.write(records[0], outdir / f"{genome_id}.fna", "fasta")
    SeqIO.write(faa[0], outdir / f"{genome_id}.faa", "fasta")
    return



In [15]:
fasta = "../data/interim/fasta/1052026.fna"
gff = "../data/interim/prokka/1052026/1052026.gff"
metadata = "../data/interim/prokka/1052026/organism_info.txt"
genome_id = "1052026"
outfile = "test2/test2.gbk"

In [16]:
records = jgi_gff_to_gbk(fasta, 'fungi', gff, outfile, metadata, genome_id)
faa = extract_aa(records)

In [17]:
print(records[1])

ID: 1052026_2
Name: 1052026_2
Description: Aspergillus astellatus CBS 261.93
Number of features: 348
/molecule_type=DNA
/organism=Aspergillus astellatus
/source=Aspergillus astellatus
Seq('CCTTGATGGGAAAACATACACGGCGTTGGACTATAATGACTATAACCTGAAATC...AAA')


In [18]:
SeqIO.write(records[0], outfile, "genbank")
SeqIO.write(records[0], 'test2/test2.fna', "fasta")
SeqIO.write(faa[0], 'test2/test2.faa', "fasta")

1

In [103]:
! head {fasta}

>scaffold_1
TAAAACCCTAATAAAACCCTAAAACCCTAAAACCCTAAAACCCTAAACCCTAATAAAACCCTAAAACCCT
AAAACCCTAAAACCCTAAAACCCTAAACCCTAATAAAACCCTAATAAAAACCCTAATAAAACCCTAATTA
CTAATAGTATTTCTATAAATATTAATTCTAGAATCTTATTTCTAGATAAAATTCCTGGGTCTTCTAGCAG
attaatattattaatatttattataCTATTTCTAGACTTCCGCTGTCTATAATAAAAGAAACTTTCttaa
ttaaattttatatttttttattatCTGCCGTCTGttattttatattattaagtatatttaatttaaataa
ataattaatCTTACTATTCTGGTATCTTCCTAAATTCTTACTTTTCTCTATCAGatataaattaaaaaat
atttcttaaaattattaaaaaagatatatatCCAGCTATTATTAAAATTTCCAAATAACAAGCAGatatt
aattataattaactattaaattaatctagattaaaaaatatttcttactctgattaaatattttctaata
aaaaatataaattattataagattttattaGATTTCTAACCAGTATCTGGATCTTATACTAGTAAAAATC


In [104]:
! head {gff}

##gff-version 3
##sequence-region scaffold_1 1 1316899
scaffold_1	prediction	gene	6441	7718	0	+	.	ID=gene_1;Name=jgi.p|Aspaste1|119594;portal_id=Aspaste1;proteinId=119594;transcriptId=119867
scaffold_1	prediction	mRNA	6441	7718	.	+	.	ID=mRNA_1;Name=jgi.p|Aspaste1|119594;Parent=gene_1;proteinId=119594;track=FilteredModels1;transcriptId=119867
scaffold_1	prediction	exon	6441	7718	.	+	.	ID=exon_1_1;Parent=mRNA_1
scaffold_1	prediction	five_prime_UTR	6441	6477	.	+	.	ID=UTR5_1;Parent=mRNA_1
scaffold_1	prediction	CDS	6478	7623	.	+	0	ID=CDS_1;Parent=mRNA_1
scaffold_1	prediction	three_prime_UTR	7624	7718	.	+	.	ID=UTR3_1;Parent=mRNA_1
scaffold_1	prediction	gene	7754	9181	0	-	.	ID=gene_2;Name=jgi.p|Aspaste1|147021;portal_id=Aspaste1;proteinId=147021;transcriptId=147294
scaffold_1	prediction	mRNA	7754	9181	.	-	.	ID=mRNA_2;Name=jgi.p|Aspaste1|147021;Parent=gene_2;proteinId=147021;track=FilteredModels1;transcriptId=147294


In [105]:
! head test2/test2.gbk

LOCUS       1052026_1            1316899 bp    DNA              UNK 01-JAN-1980
DEFINITION  Aspergillus astellatus CBS 261.93.
ACCESSION   scaffold_1
VERSION     scaffold_1
KEYWORDS    .
SOURCE      Aspergillus astellatus
  ORGANISM  Aspergillus astellatus
            .
FEATURES             Location/Qualifiers
     CDS             6478..7623


In [None]:
from Bio import SeqIO
gbk_filename = "NC_005213.gbk"
faa_filename = "NC_005213_converted.faa"
input_handle  = open(gbk_filename, "r")
output_handle = open(faa_filename, "w")

for seq_record in SeqIO.parse(input_handle, "genbank") :
    print "Dealing with GenBank record %s" % seq_record.id
    for seq_feature in seq_record.features :
        if seq_feature.type=="CDS" :
            assert len(seq_feature.qualifiers['translation'])==1
            output_handle.write(">%s from %s\n%s\n" % (
                   seq_feature.qualifiers['locus_tag'][0],
                   seq_record.name,
                   seq_feature.qualifiers['translation'][0]))

output_handle.close()
input_handle.close()

In [92]:
! head Aspdef1_GeneModels_FilteredModels1_aa.fasta

>jgi|Aspdef1|10003|CE10002_86
MSATGPPVGAQAAASPTTNRESTRTSANVLESPISSPHGLDGSTSRPGHRRIVFTDPVALRYLEEDPSTV
VLHRRLALEGYEIYIVEQWACSRIHPTFVITTYTGDSSHKVVVGVLGVPTNEAAWSDRLRLYFEAFKTCQ
LREKETPLGTIMVTDLNSFPSGLTVIPVPNGDILRHREDFIVNENLKRLGCAGRAGLKLQAPSPATVAKF
HQLYRTSERIPLYSAVIELVKQCQIALMMFGNLAPEYVDGLLCDVTEAAVGDWWTDIGMDLYNIEPSDGR
LGPTTVAALLGTLMGARNRLHAFGAPVSKDVFDISSLKRGIGGFQKSQKLKRTRRLNRHTLDRLHRVTAK
AANAEGWTDAVKSTMAELSGHGGEMVMGMVRGREKGGIADIETIDIDNFAQLATGERAKWLWRGKPRKSA
VAANGPPAADMMFTTDEQGGYVWTSRKRHSHEDLGIDPAFQRSDRSWKPQEVASFPEDKDQNLPRMVIKG
VSEKVSDARVGFGKFKDAVGISGRRSQPQKQSKDGLEIVGDADYIASPESDTEISPSKKIAENYVQSGYE
SAPLPDTQNLDDIEQAEPPPDTALQSAEAKPPEITVEPAASNDDTDTSPKASITRIDDDSQVLDRSKTQS


In [120]:


record = SeqRecord(
    Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF"),
    id="YP_025292.1",
    name="HokC",
    description="toxic membrane protein, small",
)
print(record)

ID: YP_025292.1
Name: HokC
Description: toxic membrane protein, small
Number of features: 0
Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF')


In [128]:
faa = []
for feat in records[0].features:
    if feat.type=="CDS":
        assert len(feat.qualifiers['translation'])==1
        gene_id = feat.qualifiers['gene'][0]
        locus_tag = feat.qualifiers['locus_tag']
        aa = feat.qualifiers['translation'][0]
        
        faa_record = SeqRecord(
            Seq(aa),
            id=locus_tag,
            name=locus_tag,
            description=gene_id,
        )
        
        faa.append(faa_record)

In [129]:
print(faa[0])

ID: 1052026_1_CDS_1
Name: 1052026_1_CDS_1
Description: jgi.p_Aspaste1_119594
Number of features: 0
Seq('MAPVSRTSFRTVDQFIPTASIPMDEEVSWDLPRQETSFEAFSIPQDPQNPEGLQ...ETD')


In [130]:
SeqIO.write(faa[0], "test2/aa.faa", "fasta")

1

In [131]:
SeqIO.write(record[0], "test2/nuc.fna", "fasta")

AttributeError: 'str' object has no attribute 'id'