In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
import os

In [2]:
# Gather the path of every entry in INPUT/gff, leaving out the checkpoint
# folder that Jupyter creates next to the data files.
infile_paths = [
    'INPUT/gff/' + entry
    for entry in os.listdir('INPUT/gff')
    if entry != '.ipynb_checkpoints'
]
infile_paths

['INPUT/gff/d2246f26c16fb9eec9.examples.gff',
 'INPUT/gff/d2248a31d0e133ef11.examples.gff',
 'INPUT/gff/6c3b2ade31d3833745.examples.gff',
 'INPUT/gff/23422406293d40c201.examples.gff',
 'INPUT/gff/4a23bcab7c50591a6a.examples.gff',
 'INPUT/gff/27177e034d397f4d56.examples.gff',
 'INPUT/gff/ba436543bec6c30920.examples.gff',
 'INPUT/gff/7205dff302ff900300.examples.gff',
 'INPUT/gff/e8a14f064cce46f350.examples.gff',
 'INPUT/gff/596adb7fa02d385b18.examples.gff',
 'INPUT/gff/4e45823b6b83464d6e.examples.gff',
 'INPUT/gff/6bf6c4c7da68779d7a.examples.gff',
 'INPUT/gff/0093c124a5b6ee038d.examples.gff',
 'INPUT/gff/c15257c28ad9a20cdd.examples.gff',
 'INPUT/gff/9a8a6e441d6dea8617.examples.gff',
 'INPUT/gff/c1050b21cc75640d51.examples.gff',
 'INPUT/gff/aa6af7e9289c3558d3.examples.gff']

In [7]:
# integrate later
def single_gff_to_fna(infile_path):
    """Split one combined GFF file into an annotation file and a FASTA file.

    The input is read line by line. Every line before the first line that
    starts with ">" is copied to the annotation output; that ">" line and
    everything after it is copied to the FASTA output.

    Args:
        infile_path: Path of the GFF file to split.

    Outputs (both are overwritten if they already exist):
        * annotation part -> ``infile_path`` with "INPUT/gff" replaced by "OUTPUT"
        * FASTA part      -> ``infile_path`` with every "gff" replaced by "fna"

    Returns:
        None.
    """
    fasta_out_path = infile_path.replace("gff", "fna")
    annotation_out_path = infile_path.replace("INPUT/gff", "OUTPUT")
    with open(infile_path, 'r') as source, \
            open(annotation_out_path, 'w') as annotation_out, \
            open(fasta_out_path, "w") as fasta_out:
        in_fasta = False
        for raw_line in source:
            text = raw_line.strip('\n')
            # The first ">" header flips the switch; it never flips back.
            in_fasta = in_fasta or text.startswith(">")
            sink = fasta_out if in_fasta else annotation_out
            sink.write(text + '\n')

In [67]:
def single_gff_to_faa(infile_path):
    """Translate the PRODIGAL features of a GFF file into a protein FASTA file.

    Contig sequences are loaded with ``fasta_id_dict`` from the nucleotide
    FASTA whose path is ``infile_path`` with every "gff" replaced by "fna"
    (the same path ``single_gff_to_fna`` writes to). For each tab-separated
    line that has a field equal to "PRODIGAL", the contig id (column 1),
    start (column 4), end (column 5) and strand (column 7) are joined with
    ";" to form the record header, and the translated sequence returned by
    ``extract_faa_seq`` is written below it.

    Features whose contig id is absent from the nucleotide FASTA are skipped.
    Previously such a feature either raised ``NameError`` (first occurrence)
    or was silently translated from the sequence of the preceding contig.

    Args:
        infile_path: Path of the GFF file to read.

    Output:
        Protein FASTA written to ``infile_path`` with every "gff" replaced by
        "faa"; the file is overwritten if it already exists.

    Returns:
        None.
    """
    faa_path = infile_path.replace("gff", "faa")
    fna_path = infile_path.replace("gff", "fna")
    contigs = fasta_id_dict(fna_path)
    with open(infile_path, 'r') as infile, open(faa_path, "w") as outfile:
        for line in infile:
            fields = line.strip('\n').split('\t')
            if 'PRODIGAL' not in fields:
                continue
            contig_id = fields[0]
            try:
                dna_seq = contigs[contig_id]
            except KeyError:
                # Some features are annotated on contigs without a reference
                # sequence; skip them instead of reusing a stale sequence.
                continue
            header = ';'.join([contig_id, fields[3], fields[4], fields[6]])
            aa_seq = extract_faa_seq(header, dna_seq)
            print('>' + header, file=outfile)
            print(aa_seq, file=outfile)
                

In [68]:
def extract_faa_seq(header, seq):
    """Cut one feature out of a contig sequence and translate it.

    Args:
        header: ";"-separated string whose last three fields are start, end
            and strand, in that order.
        seq: Sequence of the contig the feature lies on.

    Returns:
        The result of ``Seq.translate()`` on ``seq[start - 1:end]`` for
        strand "+", or on its reverse complement for strand "-". ``None``
        for any other strand value.
    """
    fields = header.split(';')
    # Start is decremented so that [begin:stop] covers start..end inclusive.
    begin = int(fields[-3]) - 1
    stop = int(fields[-2])
    orientation = fields[-1]
    if orientation not in ('+', '-'):
        return None
    coding = Seq(seq[begin:stop])
    if orientation == '-':
        coding = coding.reverse_complement()
    return coding.translate()
    
def fasta_id_dict(fna_path):
    """Load a FASTA file into a dictionary keyed by record id.

    Args:
        fna_path: Path of the FASTA file to read.

    Returns:
        dict mapping each record's ``id`` to its ``seq``, as yielded by
        ``SeqIO.parse(..., 'fasta')``. If an id occurs more than once, the
        last record with that id is the one kept.
    """
    with open(fna_path) as fasta_handle:
        return {
            record.id: record.seq
            for record in SeqIO.parse(fasta_handle, 'fasta')
        }
        
    