In [22]:
from Bio import SeqIO
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
import os.path
import os

# The 13 gene names used in the MIDORI file names.
genes = [
    'A6', 'A8',
    'CO1', 'CO2', 'CO3',
    'Cytb',
    'ND1', 'ND2', 'ND3', 'ND4', 'ND4L', 'ND5', 'ND6',
]
# Genes whose CDS feature is written with strand -1 (all others get +1).
neg_genes = ['ND1', 'ND4', 'ND4L', 'ND5']

# Taxon to extract; entries are kept when this string occurs in their id.
TAXA = 'Orthoptera'
# Folder holding the MIDORI FASTA files.
PATH_TO_MIDORI_FOLDER = '/mnt/data/Documents/lab/TermitesAndCockroaches/MIDORI'

# Set to True to use only the CO1 gene for subsequent codon usage calculations.
JUST_CO1 = True

# Output folder for the generated GenBank files (separate folder for CO1-only runs).
PATH_TO_GB = (
    f'/mnt/data/Documents/lab/TermitesAndCockroaches/mtdna-mutspec-insecta/data/MIDORI/{TAXA}_CO1_genbanked'
    if JUST_CO1
    else f'/mnt/data/Documents/lab/TermitesAndCockroaches/mtdna-mutspec-insecta/data/MIDORI/{TAXA}_genbanked'
)

In [23]:
def parser(taxa, gene):
    """Write one GenBank file per species for `gene` from the MIDORI FASTA files.

    Reads the nucleotide and amino-acid FASTA files for `gene` from
    PATH_TO_MIDORI_FOLDER, keeps the entries whose id contains `taxa`, and
    writes `{PATH_TO_GB}/{species}/{species}_{gene}.gb` for every species
    found in the nucleotide file. Each record carries a single CDS feature
    spanning the whole sequence (strand -1 for genes listed in `neg_genes`,
    +1 otherwise) whose `translation` qualifier holds the protein sequence.

    Args:
        taxa: String that must occur in an entry id for the entry to be kept.
        gene: Gene name; used in the input file names, the output file name
            and the `gene` qualifier.

    Raises:
        KeyError: If a selected species has no entry in the protein file.
    """
    strand_type = -1 if gene in neg_genes else +1
    db = f'{PATH_TO_MIDORI_FOLDER}/MIDORI2_LONGEST_NUC_GB255_{gene}_BLAST.fasta'
    prot_db = f'{PATH_TO_MIDORI_FOLDER}/MIDORI2_LONGEST_AA_GB255_{gene}_BLAST.fasta'

    # The entry id is split on ';': field 7 is used as the species name and
    # fields 1-5 as the lineage. Both are merged into one 'species|lineage' key.
    all_sp_seqs = {}
    for entry in SeqIO.parse(db, 'fasta'):
        if taxa in entry.id:
            taxonomy = entry.id.split(';')
            all_sp_seqs[f'{taxonomy[7]}|{";".join(taxonomy[1:6])}'] = entry.seq

    # Protein sequences are keyed by species name only.
    all_sp_prot = {}
    for entry in SeqIO.parse(prot_db, 'fasta'):
        if taxa in entry.id:
            all_sp_prot[entry.id.split(';')[7]] = entry.seq

    for sp, seq in all_sp_seqs.items():
        # Split the merged key once instead of on every use.
        key_fields = sp.split('|')
        species = key_fields[0]
        lineage = key_fields[1].split(';')

        # Fail with an explicit message before creating anything on disk.
        if species not in all_sp_prot:
            raise KeyError(f'no {gene} protein sequence found for species {species!r}')

        species_dir = f'{PATH_TO_GB}/{species}'
        os.makedirs(species_dir, exist_ok=True)

        # entry.seq is already a Seq object, so it is used directly.
        record = SeqRecord(seq, id=species, name=species)
        feature = SeqFeature(
            FeatureLocation(start=0, end=len(seq), strand=strand_type),
            type='CDS',
            qualifiers={'gene': gene, 'translation': all_sp_prot[species]},
        )
        record.features.append(feature)
        record.annotations['organism'] = species
        record.annotations['taxonomy'] = lineage
        record.annotations['molecule_type'] = 'dna'

        with open(f'{species_dir}/{species}_{gene}.gb', 'w') as handle:
            SeqIO.write(record, handle, 'genbank')
        

In [24]:
# Generate the GenBank files: every gene in `genes`, or only CO1 when JUST_CO1 is set.
if not JUST_CO1:
    for gene in genes:
        parser(TAXA, gene)
else:
    parser(TAXA, 'CO1')



In [25]:
import shutil

if not JUST_CO1:
    # Keep only the species that have one file per gene in `genes`;
    # every other species directory is removed.
    #
    # Only sub-directories of PATH_TO_GB are inspected, so regular files
    # sitting directly in PATH_TO_GB (e.g. when this cell is re-run after the
    # folders were flattened) are left alone instead of crashing
    # os.listdir / shutil.rmtree.
    species_dirs = [
        dir_path for dir_path in os.listdir(PATH_TO_GB)
        if os.path.isdir(os.path.join(PATH_TO_GB, dir_path))
    ]

    # Names of the species directories that contain a file for every gene.
    all_genes = []
    for dir_path in species_dirs:
        species_path = os.path.join(PATH_TO_GB, dir_path)
        counter = sum(
            os.path.isfile(os.path.join(species_path, path))
            for path in os.listdir(species_path)
        )
        if counter == len(genes):
            all_genes.append(dir_path)

    complete_species = set(all_genes)
    for dir_path in species_dirs:
        if dir_path not in complete_species:
            shutil.rmtree(os.path.join(PATH_TO_GB, dir_path))

Run the following command in the `{TAXA}_genbanked` directory to merge all the species folders into one (and then delete the folders left empty):
`find . -type f -print0 | xargs -0 -I file mv --backup=numbered file .`\
NOTE:\
You might also need to run this command `find . -empty -type d -delete`, as the previous one might leave empty folders\
TODO:\
~~Automate this step~~

In [26]:
import subprocess

# Flatten PATH_TO_GB: move every file found below it into PATH_TO_GB itself
# (mv keeps numbered backups on name clashes), then delete the directories
# that were left empty.
# The commands run with cwd=PATH_TO_GB rather than a `cd {PATH_TO_GB}` prefix,
# so the path is never pasted into the shell command line and may safely
# contain spaces or shell metacharacters.
subprocess.run(
    'find . -type f -print0 | xargs -0 -I file mv --backup=numbered file .',
    shell=True,
    cwd=PATH_TO_GB,
)
# Last expression of the cell: displays the exit status of the cleanup command.
subprocess.run('find . -empty -type d -delete', shell=True, cwd=PATH_TO_GB).returncode

0

In [28]:
# Concatenate every file in PATH_TO_GB into a single merged GenBank file.
if JUST_CO1:
    PATH_TO_MERGED = f'/mnt/data/Documents/lab/TermitesAndCockroaches/mtdna-mutspec-insecta/data/MIDORI/mergedAllGenes{TAXA}_CO1.gb'
else:
    PATH_TO_MERGED = f'/mnt/data/Documents/lab/TermitesAndCockroaches/mtdna-mutspec-insecta/data/MIDORI/mergedAllGenes{TAXA}.gb'

# os.listdir returns names in arbitrary order; sorting makes the merged file
# reproducible between runs. Anything that is not a regular file (e.g. a
# leftover directory) is skipped because it cannot be opened for reading.
filenames = sorted(
    fname for fname in os.listdir(PATH_TO_GB)
    if os.path.isfile(f'{PATH_TO_GB}/{fname}')
)
with open(PATH_TO_MERGED, 'w') as outfile:
    for fname in filenames:
        with open(f'{PATH_TO_GB}/{fname}') as infile:
            outfile.writelines(infile)