In [6]:
from Bio import SeqIO
import pandas as pd 
from glob import glob
import shutil
import os

In [7]:
# Gene-model releases that the loading cells in this notebook handle.
SUPPORTED_GENE_MODELS = (2021, 2024)

# Notebook configuration: which gene model to activate, and (2024 only)
# whether 2024 gene names should be translated to TTHERM IDs.
GENE_MODEL_SETTINGS = {
    'gene_model': 2024, # 2021 or 2024
    'translate_gene_names_to_ttherm_ids': False # unused if 'gene_model': 2021
}

# Fail fast on an unsupported value: otherwise neither loading cell runs and
# the later write cells either raise NameError or reuse stale kernel variables.
if GENE_MODEL_SETTINGS['gene_model'] not in SUPPORTED_GENE_MODELS:
    raise ValueError(
        "GENE_MODEL_SETTINGS['gene_model'] must be one of "
        f"{SUPPORTED_GENE_MODELS}, got {GENE_MODEL_SETTINGS['gene_model']!r}"
    )

In [8]:
# Load the 2024 gene model: CDS and protein FASTA records, the eggNOG
# annotation table, and the paths of the 2024 enrichment CSV files.
if GENE_MODEL_SETTINGS['gene_model'] == 2024:

    with open('./new_raw_data/tgd2024/Manual_check-total-gene.gff3_Right_UTR.gff3_cds.fasta', 'r') as f:
        cds_records = list(SeqIO.parse(f, 'fasta'))

    with open('./new_raw_data/tgd2024/Manual_check-total-gene.gff3_Right_UTR.gff3_pep.fasta', 'r') as f:
        pep_records = list(SeqIO.parse(f, 'fasta'))

    # Tab-separated table; lines starting with '#' are skipped.
    annotations = pd.read_csv('./TGNE/eggnog/2024_none_eggnog_compiled.annotations', comment='#', delimiter='\t')

    annotation_desc_paths = glob('./TGNE/enrichment/2024/*.csv')

    # Optionally rename 2024 IDs to their mapped 2021 TTHERM IDs.
    if GENE_MODEL_SETTINGS['translate_gene_names_to_ttherm_ids']:

        df_y_to_ttherm = pd.read_csv('./new_raw_data/tgd2024/yf_ttherm_mapping_feb2024.csv')
        dict_y_to_ttherm = dict(zip(df_y_to_ttherm['yf2024'].values, df_y_to_ttherm['ttherm2021'].values))

        # Lookups use the record ID with '.t1' removed.
        # NOTE(review): str.replace removes '.t1' anywhere in the ID, not only
        # as a suffix - confirm IDs only ever carry it at the end.
        for record in cds_records:
            ttherm_id = dict_y_to_ttherm.get(record.id.replace('.t1', ''))
            if ttherm_id is not None:
                record.id = ttherm_id

        with open('./new_raw_data/Tthermophila_MAC_protein_2021.fasta', 'r') as f:
            pep_records_2021 = list(SeqIO.parse(f, 'fasta'))

        dict_pep_records_2021 = {r.id: r for r in pep_records_2021}

        for record in pep_records:
            ttherm_id = dict_y_to_ttherm.get(record.id.replace('.t1', ''))
            if ttherm_id is None:
                continue
            record.id = ttherm_id
            # The mapped ID was only checked against the mapping table, so it
            # may have no entry in the 2021 protein FASTA. In that case keep
            # the existing description instead of raising KeyError.
            record_2021 = dict_pep_records_2021.get(ttherm_id)
            if record_2021 is not None:
                record.description = record_2021.description

        # Translate the annotation query IDs the same way; unmapped IDs are kept.
        annotations['query'] = [
            dict_y_to_ttherm.get(query_id.replace('.t1', ''), query_id)
            for query_id in annotations['query'].values
        ]




In [9]:
# Load the 2021 gene model: CDS and protein FASTA records, the eggNOG
# annotation table, and the paths of the 2021 enrichment CSV files.
if GENE_MODEL_SETTINGS['gene_model'] == 2021:

    with open('./new_raw_data/Tthermophila_MAC_CDS_2021.fasta', 'r') as cds_handle:
        cds_records = [record for record in SeqIO.parse(cds_handle, 'fasta')]

    with open('./new_raw_data/Tthermophila_MAC_protein_2021.fasta', 'r') as pep_handle:
        pep_records = [record for record in SeqIO.parse(pep_handle, 'fasta')]

    # Tab-separated table; lines starting with '#' are skipped.
    annotations = pd.read_csv(
        './TGNE/eggnog/2021_none_eggnog_compiled.annotations',
        sep='\t',
        comment='#',
    )

    annotation_desc_paths = glob('./TGNE/enrichment/2021/*.csv')

In [10]:
# This is the first cell that writes into ./active_fastas/, so make sure the
# directory exists (e.g. on a fresh checkout) before writing.
os.makedirs('./active_fastas', exist_ok=True)

# Write the active CDS records; the value returned by SeqIO.write is the cell output.
SeqIO.write(cds_records, './active_fastas/cds.fasta', 'fasta')

25987

In [11]:
# Write the active protein records; the value returned by SeqIO.write is the cell output.
SeqIO.write(
    sequences=pep_records,
    handle='./active_fastas/pep.fasta',
    format='fasta',
)

25987

In [12]:
# Save the active annotation table as CSV, without the DataFrame index column.
annotations.to_csv(
    path_or_buf='./active_fastas/annotations.csv',
    index=False,
)

In [13]:
destination_dir = './active_fastas/'

# Copy each selected enrichment CSV into the destination directory under its
# original file name.
for path in annotation_desc_paths:
    file_name = os.path.split(path)[1]
    destination_path = os.path.join(destination_dir, file_name)
    shutil.copy(src=path, dst=destination_path)