In [1]:
from Bio import SeqIO
import pandas as pd 
from glob import glob
import shutil
import os

In [2]:
# Notebook-wide switches read by the loader cells below.
#   gene_model: which gene-model release to stage (2021 or 2024).
#   translate_gene_names_to_ttherm_ids: only read by the 2024 loader;
#       unused if gene_model is 2021.
GENE_MODEL_SETTINGS = dict(
    gene_model=2024,
    translate_gene_names_to_ttherm_ids=False,
)

In [3]:
# Stage the 2024 gene model: CDS records, protein records, eggNOG annotations
# and the list of enrichment CSVs. Optionally rewrite YF (2024) identifiers to
# their TTHERM (2021) counterparts wherever the mapping table has an entry.
if GENE_MODEL_SETTINGS['gene_model'] == 2024:

    with open('./new_raw_data/tgd2024/Manual_check-total-gene.gff3_Right_UTR.gff3_cds.fasta', 'r') as f:
        cds_records = list(SeqIO.parse(f, 'fasta'))

    with open('./new_raw_data/tgd2024/Manual_check-total-gene.gff3_Right_UTR.gff3_pep.fasta', 'r') as f:
        pep_records = list(SeqIO.parse(f, 'fasta'))

    annotations = pd.read_csv('./TGNE/eggnog/2024_none_eggnog_compiled.annotations', comment='#', delimiter='\t')

    annotation_desc_paths = glob('./TGNE/enrichment/2024/*.csv')

    if GENE_MODEL_SETTINGS['translate_gene_names_to_ttherm_ids']:

        df_y_to_ttherm = pd.read_csv('./new_raw_data/tgd2024/yf_ttherm_mapping_feb2024.csv')
        dict_y_to_ttherm = dict(zip(df_y_to_ttherm['yf2024'].values, df_y_to_ttherm['ttherm2021'].values))

        # Record IDs carry a '.t1' suffix that the mapping keys do not, so it
        # is removed before the lookup. Unmapped records keep their YF ID.
        for r in cds_records:
            yf_id = r.id.replace('.t1', '')
            if yf_id in dict_y_to_ttherm:
                r.id = dict_y_to_ttherm[yf_id]

        with open('./new_raw_data/Tthermophila_MAC_protein_2021.fasta', 'r') as f:
            pep_records_2021 = list(SeqIO.parse(f, 'fasta'))

        dict_pep_records_2021 = {r.id: r for r in pep_records_2021}

        for r in pep_records:
            yf_id = r.id.replace('.t1', '')
            if yf_id in dict_y_to_ttherm:
                r.id = dict_y_to_ttherm[yf_id]
                # Borrow the 2021 description when that record exists. A mapped
                # TTHERM ID that is absent from the 2021 protein FASTA used to
                # raise KeyError here; now the record keeps its own description.
                record_2021 = dict_pep_records_2021.get(r.id)
                if record_2021 is not None:
                    r.description = record_2021.description

        # Same translation for the annotation table; unmapped queries are left as-is.
        annotations['query'] = [
            dict_y_to_ttherm.get(query_id.replace('.t1', ''), query_id)
            for query_id in annotations['query'].values
        ]


In [4]:
# Stage the 2021 gene model: CDS records, protein records, eggNOG annotations
# and the list of enrichment CSVs. No ID translation is applied for 2021.
if GENE_MODEL_SETTINGS['gene_model'] == 2021:

    with open('./new_raw_data/Tthermophila_MAC_CDS_2021.fasta', 'r') as f:
        cds_records = list(SeqIO.parse(f, 'fasta'))

    with open('./new_raw_data/Tthermophila_MAC_protein_2021.fasta', 'r') as f:
        pep_records = list(SeqIO.parse(f, 'fasta'))

    annotations = pd.read_csv('./TGNE/eggnog/2021_none_eggnog_compiled.annotations', comment='#', delimiter='\t')

    annotation_desc_paths = glob('./TGNE/enrichment/2021/*.csv')

elif GENE_MODEL_SETTINGS['gene_model'] != 2024:
    # Neither loader cell handles this value. Fail here instead of letting the
    # write cells below hit undefined (or stale) cds_records / pep_records.
    raise ValueError(
        f"Unsupported gene_model {GENE_MODEL_SETTINGS['gene_model']!r}; expected 2021 or 2024"
    )

In [5]:
# This is the first write into ./active_fastas/, so create the directory if it
# is missing (e.g. on a fresh checkout) rather than failing on open.
os.makedirs('./active_fastas', exist_ok=True)

# Write the selected CDS records; the cell output is the number of records written.
SeqIO.write(cds_records, './active_fastas/cds.fasta', 'fasta')

25987

In [6]:
# Write the selected protein records; the cell output is the number of records written.
SeqIO.write(sequences=pep_records, handle='./active_fastas/pep.fasta', format='fasta')

25987

In [7]:
# Save the annotation table next to the FASTA files, without the pandas row index.
annotations.to_csv(path_or_buf='./active_fastas/annotations.csv', index=False)

In [8]:
# Copy every enrichment CSV found by the loader cell into the active directory,
# keeping each file's original base name.
destination_dir = './active_fastas/'

for source_path in annotation_desc_paths:
    file_name = os.path.basename(source_path)
    shutil.copy(source_path, os.path.join(destination_dir, file_name))

# QUICK ID TRANSLATION

In [9]:
# Load the YF (2024) <-> TTHERM (2021) mapping table and build a lookup in each
# direction. If an ID appears in more than one row, the last row wins.
df_y_to_ttherm = pd.read_csv('./new_raw_data/tgd2024/yf_ttherm_mapping_feb2024.csv')

yf_column = df_y_to_ttherm['yf2024'].values
ttherm_column = df_y_to_ttherm['ttherm2021'].values

dict_y_to_ttherm = dict(zip(yf_column, ttherm_column))
dict_ttherm_to_y = dict(zip(ttherm_column, yf_column))

In [10]:
# TTHERM IDs to translate to YF IDs; IDs without a mapping entry are skipped.
translate_ttherms = [
    'TTHERM_01055600',
    'TTHERM_01002870',
    'TTHERM_01002860',
    'TTHERM_00630470',
    'TTHERM_00624730',
    'TTHERM_00624720',
    'TTHERM_00527180',
    'TTHERM_00522600',
    'TTHERM_00378890',
    'TTHERM_00335830',
    'TTHERM_00221120',
]

mapped_yf_ids = [
    dict_ttherm_to_y[ttherm_id]
    for ttherm_id in translate_ttherms
    if ttherm_id in dict_ttherm_to_y
]
print(', '.join(mapped_yf_ids))

YF00024749, YF00036313, YF00005803, YF00003933, YF00013821


In [11]:
# TTHERM IDs to translate to YF IDs; IDs without a mapping entry are skipped.
translate_ttherms = [
    'TTHERM_00420610',
    'TTHERM_00410210',
    'TTHERM_00313130',
    'TTHERM_00467390',
]

mapped_yf_ids = [
    dict_ttherm_to_y[ttherm_id]
    for ttherm_id in translate_ttherms
    if ttherm_id in dict_ttherm_to_y
]
print(', '.join(mapped_yf_ids))

YF00002277, YF00018466, YF00019887


In [12]:
# mucocysts
# Round trip: TTHERM -> YF for the full list, then YF -> TTHERM for the hits,
# which shows which of the input TTHERM IDs actually have a mapping.
translate_ttherms = [
    "TTHERM_01055600",
    "TTHERM_01002870",
    "TTHERM_01002860",
    "TTHERM_00630470",
    "TTHERM_00624730",
    "TTHERM_00624720",
    "TTHERM_00527180",
    "TTHERM_00522600",
    "TTHERM_00378890",
    "TTHERM_00335830",
    "TTHERM_00221120",
]

mapped_yf_ids = [
    dict_ttherm_to_y[ttherm_id]
    for ttherm_id in translate_ttherms
    if ttherm_id in dict_ttherm_to_y
]
print(', '.join(mapped_yf_ids))

muco = ['YF00024749', 'YF00036313', 'YF00005803', 'YF00003933', 'YF00013821']

mapped_ttherm_ids = [
    dict_y_to_ttherm[yf_id]
    for yf_id in muco
    if yf_id in dict_y_to_ttherm
]
print(', '.join(mapped_ttherm_ids))

YF00024749, YF00036313, YF00005803, YF00003933, YF00013821
TTHERM_01055600, TTHERM_00630470, TTHERM_00624720, TTHERM_00527180, TTHERM_00335830


In [13]:
# (HH2A.1, HH2A.2, HH2B.1, HH2B.2, HH3, HH4.1, and HH4.2)
# NOTE(review): the line above names seven genes but only four IDs are listed
# here — confirm whether the remaining IDs still need to be added.
translate_ttherms = [
    "TTHERM_00189170",
    "TTHERM_00143660",
    "TTHERM_00633360",
    # Was "THERM_00283180": the missing leading "T" meant this entry could
    # never match a mapping key and was silently skipped.
    "TTHERM_00283180",
]

# IDs without a mapping entry are skipped.
print(', '.join([dict_ttherm_to_y[ttherm_id] for ttherm_id in translate_ttherms if ttherm_id in dict_ttherm_to_y]))

YF00005517


In [14]:
# (Figure 3A; CPD1, CPD2, and CPG3 and SMC2 and SMC4).
# NOTE(review): "TTHERM_0044640" has only seven digits, unlike the other IDs in
# this list — possibly a typo; confirm the intended ID. Left unchanged here.
translate_ttherms = [
    "TTHERM_00486070",
    "TTHERM_00392760",
    "TTHERM_00919690",
    "TTHERM_00812950",
    "TTHERM_0044640",
]

# IDs without a mapping entry are skipped.
mapped_yf_ids = [
    dict_ttherm_to_y[ttherm_id]
    for ttherm_id in translate_ttherms
    if ttherm_id in dict_ttherm_to_y
]
print(', '.join(mapped_yf_ids))

YF00023687, YF00007642, YF00028933


In [15]:
# cyclin D family members, CYC4, CYC13, and CYC25
translate_ttherms = [
    "TTHERM_01043080",
    "TTHERM_01128530",
    "TTHERM_00717540",
]

# IDs without a mapping entry are skipped.
mapped_yf_ids = [
    dict_ttherm_to_y[ttherm_id]
    for ttherm_id in translate_ttherms
    if ttherm_id in dict_ttherm_to_y
]
print(', '.join(mapped_yf_ids))

YF00006914, YF00023330


In [16]:
# all cyclins
translate_ttherms = [
    'TTHERM_00196590',
    'TTHERM_00433390',
    'TTHERM_00526250',
    'TTHERM_00962160',
    'TTHERM_01128530',
    'TTHERM_00189230',
    'TTHERM_00425970',
    'TTHERM_00780580',
    'TTHERM_00693080',
    'TTHERM_00827080',
    'TTHERM_00698650',
    'TTHERM_00079530',
    'TTHERM_00624450',
    'TTHERM_00784350',
    'TTHERM_00933270',
    'TTHERM_00732460',
    'TTHERM_00842480',
    'TTHERM_00717540',
    'TTHERM_00066840',
    'TTHERM_000576939',
    'TTHERM_00082190',
    'TTHERM_00649450',
    'TTHERM_00180970',
    'TTHERM_00726380',
    'TTHERM_00013060',
    'TTHERM_00049420',
    'TTHERM_00294860',
    'TTHERM_00535270',
    'TTHERM_01043080',
    'TTHERM_00192000',
    'TTHERM_00194440',
    'TTHERM_00293270',
    'TTHERM_00332170',
    'TTHERM_00940290',
]

# IDs without a mapping entry are skipped.
mapped_yf_ids = [
    dict_ttherm_to_y[ttherm_id]
    for ttherm_id in translate_ttherms
    if ttherm_id in dict_ttherm_to_y
]
print(', '.join(mapped_yf_ids))

YF00036972, YF00010860, YF00003848, YF00004376, YF00023330, YF00018120, YF00008997, YF00023552, YF00004862, YF00027870, YF00027031, YF00005775, YF00007755, YF00035784, YF00035293, YF00013340, YF00026392, YF00029711, YF00025382, YF00015327, YF00024034, YF00034044, YF00017344, YF00001614, YF00006914, YF00030911, YF00017795, YF00001684, YF00024727, YF00004649


In [17]:
# all MCMs
translate_ttherms = [
    'TTHERM_00554270',
    'TTHERM_00092850',
    'TTHERM_00277550',
    'TTHERM_00069420',
    'TTHERM_00448570',
    'TTHERM_00011740',
    'TTHERM_01031060',
    'TTHERM_00703910',
    'TTHERM_01207610',
]

# IDs without a mapping entry are skipped.
mapped_yf_ids = [
    dict_ttherm_to_y[ttherm_id]
    for ttherm_id in translate_ttherms
    if ttherm_id in dict_ttherm_to_y
]
print(', '.join(mapped_yf_ids))

YF00012156, YF00018584, YF00026324, YF00025215, YF00009981, YF00003565, YF00003115, YF00035370


In [18]:
# YF00026324, YF00025215, YF00018584, YF00012156
# MCM5, MCM6, MCM3, MCM2
MCMs = ['YF00026324.1', 'YF00025215.1', 'YF00018584.1', 'YF00012156.1']

# Drop the last two characters (the '.1' suffix on the entries above) before
# the lookup; IDs without a mapping entry are skipped.
bare_yf_ids = [versioned_id[:-2] for versioned_id in MCMs]
mapped_ttherm_ids = [
    dict_y_to_ttherm[yf_id]
    for yf_id in bare_yf_ids
    if yf_id in dict_y_to_ttherm
]
print(', '.join(mapped_ttherm_ids))

TTHERM_00069420, TTHERM_00448570, TTHERM_00092850, TTHERM_00554270


In [19]:
# YF IDs to translate back to TTHERM IDs; IDs without a mapping entry are skipped.
isolated = [
    'YF00022177',
    'YF00021470',
    'YF00015074',
    'YF00010747',
    'YF00010396',
    'YF00003723',
]

mapped_ttherm_ids = [
    dict_y_to_ttherm[yf_id]
    for yf_id in isolated
    if yf_id in dict_y_to_ttherm
]
print(', '.join(mapped_ttherm_ids))




In [20]:
# Single YF ID to translate back to its TTHERM ID; skipped if it has no mapping entry.
muco = ['YF00036313']

mapped_ttherm_ids = [
    dict_y_to_ttherm[yf_id]
    for yf_id in muco
    if yf_id in dict_y_to_ttherm
]
print(', '.join(mapped_ttherm_ids))

TTHERM_00630470
