In [1]:
from Bio import SeqIO
import pandas as pd 
from glob import glob
import shutil
import os

In [2]:
GENE_MODEL_SETTINGS = {
    'gene_model': 2024, # 2021 or 2024 (2024 WILL ALWAYS BE USED FOR RNA-seq)
    'translate_gene_names_to_ttherm_ids': False # (unused if 'gene_model': 2021)
}

In [3]:
with open('./new_raw_data/tgd2024/Manual_check-total-gene.gff3_Right_UTR.gff3_cds.fasta', 'r') as f:
    cds_records = list(SeqIO.parse(f, 'fasta'))

all_yf_ids = [r.id for r in cds_records]
all_yf_ids

['YF00013476.t1',
 'YF00013477.t1',
 'YF00036562.t1',
 'YF00036564.t1',
 'YF00013481.t1',
 'YF00013482.t1',
 'YF00013483.t1',
 'YF00013484.t1',
 'YF00036566.t1',
 'YF00013486.t1',
 'YF00036567.t1',
 'YF00013487.t1',
 'YF00013488.t1',
 'YF00013489.t1',
 'YF00013490.t1',
 'YF00013491.t1',
 'YF00013492.t1',
 'YF00013493.t1',
 'YF00013494.t1',
 'YF00013495.t1',
 'YF00013496.t1',
 'YF00013497.t1',
 'YF00013498.t1',
 'YF00013499.t1',
 'YF00013500.t1',
 'YF00013501.t1',
 'YF00013502.t1',
 'YF00013503.t1',
 'YF00013504.t1',
 'YF00013505.t1',
 'YF00032543.t1',
 'YF00013508.t1',
 'YF00013509.t1',
 'YF00032544.t1',
 'YF00013510.t1',
 'YF00013512.t1',
 'YF00013513.t1',
 'YF00036570.t1',
 'YF00032546.t1',
 'YF00013516.t1',
 'YF00013517.t1',
 'YF00013518.t1',
 'YF00036571.t1',
 'YF00013519.t1',
 'YF00013520.t1',
 'YF00036573.t1',
 'YF00013522.t1',
 'YF00036574.t1',
 'YF00036575.t1',
 'YF00013526.t1',
 'YF00013527.t1',
 'YF00013529.t1',
 'YF00036577.t1',
 'YF00032554.t1',
 'YF00032555.t1',
 'YF000135

In [4]:
all_yf_ids_set = set(all_yf_ids)

In [5]:
max_mapping = pd.read_csv('./new_raw_data/tgd2024/yf_ttherm_mapping_may2024.csv')
max_mapping.head()

Unnamed: 0,YF_ID,TTHERM_ID
0,YF00028510.t1,TTHERM_01539710 TTHERM_001050489 TTHERM_001050...
1,YF00028508.t1,TTHERM_01539710 TTHERM_01539710 TTHERM_0015396...
2,YF00038323.t1,TTHERM_01539710 TTHERM_001224653 TTHERM_001050...
3,YF00038325.t1,TTHERM_01539710 TTHERM_001050489 TTHERM_010505...
4,YF00002076.t1,TTHERM_000730254


In [6]:
all_mapping_yf_ids = list(max_mapping['YF_ID'].values)
all_mapping_yf_ids

['YF00028510.t1',
 'YF00028508.t1',
 'YF00038323.t1',
 'YF00038325.t1',
 'YF00002076.t1',
 'YF00034455.t1',
 'YF00024912.t1',
 'YF00037784.t1',
 'YF00003768.t1',
 'YF00003769.t1',
 'YF00025946.t1',
 'YF00025947.t1',
 'YF00025670.t1',
 'YF00037878.t1',
 'YF00010387.t1',
 'YF00034186.t1',
 'YF00003425.t1',
 'YF00003426.t1',
 'YF00027756.t1',
 'YF00027755.t1',
 'YF00031295.t1',
 'YF00031296.t1',
 'YF00030324.t1',
 'YF00019659.t1',
 'YF00001311.t1',
 'YF00001312.t1',
 'YF00012564.t1',
 'YF00012565.t1',
 'YF00023692.t1',
 'YF00023691.t1',
 'YF00007800.t1',
 'YF00007801.t1',
 'YF00024182.t1',
 'YF00024185.t1',
 'YF00024655.t1',
 'YF00024654.t1',
 'YF00022217.t1',
 'YF00022216.t1',
 'YF00034178.t1',
 'YF00034179.t1',
 'YF00029750.t1',
 'YF00029749.t1',
 'YF00012733.t1',
 'YF00032915.t1',
 'YF00015764.t1',
 'YF00031663.t1',
 'YF00007879.t1',
 'YF00036294.t1',
 'YF00004453.t1',
 'YF00004451.t1',
 'YF00004452.t1',
 'YF00026140.t1',
 'YF00026139.t1',
 'YF00026143.t1',
 'YF00026144.t1',
 'YF000046

In [7]:
all_mapping_yf_ids_set = set(all_mapping_yf_ids)

In [9]:
len(all_yf_ids_set), len(all_mapping_yf_ids_set), len(set.intersection(all_mapping_yf_ids_set, all_yf_ids_set))

(25987, 26904, 25841)

In [3]:
if GENE_MODEL_SETTINGS['gene_model'] == 2024:

    with open('./new_raw_data/tgd2024/Manual_check-total-gene.gff3_Right_UTR.gff3_cds.fasta', 'r') as f:
        cds_records = list(SeqIO.parse(f, 'fasta'))

    with open('./new_raw_data/tgd2024/Manual_check-total-gene.gff3_Right_UTR.gff3_pep.fasta', 'r') as f:
        pep_records = list(SeqIO.parse(f, 'fasta'))

    rna_seq = pd.read_csv('./new_raw_data/rna_seq_processed/kallisto.csv')

    annotations = pd.read_csv('./TGNE/eggnog/2024_none_eggnog_compiled.annotations', comment='#', delimiter='\t')

    annotation_desc_paths = glob('./TGNE/enrichment/2024/*.csv')

    if GENE_MODEL_SETTINGS['translate_gene_names_to_ttherm_ids']:

        df_y_to_ttherm = pd.read_csv('./new_raw_data/tgd2024/yf_ttherm_mapping_feb2024.csv')
        dict_y_to_ttherm = {yf: ttherm for yf, ttherm in zip(df_y_to_ttherm['yf2024'].values, df_y_to_ttherm['ttherm2021'].values)}

        for idx, r in enumerate(cds_records):
            if (r.id) in dict_y_to_ttherm:
                r.id = dict_y_to_ttherm[(r.id)]

        with open('./new_raw_data/Tthermophila_MAC_protein_2021.fasta', 'r') as f:
            pep_records_2021 = list(SeqIO.parse(f, 'fasta'))

        dict_pep_records_2021 = {r.id: r for r in pep_records_2021}

        for idx, r in enumerate(pep_records):
            if (r.id) in dict_y_to_ttherm:
                r.id = dict_y_to_ttherm[(r.id)]
                r.description = dict_pep_records_2021[r.id].description

        annotations['query'] = [yfid if yfid not in dict_y_to_ttherm else dict_y_to_ttherm[yfid] for yfid in annotations['query'].values]

        rna_seq['TTHERM_ID'] = [yfid if yfid not in dict_y_to_ttherm else dict_y_to_ttherm[yfid] for yfid in rna_seq['TTHERM_ID'].values]

In [4]:
if GENE_MODEL_SETTINGS['gene_model'] == 2021:

    with open('./new_raw_data/Tthermophila_MAC_CDS_2021.fasta', 'r') as f:
        cds_records = list(SeqIO.parse(f, 'fasta'))

    with open('./new_raw_data/Tthermophila_MAC_protein_2021.fasta', 'r') as f:
        pep_records = list(SeqIO.parse(f, 'fasta'))

    annotations = pd.read_csv('./TGNE/eggnog/2021_none_eggnog_compiled.annotations', comment='#', delimiter='\t')

    annotation_desc_paths = glob('./TGNE/enrichment/2021/*.csv')

In [5]:
SeqIO.write(cds_records, './active_fastas/cds.fasta', 'fasta')

25987

In [6]:
SeqIO.write(pep_records, './active_fastas/pep.fasta', 'fasta')

25987

In [7]:
rna_seq.to_csv('./active_fastas/rna_seq.csv', index=False)

In [8]:
annotations.to_csv('./active_fastas/annotations.csv', index=False)

In [9]:
destination_dir = './active_fastas/'

for path in annotation_desc_paths:
    destination_path = os.path.join(destination_dir, os.path.basename(path))
    shutil.copy(path, destination_path)

# QUICK ID TRANSLATION

In [10]:
df_y_to_ttherm = pd.read_csv('./new_raw_data/tgd2024/yf_ttherm_mapping_feb2024.csv')
dict_y_to_ttherm = {yf: ttherm for yf, ttherm in zip(df_y_to_ttherm['yf2024'].values, df_y_to_ttherm['ttherm2021'].values)}
dict_ttherm_to_y = {ttherm: yf for yf, ttherm in zip(df_y_to_ttherm['yf2024'].values, df_y_to_ttherm['ttherm2021'].values)}

In [11]:
translate_ttherms = [
'TTHERM_01055600', 
'TTHERM_01002870', 
'TTHERM_01002860', 
'TTHERM_00630470', 
'TTHERM_00624730', 
'TTHERM_00624720', 
'TTHERM_00527180', 
'TTHERM_00522600', 
'TTHERM_00378890', 
'TTHERM_00335830', 
'TTHERM_00221120',
]

print(', '.join([dict_ttherm_to_y[id] for id in translate_ttherms if id in dict_ttherm_to_y]))

YF00024749.t1, YF00005803.t1, YF00003933.t1


In [12]:
translate_ttherms = [
'TTHERM_00420610', 
'TTHERM_00410210', 
'TTHERM_00313130', 
'TTHERM_00467390',
]

print(', '.join([dict_ttherm_to_y[id] for id in translate_ttherms if id in dict_ttherm_to_y]))

YF00002277.t1, YF00018466.t1, YF00019887.t1


In [13]:
# mycocysts
translate_ttherms = [
"TTHERM_01055600", "TTHERM_01002870", "TTHERM_01002860", "TTHERM_00630470", "TTHERM_00624730", "TTHERM_00624720", "TTHERM_00527180", "TTHERM_00522600", "TTHERM_00378890", "TTHERM_00335830", "TTHERM_00221120"
]

print(', '.join([dict_ttherm_to_y[id] for id in translate_ttherms if id in dict_ttherm_to_y]))

muco = ['YF00024749', 'YF00036313', 'YF00005803', 'YF00003933', 'YF00013821']

print(', '.join([dict_y_to_ttherm[id] for id in muco if id in dict_y_to_ttherm]))

YF00024749.t1, YF00005803.t1, YF00003933.t1



In [14]:
# (HH2A.1, HH2A.2, HH2B.1, HH2B.2, HH3, HH4.1, and HH4.2)
translate_ttherms = [
"TTHERM_00189170", "TTHERM_00143660", "TTHERM_00633360", "THERM_00283180", 
]

print(', '.join([dict_ttherm_to_y[id] for id in translate_ttherms if id in dict_ttherm_to_y]))

YF00005517.t1


In [15]:
# (Figure 3A; CPD1, CPD2, and CPG3 and SMC2 and SMC4).
translate_ttherms = [
"TTHERM_00486070", "TTHERM_00392760", "TTHERM_00919690", "TTHERM_00812950", "TTHERM_0044640", 
]

print(', '.join([dict_ttherm_to_y[id] for id in translate_ttherms if id in dict_ttherm_to_y]))

YF00023687.t1, YF00007642.t1, YF00028933.t1


In [16]:
# cyclin D family members, CYC4, CYC13, and CYC25
translate_ttherms = [
"TTHERM_01043080", "TTHERM_01128530", "TTHERM_00717540",
]

print(', '.join([dict_ttherm_to_y[id] for id in translate_ttherms if id in dict_ttherm_to_y]))

YF00006914.t1, YF00023330.t1


In [17]:
# all cyclins
translate_ttherms = [
'TTHERM_00196590',
'TTHERM_00433390',
'TTHERM_00526250',
'TTHERM_00962160',
'TTHERM_01128530',
'TTHERM_00189230',
'TTHERM_00425970',
'TTHERM_00780580',
'TTHERM_00693080',
'TTHERM_00827080',
'TTHERM_00698650',
'TTHERM_00079530',
'TTHERM_00624450',
'TTHERM_00784350',
'TTHERM_00933270',
'TTHERM_00732460',
'TTHERM_00842480',
'TTHERM_00717540',
'TTHERM_00066840',
'TTHERM_000576939',
'TTHERM_00082190',
'TTHERM_00649450',
'TTHERM_00180970',
'TTHERM_00726380',
'TTHERM_00013060',
'TTHERM_00049420',
'TTHERM_00294860',
'TTHERM_00535270',
'TTHERM_01043080',
'TTHERM_00192000',
'TTHERM_00194440',
'TTHERM_00293270',
'TTHERM_00332170',
'TTHERM_00940290',
]

print(', '.join([dict_ttherm_to_y[id] for id in translate_ttherms if id in dict_ttherm_to_y]))

YF00010860.t1, YF00003848.t1, YF00004376.t1, YF00023330.t1, YF00008997.t1, YF00004862.t1, YF00027031.t1, YF00005775.t1, YF00007755.t1, YF00013340.t1, YF00026392.t1, YF00025382.t1, YF00015327.t1, YF00024034.t1, YF00017344.t1, YF00001614.t1, YF00006914.t1, YF00030911.t1, YF00017795.t1, YF00001684.t1, YF00024727.t1, YF00004649.t1


In [18]:
cyc_subset = [
'TTHERM_00196590', # CYC1 
'TTHERM_00425970', # CYC15
'TTHERM_00624450', # CYC20
'TTHERM_00842480', # CYC24
'TTHERM_00194440', # CYC6
'TTHERM_00332170', # CYC8
]

print(', '.join([dict_ttherm_to_y[id] for id in cyc_subset if id in dict_ttherm_to_y]))

YF00008997.t1, YF00005775.t1, YF00013340.t1, YF00017795.t1, YF00024727.t1


In [19]:
all_cdks = [
'TTHERM_00411810', #	CDK1
'TTHERM_00318700', #	CDK10
'TTHERM_00576780', #	CDK11
'TTHERM_00267860', #	CDK13
'TTHERM_00394590', #	CDK14
'TTHERM_01035490', #	CDK15
'TTHERM_00624260', #	CDK16
'TTHERM_00066860', #	CDK17
'TTHERM_00784290', #	CDK18
'TTHERM_00339880', #	CDK19
'TTHERM_01347900', #	CDK20
'TTHERM_000837979', #	CDK21
'TTHERM_000937671', #	CDK22
'TTHERM_00011670', #	CDK3
'TTHERM_00286770', #	CDK4
'TTHERM_01080600', #	CDK5
'TTHERM_00837980', #	CDK6
'TTHERM_00133750', #	CDK7
'TTHERM_00717780', #	CDK8
'TTHERM_00185770', #	CDK9
'TTHERM_00483640', #	TCDK3
]

print(', '.join([dict_ttherm_to_y[id] for id in all_cdks if id in dict_ttherm_to_y]))

YF00018405.t1, YF00020828.t1, YF00026883.t1, YF00007567.t1, YF00023257.t1, YF00026390.t1, YF00007765.t1, YF00013716.t1, YF00027327.t1, YF00024788.t1, YF00009986.t1, YF00021147.t1, YF00013233.t1, YF00015418.t1


In [20]:
nucleic_acid_metabolic_process = [
'TTHERM_00046490',
'TTHERM_00535470',
'TTHERM_00312260',
'TTHERM_01079170',
'TTHERM_000378989',
'TTHERM_00825460',
'TTHERM_00298220',
'TTHERM_00684490',
'TTHERM_00433640',
'TTHERM_00787360',
'TTHERM_00112560',
'TTHERM_00561799',
'TTHERM_00723610',
'TTHERM_00794250',
]

print(', '.join([dict_ttherm_to_y[id] for id in nucleic_acid_metabolic_process if id in dict_ttherm_to_y]))

YF00001188.t1, YF00019785.t1, YF00005644.t1, YF00006092.t1, YF00000878.t1, YF00016962.t1, YF00012869.t1, YF00009402.t1


In [21]:
telomere_maintenance = [
'TTHERM_000378989',
'TTHERM_00433640',
'TTHERM_00112560',
'TTHERM_00561799',
]

print(', '.join([dict_ttherm_to_y[id] for id in telomere_maintenance if id in dict_ttherm_to_y]))

YF00009402.t1


In [22]:
# DECENT
mitotic_nuclear_division = [
'TTHERM_00486070',
'TTHERM_00919690',
'TTHERM_00728870',
'TTHERM_00540340',
'TTHERM_00554600',
'TTHERM_01299730',
'TTHERM_00118700',
'TTHERM_00079520',
'TTHERM_00046490',
'TTHERM_00624870',
'TTHERM_000191179',
'TTHERM_00393260',
'TTHERM_00444760',
'TTHERM_00498210',
]

print(', '.join([dict_ttherm_to_y[id] for id in mitotic_nuclear_division if id in dict_ttherm_to_y]))

YF00023687.t1, YF00021217.t1, YF00012182.t1, YF00027032.t1, YF00018207.t1, YF00007595.t1, YF00024470.t1


In [23]:
mitotic_cell_cycle = [
'TTHERM_00486070',
'TTHERM_00919690',
'TTHERM_00728870',
'TTHERM_00540340',
'TTHERM_00554600',
'TTHERM_01299730',
'TTHERM_00118700',
'TTHERM_00079520',
'TTHERM_00046490',
'TTHERM_00624870',
'TTHERM_000191179',
'TTHERM_00393260',
'TTHERM_00444760',
'TTHERM_00498210',
]

print(', '.join([dict_ttherm_to_y[id] for id in mitotic_cell_cycle if id in dict_ttherm_to_y]))

YF00023687.t1, YF00021217.t1, YF00012182.t1, YF00027032.t1, YF00018207.t1, YF00007595.t1, YF00024470.t1


In [24]:
chromosome_organization = [
'TTHERM_00088010',
'TTHERM_00307700',
'TTHERM_00486070',
'TTHERM_00919690',
'TTHERM_00728870',
'TTHERM_00540340',
'TTHERM_00554600',
'TTHERM_01299730',
'TTHERM_00189170',
'TTHERM_00790790',
'TTHERM_00316500',
'TTHERM_00143660',
'TTHERM_00633360',
'TTHERM_00283180',
'TTHERM_00316410',
'TTHERM_00393260',
'TTHERM_00444760',
'TTHERM_00455130',
'TTHERM_000378989',
'TTHERM_00726370',
'TTHERM_00684490',
'TTHERM_00433640',
'TTHERM_00787360',
'TTHERM_00112560',
'TTHERM_00561799',
]

print(', '.join([dict_ttherm_to_y[id] for id in chromosome_organization if id in dict_ttherm_to_y]))

YF00018670.t1, YF00000554.t1, YF00023687.t1, YF00021217.t1, YF00012182.t1, YF00005517.t1, YF00020953.t1, YF00007595.t1, YF00024270.t1, YF00024033.t1, YF00016962.t1, YF00012869.t1, YF00009402.t1


In [25]:
DNA_repair = [
'TTHERM_00307700',
'TTHERM_00046490',
'TTHERM_00624870',
'TTHERM_000191179',
'TTHERM_00312260',
'TTHERM_00316410',
'TTHERM_00439320',
'TTHERM_01050440',
'TTHERM_01079170',
'TTHERM_01106120',
'TTHERM_00726470',
'TTHERM_00455309',
'TTHERM_00825460',
'TTHERM_00726370',
'TTHERM_00142290',
'TTHERM_00684490',
'TTHERM_00433640',
'TTHERM_00561799',
'TTHERM_00723610',
'TTHERM_00888060',
'TTHERM_00864890',
'TTHERM_00829440',
'TTHERM_00794250',
]

print(', '.join([dict_ttherm_to_y[id] for id in DNA_repair if id in dict_ttherm_to_y]))

YF00000554.t1, YF00018207.t1, YF00019785.t1, YF00020953.t1, YF00028517.t1, YF00005644.t1, YF00024250.t1, YF00006092.t1, YF00024033.t1, YF00005557.t1, YF00016962.t1, YF00009402.t1, YF00016312.t1


In [26]:
microtubule_cytoskeleton_organi = [
'TTHERM_00335970'
'TTHERM_00426160'
'TTHERM_01220340'
'TTHERM_00522990'
'TTHERM_00338260'
'TTHERM_000727749'
'TTHERM_000123789'
'TTHERM_000161298'
'TTHERM_00030040'
'TTHERM_000357129'
'TTHERM_00046500'
'TTHERM_000488189'
'TTHERM_000706399'
'TTHERM_00088040'
'TTHERM_00133700'
'TTHERM_00266668'
'TTHERM_00346790'
'TTHERM_00452020'
'TTHERM_00493000'
'TTHERM_00755750'
'TTHERM_00842510'
'TTHERM_00992710'
]

print(', '.join([dict_ttherm_to_y[id] for id in microtubule_cytoskeleton_organi if id in dict_ttherm_to_y]))




In [27]:
# all MCMs
translate_ttherms = [
'TTHERM_00554270',
'TTHERM_00092850',
'TTHERM_00277550',
'TTHERM_00069420',
'TTHERM_00448570',
'TTHERM_00011740',
'TTHERM_01031060',
'TTHERM_00703910',
'TTHERM_01207610',
]

print(', '.join([dict_ttherm_to_y[id] for id in translate_ttherms if id in dict_ttherm_to_y]))

YF00012156.t1, YF00018584.t1, YF00026324.t1, YF00025215.t1, YF00009981.t1, YF00003565.t1


In [28]:
histones = ['TTHERM_00146340', 'TTHERM_00446570', 'TTHERM_00498190', 'TTHERM_00189170', 'TTHERM_00189180', 'TTHERM_00016170', 'TTHERM_00016200', 'TTHERM_00790790', 'TTHERM_00316500', 'TTHERM_00143660', 'TTHERM_01079200', 'TTHERM_00633360', 'TTHERM_00283180', 'TTHERM_00354710', 'TTHERM_00474790', 'TTHERM_00852840', 'TTHERM_01194650']
print(', '.join([dict_ttherm_to_y[id] for id in histones if id in dict_ttherm_to_y]))

YF00005517.t1, YF00018904.t1, YF00023412.t1, YF00014174.t1, YF00017547.t1


In [29]:
MCMs = ['YF00026324.1', 'YF00025215.1', 'YF00018584.1', 'YF00012156.1']
# YF00026324, YF00025215, YF00018584, YF00012156
# MCM5, MCM6, MCM3, MCM2

print(', '.join([dict_y_to_ttherm[id[:len(id) - 2]] for id in MCMs if id[:len(id) - 2] in dict_y_to_ttherm]))




In [30]:
isolated = [
'YF00022177',
'YF00021470',
'YF00015074',
'YF00010747',
'YF00010396',
'YF00003723',
]

print(', '.join([dict_y_to_ttherm[id] for id in isolated if id in dict_y_to_ttherm]))




In [31]:
muco = ['YF00036313']

print(', '.join([dict_y_to_ttherm[id] for id in muco if id in dict_y_to_ttherm]))




In [32]:
mucocyst_cluster = [
'YF00036312.t1',
'YF00012829.t1',
'YF00000889.t1',
'TTHERM_00527180',
'TTHERM_00335830',
'YF00012830.t1',
'YF00009126.t1',
'YF00005954.t1',
'YF00005804.t1',
'TTHERM_01055600',
'TTHERM_00624720',
] # TTHERM_00630470: mucocyst gene with drastically different expression profile

