# IMPORTS

In [1]:
import os
from glob import glob
import pandas as pd
from Bio import SeqIO

from utils import file_utils

# EGGNOG ANNOTATIONS FILE

In [2]:
# Locate the eggNOG annotations file supplied by the user. Exactly one file
# with the expected extension must be present in the input folder.
eggnog_input_path = './input_data/eggnog_annotations_file'
eggnog_input_ext = '.annotations'

eggnog_folder_files = glob(os.path.join(eggnog_input_path, '*' + eggnog_input_ext))

# Stop with an actionable message when the folder holds too many or no matches.
if len(eggnog_folder_files) > 1:
    raise ValueError(f'MULTIPLE EGGNOG ANNOTATIONS FILES WITH EXTENSION \"{eggnog_input_ext}\" DETECTED. PLEASE DEPOSIT ONLY ONE REVEVANT FILE FOLLOWING FOLDER: {os.path.abspath(eggnog_input_path)}')
elif not eggnog_folder_files:
    raise ValueError(f'EGGNOG ANNOTATIONS FILE WITH EXTENSION \"{eggnog_input_ext}\" NOT DETECTED. PLEASE DEPOSIT THE REVEVANT FILE IN THE FOLLOWING FOLDER: {os.path.abspath(eggnog_input_path)}')

In [3]:
# Load the tab-separated annotations table. With `comment='#'` pandas ignores
# everything from a '#' to the end of the line, and skips lines starting with it.
# NOTE(review): eggNOG-mapper output presumably prefixes its column-header line
# with '#', in which case that header is discarded too — confirm the resulting
# column names are the intended ones.
annotations = pd.read_csv(
    eggnog_folder_files[0],
    sep='\t',
    comment='#',
)

In [4]:
# Save the annotations table as CSV (without the index column).
# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists first; this is a no-op when it is already there.
os.makedirs('./active_files', exist_ok=True)
annotations.to_csv('./active_files/eggnog_annotations.csv', index=False)

# INTERPROSCAN TSV FILE

In [5]:
# Locate the InterProScan TSV file supplied by the user. Exactly one file
# with the expected extension must be present in the input folder.
interproscan_input_path = './input_data/interproscan_tsv_file'
interproscan_input_ext = '.tsv'

interproscan_folder_files = glob(os.path.join(interproscan_input_path, '*' + interproscan_input_ext))

# Stop with an actionable message when the folder holds too many or no matches.
if len(interproscan_folder_files) > 1:
    raise ValueError(f'MULTIPLE INTERPROSCAN TSV FILES WITH EXTENSION \"{interproscan_input_ext}\" DETECTED. PLEASE DEPOSIT ONLY ONE REVEVANT FILE FOLLOWING FOLDER: {os.path.abspath(interproscan_input_path)}')
elif not interproscan_folder_files:
    raise ValueError(f'INTERPROSCAN TSV FILE WITH EXTENSION \"{interproscan_input_ext}\" NOT DETECTED. PLEASE DEPOSIT THE REVEVANT FILE IN THE FOLLOWING FOLDER: {os.path.abspath(interproscan_input_path)}')

# KALLISTO TSV FILES

In [6]:
# Collect the kallisto TSV files that live inside the sub-folders of the input
# folder. At least one file is required; several are allowed.
kallisto_input_path = './input_data/kallisto_data_folders'
kallisto_input_ext = '.tsv'

# glob is called without recursive=True, so '**' behaves like a plain '*':
# only files exactly one folder level below the input folder are matched.
kallisto_folder_files = glob(os.path.join(kallisto_input_path, '**/*' + kallisto_input_ext))

if not kallisto_folder_files:
    raise ValueError(f'KALLISTO TSV DATA FILE(S) WITH EXTENSION \"{kallisto_input_ext}\" NOT DETECTED. PLEASE DEPOSIT THE REVEVANT FOLDER(S) CONTAINING THESE FILE(S) IN THE FOLLOWING FOLDER: {os.path.abspath(kallisto_input_path)}')

# GENOME MODEL FILES

In [7]:
# Base URL of the release assets from which the genome model FASTA files are
# downloaded when they are not found locally. The trailing '/' is significant:
# file names are appended to this value to form the download URLs.
url = 'https://github.com/yefei521/Tetrahymena_Genome_annotation_V2024/releases/download/V2024.2/'

In [8]:
# Locate the genome model CDS FASTA file. When none is present it is downloaded
# from the release URL into the input folder; more than one candidate is an error.
cds_input_path = './input_data/genome_model/cds_fasta'

cds_input_ext = '.fasta'

cds_folder_files = glob(os.path.join(cds_input_path, f'*{cds_input_ext}'))

if len(cds_folder_files) < 1:
    print(f'GENOME MODEL CDS FILE WITH EXTENSION \"{cds_input_ext}\" NOT DETECTED. DOWNLOADING TO THE FOLLOWING FOLDER: {os.path.abspath(cds_input_path)}')
    cds_file_name = 'Tetrahymena_Genome_annotation_V2024_CDS.fasta'
    # URLs always use '/', so join explicitly rather than with os.path.join,
    # which is a filesystem API and only yields a valid URL when `url` happens
    # to end with a slash.
    cds_file = f"{url.rstrip('/')}/{cds_file_name}"
    cds_download_path = os.path.join(cds_input_path, cds_file_name)
    # Create the destination folder up front; whether the download helper does
    # so itself is not visible from this notebook.
    os.makedirs(cds_input_path, exist_ok=True)
    file_utils.download_file_chunks(cds_file, cds_download_path)
    # Register the downloaded file so the following cells can read it.
    cds_folder_files.append(cds_download_path)
elif len(cds_folder_files) > 1:
    raise ValueError(f'MULTIPLE GENOME MODEL CDS FILES WITH EXTENSION \"{cds_input_ext}\" DETECTED. PLEASE DEPOSIT ONLY ONE REVEVANT FILE FOLLOWING FOLDER: {os.path.abspath(cds_input_path)}')

['./input_data/genome_model/cds_fasta/Tetrahymena_Genome_annotation_V2024_CDS.fasta']


In [9]:
# Parse every record of the CDS FASTA file into memory.
with open(cds_folder_files[0], 'r') as cds_handle:
    cds_records = list(SeqIO.parse(cds_handle, 'fasta'))

# The output folder is not guaranteed to exist (e.g. on a fresh checkout) and
# writing into a missing folder fails, so create it first if needed.
os.makedirs('./active_files', exist_ok=True)

# Re-write the records as FASTA into the working folder. As the last expression
# of the cell, the return value of SeqIO.write is displayed as the cell output.
SeqIO.write(cds_records, './active_files/cds.fasta', 'fasta')

26687

In [10]:
# Locate the genome model protein FASTA file. When none is present it is downloaded
# from the release URL into the input folder; more than one candidate is an error.
pep_input_path = './input_data/genome_model/pep_fasta'

pep_input_ext = '.fasta'

pep_folder_files = glob(os.path.join(pep_input_path, f'*{pep_input_ext}'))

if len(pep_folder_files) < 1:
    print(f'GENOME MODEL PEP FILE WITH EXTENSION \"{pep_input_ext}\" NOT DETECTED. DOWNLOADING TO THE FOLLOWING FOLDER: {os.path.abspath(pep_input_path)}')
    pep_file_name = 'Tetrahymena_Genome_annotation_V2024_Protein_addAnno.fasta'
    # URLs always use '/', so join explicitly rather than with os.path.join,
    # which is a filesystem API and only yields a valid URL when `url` happens
    # to end with a slash.
    pep_file = f"{url.rstrip('/')}/{pep_file_name}"
    pep_download_path = os.path.join(pep_input_path, pep_file_name)
    # Create the destination folder up front; whether the download helper does
    # so itself is not visible from this notebook.
    os.makedirs(pep_input_path, exist_ok=True)
    file_utils.download_file_chunks(pep_file, pep_download_path)
    # Register the downloaded file so the following cells can read it.
    pep_folder_files.append(pep_download_path)
elif len(pep_folder_files) > 1:
    raise ValueError(f'MULTIPLE GENOME MODEL PEP FILES WITH EXTENSION \"{pep_input_ext}\" DETECTED. PLEASE DEPOSIT ONLY ONE REVEVANT FILE FOLLOWING FOLDER: {os.path.abspath(pep_input_path)}')

In [11]:
# Parse every record of the protein FASTA file into memory.
with open(pep_folder_files[0], 'r') as pep_handle:
    pep_records = list(SeqIO.parse(pep_handle, 'fasta'))

# Record ids have the form '<id>|<common name>' (the name part is optional).
# Each record id is reduced to the part before the first '|', and the
# id -> common name mapping is collected column-wise for a DataFrame.
common_name_dict = {'TTHERM_ID': [], 'common_name': []}

# A set gives constant-time duplicate detection; testing membership in the
# growing id list would be quadratic over the whole file.
seen_ids = set()

for record in pep_records:
    gene_id, *name_parts = record.id.split('|')
    # NOTE(review): only record.id is rewritten; record.description is left
    # untouched — confirm the headers in the written FASTA are as intended.
    record.id = gene_id

    if gene_id in seen_ids:
        raise ValueError(f'{record.id} appears more than once in the protein fasta file.')
    seen_ids.add(gene_id)

    common_name_dict['TTHERM_ID'].append(gene_id)
    # '-' marks records whose id carries no common name.
    common_name_dict['common_name'].append(name_parts[0] if name_parts else '-')

common_name_df = pd.DataFrame(common_name_dict)

# The output folder is not guaranteed to exist (e.g. on a fresh checkout) and
# writing into a missing folder fails, so create it first if needed.
os.makedirs('./active_files', exist_ok=True)

SeqIO.write(pep_records, './active_files/pep.fasta', 'fasta')
common_name_df.to_csv('./active_files/common_names.csv', index=False)