In [1]:
import pandas as pd
from glob import glob
import sys

sys.path.append('../../')
from utils import dataframe_utils

In [2]:
annot_files_raw = glob('./**/*.tsv')
annot_files_raw

['./test1/pep_frag_173.fasta.tsv',
 './test1/pep_frag_035.fasta.tsv',
 './test1/pep_frag_086.fasta.tsv',
 './test1/pep_frag_109.fasta.tsv',
 './test1/pep_frag_092.fasta.tsv',
 './test1/pep_frag_021.fasta.tsv',
 './test1/pep_frag_167.fasta.tsv',
 './test1/pep_frag_000.fasta.tsv',
 './test1/pep_frag_146.fasta.tsv',
 './test1/pep_frag_152.fasta.tsv',
 './test1/pep_frag_014.fasta.tsv',
 './test1/pep_frag_128.fasta.tsv',
 './test1/pep_frag_038.fasta.tsv',
 './test1/pep_frag_104.fasta.tsv',
 './test1/pep_frag_042.fasta.tsv',
 './test1/pep_frag_056.fasta.tsv',
 './test1/pep_frag_110.fasta.tsv',
 './test1/pep_frag_077.fasta.tsv',
 './test1/pep_frag_131.fasta.tsv',
 './test1/pep_frag_182.fasta.tsv',
 './test1/pep_frag_019.fasta.tsv',
 './test1/pep_frag_196.fasta.tsv',
 './test1/pep_frag_125.fasta.tsv',
 './test1/pep_frag_063.fasta.tsv',
 './test1/pep_frag_189.fasta.tsv',
 './test1/pep_frag_140.fasta.tsv',
 './test1/pep_frag_006.fasta.tsv',
 './test1/pep_frag_068.fasta.tsv',
 './test1/pep_frag_0

In [3]:
columns = [
    "Protein accession",
    "Sequence MD5 digest",
    "Sequence length",
    "Analysis",
    "Signature accession",
    "Signature description",
    "Start location",
    "Stop location",
    "Score",
    "Status",
    "Date",
    "InterPro annotations - accession",
    "InterPro annotations - description",
    "GO annotations with their source(s)",
    "Pathways annotations"
]
len(columns)

15

In [4]:
annot_df_raw = dataframe_utils.csv_files_to_df(annot_files_raw, sep='\t', cols=columns)
annot_df_raw
annot_df_raw['Protein accession'].values

array(['TTHERM_00541547', 'TTHERM_00541547', 'TTHERM_00541547', ...,
       'TTHERM_00760540', 'TTHERM_00760540', 'TTHERM_00927401'],
      dtype=object)

In [5]:
annot_df_raw

Unnamed: 0,Protein accession,Sequence MD5 digest,Sequence length,Analysis,Signature accession,Signature description,Start location,Stop location,Score,Status,Date,InterPro annotations - accession,InterPro annotations - description,GO annotations with their source(s),Pathways annotations
0,TTHERM_00541547,f07d6740e0003ef6fe12e459f21e761f,1221,SUPERFAMILY,SSF57184,Growth factor receptor domain,815,965,7.86E-21,T,06-06-2024,IPR009030,Growth factor receptor cysteine-rich domain su...,-,-
1,TTHERM_00541547,f07d6740e0003ef6fe12e459f21e761f,1221,Gene3D,G3DSA:2.10.220.10,-,957,1006,2.5E-5,T,06-06-2024,-,-,-,-
2,TTHERM_00541547,f07d6740e0003ef6fe12e459f21e761f,1221,SUPERFAMILY,SSF55486,"Metalloproteases (""zincins""), catalytic domain",58,380,3.56E-43,T,06-06-2024,-,-,-,-
3,TTHERM_00541547,f07d6740e0003ef6fe12e459f21e761f,1221,Gene3D,G3DSA:2.10.220.10,-,812,899,1.0E-5,T,06-06-2024,-,-,-,-
4,TTHERM_00541547,f07d6740e0003ef6fe12e459f21e761f,1221,Pfam,PF01457,Leishmanolysin,58,242,9.2E-19,T,06-06-2024,IPR001577,"Peptidase M8, leishmanolysin",-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233121,TTHERM_00760540,b0810225a2bf4584e73bb200c2555f33,126,SMART,SM01278,MAPKK1_Int_3,2,121,1.3E-41,T,05-06-2024,IPR015019,Ragulator complex protein LAMTOR3,-,-
233122,TTHERM_00760540,b0810225a2bf4584e73bb200c2555f33,126,Gene3D,G3DSA:3.30.450.30,"Dynein light chain 2a, cytoplasmic",1,123,2.4E-23,T,05-06-2024,-,-,-,-
233123,TTHERM_00760540,b0810225a2bf4584e73bb200c2555f33,126,Pfam,PF08923,Mitogen-activated protein kinase kinase 1 inte...,3,120,8.9E-6,T,05-06-2024,IPR015019,Ragulator complex protein LAMTOR3,-,-
233124,TTHERM_00760540,b0810225a2bf4584e73bb200c2555f33,126,SUPERFAMILY,SSF103196,Roadblock/LC7 domain,1,123,6.28E-32,T,05-06-2024,-,-,-,-


In [6]:
annot_df_raw.isna().any()

Protein accession                      False
Sequence MD5 digest                    False
Sequence length                        False
Analysis                               False
Signature accession                    False
Signature description                  False
Start location                         False
Stop location                          False
Score                                  False
Status                                 False
Date                                   False
InterPro annotations - accession       False
InterPro annotations - description     False
GO annotations with their source(s)    False
Pathways annotations                   False
dtype: bool

In [7]:
interpro_desc_dict = {}

interpro_descs = annot_df_raw['InterPro annotations - description'].values

for idx, acc in enumerate(annot_df_raw['InterPro annotations - accession'].values):
    if acc in interpro_desc_dict and interpro_desc_dict[acc] != interpro_descs[idx]:
        raise(ValueError(f'InterPro annotation with accession={acc} has multiple descriptions:\n1. {interpro_desc_dict[acc]}\n2. {interpro_descs[idx]}\n...'))
    interpro_desc_dict[acc] = interpro_descs[idx]

In [8]:
interpro_desc_dict

{'IPR009030': 'Growth factor receptor cysteine-rich domain superfamily',
 '-': '-',
 'IPR001577': 'Peptidase M8, leishmanolysin',
 'IPR006212': 'Furin-like repeat',
 'IPR000742': 'EGF-like domain',
 'IPR009851': 'Modifier of rudimentary, Modr',
 'IPR000595': 'Cyclic nucleotide-binding domain',
 'IPR014710': 'RmlC-like jelly roll fold',
 'IPR013099': 'Potassium channel domain',
 'IPR051413': 'Potassium/sodium hyperpolarization-activated cyclic nucleotide-gated channel',
 'IPR018490': 'Cyclic nucleotide-binding domain superfamily',
 'IPR047187': 'Upf1-like, C-terminal helicase domain',
 'IPR045055': 'DNA2/NAM7-like helicase',
 'IPR027417': 'P-loop containing nucleoside triphosphate hydrolase',
 'IPR014001': 'Helicase superfamily 1/2, ATP-binding domain',
 'IPR041677': 'DNA2/NAM7 helicase, helicase domain',
 'IPR041679': 'DNA2/NAM7 helicase-like, C-terminal',
 'IPR036961': 'Kinesin motor domain superfamily',
 'IPR001752': 'Kinesin motor domain',
 'IPR027640': 'Kinesin-like protein',
 'IPR

In [9]:
interpro_desc_list = list(interpro_desc_dict.items())

interpro_desc_df = pd.DataFrame(interpro_desc_list, columns=['InterPro', 'InterPro_description'])

In [10]:
interpro_desc_df.head()

Unnamed: 0,InterPro,InterPro_description
0,IPR009030,Growth factor receptor cysteine-rich domain su...
1,-,-
2,IPR001577,"Peptidase M8, leishmanolysin"
3,IPR006212,Furin-like repeat
4,IPR000742,EGF-like domain


In [11]:
interpro_desc_df.sort_values(by='InterPro').to_csv('./interpro_annotations.csv', index=False)

In [12]:
interpro_dict = {
        'TTHERM_ID': [],
        'InterPro': [],
}

for id in annot_df_raw['Protein accession'].unique():

    curr_df = annot_df_raw.loc[annot_df_raw['Protein accession'] == id]

    ipaa = sorted(list(curr_df['InterPro annotations - accession'].unique()))

    interpro_dict['TTHERM_ID'].append(id)
    interpro_dict['InterPro'].append(
                                                            ','.join(ipaa[1: ])
                                                            if len(ipaa) > 1 and ipaa[0] == '-' else ipaa[0]
                                                            )
    
interpro_df = pd.DataFrame(interpro_dict)

In [13]:
interpro_df.sample(20)

Unnamed: 0,TTHERM_ID,InterPro
15742,TTHERM_00600830,IPR003126
2117,TTHERM_00129610,"IPR000477,IPR036691,IPR043128,IPR043502,IPR052343"
12548,TTHERM_00670790,-
18205,TTHERM_00579060,"IPR001930,IPR014782,IPR015211,IPR016024,IPR027..."
14487,TTHERM_00860440,"IPR000742,IPR002859,IPR006212,IPR009030"
19272,TTHERM_00463835,IPR004963
15896,TTHERM_01577284,"IPR001611,IPR032675"
21079,TTHERM_00427460,IPR027887
19025,TTHERM_00670915,-
6024,TTHERM_00204100,IPR013041


In [14]:
interpro_df.shape

(21650, 2)

In [15]:
interpro_df = interpro_df.loc[
    (interpro_df['InterPro'] != '-')
    # &
    # (interpro_df['InterPro annotations - description'] != '-')
    # |
    # (interpro_df['InterPro annotations - description'] != '-')
]

In [16]:
interpro_df

Unnamed: 0,TTHERM_ID,InterPro
0,TTHERM_00541547,"IPR000742,IPR001577,IPR006212,IPR009030"
1,TTHERM_01124020,IPR009851
2,TTHERM_01184230,"IPR000595,IPR013099,IPR014710,IPR018490,IPR051413"
3,TTHERM_00095380,"IPR014001,IPR027417,IPR041677,IPR041679,IPR045..."
4,TTHERM_00238980,"IPR001752,IPR019821,IPR027417,IPR027640,IPR036961"
...,...,...
21643,TTHERM_01043360,IPR008491
21644,TTHERM_00829340,IPR001611
21645,TTHERM_00426250,"IPR000447,IPR006076,IPR031656,IPR036188,IPR038299"
21647,TTHERM_00630640,"IPR001611,IPR027038,IPR032675"


In [17]:
dataframe_utils.sql_query_df({'interpro_df': interpro_df}, 'select * from interpro_df where InterPro not like "%IPR%"')

Unnamed: 0,TTHERM_ID,InterPro


In [18]:
interpro_df.sort_values(by='TTHERM_ID').to_csv('./interpro_annot.csv', index=False)

In [19]:
# annot_df_raw['Analysis'].unique()

In [20]:
# analysis_dfs = []

# for analysis in annot_df_raw['Analysis'].unique():
#     curr_analysis_dict = {
#         'TTHERM_ID': [],
#         analysis: [],
#         f'{analysis}_desc': [],
#     }

#     curr_analysis_df = annot_df_raw.loc[annot_df_raw['Analysis'] == analysis]

#     curr_analysis_dict['TTHERM_ID'] += list(curr_analysis_df['Protein accession'].values)
#     curr_analysis_dict[analysis] += list(curr_analysis_df['Signature accession'].values)
#     curr_analysis_dict[f'{analysis}_desc'] += list(curr_analysis_df['Signature description'].values)

#     analysis_dfs.append(pd.DataFrame(curr_analysis_dict))

In [21]:
# combined_df = pd.DataFrame({'TTHERM_ID': annot_df_raw['Protein accession'].values})

# for idx, df in enumerate(analysis_dfs):
#     # combined_df = combined_df.merge(df, on='TTHERM_ID', how='outer')
#     combined_df = dataframe_utils.sql_query_df({'combined_df': combined_df, 'df': df},
# '''SELECT *
# FROM combined_df
# FULL OUTER JOIN df
# ON combined_df.TTHERM_ID = df.TTHERM_ID;''')