In [1]:
import pandas as pd
from glob import glob
import sys

sys.path.append('../../')
from utils import dataframe_utils

In [2]:
# Collect the TSV annotation files that sit one directory below the working directory.
# NOTE(review): `recursive=True` is not passed, so `**` behaves like a plain `*` here and
# deeper (or top-level) .tsv files are not matched — confirm that is the intent.
# glob() makes no ordering guarantee, so the row order of anything built from this list may vary.
annot_files_raw = glob('./**/*.tsv')
annot_files_raw

['./jul01/pep_frag_173.fasta.tsv',
 './jul01/pep_frag_035.fasta.tsv',
 './jul01/pep_frag_086.fasta.tsv',
 './jul01/pep_frag_109.fasta.tsv',
 './jul01/pep_frag_092.fasta.tsv',
 './jul01/pep_frag_021.fasta.tsv',
 './jul01/pep_frag_167.fasta.tsv',
 './jul01/pep_frag_000.fasta.tsv',
 './jul01/pep_frag_146.fasta.tsv',
 './jul01/pep_frag_152.fasta.tsv',
 './jul01/pep_frag_014.fasta.tsv',
 './jul01/pep_frag_128.fasta.tsv',
 './jul01/pep_frag_038.fasta.tsv',
 './jul01/pep_frag_104.fasta.tsv',
 './jul01/pep_frag_042.fasta.tsv',
 './jul01/pep_frag_056.fasta.tsv',
 './jul01/pep_frag_110.fasta.tsv',
 './jul01/pep_frag_077.fasta.tsv',
 './jul01/pep_frag_131.fasta.tsv',
 './jul01/pep_frag_182.fasta.tsv',
 './jul01/pep_frag_019.fasta.tsv',
 './jul01/pep_frag_196.fasta.tsv',
 './jul01/pep_frag_125.fasta.tsv',
 './jul01/pep_frag_063.fasta.tsv',
 './jul01/pep_frag_189.fasta.tsv',
 './jul01/pep_frag_140.fasta.tsv',
 './jul01/pep_frag_006.fasta.tsv',
 './jul01/pep_frag_068.fasta.tsv',
 './jul01/pep_frag_0

In [3]:
# Header names handed to the loader (as `cols=`) for the annotation TSVs.
# NOTE(review): this looks like the InterProScan TSV column order — confirm against the tool version used.
columns = [
    # protein-level fields
    'Protein accession',
    'Sequence MD5 digest',
    'Sequence length',
    # signature match
    'Analysis',
    'Signature accession',
    'Signature description',
    'Start location',
    'Stop location',
    'Score',
    'Status',
    'Date',
    # cross-references
    'InterPro annotations - accession',
    'InterPro annotations - description',
    'GO annotations with their source(s)',
    'Pathways annotations',
]
len(columns)

15

In [4]:
# Load all annotation TSVs into a single frame using the explicit header list.
# NOTE(review): presumably csv_files_to_df concatenates the files row-wise — confirm in utils.dataframe_utils.
annot_df_raw = dataframe_utils.csv_files_to_df(annot_files_raw, sep='\t', cols=columns)

# Quick look at the protein accession column. (The stray bare `annot_df_raw` expression that used
# to precede this line was a no-op: only the last expression of a cell is displayed.)
annot_df_raw['Protein accession'].values

array(['TTHERM_00541547', 'TTHERM_00541547', 'TTHERM_00541547', ...,
       'TTHERM_00245160', 'TTHERM_00245160', 'TTHERM_00245160'],
      dtype=object)

In [5]:
# Preview the combined annotation table (pandas abbreviates the middle rows when rendering).
annot_df_raw

Unnamed: 0,Protein accession,Sequence MD5 digest,Sequence length,Analysis,Signature accession,Signature description,Start location,Stop location,Score,Status,Date,InterPro annotations - accession,InterPro annotations - description,GO annotations with their source(s),Pathways annotations
0,TTHERM_00541547,f07d6740e0003ef6fe12e459f21e761f,1221,SUPERFAMILY,SSF57184,Growth factor receptor domain,815,965,7.86E-21,T,02-07-2024,IPR009030,Growth factor receptor cysteine-rich domain su...,-,-
1,TTHERM_00541547,f07d6740e0003ef6fe12e459f21e761f,1221,Gene3D,G3DSA:2.10.220.10,-,661,753,3.2E-7,T,02-07-2024,-,-,-,-
2,TTHERM_00541547,f07d6740e0003ef6fe12e459f21e761f,1221,Gene3D,G3DSA:2.10.220.10,-,1082,1176,3.9E-8,T,02-07-2024,-,-,-,-
3,TTHERM_00541547,f07d6740e0003ef6fe12e459f21e761f,1221,SUPERFAMILY,SSF55486,"Metalloproteases (""zincins""), catalytic domain",58,380,3.56E-43,T,02-07-2024,-,-,-,-
4,TTHERM_00541547,f07d6740e0003ef6fe12e459f21e761f,1221,FunFam,G3DSA:3.90.132.10:FF:000001,leishmanolysin-like peptidase isoform X2,120,240,1.5E-29,T,02-07-2024,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233202,TTHERM_00927401,65eeb2a7e98e049b64af46123ef13b7d,581,Coils,Coil,Coil,542,562,-,T,08-07-2024,-,-,-,-
233203,TTHERM_00245160,29c0c10a31c971f67381355224400f07,969,MobiDBLite,mobidb-lite,consensus disorder prediction,307,326,-,T,08-07-2024,-,-,-,-
233204,TTHERM_00245160,29c0c10a31c971f67381355224400f07,969,MobiDBLite,mobidb-lite,consensus disorder prediction,307,323,-,T,08-07-2024,-,-,-,-
233205,TTHERM_00245160,29c0c10a31c971f67381355224400f07,969,MobiDBLite,mobidb-lite,consensus disorder prediction,577,612,-,T,08-07-2024,-,-,-,-


In [6]:
# Per-column check for missing values. The later passes over this frame compare raw cell
# values and have no NaN handling, so every column is expected to report False here.
annot_df_raw.isna().any()

Protein accession                      False
Sequence MD5 digest                    False
Sequence length                        False
Analysis                               False
Signature accession                    False
Signature description                  False
Start location                         False
Stop location                          False
Score                                  False
Status                                 False
Date                                   False
InterPro annotations - accession       False
InterPro annotations - description     False
GO annotations with their source(s)    False
Pathways annotations                   False
dtype: bool

In [7]:
# Build an accession -> description lookup from the two InterPro columns, walking them in
# lockstep. The '-' placeholder is treated like any other accession and ends up as a key.
interpro_accs = annot_df_raw['InterPro annotations - accession'].values
interpro_descs = annot_df_raw['InterPro annotations - description'].values

interpro_desc_dict = {}
for acc, desc in zip(interpro_accs, interpro_descs):
    # setdefault stores the first description seen and returns whatever is on record,
    # so any later row that disagrees with it is detected below.
    known_desc = interpro_desc_dict.setdefault(acc, desc)
    if known_desc != desc:
        raise ValueError(f'InterPro annotation with accession={acc} has multiple descriptions:\n1. {known_desc}\n2. {desc}\n...')

In [8]:
# Inspect the accession -> description mapping (the '-' placeholder appears as a key too).
interpro_desc_dict

{'IPR009030': 'Growth factor receptor cysteine-rich domain superfamily',
 '-': '-',
 'IPR006212': 'Furin-like repeat',
 'IPR000742': 'EGF-like domain',
 'IPR001577': 'Peptidase M8, leishmanolysin',
 'IPR009851': 'Modifier of rudimentary, Modr',
 'IPR014710': 'RmlC-like jelly roll fold',
 'IPR000595': 'Cyclic nucleotide-binding domain',
 'IPR051413': 'Potassium/sodium hyperpolarization-activated cyclic nucleotide-gated channel',
 'IPR013099': 'Potassium channel domain',
 'IPR018490': 'Cyclic nucleotide-binding domain superfamily',
 'IPR041677': 'DNA2/NAM7 helicase, helicase domain',
 'IPR027417': 'P-loop containing nucleoside triphosphate hydrolase',
 'IPR045055': 'DNA2/NAM7-like helicase',
 'IPR014001': 'Helicase superfamily 1/2, ATP-binding domain',
 'IPR041679': 'DNA2/NAM7 helicase-like, C-terminal',
 'IPR047187': 'Upf1-like, C-terminal helicase domain',
 'IPR001752': 'Kinesin motor domain',
 'IPR019821': 'Kinesin motor domain, conserved site',
 'IPR036961': 'Kinesin motor domain sup

In [9]:
# Turn the lookup into a two-column table: one (accession, description) row per dict entry,
# in the dict's insertion order.
interpro_desc_list = [
    (accession, description)
    for accession, description in interpro_desc_dict.items()
]
interpro_desc_df = pd.DataFrame(
    data=interpro_desc_list,
    columns=['InterPro', 'InterPro_description'],
)

In [10]:
# Spot-check the first rows of the accession/description table.
interpro_desc_df.head()

Unnamed: 0,InterPro,InterPro_description
0,IPR009030,Growth factor receptor cysteine-rich domain su...
1,-,-
2,IPR006212,Furin-like repeat
3,IPR000742,EGF-like domain
4,IPR001577,"Peptidase M8, leishmanolysin"


In [11]:
# Write the accession/description table ordered by accession, without the index column.
# Nothing filters the '-' placeholder beforehand, so it is written out as a row as well.
interpro_desc_df.sort_values(by='InterPro').to_csv('./interpro_annotations.csv', index=False)

In [12]:
def _join_interpro_accessions(accessions):
    """Collapse one protein's InterPro accessions into a single comma-separated string.

    Parameters
    ----------
    accessions : iterable of str
        Values of the 'InterPro annotations - accession' column for one protein;
        '-' marks a row without an InterPro accession.

    Returns
    -------
    str
        The distinct real accessions, sorted and joined with ',', or '-' when the
        protein has none.
    """
    real_accessions = sorted(set(accessions) - {'-'})
    return ','.join(real_accessions) if real_accessions else '-'


# One row per protein with all of its InterPro accessions.
#
# The previous loop only joined the accessions when the '-' placeholder was also present for
# the protein; a protein whose rows all carried a real accession silently kept just the first
# one. It also re-filtered the whole frame once per protein (and shadowed the builtin `id`).
# A single group-by pass fixes both. `sort=False` keeps proteins in first-appearance order,
# matching the order `.unique()` produced before.
interpro_df = (
    annot_df_raw
    .groupby('Protein accession', sort=False)['InterPro annotations - accession']
    .agg(_join_interpro_accessions)
    .rename('InterPro')
    .rename_axis('TTHERM_ID')
    .reset_index()
)

In [13]:
# Eyeball 20 random proteins. No random_state is given, so the rows shown change on every run.
interpro_df.sample(20)

Unnamed: 0,TTHERM_ID,InterPro
2175,TTHERM_00522480,"IPR000595,IPR014710,IPR018490,IPR043533"
12695,TTHERM_00160980,"IPR006876,IPR051584"
6891,TTHERM_00028680,IPR004947
15037,TTHERM_01260060,"IPR001611,IPR027038,IPR032675"
15386,TTHERM_01279550,"IPR006553,IPR032675"
3113,TTHERM_00584690,IPR011992
8096,TTHERM_00455230,IPR040040
12059,TTHERM_00420980,"IPR001680,IPR015943,IPR020472,IPR036322"
21469,TTHERM_01197000,IPR006212
16543,TTHERM_00077380,-


In [14]:
# Dimensions of the per-protein table: one row per unique protein accession,
# including proteins whose InterPro value is the '-' placeholder.
interpro_df.shape

(21654, 2)

In [15]:
# Keep only proteins that have at least one InterPro accession; '-' is the value stored
# for proteins with none.
has_interpro = interpro_df['InterPro'].ne('-')
interpro_df = interpro_df.loc[has_interpro]

In [16]:
# Preview the per-protein table after the '-' rows have been dropped.
interpro_df

Unnamed: 0,TTHERM_ID,InterPro
0,TTHERM_00541547,"IPR000742,IPR001577,IPR006212,IPR009030"
1,TTHERM_01124020,IPR009851
2,TTHERM_01184230,"IPR000595,IPR013099,IPR014710,IPR018490,IPR051413"
3,TTHERM_00095380,"IPR014001,IPR027417,IPR041677,IPR041679,IPR045..."
4,TTHERM_00238980,"IPR001752,IPR019821,IPR027417,IPR027640,IPR036961"
...,...,...
21646,TTHERM_00829340,IPR001611
21647,TTHERM_00426250,"IPR000447,IPR006076,IPR031656,IPR036188,IPR038299"
21649,TTHERM_01073470,"IPR001005,IPR009057,IPR017930,IPR021786,IPR047..."
21650,TTHERM_00630640,"IPR001611,IPR027038,IPR032675"


In [17]:
# Sanity check: list any remaining rows whose InterPro value contains no 'IPR' accession
# (an empty result is the expected outcome).
# The LIKE pattern is single-quoted: in standard SQL double quotes delimit identifiers, and
# only some engines fall back to treating "..." as a string literal.
# NOTE(review): the SQL engine behind sql_query_df is not visible here — confirm in utils.dataframe_utils.
dataframe_utils.sql_query_df(
    {'interpro_df': interpro_df},
    "select * from interpro_df where InterPro not like '%IPR%'",
)

Unnamed: 0,TTHERM_ID,InterPro


In [18]:
# Write the protein -> InterPro accession table ordered by TTHERM_ID, without the index column.
interpro_df.sort_values(by='TTHERM_ID').to_csv('./interpro_annot.csv', index=False)

In [19]:
# annot_df_raw['Analysis'].unique()

In [20]:
# analysis_dfs = []

# for analysis in annot_df_raw['Analysis'].unique():
#     curr_analysis_dict = {
#         'TTHERM_ID': [],
#         analysis: [],
#         f'{analysis}_desc': [],
#     }

#     curr_analysis_df = annot_df_raw.loc[annot_df_raw['Analysis'] == analysis]

#     curr_analysis_dict['TTHERM_ID'] += list(curr_analysis_df['Protein accession'].values)
#     curr_analysis_dict[analysis] += list(curr_analysis_df['Signature accession'].values)
#     curr_analysis_dict[f'{analysis}_desc'] += list(curr_analysis_df['Signature description'].values)

#     analysis_dfs.append(pd.DataFrame(curr_analysis_dict))

In [21]:
# combined_df = pd.DataFrame({'TTHERM_ID': annot_df_raw['Protein accession'].values})

# for idx, df in enumerate(analysis_dfs):
#     # combined_df = combined_df.merge(df, on='TTHERM_ID', how='outer')
#     combined_df = dataframe_utils.sql_query_df({'combined_df': combined_df, 'df': df},
# '''SELECT *
# FROM combined_df
# FULL OUTER JOIN df
# ON combined_df.TTHERM_ID = df.TTHERM_ID;''')