In [1]:
import pandas as pd
import sys

sys.path.append('../../')
from utils import dataframe_utils

In [2]:
# Load the compiled InterProScan output (tab-separated) into a DataFrame.
annot_df_raw = pd.read_csv('../../input_data/interproscan_tsv_file/interproscan_compiled.tsv', sep='\t')

# Quick look at the protein accession column to confirm the file loaded.
# Only the last expression of a cell is displayed, so nothing else is echoed here.
annot_df_raw['Protein accession'].values

array(['TTHERM_00046368', 'TTHERM_00046368', 'TTHERM_00046368', ...,
       'TTHERM_00245160', 'TTHERM_00245160', 'TTHERM_00245160'],
      dtype=object)

In [3]:
# Display the raw annotation table (the notebook shows a truncated preview of the first and last rows).
annot_df_raw

Unnamed: 0,Protein accession,Sequence MD5 digest,Sequence length,Analysis,Signature accession,Signature description,Start location,Stop location,Score,Status,Date,InterPro annotations - accession,InterPro annotations - description,GO annotations with their source(s),Pathways annotations
0,TTHERM_00046368,e7ee8c885c14dee785ed46b49f88b10c,1147,MobiDBLite,mobidb-lite,consensus disorder prediction,404,475,-,T,02-07-2024,-,-,-,-
1,TTHERM_00046368,e7ee8c885c14dee785ed46b49f88b10c,1147,MobiDBLite,mobidb-lite,consensus disorder prediction,660,687,-,T,02-07-2024,-,-,-,-
2,TTHERM_00046368,e7ee8c885c14dee785ed46b49f88b10c,1147,MobiDBLite,mobidb-lite,consensus disorder prediction,212,233,-,T,02-07-2024,-,-,-,-
3,TTHERM_00046368,e7ee8c885c14dee785ed46b49f88b10c,1147,MobiDBLite,mobidb-lite,consensus disorder prediction,367,391,-,T,02-07-2024,-,-,-,-
4,TTHERM_00046368,e7ee8c885c14dee785ed46b49f88b10c,1147,MobiDBLite,mobidb-lite,consensus disorder prediction,613,630,-,T,02-07-2024,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233202,TTHERM_00927401,65eeb2a7e98e049b64af46123ef13b7d,581,Coils,Coil,Coil,542,562,-,T,08-07-2024,-,-,-,-
233203,TTHERM_00245160,29c0c10a31c971f67381355224400f07,969,MobiDBLite,mobidb-lite,consensus disorder prediction,307,326,-,T,08-07-2024,-,-,-,-
233204,TTHERM_00245160,29c0c10a31c971f67381355224400f07,969,MobiDBLite,mobidb-lite,consensus disorder prediction,307,323,-,T,08-07-2024,-,-,-,-
233205,TTHERM_00245160,29c0c10a31c971f67381355224400f07,969,MobiDBLite,mobidb-lite,consensus disorder prediction,577,612,-,T,08-07-2024,-,-,-,-


In [4]:
# Per-column check for missing values: True would mean the column contains at least one NaN.
# The later cells compare against the literal '-' string rather than NaN.
annot_df_raw.isna().any()

Protein accession                      False
Sequence MD5 digest                    False
Sequence length                        False
Analysis                               False
Signature accession                    False
Signature description                  False
Start location                         False
Stop location                          False
Score                                  False
Status                                 False
Date                                   False
InterPro annotations - accession       False
InterPro annotations - description     False
GO annotations with their source(s)    False
Pathways annotations                   False
dtype: bool

In [5]:
# Build a lookup from InterPro accession to its description, and fail loudly
# if the same accession ever shows up with two different descriptions.
interpro_desc_dict = {}

interpro_accs = annot_df_raw['InterPro annotations - accession'].values
interpro_descs = annot_df_raw['InterPro annotations - description'].values

for idx, acc in enumerate(interpro_accs):
    if acc not in interpro_desc_dict:
        # First sighting of this accession: remember its description.
        interpro_desc_dict[acc] = interpro_descs[idx]
    elif interpro_desc_dict[acc] != interpro_descs[idx]:
        # Same accession, conflicting description: the input is inconsistent.
        raise ValueError(f'InterPro annotation with accession={acc} has multiple descriptions:\n1. {interpro_desc_dict[acc]}\n2. {interpro_descs[idx]}\n...')

In [6]:
# Display the accession -> description lookup. Note it includes the '-' -> '-' placeholder entry.
interpro_desc_dict

{'-': '-',
 'IPR001594': 'Palmitoyltransferase, DHHC domain',
 'IPR001810': 'F-box domain',
 'IPR006553': 'Leucine-rich repeat, cysteine-containing subtype',
 'IPR036047': 'F-box-like domain superfamily',
 'IPR032675': 'Leucine-rich repeat domain superfamily',
 'IPR004344': 'Tubulin-tyrosine ligase/Tubulin polyglutamylase',
 'IPR000719': 'Protein kinase domain',
 'IPR027038': 'Ran GTPase-activating protein',
 'IPR001611': 'Leucine-rich repeat',
 'IPR011009': 'Protein kinase-like domain superfamily',
 'IPR036259': 'MFS transporter superfamily',
 'IPR051617': 'UNC-93-like transmembrane regulator',
 'IPR011701': 'Major facilitator superfamily',
 'IPR026906': 'BspA-type LRR region',
 'IPR002259': 'Equilibrative nucleoside transporter',
 'IPR025763': 'tRNA (guanine-N-7) methyltransferase catalytic subunit Trm8, eukaryote',
 'IPR003358': 'tRNA (guanine-N-7) methyltransferase, Trmb type',
 'IPR029063': 'S-adenosyl-L-methionine-dependent methyltransferase superfamily',
 'IPR007722': 'mRNA deca

In [7]:
# Turn the accession -> description lookup into a two-column DataFrame,
# one row per (accession, description) pair in dict insertion order.
interpro_desc_list = [
    (accession, description)
    for accession, description in interpro_desc_dict.items()
]

interpro_desc_df = pd.DataFrame(
    data=interpro_desc_list,
    columns=['InterPro', 'InterPro_description'],
)

In [8]:
# Preview the first rows of the accession/description table.
interpro_desc_df.head()

Unnamed: 0,InterPro,InterPro_description
0,-,-
1,IPR001594,"Palmitoyltransferase, DHHC domain"
2,IPR001810,F-box domain
3,IPR006553,"Leucine-rich repeat, cysteine-containing subtype"
4,IPR036047,F-box-like domain superfamily


In [9]:
# Write the accession/description table to CSV, sorted by accession and without the index column.
# NOTE(review): the '-' placeholder row is still part of the table at this point — confirm
# that downstream consumers of this file expect it.
interpro_desc_df.sort_values(by='InterPro').to_csv('../../active_files/interpro_annotations.csv', index=False)

In [10]:
# Collapse the per-match rows into one row per protein. Each protein gets its
# distinct InterPro accessions as a sorted, comma-separated string; proteins
# without any InterPro accession keep the '-' placeholder.
interpro_dict = {
        'TTHERM_ID': [],
        'InterPro': [],
}

# A single groupby pass avoids re-scanning the whole table once per protein.
# sort=False yields proteins in order of first appearance in the raw table.
accessions_by_protein = annot_df_raw.groupby(
    'Protein accession', sort=False
)['InterPro annotations - accession']

for protein_id, accessions in accessions_by_protein:
    # Remove the '-' placeholder explicitly. Relying on it sorting first and
    # slicing it off loses accessions for proteins that have several InterPro
    # hits but no '-' row at all.
    ipaa = sorted(set(accessions) - {'-'})

    interpro_dict['TTHERM_ID'].append(protein_id)
    interpro_dict['InterPro'].append(','.join(ipaa) if ipaa else '-')

interpro_df = pd.DataFrame(interpro_dict)

In [11]:
# Spot-check a random selection of proteins. The fixed seed makes the displayed
# rows reproducible across Restart & Run All.
interpro_df.sample(20, random_state=42)

Unnamed: 0,TTHERM_ID,InterPro
1493,TTHERM_00773750,"IPR001680,IPR015943,IPR019775,IPR036322,IPR050687"
5022,TTHERM_01002840,"IPR001789,IPR003594,IPR003661,IPR004358,IPR005..."
6540,TTHERM_00131160,-
14491,TTHERM_00313330,-
12741,TTHERM_00187060,-
3112,TTHERM_00502410,"IPR000595,IPR014710,IPR018490,IPR043533"
2425,TTHERM_00666970,-
13244,TTHERM_00759520,IPR032675
6235,TTHERM_01262820,"IPR006595,IPR013083,IPR024964,IPR027370,IPR037..."
10847,TTHERM_00826850,"IPR000595,IPR013099,IPR014710,IPR018490,IPR050818"


In [12]:
# (rows, columns) of the per-protein table before the proteins without InterPro accessions are filtered out.
interpro_df.shape

(21654, 2)

In [13]:
# Keep only the proteins that have at least one InterPro accession;
# '-' is the placeholder for proteins with none.
has_interpro = interpro_df['InterPro'].ne('-')
interpro_df = interpro_df.loc[has_interpro]

In [14]:
# Display the filtered per-protein table (the original row index is kept, so it has gaps).
interpro_df

Unnamed: 0,TTHERM_ID,InterPro
1,TTHERM_00581830,IPR001594
3,TTHERM_00522350,"IPR001810,IPR006553,IPR032675,IPR036047"
4,TTHERM_00624190,IPR004344
5,TTHERM_01370770,"IPR000719,IPR001611,IPR011009,IPR027038,IPR032675"
6,TTHERM_00522360,IPR011701
...,...,...
21645,TTHERM_00263110,"IPR007421,IPR029684,IPR038461"
21646,TTHERM_00408770,"IPR006212,IPR009030"
21649,TTHERM_00939130,IPR032675
21650,TTHERM_00698680,"IPR013087,IPR036236"


In [15]:
# Sanity check: look for rows whose InterPro value does not contain 'IPR'; the result is expected to be empty.
# NOTE(review): sql_query_df is a project helper — presumably it exposes each frame in the dict
# as a SQL table under its key; confirm against utils/dataframe_utils.
dataframe_utils.sql_query_df({'interpro_df': interpro_df}, 'select * from interpro_df where InterPro not like "%IPR%"')

Unnamed: 0,TTHERM_ID,InterPro


In [16]:
# Write the per-protein InterPro table to CSV, sorted by TTHERM_ID and without the index column.
interpro_df.sort_values(by='TTHERM_ID').to_csv('../../active_files/interpro_annot.csv', index=False)