In [116]:
import os
from dotenv import load_dotenv
import numpy as np
import pandas as pd
import datetime
from tqdm.notebook import tqdm
import warnings
import matplotlib.pyplot as plt


# Load settings from a local .env file into the process environment.
load_dotenv(override=True)

# Root folder of the competition data and folder holding the InterPro JSON results.
DATA_PATH = os.getenv('DATA_PATH')
DATA_PATH_INTERPRO = os.getenv('DATA_PATH_INTERPRO')

# Fail fast: os.getenv returns None for unset variables, which would otherwise
# only surface later as an opaque TypeError inside os.path.join / os.walk.
_missingSettings = [
    name
    for name, value in (("DATA_PATH", DATA_PATH), ("DATA_PATH_INTERPRO", DATA_PATH_INTERPRO))
    if not value
]
if _missingSettings:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(_missingSettings)
        + " (set them in the environment or in the .env file)"
    )

print(DATA_PATH)
print(DATA_PATH_INTERPRO)

# NOTE(review): not referenced in the cells visible here; presumably a
# sub-ontology selector — TODO confirm or remove.
SO="CCO"

/mnt/e/ML/cafa-5-protein-function-prediction
/mnt/e/ML/output


In [117]:
from Bio import SeqIO

# Read the test FASTA file once and collect sequences and ids in lockstep,
# so that sequences[k] always belongs to ids[k] (the file was previously
# parsed twice, once per list).
sequences = []
ids = []
for rec in SeqIO.parse(os.path.join(DATA_PATH, "Test (Targets)/testsuperset.fasta"), "fasta"):
    sequences.append(rec.seq)
    ids.append(rec.id)

In [118]:
import networkx
import obonet

# Read the Gene Ontology file (go-basic.obo) into a graph via obonet.
# NOTE: despite its name, `url` holds a local file path under DATA_PATH.
url = os.path.join(DATA_PATH, "Train/go-basic.obo")
graph = obonet.read_obo(url)


In [119]:
# Lookup tables between ontology node ids and their human-readable names.
# id_to_name holds every node (value is None when a node has no 'name');
# name_to_id only holds nodes that actually carry a 'name' attribute.
id_to_name = {}
name_to_id = {}
for node_id, attrs in graph.nodes(data=True):
    id_to_name[node_id] = attrs.get('name')
    if 'name' in attrs:
        name_to_id[attrs['name']] = node_id

In [134]:

def getAllAncestors(go):
    """Return the terms lying on any simple path from `go` to an ontology root.

    The three roots are looked up by name ("cellular_component",
    "molecular_function", "biological_process") in the module-level
    `name_to_id` table, and paths are enumerated in the module-level `graph`
    with `networkx.all_simple_paths`. The result therefore contains `go`
    itself and the reached root whenever at least one path exists.
    NOTE(review): this assumes graph edges point from a term towards its
    parents — TODO confirm against the obonet documentation.

    Parameters
    ----------
    go : str
        Identifier of the starting term (e.g. "GO:0003677").

    Returns
    -------
    list
        De-duplicated term ids in arbitrary order. Empty when `go` is not a
        node of `graph` or when no root is reachable from it.
    """
    # Explicit membership test instead of a bare `except:`. A lazily evaluated
    # all_simple_paths reports an unknown source only while being iterated,
    # i.e. outside a try block wrapped around the call alone.
    if go not in graph:
        return []

    ancestors = set()
    for rootName in ("cellular_component", "molecular_function", "biological_process"):
        rootId = name_to_id.get(rootName)
        # A root that is missing from the ontology simply contributes nothing.
        if rootId is None or rootId not in graph:
            continue
        for path in networkx.all_simple_paths(graph, source=go, target=rootId):
            ancestors.update(path)

    return list(ancestors)


In [135]:
# Spot check on a single term: the output should list the queried id together
# with the terms found on its paths up to the ontology root.
getAllAncestors("GO:0003677")

['GO:0097159',
 'GO:0003674',
 'GO:0003676',
 'GO:0003677',
 'GO:0005488',
 'GO:1901363']

### Read Interpro2GO

In [136]:
# Parse the InterPro -> GO mapping file into two parallel lists
# (ipList[k] is mapped to GOList[k]). The line layout assumed by the parser is
#   <prefix>:<InterPro accession> <description> > <GO term name> ; <GO id>
ipList = []
GOList = []
with open(os.path.join(DATA_PATH, "interpro2go.txt")) as file:
    for line in file:
        # Lines starting with "!" are skipped (presumably header/comment lines);
        # blank lines carry no mapping and would break the splitting below.
        if line.startswith("!") or not line.strip():
            continue
        # Accession: first space-delimited token without its "<prefix>:" part.
        ip = line.split(" ")[0].split(":")[1].strip()
        # GO id: the field after the LAST " ; " of the line. strip() replaces
        # the former "drop the final character" slice, which cut off the last
        # digit of the id when the final line had no trailing newline.
        go = line.rsplit(" ; ", 1)[-1].strip()
        ipList.append(ip)
        GOList.append(go)

print(len(ipList))
print(len(GOList))

30447
30447


In [137]:
# Long-format InterPro -> GO table indexed by InterPro accession; an accession
# appears once per GO term it maps to, so the index is not unique.
dictData = {'Interpro': ipList, 'GO': GOList}
ip2go = pd.DataFrame(dictData).set_index("Interpro")

In [138]:
# Preview the mapping; index labels repeat because one InterPro entry can map
# to several GO terms.
ip2go["GO"]


Interpro
IPR000003    GO:0003677
IPR000003    GO:0003707
IPR000003    GO:0008270
IPR000003    GO:0006355
IPR000003    GO:0005634
                ...    
IPR048182    GO:0043464
IPR048197    GO:0004867
IPR048197    GO:0004869
IPR048205    GO:0102560
IPR048205    GO:0102561
Name: GO, Length: 30447, dtype: object

### Read Interpro Data

In [9]:
import json

# Gather the "results" records from every test-set JSON file found below
# DATA_PATH_INTERPRO (presumably InterProScan output — TODO confirm).
allInterproData = []

for root, dirs, files in os.walk(DATA_PATH_INTERPRO):
    # sorted(): os.walk yields file names in arbitrary order; a fixed order
    # makes the load (and any later "last one wins" overwrite) reproducible.
    for f in sorted(files):
        if f.endswith(".json") and f.startswith("test"):
            print("Processing ", f)
            with open(os.path.join(root, f)) as inputFile:
                iprData = json.load(inputFile)
            # Extend in place: rebuilding the list by unpacking re-copied
            # everything accumulated so far for each additional file.
            allInterproData.extend(iprData["results"])

Processing  testsuperset1.fasta.json
Processing  testsuperset2.fasta.json


In [139]:
# Map each sequence id to the InterPro entry accessions among its matches.
ipMatches = {}

for ipData in allInterproData:
    # Matches whose signature has no (falsy) "entry" are skipped.
    accessions = [
        match["signature"]["entry"]["accession"]
        for match in ipData["matches"]
        if match["signature"]["entry"]
    ]
    # "xref" is a list: register the matches under EVERY id it holds, not only
    # the first one, so ids sharing one result record are not lost.
    # NOTE(review): presumably the producer groups identical input sequences
    # into one record with several xrefs — confirm against its output spec.
    for xref in ipData["xref"]:
        # Each id gets its own list so later edits to one cannot affect others.
        ipMatches[xref["id"]] = list(accessions)

In [140]:
# Number of sequence ids that have an entry in ipMatches; compare with
# len(ids) to see how many test sequences will be counted as "no match".
len(ipMatches)

139946

### Combine the data 

In [148]:
# Build the prediction rows: for every test sequence with InterPro matches,
# translate the matched entries to GO terms, expand each term with
# getAllAncestors, and emit one [sequence id, GO id, 1.0] row per distinct term.
tableData = []
noMatch = 0  # sequences without any entry in ipMatches

# Group the InterPro -> GO table once into a plain dict. This replaces the
# repeated ip2go.loc lookups and the `.size == 1` special case for accessions
# with exactly one GO term.
ipToGos = {}
for ipAcc, goId in zip(ip2go.index, ip2go["GO"]):
    ipToGos.setdefault(ipAcc, []).append(goId)

# Path enumeration is expensive and the same GO terms recur across many
# proteins, so each term is expanded at most once. The cache lives only in
# this cell, so it cannot go stale if `graph` is reloaded elsewhere.
ancestorCache = {}

for seqId in tqdm(ids, total=len(ids)):
    if seqId not in ipMatches:
        noMatch += 1
        continue

    gosWithAnc = set()
    for ipEntry in ipMatches[seqId]:
        # InterPro entries without a GO mapping contribute nothing.
        for g in ipToGos.get(ipEntry, ()):
            if g not in ancestorCache:
                ancestorCache[g] = getAllAncestors(g)
            gosWithAnc.update(ancestorCache[g])

    # Sorted so that the generated table is identical from run to run.
    for g in sorted(gosWithAnc):
        tableData.append([seqId, g, 1.0])

  0%|          | 0/141865 [00:00<?, ?it/s]

In [146]:
# Assemble the long-format prediction table: one (sequence id, GO id, score)
# row per entry of tableData.
results = pd.DataFrame(tableData, columns=['Entry ID', 'GO', 'Probability'])

In [147]:
# Write the table as tab-separated text without header row or index column.
# NOTE(review): the file is written into DATA_PATH, next to the input data.
results.to_csv(os.path.join(DATA_PATH, "submissionIp2Go.tsv"), sep="\t", header=False, index=False)