In [1]:
import os
from dotenv import load_dotenv
import numpy as np
import pandas as pd
import datetime
from tqdm.notebook import tqdm


load_dotenv(override=True)

DATA_PATH = os.getenv('DATA_PATH')
print(DATA_PATH)

# Choose subontology (CCO, MFO or BPO)
SO = 'MFO'

/mnt/e/ML/cafa-5-protein-function-prediction


## Reading fasta, obo and tsv files

In [2]:
from Bio import SeqIO

sequences = [rec.seq for rec in SeqIO.parse(os.path.join(DATA_PATH, "Train/train_sequences.fasta"),"fasta")]
ids = [rec.id for rec in SeqIO.parse(os.path.join(DATA_PATH, "Train/train_sequences.fasta"),"fasta")]
sequencesTest = [rec.seq for rec in SeqIO.parse(os.path.join(DATA_PATH, "Test (Targets)/testsuperset.fasta"),"fasta")]
idsTest = [rec.id for rec in SeqIO.parse(os.path.join(DATA_PATH, "Test (Targets)/testsuperset.fasta"),"fasta")]

In [3]:
df = pd.read_csv(os.path.join(DATA_PATH, "Train/train_terms.tsv"), sep='\t')

dfSO = df.loc[df["aspect"]==SO]
uniqueTerms = dfSO["term"].unique()
termsArr = list(dfSO["term"].to_numpy())

uniqueTermsDict={}
for i,el in enumerate(uniqueTerms):
    uniqueTermsDict[el] = i
    
print(dfSO.shape)
df=dfSO

df.set_index("EntryID", inplace=True)

(670114, 3)


In [4]:
testID = df.index.to_list()[0]

## GO analysis

In [5]:
item_counts = df["term"].value_counts()

In [6]:
# id_to_name = {id_: data.get('name') for id_, data in graph.nodes(data=True)}
# name_to_id = {data['name']: id_ for id_, data in graph.nodes(data=True) if 'name' in data}

## Label encoding

The task is a multilabel classification: The output has several possible targets (Gene Ontologies) but each can only be 1 (existing) or 0 (non existing)

Extract label weights from IA

In [7]:
dfIa = pd.read_csv(os.path.join(DATA_PATH, "IA.txt"), sep='\t', header=None)

dfIa.set_index(0, inplace=True)

labelWeights=[]
allIndices = dfIa.index.tolist()



notFound=0
for go in item_counts.index.to_list():
    if go in allIndices:
        labelWeights.append(dfIa.loc[go].to_numpy()[0])
    else:
        notFound += 1
        labelWeights.append(0)

print("Not found GOs: {} (set to 0)".format(notFound))

Not found GOs: 0 (set to 0)


In [8]:
topGOs=item_counts.index.to_list()


## Amino acids encoding

In [9]:
aa_dict = {'A': 1, 'B':24, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10, 'M': 11, 'N': 12, 'O': 21, 'P': 13, 'Q': 14, 'R': 15, 'S': 16, 'T': 17, 'U': 22, 'V': 18, 'W': 19, 'Y': 20, 'X':30, 'Z':23}

## Build Dataset and Read Diamond Output

In [10]:
seqLengths = [len(seq) for seq in sequences]
maxLen = max(seqLengths)
print("The max. length of the sequences is {}".format(maxLen))

The max. length of the sequences is 35375


In [13]:

dfDiamond=pd.read_csv(os.path.join(DATA_PATH, "diamondOutput.tsv"), sep='\t', header=None, names=["target", "compare", "similarity", "val0", "val1", "val2", "val3", "val4", "val5", "val6", "val7", "val8"])

cleanedIndices=[]
for index in dfDiamond["target"]:
    cleanedIndices.append(str(index).split("\\t")[0])

dfDiamond["cleanedTarget"] = cleanedIndices

dfDiamond.set_index("cleanedTarget", inplace=True)

dfDiamond.drop(["val0", "val1", "val2", "val3", "val4", "val5", "val6", "val7", "val8"], axis=1, inplace=True)

dfDiamond.head(20)

Unnamed: 0_level_0,target,compare,similarity
cleanedTarget,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Q9CQV8,Q9CQV8\t10090,Q9CQV8,100.0
Q9CQV8,Q9CQV8\t10090,P35213,98.8
Q9CQV8,Q9CQV8\t10090,P31946,98.8
Q9CQV8,Q9CQV8\t10090,V9HWD6,98.8
Q9CQV8,Q9CQV8\t10090,Q5PRD0,91.0
Q9CQV8,Q9CQV8\t10090,P63104,87.2
Q9CQV8,Q9CQV8\t10090,Q5ZKC9,86.8
Q9CQV8,Q9CQV8\t10090,P63101,86.8
Q9CQV8,Q9CQV8\t10090,P63102,86.8
Q9CQV8,Q9CQV8\t10090,P68254,81.0


In [14]:

dfAll=pd.read_csv(os.path.join(DATA_PATH, "Train/train_terms.tsv"), sep='\t')

soEntries = dfAll.loc[dfAll["aspect"]==SO]
soEntryIds = soEntries["EntryID"].unique()

# print(len(seqEntries))
print(soEntryIds)

# SoSequences = []
# for entry in soEntryIds:
#     SoSequences.append(sequences[ids.index(entry)])

# print(len(SoSequences))
dfAll.set_index("EntryID", inplace=True)

['A0A009IHW8' 'A0A023FBW4' 'A0A023FBW7' ... 'X5L1L5' 'X5L565' 'X5M5N0']


## Classification via Similarity

In [15]:
import cProfile

def inference():
    tableData=[]

    for seq in tqdm(idsTest, smoothing=0.1):
        try:
            diaData = dfDiamond.loc[seq]
        except:
            # If not found, skip
            print("Not found: ", seq)
            continue

        similarities = diaData["similarity"]
        comparedSeq = diaData["compare"]

        # If there ist just one single entry
        if(not type(similarities) is pd.core.series.Series):
            lastSim = similarities
            similarSeqName = comparedSeq
            entries = dfAll.loc[similarSeqName]
            gos = list(entries["term"])
            tableData = tableData + [[seq, g, lastSim/100] for g in gos]
            continue
        
        gos=set()
        lastSim=100

        for i,sim in enumerate(similarities):
            if i==0 or sim>98:
                lastSim = sim
                similarSeqName = comparedSeq[i]
                entries = dfAll.loc[similarSeqName]
                gos.update(list(entries["term"]))


        # gos=list(dict.fromkeys(gos))
        tableData = tableData + [[seq, g, lastSim/100] for g in gos]
        break

    results = pd.DataFrame(tableData, columns=['Entry ID', 'GO', 'Probability'])
    return results


cProfile.run("inference()", sort="tottime")

  0%|          | 0/141865 [00:00<?, ?it/s]

         16175 function calls (14495 primitive calls) in 0.560 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
       11    0.424    0.039    0.427    0.039 {method 'get_loc' of 'pandas._libs.index.IndexEngine' objects}
        1    0.063    0.063    0.117    0.117 managers.py:2361(_merge_blocks)
    15/13    0.057    0.004    0.057    0.004 {built-in method numpy.core._multiarray_umath.implement_array_function}
        5    0.002    0.000    0.002    0.000 {method 'nonzero' of 'numpy.ndarray' objects}
       33    0.001    0.000    0.001    0.000 {method 'acquire' of '_thread.lock' objects}
       28    0.001    0.000    0.001    0.000 {built-in method posix.stat}
        5    0.000    0.000    0.003    0.001 <__array_function__ internals>:177(where)
        6    0.000    0.000    0.000    0.000 {built-in method marshal.loads}
       19    0.000    0.000    0.000    0.000 socket.py:613(send)
       25    0.000    0.000    

In [16]:
tableData=[]
notFoundCount=0
for seq in tqdm(idsTest, smoothing=0.1):
    try:
        diaData = dfDiamond.loc[seq]
    except:
        # If not found, skip
        print(notFoundCount, " Not found: ", seq)
        notFoundCount += 1
        continue

    similarities = diaData["similarity"]
    comparedSeq = diaData["compare"]

    # If there ist just one single entry
    if(not type(similarities) is pd.core.series.Series):
        lastSim = similarities
        similarSeqName = comparedSeq
        entries = dfAll.loc[similarSeqName]
        gos = list(entries["term"])
        tableData = tableData + [[seq, g, lastSim/100] for g in gos]
        continue
    
    gos=set()
    lastSim=100

    for i,sim in enumerate(similarities):
        if i==0 or sim>98:
            lastSim = sim
            similarSeqName = comparedSeq[i]
            entries = dfAll.loc[similarSeqName]
            gos.update(list(entries["term"]))


    # gos=list(dict.fromkeys(gos))
    tableData = tableData + [[seq, g, lastSim/100] for g in gos]
    


results = pd.DataFrame(tableData, columns=['Entry ID', 'GO', 'Probability'])


  0%|          | 0/141865 [00:00<?, ?it/s]

0  Not found:  Q8C1A9
1  Not found:  P83855
2  Not found:  Q14DN9
3  Not found:  Q8BVF9
4  Not found:  Q3UUF8
5  Not found:  Q8BLB8
6  Not found:  Q3UUE9
7  Not found:  P61110
8  Not found:  Q8BT18
9  Not found:  Q9DCQ2
10  Not found:  P86174
11  Not found:  Q8R2K8
12  Not found:  A2AGB2
13  Not found:  Q9DAA7
14  Not found:  B1ARW8
15  Not found:  Q9D5Q8
16  Not found:  Q3U7U4
17  Not found:  Q9CPZ3
18  Not found:  Q9CQM1
19  Not found:  Q5SPV6
20  Not found:  Q3V0A6
21  Not found:  Q8C3M9
22  Not found:  Q9DAQ4
23  Not found:  Q8BGD0
24  Not found:  Q8BGK9
25  Not found:  Q8C5S3
26  Not found:  E9Q0B3
27  Not found:  Q8BVN0
28  Not found:  Q3TC33
29  Not found:  Q8CAI1
30  Not found:  Q8C963
31  Not found:  Q3UYG1
32  Not found:  Q8BXX9
33  Not found:  E9Q1U1
34  Not found:  E9PVB3
35  Not found:  Q3UHB8
36  Not found:  Q3URK1
37  Not found:  A0A140LIT1
38  Not found:  Q8BVF4
39  Not found:  Q9DAL3
40  Not found:  Q8VEG0
41  Not found:  Q8CDM4
42  Not found:  Q9DBT3
43  Not found:  Q

In [17]:
results.head(20)

Unnamed: 0,Entry ID,GO,Probability
0,Q9CQV8,GO:0004860,0.988
1,Q9CQV8,GO:0046907,0.988
2,Q9CQV8,GO:0045892,0.988
3,Q9CQV8,GO:0019222,0.988
4,Q9CQV8,GO:0051649,0.988
5,Q9CQV8,GO:0019899,0.988
6,Q9CQV8,GO:0048583,0.988
7,Q9CQV8,GO:0005829,0.988
8,Q9CQV8,GO:1903506,0.988
9,Q9CQV8,GO:0019220,0.988


In [18]:
results.to_csv(os.path.join(DATA_PATH, "submissionDiamond.tsv"), sep="\t", header=False, index=False)