# Aid2GO Evaluation of Predictions
Author: Cleverson Matiolli Ph.D.

In [None]:
# Standard libraries
from pathlib import Path

# Third party libraries
import pandas as pd
import numpy as np

# Bioinformatics
from obonet import read_obo
import networkx as nx
from cafaeval.evaluation import cafa_eval, write_results

# Notebook-wide pandas setting: enable copy-on-write mode
pd.options.mode.copy_on_write = True

# Every relative input/output path below is resolved against the directory
# the notebook is launched from.
base_dir = Path.cwd()
print(f"Base directory: {base_dir}")

## Load Input Files

### Load Predictions

In [None]:
# Read the model's test-set results table
results_path = base_dir / "outputs/models/focused_128_1:2/test_results.csv"
test_results = pd.read_csv(results_path)

# Quick sanity check: dimensions, then the first rows
print(f"test results shape: {test_results.shape}")
test_results.head()

In [None]:
# Build the prediction file handed to the CAFA evaluator: three header-less,
# tab-separated columns (protein, GO term, score).
predictions = test_results[["Protein ID", "GO ID", "Probability"]]

# Scores are written with three decimals
predictions["Probability"] = predictions["Probability"].round(3)

# Sort by protein identifier (optional) and renumber the rows
predictions = predictions.sort_values(by="Protein ID").reset_index(drop=True)

# to_csv does not create missing folders, so make sure the target exists
predictions_dir = base_dir / "predictions"
predictions_dir.mkdir(parents=True, exist_ok=True)
predictions.to_csv(
    predictions_dir / "predictions.tsv",
    sep="\t",
    header=False,
    index=False,
    encoding="utf-8",
)
predictions

### Load Ground-truth (Protein-GO Associations)

In [None]:
# Load the tab-separated protein-GO association table used as ground truth
associations = pd.read_csv(base_dir / "data/ppi/associations_human.tsv", sep="\t")
# Label the output after the frame that is actually being reported
print(f"associations shape: {associations.shape}")

In [None]:
# Build the ground-truth file: header-less, tab-separated (protein, GO term) pairs

# Only annotations whose GO term occurs among the predicted terms are kept.
# NOTE(review): proteins are not restricted to the predicted set (the filter
# below is disabled and `tested_proteins` is not defined here) — confirm intent.
# ground_truth = ground_truth[ground_truth["uniprot_id"].isin(tested_proteins)]
predicted_terms = predictions["GO ID"]
ground_truth = associations[["uniprot_id", "go_id"]]
ground_truth = ground_truth[ground_truth["go_id"].isin(predicted_terms)]

# Order by protein and renumber the rows before writing
ground_truth = ground_truth.sort_values(by="uniprot_id").reset_index(drop=True)
ground_truth.to_csv(
    base_dir / "ground_truth.txt",
    sep="\t",
    header=False,
    index=False,
    encoding="utf-8",
)
ground_truth

### Load GO DAG

In [None]:
# Parse the GO OBO file into a graph object and print its summary.
# The import is repeated here so the cell can be run on its own.
from obonet import read_obo

obo_path = base_dir / "data/go/go-basic.obo"
go_graph = read_obo(obo_path)
print(go_graph)

In [None]:
# Load the information accretion (IA) table and re-export it as a
# header-less, tab-separated file in the working directory.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
filepath = Path("/media/clever/aidbio/data/cafa5/information_accretion.txt")
ia = pd.read_csv(filepath, sep="\t")

# Keep the term identifier and its IA value, in that column order
# ia = ia[ia["go_id"].isin(go_graph.nodes())]
ia = ia[["go_id", "ia"]].reset_index(drop=True)
ia.to_csv("ia.txt", sep="\t", header=False, index=False, encoding="utf-8")
ia

In [None]:
# Run the CAFA evaluation of every file in the prediction folder against the
# ground truth, weighting terms with the exported IA file.
# The ground-truth path matches the location the file is written to earlier
# in this notebook (base_dir / "ground_truth.txt"); keeping it outside
# `pred_dir` also keeps that folder limited to prediction files.
df, dfs_best = cafa_eval(
    obo_file=base_dir / "data/go/go-basic.obo",
    pred_dir=base_dir / "predictions",
    gt_file=base_dir / "ground_truth.txt",
    ia="ia.txt",
    no_orphans=False,
    norm="cafa",
    prop="max",
    max_terms=None,
    th_step=0.01,
    n_cpu=1,
)

# View the results
print(dfs_best)
df