# Term Filtering

In [1]:
# load necessary libraries
import obonet
import networkx as nx
import pandas as pd
import torch

In [2]:
# 1. Read the inputs: the ontology (OBO file) as a graph, and the
#    tab-separated training-term table as a DataFrame.
go_graph = obonet.read_obo(
    'data/Train/go-basic.obo',
)
train_terms = pd.read_csv(
    'data/Train/train_terms.tsv',
    delimiter='\t',  # alias of `sep`
)

In [3]:
# 2. Restrict to the terms that occur in the training table
# Distinct values of the 'term' column, in order of first appearance.
cafa5_terms = train_terms.loc[:, 'term'].unique()
# Accumulator for the node set of the ontology subgraph; starts empty and is
# filled by the hierarchy-collection step.
subgraph_nodes = set()

In [4]:
# 3. Collect hierarchy
# For every training term present in the ontology, keep the term itself plus
# all of its GO ancestors (superterms).
#
# obonet orients edges from child to parent, so the superterms of a term are
# its networkx *descendants*; nx.ancestors would instead return the more
# specific subterms.
for term in cafa5_terms:
    # Terms missing from the loaded ontology are skipped.
    if term in go_graph:
        subgraph_nodes.add(term)
        subgraph_nodes.update(nx.descendants(go_graph, term))

In [5]:
# 4. Induce the subgraph of the ontology on the collected nodes
# (functional form of go_graph.subgraph(...); the result is a view of go_graph).
go_subgraph = nx.subgraph(go_graph, subgraph_nodes)

In [6]:
# 5. Convert to PyTorch Geometric format
# Assign indices in sorted term order so the term -> index mapping is
# reproducible across runs. The iteration order of a subgraph view is not
# guaranteed to be stable (it can follow the order of the node set the view
# was built from), and the mapping itself is not written to disk.
term_to_idx = {term: i for i, term in enumerate(sorted(go_subgraph.nodes()))}

# One (source index, target index) pair per edge, sorted so the saved tensor
# is deterministic as well.
# NOTE(review): if go_graph is a multigraph (obonet returns one), parallel
# edges yield repeated pairs here — confirm whether they should be deduplicated.
edges = sorted((term_to_idx[u], term_to_idx[v]) for u, v in go_subgraph.edges())

# reshape(-1, 2) keeps the result at shape (2, num_edges) even when there are
# no edges; without it an empty list would produce a 1-D tensor.
edge_index = torch.tensor(edges, dtype=torch.long).reshape(-1, 2).t().contiguous()

In [8]:
# 6. Persist the edge tensor for the GCN
# Only edge_index is written by this call.
torch.save(obj=edge_index, f='data/go_hierarchy.pt')