# Jaccard AUPRC Evaluation
This notebook computes Jaccard link-prediction scores for drug–disease pairs in a TxGNN-formatted subgraph and reports the AUPRC.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.metrics import average_precision_score
from tqdm import tqdm

### Load data

In [2]:
import os
import pandas as pd
from pathlib import Path

# --- Automatically find the project root ---
# Works even if the notebook is opened from notebooks/ or repo root
here = Path().resolve()
while not (here / 'data').exists() and here != here.parent:
    here = here.parent
os.chdir(here)

# --- Choose which subgraph to evaluate ---
SUBGRAPH = 'drug-disease-gene_protein'   # <-- change this for each run
DATA_DIR = Path(f'data/subgraphs/{SUBGRAPH}/full_graph_42')

# --- Paths ---
node_path = DATA_DIR.parent / 'node.csv'
edge_path = DATA_DIR.parent / 'edges.csv'
train_path = DATA_DIR / 'train.csv'
test_path = DATA_DIR / 'test.csv'

# --- Load data ---
nodes = pd.read_csv(node_path, sep='\t', quotechar='"')
edges = pd.read_csv(edge_path)
train_edges = pd.read_csv(train_path)
test_edges = pd.read_csv(test_path)

print(f"Loaded {len(nodes)} nodes, {len(edges)} total edges, "
      f"{len(train_edges)} training edges, {len(test_edges)} test edges.")

  train_edges = pd.read_csv(train_path)


Loaded 52708 nodes, 3676556 total edges, 7695474 training edges, 405024 test edges.


  test_edges = pd.read_csv(test_path)


### Build 1-hop neighbor sets from the training graph

In [6]:
# Auto-detect edge column names
edge_cols = [c.lower() for c in train_edges.columns]
x_col = next((c for c in edge_cols if c.startswith('x_')), None)
y_col = next((c for c in edge_cols if c.startswith('y_')), None)

neighbors = {}
for _, row in train_edges.iterrows():
    u, v = str(row[x_col]), str(row[y_col])
    neighbors.setdefault(u, set()).add(v)
    neighbors.setdefault(v, set()).add(u)

print(f"Built neighbor sets for {len(neighbors)} nodes using columns '{x_col}' and '{y_col}'.")


KeyboardInterrupt: 

### Define Jaccard score function

In [None]:
def jaccard(u, v):
    nu = neighbors.get(str(u), set())
    nv = neighbors.get(str(v), set())
    if not nu or not nv:
        return 0.0
    return len(nu & nv) / len(nu | nv)

### Compute Jaccard scores for test positives (true drug–disease edges)

In [None]:
pos = test_edges.copy()
pos['score'] = [jaccard(x, y) for x, y in tqdm(zip(pos['x_index'], pos['y_index']), total=len(pos))]
pos['label'] = 1

print(f"Computed Jaccard scores for {len(pos)} positive test edges.")

### Sample an equal number of random negative (non-edge) drug–disease pairs

In [None]:
drug_nodes = nodes[nodes['node_type'].str.contains('drug', case=False)]['node_index'].astype(str).tolist()
disease_nodes = nodes[nodes['node_type'].str.contains('disease', case=False)]['node_index'].astype(str).tolist()

existing = set(tuple(sorted([str(x), str(y)])) for x, y in edges[['x_index', 'y_index']].astype(str).values)
neg_pairs = []
rng = np.random.default_rng(42)

while len(neg_pairs) < len(pos):
    d = rng.choice(drug_nodes)
    dis = rng.choice(disease_nodes)
    if (d, dis) not in existing and (dis, d) not in existing:
        neg_pairs.append((d, dis))

neg = pd.DataFrame(neg_pairs, columns=['x_index', 'y_index'])
neg['score'] = [jaccard(x, y) for x, y in tqdm(zip(neg['x_index'], neg['y_index']), total=len(neg))]
neg['label'] = 0

print(f"Sampled {len(neg)} negative pairs.")

### Compute AUPRC (Average Precision) for Jaccard similarity

In [None]:
y_true = np.concatenate([pos['label'], neg['label']])
y_score = np.concatenate([pos['score'], neg['score']])

auprc = average_precision_score(y_true, y_score)
print(f'Jaccard AUPRC: {auprc:.4f}')

### Save detailed results

In [None]:
out = pd.concat([pos, neg], ignore_index=True)
out.to_csv(DATA_DIR.parent / 'jaccard_eval.csv', index=False)
print('Saved per-pair Jaccard scores to jaccard_eval.csv')