# Drug Repurposing Pathfinding Algorithm Benchmark

**Purpose:** Evaluate graph pathfinding algorithms for drug repurposing by comparing predicted mechanistic pathways against curated ground truth pathways.



---

## Setup


In [None]:
# Standard libraries
import pandas as pd
import numpy as np
import networkx as nx
import time
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import heapq
from typing import Dict, List, Tuple
from collections import deque, Counter
from evaluation_helpers import *
from evaluation_metrics import *
from Algorithms import *
from evaluation_runner import run_evaluation
import importlib
import evaluation_visualization as ev
importlib.reload(ev)
from hyperparameter_tuning import (
    run_full_grid_search,
    plot_tuning_results,
    get_best_predictions
)
import os

In [None]:
from pathlib import Path

# Locate the repository root, i.e. the directory that holds the "data" folder.
# Start from the current working directory.
REPO_ROOT = Path.cwd()

# When launched from a subfolder such as /notebook, "data" sits one level up.
if not (REPO_ROOT / "data").exists():
    if (REPO_ROOT.parent / "data").exists():
        REPO_ROOT = REPO_ROOT.parent

DATA_DIR = REPO_ROOT / "data"

# Logical dataset name -> CSV file inside DATA_DIR.
PATHS = {
    label: DATA_DIR / filename
    for label, filename in (
        ("nodes", "nodes.csv"),
        ("edges", "edges.csv"),
        ("ground_truth_nodes", "benchmark_pathways_nodes.csv"),
        ("ground_truth_edges", "benchmark_pathways_edges.csv"),
    )
}

# Report what was resolved so that a missing file is visible right away.
print("Repo root:", REPO_ROOT)
for label, location in PATHS.items():
    print(f"{label}: {location} | exists={location.exists()}")

---
## Load Data

Load the PrimeKG knowledge graph and ground truth pathways.

In [None]:
# Read the PrimeKG node and edge tables (both CSVs are decoded as latin1)
print("Loading PrimeKG data...")
nodes, edges = (
    pd.read_csv(PATHS[table], encoding="latin1") for table in ('nodes', 'edges')
)

print(f"  Nodes: {nodes.shape[0]:,}")
print(f"  Edges: {edges.shape[0]:,}")
print(f"  Node types: {nodes['node_type'].nunique()}")
print(f"  Edge types: {edges['relation'].nunique()}")

# Read the curated ground-truth pathways; node_index is forced to int on load
print("\nLoading ground truth pathways...")
ground_truth_nodes = pd.read_csv(PATHS['ground_truth_nodes'], dtype={'node_index': int})
ground_truth_edges = pd.read_csv(PATHS['ground_truth_edges'])

# List every pathway (in order of first appearance) with its number of node rows
pathways = ground_truth_nodes['pathway_id'].unique()
print(f"  Pathways: {len(pathways)}")
for p in pathways:
    n_nodes = int((ground_truth_nodes['pathway_id'] == p).sum())
    print(f"    - {p}: {n_nodes} nodes")

In [None]:
# Keep only pathways with at least four (non-null) node_index rows;
# both ground-truth tables are restricted to the same set of pathway ids.
node_counts = ground_truth_nodes.groupby('pathway_id')['node_index'].count()
long_pathways = node_counts.loc[node_counts.ge(4)].index

ground_truth_nodes = ground_truth_nodes.loc[ground_truth_nodes['pathway_id'].isin(long_pathways)]
ground_truth_edges = ground_truth_edges.loc[ground_truth_edges['pathway_id'].isin(long_pathways)]

print(f"Filtered to {len(long_pathways)} pathways with 4+ nodes")
print(f"  Nodes: {ground_truth_nodes.shape[0]:,}")
print(f"  Edges: {ground_truth_edges.shape[0]:,}")

---
## Build Knowledge Graph

Construct a NetworkX directed graph with node/edge attributes for pathfinding.

In [None]:
def build_graph(nodes_df, edges_df, bidirectional=True):
    """
    Build a NetworkX directed graph from cleaned PrimeKG CSVs.

    Args:
        nodes_df: DataFrame with columns
            node_index, node_id, node_type, node_name, node_source.
        edges_df: DataFrame with columns
            relation, display_relation, x_index, y_index.
        bidirectional: When True, every edge x -> y is also inserted as
            y -> x with the same attributes.

    Returns:
        nx.DiGraph whose nodes are keyed by int(node_index) and carry
        node_id, node_name, node_type and node_source (all as str), and
        whose edges carry relation and display_relation (as str).
    """
    G = nx.DiGraph()

    # Walk the columns in lockstep instead of using DataFrame.iterrows():
    # iterrows materialises a Series for every row, which is very slow on
    # large tables. Row order (and therefore which row wins when the same
    # node or edge appears twice) is unchanged.

    # ---------- Add nodes ----------
    node_rows = zip(
        nodes_df['node_index'],
        nodes_df['node_id'],
        nodes_df['node_name'],
        nodes_df['node_type'],
        nodes_df['node_source'],
    )
    for node_index, node_id, node_name, node_type, node_source in node_rows:
        G.add_node(
            int(node_index),
            node_id=str(node_id),
            node_name=str(node_name),
            node_type=str(node_type),
            node_source=str(node_source),
        )

    # ---------- Add edges ----------
    edge_rows = zip(
        edges_df['x_index'],
        edges_df['y_index'],
        edges_df['relation'],
        edges_df['display_relation'],
    )
    for x_index, y_index, relation, display_relation in edge_rows:
        u, v = int(x_index), int(y_index)
        attrs = {
            'relation': str(relation),
            'display_relation': str(display_relation),
        }
        G.add_edge(u, v, **attrs)

        # Mirror the edge so that pathfinding can traverse it both ways
        if bidirectional:
            G.add_edge(v, u, **attrs)

    return G

# Build the knowledge graph (edges mirrored in both directions) and report its size
print("Building graph...")
G = build_graph(nodes_df=nodes, edges_df=edges, bidirectional=True)
print(f"Graph built: {G.number_of_nodes():,} nodes, {G.number_of_edges():,} edges")

---
## Algorithm 1 - Shortest Path Baseline

The simplest baseline: find the shortest path (by hop count) between drug and disease.

**Expected behavior:**
- ✅ Will always find the target disease (if connected)
- ❌ May take shortcuts through direct drug→disease edges
- ❌ Ignores edge types and biological mechanism

In [None]:
def run_shortest_path(graph, ground_truth_df):
    """
    Run the unweighted shortest-path baseline on all pathways.

    For every pathway_id, the rows are ordered by step_order; the first row
    is used as the source node and the last row as the target node.

    Args:
        graph: NetworkX graph whose nodes carry 'node_id' and 'node_name'
            and whose edges carry 'relation'.
        ground_truth_df: DataFrame with the columns pathway_id, step_order,
            node_index and node_name.

    Returns:
        DataFrame with one row per pathway. Node indices, ids, names and
        relations are comma-joined strings; when no path is available these
        fields are 'NONE' and predicted_length is 0.
    """
    results = []

    for pathway_id in ground_truth_df['pathway_id'].unique():
        pathway_df = ground_truth_df[ground_truth_df['pathway_id'] == pathway_id].sort_values('step_order')

        # Get source (drug) and target (disease) indices
        source_idx = int(pathway_df.iloc[0]['node_index'])
        target_idx = int(pathway_df.iloc[-1]['node_index'])

        source_name = pathway_df.iloc[0]['node_name']
        target_name = pathway_df.iloc[-1]['node_name']

        print(f"\n{pathway_id}: {source_name} → {target_name}")

        try:
            # Only the search itself is guarded; attribute lookups below
            # should fail loudly if the graph is malformed.
            predicted_path = nx.shortest_path(graph, source_idx, target_idx)
        except (nx.NetworkXNoPath, nx.NodeNotFound):
            # NodeNotFound is raised when an endpoint is absent from the
            # graph; treat it like an unreachable target so one bad pathway
            # does not abort the whole benchmark run.
            print(f"  ✗ No path found")
            results.append({
                'pathway_id': pathway_id,
                'predicted_node_indices': 'NONE',
                'predicted_node_ids': 'NONE',
                'predicted_node_names': 'NONE',
                'predicted_relations': 'NONE',
                'predicted_length': 0,
                'ground_truth_length': len(pathway_df)
            })
            continue

        predicted_node_ids = [graph.nodes[idx]['node_id'] for idx in predicted_path]
        predicted_node_names = [graph.nodes[idx]['node_name'] for idx in predicted_path]

        # Relation of each consecutive (u, v) hop along the path
        predicted_relations = [
            graph.get_edge_data(u, v)['relation']
            for u, v in zip(predicted_path, predicted_path[1:])
        ]

        print(f"  ✓ Found path: {len(predicted_path)} nodes")
        # Only the first five node names are shown to keep the log compact
        print(f"  Path: {' → '.join(predicted_node_names[:5])}{'...' if len(predicted_path) > 5 else ''}")

        results.append({
            'pathway_id': pathway_id,
            'predicted_node_indices': ','.join(map(str, predicted_path)),
            'predicted_node_ids': ','.join(predicted_node_ids),
            'predicted_node_names': ','.join(predicted_node_names),
            'predicted_relations': ','.join(predicted_relations),
            'predicted_length': len(predicted_path),
            'ground_truth_length': len(pathway_df)
        })

    return pd.DataFrame(results)


# Execute the shortest-path baseline over all ground-truth pathways
print("="*60, "Running Shortest Path Algorithm", "="*60, sep="\n")

sp_predictions = run_shortest_path(graph=G, ground_truth_df=ground_truth_nodes)
# Optional: persist the predictions to disk
# sp_predictions.to_csv('baseline_shortest_path_predictions.csv', index=False)
# print("\n✓ Saved: baseline_shortest_path_predictions.csv")

---
## Meta-Path Constrained BFS

An improved baseline that enforces biologically valid edge type sequences.

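**Valid meta-path patterns (examples; the full list, including pathway-mediated patterns, is defined in `VALID_METAPATHS` below):**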
1. `drug → protein → disease` (direct mechanism)
2. `drug → protein → protein → disease` (protein interactions)
3. `drug → protein → anatomy → protein → disease` (tissue-specific)

**Invalid shortcuts blocked:**
- ❌ `drug → disease` (clinical indication, not mechanism)
- ❌ `drug → drug → disease` (drug similarity)

In [None]:
# Allowed meta-paths, each written as the ordered list of edge relations
# that a predicted path must follow from drug to disease.
VALID_METAPATHS = [
    # 1) Direct protein mechanism: drug → protein → disease
    ['drug_protein', 'disease_protein'],

    # 2) One protein-protein interaction in between
    ['drug_protein', 'protein_protein', 'disease_protein'],

    # 3) Two protein-protein interactions in between
    ['drug_protein', 'protein_protein', 'protein_protein', 'disease_protein'],

    # 4) Pathway-mediated
    ['drug_protein', 'pathway_protein', 'disease_protein'],
    ['drug_protein', 'pathway_protein', 'pathway_protein', 'disease_protein'],
    ['drug_protein', 'pathway_protein', 'pathway_pathway', 'pathway_protein', 'disease_protein'],

    # 5) Anatomy-mediated (tissue-specific)
    ['drug_protein', 'anatomy_protein_present', 'anatomy_protein_present', 'disease_protein'],

    # 6) Mixed / longer pathway patterns
    ['drug_protein', 'protein_protein', 'pathway_protein', 'disease_protein'],
    ['drug_protein', 'pathway_protein', 'pathway_protein', 'pathway_protein', 'disease_protein'],
]

# Echo the configured patterns, numbered from 1
print(f"Defined {len(VALID_METAPATHS)} valid meta-path patterns:")
for i, pattern in enumerate(VALID_METAPATHS, start=1):
    print(f"  {i}. {' → '.join(pattern)}")

In [None]:
def is_valid_metapath(relations, valid_metapaths):
    """Return True if the relation sequence equals one of the valid meta-path patterns."""
    return relations in valid_metapaths


def could_match_metapath(relations, valid_metapaths):
    """Return True if the relation sequence is a prefix of (or equal to) some valid pattern."""
    depth = len(relations)
    # A pattern shorter than `relations` yields a shorter slice and can never compare equal.
    return any(relations == pattern[:depth] for pattern in valid_metapaths)


def metapath_constrained_bfs(source_idx, target_idx, graph, valid_metapaths, max_length=10):
    """
    Find the shortest path that follows a valid meta-path pattern.

    Breadth-first search in which an edge is only followed when the relation
    sequence accumulated so far is still a prefix of at least one pattern.

    Args:
        source_idx: Start node.
        target_idx: Node at which the search may terminate.
        graph: Object exposing neighbors(node) and get_edge_data(u, v);
            every edge data dict must contain a 'relation' entry.
        valid_metapaths: List of allowed relation sequences (lists of str).
        max_length: Maximum number of nodes in a path; a path that already
            has this many nodes is not extended further.

    Returns:
        (path_nodes, path_relations) or ([], []) if no valid path found
    """
    # Queue: (current_node, path_so_far, relations_so_far)
    queue = deque([(source_idx, [source_idx], [])])

    # A search state is the pair (node, relation sequence used to reach it).
    # Two queue entries with the same state have identical futures, so only
    # the first one (the earliest in BFS order) needs to be expanded.
    # Tracking just the last sequence per node would let the same state be
    # re-enqueued whenever a different sequence was recorded in between.
    seen = {(source_idx, ())}

    while queue:
        current_node, path, relations = queue.popleft()

        # Check if we reached target with a complete valid meta-path
        if current_node == target_idx and is_valid_metapath(relations, valid_metapaths):
            return path, relations

        # Stop extending once the path has reached the node limit
        if len(path) >= max_length:
            continue

        # Explore neighbors
        for neighbor in graph.neighbors(current_node):
            new_relation = graph.get_edge_data(current_node, neighbor)['relation']
            new_relations = relations + [new_relation]

            # Only continue if this could still lead to a valid meta-path
            if not could_match_metapath(new_relations, valid_metapaths):
                continue

            state_key = (neighbor, tuple(new_relations))
            if state_key in seen:
                continue

            seen.add(state_key)
            queue.append((neighbor, path + [neighbor], new_relations))

    return [], []  # No valid path found


def run_metapath_algorithm(graph, ground_truth_df, valid_metapaths):
    """
    Apply the meta-path constrained BFS to every ground-truth pathway.

    For each pathway_id the rows are ordered by step_order; the first row
    supplies the source node and the last row the target node.

    Args:
        graph: Graph searched by metapath_constrained_bfs; its nodes must
            carry 'node_id' and 'node_name'.
        ground_truth_df: DataFrame with the columns pathway_id, step_order,
            node_index and node_name.
        valid_metapaths: Allowed relation sequences, passed to the search.

    Returns:
        DataFrame with one row per pathway; list-valued fields are
        comma-joined strings, or 'NONE' (with predicted_length 0) when no
        valid meta-path was found.
    """
    rows = []

    for pathway_id in ground_truth_df['pathway_id'].unique():
        steps = ground_truth_df.loc[ground_truth_df['pathway_id'] == pathway_id].sort_values('step_order')
        first_step = steps.iloc[0]
        last_step = steps.iloc[-1]

        start = int(first_step['node_index'])
        goal = int(last_step['node_index'])

        source_name = first_step['node_name']
        target_name = last_step['node_name']

        gt_path = ' → '.join(steps['node_name'].tolist())

        print(f"\n{pathway_id}: {source_name} → {target_name}")

        # Search for a path that respects one of the allowed patterns
        predicted_path, predicted_relations = metapath_constrained_bfs(
            start, goal, graph, valid_metapaths
        )

        # Start from the "nothing found" record and overwrite on success
        row = {
            'pathway_id': pathway_id,
            'predicted_node_indices': 'NONE',
            'predicted_node_ids': 'NONE',
            'predicted_node_names': 'NONE',
            'predicted_relations': 'NONE',
            'predicted_length': 0,
            'ground_truth_length': len(steps)
        }

        if predicted_path:
            predicted_node_ids = [graph.nodes[idx]['node_id'] for idx in predicted_path]
            predicted_node_names = [graph.nodes[idx]['node_name'] for idx in predicted_path]

            print(f"  ✓ Found valid path: {len(predicted_path)} nodes")
            print(f"  Meta-path: {' → '.join(predicted_relations)}")
            print(f"  Path: {' → '.join(predicted_node_names)}")
            print(f"  Ground truth: {gt_path}")

            row.update({
                'predicted_node_indices': ','.join(map(str, predicted_path)),
                'predicted_node_ids': ','.join(predicted_node_ids),
                'predicted_node_names': ','.join(predicted_node_names),
                'predicted_relations': ','.join(predicted_relations),
                'predicted_length': len(predicted_path),
            })
        else:
            print(f"  ✗ No valid meta-path found")

        rows.append(row)

    return pd.DataFrame(rows)


# Execute the meta-path constrained search over all ground-truth pathways
print("="*60, "Running Meta-Path Constrained Algorithm", "="*60, sep="\n")

mp_predictions = run_metapath_algorithm(
    graph=G,
    ground_truth_df=ground_truth_nodes,
    valid_metapaths=VALID_METAPATHS,
)
# Optional: persist the predictions to disk
# mp_predictions.to_csv('baseline_metapath_predictions.csv', index=False)
# print("\n✓ Saved: baseline_metapath_predictions.csv")

## Algorithm 2: Hub-Penalized Weighted Shortest Path

**Core Idea:** High-degree "hub" nodes (like inflammation markers) connect to everything but don't represent specific mechanisms. Penalize them.

**Weight Formula:** `weight[u,v] = 1 + α * log(degree[v])`

- α = 0.5 is a good default (can be tuned)
- Higher degree → higher weight → less preferred

In [None]:
# ============================================================
# ALGORITHM 2: Hub-Penalized Weighted Shortest Path
# ============================================================


def run_hub_penalized(graph, ground_truth_df, alpha=0.5):
    """
    Evaluate the Hub-Penalized algorithm on every ground-truth pathway.

    Args:
        graph: Graph handed to HubPenalizedShortestPath; the node attributes
            'node_id' and 'node_name' are used when present, otherwise the
            node index itself is used as text.
        ground_truth_df: DataFrame with the columns pathway_id, step_order,
            node_index and node_name.
        alpha: Forwarded unchanged to HubPenalizedShortestPath.

    Returns:
        DataFrame with one row per pathway; list-valued fields are
        comma-joined strings, or 'NONE' (with predicted_length 0) when the
        algorithm returned an empty path.
    """
    rows = []

    # Set up the algorithm once; it is reused for all pathways
    print("Initializing Hub-Penalized algorithm...")
    algo = HubPenalizedShortestPath(graph, alpha=alpha)
    print(f"  Edge weights computed (α={alpha})")

    for pathway_id in ground_truth_df['pathway_id'].unique():
        steps = ground_truth_df.loc[ground_truth_df['pathway_id'] == pathway_id].sort_values('step_order')
        first_step, last_step = steps.iloc[0], steps.iloc[-1]

        start = int(first_step['node_index'])
        goal = int(last_step['node_index'])
        source_name = first_step['node_name']
        target_name = last_step['node_name']

        print(f"\n{pathway_id}: {source_name} → {target_name}")

        # The third element (total path weight) is not used here
        path, relations, _weight = algo.find_path(start, goal)

        # Start from the "nothing found" record and overwrite on success
        row = {
            'pathway_id': pathway_id,
            'predicted_node_indices': 'NONE',
            'predicted_node_ids': 'NONE',
            'predicted_node_names': 'NONE',
            'predicted_relations': 'NONE',
            'predicted_length': 0,
            'ground_truth_length': len(steps)
        }

        if path:
            node_ids = [graph.nodes[idx].get('node_id', str(idx)) for idx in path]
            node_names = [graph.nodes[idx].get('node_name', str(idx)) for idx in path]

            print(f"  ✓ Found path: {len(path)} nodes")
            print(f"  Path: {' → '.join(node_names)}")

            row.update({
                'predicted_node_indices': ','.join(map(str, path)),
                'predicted_node_ids': ','.join(node_ids),
                'predicted_node_names': ','.join(node_names),
                'predicted_relations': ','.join(relations),
                'predicted_length': len(path),
            })
        else:
            print(f"  ✗ No path found")

        rows.append(row)

    return pd.DataFrame(rows)


# Execute Algorithm 2 (Hub-Penalized) over all ground-truth pathways
print("="*60, "Running Hub-Penalized Algorithm", "="*60, sep="\n")

hub_predictions = run_hub_penalized(graph=G, ground_truth_df=ground_truth_nodes, alpha=0.5)
# Optional: persist the predictions to disk
# hub_predictions.to_csv('hub_penalized_predictions.csv', index=False)
# print("\n✓ Saved: hub_penalized_predictions.csv")

In [None]:
#tuning


## Algorithm 3: PageRank-Inverse Weighted Shortest Path

**Core Idea:** PageRank captures global graph centrality. Nodes with HIGH PageRank are generic hubs. We want paths through LOW PageRank (more specific) nodes.

**Weight Formula:** `weight[u,v] = 1 / (1 + pagerank[v])`

- Low PageRank → low weight → preferred
- PageRank is computed once upfront

In [None]:
# ============================================================
# ALGORITHM 3: PageRank-Inverse Weighted Shortest Path
# ============================================================

def run_pagerank_inverse(graph, ground_truth_df, damping=0.85):
    """
    Evaluate the PageRank-Inverse algorithm on every ground-truth pathway.

    Args:
        graph: Graph handed to PageRankInverseShortestPath; the node
            attributes 'node_id' and 'node_name' are used when present,
            otherwise the node index itself is used as text.
        ground_truth_df: DataFrame with the columns pathway_id, step_order,
            node_index and node_name.
        damping: Forwarded unchanged to PageRankInverseShortestPath.

    Returns:
        DataFrame with one row per pathway; list-valued fields are
        comma-joined strings, or 'NONE' (with predicted_length 0) when the
        algorithm returned an empty path.
    """
    collected = []

    # Set up the algorithm once; it is reused for all pathways
    print("Initializing PageRank-Inverse algorithm...")
    algo = PageRankInverseShortestPath(graph, damping=damping)
    print(f"  Edge weights computed")

    pathway_column = ground_truth_df['pathway_id']
    for pathway_id in pathway_column.unique():
        ordered = ground_truth_df[pathway_column == pathway_id].sort_values('step_order')
        head, tail = ordered.iloc[0], ordered.iloc[-1]

        source_idx, target_idx = int(head['node_index']), int(tail['node_index'])
        source_name, target_name = head['node_name'], tail['node_name']

        print(f"\n{pathway_id}: {source_name} → {target_name}")

        # The returned path weight is ignored by this runner
        path, relations, _ = algo.find_path(source_idx, target_idx)

        if not path:
            print(f"  ✗ No path found")
            collected.append({
                'pathway_id': pathway_id,
                'predicted_node_indices': 'NONE',
                'predicted_node_ids': 'NONE',
                'predicted_node_names': 'NONE',
                'predicted_relations': 'NONE',
                'predicted_length': 0,
                'ground_truth_length': len(ordered)
            })
            continue

        node_ids = [graph.nodes[idx].get('node_id', str(idx)) for idx in path]
        node_names = [graph.nodes[idx].get('node_name', str(idx)) for idx in path]

        print(f"  ✓ Found path: {len(path)} nodes")
        print(f"  Path: {' → '.join(node_names)}")

        collected.append({
            'pathway_id': pathway_id,
            'predicted_node_indices': ','.join(str(idx) for idx in path),
            'predicted_node_ids': ','.join(node_ids),
            'predicted_node_names': ','.join(node_names),
            'predicted_relations': ','.join(relations),
            'predicted_length': len(path),
            'ground_truth_length': len(ordered)
        })

    return pd.DataFrame(collected)


# Execute Algorithm 3 (PageRank-Inverse) over all ground-truth pathways
print("="*60, "Running PageRank-Inverse Algorithm", "="*60, sep="\n")

pr_predictions = run_pagerank_inverse(graph=G, ground_truth_df=ground_truth_nodes, damping=0.85)
# Optional: persist the predictions to disk
# pr_predictions.to_csv('pagerank_inverse_predictions.csv', index=False)
# print("\n✓ Saved: pagerank_inverse_predictions.csv")

## Algorithm 4: Learned Embeddings + A* with Supervised Edge Weights

**Core Idea:** Learn from known drug repurposing pathways what makes a "good" edge.

**Two Phases:**
1. **Embed:** Train Node2Vec (or use spectral embeddings) to capture graph structure
2. **Learn:** Train MLP to predict edge goodness from:
   - Embedding similarity
   - Degree features
   - Edge type

**Search:** A* with learned weights + embedding-based heuristic

In [None]:
# ============================================================
# ALGORITHM 4: Learned Embeddings + A* with Supervised Edge Weights
# ============================================================

def run_learned_astar(graph, ground_truth_df, embedding_dim=64):
    """
    Evaluate the Learned Embeddings + A* algorithm on every pathway.

    The ground-truth pathways are first used to train the edge weights and
    are then searched with the trained model.

    Args:
        graph: Graph handed to LearnedEmbeddingsAStar; the node attributes
            'node_id' and 'node_name' are used when present, otherwise the
            node index itself is used as text.
        ground_truth_df: DataFrame with the columns pathway_id, step_order,
            node_index and node_name.
        embedding_dim: Forwarded unchanged to LearnedEmbeddingsAStar.

    Returns:
        DataFrame with one row per pathway; list-valued fields are
        comma-joined strings, or 'NONE' (with predicted_length 0) when the
        algorithm returned an empty path.
    """
    rows = []

    # Set up the algorithm and its embeddings
    print("Initializing Learned Embeddings + A* algorithm...")
    algo = LearnedEmbeddingsAStar(graph, embedding_dim=embedding_dim)
    algo.train_embeddings()

    def ordered_steps(pathway_id):
        # Rows of a single pathway, ordered by step_order
        return ground_truth_df[ground_truth_df['pathway_id'] == pathway_id].sort_values('step_order')

    # One training example per pathway: its ordered list of node indices.
    # NOTE(review): the same pathways are used for training here and for
    # the predictions below, so the scores are not held-out results.
    training_pathways = [
        {'path_nodes': ordered_steps(pathway_id)['node_index'].tolist()}
        for pathway_id in ground_truth_df['pathway_id'].unique()
    ]

    algo.train_edge_weights(training_pathways)

    for pathway_id in ground_truth_df['pathway_id'].unique():
        steps = ordered_steps(pathway_id)
        first_step, last_step = steps.iloc[0], steps.iloc[-1]

        start = int(first_step['node_index'])
        goal = int(last_step['node_index'])
        source_name = first_step['node_name']
        target_name = last_step['node_name']

        print(f"\n{pathway_id}: {source_name} → {target_name}")

        # The returned path weight is ignored by this runner
        path, relations, _weight = algo.find_path(start, goal)

        # Start from the "nothing found" record and overwrite on success
        row = {
            'pathway_id': pathway_id,
            'predicted_node_indices': 'NONE',
            'predicted_node_ids': 'NONE',
            'predicted_node_names': 'NONE',
            'predicted_relations': 'NONE',
            'predicted_length': 0,
            'ground_truth_length': len(steps)
        }

        if path:
            node_ids = [graph.nodes[idx].get('node_id', str(idx)) for idx in path]
            node_names = [graph.nodes[idx].get('node_name', str(idx)) for idx in path]

            print(f"  ✓ Found path: {len(path)} nodes")
            print(f"  Path: {' → '.join(node_names)}")

            row.update({
                'predicted_node_indices': ','.join(map(str, path)),
                'predicted_node_ids': ','.join(node_ids),
                'predicted_node_names': ','.join(node_names),
                'predicted_relations': ','.join(relations),
                'predicted_length': len(path),
            })
        else:
            print(f"  ✗ No path found")

        rows.append(row)

    return pd.DataFrame(rows)


# Execute Algorithm 4 (Learned Embeddings + A*) over all ground-truth pathways
print("="*60, "Running Learned Embeddings + A* Algorithm", "="*60, sep="\n")

learned_predictions = run_learned_astar(graph=G, ground_truth_df=ground_truth_nodes, embedding_dim=64)
# Optional: persist the predictions to disk
# learned_predictions.to_csv('learned_astar_predictions.csv', index=False)
# print("\n✓ Saved: learned_astar_predictions.csv")

## Algorithm 5: Semantic Bridging with Intermediate Node Scoring

**Core Idea:** Use NLP to find paths where consecutive nodes are semantically related (they "make sense" together).

**Weight Formula:** `weight[u,v] = 1 - β * cosine_sim(text_emb[u], text_emb[v])`

- β = 0.3 balances semantic preference with path length
- Uses TF-IDF embeddings (or SciBERT if available)

In [None]:
# ============================================================
# ALGORITHM 5: Semantic Bridging with Intermediate Node Scoring
# ============================================================

def run_semantic_bridging(graph, ground_truth_df, beta=0.3):
    """
    Evaluate the Semantic Bridging algorithm on every ground-truth pathway.

    Args:
        graph: Graph handed to SemanticBridgingPath; the node attributes
            'node_id' and 'node_name' are used when present, otherwise the
            node index itself is used as text.
        ground_truth_df: DataFrame with the columns pathway_id, step_order,
            node_index and node_name.
        beta: Forwarded unchanged to SemanticBridgingPath.

    Returns:
        DataFrame with one row per pathway; list-valued fields are
        comma-joined strings, or 'NONE' (with predicted_length 0) when the
        algorithm returned an empty path.
    """
    collected = []

    # Set up the algorithm: embeddings first, then the edge weights
    print("Initializing Semantic Bridging algorithm...")
    algo = SemanticBridgingPath(graph, beta=beta)
    algo.compute_embeddings()
    algo.compute_edge_weights()

    pathway_column = ground_truth_df['pathway_id']
    for pathway_id in pathway_column.unique():
        ordered = ground_truth_df[pathway_column == pathway_id].sort_values('step_order')
        head, tail = ordered.iloc[0], ordered.iloc[-1]

        source_idx, target_idx = int(head['node_index']), int(tail['node_index'])
        source_name, target_name = head['node_name'], tail['node_name']

        print(f"\n{pathway_id}: {source_name} → {target_name}")

        # The returned path weight is ignored by this runner
        path, relations, _ = algo.find_path(source_idx, target_idx)

        if not path:
            print(f"  ✗ No path found")
            collected.append({
                'pathway_id': pathway_id,
                'predicted_node_indices': 'NONE',
                'predicted_node_ids': 'NONE',
                'predicted_node_names': 'NONE',
                'predicted_relations': 'NONE',
                'predicted_length': 0,
                'ground_truth_length': len(ordered)
            })
            continue

        node_ids = [graph.nodes[idx].get('node_id', str(idx)) for idx in path]
        node_names = [graph.nodes[idx].get('node_name', str(idx)) for idx in path]

        print(f"  ✓ Found path: {len(path)} nodes")
        print(f"  Path: {' → '.join(node_names)}")

        collected.append({
            'pathway_id': pathway_id,
            'predicted_node_indices': ','.join(str(idx) for idx in path),
            'predicted_node_ids': ','.join(node_ids),
            'predicted_node_names': ','.join(node_names),
            'predicted_relations': ','.join(relations),
            'predicted_length': len(path),
            'ground_truth_length': len(ordered)
        })

    return pd.DataFrame(collected)


# Execute Algorithm 5 (Semantic Bridging) over all ground-truth pathways
print("="*60, "Running Semantic Bridging Algorithm", "="*60, sep="\n")

semantic_predictions = run_semantic_bridging(graph=G, ground_truth_df=ground_truth_nodes, beta=0.3)
# Optional: persist the predictions to disk
# semantic_predictions.to_csv('semantic_bridging_predictions.csv', index=False)
# print("\n✓ Saved: semantic_bridging_predictions.csv")

---
## Evaluate Algorithms

Calculate all 9 metrics for every algorithm and compare.

In [None]:
# Predictions to score, keyed by the label used in the evaluation output.
# NOTE(review): learned_predictions (Algorithm 4) is not part of this
# comparison — confirm whether that omission is intentional.
predictions_dict = {
    'Dijkstra': sp_predictions,
    'Meta-Path': mp_predictions,
    'Hub-Penalized': hub_predictions,
    'PageRank-Inverse': pr_predictions,
    'Semantic-Bridging': semantic_predictions,
}

# Show which columns of the edge table are passed to the evaluation
print(list(edges.columns))

results, summary = run_evaluation(
    predictions_dict=predictions_dict,
    ground_truth_nodes=ground_truth_nodes,
    ground_truth_edges=ground_truth_edges,
    edges_df=edges,
)

# Show the summary using the new visualization module
ev.display_summary_table(summary)


# Optional: persist the evaluation output to disk
# results.to_csv('evaluation_results.csv', index=False)
# summary.to_csv('evaluation_summary.csv')


In [None]:
# Grid-search the hyperparameter of each tunable algorithm
best_configs, tuning_results = run_full_grid_search(
    G,                   # NetworkX knowledge graph
    ground_truth_nodes,  # ground-truth pathway nodes
    ground_truth_edges,  # ground-truth pathway edges
    edges,               # PrimeKG edge table
    # Alpha candidates for Hub-Penalized
    hub_alphas=[0.1, 0.3, 0.5, 0.7, 1.0, 1.5],
    # Damping candidates for PageRank-Inverse
    pr_dampings=[0.7, 0.85, 0.9],
    # Beta candidates for Semantic-Bridging
    semantic_betas=[0.1, 0.3, 0.5, 0.7],
)


# Re-run the tunable algorithms with the best configuration found
best_predictions = get_best_predictions(G, ground_truth_nodes, best_configs)

# Start with the algorithms that have nothing to tune, then add the tuned
# ones (Hub-Penalized, PageRank-Inverse, Semantic-Bridging)
predictions_dict = {
    'Dijkstra': sp_predictions,
    'Meta-Path': mp_predictions,
}
predictions_dict.update(best_predictions)

# Final evaluation across all algorithms
results, summary = run_evaluation(
    predictions_dict=predictions_dict,
    ground_truth_nodes=ground_truth_nodes,
    ground_truth_edges=ground_truth_edges,
    edges_df=edges,
)


---
## Results Summary

Compare algorithm performance across all metrics.

In [None]:
# Calculate average metrics per algorithm
metrics = ['precision', 'recall', 'f1_score', 'hits_at_1', 'relation_type_accuracy',
           'hub_node_ratio', 'path_edit_distance', 'path_length_mae']

# The name `all_eval` used here previously is not assigned anywhere in the
# visible notebook; the per-pathway frame returned by run_evaluation is
# `results`.
# NOTE(review): assumes `results` has an 'algorithm' column plus the metric
# columns listed above — confirm against evaluation_runner.
summary = results.groupby('algorithm')[metrics].mean().round(3)

print("="*70)
print("ALGORITHM COMPARISON: Average Metrics Across All Pathways")
print("="*70)
print(summary.T.to_string())
print("\n" + "="*70)

# Count perfect matches for every algorithm that was actually evaluated,
# rather than for a hard-coded list of labels that can drift out of sync
# with the keys of predictions_dict.
print("\nPerfect Matches (Edit Distance = 0):")
for alg, alg_rows in results.groupby('algorithm', sort=False):
    perfect = int((alg_rows['path_edit_distance'] == 0).sum())
    total = len(alg_rows)
    print(f"  {alg}: {perfect}/{total} pathways")