# Import and Load

In [96]:
import pandas as pd
import torch
from torch_geometric.data import Data

In [97]:
# Load the cleaned ICD-11 table (the file name indicates it includes generated descriptions).
# NOTE(review): absolute, machine-specific path; this cell will not run on another machine as written.
df = pd.read_csv('/Users/marco/Documents/python_projects/vector-database-ICD/Data/icd11-25_data_clean_with_generated_descriptions.csv')

In [98]:
# List the columns available in the loaded table.
df.columns

Index(['id', 'code', 'title', 'browser_url', 'class_kind', 'definition',
       'parent', 'inclusions', 'foundation_children',
       'foundation_child_references', 'index_terms', 'related_entities',
       'full_text', 'children', 'postcoordination_scales',
       'index_term_references', 'exclusions', 'exclusion_references',
       'fully_specified_name', 'generated_description', 'chapter'],
      dtype='object')

In [104]:
# Narrow the frame to identifiers, chapter, title, the hierarchy columns
# ('parent', 'children') and the generated description text.
df = df[['id', 'code', 'chapter', 'title',  'parent','children', 'generated_description']]

# Graph Creation

## Simple 1-chapter graph

In [27]:
def create_graph_from_chapter(df, chapter_num=1, directed=True):
    """Build a PyG graph of the parent/child hierarchy for a single chapter.

    Nodes are the unique values of the 'id' column among rows whose 'chapter'
    equals `chapter_num`. Edges always point parent -> child and are collected
    from two sources: the row's 'parent' value and the ';'-separated ids in the
    row's 'children' value. Relations whose other endpoint is not a node of
    this chapter are skipped. Each distinct edge is stored once, even when both
    columns describe it.

    Args:
        df: DataFrame with at least the columns 'id', 'chapter', 'parent'
            and 'children'.
        chapter_num: Value of the 'chapter' column to keep.
        directed: If False, the reverse (child -> parent) edge is added for
            every edge.

    Returns:
        A tuple `(graph, id_to_idx)` where `graph` is a `Data` object with
        identity-matrix placeholder features `x` and an `edge_index` of shape
        [2, num_edges], and `id_to_idx` maps each original id to its node index.
    """
    # Filter for specific chapter (read-only, so no defensive copy is needed)
    chapter_df = df[df['chapter'] == chapter_num]

    # Create node mapping (id -> index)
    unique_ids = chapter_df['id'].unique()
    id_to_idx = {id_: idx for idx, id_ in enumerate(unique_ids)}
    # Tokens split out of the 'children' string are always str, while ids may be
    # numeric; a string-keyed view lets those tokens resolve to their nodes.
    str_to_idx = {str(id_): idx for id_, idx in id_to_idx.items()}

    def resolve(key):
        """Return the node index for `key`, or None if it is not a node here."""
        idx = id_to_idx.get(key)
        if idx is None:
            idx = str_to_idx.get(str(key).strip())
        return idx

    # dict used as an insertion-ordered set so each edge appears only once
    edges = {}

    def add_edge(src, dst):
        """Record src -> dst, plus dst -> src when the graph is undirected."""
        edges[(src, dst)] = None
        if not directed:
            edges[(dst, src)] = None

    for _, row in chapter_df.iterrows():
        node_idx = id_to_idx[row['id']]

        # parent -> this node, taken from the 'parent' column
        if pd.notna(row['parent']):
            parent_idx = resolve(row['parent'])
            if parent_idx is not None:
                add_edge(parent_idx, node_idx)

        # this node -> child, taken from the 'children' column
        if pd.notna(row['children']):
            for token in str(row['children']).split(';'):
                if not token.strip():
                    continue
                child_idx = resolve(token)
                if child_idx is not None:
                    add_edge(node_idx, child_idx)

    # Convert to PyG format; keep the [2, 0] shape when there are no edges
    # (torch.tensor([]) would otherwise yield a 1-D tensor of shape [0]).
    if edges:
        edge_index = torch.tensor(list(edges), dtype=torch.long).t().contiguous()
    else:
        edge_index = torch.empty((2, 0), dtype=torch.long)

    # Placeholder node features: one-hot encoding of the node position.
    # NOTE: this is a dense num_nodes x num_nodes matrix, so memory grows
    # quadratically with the number of nodes.
    num_nodes = len(unique_ids)
    node_features = torch.eye(num_nodes)

    # Create PyG Data object
    graph = Data(
        x=node_features,
        edge_index=edge_index
    )

    return graph, id_to_idx

In [28]:
# Build the chapter-1 graph (directed by default) and keep the id -> node-index mapping.
simple_graph, simple_mapping = create_graph_from_chapter(df, chapter_num=1)

## Full graph

In [59]:
def create_full_icd_graph(df, directed=True):
    """Build a PyG graph of the parent/child hierarchy over every row of `df`.

    Nodes are the unique values of the 'id' column. Edges always point
    parent -> child and are collected from the row's 'parent' value and from
    the ';'-separated ids in the row's 'children' value. Relations whose other
    endpoint is not a known id are skipped, and each distinct edge is stored
    once even when both columns describe it. Edges are ordered by source node.

    Args:
        df: DataFrame with at least the columns 'id', 'chapter', 'parent'
            and 'children'.
        directed: If False, the reverse (child -> parent) edge is added for
            every edge.

    Returns:
        A tuple `(graph, id_to_idx)` where `graph` is a `Data` object with
        identity-matrix placeholder features `x`, an `edge_index` of shape
        [2, num_edges] and a per-node `chapter` tensor, and `id_to_idx` maps
        each original id to its node index.
    """
    # Create node mapping (id -> index) for all nodes
    unique_ids = df['id'].unique()
    id_to_idx = {id_: idx for idx, id_ in enumerate(unique_ids)}
    # Tokens split out of the 'children' string are always str, while ids may be
    # numeric; a string-keyed view lets those tokens resolve to their nodes.
    str_to_idx = {str(id_): idx for id_, idx in id_to_idx.items()}

    def resolve(key):
        """Return the node index for `key`, or None if it is not a known id."""
        idx = id_to_idx.get(key)
        if idx is None:
            idx = str_to_idx.get(str(key).strip())
        return idx

    # dict used as an insertion-ordered set so each edge appears only once
    edges = {}

    def add_edge(src, dst):
        """Record src -> dst, plus dst -> src when the graph is undirected."""
        edges[(src, dst)] = None
        if not directed:
            edges[(dst, src)] = None

    # Track chapter information for each node
    node_chapters = torch.zeros(len(unique_ids), dtype=torch.long)

    # Process all rows to create edges
    for _, row in df.iterrows():
        # Store chapter information (for a repeated id, the last row wins)
        node_idx = id_to_idx[row['id']]
        node_chapters[node_idx] = row['chapter']

        # parent -> this node, taken from the 'parent' column
        if pd.notna(row['parent']):
            parent_idx = resolve(row['parent'])
            if parent_idx is not None:
                add_edge(parent_idx, node_idx)

        # this node -> child, taken from the 'children' column
        if pd.notna(row['children']):
            for token in str(row['children']).split(';'):
                if not token.strip():
                    continue
                child_idx = resolve(token)
                if child_idx is not None:
                    add_edge(node_idx, child_idx)

    # Convert to PyG format. Without edges, torch.tensor([]) is 1-D and
    # indexing row 0 for the sort would fail, so build the [2, 0] shape directly.
    if edges:
        edge_index = torch.tensor(list(edges), dtype=torch.long).t().contiguous()
        # Order edges by source node index
        edge_index = edge_index[:, edge_index[0].argsort()]
    else:
        edge_index = torch.empty((2, 0), dtype=torch.long)

    # Placeholder one-hot features.
    # NOTE: dense num_nodes x num_nodes matrix; memory grows quadratically.
    node_features = torch.eye(len(unique_ids))

    graph = Data(
        x=node_features,
        edge_index=edge_index,
        chapter=node_chapters,
    )

    return graph, id_to_idx

def analyze_graph(graph, id_to_idx):
    """Print basic statistics of a graph and report its inter-chapter edges.

    Args:
        graph: Object exposing `num_nodes`, `num_edges`, an `edge_index`
            tensor of shape [2, num_edges] and a per-node `chapter` tensor.
        id_to_idx: Mapping from original id to node index. Not used by the
            analysis; kept so existing callers keep working.

    Returns:
        A dict with 'num_nodes', 'num_edges' and 'inter_chapter_connections',
        the latter being a list with one `(source_chapter, target_chapter)`
        tuple per edge whose endpoints lie in different chapters, in edge order.
    """
    # Basic statistics
    print(f"Number of nodes: {graph.num_nodes}")
    print(f"Number of edges: {graph.num_edges}")

    # Analyze inter-chapter connections with tensor indexing instead of a
    # per-edge Python loop.
    edge_index = graph.edge_index
    if edge_index.numel() == 0:
        # Also covers a malformed 1-D empty edge_index, which cannot be row-indexed
        edge_chapters = []
    else:
        source_chapters = graph.chapter[edge_index[0]]
        target_chapters = graph.chapter[edge_index[1]]
        crossing = source_chapters != target_chapters
        edge_chapters = list(zip(
            source_chapters[crossing].tolist(),
            target_chapters[crossing].tolist(),
        ))

    print(f"\nNumber of inter-chapter connections: {len(edge_chapters)}")
    if edge_chapters:
        print("Inter-chapter connections (chapter pairs):")
        for source_ch, target_ch in set(edge_chapters):
            print(f"Chapter {source_ch} → Chapter {target_ch}")

    return {
        'num_nodes': graph.num_nodes,
        'num_edges': graph.num_edges,
        'inter_chapter_connections': edge_chapters
    }

In [69]:
# Build the full (directed) graph over every row of df.
# `id_to_edx` maps original id -> node index; `edx_to_id` is its inverse.
graph, id_to_edx = create_full_icd_graph(df)
edx_to_id = {v: k for k, v in id_to_edx.items()}

In [77]:
# Show the first 10 rows that have no 'children' value (presumably leaf entries; confirm).
df[df['children'].isna()].head(10)

Unnamed: 0,id,code,chapter,title,parent,children,generated_description
0,1937339080,1C22,1,Infections due to Chlamydia psittaci,1127435854,,"Infections due to Chlamydia psittaci, also kno..."
3,328097188,1A36.12,1,Cutaneous amoebiasis,1777228366,,Cutaneous amoebiasis is a parasitic infection ...
5,1056849595,1B13.0,1,Acute miliary tuberculosis of a single specifi...,861638547,,Acute miliary tuberculosis of a single specifi...
6,181304776,1D01.2,1,Parasitic or protozoal meningitis,121670633,,Parasitic or protozoal meningitis is a rare an...
8,813571137,1F66.4,1,Subcutaneous dirofilariasis,1975325075,,Subcutaneous dirofilariasis is a parasitic inf...
9,51885381,1D85.1,1,Acute viral carditis,874478433,,Acute viral carditis is an inflammation of the...
10,1356928923,1D42,1,O'nyong-nyong fever,921595235,,O'nyong-nyong fever is a viral illness caused ...
12,1342682193,1G01.3,1,Cutaneous myiasis,1367149207,,Cutaneous myiasis is a parasitic skin infestat...
13,1761012301,1D04.3,1,Intraspinal subdural granuloma,2108355318,,Intraspinal subdural granuloma is a rare condi...
15,1373005257,1F66.2,1,Filariasis due to Brugia species,1975325075,,Filariasis due to Brugia species is a parasiti...


In [79]:
def check_connection(edge_index, source, target):
    """Return True if `edge_index` contains the directed edge source -> target.

    `edge_index` is indexed as two rows: row 0 holds source node indices and
    row 1 holds the matching target node indices.
    """
    wanted = (source, target)
    sources = edge_index[0].tolist()
    targets = edge_index[1].tolist()
    # Walk the (source, target) pairs and stop at the first match
    return any(pair == wanted for pair in zip(sources, targets))

# Node indices of two ids taken from the rows listed above; both rows have no
# 'children' value and different 'parent' values.
out = id_to_edx[1937339080]
inc = id_to_edx[813571137]

# Check if connection exists
connection_exists = check_connection(graph.edge_index, out, inc)
print(f"Connection {out}-->{inc} exists: {connection_exists}")

Connection 0-->8 exists: False


In [85]:
# Display the Data summary (shapes of x, edge_index and chapter).
graph

Data(x=[13960, 13960], edge_index=[2, 13934], chapter=[13960])

# Node Embeddings

In [80]:
# Load the precomputed embedding vectors (file name suggests GatorTron embeddings; confirm).
# The path is relative, so it resolves against the notebook's working directory.
embeddings_df = pd.read_csv('icd11_gatortron_vectors_no_coma.csv')

In [83]:
# Check (rows, columns) of the embeddings table.
# NOTE(review): the recorded output shows 13062 rows while the graph has 13960 nodes,
# so not every node has a vector; confirm alignment before using these as node features.
embeddings_df.shape

(13062, 2)

# Self-supervised learning


While link prediction is a valid starting point, for your goal of clustering similar diseases based on augmented text embeddings within the ICD hierarchy, other graph learning tasks might be more directly aligned and potentially more effective:

1.  **Contrastive Learning (e.g., GRACE, GraphCL, or custom variants):**
    *   **How it works:** These methods learn node representations by maximizing the agreement between different "views" of the same node (or its local neighborhood) and minimizing agreement with other nodes. Views can be created by augmenting the graph (e.g., dropping edges/nodes, masking features) or by sampling positive/negative pairs based on structural relationships.
    *   **Why it might be better for your context:**
        *   **Flexible Definition of Similarity:** You can define "positive pairs" (nodes that should have similar embeddings) more broadly than just directly linked nodes. For example:
            *   Nodes sharing the same parent (siblings).
            *   Nodes within a certain distance in the hierarchy (e.g., sharing a common grandparent).
            *   Nodes belonging to the same sub-category.
        *   **Robust Representations:** By learning to be invariant to certain perturbations or by contrasting against diverse negative samples, the GNN can learn more robust and generalizable features that capture deeper structural similarity.
        *   **Directly Optimizes for Embedding Similarity:** The core objective is to make embeddings of "similar" nodes close and "dissimilar" nodes far apart, which is ideal for downstream clustering.

2.  **Graph Autoencoders (GAE) / Variational Graph Autoencoders (VGAE) with a focus on embedding quality:**
    *   **How it works:** A GNN encoder maps nodes to a low-dimensional latent space (your augmented embeddings). A decoder then tries to reconstruct the original graph structure (e.g., the adjacency matrix) from these latent embeddings.
    *   **Why it might be better for your context:**
        *   **Embedding-Centric:** The primary output is the learned node embeddings. While the reconstruction task is often link prediction, the model is trained to produce embeddings that *encode* the structural information necessary for this reconstruction.
        *   **Implicit Clustering:** The latent space learned by GAEs/VGAEs often exhibits good clustering properties because nodes that are structurally similar (and thus co-occur in similar patterns of links) will be mapped to nearby points in the latent space.

**Why these could be more suitable than simple link prediction alone:**

*   **Beyond Direct Links:** Simple link prediction primarily focuses on 1-hop connectivity. For clustering "similar" diseases, you often care about broader relationships (e.g., diseases in the same family, even if not directly parent-child). Contrastive methods and GAEs can capture these higher-order proximities more effectively.
*   **Alignment with Clustering Goal:** Contrastive learning directly optimizes for a similarity metric in the embedding space. GAEs aim to find a compact representation that preserves structural information, which is also beneficial for clustering.

**Recommendation:**

Consider exploring **contrastive learning approaches where you strategically define positive pairs based on the ICD hierarchy.** For example, treating all codes under a specific parent node as positive examples for each other (or for a prototype of that parent node) could directly encourage the GNN to learn embeddings that group hierarchically related diseases. This aligns more closely with forming meaningful clusters based on the inherent structure of your data than just predicting immediate connections.
