In [1]:
import rdflib
import networkx as nx
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv
from torch_geometric.data import Data
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from SPARQLWrapper import SPARQLWrapper, JSON
import re
from scholarly import scholarly


  Referenced from: <084F0101-0C02-3262-85FB-B16F3CD9274E> /Users/mariachristina/anaconda3/lib/python3.11/site-packages/torch_scatter/_version_cpu.so
  Expected in:     <D400622C-0C6B-3AE1-AB45-F1D0BF19B384> /Users/mariachristina/anaconda3/lib/python3.11/site-packages/torch/lib/libtorch_cpu.dylib
  Referenced from: <1BBCA6F9-4348-38E7-BE49-97514DC7CE1C> /Users/mariachristina/anaconda3/lib/python3.11/site-packages/torch_sparse/_version_cpu.so
  Expected in:     <D400622C-0C6B-3AE1-AB45-F1D0BF19B384> /Users/mariachristina/anaconda3/lib/python3.11/site-packages/torch/lib/libtorch_cpu.dylib
  from pandas.core import (


In [2]:
class GraphLinkPredictor:

    """
    Link prediction on RDF graphs using precomputed node embeddings and
    graph neural network (GNN) models.

    On construction the RDF files are parsed and merged, converted into an
    undirected NetworkX graph restricted to the selected predicates, and
    paired with the precomputed node embeddings to build a PyTorch Geometric
    ``Data`` object.

    Attributes:
    -----------
    rdf_files (list):
        A list of RDF files to be loaded and processed.
    edge_label_mapping (collection):
        The RDF predicate URIs (as strings) that become edges. Only membership
        (``in``) is tested, so a set or a dict keyed by predicate both work.
    embeddings_file (str):
        Path to the ``.npy`` file containing precomputed node embeddings.
    mapping_file (str):
        Path to the tab-separated ``node<TAB>index`` mapping file.
    graphs (list):
        The RDFLib Graph objects parsed from the RDF files.
    g (rdflib.Graph):
        A combined RDFLib Graph containing data from all provided RDF files.
    G (networkx.Graph):
        A NetworkX graph created from the RDFLib Graph.
    embeddings (numpy.ndarray):
        Loaded embeddings corresponding to the graph nodes.
    node_to_idx (dict):
        A mapping from node URIs to their respective embedding indices.
    idx_to_node (dict):
        A mapping from embedding indices to their respective node URIs.
    x (torch.Tensor):
        A float tensor holding the node embeddings.
    edge_index (torch.Tensor):
        A ``[2, num_edges]`` long tensor representing edges in the graph.
    data (torch_geometric.data.Data):
        A PyTorch Geometric Data object containing node features and edges.
    train_edges (list):
        A list of edges used for training.
    test_edges (list):
        A list of edges used for testing.
    negative_train_edges (list):
        A list of non-existent edges used as negative examples during training.
    negative_test_edges (list):
        A list of non-existent edges used as negative examples during testing.
    train_edge_index (torch.Tensor):
        A tensor representing the edges used in training.
    train_edge_labels (torch.Tensor):
        A tensor representing the labels (existence or non-existence) of training edges.
    train_losses (list):
        Training loss values, one per epoch.
    test_losses (list):
        Test loss values, one per epoch.
    model (torch.nn.Module):
        The GNN model being trained.
    criterion (torch.nn.Module):
        The loss function used during training.
    optimizer (torch.optim.Optimizer):
        The optimizer used to update model parameters during training.
    """

    def __init__(self, rdf_files, edge_label_mapping, embeddings_file, mapping_file):

        """
        Initializes the GraphLinkPredictor and eagerly loads the graphs,
        embeddings and tensors.

        Parameters:
        -----------
        rdf_files (list):
            A list of RDF file paths.
        edge_label_mapping (collection):
            The RDF predicate URIs to keep as edges.
        embeddings_file (str):
            Path to the node embeddings file.
        mapping_file (str):
            Path to the node-to-index mapping file.
        """

        self.rdf_files = rdf_files
        self.edge_label_mapping = edge_label_mapping
        self.embeddings_file = embeddings_file
        self.mapping_file = mapping_file
        self.graphs = []
        self.load_graphs()
        self.create_networkx_graph()
        self.load_embeddings_and_mappings()
        self.create_tensors()

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _edge_index_tensor(self, edges):

        """
        Converts an iterable of (source, target) node pairs into a
        ``[2, num_edges]`` long tensor of embedding indices.

        An empty edge list yields a ``[2, 0]`` tensor so that downstream
        indexing (``edge_label_index[0]``) keeps working.

        Raises:
        -----------
        KeyError:
            If a node is missing from ``node_to_idx``.
        """

        pairs = [[self.node_to_idx[src], self.node_to_idx[dst]] for src, dst in edges]
        if not pairs:
            return torch.empty((2, 0), dtype=torch.long)
        return torch.tensor(pairs, dtype=torch.long).t().contiguous()

    def _test_edge_tensors(self):

        """
        Builds the index and label tensors for the test set: positive test
        edges (label 1) followed by negative test edges (label 0).
        """

        edge_index = self._edge_index_tensor(self.test_edges + self.negative_test_edges)
        labels = torch.tensor(
            [1] * len(self.test_edges) + [0] * len(self.negative_test_edges),
            dtype=torch.float,
        )
        return edge_index, labels

    def _score(self, edge_label_index):

        """
        Runs the current model on the given candidate edges and returns a
        1-D tensor of raw scores (logits), one per edge.
        """

        # reshape(-1) guards against models that squeeze a single-edge batch
        # down to a 0-d tensor; it is a no-op for the usual [num_edges] output.
        return self.model(self.data, self.edge_index, edge_label_index).reshape(-1)

    def _predict_test_set(self):

        """
        Scores the test set with the current model in eval mode.

        Returns the tuple (labels, probabilities, hard predictions), where
        predictions use a 0.5 probability threshold.
        """

        self.model.eval()
        test_edge_index, test_edge_labels = self._test_edge_tensors()
        with torch.no_grad():
            test_probs = torch.sigmoid(self._score(test_edge_index))
            test_preds = (test_probs >= 0.5).float()
        return test_edge_labels, test_probs, test_preds

    # ------------------------------------------------------------------
    # Data loading
    # ------------------------------------------------------------------

    def load_graphs(self):

        """
        Loads RDF files into RDFLib Graph objects and combines them into a single graph.
        """

        self.graphs = []
        self.g = rdflib.Graph()
        for rdf_file in self.rdf_files:
            graph = rdflib.Graph()
            graph.parse(rdf_file, format="xml")
            self.graphs.append(graph)
            self.g += graph

    def create_networkx_graph(self):

        """
        Converts the combined RDFLib Graph into an undirected NetworkX graph,
        keeping only triples whose predicate is in ``edge_label_mapping``.
        """

        self.G = nx.Graph()
        for subj, pred, obj in self.g:
            relation = str(pred)
            if relation in self.edge_label_mapping:
                self.G.add_edge(str(subj), str(obj), relation=relation)

    def load_embeddings_and_mappings(self):

        """
        Loads precomputed node embeddings from a file and constructs mappings between
        nodes and their corresponding embedding indices.

        The mapping file holds one ``node<TAB>index`` pair per line; blank
        lines are ignored.
        """

        self.embeddings = np.load(self.embeddings_file)
        self.node_to_idx = {}
        self.idx_to_node = {}
        with open(self.mapping_file, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank/trailing lines instead of failing to unpack.
                    continue
                node, idx = line.split('\t')
                self.node_to_idx[node] = int(idx)
                self.idx_to_node[int(idx)] = node
        self.x = torch.tensor(self.embeddings, dtype=torch.float)

    def create_tensors(self):

        """
        Creates PyTorch tensors for graph edges and prepares data for model training.
        """

        self.edge_index = self._edge_index_tensor(self.G.edges())
        self.data = Data(x=self.x, edge_index=self.edge_index)

    def load_edge_data(self, files):

        """
        Loads training and test edges, along with their corresponding negative examples,
        from provided files.

        Positive edges that are not present in the NetworkX graph are dropped.

        Parameters:
        -----------
        files (dict):
            A dictionary with the keys 'train_edges', 'test_edges',
            'negative_train_edges' and 'negative_test_edges', each mapping to
            the path of a pickle file.
        """

        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        loaded = {}
        for key in ('train_edges', 'test_edges', 'negative_train_edges', 'negative_test_edges'):
            with open(files[key], 'rb') as f:
                loaded[key] = pickle.load(f)

        self.negative_train_edges = loaded['negative_train_edges']
        self.negative_test_edges = loaded['negative_test_edges']
        self.train_edges = [(u, v) for u, v in loaded['train_edges'] if self.G.has_edge(u, v)]
        self.test_edges = [(u, v) for u, v in loaded['test_edges'] if self.G.has_edge(u, v)]

        self.train_edge_index = self._edge_index_tensor(self.train_edges + self.negative_train_edges)
        self.train_edge_labels = torch.tensor(
            [1] * len(self.train_edges) + [0] * len(self.negative_train_edges),
            dtype=torch.float,
        )

    # ------------------------------------------------------------------
    # Training / evaluation
    # ------------------------------------------------------------------

    def train_model(self, model, criterion, optimizer, epochs=100):

        """
        Trains the GNN model on the provided graph data using the specified loss function
        and optimizer, recording train and test loss after every epoch.

        Parameters:
        -----------
        model (torch.nn.Module):
            The GNN model to be trained.
        criterion (torch.nn.Module):
            The loss function used during training.
        optimizer (torch.optim.Optimizer):
            The optimizer used to update model parameters.
        epochs (int):
            Number of training epochs (default is 100).
        """

        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        self.train_losses = []
        self.test_losses = []

        # The test set does not change between epochs, so build it once.
        test_edge_index, test_edge_labels = self._test_edge_tensors()

        for epoch in range(epochs):
            self.model.train()
            self.optimizer.zero_grad()
            train_scores = self._score(self.train_edge_index)
            train_loss = self.criterion(train_scores, self.train_edge_labels)
            train_loss.backward()
            self.optimizer.step()
            self.train_losses.append(train_loss.item())

            self.model.eval()
            with torch.no_grad():
                test_scores = self._score(test_edge_index)
                test_loss = self.criterion(test_scores, test_edge_labels)
            self.test_losses.append(test_loss.item())

            if (epoch + 1) % 10 == 0:
                print(f'Epoch {epoch+1}, Training Loss: {train_loss.item()}, Test Loss: {test_loss.item()}')

    def evaluate_model(self):

        """
        Evaluates the trained model on the test data, printing the accuracy, classification
        report, and plotting the confusion matrix.
        """

        test_edge_labels, _, test_preds = self._predict_test_set()
        y_true = test_edge_labels.numpy()
        y_pred = test_preds.numpy()

        cm = confusion_matrix(y_true, y_pred)
        accuracy = accuracy_score(y_true, y_pred)
        class_report = classification_report(y_true, y_pred)

        print(f"Accuracy: {accuracy}")
        print("Classification Report:")
        print(class_report)

        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt="d")
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.title('Confusion Matrix')
        plt.show()

    def predict_new_links(self, new_edges):

        """
        Predicts the existence of new links (edges) in the graph using the trained model.

        Parameters:
        -----------
        new_edges (list):
            A list of new edges to predict, where each edge is a tuple of (source_node, target_node).

        Returns:
        -----------
        torch.Tensor:
            A 1-D tensor of raw scores (logits), one per edge; apply
            ``torch.sigmoid`` to obtain probabilities.
        """

        edge_indices = self._edge_index_tensor(new_edges)
        self.model.eval()
        with torch.no_grad():
            scores = self._score(edge_indices)
        return scores

    # ------------------------------------------------------------------
    # Misclassified-edge utilities
    # ------------------------------------------------------------------

    def format_url(self, entity_type, entity_id):

        """
        Formats a URL for a given entity based on its type and identifier.

        Parameters:
        -----------
        entity_type (str):
            The type of the entity (e.g., 'sideeffect').
        entity_id (str):
            The identifier of the entity.

        Returns:
        -----------
        str:
            The formatted URL. The 'sideeffect' type is placed under the
            'side_effect' path segment; every other type is used verbatim.
        """

        if entity_type == 'sideeffect':
            return f"http://erias.fr/oregano/side_effect/{entity_id}"
        else:
            return f"http://erias.fr/oregano/{entity_type}/{entity_id}"

    def read_misclassified_edges(self, file_path):

        """
        Reads a CSV file of predictions and returns the rows whose actual and
        predicted labels differ, with node identifiers expanded to full URLs.

        The CSV must contain the columns 'from', 'to', 'actual label',
        'predicted label' and 'score'. The entity type of a node is taken to
        be the part of its identifier before the first underscore.

        Parameters:
        -----------
        file_path (str):
            Path to the CSV file containing predicted edges.

        Returns:
        -----------
        list:
            A list of dictionaries, where each dictionary contains the 'edge', 'actual_label', 'predicted_label',
            and 'score'.
        """

        df = pd.read_csv(file_path)
        misclassified_edges = df[df['actual label'] != df['predicted label']]
        new_edges = []
        for _, row in misclassified_edges.iterrows():
            from_url = self.format_url(row['from'].split('_')[0], row['from'])
            to_url = self.format_url(row['to'].split('_')[0], row['to'])
            new_edges.append({
                'edge': (from_url, to_url),
                'actual_label': row['actual label'],
                'predicted_label': row['predicted label'],
                'score': row['score']
            })
        return new_edges

    def create_results_df(self, new_edges, scores_prob):

        """
        Creates a DataFrame containing the results of predictions for misclassified edges.

        Parameters:
        -----------
        new_edges (list):
            A list of misclassified edges formatted as dictionaries.
        scores_prob (torch.Tensor):
            The predicted probabilities for the edges, one per entry of
            ``new_edges``. A 0-d tensor is accepted for a single edge.

        Returns:
        -----------
        pandas.DataFrame:
            A DataFrame containing the prediction results.
        """

        # Flatten first: iterating a 0-d tensor (single edge) raises TypeError.
        gnn_scores = scores_prob.reshape(-1).tolist()
        results_df = pd.DataFrame({
            'From': [self.extract_name(item['edge'][0]) for item in new_edges],
            'To': [self.extract_name(item['edge'][1]) for item in new_edges],
            'Actual Label': [item['actual_label'] for item in new_edges],
            'Predicted Label Node2Vec': [item['predicted_label'] for item in new_edges],
            'Node2Vec Score': [item['score'] for item in new_edges],
            'GNN Score': gnn_scores
        })
        return results_df

    def extract_name(self, uri):

        """
        Returns the last '/'-separated segment of a URI.
        """

        return uri.split('/')[-1]

    def train_and_evaluate_all_models(self, models_config, epochs=100,
                                      csv_file_path='predicted_edges_bestParams.csv'):

        """
        Trains and evaluates multiple models as specified in the models_config.
        Combines the results into a single DataFrame.

        Each model is trained with ``BCEWithLogitsLoss`` and Adam (lr=0.01),
        evaluated on the test set, and then used to re-score the misclassified
        edges listed in ``csv_file_path``.

        Parameters:
        -----------
        models_config (list):
            A list of tuples containing model information in the format (model_name, gnn_model, link_predictor).
        epochs (int):
            Number of training epochs for each model (default is 100).
        csv_file_path (str):
            CSV of earlier predictions to re-score
            (default is 'predicted_edges_bestParams.csv').

        Returns:
        -----------
        pandas.DataFrame:
            One row per (From, To, Actual Label) with one score column per model.

        Raises:
        -----------
        ValueError:
            If ``models_config`` is empty.
        """

        models_config = list(models_config)
        if not models_config:
            raise ValueError("models_config is empty: no models to train and evaluate")

        # The misclassified edges are the same for every model, so read them once.
        misclassified_edges = self.read_misclassified_edges(csv_file_path)
        candidate_edges = [item['edge'] for item in misclassified_edges]

        results = []
        for model_name, gnn, link_predictor in models_config:
            print(f"Training and evaluating {model_name}...")

            # Fresh loss and optimizer per model; the link predictor's
            # parameters include those of the wrapped GNN.
            criterion = nn.BCEWithLogitsLoss()
            optimizer = torch.optim.Adam(link_predictor.parameters(), lr=0.01)

            self.train_model(link_predictor, criterion, optimizer, epochs)
            self.evaluate_model()

            scores_prob_all = torch.sigmoid(self.predict_new_links(candidate_edges))

            results_df = self.create_results_df(misclassified_edges, scores_prob_all)
            results_df['Model'] = model_name
            results_df['Score'] = scores_prob_all.numpy()  # Add model scores as a separate column
            results.append(results_df)

        # Combine all results into a single DataFrame
        sum_df = pd.concat(results, ignore_index=True)

        # Pivot the DataFrame to have a separate column for each model's scores
        sum_df_pivot = sum_df.pivot_table(index=['From', 'To', 'Actual Label'], columns='Model', values='Score', aggfunc='mean').reset_index()

        return sum_df_pivot

    def print_misclassified_edges(self):

        """
        Identifies and returns misclassified edges from the test set, along with their
        actual and predicted labels and prediction scores.

        Despite its name, this method prints nothing; it returns the DataFrame.

        Returns:
        -----------
        pandas.DataFrame:
            A DataFrame containing the misclassified edges and their details.
        """

        test_edge_labels, test_probs, test_preds = self._predict_test_set()

        misclassified_indices = (test_preds != test_edge_labels).nonzero(as_tuple=True)[0]
        # Test tensors are ordered positives first, then negatives.
        all_test_edges = self.test_edges + self.negative_test_edges
        misclassified_edges = [all_test_edges[i] for i in misclassified_indices.tolist()]

        df = pd.DataFrame({
            'From': [self.extract_name(edge[0]) for edge in misclassified_edges],
            'To': [self.extract_name(edge[1]) for edge in misclassified_edges],
            'Actual Label': test_edge_labels[misclassified_indices].tolist(),
            'Predicted Label': test_preds[misclassified_indices].tolist(),
            'Score': test_probs[misclassified_indices].tolist()
        })
        return df
    
    


# GNN1

In [3]:
# Define the GNN model
class GNN1(nn.Module):

    """
    Two-layer Graph Convolutional Network (GCN) node encoder.

    Node features are first projected to 16 dimensions, passed through ReLU
    and dropout, and then projected to the final 8-dimensional embedding.

    Attributes:
        conv1 (GCNConv): Input layer, ``input_dim -> 16``.
        conv2 (GCNConv): Output layer, ``16 -> 8``.
    """

    def __init__(self, input_dim):

        """
        Builds the two GCN layers.

        Args:
            input_dim (int): Number of input features per node.
        """

        super().__init__()
        self.conv1 = GCNConv(input_dim, 16)
        self.conv2 = GCNConv(16, 8)

    def forward(self, x, edge_index):

        """
        Encodes every node of the graph.

        Args:
            x (torch.Tensor): Node feature matrix with shape [num_nodes, input_dim].
            edge_index (torch.Tensor): Graph connectivity with shape [2, num_edges].

        Returns:
            torch.Tensor: Node embeddings produced by the second GCN layer.
        """

        hidden = self.conv1(x, edge_index)
        hidden = F.relu(hidden)
        # Dropout is only active while the module is in training mode.
        hidden = F.dropout(hidden, training=self.training)
        return self.conv2(hidden, edge_index)

# Define the LinkPredictor model
class LinkPredictorGNN1(nn.Module):

    """
    A link prediction head on top of a GNN node encoder (by default GNN1).

    The encoder produces one embedding per node; for every candidate edge the
    embeddings of its two endpoints are concatenated and passed through a
    single linear layer that outputs one raw score (logit) per edge.

    Attributes:
        gnn (nn.Module): The GNN model used to generate node embeddings.
        linear (nn.Linear): Maps the concatenated endpoint embeddings
                            (``2 * embedding_dim``) to a scalar score.
    """

    def __init__(self, gnn, embedding_dim=8):

        """
        Initializes the link predictor with a given GNN encoder.

        Args:
            gnn (nn.Module): The GNN model that generates node embeddings.
            embedding_dim (int): Width of the embeddings returned by ``gnn``
                                 (default 8, the output width of GNN1).
        """

        super().__init__()
        self.gnn = gnn
        self.linear = nn.Linear(embedding_dim * 2, 1)

    def forward(self, data, edge_index, edge_label_index):

        """
        Scores the candidate edges.

        Args:
            data (torch_geometric.data.Data): Graph data; only ``data.x`` (node
                                              features) is read here.
            edge_index (torch.Tensor): The edge index tensor with shape [2, num_edges]
                                       used for message passing.
            edge_label_index (torch.Tensor): A tensor with shape [2, num_candidates]
                                             holding the endpoints of the edges to score.

        Returns:
            torch.Tensor: Raw scores (logits) with shape [num_candidates]; apply a
                          sigmoid to turn them into probabilities.
        """

        x = self.gnn(data.x, edge_index)
        src, dst = edge_label_index[0], edge_label_index[1]
        edge_embeddings = torch.cat([x[src], x[dst]], dim=-1)
        # squeeze(-1) drops only the trailing feature axis; a bare squeeze()
        # would also collapse a single-edge batch to a 0-d tensor.
        return self.linear(edge_embeddings).squeeze(-1)

# GAT1

In [4]:

class GAT1(nn.Module):

    """
    Two-layer Graph Attention Network (GAT) node encoder.

    The first layer uses four attention heads of width 16 (the GATConv default
    concatenates them, hence the 64-wide input of the second layer); the
    second layer uses a single head and emits 8-dimensional embeddings.

    Attributes:
        conv1 (GATConv): ``input_dim -> 16`` per head, 4 heads.
        conv2 (GATConv): ``64 -> 8``, 1 head.
    """

    def __init__(self, input_dim):

        """
        Builds the two attention layers.

        Args:
            input_dim (int): Number of input features per node.
        """

        super().__init__()
        self.conv1 = GATConv(input_dim, 16, heads=4)
        self.conv2 = GATConv(16 * 4, 8, heads=1)

    def forward(self, x, edge_index):

        """
        Encodes every node of the graph.

        Args:
            x (torch.Tensor): Node feature matrix.
            edge_index (torch.Tensor): Graph connectivity with shape [2, num_edges].

        Returns:
            torch.Tensor: Node embeddings produced by the second GAT layer.
        """

        hidden = F.elu(self.conv1(x, edge_index))
        # Dropout is only active while the module is in training mode.
        hidden = F.dropout(hidden, training=self.training)
        return self.conv2(hidden, edge_index)
    
class LinkPredictorGAT1(nn.Module):

    """
    A link prediction head on top of a GNN node encoder (by default GAT1).

    For every candidate edge the embeddings of its two endpoints are
    concatenated and mapped by a linear layer to one raw score (logit).

    Attributes:
        gnn (nn.Module): The encoder that generates node embeddings.
        linear (nn.Linear): Maps ``2 * embedding_dim`` features to a scalar score.
    """

    def __init__(self, gnn, embedding_dim=8):

        """
        Args:
            gnn (nn.Module): The encoder that generates node embeddings.
            embedding_dim (int): Width of the embeddings returned by ``gnn``
                                 (default 8, the output width of GAT1).
        """

        super().__init__()
        self.gnn = gnn
        self.linear = nn.Linear(embedding_dim * 2, 1)

    def forward(self, data, edge_index, edge_label_index):

        """
        Scores the candidate edges.

        Args:
            data: Graph data object; only ``data.x`` (node features) is read.
            edge_index (torch.Tensor): Connectivity, shape [2, num_edges].
            edge_label_index (torch.Tensor): Endpoints of the edges to score,
                                             shape [2, num_candidates].

        Returns:
            torch.Tensor: Raw scores (logits) with shape [num_candidates].
        """

        x = self.gnn(data.x, edge_index)
        src, dst = edge_label_index[0], edge_label_index[1]
        edge_embeddings = torch.cat([x[src], x[dst]], dim=-1)
        # squeeze(-1) keeps the edge axis even when only one edge is scored.
        return self.linear(edge_embeddings).squeeze(-1)

# GAT2

In [5]:
from torch_geometric.nn import GATConv

class GAT2(nn.Module):

    """
    Three-layer Graph Attention Network (GAT) node encoder.

    Two hidden layers with four concatenated attention heads of width 32 each
    (128 features in total) are followed by a single-head output layer that
    emits 8-dimensional node embeddings.

    Attributes:
        conv1 (GATConv): ``input_dim -> 32`` per head, 4 heads, concatenated.
        conv2 (GATConv): ``128 -> 32`` per head, 4 heads, concatenated.
        conv3 (GATConv): ``128 -> 8``, 1 head.
    """

    def __init__(self, input_dim):

        """
        Builds the three attention layers.

        Args:
            input_dim (int): Number of input features per node.
        """

        super().__init__()
        self.conv1 = GATConv(input_dim, 32, heads=4, concat=True)
        self.conv2 = GATConv(32 * 4, 32, heads=4, concat=True)
        self.conv3 = GATConv(32 * 4, 8, heads=1, concat=True)

    def forward(self, x, edge_index):

        """
        Encodes every node of the graph.

        Args:
            x (torch.Tensor): Node feature matrix.
            edge_index (torch.Tensor): Graph connectivity with shape [2, num_edges].

        Returns:
            torch.Tensor: Node embeddings produced by the third GAT layer.
        """

        hidden = x
        # Both hidden layers share the same ELU + dropout treatment;
        # dropout is only active in training mode.
        for layer in (self.conv1, self.conv2):
            hidden = F.elu(layer(hidden, edge_index))
            hidden = F.dropout(hidden, training=self.training)
        return self.conv3(hidden, edge_index)

class LinkPredictorGAT2(nn.Module):

    """
    A link prediction head on top of a GNN node encoder (by default GAT2).

    For every candidate edge the embeddings of its two endpoints are
    concatenated and mapped by a linear layer to one raw score (logit).

    Attributes:
        gnn (nn.Module): The encoder that generates node embeddings.
        linear (nn.Linear): Maps ``2 * embedding_dim`` features to a scalar score.
    """

    def __init__(self, gnn, embedding_dim=8):

        """
        Args:
            gnn (nn.Module): The encoder that generates node embeddings.
            embedding_dim (int): Width of the embeddings returned by ``gnn``
                                 (default 8, the output width of GAT2).
        """

        super().__init__()
        self.gnn = gnn
        self.linear = nn.Linear(embedding_dim * 2, 1)

    def forward(self, data, edge_index, edge_label_index):

        """
        Scores the candidate edges.

        Args:
            data: Graph data object; only ``data.x`` (node features) is read.
            edge_index (torch.Tensor): Connectivity, shape [2, num_edges].
            edge_label_index (torch.Tensor): Endpoints of the edges to score,
                                             shape [2, num_candidates].

        Returns:
            torch.Tensor: Raw scores (logits) with shape [num_candidates].
        """

        x = self.gnn(data.x, edge_index)
        src, dst = edge_label_index[0], edge_label_index[1]
        edge_embeddings = torch.cat([x[src], x[dst]], dim=-1)
        # squeeze(-1) keeps the edge axis even when only one edge is scored.
        return self.linear(edge_embeddings).squeeze(-1)


# ComplexGCN

**Batch normalization** is applied over a batch of features, as described in the paper ["Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift"](https://arxiv.org/abs/1502.03167).


The mean and standard-deviation are calculated per-dimension over all nodes inside the mini-batch.

**Residual connections** allow the output of a layer to be added directly to the input of a later layer. This helps gradients flow through very deep networks and makes deep architectures easier to train by mitigating issues such as vanishing gradients.
Reference: https://pubmed.ncbi.nlm.nih.gov/37327757/

In [6]:
class ComplexGCN(nn.Module):

    """
    Three-layer GCN node encoder with batch normalization and residual connections.

    Every GCN layer is followed by batch normalization. The second and third
    layers add their input back onto their normalized output (residual
    connection). All layers are 32 wide, so the final embedding has 32 features.

    Attributes:
        conv1 (GCNConv): ``input_dim -> 32``.
        bn1 (nn.BatchNorm1d): Normalizes the output of ``conv1``.
        conv2 (GCNConv): ``32 -> 32``.
        bn2 (nn.BatchNorm1d): Normalizes the output of ``conv2``.
        conv3 (GCNConv): ``32 -> 32``.
        bn3 (nn.BatchNorm1d): Normalizes the output of ``conv3``.
    """

    def __init__(self, input_dim):

        """
        Builds the three GCN layers and their batch-normalization layers.

        Args:
            input_dim (int): Number of input features per node.
        """

        super().__init__()
        self.conv1 = GCNConv(input_dim, 32)
        self.bn1 = nn.BatchNorm1d(32)
        self.conv2 = GCNConv(32, 32)
        self.bn2 = nn.BatchNorm1d(32)
        self.conv3 = GCNConv(32, 32)
        self.bn3 = nn.BatchNorm1d(32)

    def forward(self, x, edge_index):

        """
        Encodes every node of the graph.

        Args:
            x (torch.Tensor): Node feature matrix.
            edge_index (torch.Tensor): Graph connectivity with shape [2, num_edges].

        Returns:
            torch.Tensor: 32-dimensional node embeddings. No activation is
                          applied after the last residual sum.
        """

        # Block 1: conv -> batch norm -> ReLU
        h1 = self.conv1(x, edge_index)
        h1 = F.relu(self.bn1(h1))

        # Block 2: conv -> batch norm, skip connection from block 1, then ReLU
        h2 = self.bn2(self.conv2(h1, edge_index))
        h2 = F.relu(h2 + h1)

        # Block 3: conv -> batch norm, skip connection from block 2
        h3 = self.bn3(self.conv3(h2, edge_index))
        return h3 + h2
    

class LinkPredictorComplexGCN(nn.Module):
    """
    Link predictor that scores node pairs from the embeddings of a wrapped GNN.

    The embeddings of both endpoints of each candidate edge are concatenated and passed
    through a single linear layer that outputs one score per edge.

    Args:
        gnn (nn.Module): Module called as ``gnn(x, edge_index)`` that returns node embeddings.
        hidden_dim (int, optional): Width of the embeddings returned by ``gnn``.
            Defaults to 32.
    """

    def __init__(self, gnn, hidden_dim=32):
        super().__init__()
        self.gnn = gnn
        # Two endpoint embeddings are concatenated, hence 2 * hidden_dim inputs.
        self.linear = nn.Linear(hidden_dim * 2, 1)

    def forward(self, data, edge_index, edge_label_index):
        """
        Score the candidate edges listed in ``edge_label_index``.

        Args:
            data: Object exposing the node feature tensor as ``data.x``.
            edge_index: Edge index tensor forwarded to the GNN.
            edge_label_index: Index tensor whose row 0 / row 1 select the two endpoints
                of every edge to score.

        Returns:
            A 1-D tensor with one unnormalised score per candidate edge (no sigmoid is
            applied here).
        """
        x = self.gnn(data.x, edge_index)
        edge_embeddings = torch.cat([x[edge_label_index[0]], x[edge_label_index[1]]], dim=-1)
        # Squeeze only the trailing feature dimension: a bare squeeze() would turn a
        # single-edge batch into a 0-d tensor that cannot be iterated or matched to labels.
        scores = self.linear(edge_embeddings).squeeze(-1)
        return scores

# Initialize parameters

In [7]:
# RDF neighbourhood files handed to GraphLinkPredictor.
rdf_files = [
    "alzheimer_direct_neighborhood.rdf",
    "depression_direct_neighborhood_only.rdf",
    "urinary_direct_neighborhood_only.rdf",
    "brexpiprazole_direct_neighborhood_only.rdf",
    "cardiovascular_direct_neighborhood.rdf",
    "breast_direct_neighborhood.rdf",
    "ovarian_direct_neighborhood_only.rdf",
]

# OREGANO predicate URIs. NOTE: despite its name this is a set of URIs, not a dict.
edge_label_mapping = set((
    'http://erias.fr/oregano/#has_effect',
    'http://erias.fr/oregano/#increase_effect',
    'http://erias.fr/oregano/#decrease_effect',
    'http://erias.fr/oregano/#increase_efficacy',
    'http://erias.fr/oregano/#decreases_efficacy',
    'http://erias.fr/oregano/#has_indication',
    'http://erias.fr/oregano/#has_activity',
    'http://erias.fr/oregano/#increase_activity',
    'http://erias.fr/oregano/#decrease_activity',
    'http://erias.fr/oregano/#has_side_effect',
    'http://erias.fr/oregano/#has_target',
    'http://erias.fr/oregano/#is_affecting',
    'http://erias.fr/oregano/#is_substance_that_treats',
    'http://erias.fr/oregano/#acts_within',
    'http://erias.fr/oregano/#causes_condition',
    'http://erias.fr/oregano/#gene_product_of',
    'http://erias.fr/oregano/#has_phenotype',
))

# Pickle files with the positive / negative train and test edges.
files = dict(
    train_edges="train_edges.pkl",
    test_edges="test_edges.pkl",
    negative_train_edges="negative_train_edges.pkl",
    negative_test_edges="negative_test_edges.pkl",
)

# Precomputed node embeddings and the node -> embedding-index mapping.
embeddings_file = 'node2vec_embeddings_bestParams.npy'
mapping_file = 'node_to_idx_bestParams.txt'

# Build the predictor, then load the edge splits listed in `files`.
predictor = GraphLinkPredictor(
    rdf_files,
    edge_label_mapping,
    embeddings_file,
    mapping_file,
)
predictor.load_edge_data(files)

# Run each model separately and investigate the results

In [12]:
# gnn = GAT1(input_dim=predictor.x.size(1))
# link_predictor = LinkPredictorGAT1(gnn)
# criterion = nn.BCEWithLogitsLoss()
# optimizer = torch.optim.Adam(link_predictor.parameters(), lr=0.01)

# predictor.train_model(link_predictor, criterion, optimizer)
# predictor.evaluate_model()



In [13]:
# # Find the misclassified examples and filter
# misclassified_df = predictor.print_misclassified_edges()

# filtered_df = misclassified_df[(misclassified_df['Actual Label'] == 0)]
              

# # Condition 1: From is disease and To is gene or compound
# condition_1 = filtered_df['From'].str.startswith('disease_') & (filtered_df['To'].str.startswith('gene_') | \
#                                                                 filtered_df['To'].str.startswith('compound_'))

# # Condition 2: To is disease and From is gene or compound
# condition_2 = filtered_df['To'].str.startswith('disease_') & (filtered_df['From'].str.startswith('gene_') | \
#                                                               filtered_df['From'].str.startswith('compound_'))

# # Combine conditions
# filtered_df = filtered_df[condition_1 | condition_2]
# filtered_df

In [14]:
# # GNN1 Model
# gnn1 = GNN1(input_dim=predictor.x.size(1))
# link_predictor_gnn1 = LinkPredictorGNN1(gnn1)
# criterion = nn.BCEWithLogitsLoss()
# optimizer = torch.optim.Adam(link_predictor_gnn1.parameters(), lr=0.01)

# # Find the misclassified examples for GNN1
# misclassified_df_gnn1 = predictor.print_misclassified_edges()

# # Filter misclassified edges based on your conditions (GNN1)
# filtered_df_gnn1 = misclassified_df_gnn1[(misclassified_df_gnn1['Actual Label'] == 0)]

# condition_1_gnn1 = filtered_df_gnn1['From'].str.startswith('disease_') & (filtered_df_gnn1['To'].str.startswith('gene_') | \
#                                                                           filtered_df_gnn1['To'].str.startswith('compound_'))

# condition_2_gnn1 = filtered_df_gnn1['To'].str.startswith('disease_') & (filtered_df_gnn1['From'].str.startswith('gene_') | \
#                                                                         filtered_df_gnn1['From'].str.startswith('compound_'))

# filtered_df_gnn1 = filtered_df_gnn1[condition_1_gnn1 | condition_2_gnn1]

# # Train and evaluate GNN1
# predictor.train_model(link_predictor_gnn1, criterion, optimizer)
# predictor.evaluate_model()

In [15]:
# # Find common misclassified edges between GNN1 and GAT1
# common_misclassified = pd.merge(filtered_df_gnn1, filtered_df, on=['From', 'To', 'Actual Label'], how='inner')

# # Display the common misclassified edges
# common_misclassified

In [None]:
# # Example of predicting new links and handling misclassified edges
# new_edges = [
#     ('http://erias.fr/oregano/compound/compound_8067', 'http://erias.fr/oregano/disease/disease_145'),
#     ('http://erias.fr/oregano/disease/disease_3310', 'http://erias.fr/oregano/disease/disease_145'), 
#     ('http://erias.fr/oregano/disease/disease_804', 'http://erias.fr/oregano/disease/disease_145'),
# ]

# scores_prob = predictor.predict_new_links(new_edges)
# scores_prob = torch.sigmoid(scores_prob)

# for edge, score in zip(new_edges, scores_prob):
#     print(f"Link: ({predictor.extract_name(edge[0])}, {predictor.extract_name(edge[1])}): Score = {score.item():.4f}")



In [None]:
# csv_file_path = 'predicted_edges_bestParams.csv'
# misclassified_edges = predictor.read_misclassified_edges(csv_file_path)
# scores_prob_all = predictor.predict_new_links([item['edge'] for item in misclassified_edges])
# scores_prob_all = torch.sigmoid(scores_prob_all)

# results_df = predictor.create_results_df(misclassified_edges, scores_prob_all)
# results_df

# Run all models and accumulate the results for the misclassified examples of Node2Vec

In [None]:
# Define the configuration for each model.
# The input width is read from the loaded node features (as the per-model cells do)
# instead of being hard-coded, so a different embeddings file cannot cause a shape mismatch.
# NOTE(review): assumes predictor.x is populated at this point — confirm.
input_dim = predictor.x.size(1)

model_classes = [
    ("GNN1", GNN1, LinkPredictorGNN1),
    ("GAT1", GAT1, LinkPredictorGAT1),
    ("GAT2", GAT2, LinkPredictorGAT2),
    ("ComplexGCN", ComplexGCN, LinkPredictorComplexGCN),
]

# Each entry keeps the (name, gnn, link_predictor) shape; the link predictor wraps its
# own, separately constructed GNN instance, exactly as before.
models_config = [
    (name, gnn_cls(input_dim=input_dim), predictor_cls(gnn_cls(input_dim=input_dim)))
    for name, gnn_cls, predictor_cls in model_classes
]

# Train and evaluate all models
sum_df = predictor.train_and_evaluate_all_models(models_config, epochs=100)


In [None]:
sum_df

In [None]:
# Extract categories from 'From' and 'To' columns (the text before the first underscore)
sum_df['From_Category'] = sum_df['From'].str.split('_').str[0]
sum_df['To_Category'] = sum_df['To'].str.split('_').str[0]

# Combine the categories: one row per link endpoint, so every link contributes twice
sum_df_melted = sum_df.melt(id_vars=['Actual Label'], value_vars=['From_Category', 'To_Category'], var_name='Type', value_name='Category')

# Count occurrences of each category by actual label
# (rows = actual labels, columns = categories)
category_counts = sum_df_melted.groupby(['Actual Label', 'Category']).size().unstack(fill_value=0)

palette = sns.color_palette("Set2")


# Plotting with the custom palette.
# The frame's index (actual label) is drawn on the x-axis and its columns (categories)
# become the stacked segments / legend entries, so the labels must say exactly that.
category_counts.plot(kind='bar', stacked=True, color=palette)
plt.title('Number of Links by Category and Actual Label')
plt.xlabel('Actual Label')
plt.ylabel('Count of Links')
plt.xticks(rotation=45)
plt.legend(title='Category')
plt.show()

**Heatmap to identify which models might be redundant (high correlation) or complementary (low correlation).**

This information is useful when deciding which models to ensemble or focus on.
A strong correlation between certain models might need further investigation into why they are similar and whether they need diversification to improve overall performance.

In [None]:
# Columns of sum_df that hold the per-model predictions
model_columns = ['ComplexGCN', 'GAT1', 'GAT2', 'GNN1']

# Pairwise correlation between the models' predictions, drawn as an annotated heatmap
prediction_corr = sum_df[model_columns].corr()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(prediction_corr, annot=True, cmap='viridis', ax=ax)
ax.set_title('Correlation Heatmap of Model Predictions')
plt.show()

**Boxplots to see how well the models distinguish between different classes**

In [None]:
# Long format: one row per (link, model) pair with that model's prediction
melted_df = pd.melt(
    sum_df,
    id_vars=['Actual Label'],
    value_vars=model_columns,
    var_name='Model',
    value_name='Prediction',
)

# One box per model and actual label; the dashed red line marks the 0.5 level
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=melted_df, x='Model', y='Prediction', hue='Actual Label', palette='viridis', ax=ax)
ax.axhline(0.5, color='red', linestyle='--')
ax.set_title('Boxplot of Model Predictions by Actual Label')
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc

# One ROC curve per model, computed from its predictions against the actual labels
fig, ax = plt.subplots(figsize=(10, 8))
for model in model_columns:
    fpr, tpr = roc_curve(sum_df['Actual Label'], sum_df[model])[:2]
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, lw=2, label=f'{model} (area = {roc_auc:.2f})')

# Diagonal reference line
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves for Different Models')
ax.legend(loc="lower right")
plt.show()

**Precision**

"Of all the instances that the model predicted as positive, how many were actually positive?"

**Recall**

"Of all the actual positives, how many did the model successfully identify?"

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Metric name -> scoring function, in the order the columns should appear
scorers = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 Score': f1_score,
}

metrics = {}
for model in model_columns:
    # Binarise the model's predictions at 0.5 once and reuse them for every metric
    predicted = (sum_df[model] > 0.5).astype(int)
    metrics[model] = {
        metric_name: scorer(sum_df['Actual Label'], predicted)
        for metric_name, scorer in scorers.items()
    }

# Convert to DataFrame (one row per model, one column per metric)
metrics_df = pd.DataFrame(metrics).T

# Plot bar chart for each metric
ax = metrics_df.plot(kind='bar', figsize=(12, 8), colormap='viridis')
ax.set_title('Model Performance Metrics')
ax.set_xlabel('Model')
ax.set_ylabel('Score')
ax.legend(title='Metric')
plt.show()

In [None]:
# Keep rows whose Actual Label is 0 while every model's score is above 0.5
is_negative = sum_df['Actual Label'].eq(0)
all_models_positive = (sum_df[['ComplexGCN', 'GAT1', 'GAT2', 'GNN1']] > 0.5).all(axis=1)
filtered_df = sum_df[is_negative & all_models_positive]

# filtered_df

In [None]:
# Filter the links disease-compound, disease-gene

# Disease-compound links (either direction)
condition1 = ((filtered_df['From_Category'] == 'compound') & (filtered_df['To_Category'] == 'disease')) | \
             ((filtered_df['From_Category'] == 'disease') & (filtered_df['To_Category'] == 'compound'))
# Disease-gene links (either direction)
condition2 = ((filtered_df['From_Category'] == 'disease') & (filtered_df['To_Category'] == 'gene')) | \
             ((filtered_df['From_Category'] == 'gene') & (filtered_df['To_Category'] == 'disease'))

# Take an explicit copy: later cells add columns to filtered_df, and writing into a
# boolean-indexed slice of sum_df triggers SettingWithCopyWarning.
filtered_df = filtered_df[condition1 | condition2].copy()

filtered_df

# SPARQL query to find the labels of the identified entities

In [None]:
# !pip install SPARQLWrapper

In [None]:
# SPARQL endpoint of the repository to query (replace with your actual endpoint URL).
# For this step, it is necessary to have GraphDB for Desktop and load the oreganov2.1_metadata_complete.ttl file with name Graph-1.
SPARQL_ENDPOINT = "http://localhost:7200/repositories/Graph-1"
sparql = SPARQLWrapper(SPARQL_ENDPOINT)

def get_labels(entity_uri):
    """
    Fetch the distinct ``rdfs:label`` values of an entity from the SPARQL endpoint.

    Args:
        entity_uri (str): Full URI of the entity; it is inserted verbatim into the query.

    Returns:
        list: The label strings returned by the endpoint, or an empty list when the
        query fails (the error is printed rather than raised).
    """
    # The W3C RDFS namespace is <http://www.w3.org/2000/01/rdf-schema#> (no slash before
    # the '#'). The non-standard ".../rdf-schema/#" spelling is kept as an alternative in
    # the property path so that labels stored under either IRI are matched.
    query = f"""
    PREFIX oregano: <http://erias.fr/oregano/#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX rdfsalt: <http://www.w3.org/2000/01/rdf-schema/#>
    SELECT DISTINCT ?label
    WHERE {{
      <{entity_uri}> rdfs:label|rdfsalt:label ?label.
    }}
    """

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    try:
        results = sparql.query().convert()
        labels = [result["label"]["value"] for result in results["results"]["bindings"]]
        return labels
    except Exception as e:
        # Best effort: one failed lookup must not abort the whole labelling pass.
        print(f"Query failed for {entity_uri}: {e}")
        return []

def construct_uri(entity_id, category):
    """Return the entity URI ``http://erias.fr/oregano/<category>/<entity_id>``."""
    return "http://erias.fr/oregano/{}/{}".format(category, entity_id)

# def get_first_label(entity_uri):
#     labels = get_labels(entity_uri)
#     return labels[0] if labels else None

def get_all_labels(entity_uri):
    """Return all labels of ``entity_uri`` joined with ', ', or None when there are none."""
    found = get_labels(entity_uri)
    if not found:
        return None
    return ', '.join(found)
    
    
# Look up all labels for both endpoints of every remaining link.
# assign() returns a new frame instead of writing into a filtered slice
# (SettingWithCopyWarning), and plain lists avoid row-wise apply(), whose result is not
# a single column when the frame has no rows.
filtered_df = filtered_df.assign(
    From_All_Labels=[
        get_all_labels(construct_uri(entity_id, category))
        for entity_id, category in zip(filtered_df['From'], filtered_df['From_Category'])
    ],
    To_All_Labels=[
        get_all_labels(construct_uri(entity_id, category))
        for entity_id, category in zip(filtered_df['To'], filtered_df['To_Category'])
    ],
)

filtered_df

# Search the web for papers that include both entities of each row by using scholarly

In [None]:
# !pip install scholarly

In [None]:

# Function to clean labels by removing any prefix before the colon
def clean_label(label):
    """
    Drop the leading ``prefix:`` part of a label and trim surrounding whitespace.

    Returns None for an empty or missing label; a label without a leading prefix is
    only stripped.
    """
    if not label:
        return None
    return re.sub(r'^[^:]+:', '', label).strip()

# Function to extract the second label from a comma-separated string
def get_second_label(labels_str):
    """
    Return the second entry of a ``', '``-separated label string.

    Args:
        labels_str: Label string such as ``"first, second, third"``. Non-string values
            (e.g. None or NaN for entities without labels) are accepted.

    Returns:
        The second label, or None when there are fewer than two labels or the input is
        not a string.
    """
    # Entities without labels arrive as None/NaN; calling .split on them would raise.
    if not isinstance(labels_str, str):
        return None
    labels = labels_str.split(', ')
    return labels[1] if len(labels) > 1 else None

# Second label of each endpoint, taken from its comma-separated label string
for side in ('From', 'To'):
    filtered_df[f'{side}_Second_Label'] = filtered_df[f'{side}_All_Labels'].apply(get_second_label)

# Cleaned version of those second labels (prefix before the colon removed)
for side in ('From', 'To'):
    filtered_df[f'{side}_Cleaned_Label'] = filtered_df[f'{side}_Second_Label'].apply(clean_label)

# Display the updated DataFrame
# print(filtered_df[['From', 'To', 'From_Cleaned_Label', 'To_Cleaned_Label']])

# Function to search for papers using combined labels
def search_papers(query, max_results=5):
    """
    Search Google Scholar (via ``scholarly``) and collect the titles of the first hits.

    Args:
        query (str): Search string passed to ``scholarly.search_pubs``.
        max_results (int, optional): Maximum number of papers to retrieve. Defaults to 5.

    Returns:
        list: One ``{'title': ...}`` dict per retrieved paper; shorter than
        ``max_results`` when the search yields fewer results.
    """
    search_query = scholarly.search_pubs(query)
    papers = []
    for _ in range(max_results):  # Limit the number of papers retrieved
        try:
            paper = next(search_query)
        except StopIteration:
            # Fewer results than requested
            break
        papers.append({
            'title': paper.get('bib', {}).get('title', 'No title'),
#             'url': paper.get('url', 'No URL')
        })
    return papers

# For every row that has both cleaned labels, search for papers mentioning the pair
label_pairs = zip(filtered_df['From_Cleaned_Label'], filtered_df['To_Cleaned_Label'])
for from_label, to_label in label_pairs:
    # Skip rows where either endpoint has no usable label
    if not (from_label and to_label):
        continue

    combined_query = f"{from_label} + {to_label}"
    print(f"Searching papers for: {combined_query}")
    print('----------------------------------------------------------')
    papers = search_papers(combined_query)
    print(f"Results for {combined_query}: {papers}")
    print('===================================================================================================')