In [1]:
import pandas as pd
import networkx as nx
import os.path as osp
import math
from itertools import chain

import torch
import torch_geometric
from torch_geometric.data import Dataset, download_url
from torch_geometric.data import Data, InMemoryDataset
from torch_geometric.loader import DataLoader
from torch_geometric.utils.convert import from_networkx
import numpy as np
from scipy.sparse.csgraph import shortest_path

from gensim.models import Word2Vec

from torch.nn import Parameter
import torch.nn.functional as F
from torch_geometric.nn import GCNConv,Linear,RGCNConv,ChebConv
from torch_geometric.nn import GAE, Node2Vec,VGAE
from torch.utils.tensorboard import SummaryWriter
from torch_geometric.nn.models.autoencoder import ARGVA
from torch_geometric.datasets import Planetoid

from torch_geometric.transforms import RandomLinkSplit
from tqdm import tqdm
from torch_geometric.utils import k_hop_subgraph, to_scipy_sparse_matrix
from torch_geometric.nn import MLP, GCNConv, global_sort_pool
from torch.nn import BCEWithLogitsLoss, Conv1d, MaxPool1d, ModuleList
from sklearn.metrics import roc_auc_score

In [2]:
class SEALDataset(InMemoryDataset):
    def __init__(self,dataset, num_hops, num_val=0,num_test=0.32,neg_sampling_ratio=1,split='train'):
        self.data = dataset
        self.num_hops = num_hops
        self.num_val=num_val
        self.num_test=num_test
        self.neg_sampling_ratio=neg_sampling_ratio
        
        super().__init__(dataset.root)
        
        index = ['train', 'val', 'test'].index(split)
        self.data, self.slices = torch.load(self.processed_paths[index])

    @property
    def processed_file_names(self):
        #deve vedere tutti i file altrimenti processa
        return ['SEAL_train_data.pt',
                'SEAL_val_data.pt',
                'SEAL_test_data.pt']

    def process(self):
        transform = RandomLinkSplit(is_undirected=False,
                                    split_labels=True,          
                                    neg_sampling_ratio=self.neg_sampling_ratio,
                                    key = "edge_label",
                                    disjoint_train_ratio=0,
                                    num_val =self.num_val,
                                    num_test=self.num_test)
        
        train_data, val_data, test_data = transform(self.data)

        self._max_z = 0

        # Collect a list of subgraphs for training, validation and testing:
        train_pos_data_list = self.extract_enclosing_subgraphs(
            train_data.edge_index, train_data.pos_edge_label_index, 1)
        train_neg_data_list = self.extract_enclosing_subgraphs(
            train_data.edge_index, train_data.neg_edge_label_index, 0)
    
        if self.num_val:
            
            val_pos_data_list = self.extract_enclosing_subgraphs(
                val_data.edge_index, val_data.pos_edge_label_index, 1)
            val_neg_data_list = self.extract_enclosing_subgraphs(
                val_data.edge_index, val_data.neg_edge_label_index, 0)
        else:
            val_pos_data_list=[]
            val_neg_data_list=[]
        
        test_pos_data_list = self.extract_enclosing_subgraphs(
            test_data.edge_index, test_data.pos_edge_label_index, 1)
        test_neg_data_list = self.extract_enclosing_subgraphs(
            test_data.edge_index, test_data.neg_edge_label_index, 0)

        # Convert node labeling to one-hot features.
        for data in chain(train_pos_data_list, train_neg_data_list,
                          val_pos_data_list,val_neg_data_list,
                          test_pos_data_list, test_neg_data_list):
            # We solely learn links from structure, dropping any node features:
            data.x = F.one_hot(data.z, self._max_z + 1).to(torch.float)
    
        
        torch.save(self.collate(train_pos_data_list + train_neg_data_list),
                   self.processed_paths[0])
        
        if self.num_val:
            torch.save(self.collate(val_pos_data_list + val_neg_data_list),
                           self.processed_paths[1])
        else:
            torch.save(self.collate([Data()]+[Data()]),self.processed_paths[1])
            
        torch.save(self.collate(test_pos_data_list + test_neg_data_list),
                   self.processed_paths[2])

    def extract_enclosing_subgraphs(self, edge_index, edge_label_index, y):
        data_list = []
        for src, dst in tqdm(edge_label_index.t().tolist()):
            sub_nodes, sub_edge_index, mapping, _ = k_hop_subgraph(
                [src, dst], self.num_hops, edge_index, relabel_nodes=True)
            src, dst = mapping.tolist()

            # Remove target link from the subgraph.
            mask1 = (sub_edge_index[0] != src) | (sub_edge_index[1] != dst)
            mask2 = (sub_edge_index[0] != dst) | (sub_edge_index[1] != src)
            sub_edge_index = sub_edge_index[:, mask1 & mask2]

            # Calculate node labeling.
            z = self.drnl_node_labeling(sub_edge_index, src, dst,
                                        num_nodes=sub_nodes.size(0))
            data = Data( z=z,
                        edge_index=sub_edge_index, y=y)
            data_list.append(data)

        return data_list

    def drnl_node_labeling(self, edge_index, src, dst, num_nodes=None):
        # Double-radius node labeling (DRNL).
        src, dst = (dst, src) if src > dst else (src, dst)
        adj = to_scipy_sparse_matrix(edge_index, num_nodes=num_nodes).tocsr()

        idx = list(range(src)) + list(range(src + 1, adj.shape[0]))
        adj_wo_src = adj[idx, :][:, idx]

        idx = list(range(dst)) + list(range(dst + 1, adj.shape[0]))
        adj_wo_dst = adj[idx, :][:, idx]

        dist2src = shortest_path(adj_wo_dst, directed=False, unweighted=True,
                                 indices=src)
        dist2src = np.insert(dist2src, dst, 0, axis=0)
        dist2src = torch.from_numpy(dist2src)

        dist2dst = shortest_path(adj_wo_src, directed=False, unweighted=True,
                                 indices=dst - 1)
        dist2dst = np.insert(dist2dst, src, 0, axis=0)
        dist2dst = torch.from_numpy(dist2dst)

        dist = dist2src + dist2dst
        dist_over_2, dist_mod_2 = torch.div(dist,2,rounding_mode='floor'), dist % 2

        z = 1 + torch.min(dist2src, dist2dst)
        z += dist_over_2 * (dist_over_2 + dist_mod_2 - 1)
        z[src] = 1.
        z[dst] = 1.
        z[torch.isnan(z)] = 0.

        self._max_z = max(int(z.max()), self._max_z)

        return z.to(torch.long)

In [3]:
df=pd.read_csv('PPI.csv')
df=df.iloc[:50000,:]
G=nx.from_pandas_edgelist(df,'Official Symbol Interactor A','Official Symbol Interactor B' )
  
pyg_graph = from_networkx(G) 

In [4]:
df.shape

(50000, 2)

In [5]:
pyg_graph

Data(edge_index=[2, 66048], num_nodes=8728)

In [11]:
pyg_graph.root='C:\\Users\\calni\\OneDrive\\Desktop\\PPI\\data\\PPI_seal'

In [12]:
train_dataset = SEALDataset_2(pyg_graph, num_hops=2, split='train')
val_dataset = SEALDataset_2(pyg_graph, num_hops=2, split='val')
test_dataset = SEALDataset_2(pyg_graph, num_hops=2, split='test')

Processing...


processando


100%|███████████████████████████████████████████████████████████████████████████| 44913/44913 [02:20<00:00, 320.47it/s]
100%|███████████████████████████████████████████████████████████████████████████| 44913/44913 [01:40<00:00, 444.71it/s]
100%|███████████████████████████████████████████████████████████████████████████| 21135/21135 [01:06<00:00, 317.83it/s]
100%|███████████████████████████████████████████████████████████████████████████| 21135/21135 [00:48<00:00, 439.27it/s]
Done!


In [13]:
train_dataset.data

Data(edge_index=[2, 183722760], y=[89826], z=[30795214], x=[30795214, 76])

In [14]:
val_dataset.data

Data()

In [15]:
test_dataset.data

Data(edge_index=[2, 86249978], y=[42270], z=[14479340], x=[14479340, 76])

In [16]:
train_loader = DataLoader(train_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

In [22]:
class DGCNN(torch.nn.Module):
    def __init__(self, hidden_channels, num_layers, GNN=GCNConv, k=0.6):
        super().__init__()

        if k < 1:  # Transform percentile to number.
            num_nodes = sorted([data.x.shape[0] for data in train_dataset])#([data.num_nodes for data in train_dataset])
            k = num_nodes[int(math.ceil(k * len(num_nodes))) - 1]
            k = max(10, k)
        self.k = int(k)

        self.convs = ModuleList()
        self.convs.append(GNN(train_dataset.data.x.shape[1], hidden_channels))
        for i in range(0, num_layers - 1):
            self.convs.append(GNN(hidden_channels, hidden_channels))
        self.convs.append(GNN(hidden_channels, 1))

        conv1d_channels = [16, 32]
        total_latent_dim = hidden_channels * num_layers + 1
        conv1d_kws = [total_latent_dim, 5]
        self.conv1 = Conv1d(1, conv1d_channels[0], conv1d_kws[0],
                            conv1d_kws[0])
        self.maxpool1d = MaxPool1d(2, 2)
        self.conv2 = Conv1d(conv1d_channels[0], conv1d_channels[1],
                            conv1d_kws[1], 1)
        dense_dim = int((self.k - 2) / 2 + 1)
        dense_dim = (dense_dim - conv1d_kws[1] + 1) * conv1d_channels[1]
        self.mlp = MLP([dense_dim, 128, 1], dropout=0.5, batch_norm=False)

    def forward(self, x, edge_index, batch):
        xs = [x]
        for conv in self.convs:
            xs += [conv(xs[-1], edge_index).tanh()]
        x = torch.cat(xs[1:], dim=-1)

        # Global pooling.
        x = global_sort_pool(x, batch, self.k)
        x = x.unsqueeze(1)  # [num_graphs, 1, k * hidden]
        x = self.conv1(x).relu()
        x = self.maxpool1d(x)
        x = self.conv2(x).relu()
        x = x.view(x.size(0), -1)  # [num_graphs, dense_dim]

        return self.mlp(x)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = DGCNN(hidden_channels=32, num_layers=3).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.0001)
criterion = BCEWithLogitsLoss()

In [23]:
def train():
    model.train()

    total_loss = 0
    for data in train_loader:
        
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data.x, data.edge_index, data.batch)
       
        loss = criterion(out.view(-1), data.y.to(torch.float))
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * data.num_graphs

    return total_loss / len(train_dataset)

In [24]:
@torch.no_grad()
def test(loader):
    model.eval()

    y_pred, y_true = [], []
    for data in loader:
        data = data.to(device)
        logits = model(data.x, data.edge_index, data.batch)
        y_pred.append(logits.view(-1).cpu())
        y_true.append(data.y.view(-1).cpu().to(torch.float))

    return roc_auc_score(torch.cat(y_true), torch.cat(y_pred))


for epoch in range(1, 51):
    loss = train()
    test_auc = test(test_loader)
    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f},Test: {test_auc:.4f}')

KeyboardInterrupt: 

In [None]:
'''
for i in train_loader:
    print(i.x.shape)
    a=i.x
    print(i.edge_index[0].shape,i.edge_index[1].shape)
    print(i.z.shape)
    print(i.y.shape)
    break
    
    
for j in range(len(train_loader.dataset.slices['x'])+1):
        print(train_dataset.data.x[train_loader.dataset.slices['x'][j]:train_loader.dataset.slices['x'][j+1],:])
        b=train_dataset.data.x[train_loader.dataset.slices['x'][j]:train_loader.dataset.slices['x'][j+1],:]
        break
        
(a==b).all()
'''
