In [5]:
import torch
import numpy as np
import pandas as pd
import time
import os   
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import precision_score,recall_score
# from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from torch_geometric.datasets import CitationFull
from torch_geometric.datasets import Coauthor
from torch_geometric.datasets import Amazon
from torch_geometric.datasets import Actor,WebKB,NELL,Reddit,AttributedGraphDataset,HeterophilousGraphDataset,Planetoid
from torch_geometric.nn.models import GAT,GIN

import torch.nn.functional as F
import torch.nn as nn
from torch.nn import Linear
import torch_geometric.transforms as T
import torch_geometric
from torch_geometric.nn import Linear,GCNConv,SAGEConv,GATConv,GATv2Conv,ChebConv,SGConv,FAConv,EGConv,GCN2Conv,GeneralConv,SSGConv
from torch_geometric.utils import degree
import psutil

In [6]:
dsname="Physics"

# Supported dataset names mapped to (PyG dataset class, download/cache root).
# Note that the Coauthor datasets live under "./datasets2", all others under
# "./datasets".
_DATASET_SOURCES = {
    "CS": (Coauthor, "./datasets2"),
    "Physics": (Coauthor, "./datasets2"),
    "BlogCatalog": (AttributedGraphDataset, "./datasets"),
    "WIKI": (AttributedGraphDataset, "./datasets"),
    "cora_ml": (CitationFull, "./datasets"),
    "citeseer": (CitationFull, "./datasets"),
    "pubmed": (CitationFull, "./datasets"),
    "Photo": (Amazon, "./datasets"),
    "Computers": (Amazon, "./datasets"),
}

if dsname in _DATASET_SOURCES:
    _dataset_cls, _dataset_root = _DATASET_SOURCES[dsname]
    nm = _dataset_cls(root=_dataset_root, name=dsname)

    print(f'Dataset: {nm}:')
    print(f'Number of graphs: {len(nm)}')
    print(f'Number of features: {nm.num_features}')
    print(f'Number of classes: {nm.num_classes}')
    print(nm.x.shape)
    print(nm.edge_index.shape)
    # Pass num_nodes explicitly: without it, degree() infers the node count
    # from the largest index present, so isolated nodes with the highest ids
    # would be silently left out of the average.
    _graph = nm[0]
    node_degree=degree(_graph.edge_index[0], num_nodes=_graph.num_nodes)
    avgdegree=node_degree.mean().item()
    print(f'Avg degree: {avgdegree}')
else:
    # `nm` stays undefined in this case, so the following cells cannot run.
    print("The name of datsets error,try again!")

Dataset: CoauthorPhysics():
Number of graphs: 1
Number of features: 8415
Number of classes: 5
torch.Size([34493, 8415])
torch.Size([2, 495924])
Avg degree: 14.37752628326416


In [7]:
def khop_graphs_sparse(x, edge_index, k,name,device,features=True, regular=False):
    """Build the edge lists of the 1..k-hop graphs of a normalized adjacency.

    The adjacency (with self loops) is symmetrically normalized as
    D^-1/2 (A + I) D^-1/2 and repeatedly multiplied by itself. The sparsity
    pattern of each power is stored as a ``(2, E_i)`` index tensor.

    Args:
        x: Node feature matrix; pairwise Euclidean distances between its rows
            drive the pruning when ``features`` is true.
        edge_index: ``(2, E)`` tensor of edges of the original graph.
        k: Number of hops. The returned list has ``k`` entries.
        name: Dataset name, used to build the cache file name
            ``./data/hops_<name>.pkl``.
        device: Device (string or ``torch.device``) used for the sparse products.
        features: If true, every hop beyond the first is pruned to the node
            pairs whose normalized feature distance is at least the mean
            distance. If false, hops are stored unpruned.
        regular: If false (default), a pruned hop is additionally capped to
            at most ``E`` edges, keeping those with the largest value.

    Returns:
        list of ``k`` index tensors, one per hop (first entry is hop 1).

    Notes:
        * The result is cached with pickle. The cache key is only ``name``, so
          a file produced with different ``k``/``features``/``regular`` is
          returned as-is; delete the file to force a recomputation.
        * Pickle files can execute arbitrary code on load; only keep cache
          files you produced yourself in ``./data``.
    """
    import pickle

    cache_path = './data/hops_'+name+'.pkl'
    # Reuse a previously computed result if the cache file already exists.
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)

    # Works for both "cpu" strings and torch.device objects.
    on_cpu = torch.device(device).type == 'cpu'

    # The dense N x N distance matrix is only needed when pruning actually
    # happens, so skip it otherwise (it dominates memory on large graphs).
    if features and k > 1:
        similarity = torch.cdist(x, x, p=2)
        # Normalize between 0 and 1
        similarity = (similarity - similarity.min()) / (similarity.max() - similarity.min())
        similarity = similarity.to(device)
        # Loop-invariant pruning threshold.
        threshold = similarity.mean()
        # NOTE(review): despite its name, `similarity` holds normalized
        # *distances* (larger = farther apart), so the pruning below keeps the
        # most distant pairs. Confirm this is the intended criterion.

    # Size the matrices from the feature matrix so isolated nodes with the
    # highest ids are not dropped; never go below what edge_index requires.
    N = x.size(0)
    if edge_index.numel() > 0:
        N = max(N, edge_index.max().item() + 1)

    # Create the adjacency matrix
    edge_weight = torch.ones(edge_index.size(1), device=edge_index.device)
    A = torch.sparse_coo_tensor(edge_index, edge_weight, (N, N)).to(device)
    # Add self loops
    diag_indices = torch.arange(N, device=device).unsqueeze(0).repeat(2, 1)
    I = torch.sparse_coo_tensor(diag_indices, torch.ones(N, device=device), (N, N))
    A = A + I
    # Degree matrix: every row has a self loop, so all degrees are >= 1 and
    # the dense vector lines up with the diagonal indices one-to-one.
    deg_inv_sqrt = torch.sparse.sum(A, dim=1).to_dense().pow(-0.5)
    # Create the sparse diagonal matrix
    D_tilde = torch.sparse_coo_tensor(diag_indices, deg_inv_sqrt, (N, N))
    A_tilde = torch.sparse.mm(torch.sparse.mm(D_tilde, A), D_tilde).coalesce()

    # Hop 1 is the normalized adjacency itself.
    hops = [A_tilde.indices().clone()]

    # Compute A_tilde^k
    A_tilde_k = A_tilde
    for i in range(k - 1):
        print("Computing k: ", i+1, " of ", k-1)
        if on_cpu:
            # Report how much system RAM is currently in use
            print("Ram memory: ", psutil.virtual_memory().percent, "%")
        else:
            # Report how much device memory is currently in use
            print(torch.cuda.memory_allocated(device=device), "out of ", torch.cuda.max_memory_allocated(device=device))
        A_tilde_k = torch.sparse.mm(A_tilde_k, A_tilde).coalesce()
        indices = A_tilde_k.indices()
        print("Before pruning: ", indices.size(1))
        if features:
            # Keep only the pairs whose value is at least the mean
            pair_values = similarity[indices[0], indices[1]]
            keep = pair_values >= threshold
            indices = indices[:, keep]
            # Select only the initial number of edges
            if not regular and indices.size(1) > edge_index.size(1):
                # We select the edges with the highest value
                top = pair_values[keep].argsort(descending=True)[:edge_index.size(1)]
                indices = indices[:, top]
            print("After pruning: ", indices.size(1))
        # Store the hop in both modes; without pruning the full pattern is kept.
        hops.append(indices.clone())

    # Persist the list of hops, creating the cache directory if needed so the
    # (expensive) result is not lost to a missing folder.
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    with open(cache_path, 'wb') as f:
        pickle.dump(hops, f)
    return hops
        


In [8]:

# Read the graph from nm[0] rather than from the dataset-level attribute:
# the last line of this cell overwrites `nm.edge_index` with a list, so
# reading it back here would break the cell when it is re-run.
_base_graph = nm[0]
init_edge_index = _base_graph.edge_index.clone()
hops = khop_graphs_sparse(_base_graph.x,_base_graph.edge_index,3,nm.name,"cpu")
# The original edge list goes last; the model reads it as edge_indexes[-1].
hops.append(init_edge_index)
print("Done!")
# Later cells expect `nm.edge_index` to be the list [hop_1, ..., hop_k, original].
nm.edge_index = hops


Done!


In [9]:
class MO_GNN_large(torch.nn.Module):
    """Multi-branch GNN whose branch outputs are mixed by learned softmax weights.

    Branches (all producing ``hidden_channels`` features):
      * one single-layer GCN per entry of ``edge_indexes[:num_layers]``,
      * a two-layer MLP on the raw node features,
      * a two-layer GCN on ``edge_indexes[-1]``.

    Each branch is scaled by its entry of ``softmax(att)``, the scaled
    branches are summed, and a final linear layer yields log-probabilities.

    Args:
        in_channels: Size of the input node features.
        hidden_channels: Width shared by all branches.
        out_channels: Number of output classes.
        num_layers: Number of per-hop GCN branches.
        dropout: Dropout probability for the MLP branch and the fused embedding.
        seed: Value passed to ``torch.manual_seed`` before the layers are
            built (this resets the global torch RNG).
    """

    def __init__(self, in_channels,hidden_channels, out_channels,num_layers,dropout=0.2,seed=12345):
        super().__init__()
        # Seed the global RNG so the layer initialisation is reproducible.
        torch.manual_seed(seed)

        # Feature-only branch.
        self.MLP = torch.nn.Linear(in_channels,hidden_channels)
        self.MLP2 = torch.nn.Linear(hidden_channels,hidden_channels)

        self.convs = torch.nn.ModuleList()

        # Two-layer GCN branch applied to the last edge index.
        self.init_conv = GCNConv(in_channels, hidden_channels)
        self.bn_extra = torch.nn.BatchNorm1d(hidden_channels)
        self.init_conv2 = GCNConv(hidden_channels, hidden_channels)
        self.bn_extra_2 = torch.nn.BatchNorm1d(hidden_channels)
        # NOTE(review): bn_extra, bn_extra_2 and self.bn are constructed but
        # never called in forward().

        # One GCN (plus a BatchNorm) per hop branch.
        self.bn = torch.nn.ModuleList()
        for _hop in range(num_layers):
            hop_conv = GCNConv(
                in_channels,
                hidden_channels,
                cached=False,
                normalize=True,
                add_self_loops=True,
            )
            self.convs.append(hop_conv)
            self.bn.append(torch.nn.BatchNorm1d(hidden_channels))

        # Classifier head on the fused embedding.
        self.fc1 = Linear(hidden_channels, out_channels)

        # One mixing logit per branch: num_layers hop branches + MLP + 2-layer GCN.
        self.att = nn.Parameter(torch.ones(num_layers + 2))
        self.sm = nn.Softmax(dim=0)

        self.dropout = dropout

    def forward(self, x, edge_indexes):
        """Return per-node log-probabilities.

        Args:
            x: Node feature matrix.
            edge_indexes: Sequence of edge-index tensors; entry ``i`` feeds hop
                branch ``i`` and the last entry feeds the two-layer GCN branch.
        """
        branch_weights = self.sm(self.att)
        last_edges = edge_indexes[-1]

        # Two-layer GCN branch (weighted by the last attention entry).
        # NOTE(review): this dropout uses a fixed p=0.5 instead of self.dropout.
        deep_branch = self.init_conv(x, last_edges).relu()
        deep_branch = F.dropout(deep_branch, p=0.5, training=self.training)
        deep_branch = self.init_conv2(deep_branch, last_edges).relu() * branch_weights[-1]

        # One single-layer GCN per hop graph.
        branches = [
            hop_conv(x, edge_indexes[hop]).relu() * branch_weights[hop]
            for hop, hop_conv in enumerate(self.convs)
        ]

        # MLP branch on the raw features (weighted by the second-to-last entry).
        feat_branch = self.MLP(x).relu()
        feat_branch = F.dropout(feat_branch, p=self.dropout, training=self.training)
        feat_branch = self.MLP2(feat_branch).relu() * branch_weights[-2]

        # Fuse: hop branches, then MLP branch, then deep GCN branch, summed.
        branches.append(feat_branch)
        branches.append(deep_branch)
        fused = torch.stack(branches, dim=0).sum(dim=0)

        fused = F.dropout(fused, p=self.dropout, training=self.training)
        return self.fc1(fused).log_softmax(dim=-1)


def ind2mask(a, n):
    """Return a length-``n`` float64 tensor holding 1 at positions ``a`` and 0 elsewhere."""
    flags = np.zeros(n)
    flags[a] = 1
    return torch.from_numpy(flags)


def train(model,data,edge_gpu,train_set_ind,train_label_ind):
    """Run one full-batch optimisation step and return the training loss.

    Args:
        model: Module called as ``model(data.x, edge_gpu)``.
        data: Object exposing node features ``x`` and integer class labels ``y``.
        edge_gpu: Edge-index list forwarded to the model unchanged.
        train_set_ind: Unused; kept so existing call sites keep working.
        train_label_ind: Indices of the training nodes.

    Returns:
        The loss of this step as a Python float.

    Note:
        The optimizer is read from the module-level name ``optimizer``, which
        must be defined before this function is called.
    """
    model.train()
    optimizer.zero_grad()
    out = model(data.x,edge_gpu)
    # Class-index targets give the same loss as the equivalent one-hot float
    # targets, without the extra tensor and without needing the class count
    # from a global dataset object.
    loss = F.cross_entropy(out[train_label_ind], data.y[train_label_ind])
    loss.backward()
    optimizer.step()

    return float(loss)

@torch.no_grad()
def test(model,data,edge_gpu,train_set_ind,train_label_ind,test_label_ind):
    model.eval()
    ret = model(data.x, edge_gpu)
    pred=ret.argmax(dim=-1)
    
    mask = train_label_ind
    trainacc=int((pred[mask] == data.y[mask]).sum()) / int(len(mask))
    mask = test_label_ind
    testacc=int((pred[mask] == data.y[mask]).sum()) / int(len(mask))
        
    return trainacc,testacc

if __name__ == "__main__":
    # Experiment driver: for every test fraction 0.1 .. 0.9, train the model
    # ten times and record the epoch with the best test accuracy.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data = nm[0].to(device)
    # The k-hop cell replaces nm.edge_index with a list of edge-index tensors.
    # Iterating a plain tensor here would silently yield its two rows instead.
    if not isinstance(nm.edge_index, (list, tuple)):
        raise RuntimeError("nm.edge_index is not a list of hop edge indices; run the k-hop cell first")
    edge_gpu = [tensor.to(device) for tensor in nm.edge_index]

    # Loop-invariant inputs of the split.
    num_nodes = nm.x.shape[0]
    index = list(range(num_nodes))
    labels = nm[0].y

    # Create the output folder up front so a missing directory cannot discard
    # the results after all runs have finished.
    out_dir = "./HEXGNN"
    os.makedirs(out_dir, exist_ok=True)

    framedata=[]
    for percent in range(1,10):
        # percent / 10 avoids the float error of 0.1 * percent (e.g.
        # 0.30000000000000004), which can shift the split size by one sample.
        test_fraction = percent / 10
        for num in range(10):
            # Vary the split and the model seed per repetition; with fixed
            # values all ten repetitions would repeat the same experiment.
            train_set_ind,test_set_ind,train_label_ind,test_label_ind=train_test_split(
                index,index,test_size=test_fraction,random_state=42 + num,stratify=labels)
            train_mask=ind2mask(train_set_ind, num_nodes)
            test_mask=ind2mask(test_set_ind, num_nodes)

            # Row layout: [test fraction, best epoch, loss, train acc, best test acc].
            # NOTE(review): the best epoch is chosen on test accuracy, so the
            # reported value is an optimistic estimate.
            value=[test_fraction,0,0,0,0]

            model = MO_GNN_large(nm.num_features,32,nm.num_classes,2,seed=12345 + num).to(device)
            # `optimizer` must stay a module-level name: train() reads it from there.
            optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

            for epoch in range(0, 400):
                loss = train(model,data,edge_gpu,train_mask,train_label_ind)
                train_acc,test_acc = test(model,data,edge_gpu,test_mask,train_label_ind,test_label_ind)
                print(f'Epoch: {epoch}, Loss: {loss:.4f}, Train: {train_acc:.4f},'f'Test: {test_acc:.4f}')
                if test_acc > value[4]:
                    value[4] = test_acc
                    value[1] = epoch
                    value[2] = loss
                    value[3] = train_acc
            print(value)
            framedata.append(value)
            time.sleep(0.5)
    pd.DataFrame(framedata).to_csv("./HEXGNN/HEXGNN_{}_3hop.csv".format(dsname))
