In [52]:
import dgl
import numpy as np
import scipy.io
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import datetime
import os
import torch.nn.functional as F
import dgl.function as fn
from sklearn.metrics import f1_score


import scipy.io as sio
import urllib.request

from dgl.data.utils import download, get_download_dir, _get_dgl_url


url = 'dataset/ACM.mat'
data_path = get_download_dir() + '/ACM.mat'
download(_get_dgl_url(url), path=data_path)

data = sio.loadmat(data_path)


Downloading C:\Users\Abdullah Al Amin\.dgl/ACM.mat from https://data.dgl.ai/dataset/ACM.mat...


In [53]:
metapaths = [['written-by','writing'],['is-about','has']]
def get_acm_data():
    
    G = dgl.heterograph({
        ('paper', 'written-by', 'author') : data['PvsA'].nonzero(),
        ('author', 'writing', 'paper') : data['PvsA'].transpose().nonzero(),
        ('paper', 'citing', 'paper') : data['PvsP'].nonzero(),
        ('paper', 'cited', 'paper') : data['PvsP'].transpose().nonzero(),
        ('paper', 'is-about', 'subject') : data['PvsL'].nonzero(),
        ('subject', 'has', 'paper') : data['PvsL'].transpose().nonzero(),
        ('paper', 'published-in', 'venue') : data['PvsV'].nonzero(),
        ('venue', 'published', 'paper') : data['PvsV'].transpose().nonzero(),
        ('paper', 'related-to', 'field') : data['PvsL'].nonzero(),
        ('field', 'described-by', 'paper') : data['PvsL'].transpose().nonzero(),
        ('paper', 'contains', 'term'):  data['PvsT'].nonzero(),
        ('term', 'consist-of', 'paper'):  data['PvsT'].transpose().nonzero()
    })
    
    pvc = data['PvsC'].tocsr()
    c_selected = [0, 11, 13]  # SODA, COLT, VLDB
    p_selected = pvc[:, c_selected].tocoo()

    labels = pvc.indices
    labels[labels == 11] = 1
    labels[labels == 13] = 2
    labels = torch.tensor(labels).long()
    
    p_vs_c_filter = pvc[:, c_selected]
    p_sel = (p_vs_c_filter.sum(1) != 0).A1.nonzero()[0]
    p_vs_t = data['PvsT'].tocsr()[p_sel]
    features = torch.FloatTensor(data['PvsT'].toarray())


    num_classes = len(c_selected)
    # split of train/val/test 
    pid = p_selected.row
    shuffle = np.random.permutation(pid)
    train_idx = torch.tensor(shuffle[0:1200]).long()
    val_idx = torch.tensor(shuffle[1200:1500]).long()
    test_idx = torch.tensor(shuffle[1500:]).long()
    
    return G, labels, train_idx, val_idx, test_idx,num_classes, features
    

In [54]:
class HeterogeneousRGCNLayer(nn.Module):
    def __init__(self, input_size, output_size, etypes):
        super(HeterogeneousRGCNLayer, self).__init__()
        self.weight = nn.ModuleDict({
                name : nn.Linear(input_size, output_size) for name in etypes
            })
        
        
    def forward(self, G, feat_dict):
        funcs = {}
        for srctype, etype, dsttype in G.canonical_etypes:
            Wh = self.weight[etype](feat_dict[srctype])
            G.nodes[srctype].data['Wh_%s' % etype] = Wh
            funcs[etype] = (fn.copy_u('Wh_%s' % etype, 'm'), fn.mean('m', 'h'))
        G.multi_update_all(funcs, 'sum')
        return {ntype : G.nodes[ntype].data['h'] for ntype in G.ntypes}

In [55]:
class HeterogeneousRGCN(nn.Module):
    def __init__(self, G, input_size, hidden_layer_size, output_size):
        super(HeterogeneousRGCN, self).__init__()
        embed_dict = {ntype : nn.Parameter(torch.Tensor(G.number_of_nodes(ntype), input_size))
                      for ntype in G.ntypes}
        for key, embed in embed_dict.items():
            nn.init.xavier_uniform_(embed)
        self.embed = nn.ParameterDict(embed_dict)
        self.layer1 = HeterogeneousRGCNLayer(input_size, hidden_layer_size, G.etypes)
        self.layer2 = HeterogeneousRGCNLayer(hidden_layer_size, output_size, G.etypes)
        
    def forward(self, G):
        h_dict = self.layer1(G, self.embed)
        h_dict = {k : F.relu(h) for k, h in h_dict.items()}
        h_dict = self.layer2(G, h_dict)
        return h_dict['paper']

In [56]:
def prepare_accuracy_csv(model,loss, train_acc, validation_acc, test_acc, total_epoch, total_duration, timestamp):
    accuracy_csv_data = []
    accuracy_csv_data.append([model,str(loss),str(train_acc),str(validation_acc), str(test_acc), str(total_epoch), str(total_duration), str(timestamp)])
    return accuracy_csv_data


In [57]:
def write_csv(file_name, columns_name, data):
    df_csv = pd.DataFrame(data, columns = columns_name)
    if not os.path.isfile(file_name):
       df_csv.to_csv(file_name, header='column_names')
    else: # else it exists so append without writing the header
       df_csv.to_csv(file_name, header=False, mode='a')

In [58]:
accuracy_columns = ['model','Loss(Cross_entropy)','Train_Acc', 'Validation_Acc', 'Test_Acc', 'Total_Epoch', 'Total_Duration','TimeStamp']
modelaccuracy_file_name = 'train_modelaccuracy.csv'

G, labels, train_idx, val_idx, test_idx, num_classes, features = get_acm_data()

In [59]:
from dgl.nn.pytorch import GATConv

class SemanticAttention(nn.Module):
    def __init__(self, in_size, hidden_size=128):
        super(SemanticAttention, self).__init__()

        self.project = nn.Sequential(
            nn.Linear(in_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, 1, bias=False)
        )

    def forward(self, z):
        w = self.project(z).mean(0)                    # (M, 1)
        beta = torch.softmax(w, dim=0)                 # (M, 1)
        beta = beta.expand((z.shape[0],) + beta.shape) # (N, M, 1)

        return (beta * z).sum(1)                       # (N, D * K)

class HANLayer(nn.Module):
    def __init__(self, meta_paths, in_size, out_size, layer_num_heads, dropout):
        super(HANLayer, self).__init__()

        # One GAT layer for each meta path based adjacency matrix
        self.gat_layers = nn.ModuleList()
        for i in range(len(meta_paths)):
            self.gat_layers.append(GATConv(in_size, out_size, layer_num_heads,
                                           dropout, dropout, activation=F.elu,
                                           allow_zero_in_degree=True))
        self.semantic_attention = SemanticAttention(in_size=out_size * layer_num_heads)
        self.meta_paths = list(tuple(meta_path) for meta_path in meta_paths)

        self._cached_graph = None
        self._cached_coalesced_graph = {}

    def forward(self, g, h):
        semantic_embeddings = []

        if self._cached_graph is None or self._cached_graph is not g:
            self._cached_graph = g
            self._cached_coalesced_graph.clear()
            for meta_path in self.meta_paths:
                self._cached_coalesced_graph[meta_path] = dgl.metapath_reachable_graph(
                        g, meta_path)

        for i, meta_path in enumerate(self.meta_paths):
            new_g = self._cached_coalesced_graph[meta_path]
            semantic_embeddings.append(self.gat_layers[i](new_g, h).flatten(1))
        semantic_embeddings = torch.stack(semantic_embeddings, dim=1)                  # (N, M, D * K)

        return self.semantic_attention(semantic_embeddings)                            # (N, D * K)

class HAN(nn.Module):
    def __init__(self, meta_paths, in_size, hidden_size, out_size, num_heads, dropout):
        super(HAN, self).__init__()

        self.layers = nn.ModuleList()
        self.layers.append(HANLayer(meta_paths, in_size, hidden_size, num_heads[0], dropout))
        for l in range(1, len(num_heads)):
            self.layers.append(HANLayer(meta_paths, hidden_size * num_heads[l-1],
                                        hidden_size, num_heads[l], dropout))
        self.predict = nn.Linear(hidden_size * num_heads[-1], out_size)

    def forward(self, g, h):
        for gnn in self.layers:
            h = gnn(g, h)

        return self.predict(h)

In [60]:
def get_binary_mask(total_size, indices):
    mask = torch.zeros(total_size)
    mask[indices] = 1
    return mask.byte()

def score(logits, labels):
    _, indices = torch.max(logits, dim=1)
    prediction = indices.long().cpu().numpy()
    labels = labels.cpu().numpy()

    accuracy = (prediction == labels).sum() / len(prediction)
    micro_f1 = f1_score(labels, prediction, average='micro')
    macro_f1 = f1_score(labels, prediction, average='macro')

    return accuracy, micro_f1, macro_f1

def evaluate(model, g, features, labels, mask, loss_func):
    model.eval()
    with torch.no_grad():
        logits = model(g, features)
    loss = loss_func(logits[mask], labels[mask])
    accuracy, micro_f1, macro_f1 = score(logits[mask], labels[mask]) 
    return loss, accuracy, micro_f1, macro_f1
    


In [61]:
def run_model_hgcn(total_epoch=10):
    print ("starting full batch HRGCN model")
    start_time = datetime.datetime.now().replace(microsecond=0)
    model = HeterogeneousRGCN(G, 20, 20, 3)

    opt = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

    for epoch in range(total_epoch):
        logits = model(G)
        model.train()
        opt.zero_grad()
        loss = F.cross_entropy(logits[train_idx], labels[train_idx])
        pred = logits.argmax(1)
        
        train_acc = (pred[train_idx] == labels[train_idx]).float().mean()
        valid_acc = (pred[val_idx] == labels[val_idx]).float().mean()
        test_acc = (pred[test_idx] == labels[test_idx]).float().mean()
        
        loss.backward()
        opt.step()

        if epoch == (total_epoch - 1):
            end_time = datetime.datetime.now().replace(microsecond=0)
            total_time = end_time - start_time
            print ("Train_Acc :", train_acc)
            print ("Validation_Acc :",valid_acc)
            print ("Test_Acc :",test_acc)
            print ("Total_Epoch :",total_epoch)
            print ("Total_Duration :",total_time)
            data_accuracy = prepare_accuracy_csv('HeteroRGCN',loss.item(),train_acc.item(), valid_acc.item(), test_acc.item(), total_epoch, total_time, end_time)
            write_csv(modelaccuracy_file_name, accuracy_columns, data_accuracy)
            print ("Finished training HRGCN model")

In [62]:
han_configure = {
    'lr': 0.005,             # Learning rate
    'num_heads': [8],        # Number of attention heads for node-level attention
    'hidden_units': 8,
    'dropout': 0.6,
    'weight_decay': 0.001,
    'num_epochs': 200,
    'patience': 100
}   

def run_model_han(num_epochs=10):
    model = HAN(meta_paths=[['written-by','writing'],['citing','cited']],
                    in_size=features.shape[1],
                    hidden_size=han_configure['hidden_units'],
                    out_size=num_classes,
                    num_heads=han_configure['num_heads'],
                    dropout=han_configure['dropout'])
    
    num_nodes = G.number_of_nodes('paper')
    train_mask = get_binary_mask(num_nodes,train_idx)
    val_mask = get_binary_mask(num_nodes,val_idx)
    test_mask = get_binary_mask(num_nodes,test_idx)
    
    print('starting HAN model')
    start_time = datetime.datetime.now().replace(microsecond=0)
    loss_fcn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=han_configure['lr'], weight_decay=han_configure['weight_decay'])

    for epoch in range(num_epochs):
        model.train()
        logits = model(G, features)
        loss = loss_fcn(logits[train_mask], labels[train_mask])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


        train_acc, train_micro_f1, train_macro_f1 = score(logits[train_mask], labels[train_mask])
        val_loss, val_acc, val_micro_f1, val_macro_f1 = evaluate(model, G, features, labels, val_mask, loss_fcn)



    test_loss, test_acc, test_micro_f1, test_macro_f1 = evaluate(model, G, features, labels, test_mask, loss_fcn)
    
    if epoch == (num_epochs - 1):
            end_time = datetime.datetime.now().replace(microsecond=0)
            total_time = end_time - start_time
            print ("Train_Acc :", train_acc)
            print ("Validation_Acc :",val_acc)
            print ("Test_Acc :",test_acc)
            print ("Total_Epoch :",num_epochs)
            print ("Total_Duration :",total_time)
            data_accuracy = prepare_accuracy_csv('HAN',loss.item(),train_acc.item(), val_acc.item(), test_acc.item(), num_epochs, total_time, end_time)
            write_csv(modelaccuracy_file_name, accuracy_columns, data_accuracy)
            print ("Finished training han model")


In [63]:
run_model_han(100)

starting HAN model


  allow_unreachable=True)  # allow_unreachable flag


Train_Acc : 0.9483333333333334
Validation_Acc : 0.8933333333333333
Test_Acc : 0.8367003367003367
Total_Epoch : 100
Total_Duration : 0:04:02
Finished training han model


In [65]:
run_model_hgcn()

starting full batch HRGCN model
Train_Acc : tensor(0.5233)
Validation_Acc : tensor(0.4933)
Test_Acc : tensor(0.4848)
Total_Epoch : 10
Total_Duration : 0:00:02
Finished training HRGCN model


In [67]:
def run_experiments(model_name,epochs):
    if model_name is 'han':
        run_model_han(epochs)
    elif model_name is 'hgcn':
        run_model_hgcn(epochs)
    else:
        print('Model is not recognized. please use han or hgcn as model name')
    

In [68]:
run_experiments('han',10)

starting HAN model




Train_Acc : 0.8133333333333334
Validation_Acc : 0.8533333333333334
Test_Acc : 0.8047138047138047
Total_Epoch : 10
Total_Duration : 0:00:28
Finished training han model
