In [1]:
import dgl
import numpy as np
import scipy.io
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import datetime
import os
import torch.nn.functional as F
import dgl.function as fn


# Path of the MATLAB-format data file, relative to the notebook's working directory.
data_file_path = './ACM.mat'

# scipy's loadmat returns a dict keyed by the variable names stored in the file.
data = scipy.io.loadmat(data_file_path)

# List the variable names that are available in the file.
print(list(data))


Using backend: pytorch


['__header__', '__version__', '__globals__', 'TvsP', 'PvsA', 'PvsV', 'AvsF', 'VvsC', 'PvsL', 'PvsC', 'A', 'C', 'F', 'L', 'P', 'T', 'V', 'PvsT', 'CNormPvsA', 'RNormPvsA', 'CNormPvsC', 'RNormPvsC', 'CNormPvsT', 'RNormPvsT', 'CNormPvsV', 'RNormPvsV', 'CNormVvsC', 'RNormVvsC', 'CNormAvsF', 'RNormAvsF', 'CNormPvsL', 'RNormPvsL', 'stopwords', 'nPvsT', 'nT', 'CNormnPvsT', 'RNormnPvsT', 'nnPvsT', 'nnT', 'CNormnnPvsT', 'RNormnnPvsT', 'PvsP', 'CNormPvsP', 'RNormPvsP']


In [2]:
# Display the 'PvsC' sparse matrix; a later cell converts it to CSR and derives
# the classification labels from it.
data['PvsC']

<12499x14 sparse matrix of type '<class 'numpy.float64'>'
	with 12499 stored elements in Compressed Sparse Column format>

In [3]:
def _build_edge_dict(matrices):
    """Assemble the edge dictionary handed to ``dgl.heterograph``.

    For every relation listed below, the non-zero entries of the named sparse
    matrix become the forward edges (source type -> destination type) and the
    non-zero entries of its transpose become the reverse edges.

    NOTE(review): 'field' is built from 'PvsL', the same matrix as 'subject',
    so the two node types end up with identical connectivity -- confirm that
    this is intended.
    """
    # (source type, forward edge type, reverse edge type, destination type, matrix key)
    relations = [
        ('paper', 'written-by', 'writing', 'author', 'PvsA'),
        ('paper', 'citing', 'cited', 'paper', 'PvsP'),
        ('paper', 'is-about', 'has', 'subject', 'PvsL'),
        ('paper', 'published-in', 'published', 'venue', 'PvsV'),
        ('paper', 'related-to', 'described-by', 'field', 'PvsL'),
        ('paper', 'contains', 'consist-of', 'term', 'PvsT'),
    ]
    edges = {}
    for src_type, forward_name, reverse_name, dst_type, key in relations:
        adjacency = matrices[key]
        edges[(src_type, forward_name, dst_type)] = adjacency.nonzero()
        edges[(dst_type, reverse_name, src_type)] = adjacency.transpose().nonzero()
    return edges


G = dgl.heterograph(_build_edge_dict(data))

In [4]:
# Print the graph summary (node counts and edge counts per type, plus the metagraph).
print(G)

Graph(num_nodes={'author': 17431, 'field': 73, 'paper': 12499, 'subject': 73, 'term': 1903, 'venue': 196},
      num_edges={('author', 'writing', 'paper'): 37055, ('field', 'described-by', 'paper'): 12499, ('paper', 'cited', 'paper'): 30789, ('paper', 'citing', 'paper'): 30789, ('paper', 'contains', 'term'): 972973, ('paper', 'is-about', 'subject'): 12499, ('paper', 'published-in', 'venue'): 12499, ('paper', 'related-to', 'field'): 12499, ('paper', 'written-by', 'author'): 37055, ('subject', 'has', 'paper'): 12499, ('term', 'consist-of', 'paper'): 972973, ('venue', 'published', 'paper'): 12499},
      metagraph=[('author', 'paper', 'writing'), ('paper', 'paper', 'cited'), ('paper', 'paper', 'citing'), ('paper', 'term', 'contains'), ('paper', 'subject', 'is-about'), ('paper', 'venue', 'published-in'), ('paper', 'field', 'related-to'), ('paper', 'author', 'written-by'), ('field', 'paper', 'described-by'), ('term', 'paper', 'consist-of'), ('subject', 'paper', 'has'), ('venue', 'paper', 'p

In [5]:
class HeterogeneousRGCNLayer(nn.Module):
    """One relational graph-convolution layer for a heterogeneous DGL graph.

    Each edge type owns its own ``nn.Linear(input_size, output_size)``. During
    the forward pass the source-node features are projected with the linear
    map of the edge type, averaged over incoming edges of that type, and the
    per-edge-type results are summed on the destination nodes.
    """

    def __init__(self, input_size, output_size, etypes):
        """Create one linear projection per edge-type name in ``etypes``."""
        super().__init__()
        projections = {}
        for name in etypes:
            projections[name] = nn.Linear(input_size, output_size)
        self.weight = nn.ModuleDict(projections)

    def forward(self, G, feat_dict):
        """Run message passing over every canonical edge type of ``G``.

        Args:
            G: heterogeneous graph; its node data is used as scratch space
               (fields ``'Wh_<etype>'`` on source nodes and ``'h'`` on
               destination nodes are written).
            feat_dict: mapping from node type to that type's input features.

        Returns:
            Dict mapping every node type of ``G`` to its ``'h'`` node data.
        """
        per_relation = {}
        for src_type, relation, _dst_type in G.canonical_etypes:
            field = 'Wh_%s' % relation
            # Project the source features with this relation's weights and
            # stash them on the graph so the message function can read them.
            G.nodes[src_type].data[field] = self.weight[relation](feat_dict[src_type])
            # Message: copy the projected feature; reduce: mean over the mailbox.
            per_relation[relation] = (fn.copy_u(field, 'm'), fn.mean('m', 'h'))

        # Results of the individual relations are combined with a sum.
        G.multi_update_all(per_relation, 'sum')

        outputs = {}
        for ntype in G.ntypes:
            outputs[ntype] = G.nodes[ntype].data['h']
        return outputs

In [6]:
class HeterogeneousRGCN(nn.Module):
    """Two-layer heterogeneous R-GCN with a learnable embedding per node.

    Every node type gets a trainable embedding table of shape
    ``(number of nodes of that type, input_size)`` which serves as the input
    features of the first layer.
    """

    def __init__(self, G, input_size, hidden_layer_size, output_size):
        """Allocate the embeddings and the two graph-convolution layers."""
        super().__init__()
        tables = {}
        for ntype in G.ntypes:
            tables[ntype] = nn.Parameter(
                torch.Tensor(G.number_of_nodes(ntype), input_size)
            )
        # torch.Tensor(...) leaves the memory uninitialised, so every table is
        # filled with Xavier-uniform values before it is registered.
        for table in tables.values():
            nn.init.xavier_uniform_(table)
        self.embed = nn.ParameterDict(tables)

        edge_types = G.etypes
        self.layer1 = HeterogeneousRGCNLayer(input_size, hidden_layer_size, edge_types)
        self.layer2 = HeterogeneousRGCNLayer(hidden_layer_size, output_size, edge_types)

    def forward(self, G):
        """Return the second-layer output for the 'paper' nodes of ``G``."""
        hidden = self.layer1(G, self.embed)
        activated = {}
        for ntype, features in hidden.items():
            activated[ntype] = F.relu(features)
        return self.layer2(G, activated)['paper']

In [7]:
# Fixed seed for the shuffle below so that a fresh "Restart & Run All"
# reproduces the same train/validation/test split.
SPLIT_SEED = 0

pvc = data['PvsC'].tocsr()
c_selected = [0, 12, 13]  # KDD, COLT, VLDB
p_selected = pvc[:, c_selected].tocoo()

# Reading the CSR column indices as "one label per paper" is only valid when
# every row stores exactly one entry; fail loudly instead of silently
# producing misaligned labels.
assert (np.diff(pvc.indptr) == 1).all(), \
    "expected exactly one stored PvsC entry per paper"

# Work on a copy: `pvc.indices` is the matrix's own index array, and
# relabelling it in place would corrupt `pvc` for any later use.
label_array = pvc.indices.copy()
# Map the selected column ids 0 / 12 / 13 onto class ids 0 / 1 / 2.
label_array[label_array == 12] = 1
label_array[label_array == 13] = 2
labels = torch.tensor(label_array).long()

# split of train/val/test
# Only rows with an entry in one of the selected columns are used.
pid = p_selected.row
shuffle = np.random.RandomState(SPLIT_SEED).permutation(pid)
train_idx = torch.tensor(shuffle[0:1200]).long()
val_idx = torch.tensor(shuffle[1200:1500]).long()
test_idx = torch.tensor(shuffle[1500:]).long()

In [8]:
def prepare_accuracy_csv(loss, train_acc, validation_acc, test_acc, total_epoch, total_duration, timestamp):
    """Pack the metrics of one run into a single-row table of strings.

    Every argument is converted with ``str()`` and the values are kept in
    parameter order.

    Returns:
        A list holding exactly one row (itself a list of seven strings).
    """
    metrics = (loss, train_acc, validation_acc, test_acc, total_epoch, total_duration, timestamp)
    return [[str(value) for value in metrics]]


In [9]:
def write_csv(file_name, columns_name, data):
    """Write rows to a CSV file, creating it with a header or appending to it.

    Args:
        file_name: path of the CSV file.
        columns_name: column labels for ``data``; written as the header line
            when the file is created.
        data: row data accepted by ``pd.DataFrame`` (e.g. a list of rows).

    The DataFrame index is written as the first, unnamed column, because
    ``to_csv`` is called with its default ``index`` setting.
    """
    df_csv = pd.DataFrame(data, columns=columns_name)
    if not os.path.isfile(file_name):
        # New file: emit the column names. `header` takes a bool (or a list
        # of alias strings), so pass an explicit True.
        df_csv.to_csv(file_name, header=True)
    else:
        # The file already exists, so append the rows without repeating the header.
        df_csv.to_csv(file_name, header=False, mode='a')

In [10]:
# CSV file that receives one row of metrics per training run.
modelaccuracy_file_name = 'train_modelaccuracy.csv'

# Header labels for that file, in the same order as the values that
# prepare_accuracy_csv puts into a row.
accuracy_columns = [
    'Loss',
    'Train_Acc',
    'Validation_Acc',
    'Test_Acc',
    'Total_Epoch',
    'Total_Duration',
    'TimeStamp',
]


In [11]:
def run_model(total_epoch, input_size=20, hidden_layer_size=20, output_size=3,
              lr=0.005, weight_decay=5e-4):
    """Train a HeterogeneousRGCN on the notebook-level graph and log the result.

    Reads the module-level ``G``, ``labels``, ``train_idx``, ``val_idx`` and
    ``test_idx``. After the last epoch the metrics are printed and one row is
    appended to ``modelaccuracy_file_name`` via ``write_csv``.

    Args:
        total_epoch: number of full-graph training steps to run.
        input_size: width of the learnable node embeddings.
        hidden_layer_size: width of the hidden layer.
        output_size: number of output classes.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.

    The reported loss and accuracies come from the forward pass of the final
    epoch, i.e. they are measured before that epoch's optimizer step. When
    ``total_epoch`` is not positive nothing is trained or reported.
    """
    print ("Started")
    start_time = datetime.datetime.now().replace(microsecond=0)
    model = HeterogeneousRGCN(G, input_size, hidden_layer_size, output_size)

    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    # Switch to training mode before the first forward pass rather than after it.
    model.train()

    logits = None
    loss = None
    for _ in range(total_epoch):
        opt.zero_grad()
        logits = model(G)
        loss = F.cross_entropy(logits[train_idx], labels[train_idx])
        loss.backward()
        opt.step()

    if loss is None:
        # The loop body never ran, so there are no metrics to report.
        return

    end_time = datetime.datetime.now().replace(microsecond=0)
    total_time = end_time - start_time

    # Only the final epoch's accuracies are reported, so compute them once
    # here from the last logits instead of on every iteration.
    pred = logits.argmax(1)
    train_acc = (pred[train_idx] == labels[train_idx]).float().mean()
    valid_acc = (pred[val_idx] == labels[val_idx]).float().mean()
    test_acc = (pred[test_idx] == labels[test_idx]).float().mean()

    print ("Train_Acc :", train_acc)
    print ("Validation_Acc :",valid_acc)
    print ("Test_Acc :",test_acc)
    print ("Total_Epoch :",total_epoch)
    print ("Total_Duration :",total_time)
    data_accuracy = prepare_accuracy_csv(loss.item(),train_acc.item(), valid_acc.item(), test_acc.item(), total_epoch, total_time, end_time)
    write_csv(modelaccuracy_file_name, accuracy_columns, data_accuracy)
    print ("Finished")

In [12]:
# Train for 100 epochs; the final metrics are printed and appended to the CSV
# file named by modelaccuracy_file_name.
run_model(100)

Started
Train_Acc : tensor(1.)
Validation_Acc : tensor(0.9567)
Test_Acc : tensor(0.9739)
Total_Epoch : 100
Total_Duration : 0:00:09
Finished
