In [1]:
import datetime
import dgl
import errno
import numpy as np
import os
import pickle
import random
import argparse

import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from scipy import sparse
from scipy import io as sio
from pprint import pprint
from sklearn.metrics import f1_score
from dgl.data.utils import download, get_download_dir, _get_dgl_url
from dgl.nn.pytorch import GATConv

Using backend: pytorch


### 部分参数设置

In [2]:
# Hyper-parameters shared by every run; the values are stated to come from the paper.
default_configure = dict(
    lr=0.005,            # learning rate passed to the optimizer
    num_heads=[8],       # attention heads for node-level attention, one entry per HAN layer
    hidden_units=8,      # hidden size per attention head
    dropout=0.6,
    weight_decay=0.001,
    num_epochs=200,
    patience=100,        # early-stopping patience, in epochs
)

# Extra settings that setup_for_sampling() merges on top of the defaults.
sampling_configure = dict(batch_size=20)

In [3]:
def setup(args):
    """Fill ``args`` in place with the default configuration and derived fields.

    Args:
        args: dict of run options; must already contain ``'seed'`` and ``'hetero'``.

    Returns:
        The same dict, updated with ``default_configure`` plus the derived
        ``'dataset'`` and ``'device'`` entries.
    """
    # Defaults overwrite any key of the same name already present in ``args``.
    args.update(default_configure)
    set_random_seed(args['seed'])

    # The raw heterogeneous graph is used only when metapath coalescing is requested.
    if args['hetero']:
        args['dataset'] = 'ACMRaw'
    else:
        args['dataset'] = 'ACM'

    if torch.cuda.is_available():
        args['device'] = 'cuda:0'
    else:
        args['device'] = 'cpu'
    return args

In [4]:
def setup_for_sampling(args):
    """Fill ``args`` in place with the default and sampling configurations.

    Unlike ``setup``, this variant does not set ``args['dataset']``.

    Args:
        args: dict of run options. If it contains ``'seed'`` that value seeds the
            random number generators; otherwise seed 0 is used.

    Returns:
        The same dict, updated with ``default_configure``, ``sampling_configure``
        and the derived ``'device'`` entry.
    """
    args.update(default_configure)
    args.update(sampling_configure)
    # Honour a user-supplied seed (as setup() does) instead of always seeding with 0;
    # callers that pass no seed keep the previous behaviour.
    set_random_seed(args.get('seed', 0))
    args['device'] = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    return args

#### 随机种子

In [5]:
def set_random_seed(seed=0):
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    Args:
        seed: integer seed applied to every generator (default 0).
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    # Also seed the current CUDA device explicitly when a GPU is present.
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

#### 表可视化

In [6]:
def pd_show(data):
    """Return ``data`` as a pandas DataFrame for tabular display.

    Args:
        data: anything ``np.array`` accepts (nested lists, arrays, ...).

    Returns:
        A ``pd.DataFrame`` built from a NumPy copy of ``data``.
    """
    return pd.DataFrame(np.array(data))

#### 数据编码

In [7]:
def get_binary_mask(total_size, indices):
    """Build a 0/1 mask of length ``total_size`` with ones at ``indices``.

    Args:
        total_size: length of the mask.
        indices: positions to mark; anything usable to index a 1-D tensor.

    Returns:
        A ``torch.uint8`` tensor with 1 at every position in ``indices`` and 0 elsewhere.
    """
    # Allocate as uint8 directly rather than converting a float tensor afterwards.
    mask = torch.zeros(total_size, dtype=torch.uint8)
    mask[indices] = 1
    return mask

#### 图可视化

In [8]:
def img(graph):
    """Draw ``graph`` with networkx (node labels hidden) and show the figure.

    Args:
        graph: object accepted by ``nx.draw`` -- presumably a networkx graph,
            e.g. the result of ``to_networkx()`` on a DGL graph; verify against caller.
    """
    nx.draw(graph, with_labels=False)
    plt.show()
    
"""
author_g 为邻接矩阵
G = author_g.to_networkx()
img(G)
"""

'\nauthor_g 为邻接矩阵\nG = author_g.to_networkx()\nimg(G)\n'

### 最优模型保存和使用

In [9]:
class EarlyStopping(object):
    """Track validation loss/accuracy, checkpoint the model and signal when to stop.

    A checkpoint is written on the first call to ``step`` and afterwards whenever
    the new loss is no worse than the best loss AND the new accuracy is no worse
    than the best accuracy. The patience counter grows only when both metrics are
    strictly worse than their bests; any other outcome resets it.
    """

    def __init__(self, patience=10):
        """Create a stopper whose checkpoint file name embeds the creation time.

        Args:
            patience: number of consecutive "both metrics worse" steps tolerated
                before ``early_stop`` becomes True.
        """
        now = datetime.datetime.now()
        self.filename = 'early_stop_{}_{:02d}-{:02d}-{:02d}.pth'.format(
            now.date(), now.hour, now.minute, now.second)
        self.patience = patience
        self.counter = 0
        self.best_acc = None
        self.best_loss = None
        self.early_stop = False

    def step(self, loss, acc, model):
        """Record one validation result and return whether training should stop.

        Args:
            loss: validation loss of the current epoch.
            acc: validation accuracy of the current epoch.
            model: model whose ``state_dict`` is saved when a checkpoint is due.

        Returns:
            The current value of ``self.early_stop``.
        """
        # First observation: it is the best by definition.
        if self.best_loss is None:
            self.best_acc = acc
            self.best_loss = loss
            self.save_checkpoint(model)
            return self.early_stop

        # Both metrics got strictly worse: spend one unit of patience.
        if (loss > self.best_loss) and (acc < self.best_acc):
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
            return self.early_stop

        # At least one metric did not get worse; checkpoint only if neither did.
        if (loss <= self.best_loss) and (acc >= self.best_acc):
            self.save_checkpoint(model)
        # The bests are tracked independently and may come from different epochs.
        self.best_loss = np.min((loss, self.best_loss))
        self.best_acc = np.max((acc, self.best_acc))
        self.counter = 0
        return self.early_stop

    def save_checkpoint(self, model):
        """Write ``model.state_dict()`` to ``self.filename``."""
        torch.save(model.state_dict(), self.filename)

    def load_checkpoint(self, model):
        """Load the state dict stored at ``self.filename`` into ``model``."""
        model.load_state_dict(torch.load(self.filename))

### 读取数据（二选一）：ACM or ACMRaw

In [10]:
def load_acm(remove_self_loop):
    """Load the preprocessed ACM dataset from ``ACM3025.pkl`` in the DGL download dir.

    The pickle is expected to hold the keys 'label', 'feature', 'PAP', 'PLP',
    'train_idx', 'val_idx' and 'test_idx'.

    Args:
        remove_self_loop: if True, subtract the identity from both meta-path
            adjacency matrices before building the graphs.

    Returns:
        Tuple ``(gs, features, labels, num_classes, train_idx, val_idx, test_idx,
        train_mask, val_mask, test_mask)`` where ``gs`` is the list of the two
        meta-path graphs (PAP, PLP).
    """
    url = 'dataset/ACM3025.pkl'
    data_path = get_download_dir() + '/ACM3025.pkl'
    # download(_get_dgl_url(url), path=data_path)  # fetch the dataset if it is not present

    # NOTE(review): pickle.load executes arbitrary code; only use a trusted file here.
    with open(data_path, 'rb') as handle:
        data = pickle.load(handle)

    # Node labels (stored one-hot) and node features.
    one_hot_labels = torch.from_numpy(data['label'].todense()).long()
    features = torch.from_numpy(data['feature'].todense()).float()
    num_classes = one_hot_labels.shape[1]
    labels = one_hot_labels.nonzero()[:, 1]  # column of the non-zero entry = class id

    meta_path_keys = ('PAP', 'PLP')

    # Optionally strip self loops from each meta-path adjacency matrix.
    if remove_self_loop:
        num_nodes = data['label'].shape[0]
        for key in meta_path_keys:
            data[key] = sparse.csr_matrix(data[key] - np.eye(num_nodes))

    # One DGL graph per meta path.
    # (Mufei): I verified both of them are binary adjacency matrices with self loops
    author_g, subject_g = [dgl.from_scipy(data[key]) for key in meta_path_keys]
    gs = [author_g, subject_g]

    def _index_tensor(key):
        # Stored index arrays carry a leading singleton axis that squeeze(0) removes.
        return torch.from_numpy(data[key]).long().squeeze(0)

    train_idx = _index_tensor('train_idx')
    val_idx = _index_tensor('val_idx')
    test_idx = _index_tensor('test_idx')

    # Masks with 1 at the nodes of each split and 0 elsewhere.
    num_nodes = author_g.number_of_nodes()
    train_mask, val_mask, test_mask = [
        get_binary_mask(num_nodes, split_idx)
        for split_idx in (train_idx, val_idx, test_idx)
    ]

    print('dataset loaded')
    pprint({
        'dataset': 'ACM',
        'train': train_mask.sum().item() / num_nodes,
        'val': val_mask.sum().item() / num_nodes,
        'test': test_mask.sum().item() / num_nodes
    })

    return gs, features, labels, num_classes, train_idx, val_idx, test_idx, train_mask, val_mask, test_mask

In [11]:
def load_acm_raw(remove_self_loop):
    """Load the raw ACM ``.mat`` file and build a heterogeneous paper/author/field graph.

    Only papers published in the five conferences listed in ``conf_ids`` are
    kept. Paper-term counts become node features and the conference determines
    the class label. Train/val/test splits are drawn at random per conference
    (about 20% / 10% / 70%) using NumPy's global random state.

    Args:
        remove_self_loop: must be falsy; self-loop removal is not supported here.

    Returns:
        Tuple ``(hg, features, labels, num_classes, train_idx, val_idx, test_idx,
        train_mask, val_mask, test_mask)`` where ``hg`` is the DGL heterograph.

    Raises:
        AssertionError: if ``remove_self_loop`` is truthy.
    """
    assert not remove_self_loop
    url = 'dataset/ACM.mat'
    data_path = get_download_dir() + '/ACM.mat'
    # download(_get_dgl_url(url), path=data_path)

    # The file stores raw relations only; meta-path neighbours are derived later
    # from the edge types defined below.
    data = sio.loadmat(data_path)
    p_vs_l = data['PvsL']       # paper-field?
    p_vs_a = data['PvsA']       # paper-author
    p_vs_t = data['PvsT']       # paper-term, bag of words
    p_vs_c = data['PvsC']       # paper-conference, labels come from that

    # We assign
    # (1) KDD papers as class 0 (data mining),
    # (2) SIGMOD and VLDB papers as class 1 (database),
    # (3) SIGCOMM and MOBICOMM papers as class 2 (communication)
    conf_ids = [0, 1, 9, 10, 13]  # the five conferences that are kept
    label_ids = [0, 1, 2, 2, 1]   # class assigned to each of them, in the same order

    # Keep only papers that appear in at least one of the selected conferences.
    p_vs_c_filter = p_vs_c[:, conf_ids]
    p_selected = (p_vs_c_filter.sum(1) != 0).A1.nonzero()[0]
    p_vs_l = p_vs_l[p_selected]
    p_vs_a = p_vs_a[p_selected]
    p_vs_t = p_vs_t[p_selected]
    p_vs_c = p_vs_c[p_selected]

    # Heterograph with one edge type per direction of each relation, e.g.
    # ('paper', 'pa', 'author') holds the paper -> author edges and 'ap' the reverse.
    hg = dgl.heterograph({
        ('paper', 'pa', 'author'): p_vs_a.nonzero(),
        ('author', 'ap', 'paper'): p_vs_a.transpose().nonzero(),
        ('paper', 'pf', 'field'): p_vs_l.nonzero(),
        ('field', 'fp', 'paper'): p_vs_l.transpose().nonzero()
    })

    # Node features: bag-of-words term counts per paper.
    features = torch.FloatTensor(p_vs_t.toarray())

    # Labels: map each paper's conference to its class id.
    pc_p, pc_c = p_vs_c.nonzero()
    labels = np.zeros(len(p_selected), dtype=np.int64)
    for conf_id, label_id in zip(conf_ids, label_ids):
        labels[pc_p[pc_c == conf_id]] = label_id
    labels = torch.LongTensor(labels)

    num_classes = 3

    # Stratified split: within every conference, give each paper a distinct score
    # in [0, 1] (evenly spaced, randomly permuted) and threshold the scores.
    # The scores are stored per PAPER ID (via pc_p), matching how the labels are
    # indexed above. Indexing by position in the nonzero() entry list is only
    # correct when those entries happen to be ordered by row, which is not
    # guaranteed for every sparse format.
    float_mask = np.zeros(len(p_selected))
    for conf_id in conf_ids:
        papers_in_conf = pc_p[pc_c == conf_id]
        float_mask[papers_in_conf] = np.random.permutation(
            np.linspace(0, 1, len(papers_in_conf)))
    train_idx = np.where(float_mask <= 0.2)[0]
    val_idx = np.where((float_mask > 0.2) & (float_mask <= 0.3))[0]
    test_idx = np.where(float_mask > 0.3)[0]

    num_nodes = hg.number_of_nodes('paper')
    train_mask = get_binary_mask(num_nodes, train_idx)
    val_mask = get_binary_mask(num_nodes, val_idx)
    test_mask = get_binary_mask(num_nodes, test_idx)

    return hg, features, labels, num_classes, train_idx, val_idx, test_idx, train_mask, val_mask, test_mask

In [12]:
def load_data(dataset, remove_self_loop=False):
    """Dispatch to the loader for ``dataset``.

    Args:
        dataset: ``'ACM'`` (preprocessed meta-path graphs) or ``'ACMRaw'``
            (raw heterogeneous graph).
        remove_self_loop: forwarded unchanged to the selected loader.

    Returns:
        Whatever the selected loader returns: a 10-tuple of graph(s), features,
        labels, number of classes, split indices and split masks.

    Raises:
        NotImplementedError: if ``dataset`` is not one of the supported names.
    """
    if dataset == 'ACM':
        return load_acm(remove_self_loop)
    elif dataset == 'ACMRaw':
        return load_acm_raw(remove_self_loop)
    else:
        # Raise (not return) so the caller fails here rather than later with a
        # confusing unpacking error on the exception object.
        raise NotImplementedError('Unsupported dataset {}'.format(dataset))

### 配置模型（二选一）：框架现存HAN or 原生HAN

In [13]:
class SemanticAttention(nn.Module):
    """Semantic-level attention that fuses per-meta-path node embeddings.

    Every meta path gets one scalar weight (shared by all nodes); the output is
    the weighted sum of the per-meta-path embeddings.
    """

    def __init__(self, in_size, hidden_size=128):
        """
        Args:
            in_size: size of each per-meta-path node embedding.
            hidden_size: width of the hidden projection used for scoring.
        """
        super().__init__()
        # Maps (N, M, in_size) embeddings to one unnormalised score per (node, meta path).
        scoring_layers = [
            nn.Linear(in_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, 1, bias=False),
        ]
        self.project = nn.Sequential(*scoring_layers)

    def forward(self, z):
        """Fuse embeddings ``z`` of shape (N, M, D) into a tensor of shape (N, D)."""
        path_scores = self.project(z).mean(dim=0)       # (M, 1): score averaged over nodes
        path_weights = torch.softmax(path_scores, dim=0)  # (M, 1): normalised across meta paths
        # Broadcasting the (1, M, 1) weights over the node axis gives every node the same weights.
        weighted = path_weights.unsqueeze(0) * z        # (N, M, D)
        return weighted.sum(dim=1)                      # (N, D)

#### 框架现存HAN

In [14]:
class HANLayer(nn.Module):
    """One HAN layer over precomputed meta-path graphs.

    Node-level attention (one GATConv per meta path) is followed by semantic
    attention across the meta paths.
    """

    def __init__(self, num_meta_paths, in_size, out_size, layer_num_heads, dropout):
        """
        Args:
            num_meta_paths: number of meta-path graphs passed to ``forward``.
            in_size: input feature size.
            out_size: output size per attention head.
            layer_num_heads: number of attention heads in each GATConv.
            dropout: rate used for both feature and attention dropout.
        """
        super().__init__()

        # One GAT layer for each meta path based adjacency matrix; all share the same sizes.
        self.gat_layers = nn.ModuleList([
            GATConv(in_size, out_size, layer_num_heads,
                    feat_drop=dropout, attn_drop=dropout, activation=F.elu)
            for _ in range(num_meta_paths)
        ])
        # The heads are flattened, so semantic attention sees out_size * layer_num_heads features.
        self.semantic_attention = SemanticAttention(in_size=out_size * layer_num_heads)
        self.num_meta_paths = num_meta_paths

    def forward(self, gs, h):
        """Return fused node embeddings of shape (N, D * K).

        Args:
            gs: iterable of meta-path graphs, in the same order as the GAT layers.
            h: node feature tensor fed to every GAT layer.
        """
        # Node-level attention per meta path; flatten(1) merges the head and feature axes.
        per_path_embeddings = [
            self.gat_layers[path_index](graph, h).flatten(1)
            for path_index, graph in enumerate(gs)
        ]
        stacked = torch.stack(per_path_embeddings, dim=1)  # (N, M, D * K)
        # Semantic-level attention collapses the meta-path axis.
        return self.semantic_attention(stacked)            # (N, D * K)

class HAN(nn.Module):
    """Stack of ``HANLayer`` modules followed by a linear classifier."""

    def __init__(self, num_meta_paths, in_size, hidden_size, out_size, num_heads, dropout):
        """
        Args:
            num_meta_paths: number of meta-path graphs.
            in_size: input feature size.
            hidden_size: hidden size per attention head.
            out_size: number of output classes.
            num_heads: list with the number of heads of each layer; its length
                sets the number of layers.
            dropout: dropout rate passed to every layer.
        """
        super().__init__()

        # Layer i consumes the flattened multi-head output of layer i - 1.
        layer_in_sizes = [in_size] + [hidden_size * heads for heads in num_heads[:-1]]
        self.layers = nn.ModuleList([
            HANLayer(num_meta_paths, layer_in, hidden_size, heads, dropout)
            for layer_in, heads in zip(layer_in_sizes, num_heads)
        ])
        # Classifier on top of the last layer's flattened heads.
        self.predict = nn.Linear(hidden_size * num_heads[-1], out_size)

    def forward(self, g, h):
        """Run every HAN layer on ``(g, h)`` in turn and return the class logits."""
        embedding = h
        for layer in self.layers:
            embedding = layer(g, embedding)
        return self.predict(embedding)

#### 原生HAN

In [15]:
class Or_HANLayer(nn.Module):
    """One HAN layer that derives its meta-path graphs from a heterograph.

    The meta-path reachable graphs are computed on first use and cached for as
    long as ``forward`` keeps receiving the same graph object.
    """

    def __init__(self, meta_paths, in_size, out_size, layer_num_heads, dropout):
        """
        Args:
            meta_paths: list of meta paths, each a sequence of edge-type names.
            in_size: input feature size.
            out_size: output size per attention head.
            layer_num_heads: number of attention heads in each GATConv.
            dropout: rate used for both feature and attention dropout.
        """
        super().__init__()

        # One GAT layer for each meta path based adjacency matrix
        self.gat_layers = nn.ModuleList([
            GATConv(in_size, out_size, layer_num_heads,
                    feat_drop=dropout, attn_drop=dropout, activation=F.elu,
                    allow_zero_in_degree=True)
            for _ in range(len(meta_paths))
        ])
        self.semantic_attention = SemanticAttention(in_size=out_size * layer_num_heads)
        # Tuples are hashable, so each meta path can serve as a cache key.
        self.meta_paths = [tuple(meta_path) for meta_path in meta_paths]

        self._cached_graph = None
        self._cached_coalesced_graph = {}

    def forward(self, g, h):
        """Return fused node embeddings of shape (N, D * K) for heterograph ``g``."""
        # Rebuild the cache on first use or when a different graph object arrives.
        cache_is_stale = self._cached_graph is None or self._cached_graph is not g
        if cache_is_stale:
            self._cached_graph = g
            self._cached_coalesced_graph.clear()
            for meta_path in self.meta_paths:
                reachable = dgl.metapath_reachable_graph(g, meta_path)
                self._cached_coalesced_graph[meta_path] = reachable

        # Node-level attention on each meta-path neighbour graph.
        per_path_embeddings = [
            self.gat_layers[path_index](self._cached_coalesced_graph[meta_path], h).flatten(1)
            for path_index, meta_path in enumerate(self.meta_paths)
        ]
        stacked = torch.stack(per_path_embeddings, dim=1)  # (N, M, D * K)

        return self.semantic_attention(stacked)            # (N, D * K)

class Or_HAN(nn.Module):
    """Stack of ``Or_HANLayer`` modules followed by a linear classifier."""

    def __init__(self, meta_paths, in_size, hidden_size, out_size, num_heads, dropout):
        """
        Args:
            meta_paths: list of meta paths, each a sequence of edge-type names.
            in_size: input feature size.
            hidden_size: hidden size per attention head.
            out_size: number of output classes.
            num_heads: list with the number of heads of each layer; its length
                sets the number of layers.
            dropout: dropout rate passed to every layer.
        """
        super().__init__()

        # Layer i consumes the flattened multi-head output of layer i - 1.
        layer_in_sizes = [in_size] + [hidden_size * heads for heads in num_heads[:-1]]
        self.layers = nn.ModuleList([
            Or_HANLayer(meta_paths, layer_in, hidden_size, heads, dropout)
            for layer_in, heads in zip(layer_in_sizes, num_heads)
        ])
        self.predict = nn.Linear(hidden_size * num_heads[-1], out_size)

    def forward(self, g, h):
        """Run every layer on ``(g, h)`` in turn and return the class logits."""
        embedding = h
        for layer in self.layers:
            embedding = layer(g, embedding)

        return self.predict(embedding)

#### 新增参数

In [16]:
# Build the option dict from argparse defaults; args=[] makes argparse ignore the
# kernel's own command line so every option takes its default.
parser = argparse.ArgumentParser('HAN')
parser.add_argument(
    '-s', '--seed',
    type=int, default=1, help='Random seed')
parser.add_argument(
    '-ld', '--log-dir',
    type=str, default='results', help='Dir for saving training results')
parser.add_argument(
    '--hetero',
    action='store_true', help='Use metapath coalescing with DGL\'s own dataset')
args = vars(parser.parse_args(args=[]))
args['hetero'] = False  # False -> preprocessed 'ACM' graphs; True -> raw heterograph ('ACMRaw')
args = setup(args)

In [17]:
# Load the dataset chosen by setup(). `g` is a list of meta-path graphs for 'ACM'
# and a single heterograph for 'ACMRaw'.
g, features, labels, num_classes, train_idx, val_idx, test_idx, train_mask, val_mask, test_mask = load_data(args['dataset'])

dataset loaded
{'dataset': 'ACM',
 'test': 0.7024793388429752,
 'train': 0.19834710743801653,
 'val': 0.09917355371900827}


In [18]:
# len(g)

In [19]:
# Convert the masks to boolean tensors when this torch build provides them.
if hasattr(torch, 'BoolTensor'):
    train_mask, val_mask, test_mask = (
        split_mask.bool() for split_mask in (train_mask, val_mask, test_mask)
    )

In [20]:
# Move every tensor used in training to the selected device when a GPU is available.
if torch.cuda.is_available():
    features, labels, train_mask, val_mask, test_mask = (
        tensor.to(args['device'])
        for tensor in (features, labels, train_mask, val_mask, test_mask)
    )

In [21]:
args

{'seed': 1,
 'log_dir': 'results',
 'hetero': False,
 'lr': 0.005,
 'num_heads': [8],
 'hidden_units': 8,
 'dropout': 0.6,
 'weight_decay': 0.001,
 'num_epochs': 200,
 'patience': 100,
 'dataset': 'ACM',
 'device': 'cpu'}

In [22]:
# Pick the model variant that matches how the graph was loaded.
if not args['hetero']:
    # `g` is a list of precomputed meta-path graphs: one GAT per graph.
    model = HAN(
        num_meta_paths=len(g),
        in_size=features.shape[1],
        hidden_size=args['hidden_units'],
        out_size=num_classes,
        num_heads=args['num_heads'],
        dropout=args['dropout'],
    ).to(args['device'])
    g = [graph.to(args['device']) for graph in g]
else:
    # `g` is a heterograph: meta paths are composed from its edge types,
    # e.g. 'pa' followed by 'ap' gives paper-author-paper.
    model = Or_HAN(
        meta_paths=[['pa', 'ap'], ['pf', 'fp']],
        in_size=features.shape[1],
        hidden_size=args['hidden_units'],
        out_size=num_classes,
        num_heads=args['num_heads'],
        dropout=args['dropout'],
    ).to(args['device'])
    g = g.to(args['device'])

- **TODO**: 待确认——`layers` 层（HANLayer）是否是对节点进行编码？

In [23]:
model

HAN(
  (layers): ModuleList(
    (0): HANLayer(
      (gat_layers): ModuleList(
        (0): GATConv(
          (fc): Linear(in_features=1870, out_features=64, bias=False)
          (feat_drop): Dropout(p=0.6, inplace=False)
          (attn_drop): Dropout(p=0.6, inplace=False)
          (leaky_relu): LeakyReLU(negative_slope=0.2)
        )
        (1): GATConv(
          (fc): Linear(in_features=1870, out_features=64, bias=False)
          (feat_drop): Dropout(p=0.6, inplace=False)
          (attn_drop): Dropout(p=0.6, inplace=False)
          (leaky_relu): LeakyReLU(negative_slope=0.2)
        )
      )
      (semantic_attention): SemanticAttention(
        (project): Sequential(
          (0): Linear(in_features=64, out_features=128, bias=True)
          (1): Tanh()
          (2): Linear(in_features=128, out_features=1, bias=False)
        )
      )
    )
  )
  (predict): Linear(in_features=64, out_features=3, bias=True)
)

In [24]:
# Early-stopping helper; its checkpoint file name is fixed at creation time.
stopper = EarlyStopping(patience=args['patience'])

In [25]:
# Cross-entropy on the class logits, optimised with Adam using the configured
# learning rate and weight decay.
loss_fcn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'], weight_decay=args['weight_decay'])

### 准确率 & 评估

In [26]:
def score(logits, labels):
    """Compute accuracy, micro-F1 and macro-F1 of the arg-max predictions.

    Args:
        logits: tensor of per-class scores, one row per sample.
        labels: tensor of true class ids, one per sample.

    Returns:
        Tuple ``(accuracy, micro_f1, macro_f1)``.
    """
    # Predicted class = index of the largest logit in each row.
    predicted_class = torch.max(logits, dim=1)[1]
    y_pred = predicted_class.long().cpu().numpy()
    y_true = labels.cpu().numpy()

    accuracy = (y_pred == y_true).sum() / len(y_pred)
    micro_f1, macro_f1 = [
        f1_score(y_true, y_pred, average=averaging)
        for averaging in ('micro', 'macro')
    ]

    return accuracy, micro_f1, macro_f1

In [27]:
def evaluate(model, g, features, labels, mask, loss_func):
    """Evaluate ``model`` on the nodes selected by ``mask``.

    Puts the model in eval mode and runs the forward pass without gradient tracking.

    Args:
        model: network called as ``model(g, features)``.
        g: graph input forwarded to the model.
        features: node feature tensor.
        labels: tensor of true class ids for all nodes.
        mask: index/mask selecting the nodes to evaluate.
        loss_func: callable ``loss_func(logits, labels)`` returning the loss.

    Returns:
        Tuple ``(loss, accuracy, micro_f1, macro_f1)``.
    """
    model.eval()
    with torch.no_grad():
        all_logits = model(g, features)

    selected_logits = all_logits[mask]
    selected_labels = labels[mask]
    loss = loss_func(selected_logits, selected_labels)
    accuracy, micro_f1, macro_f1 = score(selected_logits, selected_labels)

    return loss, accuracy, micro_f1, macro_f1

### 训练网络

In [28]:
# Full-batch training: one forward/backward pass over the whole graph per epoch,
# with the loss restricted to the training nodes.
for epoch in range(args['num_epochs']):
    model.train()
    logits = model(g, features)
    loss = loss_fcn(logits[train_mask], labels[train_mask])
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    # Training metrics reuse the logits computed before this epoch's parameter update.
    train_acc, train_micro_f1, train_macro_f1 = score(logits[train_mask], labels[train_mask])
    # Validation runs after the update; evaluate() switches the model to eval mode.
    val_loss, val_acc, val_micro_f1, val_macro_f1 = evaluate(model, g, features, labels, val_mask, loss_fcn)
    # The stopper saves a checkpoint when appropriate and reports whether patience ran out.
    early_stop = stopper.step(val_loss.data.item(), val_acc, model)
    
    # Note: val_acc is used for early stopping but is not included in the log line.
    print('Epoch {:d} | Train Loss {:.4f} | Train Acc {:.4f} | Train Micro f1 {:.4f} | Train Macro f1 {:.4f} | '
              'Val Loss {:.4f} | Val Micro f1 {:.4f} | Val Macro f1 {:.4f}'.format(epoch + 1, loss.item(),
                 train_acc, train_micro_f1, train_macro_f1, val_loss.item(), val_micro_f1, val_macro_f1))

    if early_stop:
        break

  'precision', 'predicted', average, warn_for)


Epoch 1 | Train Loss 1.1285 | Train Acc 0.2300 | Train Micro f1 0.2300 | Train Macro f1 0.2224 | Val Loss 1.0002 | Val Micro f1 0.6500 | Val Macro f1 0.5208
Epoch 2 | Train Loss 0.9952 | Train Acc 0.6133 | Train Micro f1 0.6133 | Train Macro f1 0.5156 | Val Loss 0.8695 | Val Micro f1 0.9200 | Val Macro f1 0.9205
Epoch 3 | Train Loss 0.8612 | Train Acc 0.9050 | Train Micro f1 0.9050 | Train Macro f1 0.9053 | Val Loss 0.7423 | Val Micro f1 0.9400 | Val Macro f1 0.9405
Epoch 4 | Train Loss 0.7244 | Train Acc 0.9217 | Train Micro f1 0.9217 | Train Macro f1 0.9218 | Val Loss 0.6140 | Val Micro f1 0.9300 | Val Macro f1 0.9301
Epoch 5 | Train Loss 0.5860 | Train Acc 0.9383 | Train Micro f1 0.9383 | Train Macro f1 0.9380 | Val Loss 0.4972 | Val Micro f1 0.9400 | Val Macro f1 0.9397
Epoch 6 | Train Loss 0.4704 | Train Acc 0.9433 | Train Micro f1 0.9433 | Train Macro f1 0.9428 | Val Loss 0.3898 | Val Micro f1 0.9533 | Val Macro f1 0.9532
Epoch 7 | Train Loss 0.3744 | Train Acc 0.9517 | Train Mic

EarlyStopping counter: 26 out of 100
Epoch 48 | Train Loss 0.0455 | Train Acc 0.9883 | Train Micro f1 0.9883 | Train Macro f1 0.9883 | Val Loss 0.0999 | Val Micro f1 0.9700 | Val Macro f1 0.9700
EarlyStopping counter: 27 out of 100
Epoch 49 | Train Loss 0.0363 | Train Acc 0.9883 | Train Micro f1 0.9883 | Train Macro f1 0.9883 | Val Loss 0.1042 | Val Micro f1 0.9633 | Val Macro f1 0.9633
EarlyStopping counter: 28 out of 100
Epoch 50 | Train Loss 0.0450 | Train Acc 0.9867 | Train Micro f1 0.9867 | Train Macro f1 0.9867 | Val Loss 0.1143 | Val Micro f1 0.9667 | Val Macro f1 0.9666
EarlyStopping counter: 29 out of 100
Epoch 51 | Train Loss 0.0359 | Train Acc 0.9933 | Train Micro f1 0.9933 | Train Macro f1 0.9933 | Val Loss 0.1286 | Val Micro f1 0.9533 | Val Macro f1 0.9533
EarlyStopping counter: 30 out of 100
Epoch 52 | Train Loss 0.0317 | Train Acc 0.9917 | Train Micro f1 0.9917 | Train Macro f1 0.9917 | Val Loss 0.1286 | Val Micro f1 0.9533 | Val Macro f1 0.9533
EarlyStopping counter: 31

EarlyStopping counter: 69 out of 100
Epoch 91 | Train Loss 0.0344 | Train Acc 0.9933 | Train Micro f1 0.9933 | Train Macro f1 0.9933 | Val Loss 0.1145 | Val Micro f1 0.9567 | Val Macro f1 0.9567
EarlyStopping counter: 70 out of 100
Epoch 92 | Train Loss 0.0310 | Train Acc 0.9933 | Train Micro f1 0.9933 | Train Macro f1 0.9933 | Val Loss 0.0968 | Val Micro f1 0.9533 | Val Macro f1 0.9534
EarlyStopping counter: 71 out of 100
Epoch 93 | Train Loss 0.0297 | Train Acc 0.9933 | Train Micro f1 0.9933 | Train Macro f1 0.9933 | Val Loss 0.0921 | Val Micro f1 0.9700 | Val Macro f1 0.9700
EarlyStopping counter: 72 out of 100
Epoch 94 | Train Loss 0.0305 | Train Acc 0.9950 | Train Micro f1 0.9950 | Train Macro f1 0.9950 | Val Loss 0.0967 | Val Micro f1 0.9667 | Val Macro f1 0.9666
EarlyStopping counter: 73 out of 100
Epoch 95 | Train Loss 0.0329 | Train Acc 0.9933 | Train Micro f1 0.9933 | Train Macro f1 0.9933 | Val Loss 0.1062 | Val Micro f1 0.9533 | Val Macro f1 0.9533
EarlyStopping counter: 74

In [29]:
# Restore the weights from the checkpoint written by the stopper during training.
stopper.load_checkpoint(model)

In [30]:
# Evaluate the restored model on the held-out test nodes.
test_loss, test_acc, test_micro_f1, test_macro_f1 = evaluate(model, g, features, labels, test_mask, loss_fcn)

In [32]:
# Report the final test metrics.
print('Test loss {:.4f} | Test Acc {:.4f} | Test Micro f1 {:.4f} | Test Macro f1 {:.4f}'.format(test_loss.item(), test_acc, test_micro_f1, test_macro_f1))

Test loss 0.3348 | Test Acc 0.8847 | Test Micro f1 0.8847 | Test Macro f1 0.8849
