In [1]:
import torch

def format_pytorch_version(version):
  """Return the release part of a PyTorch version string.

  Everything from the first ``+`` onward (the local build tag, e.g.
  ``+cu113``) is dropped; a string without ``+`` is returned unchanged.
  """
  release, _, _ = version.partition('+')
  return release

# Version of the installed PyTorch, with any '+<build tag>' suffix removed so
# it can be interpolated into the wheel-index URLs of the `!pip` lines below.
TORCH_version = torch.__version__
TORCH = format_pytorch_version(TORCH_version)

def format_cuda_version(version):
  """Convert a CUDA version string into the tag used in PyG wheel URLs.

  Args:
    version: CUDA version such as ``'11.3'``, or ``None`` -- which is what
      ``torch.version.cuda`` holds on CPU-only PyTorch builds.

  Returns:
    ``'cu'`` followed by the version with the dots removed
    (``'11.3'`` -> ``'cu113'``), or ``'cpu'`` when ``version`` is ``None``.
  """
  if version is None:
    # Calling .replace on None raised AttributeError on CPU-only builds; the
    # PyG wheel index publishes those builds under the 'cpu' tag.
    return 'cpu'
  return 'cu' + version.replace('.', '')

# CUDA version reported by the installed PyTorch build, converted to the tag
# form used in the wheel-index URLs below.
CUDA_version = torch.version.cuda
CUDA = format_cuda_version(CUDA_version)

# Install the PyG companion packages from the wheel index that matches the
# detected torch/CUDA pair ({TORCH} and {CUDA} are interpolated by IPython),
# then torch-geometric itself from the default package index.
# NOTE(review): none of these installs pins a version.
!pip install torch-scatter -f https://data.pyg.org/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-cluster -f https://data.pyg.org/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-spline-conv -f https://data.pyg.org/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-geometric

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-1.12.1+cu113.html
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-1.12.1+cu113.html
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-1.12.1+cu113.html
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-1.12.1+cu113.html
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# ogb provides the dataset and evaluator classes imported in the next cell.
# NOTE(review): umap-learn is installed here but is not imported by the import
# cell below -- confirm whether it is still needed.
!pip install ogb
!pip install umap-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from typing import Callable, Optional
from torch_geometric.typing import Adj, OptTensor

import torch
from torch import Tensor
import torch.nn.functional as F
from torch_sparse import SparseTensor, matmul
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.nn.conv.gcn_conv import gcn_norm

import torch.nn as nn
from torch.nn import ModuleList, Linear, BatchNorm1d
import torch.optim as optim
import numpy as np

from torch_geometric.nn import GCNConv, GATConv, SAGEConv, JumpingKnowledge
from torch_geometric.data import NeighborSampler
import torch_geometric.transforms as T

from ogb.nodeproppred import PygNodePropPredDataset, Evaluator
from tqdm import tqdm

from torch_geometric.utils import to_undirected
from torch_sparse import SparseTensor

In [4]:
class Logger(object):
    """Collect ``(train, valid, test)`` score triples per run and print them.

    ``results[r]`` is the list of triples recorded for run ``r``. All values
    are multiplied by 100 when printed. "Final" figures are the ones taken at
    the entry with the highest validation score.
    """

    def __init__(self, runs, info=None):
        """Create one empty result list for each of the ``runs`` runs."""
        self.info = info
        self.results = [[] for _ in range(runs)]

    def add_result(self, run, result):
        """Append a 3-element ``result`` to run number ``run`` (0-based).

        Raises:
            AssertionError: if ``result`` does not have exactly three
                elements or ``run`` is not a valid run index.
        """
        assert len(result) == 3
        assert 0 <= run < len(self.results)
        self.results[run].append(result)

    def print_statistics(self, run=None, print_all=True):
        """Print a summary for a single run, or across all runs.

        Args:
            run: 0-based run index; ``None`` summarises every run as
                mean ± standard deviation.
            print_all: only used when ``run`` is ``None``; when false, just
                the "Final Test" line is printed after the header.
        """
        if run is None:
            scores = 100 * torch.tensor(self.results)

            # One (highest train, highest valid, train@best, test@best) row
            # per run, where "best" is the entry with the top valid score.
            summary_rows = []
            for per_run in scores:
                best_entry = per_run[:, 1].argmax()
                summary_rows.append((
                    per_run[:, 0].max().item(),
                    per_run[:, 1].max().item(),
                    per_run[best_entry, 0].item(),
                    per_run[best_entry, 2].item(),
                ))
            summary = torch.tensor(summary_rows)

            print('All runs:', flush=True)
            if print_all:
                col = summary[:, 0]
                print(f'Highest Train: {col.mean():.2f} ± {col.std():.2f}', flush=True)
                col = summary[:, 1]
                print(f'Highest Valid: {col.mean():.2f} ± {col.std():.2f}', flush=True)
                col = summary[:, 2]
                print(f'  Final Train: {col.mean():.2f} ± {col.std():.2f}', flush=True)
            col = summary[:, 3]
            print(f'   Final Test: {col.mean():.2f} ± {col.std():.2f}', flush=True)
            return

        scores = 100 * torch.tensor(self.results[run])
        best_entry = scores[:, 1].argmax().item()
        print(f'Run {run + 1:02d}:', flush=True)
        print(f'Highest Train: {scores[:, 0].max():.2f}', flush=True)
        print(f'Highest Valid: {scores[:, 1].max():.2f}', flush=True)
        print(f'  Final Train: {scores[best_entry, 0]:.2f}', flush=True)
        print(f'   Final Test: {scores[best_entry, 2]:.2f}', flush=True)

In [5]:
class LabelPropagation(MessagePassing):
    """Repeatedly propagate a node signal over a graph with sum aggregation.

    Each of the ``num_layers`` steps replaces the current signal ``out`` by
    ``alpha * propagate(out) + (1 - alpha) * initial`` and then passes it
    through ``post_step``.
    """

    def __init__(self, num_layers: int, alpha: float):
        super().__init__(aggr='add')
        self.alpha = alpha
        self.num_layers = num_layers

    @torch.no_grad()
    def forward(
            self, y: Tensor, edge_index: Adj, mask: Optional[Tensor] = None,
            edge_weight: OptTensor = None,
            post_step: Callable = lambda y: y.clamp_(0., 1.)
    ) -> Tensor:
        """Propagate ``y`` and return the signal after the last step.

        Args:
            y: signal to propagate; a ``torch.long`` tensor is flattened and
                one-hot encoded to float first.
            edge_index: graph connectivity, either a ``SparseTensor`` or an
                index ``Tensor``. When no weights are available (a
                ``SparseTensor`` without values, or an index tensor with
                ``edge_weight=None``) it is normalised with ``gcn_norm``
                without adding self-loops.
            mask: optional index/mask; when given, only the selected rows of
                ``y`` seed the propagation and all other rows start at zero.
            edge_weight: optional edge weights for an index-tensor graph.
            post_step: callable applied to the signal after every step
                (defaults to an in-place clamp to ``[0, 1]``).
        """
        if y.dtype == torch.long:
            y = F.one_hot(y.view(-1)).to(torch.float)

        if mask is None:
            signal = y
        else:
            signal = torch.zeros_like(y)
            signal[mask] = y[mask]

        if isinstance(edge_index, SparseTensor) and not edge_index.has_value():
            edge_index = gcn_norm(edge_index, add_self_loops=False)
        elif isinstance(edge_index, Tensor) and edge_weight is None:
            edge_index, edge_weight = gcn_norm(edge_index,
                                               num_nodes=y.size(0),
                                               add_self_loops=False)

        # Share of the initial signal that is re-added after every step.
        residual = (1 - self.alpha) * signal
        for _ in range(self.num_layers):
            # propagate_type: (y: Tensor, edge_weight: OptTensor)
            signal = self.propagate(edge_index, x=signal,
                                    edge_weight=edge_weight, size=None)
            signal.mul_(self.alpha).add_(residual)
            signal = post_step(signal)

        return signal

    def message(self, x_j: Tensor, edge_weight: OptTensor) -> Tensor:
        """Return the source-node signal, scaled per edge when weights exist."""
        if edge_weight is None:
            return x_j
        return edge_weight.view(-1, 1) * x_j

    def message_and_aggregate(self, adj_t: SparseTensor, x: Tensor) -> Tensor:
        """Sparse-matrix path: one ``matmul`` using the configured reduction."""
        return matmul(adj_t, x, reduce=self.aggr)

    def __repr__(self):
        return (f'{self.__class__.__name__}'
                f'(num_layers={self.num_layers}, alpha={self.alpha})')

In [6]:
class CorrectAndSmooth(torch.nn.Module):
    """Post-process soft predictions in two propagation stages.

    ``correct`` propagates the residual between true labels and predictions
    on the labelled nodes and adds the (scaled) result to the predictions.
    ``smooth`` overwrites the labelled rows with the true labels and
    propagates the result. Each stage uses its own ``LabelPropagation``.

    Args:
        num_correction_layers: propagation steps of the correction stage.
        correction_alpha: ``alpha`` of the correction stage.
        num_smoothing_layers: propagation steps of the smoothing stage.
        smoothing_alpha: ``alpha`` of the smoothing stage.
        autoscale: if true, ``correct`` derives a per-node scale from the
            data; otherwise the fixed ``scale`` is used.
        scale: multiplier for the propagated error when ``autoscale`` is
            false.
    """

    def __init__(self, num_correction_layers: int, correction_alpha: float,
                 num_smoothing_layers: int, smoothing_alpha: float,
                 autoscale: bool = True, scale: float = 1.0):
        super().__init__()
        self.autoscale = autoscale
        self.scale = scale

        self.prop1 = LabelPropagation(num_correction_layers, correction_alpha)
        self.prop2 = LabelPropagation(num_smoothing_layers, smoothing_alpha)

    def correct(self, y_soft: Tensor, y_true: Tensor, mask: Tensor,
                edge_index: Adj, edge_weight: OptTensor = None) -> Tensor:
        """Return ``y_soft`` plus the scaled, propagated training error.

        Args:
            y_soft: soft predictions, one row per node; the rows must sum to
                (approximately) one on average.
            y_true: labels of the nodes selected by ``mask``; ``torch.long``
                labels are one-hot encoded to ``y_soft``'s width and dtype.
            mask: boolean mask or index tensor selecting the labelled nodes.
            edge_index: graph connectivity handed to the propagation.
            edge_weight: optional edge weights handed to the propagation.

        Raises:
            AssertionError: if ``y_soft`` is not normalised as described or
                ``y_true`` does not have one row per selected node.
        """
        assert abs((float(y_soft.sum()) / y_soft.size(0)) - 1.0) < 1e-2

        numel = int(mask.sum()) if mask.dtype == torch.bool else mask.size(0)
        assert y_true.size(0) == numel

        if y_true.dtype == torch.long:
            y_true = F.one_hot(y_true.view(-1), y_soft.size(-1))
            y_true = y_true.to(y_soft.dtype)

        # Residual is known only on the labelled rows; all others start at 0.
        error = torch.zeros_like(y_soft)
        error[mask] = y_true - y_soft[mask]

        if self.autoscale:
            smoothed_error = self.prop1(error, edge_index,
                                        edge_weight=edge_weight,
                                        post_step=lambda x: x.clamp_(-1., 1.))

            # Match each node's propagated error mass to the mean absolute
            # error on the labelled nodes; infinite or very large ratios
            # fall back to a scale of 1.
            sigma = error[mask].abs().sum() / numel
            scale = sigma / smoothed_error.abs().sum(dim=1, keepdim=True)
            scale[scale.isinf() | (scale > 1000)] = 1.0
            return y_soft + scale * smoothed_error
        else:

            def fix_input(x):
                # Re-pin the known residuals after every propagation step.
                x[mask] = error[mask]
                return x

            smoothed_error = self.prop1(error, edge_index,
                                        edge_weight=edge_weight,
                                        post_step=fix_input)
            return y_soft + self.scale * smoothed_error

    def smooth(self, y_soft: Tensor, y_true: Tensor, mask: Tensor,
               edge_index: Adj, edge_weight: OptTensor = None) -> Tensor:
        """Replace the labelled rows by the true labels and propagate.

        The caller's ``y_soft`` is left untouched: the labels are written
        into a copy. Arguments are as for :meth:`correct`.

        Raises:
            AssertionError: if ``y_true`` does not have one row per node
                selected by ``mask``.
        """
        numel = int(mask.sum()) if mask.dtype == torch.bool else mask.size(0)
        assert y_true.size(0) == numel

        if y_true.dtype == torch.long:
            y_true = F.one_hot(y_true.view(-1), y_soft.size(-1))
            y_true = y_true.to(y_soft.dtype)

        # Clone first: assigning into `y_soft` directly would silently
        # overwrite the tensor the caller passed in.
        y_soft = y_soft.clone()
        y_soft[mask] = y_true

        return self.prop2(y_soft, edge_index, edge_weight=edge_weight)

    def __repr__(self):
        L1, alpha1 = self.prop1.num_layers, self.prop1.alpha
        L2, alpha2 = self.prop2.num_layers, self.prop2.alpha
        return (f'{self.__class__.__name__}(\n'
                f'    correct: num_layers={L1}, alpha={alpha1}\n'
                f'    smooth:  num_layers={L2}, alpha={alpha2}\n'
                f'    autoscale={self.autoscale}, scale={self.scale}\n'
                ')')

## Dataset

In [7]:
# Instantiate the OGB node-property-prediction dataset 'ogbn-products', using
# ./products/ as its root directory.
dataset = PygNodePropPredDataset(name='ogbn-products', root='./products/')
print(dataset, flush=True)

PygNodePropPredDataset()


In [8]:
# The first dataset entry is the graph object used by the rest of the notebook.
data = dataset[0]
print(data, flush=True)

Data(num_nodes=2449029, edge_index=[2, 123718280], x=[2449029, 100], y=[2449029, 1])


In [9]:
# split_idx is a dictionary holding the train, validation and test node
# indices (keys 'train', 'valid' and 'test').
split_idx = dataset.get_idx_split()
# Predefined OGB evaluator, used below to score predictions for each split.
evaluator = Evaluator(name='ogbn-products')

In [10]:
# Let's check how many node ids fall into the train, validation and test splits.
print('Number of training nodes:', split_idx['train'].size(0))
print('Number of validation nodes:', split_idx['valid'].size(0))
print('Number of test nodes:', split_idx['test'].size(0))

Number of training nodes: 196615
Number of validation nodes: 39323
Number of test nodes: 2213091


In [11]:
# Let's check some basic statistics of the ogbn-products graph.
print("Number of nodes in the graph:", data.num_nodes)
print("Number of edges in the graph:", data.num_edges)
print("---------------------------------------------")
print("Node feature matrix with shape:", data.x.shape) # [num_nodes, num_node_features]
print("Graph connectivity in COO format with shape:", data.edge_index.shape) # [2, num_edges]
print("Target to train against :", data.y.shape) 
print("Node feature length", dataset.num_features)

Number of nodes in the graph: 2449029
Number of edges in the graph: 123718280
---------------------------------------------
Node feature matrix with shape: torch.Size([2449029, 100])
Graph connectivity in COO format with shape: torch.Size([2, 123718280])
Target to train against : torch.Size([2449029, 1])
Node feature length 100


In [12]:
# Split indices as returned by the dataset; `train_idx` seeds the training
# loader below. Both names are re-bound to device copies in a later cell.
train_idx = split_idx['train']
test_idx = split_idx['test']

In [None]:
# Mini-batch loader for training: shuffled batches of 1024 training nodes.
# `sizes` has one entry per hop (three here), matching the 3-layer model; each
# entry is the number of neighbours sampled at that hop.
# NOTE(review): num_workers=12 is hard-coded -- confirm it suits the host.
train_loader = NeighborSampler(data.edge_index, node_idx=train_idx,
                               sizes=[15, 10, 5], batch_size=1024,
                               shuffle=True, num_workers=12)



In [14]:
# Loader used by SAGE.inference: iterates over all nodes (node_idx=None) in
# order, one hop at a time; sizes=[-1] requests every neighbour rather than a
# sample.
subgraph_loader = NeighborSampler(data.edge_index, node_idx=None, sizes=[-1],
                                  batch_size=1024, shuffle=False,
                                  num_workers=12)

In [15]:
class SAGE(torch.nn.Module):
    """GraphSAGE node classifier: a stack of ``SAGEConv`` layers.

    Every layer except the last is followed by batch normalisation and ReLU
    (plus dropout while training).

    Args:
        dataset: object exposing ``num_node_features`` and ``num_classes``.
        hidden_channels: width of the hidden layers.
        num_layers: number of convolutions applied. The constructor always
            builds an input and an output convolution, so values below 2 do
            not match the layer count assumed by ``forward``/``inference``.
    """

    def __init__(self, dataset, hidden_channels, num_layers=3):
        super().__init__()

        self.num_layers = num_layers
        self.convs = nn.ModuleList()
        self.bns = nn.ModuleList()

        self.convs.append(SAGEConv(dataset.num_node_features, hidden_channels))
        self.bns.append(nn.BatchNorm1d(hidden_channels))

        for _ in range(self.num_layers - 2):
            self.convs.append(SAGEConv(hidden_channels, hidden_channels))
            self.bns.append(nn.BatchNorm1d(hidden_channels))

        self.convs.append(SAGEConv(hidden_channels, dataset.num_classes))

    def reset_parameters(self):
        """Re-initialise every convolution and batch-norm layer."""
        for conv in self.convs:
            conv.reset_parameters()
        for bn in self.bns:
            bn.reset_parameters()

    def forward(self, x, adjs):
        """Return log-probabilities for the target nodes of a sampled batch.

        Args:
            x: features of all nodes sampled for the batch.
            adjs: one ``(edge_index, e_id, size)`` triple per layer.
        """
        # `train_loader` computes the k-hop neighborhood of a batch of nodes,
        # and returns, for each layer, a bipartite graph object, holding the
        # bipartite edges `edge_index`, the index `e_id` of the original edges,
        # and the size/shape `size` of the bipartite graph.
        # Target nodes are also included in the source nodes so that one can
        # easily apply skip-connections or add self-loops.
        for i, (adj_t, _, size) in enumerate(adjs):
            x_target = x[:size[1]]  # Target nodes are always placed first.
            x = self.convs[i]((x, x_target), adj_t)
            if i != self.num_layers - 1:
                x = self.bns[i](x)
                x = F.relu(x)
                x = F.dropout(x, p=0.5, training=self.training)
        return x.log_softmax(dim=-1)

    def inference(self, x_all, loader=None):
        """Compute the outputs for all nodes, one layer at a time.

        Args:
            x_all: input features for every node, indexable by the node ids
                the loader yields.
            loader: iterable yielding ``(batch_size, n_id, adj)`` where
                ``adj.to(device)`` unpacks to ``(edge_index, e_id, size)``.
                Defaults to the notebook-level ``subgraph_loader``.

        Returns:
            CPU tensor with the last layer's raw outputs (no softmax), rows
            concatenated in the order the loader yields its batches.
        """
        if loader is None:
            # Backward-compatible default: the loader defined by the notebook.
            loader = subgraph_loader

        # Run on the device the model actually lives on. Guessing from CUDA
        # availability breaks when the model was left on the CPU of a CUDA
        # machine. Only a parameter-less model falls back to that guess.
        first_param = next(self.parameters(), None)
        if first_param is not None:
            device = first_param.device
        else:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        pbar = tqdm(total=x_all.size(0) * self.num_layers)
        pbar.set_description('Evaluating')

        # Compute representations of nodes layer by layer, using *all*
        # available edges. This leads to faster computation in contrast to
        # immediately computing the final representations of each batch.
        for i in range(self.num_layers):
            xs = []
            for batch_size, n_id, adj in loader:
                adj_t, _, size = adj.to(device)
                x = x_all[n_id].to(device)
                x_target = x[:size[1]]
                x = self.convs[i]((x, x_target), adj_t)

                if i != self.num_layers - 1:
                    x = self.bns[i](x)
                    x = F.relu(x)
                xs.append(x.cpu())

                pbar.update(batch_size)

            # This layer's outputs become the next layer's inputs.
            x_all = torch.cat(xs, dim=0)

        pbar.close()

        return x_all

In [16]:
# Three-layer GraphSAGE with 256 hidden channels; input and output widths are
# read from `dataset` inside the constructor.
model = SAGE(dataset=dataset, hidden_channels=256, num_layers=3)
print(model, flush=True)

SAGE(
  (convs): ModuleList(
    (0): SAGEConv(100, 256, aggr=mean)
    (1): SAGEConv(256, 256, aggr=mean)
    (2): SAGEConv(256, 47, aggr=mean)
  )
  (bns): ModuleList(
    (0): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
)


In [17]:
# Use the GPU when one is available, otherwise the CPU, and move both the
# model and the graph object there. Note that `data` is re-bound to the
# device copy from here on.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device, flush=True)
model.to(device)
data = data.to(device)

cuda


In [18]:
# Module-level tensors read by train(), test() and the main loop: node
# features, labels with the singleton dimension squeezed away, and the split
# indices on `device`. This re-binds the `train_idx`/`test_idx` set earlier.
x, y = data.x.to(device), data.y.squeeze().to(device)
train_idx = split_idx['train'].to(device)
val_idx = split_idx['valid'].to(device)
test_idx = split_idx['test'].to(device)
# Features and labels restricted to the training nodes.
x_train, y_train = x[train_idx], y[train_idx]

In [19]:
def process_adj(data):
    """Build a symmetrically degree-normalised sparse adjacency for ``data``.

    The edge list is made undirected, loaded into an ``N x N``
    ``SparseTensor``, its diagonal is filled via ``set_diag()``, and the
    result is scaled as ``D^-1/2 * adj * D^-1/2`` where ``D`` holds the row
    sums of that matrix.

    Args:
        data: graph object exposing ``num_nodes`` and ``edge_index``. It is
            only read; its ``edge_index`` is no longer overwritten.

    Returns:
        The normalised ``SparseTensor``.
    """
    N = data.num_nodes
    # Keep the symmetrised edges local: writing them back to
    # `data.edge_index` changed the caller's graph as a hidden side effect.
    edge_index = to_undirected(data.edge_index, N)

    row, col = edge_index

    adj = SparseTensor(row=row, col=col, sparse_sizes=(N, N))
    adj = adj.set_diag()
    deg = adj.sum(dim=1).to(torch.float)
    deg_inv_sqrt = deg.pow(-0.5)
    # A zero degree would give inf; such rows are scaled by zero instead.
    deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
    adj = deg_inv_sqrt.view(-1, 1) * adj * deg_inv_sqrt.view(1, -1)
    return adj

In [20]:
# Normalised adjacency used by the correct-and-smooth step in the main loop.
DAD = process_adj(data).to(device)

In [21]:
# NOTE(review): `criterion` is not referenced by train() below, which calls
# F.nll_loss directly -- confirm whether it can be dropped.
criterion = nn.NLLLoss().to(device)
# Adam over all model parameters; train() reads this module-level name.
optimizer = optim.Adam(model.parameters(), lr=0.003)

In [22]:
def train():
    """Run one optimisation pass over ``train_loader``.

    Reads the module-level ``model``, ``optimizer``, ``train_loader``, ``x``,
    ``y``, ``device`` and ``train_idx``. It also reads the module-level
    ``epoch`` (set by the loop that calls this function) for the progress-bar
    label, so ``epoch`` must exist before the call.

    Returns:
        ``(loss, approx_acc)``: the mean batch loss, and the share of
        training nodes predicted correctly during the pass. The accuracy is
        approximate because it is measured with the model in training mode.
    """
    model.train()

    progress = tqdm(total=train_idx.size(0))
    progress.set_description(f'Epoch {epoch:02d}')

    loss_sum = 0
    hits = 0
    for batch_size, n_id, adjs in train_loader:
        # `adjs` holds a list of `(edge_index, e_id, size)` tuples.
        adjs = [adj.to(device) for adj in adjs]
        # The first `batch_size` sampled ids are the batch's target nodes.
        target = y[n_id[:batch_size]]

        out = model(x[n_id], adjs)
        loss = F.nll_loss(out, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_sum += float(loss)
        hits += int(out.argmax(dim=-1).eq(target).sum())
        progress.update(batch_size)

    progress.close()

    mean_loss = loss_sum / len(train_loader)
    approx_acc = hits / int(train_idx.size(0))
    return mean_loss, approx_acc

In [23]:
@torch.no_grad()
def test(out=None):
    """Score predictions on the train, validation and test splits.

    Reads the module-level ``model``, ``x``, ``data``, ``split_idx`` and
    ``evaluator``. The model is always switched to eval mode.

    Args:
        out: per-node class scores; when ``None`` they are computed with
            ``model.inference(x)``.

    Returns:
        ``(train_acc, valid_acc, test_acc, out)`` where each accuracy is the
        evaluator's ``'acc'`` entry and ``out`` is the score tensor used.
    """
    model.eval()

    if out is None:
        out = model.inference(x)
    y_pred = out.argmax(dim=-1, keepdim=True)

    accuracies = []
    for split in ('train', 'valid', 'test'):
        node_ids = split_idx[split]
        scores = evaluator.eval({
            'y_true': data.y[node_ids],
            'y_pred': y_pred[node_ids],
        })
        accuracies.append(scores['acc'])

    train_acc, valid_acc, test_acc = accuracies
    return train_acc, valid_acc, test_acc, out

In [24]:
if __name__ == '__main__':
    # Train the model `runs` times from scratch. In each run the soft
    # predictions of the best-validation epoch are post-processed with
    # Correct & Smooth, and the resulting accuracies are logged.
    runs = 2
    logger = Logger(runs)

    for run in range(runs):
        print(sum(p.numel() for p in model.parameters()), flush=True)

        print('', flush=True)
        print(f'Run {run + 1:02d}:', flush=True)
        print('', flush=True)

        model.reset_parameters()
        # Rebuild the optimizer with the same class and hyper-parameters.
        # Resetting only the model left Adam's moment estimates and step
        # counts from the previous run in place, so runs were not
        # independent. train() picks up this module-level name.
        optimizer = type(optimizer)(model.parameters(), **optimizer.defaults)

        best_val_acc = 0

        # `epoch` is deliberately a module-level name: train() reads it.
        for epoch in range(10):
            loss, acc = train()
            print(f'Epoch {epoch:02d}, Loss: {loss:.4f}, Approx. Train: {acc:.4f}', flush=True)

            # Full-graph evaluation is slow, so it only runs from the sixth
            # epoch onwards.
            if (epoch + 1) > 5:

                train_acc, val_acc, test_acc, out = test()
                result = (train_acc, val_acc, test_acc)
                if val_acc > best_val_acc:
                    # Keep the soft predictions of the best validation epoch.
                    best_val_acc = val_acc
                    y_soft = out.softmax(dim=-1).to(device)

                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * val_acc:.2f}% '
                      f'Test: {100 * test_acc:.2f}%', flush=True)

        post = CorrectAndSmooth(num_correction_layers=100, correction_alpha=0.8,
                                num_smoothing_layers=100, smoothing_alpha=0.8,
                                autoscale=False, scale=10.)

        print('Correct and smooth...', flush=True)

        y_soft = post.correct(y_soft, y_train, train_idx, DAD)
        y_soft = post.smooth(y_soft, y_train, train_idx, DAD)
        print('Done!', flush=True)

        train_acc, val_acc, test_acc, _ = test(y_soft)
        print(f'Train: {train_acc:.4f}, Val: {val_acc:.4f}, Test: {test_acc:.4f}', flush=True)

        # Only the post-processed accuracies are logged, once per run.
        result = (train_acc, val_acc, test_acc)
        logger.add_result(run, result)

    logger.print_statistics()

207919

Run 01:



Epoch 00: 100%|██████████| 196615/196615 [00:47<00:00, 4158.86it/s]

Epoch 00, Loss: 0.5680, Approx. Train: 0.8522



Epoch 01: 100%|██████████| 196615/196615 [00:47<00:00, 4147.90it/s]

Epoch 01, Loss: 0.3684, Approx. Train: 0.8980



Epoch 02: 100%|██████████| 196615/196615 [00:47<00:00, 4139.23it/s]

Epoch 02, Loss: 0.3535, Approx. Train: 0.9025



Epoch 03: 100%|██████████| 196615/196615 [00:47<00:00, 4122.21it/s]

Epoch 03, Loss: 0.3466, Approx. Train: 0.9023



Epoch 04: 100%|██████████| 196615/196615 [00:47<00:00, 4126.23it/s]

Epoch 04, Loss: 0.3238, Approx. Train: 0.9091



Epoch 05: 100%|██████████| 196615/196615 [00:47<00:00, 4135.70it/s]

Epoch 05, Loss: 0.3185, Approx. Train: 0.9091



Evaluating: 100%|██████████| 7347087/7347087 [02:33<00:00, 47957.40it/s]


Run: 01, Epoch: 05, Loss: 0.3185, Train: 92.44%, Valid: 91.53% Test: 78.18%


Epoch 06: 100%|██████████| 196615/196615 [00:47<00:00, 4111.52it/s]

Epoch 06, Loss: 0.2996, Approx. Train: 0.9148



Evaluating: 100%|██████████| 7347087/7347087 [02:28<00:00, 49356.92it/s]

Run: 01, Epoch: 06, Loss: 0.2996, Train: 92.80%, Valid: 91.47% Test: 79.01%



Epoch 07: 100%|██████████| 196615/196615 [00:47<00:00, 4097.22it/s]

Epoch 07, Loss: 0.3027, Approx. Train: 0.9138



Evaluating: 100%|██████████| 7347087/7347087 [02:22<00:00, 51509.50it/s]


Run: 01, Epoch: 07, Loss: 0.3027, Train: 92.97%, Valid: 91.75% Test: 78.75%


Epoch 08: 100%|██████████| 196615/196615 [00:48<00:00, 4078.05it/s]

Epoch 08, Loss: 0.2996, Approx. Train: 0.9145



Evaluating: 100%|██████████| 7347087/7347087 [02:29<00:00, 49082.56it/s]

Run: 01, Epoch: 08, Loss: 0.2996, Train: 92.77%, Valid: 91.50% Test: 77.97%



Epoch 09: 100%|██████████| 196615/196615 [00:47<00:00, 4098.42it/s]

Epoch 09, Loss: 0.2963, Approx. Train: 0.9166



Evaluating: 100%|██████████| 7347087/7347087 [02:22<00:00, 51533.74it/s]

Run: 01, Epoch: 09, Loss: 0.2963, Train: 92.98%, Valid: 91.67% Test: 78.46%
Correct and smooth...





Done!
Train: 0.9720, Val: 0.9221, Test: 0.8041
207919

Run 02:



Epoch 00: 100%|██████████| 196615/196615 [00:48<00:00, 4053.38it/s]

Epoch 00, Loss: 0.5707, Approx. Train: 0.8530



Epoch 01: 100%|██████████| 196615/196615 [00:47<00:00, 4096.39it/s]

Epoch 01, Loss: 0.3882, Approx. Train: 0.8939



Epoch 02: 100%|██████████| 196615/196615 [00:48<00:00, 4072.00it/s]

Epoch 02, Loss: 0.3637, Approx. Train: 0.9008



Epoch 03: 100%|██████████| 196615/196615 [00:48<00:00, 4087.73it/s]

Epoch 03, Loss: 0.3531, Approx. Train: 0.9006



Epoch 04: 100%|██████████| 196615/196615 [00:48<00:00, 4074.21it/s]

Epoch 04, Loss: 0.3354, Approx. Train: 0.9076



Epoch 05: 100%|██████████| 196615/196615 [00:48<00:00, 4065.86it/s] 

Epoch 05, Loss: 0.3334, Approx. Train: 0.9059



Evaluating: 100%|██████████| 7347087/7347087 [02:28<00:00, 49540.09it/s]


Run: 02, Epoch: 05, Loss: 0.3334, Train: 92.30%, Valid: 91.25% Test: 77.68%


Epoch 06: 100%|██████████| 196615/196615 [00:48<00:00, 4075.64it/s]

Epoch 06, Loss: 0.3138, Approx. Train: 0.9119



Evaluating: 100%|██████████| 7347087/7347087 [02:24<00:00, 50906.77it/s]


Run: 02, Epoch: 06, Loss: 0.3138, Train: 92.50%, Valid: 91.46% Test: 77.35%


Epoch 07: 100%|██████████| 196615/196615 [00:48<00:00, 4071.71it/s]

Epoch 07, Loss: 0.3143, Approx. Train: 0.9114



Evaluating: 100%|██████████| 7347087/7347087 [02:30<00:00, 48854.14it/s]

Run: 02, Epoch: 07, Loss: 0.3143, Train: 92.47%, Valid: 91.32% Test: 76.97%



Epoch 08: 100%|██████████| 196615/196615 [00:48<00:00, 4039.03it/s]

Epoch 08, Loss: 0.3255, Approx. Train: 0.9086



Evaluating: 100%|██████████| 7347087/7347087 [02:24<00:00, 50920.76it/s]


Run: 02, Epoch: 08, Loss: 0.3255, Train: 92.69%, Valid: 91.46% Test: 77.77%


Epoch 09: 100%|██████████| 196615/196615 [00:48<00:00, 4081.61it/s]

Epoch 09, Loss: 0.3159, Approx. Train: 0.9112



Evaluating: 100%|██████████| 7347087/7347087 [02:32<00:00, 48245.80it/s]


Run: 02, Epoch: 09, Loss: 0.3159, Train: 92.70%, Valid: 91.66% Test: 78.72%
Correct and smooth...
Done!
Train: 0.9729, Val: 0.9231, Test: 0.8040
All runs:
Highest Train: 97.25 ± 0.06
Highest Valid: 92.26 ± 0.07
  Final Train: 97.25 ± 0.06
   Final Test: 80.40 ± 0.00
