In [1]:
import argparse
import time
import os

import numpy as np
import torch
from torch_geometric.nn import GCNConv, ChebConv  # noqa
import torch.nn.functional as F
from ogb.nodeproppred import Evaluator

  from .autonotebook import tqdm as notebook_tqdm


In [2]:


from GNN import GNN
from GNN_early import GNNEarly
from GNN_KNN import GNN_KNN
from GNN_KNN_early import GNNKNNEarly
from data import get_dataset, set_train_val_test_split
from graph_rewiring import apply_KNN, apply_beltrami, apply_edge_sampling
from best_params import best_params_dict
from heterophilic import get_fixed_splits
from utils import ROOT_DIR
from CGNN import CGNN, get_sym_adj
from CGNN import train as train_cgnn


In [3]:
# possible datset paras
# 'Cora, Citeseer, Pubmed, Computers, Photo, CoauthorCS, ogbn-arxiv'

customArgs = ['--dataset', 'Cora']
#customArgs = ['--dataset', 'Citeseer']
#customArgs = ['--dataset', 'Pubmed']
#customArgs = ['--dataset', 'Computers']
#customArgs = ['--dataset', 'Photo']
#customArgs = ['--dataset', 'CoauthorCS']
customArgs = ['--dataset', 'ogbn-arxiv'] #doesnt work yet


#customArgs = customArgs + ['--early-stopping']

In [4]:


def get_optimizer(name, parameters, lr, weight_decay=0):
  if name == 'sgd':
    return torch.optim.SGD(parameters, lr=lr, weight_decay=weight_decay)
  elif name == 'rmsprop':
    return torch.optim.RMSprop(parameters, lr=lr, weight_decay=weight_decay)
  elif name == 'adagrad':
    return torch.optim.Adagrad(parameters, lr=lr, weight_decay=weight_decay)
  elif name == 'adam':
    return torch.optim.Adam(parameters, lr=lr, weight_decay=weight_decay)
  elif name == 'adamax':
    return torch.optim.Adamax(parameters, lr=lr, weight_decay=weight_decay)
  else:
    raise Exception("Unsupported optimizer: {}".format(name))



In [5]:
def add_labels(feat, labels, idx, num_classes, device):
  onehot = torch.zeros([feat.shape[0], num_classes]).to(device)
  if idx.dtype == torch.bool:
    idx = torch.where(idx)[0]  # convert mask to linear index
  onehot[idx, labels.squeeze()[idx]] = 1

  return torch.cat([feat, onehot], dim=-1)


def get_label_masks(data, mask_rate=0.5):
  """
  when using labels as features need to split training nodes into training and prediction
  """
  if data.train_mask.dtype == torch.bool:
    idx = torch.where(data.train_mask)[0]
  else:
    idx = data.train_mask
  mask = torch.rand(idx.shape) < mask_rate
  train_label_idx = idx[mask]
  train_pred_idx = idx[~mask]
  return train_label_idx, train_pred_idx


In [6]:


def train(model, optimizer, data, pos_encoding=None):
  model.train()
  optimizer.zero_grad()
  feat = data.x
  if model.opt['use_labels']:
    train_label_idx, train_pred_idx = get_label_masks(data, model.opt['label_rate'])

    feat = add_labels(feat, data.y, train_label_idx, model.num_classes, model.device)
  else:
    train_pred_idx = data.train_mask

  out = model(feat, pos_encoding)

  if model.opt['dataset'] == 'ogbn-arxiv':
    lf = torch.nn.functional.nll_loss
    loss = lf(out.log_softmax(dim=-1)[data.train_mask], data.y.squeeze(1)[data.train_mask])
  else:
    lf = torch.nn.CrossEntropyLoss()
    loss = lf(out[data.train_mask], data.y.squeeze()[data.train_mask])
  if model.odeblock.nreg > 0:  # add regularisation - slower for small data, but faster and better performance for large data
    reg_states = tuple(torch.mean(rs) for rs in model.reg_states)
    regularization_coeffs = model.regularization_coeffs

    reg_loss = sum(
      reg_state * coeff for reg_state, coeff in zip(reg_states, regularization_coeffs) if coeff != 0
    )
    loss = loss + reg_loss

  model.fm.update(model.getNFE())
  model.resetNFE()
  loss.backward()
  optimizer.step()
  model.bm.update(model.getNFE())
  model.resetNFE()
  return loss.item()



In [7]:

def train_OGB(model, mp, optimizer, data, pos_encoding=None):
  model.train()
  optimizer.zero_grad()
  feat = data.x
  if model.opt['use_labels']:
    train_label_idx, train_pred_idx = get_label_masks(data, model.opt['label_rate'])

    feat = add_labels(feat, data.y, train_label_idx, model.num_classes, model.device)
  else:
    train_pred_idx = data.train_mask

  pos_encoding = mp(pos_encoding).to(model.device)
  out = model(feat, pos_encoding)

  if model.opt['dataset'] == 'ogbn-arxiv':
    lf = torch.nn.functional.nll_loss
    loss = lf(out.log_softmax(dim=-1)[data.train_mask], data.y.squeeze(1)[data.train_mask])
  else:
    lf = torch.nn.CrossEntropyLoss()
    loss = lf(out[data.train_mask], data.y.squeeze()[data.train_mask])
  if model.odeblock.nreg > 0:  # add regularisation - slower for small data, but faster and better performance for large data
    reg_states = tuple(torch.mean(rs) for rs in model.reg_states)
    regularization_coeffs = model.regularization_coeffs

    reg_loss = sum(
      reg_state * coeff for reg_state, coeff in zip(reg_states, regularization_coeffs) if coeff != 0
    )
    loss = loss + reg_loss

  model.fm.update(model.getNFE())
  model.resetNFE()
  loss.backward()
  optimizer.step()
  model.bm.update(model.getNFE())
  model.resetNFE()
  return loss.item()



In [8]:

@torch.no_grad()
def test(model, data, pos_encoding=None, opt=None):  # opt required for runtime polymorphism
  model.eval()
  feat = data.x
  if model.opt['use_labels']:
    feat = add_labels(feat, data.y, data.train_mask, model.num_classes, model.device)
  logits, accs = model(feat, pos_encoding), []
  for _, mask in data('train_mask', 'val_mask', 'test_mask'):
    pred = logits[mask].max(1)[1]
    acc = pred.eq(data.y[mask]).sum().item() / mask.sum().item()
    accs.append(acc)
  return accs


def print_model_params(model):
  print(model)
  for name, param in model.named_parameters():
    if param.requires_grad:
      print(name)
      print(param.data.shape)


@torch.no_grad()
def test_OGB(model, data, pos_encoding, opt):
  if opt['dataset'] == 'ogbn-arxiv':
    name = 'ogbn-arxiv'

  feat = data.x
  if model.opt['use_labels']:
    feat = add_labels(feat, data.y, data.train_mask, model.num_classes, model.device)

  evaluator = Evaluator(name=name)
  model.eval()

  out = model(feat, pos_encoding).log_softmax(dim=-1)
  y_pred = out.argmax(dim=-1, keepdim=True)

  train_acc = evaluator.eval({
    'y_true': data.y[data.train_mask],
    'y_pred': y_pred[data.train_mask],
  })['acc']
  valid_acc = evaluator.eval({
    'y_true': data.y[data.val_mask],
    'y_pred': y_pred[data.val_mask],
  })['acc']
  test_acc = evaluator.eval({
    'y_true': data.y[data.test_mask],
    'y_pred': y_pred[data.test_mask],
  })['acc']

  return train_acc, valid_acc, test_acc


def merge_cmd_args(cmd_opt, opt):
  if cmd_opt['beltrami']:
    opt['beltrami'] = True
  if cmd_opt['function'] is not None:
    opt['function'] = cmd_opt['function']
  if cmd_opt['block'] is not None:
    opt['block'] = cmd_opt['block']
  if cmd_opt['attention_type'] != 'scaled_dot':
    opt['attention_type'] = cmd_opt['attention_type']
  if cmd_opt['self_loop_weight'] is not None:
    opt['self_loop_weight'] = cmd_opt['self_loop_weight']
  if cmd_opt['method'] is not None:
    opt['method'] = cmd_opt['method']
  if cmd_opt['step_size'] != 1:
    opt['step_size'] = cmd_opt['step_size']
  if cmd_opt['time'] != 1:
    opt['time'] = cmd_opt['time']
  if cmd_opt['epoch'] != 100:
    opt['epoch'] = cmd_opt['epoch']
  if not cmd_opt['not_lcc']:
    opt['not_lcc'] = False
  if cmd_opt['num_splits'] != 1:
    opt['num_splits'] = cmd_opt['num_splits']


In [9]:
parser = argparse.ArgumentParser()
parser.add_argument('--use_cora_defaults', action='store_true',
                  help='Whether to run with best params for cora. Overrides the choice of dataset')
# data args
parser.add_argument('--dataset', type=str, default='Cora',
                  help='Cora, Citeseer, Pubmed, Computers, Photo, CoauthorCS, ogbn-arxiv')
parser.add_argument('--data_norm', type=str, default='rw',
                  help='rw for random walk, gcn for symmetric gcn norm')
parser.add_argument('--self_loop_weight', type=float, default=1.0, help='Weight of self-loops.')
parser.add_argument('--use_labels', dest='use_labels', action='store_true', help='Also diffuse labels')
parser.add_argument('--geom_gcn_splits', dest='geom_gcn_splits', action='store_true',
                  help='use the 10 fixed splits from '
                       'https://arxiv.org/abs/2002.05287')
parser.add_argument('--num_splits', type=int, dest='num_splits', default=1,
                  help='the number of splits to repeat the results on')
parser.add_argument('--label_rate', type=float, default=0.5,
                  help='% of training labels to use when --use_labels is set.')
parser.add_argument('--planetoid_split', action='store_true',
                  help='use planetoid splits for Cora/Citeseer/Pubmed')
# GNN args
parser.add_argument('--hidden_dim', type=int, default=16, help='Hidden dimension.')
parser.add_argument('--fc_out', dest='fc_out', action='store_true',
                  help='Add a fully connected layer to the decoder.')
parser.add_argument('--input_dropout', type=float, default=0.5, help='Input dropout rate.')
parser.add_argument('--dropout', type=float, default=0.0, help='Dropout rate.')
parser.add_argument("--batch_norm", dest='batch_norm', action='store_true', help='search over reg params')
parser.add_argument('--optimizer', type=str, default='adam', help='One from sgd, rmsprop, adam, adagrad, adamax.')
parser.add_argument('--lr', type=float, default=0.01, help='Learning rate.')
parser.add_argument('--decay', type=float, default=5e-4, help='Weight decay for optimization')
parser.add_argument('--epoch', type=int, default=100, help='Number of training epochs per iteration.')
parser.add_argument('--alpha', type=float, default=1.0, help='Factor in front matrix A.')
parser.add_argument('--alpha_dim', type=str, default='sc', help='choose either scalar (sc) or vector (vc) alpha')
parser.add_argument('--no_alpha_sigmoid', dest='no_alpha_sigmoid', action='store_true',
                  help='apply sigmoid before multiplying by alpha')
parser.add_argument('--beta_dim', type=str, default='sc', help='choose either scalar (sc) or vector (vc) beta')
parser.add_argument('--block', type=str, default='constant', help='constant, mixed, attention, hard_attention')
parser.add_argument('--function', type=str, default='laplacian', help='laplacian, transformer, dorsey, GAT')
parser.add_argument('--use_mlp', dest='use_mlp', action='store_true',
                  help='Add a fully connected layer to the encoder.')
parser.add_argument('--add_source', dest='add_source', action='store_true',
                  help='If try get rid of alpha param and the beta*x0 source term')
parser.add_argument('--cgnn', dest='cgnn', action='store_true', help='Run the baseline CGNN model from ICML20')

# ODE args
parser.add_argument('--time', type=float, default=1.0, help='End time of ODE integrator.')
parser.add_argument('--augment', action='store_true',
                  help='double the length of the feature vector by appending zeros to stabilist ODE learning')
parser.add_argument('--method', type=str, help="set the numerical solver: dopri5, euler, rk4, midpoint")
parser.add_argument('--step_size', type=float, default=1,
                  help='fixed step size when using fixed step solvers e.g. rk4')
parser.add_argument('--max_iters', type=float, default=100, help='maximum number of integration steps')
parser.add_argument("--adjoint_method", type=str, default="adaptive_heun",
                  help="set the numerical solver for the backward pass: dopri5, euler, rk4, midpoint")
parser.add_argument('--adjoint', dest='adjoint', action='store_true',
                  help='use the adjoint ODE method to reduce memory footprint')
parser.add_argument('--adjoint_step_size', type=float, default=1,
                  help='fixed step size when using fixed step adjoint solvers e.g. rk4')
parser.add_argument('--tol_scale', type=float, default=1., help='multiplier for atol and rtol')
parser.add_argument("--tol_scale_adjoint", type=float, default=1.0,
                  help="multiplier for adjoint_atol and adjoint_rtol")
parser.add_argument('--ode_blocks', type=int, default=1, help='number of ode blocks to run')
parser.add_argument("--max_nfe", type=int, default=1000,
                  help="Maximum number of function evaluations in an epoch. Stiff ODEs will hang if not set.")
parser.add_argument("--no_early", action="store_true",
                  help="Whether or not to use early stopping of the ODE integrator when testing.")
parser.add_argument('--earlystopxT', type=float, default=3, help='multiplier for T used to evaluate best model')
parser.add_argument("--max_test_steps", type=int, default=100,
                  help="Maximum number steps for the dopri5Early test integrator. "
                       "used if getting OOM errors at test time")

# Attention args
parser.add_argument('--leaky_relu_slope', type=float, default=0.2,
                  help='slope of the negative part of the leaky relu used in attention')
parser.add_argument('--attention_dropout', type=float, default=0., help='dropout of attention weights')
parser.add_argument('--heads', type=int, default=4, help='number of attention heads')
parser.add_argument('--attention_norm_idx', type=int, default=0, help='0 = normalise rows, 1 = normalise cols')
parser.add_argument('--attention_dim', type=int, default=64,
                  help='the size to project x to before calculating att scores')
parser.add_argument('--mix_features', dest='mix_features', action='store_true',
                  help='apply a feature transformation xW to the ODE')
parser.add_argument('--reweight_attention', dest='reweight_attention', action='store_true',
                  help="multiply attention scores by edge weights before softmax")
parser.add_argument('--attention_type', type=str, default="scaled_dot",
                  help="scaled_dot,cosine_sim,pearson, exp_kernel")
parser.add_argument('--square_plus', action='store_true', help='replace softmax with square plus')

# regularisation args
parser.add_argument('--jacobian_norm2', type=float, default=None, help="int_t ||df/dx||_F^2")
parser.add_argument('--total_deriv', type=float, default=None, help="int_t ||df/dt||^2")

parser.add_argument('--kinetic_energy', type=float, default=None, help="int_t ||f||_2^2")
parser.add_argument('--directional_penalty', type=float, default=None, help="int_t ||(df/dx)^T f||^2")

# rewiring args
parser.add_argument("--not_lcc", action="store_false", help="don't use the largest connected component")
parser.add_argument('--rewiring', type=str, default=None, help="two_hop, gdc")
parser.add_argument('--gdc_method', type=str, default='ppr', help="ppr, heat, coeff")
parser.add_argument('--gdc_sparsification', type=str, default='topk', help="threshold, topk")
parser.add_argument('--gdc_k', type=int, default=64, help="number of neighbours to sparsify to when using topk")
parser.add_argument('--gdc_threshold', type=float, default=0.0001,
                  help="obove this edge weight, keep edges when using threshold")
parser.add_argument('--gdc_avg_degree', type=int, default=64,
                  help="if gdc_threshold is not given can be calculated by specifying avg degree")
parser.add_argument('--ppr_alpha', type=float, default=0.05, help="teleport probability")
parser.add_argument('--heat_time', type=float, default=3., help="time to run gdc heat kernal diffusion for")
parser.add_argument('--att_samp_pct', type=float, default=1,
                  help="float in [0,1). The percentage of edges to retain based on attention scores")
parser.add_argument('--use_flux', dest='use_flux', action='store_true',
                  help='incorporate the feature grad in attention based edge dropout')
parser.add_argument("--exact", action="store_true",
                  help="for small datasets can do exact diffusion. If dataset is too big for matrix inversion then you can't")
parser.add_argument('--M_nodes', type=int, default=64, help="new number of nodes to add")
parser.add_argument('--new_edges', type=str, default="random", help="random, random_walk, k_hop")
parser.add_argument('--sparsify', type=str, default="S_hat", help="S_hat, recalc_att")
parser.add_argument('--threshold_type', type=str, default="topk_adj", help="topk_adj, addD_rvR")
parser.add_argument('--rw_addD', type=float, default=0.02, help="percentage of new edges to add")
parser.add_argument('--rw_rmvR', type=float, default=0.02, help="percentage of edges to remove")
parser.add_argument('--rewire_KNN', action='store_true', help='perform KNN rewiring every few epochs')
parser.add_argument('--rewire_KNN_T', type=str, default="T0", help="T0, TN")
parser.add_argument('--rewire_KNN_epoch', type=int, default=5, help="frequency of epochs to rewire")
parser.add_argument('--rewire_KNN_k', type=int, default=64, help="target degree for KNN rewire")
parser.add_argument('--rewire_KNN_sym', action='store_true', help='make KNN symmetric')
parser.add_argument('--KNN_online', action='store_true', help='perform rewiring online')
parser.add_argument('--KNN_online_reps', type=int, default=4, help="how many online KNN its")
parser.add_argument('--KNN_space', type=str, default="pos_distance", help="Z,P,QKZ,QKp")
# beltrami args
parser.add_argument('--beltrami', action='store_true', help='perform diffusion beltrami style')
parser.add_argument('--fa_layer', action='store_true', help='add a bottleneck paper style layer with more edges')
parser.add_argument('--pos_enc_type', type=str, default="DW64",
                  help='positional encoder either GDC, DW64, DW128, DW256')
parser.add_argument('--pos_enc_orientation', type=str, default="row", help="row, col")
parser.add_argument('--feat_hidden_dim', type=int, default=64, help="dimension of features in beltrami")
parser.add_argument('--pos_enc_hidden_dim', type=int, default=32, help="dimension of position in beltrami")
parser.add_argument('--edge_sampling', action='store_true', help='perform edge sampling rewiring')
parser.add_argument('--edge_sampling_T', type=str, default="T0", help="T0, TN")
parser.add_argument('--edge_sampling_epoch', type=int, default=5, help="frequency of epochs to rewire")
parser.add_argument('--edge_sampling_add', type=float, default=0.64, help="percentage of new edges to add")
parser.add_argument('--edge_sampling_add_type', type=str, default="importance",
                  help="random, ,anchored, importance, degree")
parser.add_argument('--edge_sampling_rmv', type=float, default=0.32, help="percentage of edges to remove")
parser.add_argument('--edge_sampling_sym', action='store_true', help='make KNN symmetric')
parser.add_argument('--edge_sampling_online', action='store_true', help='perform rewiring online')
parser.add_argument('--edge_sampling_online_reps', type=int, default=4, help="how many online KNN its")
parser.add_argument('--edge_sampling_space', type=str, default="attention",
                  help="attention,pos_distance, z_distance, pos_distance_QK, z_distance_QK")
parser.add_argument('--symmetric_attention', action='store_true',
                  help='maks the attention symmetric for rewring in QK space')

parser.add_argument('--fa_layer_edge_sampling_rmv', type=float, default=0.8, help="percentage of edges to remove")
parser.add_argument('--gpu', type=int, default=0, help="GPU to run on (default 0)")
parser.add_argument('--pos_enc_csv', action='store_true', help="Generate pos encoding as a sparse CSV")

parser.add_argument('--pos_dist_quantile', type=float, default=0.001, help="percentage of N**2 edges to keep")


args = parser.parse_args(customArgs)

cmd_opt = vars(args)

In [10]:

import gc

try:
    best_opt = best_params_dict[cmd_opt['dataset']]
    opt = {**cmd_opt, **best_opt}
    merge_cmd_args(cmd_opt, opt)
except KeyError:
    opt = cmd_opt

dataset = get_dataset(opt, f'{ROOT_DIR}/data', opt['not_lcc'])
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = torch.device('cpu')

if opt['beltrami']:
    pos_encoding = apply_beltrami(dataset.data, opt).to(device)
    opt['pos_enc_dim'] = pos_encoding.shape[1]
else:
    pos_encoding = None
 
print("adjoint method ", opt["adjoint_method"])
print("no early ", opt["no_early"])
#print("before model")
#print(torch.cuda.memory_allocated())#print(torch.cuda.memory_stats())
    
if opt['rewire_KNN'] or opt['fa_layer']:
    model = GNN_KNN(opt, dataset, device).to(device) if opt["no_early"] else GNNKNNEarly(opt, dataset, device).to(device)
else:
    model = GNN(opt, dataset, device).to(device) if opt["no_early"] else GNNEarly(opt, dataset, device).to(device)

if not opt['planetoid_split'] and opt['dataset'] in ['Cora','Citeseer','Pubmed']:
    dataset.data = set_train_val_test_split(np.random.randint(0, 1000), dataset.data, num_development=5000 if opt["dataset"] == "CoauthorCS" else 1500)

print("before dataset")
stats = torch.cuda.memory_stats()
print(torch.cuda.memory_allocated(), torch.cuda.memory_reserved(), stats['inactive_split.all.current'], stats['inactive_split_bytes.all.current'])
data = dataset.data.to(device)
print("after dataset")
print(torch.cuda.memory_allocated(), torch.cuda.memory_reserved(), stats['inactive_split.all.current'], stats['inactive_split_bytes.all.current'])

parameters = [p for p in model.parameters() if p.requires_grad]
print_model_params(model)
optimizer = get_optimizer(opt['optimizer'], parameters, lr=opt['lr'], weight_decay=opt['decay'])
best_time = best_epoch = train_acc = val_acc = test_acc = 0

this_test = test_OGB if opt['dataset'] == 'ogbn-arxiv' else test

print("before loop")
print(torch.cuda.memory_allocated())#print(torch.cuda.memory_stats())

print(f"epochs until {opt['epoch']}")
for epoch in range(1, opt['epoch']):
    start_time = time.time()

    if opt['rewire_KNN'] and epoch % opt['rewire_KNN_epoch'] == 0 and epoch != 0:
        ei = apply_KNN(data, pos_encoding, model, opt)
        model.odeblock.odefunc.edge_index = ei

    print("before train: allocated reserved")
    print(torch.cuda.memory_allocated(), torch.cuda.memory_reserved(), stats['inactive_split.all.current'], stats['inactive_split_bytes.all.current'])
    loss = train(model, optimizer, data, pos_encoding)
    #print("before test")
    #print(torch.cuda.memory_allocated())
    tmp_train_acc, tmp_val_acc, tmp_test_acc = this_test(model, data, pos_encoding, opt)
    #print("type train ret ", type(loss))
    #print("type test ret ", type(tmp_train_acc), type(tmp_val_acc), type(tmp_test_acc))
    
    print("after test: allocated reserved")
    print(torch.cuda.memory_allocated(), torch.cuda.memory_reserved(), stats['inactive_split.all.current'], stats['inactive_split_bytes.all.current'])

    best_time = opt['time']
    if tmp_val_acc > val_acc:
        best_epoch = epoch
        train_acc = tmp_train_acc
        val_acc = tmp_val_acc
        test_acc = tmp_test_acc
        best_time = opt['time']
    if not opt['no_early'] and model.odeblock.test_integrator.solver.best_val > val_acc:
        best_epoch = epoch
        val_acc = model.odeblock.test_integrator.solver.best_val
        test_acc = model.odeblock.test_integrator.solver.best_test
        train_acc = model.odeblock.test_integrator.solver.best_train
        best_time = model.odeblock.test_integrator.solver.best_time

    log = 'Epoch: {:03d}, Runtime {:03f}, Loss {:03f}, forward nfe {:d}, backward nfe {:d}, Train: {:.4f}, Val: {:.4f}, Test: {:.4f}, Best time: {:.4f}'

    print(log.format(epoch, time.time() - start_time, loss, model.fm.sum, model.bm.sum, train_acc, val_acc, test_acc, best_time))
    del loss
    del tmp_train_acc
    del tmp_val_acc
    del tmp_test_acc
    torch.cuda.empty_cache()
    gc.collect()
print('best val accuracy {:03f} with test accuracy {:03f} at epoch {:d} and best time {:03f}'.format(val_acc, test_acc,best_epoch,best_time))



adjoint method  rk4
no early  False
before dataset
49942016 62914560 2 12972544
after dataset
177126400 188743680 2 12972544
GNNEarly
m1.weight
torch.Size([98, 128])
m1.bias
torch.Size([98])
m2.weight
torch.Size([40, 98])
m2.bias
torch.Size([40])
bn_in.weight
torch.Size([98])
bn_in.bias
torch.Size([98])
bn_out.weight
torch.Size([98])
bn_out.bias
torch.Size([98])
odeblock.odefunc.alpha_train
torch.Size([])
odeblock.odefunc.beta_train
torch.Size([])
odeblock.odefunc.alpha_sc
torch.Size([1])
odeblock.odefunc.beta_sc
torch.Size([1])
odeblock.odefunc.w
torch.Size([98, 98])
odeblock.odefunc.d
torch.Size([98])
odeblock.reg_odefunc.odefunc.alpha_train
torch.Size([])
odeblock.reg_odefunc.odefunc.beta_train
torch.Size([])
odeblock.reg_odefunc.odefunc.alpha_sc
torch.Size([1])
odeblock.reg_odefunc.odefunc.beta_sc
torch.Size([1])
odeblock.reg_odefunc.odefunc.w
torch.Size([98, 98])
odeblock.reg_odefunc.odefunc.d
torch.Size([98])
before loop
177126400
epochs until 100
before train: allocated reserved



after test: allocated reserved
1507964928 5471469568 2 12972544
Epoch: 001, Runtime 2.130238, Loss 3.722777, forward nfe 26, backward nfe 16, Train: 0.3272, Val: 0.3966, Test: 0.4274, Best time: 6.8704
before train: allocated reserved
1507964928 3382706176 2 12972544




after test: allocated reserved
2436622336 6312427520 2 12972544
Epoch: 002, Runtime 2.127146, Loss 2.620591, forward nfe 96, backward nfe 32, Train: 0.3517, Val: 0.4021, Test: 0.4381, Best time: 1.2792
before train: allocated reserved
1505786880 4290772992 2 12972544




after test: allocated reserved
2435896320 6245318656 2 12972544
Epoch: 003, Runtime 2.129802, Loss 2.248224, forward nfe 166, backward nfe 48, Train: 0.3517, Val: 0.4021, Test: 0.4381, Best time: 3.6760
before train: allocated reserved
1506512896 4223664128 2 12972544




after test: allocated reserved
2435896320 6178209792 2 12972544
Epoch: 004, Runtime 2.129230, Loss 2.129002, forward nfe 236, backward nfe 64, Train: 0.3517, Val: 0.4021, Test: 0.4381, Best time: 3.6760
before train: allocated reserved
1505786880 4223664128 2 12972544




after test: allocated reserved
2435898880 6178209792 2 12972544
Epoch: 005, Runtime 2.128439, Loss 1.891284, forward nfe 306, backward nfe 80, Train: 0.3761, Val: 0.4192, Test: 0.4054, Best time: 1.3084
before train: allocated reserved
1505789440 4223664128 2 12972544




after test: allocated reserved
2435172864 6178209792 2 12972544
Epoch: 006, Runtime 2.132022, Loss 1.820689, forward nfe 376, backward nfe 96, Train: 0.3761, Val: 0.4192, Test: 0.4054, Best time: 3.6760
before train: allocated reserved
1505786880 4156555264 2 12972544




after test: allocated reserved
2435170304 6111100928 2 12972544
Epoch: 007, Runtime 2.129968, Loss 1.752731, forward nfe 446, backward nfe 112, Train: 0.4238, Val: 0.4238, Test: 0.3867, Best time: 1.3199
before train: allocated reserved
1505060864 4156555264 2 12972544




after test: allocated reserved
2435170816 6111100928 2 12972544
Epoch: 008, Runtime 2.132790, Loss 1.704897, forward nfe 516, backward nfe 128, Train: 0.4531, Val: 0.4658, Test: 0.4949, Best time: 3.0671
before train: allocated reserved
1505787392 4156555264 2 12972544




after test: allocated reserved
2435170816 6111100928 2 12972544
Epoch: 009, Runtime 2.133305, Loss 1.601694, forward nfe 586, backward nfe 144, Train: 0.5114, Val: 0.5359, Test: 0.5397, Best time: 5.0545
before train: allocated reserved
1505786880 4156555264 2 12972544




after test: allocated reserved
2435170304 6111100928 2 12972544
Epoch: 010, Runtime 2.132052, Loss 1.561344, forward nfe 656, backward nfe 160, Train: 0.5114, Val: 0.5359, Test: 0.5397, Best time: 3.6760
before train: allocated reserved
1505060864 4156555264 2 12972544




after test: allocated reserved
2435170816 6111100928 2 12972544
Epoch: 011, Runtime 2.131982, Loss 1.533624, forward nfe 726, backward nfe 176, Train: 0.5525, Val: 0.5631, Test: 0.5575, Best time: 5.0604
before train: allocated reserved
1505787392 4156555264 2 12972544




after test: allocated reserved
2434444800 6111100928 2 12972544
Epoch: 012, Runtime 2.132167, Loss 1.520287, forward nfe 796, backward nfe 192, Train: 0.5525, Val: 0.5631, Test: 0.5575, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 013, Runtime 2.133778, Loss 1.490342, forward nfe 866, backward nfe 208, Train: 0.5701, Val: 0.5727, Test: 0.5561, Best time: 5.0287
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 014, Runtime 2.134248, Loss 1.495018, forward nfe 936, backward nfe 224, Train: 0.5701, Val: 0.5727, Test: 0.5561, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 015, Runtime 2.132890, Loss 1.490550, forward nfe 1006, backward nfe 240, Train: 0.5701, Val: 0.5727, Test: 0.5561, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 016, Runtime 2.134703, Loss 1.463875, forward nfe 1076, backward nfe 256, Train: 0.5701, Val: 0.5727, Test: 0.5561, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 017, Runtime 2.132648, Loss 1.439820, forward nfe 1146, backward nfe 272, Train: 0.5701, Val: 0.5727, Test: 0.5561, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 018, Runtime 2.133724, Loss 1.424446, forward nfe 1216, backward nfe 288, Train: 0.5701, Val: 0.5727, Test: 0.5561, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444800 6043992064 2 12972544
Epoch: 019, Runtime 2.133769, Loss 1.407836, forward nfe 1286, backward nfe 304, Train: 0.5701, Val: 0.5727, Test: 0.5561, Best time: 3.6760
before train: allocated reserved
1505061376 4089446400 2 12972544




after test: allocated reserved
2434444800 6043992064 2 12972544
Epoch: 020, Runtime 2.136408, Loss 1.394886, forward nfe 1356, backward nfe 320, Train: 0.6096, Val: 0.6019, Test: 0.6165, Best time: 7.2690
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 021, Runtime 2.134227, Loss 1.380493, forward nfe 1426, backward nfe 336, Train: 0.6096, Val: 0.6019, Test: 0.6165, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 022, Runtime 2.135217, Loss 1.369253, forward nfe 1496, backward nfe 352, Train: 0.6350, Val: 0.6364, Test: 0.6452, Best time: 7.1946
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 023, Runtime 2.135655, Loss 1.357157, forward nfe 1566, backward nfe 368, Train: 0.6350, Val: 0.6364, Test: 0.6452, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 024, Runtime 2.136337, Loss 1.349045, forward nfe 1636, backward nfe 384, Train: 0.6475, Val: 0.6531, Test: 0.6577, Best time: 7.0929
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 025, Runtime 2.136203, Loss 1.338705, forward nfe 1706, backward nfe 400, Train: 0.6475, Val: 0.6531, Test: 0.6577, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444800 6043992064 2 12972544
Epoch: 026, Runtime 2.136506, Loss 1.333053, forward nfe 1776, backward nfe 416, Train: 0.6595, Val: 0.6653, Test: 0.6624, Best time: 6.9897
before train: allocated reserved
1505061376 4089446400 2 12972544




after test: allocated reserved
2434444800 6043992064 2 12972544
Epoch: 027, Runtime 2.136945, Loss 1.322487, forward nfe 1846, backward nfe 432, Train: 0.6595, Val: 0.6653, Test: 0.6624, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 028, Runtime 2.136636, Loss 1.317096, forward nfe 1916, backward nfe 448, Train: 0.6595, Val: 0.6653, Test: 0.6624, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 029, Runtime 2.138465, Loss 1.306982, forward nfe 1986, backward nfe 464, Train: 0.6595, Val: 0.6653, Test: 0.6624, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 030, Runtime 2.137079, Loss 1.304060, forward nfe 2056, backward nfe 480, Train: 0.6595, Val: 0.6653, Test: 0.6624, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 031, Runtime 2.136886, Loss 1.294568, forward nfe 2126, backward nfe 496, Train: 0.6595, Val: 0.6653, Test: 0.6624, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 032, Runtime 2.137596, Loss 1.298335, forward nfe 2196, backward nfe 512, Train: 0.6595, Val: 0.6653, Test: 0.6624, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 033, Runtime 2.137270, Loss 1.286620, forward nfe 2266, backward nfe 528, Train: 0.6595, Val: 0.6653, Test: 0.6624, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 034, Runtime 2.137154, Loss 1.287769, forward nfe 2336, backward nfe 544, Train: 0.6595, Val: 0.6653, Test: 0.6624, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 035, Runtime 2.139032, Loss 1.277176, forward nfe 2406, backward nfe 560, Train: 0.6595, Val: 0.6653, Test: 0.6624, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 036, Runtime 2.138183, Loss 1.283590, forward nfe 2476, backward nfe 576, Train: 0.6595, Val: 0.6653, Test: 0.6624, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 037, Runtime 2.136764, Loss 1.287752, forward nfe 2546, backward nfe 592, Train: 0.6595, Val: 0.6653, Test: 0.6624, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 038, Runtime 2.138548, Loss 1.288240, forward nfe 2616, backward nfe 608, Train: 0.6595, Val: 0.6653, Test: 0.6624, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 039, Runtime 2.139306, Loss 1.282103, forward nfe 2686, backward nfe 624, Train: 0.6615, Val: 0.6666, Test: 0.6629, Best time: 6.2464
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 040, Runtime 2.141016, Loss 1.283102, forward nfe 2756, backward nfe 640, Train: 0.6775, Val: 0.6703, Test: 0.6508, Best time: 6.2124
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 041, Runtime 2.140358, Loss 1.259356, forward nfe 2826, backward nfe 656, Train: 0.6733, Val: 0.6751, Test: 0.6685, Best time: 6.1947
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 042, Runtime 2.139648, Loss 1.255477, forward nfe 2896, backward nfe 672, Train: 0.6733, Val: 0.6751, Test: 0.6685, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 043, Runtime 2.139259, Loss 1.243481, forward nfe 2966, backward nfe 688, Train: 0.6811, Val: 0.6812, Test: 0.6726, Best time: 6.1621
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 044, Runtime 2.138723, Loss 1.241348, forward nfe 3036, backward nfe 704, Train: 0.6811, Val: 0.6812, Test: 0.6726, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 045, Runtime 2.141486, Loss 1.238640, forward nfe 3106, backward nfe 720, Train: 0.6847, Val: 0.6819, Test: 0.6645, Best time: 6.1277
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 046, Runtime 2.140018, Loss 1.235352, forward nfe 3176, backward nfe 736, Train: 0.6847, Val: 0.6819, Test: 0.6645, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 047, Runtime 2.142477, Loss 1.238091, forward nfe 3246, backward nfe 752, Train: 0.6847, Val: 0.6819, Test: 0.6645, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 048, Runtime 2.141075, Loss 1.231826, forward nfe 3316, backward nfe 768, Train: 0.6847, Val: 0.6819, Test: 0.6645, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 049, Runtime 2.142256, Loss 1.229268, forward nfe 3386, backward nfe 784, Train: 0.6857, Val: 0.6825, Test: 0.6601, Best time: 6.0180
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 050, Runtime 2.142290, Loss 1.224825, forward nfe 3456, backward nfe 800, Train: 0.6857, Val: 0.6825, Test: 0.6601, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444800 6043992064 2 12972544
Epoch: 051, Runtime 2.140707, Loss 1.222159, forward nfe 3526, backward nfe 816, Train: 0.6857, Val: 0.6825, Test: 0.6601, Best time: 3.6760
before train: allocated reserved
1505061376 4089446400 2 12972544




after test: allocated reserved
2434444800 7019167744 2 12972544
Epoch: 052, Runtime 2.140594, Loss 1.217671, forward nfe 3596, backward nfe 832, Train: 0.6857, Val: 0.6825, Test: 0.6601, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 053, Runtime 2.141123, Loss 1.216479, forward nfe 3666, backward nfe 848, Train: 0.6857, Val: 0.6825, Test: 0.6601, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 054, Runtime 2.282874, Loss 1.211761, forward nfe 3736, backward nfe 864, Train: 0.6857, Val: 0.6825, Test: 0.6601, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 055, Runtime 2.141841, Loss 1.207314, forward nfe 3812, backward nfe 880, Train: 0.6857, Val: 0.6825, Test: 0.6601, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 056, Runtime 2.282422, Loss 1.205518, forward nfe 3882, backward nfe 896, Train: 0.6857, Val: 0.6825, Test: 0.6601, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 057, Runtime 2.141658, Loss 1.202264, forward nfe 3958, backward nfe 912, Train: 0.6857, Val: 0.6825, Test: 0.6601, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 058, Runtime 2.281448, Loss 1.201733, forward nfe 4028, backward nfe 928, Train: 0.6857, Val: 0.6825, Test: 0.6601, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 059, Runtime 2.140995, Loss 1.203570, forward nfe 4104, backward nfe 944, Train: 0.6857, Val: 0.6825, Test: 0.6601, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 060, Runtime 2.282834, Loss 1.206118, forward nfe 4174, backward nfe 960, Train: 0.6857, Val: 0.6825, Test: 0.6601, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 061, Runtime 2.282941, Loss 1.209662, forward nfe 4250, backward nfe 976, Train: 0.6857, Val: 0.6825, Test: 0.6601, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444800 6043992064 2 12972544
Epoch: 062, Runtime 2.282295, Loss 1.210039, forward nfe 4326, backward nfe 992, Train: 0.6857, Val: 0.6825, Test: 0.6601, Best time: 3.6760
before train: allocated reserved
1505061376 4089446400 2 12972544




after test: allocated reserved
2434444800 6043992064 2 12972544
Epoch: 063, Runtime 2.281411, Loss 1.208198, forward nfe 4402, backward nfe 1008, Train: 0.6857, Val: 0.6825, Test: 0.6601, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 064, Runtime 2.281807, Loss 1.214048, forward nfe 4478, backward nfe 1024, Train: 0.6857, Val: 0.6825, Test: 0.6601, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 065, Runtime 2.285228, Loss 1.211374, forward nfe 4554, backward nfe 1040, Train: 0.6857, Val: 0.6825, Test: 0.6601, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434446848 6043992064 2 12972544
Epoch: 066, Runtime 2.284818, Loss 1.213263, forward nfe 4630, backward nfe 1056, Train: 0.6857, Val: 0.6825, Test: 0.6601, Best time: 3.6760
before train: allocated reserved
1505063424 4089446400 2 12972544




after test: allocated reserved
2434446848 6043992064 2 12972544
Epoch: 067, Runtime 2.282291, Loss 1.217883, forward nfe 4706, backward nfe 1072, Train: 0.6857, Val: 0.6825, Test: 0.6601, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434446848 6043992064 2 12972544
Epoch: 068, Runtime 2.286851, Loss 1.201325, forward nfe 4782, backward nfe 1088, Train: 0.6857, Val: 0.6825, Test: 0.6601, Best time: 3.6760
before train: allocated reserved
1505063424 4089446400 2 12972544




after test: allocated reserved
2434446848 6043992064 2 12972544
Epoch: 069, Runtime 2.283789, Loss 1.189611, forward nfe 4858, backward nfe 1104, Train: 0.6857, Val: 0.6825, Test: 0.6601, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 070, Runtime 2.285119, Loss 1.190413, forward nfe 4934, backward nfe 1120, Train: 0.6857, Val: 0.6825, Test: 0.6601, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 071, Runtime 2.283258, Loss 1.196368, forward nfe 5010, backward nfe 1136, Train: 0.6857, Val: 0.6825, Test: 0.6601, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 072, Runtime 2.284803, Loss 1.187998, forward nfe 5086, backward nfe 1152, Train: 0.6857, Val: 0.6825, Test: 0.6601, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 073, Runtime 2.284715, Loss 1.193100, forward nfe 5162, backward nfe 1168, Train: 0.6857, Val: 0.6825, Test: 0.6601, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 074, Runtime 2.285682, Loss 1.187896, forward nfe 5238, backward nfe 1184, Train: 0.6857, Val: 0.6825, Test: 0.6601, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 075, Runtime 2.283334, Loss 1.184723, forward nfe 5314, backward nfe 1200, Train: 0.6956, Val: 0.6952, Test: 0.6950, Best time: 5.4588
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 076, Runtime 2.283763, Loss 1.178550, forward nfe 5390, backward nfe 1216, Train: 0.6956, Val: 0.6952, Test: 0.6950, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 077, Runtime 2.283973, Loss 1.171537, forward nfe 5466, backward nfe 1232, Train: 0.7004, Val: 0.6982, Test: 0.6971, Best time: 5.4435
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 078, Runtime 2.284306, Loss 1.165042, forward nfe 5542, backward nfe 1248, Train: 0.7004, Val: 0.6982, Test: 0.6971, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 079, Runtime 2.284778, Loss 1.159745, forward nfe 5618, backward nfe 1264, Train: 0.7038, Val: 0.6994, Test: 0.6922, Best time: 5.4299
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 080, Runtime 2.285426, Loss 1.156617, forward nfe 5694, backward nfe 1280, Train: 0.7038, Val: 0.6994, Test: 0.6922, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 081, Runtime 2.284631, Loss 1.154815, forward nfe 5770, backward nfe 1296, Train: 0.7038, Val: 0.6994, Test: 0.6922, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 082, Runtime 2.284135, Loss 1.156082, forward nfe 5846, backward nfe 1312, Train: 0.7038, Val: 0.6994, Test: 0.6922, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 083, Runtime 2.417328, Loss 1.157592, forward nfe 5928, backward nfe 1328, Train: 0.7038, Val: 0.6994, Test: 0.6922, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 084, Runtime 2.286649, Loss 1.164430, forward nfe 6004, backward nfe 1344, Train: 0.7038, Val: 0.6994, Test: 0.6922, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444800 6043992064 2 12972544
Epoch: 085, Runtime 2.416790, Loss 1.162054, forward nfe 6086, backward nfe 1360, Train: 0.7038, Val: 0.6994, Test: 0.6922, Best time: 3.6760
before train: allocated reserved
1505061376 4089446400 2 12972544




after test: allocated reserved
2434444800 6043992064 2 12972544
Epoch: 086, Runtime 2.417781, Loss 1.165852, forward nfe 6168, backward nfe 1376, Train: 0.7038, Val: 0.6994, Test: 0.6922, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 087, Runtime 2.417533, Loss 1.160196, forward nfe 6250, backward nfe 1392, Train: 0.7038, Val: 0.6994, Test: 0.6922, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 088, Runtime 2.418004, Loss 1.155695, forward nfe 6332, backward nfe 1408, Train: 0.7038, Val: 0.6994, Test: 0.6922, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444800 6043992064 2 12972544
Epoch: 089, Runtime 2.419920, Loss 1.152120, forward nfe 6414, backward nfe 1424, Train: 0.7038, Val: 0.6994, Test: 0.6922, Best time: 3.6760
before train: allocated reserved
1505061376 4089446400 2 12972544




after test: allocated reserved
2434444800 6043992064 2 12972544
Epoch: 090, Runtime 2.418892, Loss 1.148581, forward nfe 6496, backward nfe 1440, Train: 0.7038, Val: 0.6994, Test: 0.6922, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 091, Runtime 2.420436, Loss 1.142343, forward nfe 6578, backward nfe 1456, Train: 0.7038, Val: 0.6994, Test: 0.6922, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 092, Runtime 2.419020, Loss 1.138423, forward nfe 6660, backward nfe 1472, Train: 0.7038, Val: 0.6994, Test: 0.6922, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 093, Runtime 2.419957, Loss 1.139502, forward nfe 6742, backward nfe 1488, Train: 0.7038, Val: 0.6994, Test: 0.6922, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 094, Runtime 2.420765, Loss 1.136667, forward nfe 6824, backward nfe 1504, Train: 0.7038, Val: 0.6994, Test: 0.6922, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 095, Runtime 2.420194, Loss 1.143490, forward nfe 6906, backward nfe 1520, Train: 0.7038, Val: 0.6994, Test: 0.6922, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 096, Runtime 2.421622, Loss 1.143039, forward nfe 6988, backward nfe 1536, Train: 0.7038, Val: 0.6994, Test: 0.6922, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 097, Runtime 2.418273, Loss 1.146141, forward nfe 7070, backward nfe 1552, Train: 0.7038, Val: 0.6994, Test: 0.6922, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 098, Runtime 2.419801, Loss 1.144164, forward nfe 7152, backward nfe 1568, Train: 0.7038, Val: 0.6994, Test: 0.6922, Best time: 3.6760
before train: allocated reserved
1505060864 4089446400 2 12972544




after test: allocated reserved
2434444288 6043992064 2 12972544
Epoch: 099, Runtime 2.419363, Loss 1.147638, forward nfe 7234, backward nfe 1584, Train: 0.7038, Val: 0.6994, Test: 0.6922, Best time: 3.6760
best val accuracy 0.699386 with test accuracy 0.692179 at epoch 79 and best time 3.676016


In [11]:
print(torch.cuda.memory_allocated())
print(torch.cuda.max_memory_allocated())
print(torch.cuda.memory_reserved())
print(torch.cuda.max_memory_reserved())

1505060864
5408115200
4089446400
7019167744
