# Pointer Networks

In [1]:
import sys
sys.path.insert(0, '../')

import math, operator
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import pickle

import os, time, copy
import json
import models.train_predict_utils as ut
%load_ext autoreload
%autoreload 2

In [2]:
from torch.utils.tensorboard import SummaryWriter
%load_ext tensorboard

In [3]:
from models.batch_env_rl import BatchEnvRL

from models.neural_net import Agent
from models.run_episode import RunEpisode

from models.features_utils import ScalerGlob, DynamicFeatures
from generator.op.generator_utils import get_generated_seeds

In [4]:
import torch, torch.nn as nn
import torch.autograd as autograd
from torch.distributions import Categorical
from torch.utils.checkpoint import checkpoint

from torch import optim
from torch import dot
import torch.nn.functional as F

In [5]:
# Print the installed PyTorch version so the run environment is visible in the output.
print(torch.__version__)

1.8.1


In [6]:
# For reproducibility: seed NumPy and PyTorch (CPU generator and the current
# CUDA device) with the same fixed value.
random_seed = 25029
np.random.seed(random_seed)

torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)

## Train

In [7]:
def trainEpochs(run_episode, ds, model_opt, scheduler, args, run_vr ='0'):
    """Train for ``args.epochs`` epochs with periodic logging, validation and checkpoints.

    Every epoch calls ``ut.train_test_model`` once and accumulates the four
    values it returns. Every ``args.nprint`` epochs the averages over the
    current reporting window are printed and logged to TensorBoard,
    ``ut.run_validation`` is run once per entry of ``val_ranges``, and the
    complete history (including the interval just finished) is rewritten to a
    CSV file under ``args.save_hist_dir``. Every ``args.nsave`` epochs the
    model and optimizer state dicts are saved under ``args.save_weights_dir``.

    Args:
        run_episode: Object passed to ``ut.train_test_model`` and
            ``ut.run_validation``; its ``state_dict()`` is checkpointed.
        ds: Object whose ``data_scaler`` attribute is passed to training and
            which is itself passed to validation.
        model_opt: Optimizer; its ``state_dict()`` is checkpointed.
        scheduler: Learning-rate scheduler forwarded to ``ut.train_test_model``.
        args: Configuration object. Read here: ``epochs``, ``nprint``,
            ``nsave``, ``save_hist_dir``, ``save_weights_dir``,
            ``agent_name``, ``noise_on`` and ``nb_name``.
        run_vr: Run identifier embedded in the output file names.

    Returns:
        pandas.DataFrame with one row per reporting interval and the columns
        ``epoch``, ``tr_rwd``, ``tr_pen``, ``tr_loss`` plus ``val_rwd_<n>`` /
        ``val_pen_<n>`` for every validation size ``n``.
    """
    # Two bounds per validation size, forwarded positionally to
    # ut.run_validation. NOTE(review): presumably the first/last validation
    # instance ids for that number of nodes -- confirm against run_validation.
    val_ranges = {
        20: [1, 250],
        50: [251, 500],
        100: [501, 750],
        200: [751, 1000],
    }

    # Running sums over the current reporting window (reset every nprint epochs).
    reward_total = 0
    tloss_total = 0
    rwds_total = 0
    pen_total = 0
    train_hist = []

    gen_seeds = get_generated_seeds()
    # Keep only seeds up to 4000.
    # NOTE(review): the original comment says "for all n_nodes" but only the
    # entry for 20 nodes is filtered here -- confirm this is intended.
    gen_seeds[20] = np.array([s for s in gen_seeds.get(20) if s<=4000])

    hist_path = '{path}/train_hist_{agent_name}_noise_{noise}_{notebook_name}_r{run_vr}.csv'.format(
        path=args.save_hist_dir,
        agent_name=args.agent_name,
        noise=str(int(args.noise_on)),
        notebook_name=args.nb_name,
        run_vr=run_vr)

    # Create the output directories up front so that the first save (which
    # only happens after nprint / nsave epochs) cannot fail on a missing path.
    os.makedirs(args.save_hist_dir, exist_ok=True)
    os.makedirs(args.save_weights_dir, exist_ok=True)

    writer = SummaryWriter()
    try:
        with tqdm(range(args.epochs), leave=False, desc='1th loop') as tepoch:

            step = 0
            for epoch in tepoch:
                avreward, tloss, avg_rwds, avg_pen = ut.train_test_model(
                    run_episode, ds.data_scaler, model_opt, scheduler, args, gen_seeds)

                reward_total += avreward
                tloss_total += tloss
                rwds_total += avg_rwds
                pen_total += avg_pen
                step += 1

                # 1-based epoch counter, used consistently for console output,
                # TensorBoard steps, the history rows and checkpoint names.
                global_step = epoch + 1

                if global_step % args.nprint == 0:
                    taverage_loss = tloss_total / step
                    avreward_total = reward_total / step
                    avg_rwds_total = rwds_total / step
                    avg_pen_total = pen_total / step

                    print('epoch: {}, Av. loss: {:.3f}, Av. final reward: {:.3f}'.format(str(global_step), taverage_loss, avreward_total))
                    print('epoch: {}, Av. rwd: {:.3f}, Av. pen: {:.3f}'.format(str(global_step), avg_rwds_total, avg_pen_total))
                    tepoch.set_postfix(loss=taverage_loss, reward=avg_rwds_total, penalty=avg_pen_total, final=avreward_total)
                    time.sleep(0.1)

                    # Start a fresh reporting window.
                    tloss_total = 0
                    reward_total = 0
                    rwds_total = 0
                    pen_total = 0
                    step = 0

                    step_dict = {
                        'epoch': global_step,
                        'tr_rwd': avg_rwds_total,
                        'tr_pen': avg_pen_total,
                        'tr_loss': taverage_loss,
                    }
                    writer.add_scalar('tr_rwd', avg_rwds_total, global_step)
                    writer.add_scalar('tr_pen', avg_pen_total, global_step)
                    writer.add_scalar('tr_total', avg_rwds_total+avg_pen_total, global_step)
                    writer.add_scalar('tr_loss', taverage_loss, global_step)

                    av_rws_total = 0
                    av_pens_total = 0
                    for n_nodes_val, (val_first, val_last) in val_ranges.items():
                        av_rwds, av_pens = ut.run_validation(run_episode,
                                                             val_first,
                                                             val_last,
                                                             ds, args, which_set='test')
                        print (f'validation {n_nodes_val} nodes - reward: {av_rwds:.2f}, penalty: {av_pens:.2f}, final: {(av_rwds+av_pens):.2f}')

                        writer.add_scalar(f'rwds_val_{n_nodes_val}', av_rwds, global_step)
                        writer.add_scalar(f'pens_val_{n_nodes_val}', av_pens, global_step)
                        writer.add_scalar(f'total_val_{n_nodes_val}', av_rwds+av_pens, global_step)

                        step_dict[f'val_rwd_{n_nodes_val}'] = av_rwds
                        step_dict[f'val_pen_{n_nodes_val}'] = av_pens
                        av_rws_total += av_rwds
                        av_pens_total += av_pens

                    # Average over however many validation sizes are configured.
                    n_val_sets = len(val_ranges)
                    writer.add_scalar('rwds_val_all', av_rws_total/n_val_sets, global_step)
                    writer.add_scalar('pens_val_all', av_pens_total/n_val_sets, global_step)
                    writer.add_scalar('total_val_all', av_rws_total/n_val_sets + av_pens_total/n_val_sets, global_step)
                    print(' ')

                    # Record the interval first, then persist, so the CSV on
                    # disk always includes the interval that just finished
                    # together with its validation metrics.
                    train_hist.append(step_dict)
                    pd.DataFrame(train_hist).to_csv(hist_path, index=False)

                if global_step % args.nsave == 0:
                    file_path = '{path}/model_{agent_name}_noise_{noise}_{notebook_name}_epoch_{epoch}_r{run_vr}.pkl'\
                                                .format(path=args.save_weights_dir,
                                                       agent_name=args.agent_name,
                                                       noise=str(int(args.noise_on)),
                                                       notebook_name=args.nb_name,
                                                       epoch=global_step,
                                                       run_vr=run_vr)

                    torch.save({
                            'epoch': global_step,
                            'model_state_dict': run_episode.state_dict(),
                            'optimizer_state_dict': model_opt.state_dict()}, file_path)
    finally:
        # Flush and close the TensorBoard writer even if training is
        # interrupted or raises.
        writer.close()
    return pd.DataFrame(train_hist)

## Set up

In [8]:
# ====================================================
# Config
# ====================================================
class args:
    """Static configuration namespace for the notebook (used without instantiation).

    Attributes are read directly from the class, e.g. ``args.lr``.
    """
    # --- Output locations and run naming ---
    save_weights_dir = '../weights'      # checkpoint directory
    save_hist_dir = '../training_hist'   # training-history CSV directory
    save_sub = '../submissions'
    epochs = 15000                       # number of training epochs
    n_nodes_list = range(10, 210)
    save_with_tr = True
    nb_name = 'nb7p0p8p3rg'              # embedded in output file names
    agent_name = 'agent001'              # embedded in output file names
    nsave = 1000                         # checkpoint every nsave epochs
    ndfeatures = 34
    # --- Optimisation ---
    lr = 1e-4
    min_lr = 1e-5                        # floor of the cosine LR schedule
    batch_size = 32
    weight_decay = 1e-5
    max_grad_norm = 2
    beta = 0.0 # for moving Av
    gamma = 0.01 # for entropy 
    # Model parameters
    rnn_hidden = 256  # dimension of decoder 
    encoder_dim = 256
    pre_lnorm = False
    has_glimpse = False
    use_lookahead = True
    dropout = 0.1
    n_layers = 3
    n_heads = 8
    ff_dim = 512
    # Use the GPU only when one is actually available, so the notebook also
    # runs on CPU-only machines instead of failing when tensors/modules are
    # moved to a non-existent CUDA device.
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    use_checkpoint = True
    nprint = 250                         # report/validate every nprint epochs
    n_sims = 6
    accumulation_steps = n_sims 
    from_file = True
    noise_on = True                      # embedded (as 0/1) in output file names
    feature_list = ['x_coordinate',
                    'y_coordinate',
                    'tw_low',
                    'tw_high',
                    'prize',
                    'tmax',
                    'tw_delta',
                    'prize_tw_delta_ratio',
                    'tw_high_tmax_delta',
                    'tw_low_tmax_delta',
                    'prize_max_return_time_ratio']

    # Number of entries in feature_list.
    nfeatures = len(feature_list)

## RunEpisode testing

In [9]:
# Build the Agent network from the configured feature counts and hidden size,
# then move it to the configured device.
model = Agent(args.nfeatures, args.ndfeatures, args.rnn_hidden, args).to(args.device)

In [10]:
# Wrap the model in a RunEpisode, handing it the DynamicFeatures class and the
# look-ahead flag from the config.
run_episode = RunEpisode(model, args, DynamicFeatures, args.use_lookahead)

In [11]:
# Scaler object: trainEpochs passes ds.data_scaler to training and ds itself
# to validation.
ds = ScalerGlob()

In [12]:
# AdamW over the RunEpisode parameters, with a cosine-annealing schedule that
# takes the learning rate from args.lr down to args.min_lr over args.epochs
# scheduler steps.
# NOTE(review): args.weight_decay (1e-5) is defined in the config but is not
# passed here, so AdamW falls back to its library default weight decay --
# confirm whether that is intended.
model_opt = optim.AdamW(run_episode.parameters(), lr=args.lr)
scheduler = optim.lr_scheduler.CosineAnnealingLR(model_opt, T_max= args.epochs, 
                                                        eta_min=args.min_lr)

In [14]:
# Run the full training loop (default run id '0'); the returned DataFrame holds
# one row of training/validation metrics per reporting interval.
train_hist = trainEpochs(run_episode, ds, model_opt, scheduler, args)

1th loop:   0%|          | 0/15000 [00:00<?, ?it/s]

epoch: 250, Av. loss: -0.294, Av. final reward: -1.692
epoch: 250, Av. rwd: 5.979, Av. pen: -7.671


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.16, penalty: -0.01, final: 5.15


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 7.64, penalty: 0.00, final: 7.64


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 10.46, penalty: 0.00, final: 10.46


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 15.75, penalty: -0.00, final: 15.75
 
epoch: 500, Av. loss: -0.184, Av. final reward: 5.489
epoch: 500, Av. rwd: 7.304, Av. pen: -1.815


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.21, penalty: -0.00, final: 5.21


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 7.86, penalty: -0.00, final: 7.85


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 10.98, penalty: -0.01, final: 10.97


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 16.66, penalty: -0.00, final: 16.66
 
epoch: 750, Av. loss: -0.364, Av. final reward: 6.023
epoch: 750, Av. rwd: 7.277, Av. pen: -1.254


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.25, penalty: -0.08, final: 5.17


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 7.94, penalty: 0.00, final: 7.94


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.13, penalty: -0.02, final: 11.11


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.09, penalty: -0.01, final: 17.08
 
epoch: 1000, Av. loss: -0.211, Av. final reward: 5.827
epoch: 1000, Av. rwd: 7.310, Av. pen: -1.483


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.27, penalty: 0.00, final: 5.27


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 7.99, penalty: 0.00, final: 7.99


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.22, penalty: -0.01, final: 11.21


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.22, penalty: 0.00, final: 17.22
 
epoch: 1250, Av. loss: -0.195, Av. final reward: 6.400
epoch: 1250, Av. rwd: 7.568, Av. pen: -1.168


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.25, penalty: 0.00, final: 5.25


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 7.96, penalty: 0.00, final: 7.96


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.25, penalty: 0.00, final: 11.25


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.14, penalty: -0.00, final: 17.14
 
epoch: 1500, Av. loss: -45.013, Av. final reward: -10.624
epoch: 1500, Av. rwd: 7.632, Av. pen: -18.256


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.27, penalty: 0.00, final: 5.27


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.03, penalty: 0.00, final: 8.03


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.35, penalty: 0.00, final: 11.35


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.46, penalty: -0.00, final: 17.46
 
epoch: 1750, Av. loss: -8.835, Av. final reward: 2.319
epoch: 1750, Av. rwd: 7.833, Av. pen: -5.513


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.27, penalty: 0.00, final: 5.27


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.03, penalty: 0.00, final: 8.03


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.35, penalty: 0.00, final: 11.35


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.50, penalty: -0.00, final: 17.50
 
epoch: 2000, Av. loss: -21.836, Av. final reward: -1.055
epoch: 2000, Av. rwd: 8.147, Av. pen: -9.201


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.27, penalty: 0.00, final: 5.27


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.05, penalty: 0.00, final: 8.05


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.39, penalty: 0.00, final: 11.39


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.49, penalty: 0.00, final: 17.49
 
epoch: 2250, Av. loss: -45.842, Av. final reward: -14.181
epoch: 2250, Av. rwd: 7.997, Av. pen: -22.178


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.27, penalty: 0.00, final: 5.27


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.06, penalty: 0.00, final: 8.06


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.40, penalty: -1.60, final: 9.80


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.58, penalty: -1.60, final: 15.98
 
epoch: 2500, Av. loss: -11.045, Av. final reward: 4.541
epoch: 2500, Av. rwd: 8.064, Av. pen: -3.524


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.27, penalty: 0.00, final: 5.27


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.04, penalty: 0.00, final: 8.04


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.41, penalty: -1.60, final: 9.81


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.54, penalty: -1.60, final: 15.94
 
epoch: 2750, Av. loss: -3.266, Av. final reward: 5.190
epoch: 2750, Av. rwd: 8.087, Av. pen: -2.897


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.28, penalty: 0.00, final: 5.28


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.04, penalty: 0.00, final: 8.04


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.40, penalty: 0.00, final: 11.40


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.49, penalty: 0.00, final: 17.49
 
epoch: 3000, Av. loss: -0.561, Av. final reward: 6.847
epoch: 3000, Av. rwd: 8.016, Av. pen: -1.169


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.28, penalty: 0.00, final: 5.28


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.04, penalty: 0.00, final: 8.04


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.41, penalty: 0.00, final: 11.41


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.60, penalty: 0.00, final: 17.60
 
epoch: 3250, Av. loss: -0.146, Av. final reward: 7.355
epoch: 3250, Av. rwd: 7.853, Av. pen: -0.498


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.27, penalty: 0.00, final: 5.27


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.05, penalty: 0.00, final: 8.05


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.42, penalty: 0.00, final: 11.42


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.56, penalty: 0.00, final: 17.56
 
epoch: 3500, Av. loss: -8.375, Av. final reward: 4.483
epoch: 3500, Av. rwd: 8.639, Av. pen: -4.156


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.28, penalty: 0.00, final: 5.28


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.05, penalty: 0.00, final: 8.05


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.40, penalty: 0.00, final: 11.40


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.60, penalty: 0.00, final: 17.60
 
epoch: 3750, Av. loss: -28.052, Av. final reward: 1.222
epoch: 3750, Av. rwd: 7.818, Av. pen: -6.596


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.28, penalty: 0.00, final: 5.28


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.04, penalty: 0.00, final: 8.04


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.43, penalty: 0.00, final: 11.43


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.60, penalty: 0.00, final: 17.60
 
epoch: 4000, Av. loss: -42.473, Av. final reward: 0.250
epoch: 4000, Av. rwd: 8.219, Av. pen: -7.969


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.27, penalty: 0.00, final: 5.27


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.06, penalty: 0.00, final: 8.06


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.41, penalty: 0.00, final: 11.41


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.58, penalty: 0.00, final: 17.58
 
epoch: 4250, Av. loss: -24.705, Av. final reward: 3.203
epoch: 4250, Av. rwd: 8.108, Av. pen: -4.905


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.28, penalty: 0.00, final: 5.28


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.07, penalty: 0.00, final: 8.07


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.44, penalty: 0.00, final: 11.44


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.64, penalty: 0.00, final: 17.64
 
epoch: 4500, Av. loss: -23.881, Av. final reward: 2.325
epoch: 4500, Av. rwd: 8.215, Av. pen: -5.890


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.28, penalty: 0.00, final: 5.28


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.06, penalty: -0.00, final: 8.06


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.43, penalty: 0.00, final: 11.43


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.57, penalty: 0.00, final: 17.57
 
epoch: 4750, Av. loss: -0.823, Av. final reward: 7.575
epoch: 4750, Av. rwd: 8.581, Av. pen: -1.006


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.27, penalty: 0.00, final: 5.27


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.06, penalty: 0.00, final: 8.06


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.43, penalty: 0.00, final: 11.43


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.62, penalty: 0.00, final: 17.62
 
epoch: 5000, Av. loss: -17.877, Av. final reward: 1.650
epoch: 5000, Av. rwd: 7.921, Av. pen: -6.271


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.33, penalty: -0.08, final: 5.25


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.09, penalty: -0.20, final: 7.89


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.43, penalty: -0.40, final: 11.03


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.61, penalty: 0.00, final: 17.61
 
epoch: 5250, Av. loss: -8.652, Av. final reward: 5.113
epoch: 5250, Av. rwd: 8.337, Av. pen: -3.223


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.31, penalty: -0.08, final: 5.23


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.10, penalty: 0.00, final: 8.10


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.43, penalty: 0.00, final: 11.43


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.64, penalty: 0.00, final: 17.64
 
epoch: 5500, Av. loss: -3.622, Av. final reward: 5.812
epoch: 5500, Av. rwd: 8.078, Av. pen: -2.266


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.31, penalty: -0.08, final: 5.23


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.07, penalty: 0.00, final: 8.07


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.43, penalty: 0.00, final: 11.43


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.62, penalty: 0.00, final: 17.62
 
epoch: 5750, Av. loss: -15.379, Av. final reward: 1.951
epoch: 5750, Av. rwd: 8.042, Av. pen: -6.092


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.30, penalty: -0.08, final: 5.22


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.09, penalty: 0.00, final: 8.09


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.45, penalty: 0.00, final: 11.45


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.66, penalty: 0.00, final: 17.66
 
epoch: 6000, Av. loss: -6.059, Av. final reward: 6.372
epoch: 6000, Av. rwd: 8.212, Av. pen: -1.840


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.31, penalty: -0.08, final: 5.23


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.09, penalty: 0.00, final: 8.09


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.45, penalty: 0.00, final: 11.45


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.68, penalty: 0.00, final: 17.68
 
epoch: 6250, Av. loss: -1.505, Av. final reward: 7.484
epoch: 6250, Av. rwd: 8.501, Av. pen: -1.017


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.34, penalty: -0.16, final: 5.18


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.10, penalty: 0.00, final: 8.10


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.46, penalty: 0.00, final: 11.46


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.63, penalty: 0.00, final: 17.63
 
epoch: 6500, Av. loss: -4.701, Av. final reward: 6.141
epoch: 6500, Av. rwd: 8.784, Av. pen: -2.643


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.38, penalty: -0.24, final: 5.14


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.10, penalty: -0.40, final: 7.70


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.45, penalty: 0.00, final: 11.45


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.62, penalty: 0.00, final: 17.62
 
epoch: 6750, Av. loss: -4.128, Av. final reward: 6.616
epoch: 6750, Av. rwd: 8.342, Av. pen: -1.726


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.30, penalty: 0.00, final: 5.30


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.07, penalty: 0.00, final: 8.07


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.46, penalty: 0.00, final: 11.46


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.62, penalty: 0.00, final: 17.62
 
epoch: 7000, Av. loss: -2.673, Av. final reward: 7.310
epoch: 7000, Av. rwd: 8.509, Av. pen: -1.199


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.35, penalty: -0.16, final: 5.19


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.12, penalty: 0.00, final: 8.12


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.48, penalty: 0.00, final: 11.48


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.67, penalty: 0.00, final: 17.67
 
epoch: 7250, Av. loss: -0.111, Av. final reward: 7.642
epoch: 7250, Av. rwd: 8.113, Av. pen: -0.471


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.28, penalty: 0.00, final: 5.28


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.07, penalty: 0.00, final: 8.07


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.44, penalty: 0.00, final: 11.44


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.60, penalty: 0.00, final: 17.60
 
epoch: 7500, Av. loss: -2.399, Av. final reward: 6.726
epoch: 7500, Av. rwd: 8.493, Av. pen: -1.767


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.33, penalty: -0.16, final: 5.17


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.09, penalty: 0.00, final: 8.09


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.45, penalty: 0.00, final: 11.45


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.55, penalty: 0.00, final: 17.55
 
epoch: 7750, Av. loss: -2.607, Av. final reward: 6.738
epoch: 7750, Av. rwd: 7.895, Av. pen: -1.157


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.32, penalty: -0.08, final: 5.24


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.10, penalty: 0.00, final: 8.10


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.45, penalty: 0.00, final: 11.45


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.65, penalty: 0.00, final: 17.65
 
epoch: 8000, Av. loss: -2.523, Av. final reward: 7.451
epoch: 8000, Av. rwd: 8.865, Av. pen: -1.414


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.40, penalty: -0.24, final: 5.16


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.13, penalty: -0.40, final: 7.73


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.50, penalty: -0.40, final: 11.10


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.66, penalty: 0.00, final: 17.66
 
epoch: 8250, Av. loss: -0.854, Av. final reward: 7.621
epoch: 8250, Av. rwd: 8.537, Av. pen: -0.916


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.49, penalty: -0.48, final: 5.01


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.24, penalty: -0.40, final: 7.83


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.63, penalty: -2.40, final: 9.23


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.84, penalty: -2.40, final: 15.44
 
epoch: 8500, Av. loss: -2.821, Av. final reward: 6.712
epoch: 8500, Av. rwd: 8.252, Av. pen: -1.539


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.62, penalty: -1.36, final: 4.26


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.42, penalty: -1.60, final: 6.82


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.99, penalty: -4.00, final: 7.99


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 18.28, penalty: -11.20, final: 7.08
 
epoch: 8750, Av. loss: -22.495, Av. final reward: 3.812
epoch: 8750, Av. rwd: 8.199, Av. pen: -4.387


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.36, penalty: -0.16, final: 5.20


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.12, penalty: 0.00, final: 8.12


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.48, penalty: 0.00, final: 11.48


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.71, penalty: 0.00, final: 17.71
 


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



validation 200 nodes - reward: 17.67, penalty: 0.00, final: 17.67
 
epoch: 9500, Av. loss: -17.128, Av. final reward: 5.255
epoch: 9500, Av. rwd: 8.394, Av. pen: -3.139


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.44, penalty: -0.32, final: 5.12


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.18, penalty: -0.60, final: 7.58


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.57, penalty: -0.80, final: 10.77


  0%|          | 0/250 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



validation 200 nodes - reward: 17.64, penalty: 0.00, final: 17.64
 
epoch: 10000, Av. loss: -15.287, Av. final reward: 5.786
epoch: 10000, Av. rwd: 8.648, Av. pen: -2.862


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.39, penalty: -0.24, final: 5.15


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.13, penalty: -0.20, final: 7.93


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.49, penalty: 0.00, final: 11.49


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.64, penalty: 0.00, final: 17.64
 
epoch: 10250, Av. loss: -17.709, Av. final reward: 4.984
epoch: 10250, Av. rwd: 8.572, Av. pen: -3.587


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.39, penalty: -0.16, final: 5.23


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.14, penalty: -0.20, final: 7.94


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.51, penalty: 0.00, final: 11.51


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.67, penalty: 0.00, final: 17.67
 
epoch: 10500, Av. loss: -15.878, Av. final reward: 5.257
epoch: 10500, Av. rwd: 9.065, Av. pen: -3.808


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.36, penalty: -0.24, final: 5.12


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.13, penalty: -0.20, final: 7.93


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.51, penalty: 0.00, final: 11.51


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.68, penalty: 0.00, final: 17.68
 
epoch: 10750, Av. loss: -10.012, Av. final reward: 6.064
epoch: 10750, Av. rwd: 8.200, Av. pen: -2.136


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.41, penalty: -0.24, final: 5.17


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.14, penalty: -0.20, final: 7.94


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.51, penalty: 0.00, final: 11.51


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.69, penalty: 0.00, final: 17.69
 
epoch: 11000, Av. loss: -18.161, Av. final reward: 5.005
epoch: 11000, Av. rwd: 8.602, Av. pen: -3.596


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.42, penalty: -0.16, final: 5.26


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.13, penalty: -0.20, final: 7.93


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.51, penalty: 0.00, final: 11.51


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.66, penalty: 0.00, final: 17.66
 
epoch: 11250, Av. loss: -7.648, Av. final reward: 6.770
epoch: 11250, Av. rwd: 8.327, Av. pen: -1.557


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.41, penalty: -0.24, final: 5.17


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.16, penalty: -0.20, final: 7.96


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.53, penalty: 0.00, final: 11.53


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.70, penalty: 0.00, final: 17.70
 
epoch: 11500, Av. loss: -8.299, Av. final reward: 6.007
epoch: 11500, Av. rwd: 7.802, Av. pen: -1.795


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.44, penalty: -0.40, final: 5.04


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.19, penalty: -0.20, final: 7.99


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.54, penalty: 0.00, final: 11.54


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.70, penalty: 0.00, final: 17.70
 
epoch: 11750, Av. loss: -11.335, Av. final reward: 6.062
epoch: 11750, Av. rwd: 8.164, Av. pen: -2.102


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.43, penalty: -0.32, final: 5.11


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.17, penalty: -0.20, final: 7.97


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.55, penalty: -0.40, final: 11.15


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.68, penalty: 0.00, final: 17.68
 
epoch: 12000, Av. loss: -13.870, Av. final reward: 6.565
epoch: 12000, Av. rwd: 8.707, Av. pen: -2.142


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.38, penalty: -0.16, final: 5.22


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.13, penalty: -0.20, final: 7.93


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.51, penalty: 0.00, final: 11.51


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.72, penalty: 0.00, final: 17.72
 
epoch: 12250, Av. loss: -8.502, Av. final reward: 6.345
epoch: 12250, Av. rwd: 8.113, Av. pen: -1.768


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.41, penalty: -0.16, final: 5.25


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.14, penalty: 0.00, final: 8.14


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.52, penalty: 0.00, final: 11.52


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.71, penalty: 0.00, final: 17.71
 
epoch: 12500, Av. loss: -9.155, Av. final reward: 7.102
epoch: 12500, Av. rwd: 8.759, Av. pen: -1.657


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.47, penalty: -0.32, final: 5.15


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.21, penalty: 0.00, final: 8.21


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.60, penalty: 0.00, final: 11.60


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.78, penalty: -1.60, final: 16.18
 
epoch: 12750, Av. loss: -7.876, Av. final reward: 6.548
epoch: 12750, Av. rwd: 8.338, Av. pen: -1.790


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.42, penalty: -0.16, final: 5.26


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.12, penalty: -0.20, final: 7.92


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.52, penalty: 0.00, final: 11.52


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.72, penalty: 0.00, final: 17.72
 
epoch: 13000, Av. loss: -11.014, Av. final reward: 7.228
epoch: 13000, Av. rwd: 9.159, Av. pen: -1.931


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.43, penalty: -0.16, final: 5.27


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.15, penalty: 0.00, final: 8.15


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.54, penalty: 0.00, final: 11.54


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.70, penalty: 0.00, final: 17.70
 
epoch: 13250, Av. loss: -5.418, Av. final reward: 7.525
epoch: 13250, Av. rwd: 8.683, Av. pen: -1.159


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.43, penalty: -0.16, final: 5.27


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.15, penalty: 0.00, final: 8.15


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.54, penalty: 0.00, final: 11.54


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.72, penalty: 0.00, final: 17.72
 
epoch: 13500, Av. loss: -8.011, Av. final reward: 6.416
epoch: 13500, Av. rwd: 8.250, Av. pen: -1.834


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.44, penalty: -0.32, final: 5.12


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.18, penalty: 0.00, final: 8.18


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.54, penalty: -0.40, final: 11.14


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.68, penalty: 0.00, final: 17.68
 
epoch: 13750, Av. loss: -6.678, Av. final reward: 6.364
epoch: 13750, Av. rwd: 8.041, Av. pen: -1.677


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.42, penalty: -0.16, final: 5.26


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.15, penalty: -0.20, final: 7.95


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.54, penalty: 0.00, final: 11.54


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.69, penalty: 0.00, final: 17.69
 
epoch: 14000, Av. loss: -8.867, Av. final reward: 7.139
epoch: 14000, Av. rwd: 8.307, Av. pen: -1.168


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.41, penalty: -0.16, final: 5.25


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.14, penalty: -0.20, final: 7.94


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.54, penalty: 0.00, final: 11.54


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.71, penalty: 0.00, final: 17.71
 
epoch: 14250, Av. loss: -13.584, Av. final reward: 5.919
epoch: 14250, Av. rwd: 8.121, Av. pen: -2.202


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.39, penalty: -0.16, final: 5.23


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.14, penalty: -0.20, final: 7.94


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.52, penalty: 0.00, final: 11.52


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.68, penalty: 0.00, final: 17.68
 
epoch: 14500, Av. loss: -9.481, Av. final reward: 6.315
epoch: 14500, Av. rwd: 7.932, Av. pen: -1.617


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.41, penalty: -0.16, final: 5.25


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.15, penalty: -0.20, final: 7.95


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.54, penalty: 0.00, final: 11.54


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.69, penalty: 0.00, final: 17.69
 
epoch: 14750, Av. loss: -4.635, Av. final reward: 7.024
epoch: 14750, Av. rwd: 8.341, Av. pen: -1.317


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.43, penalty: -0.16, final: 5.27


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.14, penalty: 0.00, final: 8.14


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.54, penalty: 0.00, final: 11.54


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.71, penalty: 0.00, final: 17.71
 
epoch: 15000, Av. loss: -8.141, Av. final reward: 6.771
epoch: 15000, Av. rwd: 8.359, Av. pen: -1.588


  0%|          | 0/250 [00:00<?, ?it/s]

validation 20 nodes - reward: 5.43, penalty: -0.16, final: 5.27


  0%|          | 0/250 [00:00<?, ?it/s]

validation 50 nodes - reward: 8.15, penalty: -0.20, final: 7.95


  0%|          | 0/250 [00:00<?, ?it/s]

validation 100 nodes - reward: 11.54, penalty: 0.00, final: 11.54


  0%|          | 0/250 [00:00<?, ?it/s]

validation 200 nodes - reward: 17.72, penalty: 0.00, final: 17.72
 


In [None]:
# Restore the trained policy and its optimizer from a saved checkpoint.
# 15000 is the last epoch reported in the training log above.
# NOTE(review): the '_r0' suffix presumably matches trainEpochs' default
# run_vr='0' -- confirm against the code that writes the checkpoint.
load_epoch = 15000
weights_path = '{path}/model_{agent_name}_noise_{noise}_{notebook_name}_epoch_{epoch}_r0.pkl'.format(
    path=args.save_weights_dir,
    agent_name=args.agent_name,
    noise=str(int(args.noise_on)),
    notebook_name=args.nb_name,
    epoch=load_epoch)

# Deserialize onto the CPU so the file can be read regardless of the device it
# was saved from (e.g. a GPU checkpoint on a CPU-only machine); the
# load_state_dict calls below copy the values into the existing model and
# optimizer, which stay on whatever device they already live on.
# NOTE(review): the name `checkpoint` shadows the function imported from
# torch.utils.checkpoint at the top of the notebook; it is kept here so that
# any cell relying on this variable keeps working.
checkpoint = torch.load(weights_path, map_location='cpu')
run_episode.load_state_dict(checkpoint['model_state_dict'])
model_opt.load_state_dict(checkpoint['optimizer_state_dict'])

In [15]:
# Score the current policy on the 'test' set.
# NOTE(review): 1 and 1000 are presumably the first and last instance ids to
# evaluate (the progress bar below counts 1000 items) -- confirm in
# ut.run_validation.
av_rwds, av_pens = ut.run_validation(
    run_episode, 1, 1000, ds, args, which_set='test'
)

# Report the combined score first, then its two components.
test_total = av_rwds + av_pens
print(test_total, av_rwds, av_pens)

  0%|          | 0/1000 [00:00<?, ?it/s]

10.617339999999997 10.707339999999997 -0.09


In [16]:
# Build the submission for the 'test' set and collect its average reward and
# average penalty.
# NOTE(review): n_tours=100 presumably is the number of tours generated per
# instance, and with_as=False presumably turns off an optional search step --
# confirm both in ut.create_submission.
av_rwds, av_pens = ut.create_submission(
    run_episode, ds, args,
    n_tours=100,
    with_as=False,
    which_set='test',
)

# Report the combined score first, then its two components.
submission_total = av_rwds + av_pens
print(submission_total, av_rwds, av_pens)

  0%|          | 0/1000 [00:00<?, ?it/s]

test  - reward: 10.719, penalty: -0.046
10.673176199999748 10.718786199999748 -0.04561
