# Models Testing


In [1]:
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

In [2]:
# Set seeds
torch.manual_seed(0)
np.random.seed(0)

In [3]:
synthetic_calls_path = '../data/trinomial_synthetic_calls.csv'
synthetic_puts_path = '../data/trinomial_synthetic_puts.csv'

In [4]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    
    return df

In [5]:
synthetic_calls = pd.read_csv(synthetic_calls_path, index_col=0)
synthetic_puts = pd.read_csv(synthetic_puts_path, index_col=0)

synthetic_calls = reduce_mem_usage(synthetic_calls)
synthetic_puts = reduce_mem_usage(synthetic_puts)

In [6]:
synthetic_options = pd.concat([synthetic_calls, synthetic_puts], axis=0)
synthetic_options = shuffle(synthetic_options, random_state=0)
synthetic_options = synthetic_options.reset_index()
synthetic_options = synthetic_options.drop('index', axis=1)

# Preprocessing

In [7]:
synthetic_options = pd.get_dummies(synthetic_options, prefix='', prefix_sep='')

In [8]:
input_sc = StandardScaler()
output_sc = StandardScaler()
input_data = input_sc.fit_transform(synthetic_options.drop('Option Price', axis=1))
output_data = output_sc.fit_transform(synthetic_options['Option Price'].values.reshape(-1, 1))

train_size = 0.8
val_size = 0.1

last_train_idx = int(np.round(len(input_data) * train_size))
last_val_idx = last_train_idx + int(np.round(len(input_data) * val_size))

X_train = input_data[0:last_train_idx]
X_val = input_data[last_train_idx:last_val_idx]
X_test = input_data[last_val_idx:]

y_train = output_data[0:last_train_idx]
y_val = output_data[last_train_idx:last_val_idx]
y_test = output_data[last_val_idx:]

In [9]:
X_train = Variable(torch.Tensor(X_train))
X_val = Variable(torch.Tensor(X_val))
X_test = Variable(torch.Tensor(X_test))

y_train = Variable(torch.Tensor(y_train))
y_val = Variable(torch.Tensor(y_val))
y_test = Variable(torch.Tensor(y_test))

# Model

In [10]:
CUDA = torch.cuda.is_available()
device = 'cuda:0' if CUDA else 'cpu'

In [11]:
class ResBlock(nn.Module):

  def __init__(self, module):
    super(ResBlock, self).__init__()
    self.module = module

  def forward(self, x):
    return self.module(x) + x

In [12]:
class HiddenLayer(nn.Module):

  def __init__(self, layer_size, act_fn):
      super(HiddenLayer, self).__init__()
      
      if act_fn == 'ReLU':
        self.layer = nn.Sequential(
          nn.Linear(layer_size, layer_size),
          nn.ReLU())
      elif act_fn == 'LeakyReLU':
        self.layer = nn.Sequential(
          nn.Linear(layer_size, layer_size),
          nn.LeakyReLU())
      elif act_fn == 'ELU':
        self.layer = nn.Sequential(
          nn.Linear(layer_size, layer_size),
          nn.ELU())
    
  def forward(self, x):
    return self.layer(x)

In [13]:
class Net(nn.Module):

  def __init__(self, input_size, output_size, hidden_size, num_layers, act_fn):
    super(Net, self).__init__()
    self.input_size = input_size
    self.output_size = output_size
    self.hidden_size = hidden_size

    if act_fn == 'ReLU':
      self.initial_layer = nn.Sequential(
          nn.Linear(self.input_size, self.hidden_size),
          nn.ReLU())
    elif act_fn == 'LeakyReLU':
      self.initial_layer = nn.Sequential(
          nn.Linear(self.input_size, self.hidden_size),
          nn.LeakyReLU())
    elif act_fn == 'ELU':
      self.initial_layer = nn.Sequential(
          nn.Linear(self.input_size, self.hidden_size),
          nn.ELU())

    self.hidden_layers_list = []

    for i in range(num_layers // 2):
      self.hidden_layers_list.append(
          ResBlock(
            nn.Sequential(
                HiddenLayer(self.hidden_size, act_fn),
                HiddenLayer(self.hidden_size, act_fn)
            )
        )
      )

    self.hidden_layers = nn.Sequential(*self.hidden_layers_list)

    self.net = nn.Sequential(
        self.initial_layer,
        self.hidden_layers,
        nn.Linear(self.hidden_size, self.output_size)
    )
  
  def forward(self, x):
    return self.net(x)

In [14]:
def init_weights(m, init_m: str):

  @torch.no_grad()
  def init_uniform(m):
    if isinstance(m, nn.Linear):
      torch.nn.init.uniform_(m.weight)
      m.bias.data.fill_(0.01)

  @torch.no_grad()
  def init_normal(m):
    if isinstance(m, nn.Linear):
      torch.nn.init.normal_(m.weight)
      m.bias.data.fill_(0.01)

  @torch.no_grad()
  def init_xuniform(m):
    if isinstance(m, nn.Linear):
      torch.nn.init.xavier_uniform_(m.weight)
      m.bias.data.fill_(0.01)

  @torch.no_grad()
  def init_xnormal(m):
    if isinstance(m, nn.Linear):
      torch.nn.init.xavier_normal_(m.weight)
      m.bias.data.fill_(0.01)

  if init_m == 'uniform':
    m.apply(init_uniform)
  elif init_m == 'normal':
    m.apply(init_normal)
  elif init_m == 'xaiver uniform':
    m.apply(init_xuniform)
  elif init_m == 'xavier normal':
    m.apply(init_xnormal)

# Best models from cross validation

In [15]:
best_models = [
               {'n_hidden': 400, 'n_layers': 8, 'act_fun': 'LeakyReLU', 'init_method': 'xavier uniform'},
               {'n_hidden': 400, 'n_layers': 4, 'act_fun': 'ReLU', 'init_method': 'xavier uniform'},
               {'n_hidden': 400, 'n_layers': 4, 'act_fun': 'LeakyReLU', 'init_method': 'xavier uniform'},
               {'n_hidden': 400, 'n_layers': 6, 'act_fun': 'LeakyReLU', 'init_method': 'xavier uniform'},
               {'n_hidden': 600, 'n_layers': 4, 'act_fun': 'LeakyReLU', 'init_method': 'xavier uniform'}
]

# Training

In [16]:
input_size = 7
output_size = 1
batch_size = 1208
epochs = 125
lr = 1e-4

loss_fn = nn.MSELoss()

In [17]:
class OptDataset(Dataset):

  def __init__(self, X, y):
    self.X = X
    self.y = y

  def __getitem__(self, idx):
    return self.X[idx], self.y[idx]

  def __len__(self):
    return len(self.X)

In [18]:
def evaluate(model, loss_fn, X, y):
  model.eval()
  with torch.no_grad():
    out = model(X.to(device))
    loss = loss_fn(out, y.to(device))
    return loss.item()

In [19]:
def mape_loss(y_pred, y):
  return sum([np.abs(y_i - hy_i) / y_i for y_i, hy_i in zip(y_pred, y)]) / len(y_pred)

In [20]:
def get_mape_loss(model, X, y):
  model.eval()
  with torch.no_grad():
    out = model(X.to(device))
    loss = mape_loss(
        out.squeeze().cpu().detach().numpy(), 
        y.squeeze().cpu().detach().numpy())
    return loss

In [21]:
def test_models(
  models_dict,
  epochs,
  batch_size,
  X_train,
  y_train,
  X_val,
  y_val,
  X_test,
  y_test,
  loss_fn
):
  testing_results = pd.DataFrame(columns=
                               ['hidden_size',
                                'n_layers',
                                'act_fun',
                                'init_methods',
                                'mean_val_result',
                                'std_val_result',
                                'test_mse',
                                'test_mae',
                                'test_rmse',
                                'test_mape'])

  for model_dict in models_dict:
    model = Net(input_size,
                output_size, 
                model_dict['n_hidden'], 
                model_dict['n_layers'], 
                model_dict['act_fun']).to(device)
    init_weights(model, model_dict['init_method'])
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    validation_losses = []
    test_res = {
        'hidden_size': model_dict['n_hidden'],
        'n_layers': model_dict['n_layers'],
        'act_fun': model_dict['act_fun'],
        'init_methods': model_dict['init_method']
    }

    print('Model: ', test_res)

    for epoch in range(epochs):
      model.train()
      total_loss = 0
      start_time = time.time()
      i = 0

      for batch, batch_labels in DataLoader(OptDataset(X_train, y_train), batch_size):
        out = model(batch.to(device))
        optimizer.zero_grad()

        loss = loss_fn(out, batch_labels.to(device))
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

        if i > 0 and i % 25 == 0:
          avg_loss = total_loss / 50
          elapsed = time.time() - start_time
          print('| Epoch {:3d} | {:5d}/{:5d} batches | lr {:2.5f} | ms/batch {:5.2f} | '
                  'loss {:5.8f}'.format(
              epoch, i, len(X_train) // batch_size+1, lr, elapsed * 1000 / 50,
              avg_loss))
          start_time = time.time()
          total_loss = 0
        
        i += 1

      validation_losses.append(evaluate(model, loss_fn, X_val, y_val))
    
    mse_test = evaluate(model, loss_fn, X_test, y_test)
    mae_loss = nn.L1Loss()
    mae_test = evaluate(model, mae_loss, X_test, y_test)
    mape_test = get_mape_loss(model, X_test, y_test)
    validation_losses = np.array(validation_losses)
    test_res['mean_val_result'] = validation_losses.mean()
    test_res['std_val_result'] = validation_losses.std()
    test_res['test_mse'] = mse_test
    test_res['test_mae'] = mae_test
    test_res['test_rmse'] = np.sqrt(mse_test)
    test_res['test_mape'] = mape_test

    testing_results = testing_results.append(test_res, ignore_index=True)

  return testing_results

In [22]:
testing_results = test_models(best_models,
            epochs,
            batch_size,
             X_train,
            y_train,
            X_val,
            y_val,
            X_test,
            y_test,
            loss_fn)

Model:  {'hidden_size': 400, 'n_layers': 8, 'act_fun': 'LeakyReLU', 'init_methods': 'xavier uniform'}
| Epoch   0 |    25/  338 batches | lr 0.00010 | ms/batch 16.94 | loss 0.29636818
| Epoch   0 |    50/  338 batches | lr 0.00010 | ms/batch 11.21 | loss 0.02504778
| Epoch   0 |    75/  338 batches | lr 0.00010 | ms/batch  9.64 | loss 0.00809468
| Epoch   0 |   100/  338 batches | lr 0.00010 | ms/batch 11.30 | loss 0.00451989
| Epoch   0 |   125/  338 batches | lr 0.00010 | ms/batch 11.26 | loss 0.00279375
| Epoch   0 |   150/  338 batches | lr 0.00010 | ms/batch  9.66 | loss 0.00175735
| Epoch   0 |   175/  338 batches | lr 0.00010 | ms/batch 11.29 | loss 0.00124702
| Epoch   0 |   200/  338 batches | lr 0.00010 | ms/batch 11.27 | loss 0.00088120
| Epoch   0 |   225/  338 batches | lr 0.00010 | ms/batch 10.32 | loss 0.00067185
| Epoch   0 |   250/  338 batches | lr 0.00010 | ms/batch 11.35 | loss 0.00057322
| Epoch   0 |   275/  338 batches | lr 0.00010 | ms/batch 11.33 | loss 0.00046

| Epoch   7 |   225/  338 batches | lr 0.00010 | ms/batch 11.25 | loss 0.00003220
| Epoch   7 |   250/  338 batches | lr 0.00010 | ms/batch 11.31 | loss 0.00007083
| Epoch   7 |   275/  338 batches | lr 0.00010 | ms/batch  9.71 | loss 0.00002875
| Epoch   7 |   300/  338 batches | lr 0.00010 | ms/batch 11.29 | loss 0.00006430
| Epoch   7 |   325/  338 batches | lr 0.00010 | ms/batch 11.30 | loss 0.00006287
| Epoch   8 |    25/  338 batches | lr 0.00010 | ms/batch 11.66 | loss 0.00005953
| Epoch   8 |    50/  338 batches | lr 0.00010 | ms/batch 11.35 | loss 0.00006013
| Epoch   8 |    75/  338 batches | lr 0.00010 | ms/batch  9.68 | loss 0.00003066
| Epoch   8 |   100/  338 batches | lr 0.00010 | ms/batch 11.26 | loss 0.00004956
| Epoch   8 |   125/  338 batches | lr 0.00010 | ms/batch 11.26 | loss 0.00005167
| Epoch   8 |   150/  338 batches | lr 0.00010 | ms/batch  9.68 | loss 0.00003457
| Epoch   8 |   175/  338 batches | lr 0.00010 | ms/batch 11.25 | loss 0.00003281
| Epoch   8 |   

| Epoch  15 |   125/  338 batches | lr 0.00010 | ms/batch 11.28 | loss 0.00006222
| Epoch  15 |   150/  338 batches | lr 0.00010 | ms/batch 11.38 | loss 0.00001800
| Epoch  15 |   175/  338 batches | lr 0.00010 | ms/batch 11.57 | loss 0.00008013
| Epoch  15 |   200/  338 batches | lr 0.00010 | ms/batch 10.15 | loss 0.00007408
| Epoch  15 |   225/  338 batches | lr 0.00010 | ms/batch 11.55 | loss 0.00002585
| Epoch  15 |   250/  338 batches | lr 0.00010 | ms/batch 11.41 | loss 0.00002254
| Epoch  15 |   275/  338 batches | lr 0.00010 | ms/batch  9.74 | loss 0.00001355
| Epoch  15 |   300/  338 batches | lr 0.00010 | ms/batch 11.39 | loss 0.00004015
| Epoch  15 |   325/  338 batches | lr 0.00010 | ms/batch 11.36 | loss 0.00005059
| Epoch  16 |    25/  338 batches | lr 0.00010 | ms/batch 11.83 | loss 0.00001719
| Epoch  16 |    50/  338 batches | lr 0.00010 | ms/batch  9.87 | loss 0.00002927
| Epoch  16 |    75/  338 batches | lr 0.00010 | ms/batch 11.52 | loss 0.00003065
| Epoch  16 |   

| Epoch  23 |    25/  338 batches | lr 0.00010 | ms/batch 10.21 | loss 0.00001345
| Epoch  23 |    50/  338 batches | lr 0.00010 | ms/batch 11.33 | loss 0.00001787
| Epoch  23 |    75/  338 batches | lr 0.00010 | ms/batch 11.38 | loss 0.00005500
| Epoch  23 |   100/  338 batches | lr 0.00010 | ms/batch  9.70 | loss 0.00001812
| Epoch  23 |   125/  338 batches | lr 0.00010 | ms/batch 11.30 | loss 0.00003799
| Epoch  23 |   150/  338 batches | lr 0.00010 | ms/batch 11.26 | loss 0.00002118
| Epoch  23 |   175/  338 batches | lr 0.00010 | ms/batch  9.72 | loss 0.00001604
| Epoch  23 |   200/  338 batches | lr 0.00010 | ms/batch 11.38 | loss 0.00002881
| Epoch  23 |   225/  338 batches | lr 0.00010 | ms/batch 11.48 | loss 0.00002532
| Epoch  23 |   250/  338 batches | lr 0.00010 | ms/batch 10.04 | loss 0.00001854
| Epoch  23 |   275/  338 batches | lr 0.00010 | ms/batch 11.37 | loss 0.00003902
| Epoch  23 |   300/  338 batches | lr 0.00010 | ms/batch 11.40 | loss 0.00001194
| Epoch  23 |   

| Epoch  30 |   250/  338 batches | lr 0.00010 | ms/batch 11.57 | loss 0.00000878
| Epoch  30 |   275/  338 batches | lr 0.00010 | ms/batch 11.37 | loss 0.00000604
| Epoch  30 |   300/  338 batches | lr 0.00010 | ms/batch  9.79 | loss 0.00001130
| Epoch  30 |   325/  338 batches | lr 0.00010 | ms/batch 11.34 | loss 0.00000843
| Epoch  31 |    25/  338 batches | lr 0.00010 | ms/batch 11.81 | loss 0.00000656
| Epoch  31 |    50/  338 batches | lr 0.00010 | ms/batch 11.37 | loss 0.00001596
| Epoch  31 |    75/  338 batches | lr 0.00010 | ms/batch 11.38 | loss 0.00001071
| Epoch  31 |   100/  338 batches | lr 0.00010 | ms/batch  9.77 | loss 0.00004950
| Epoch  31 |   125/  338 batches | lr 0.00010 | ms/batch 11.37 | loss 0.00000657
| Epoch  31 |   150/  338 batches | lr 0.00010 | ms/batch 11.33 | loss 0.00000874
| Epoch  31 |   175/  338 batches | lr 0.00010 | ms/batch  9.75 | loss 0.00003536
| Epoch  31 |   200/  338 batches | lr 0.00010 | ms/batch 11.34 | loss 0.00002524
| Epoch  31 |   

| Epoch  38 |   150/  338 batches | lr 0.00010 | ms/batch 11.14 | loss 0.00001287
| Epoch  38 |   175/  338 batches | lr 0.00010 | ms/batch 11.17 | loss 0.00002656
| Epoch  38 |   200/  338 batches | lr 0.00010 | ms/batch 11.17 | loss 0.00003229
| Epoch  38 |   225/  338 batches | lr 0.00010 | ms/batch  9.57 | loss 0.00000876
| Epoch  38 |   250/  338 batches | lr 0.00010 | ms/batch 11.16 | loss 0.00001024
| Epoch  38 |   275/  338 batches | lr 0.00010 | ms/batch 11.15 | loss 0.00002015
| Epoch  38 |   300/  338 batches | lr 0.00010 | ms/batch  9.64 | loss 0.00001497
| Epoch  38 |   325/  338 batches | lr 0.00010 | ms/batch 11.21 | loss 0.00001188
| Epoch  39 |    25/  338 batches | lr 0.00010 | ms/batch 11.60 | loss 0.00001223
| Epoch  39 |    50/  338 batches | lr 0.00010 | ms/batch 11.19 | loss 0.00001194
| Epoch  39 |    75/  338 batches | lr 0.00010 | ms/batch  9.60 | loss 0.00005408
| Epoch  39 |   100/  338 batches | lr 0.00010 | ms/batch 11.14 | loss 0.00001392
| Epoch  39 |   

| Epoch  46 |    50/  338 batches | lr 0.00010 | ms/batch  9.59 | loss 0.00001129
| Epoch  46 |    75/  338 batches | lr 0.00010 | ms/batch 11.14 | loss 0.00000522
| Epoch  46 |   100/  338 batches | lr 0.00010 | ms/batch 11.15 | loss 0.00000513
| Epoch  46 |   125/  338 batches | lr 0.00010 | ms/batch  9.57 | loss 0.00000462
| Epoch  46 |   150/  338 batches | lr 0.00010 | ms/batch 11.15 | loss 0.00000440
| Epoch  46 |   175/  338 batches | lr 0.00010 | ms/batch 11.25 | loss 0.00000763
| Epoch  46 |   200/  338 batches | lr 0.00010 | ms/batch  9.59 | loss 0.00001292
| Epoch  46 |   225/  338 batches | lr 0.00010 | ms/batch 11.16 | loss 0.00000984
| Epoch  46 |   250/  338 batches | lr 0.00010 | ms/batch 11.16 | loss 0.00000677
| Epoch  46 |   275/  338 batches | lr 0.00010 | ms/batch  9.59 | loss 0.00001202
| Epoch  46 |   300/  338 batches | lr 0.00010 | ms/batch 11.12 | loss 0.00000647
| Epoch  46 |   325/  338 batches | lr 0.00010 | ms/batch 11.12 | loss 0.00000790
| Epoch  47 |   

| Epoch  53 |   275/  338 batches | lr 0.00010 | ms/batch 11.21 | loss 0.00000563
| Epoch  53 |   300/  338 batches | lr 0.00010 | ms/batch 11.20 | loss 0.00000523
| Epoch  53 |   325/  338 batches | lr 0.00010 | ms/batch  9.59 | loss 0.00006463
| Epoch  54 |    25/  338 batches | lr 0.00010 | ms/batch  9.97 | loss 0.00001100
| Epoch  54 |    50/  338 batches | lr 0.00010 | ms/batch 11.18 | loss 0.00000561
| Epoch  54 |    75/  338 batches | lr 0.00010 | ms/batch 11.16 | loss 0.00000403
| Epoch  54 |   100/  338 batches | lr 0.00010 | ms/batch 11.16 | loss 0.00000814
| Epoch  54 |   125/  338 batches | lr 0.00010 | ms/batch  9.59 | loss 0.00001183
| Epoch  54 |   150/  338 batches | lr 0.00010 | ms/batch 11.18 | loss 0.00001133
| Epoch  54 |   175/  338 batches | lr 0.00010 | ms/batch 11.18 | loss 0.00001139
| Epoch  54 |   200/  338 batches | lr 0.00010 | ms/batch  9.60 | loss 0.00000846
| Epoch  54 |   225/  338 batches | lr 0.00010 | ms/batch 11.12 | loss 0.00000588
| Epoch  54 |   

| Epoch  61 |   175/  338 batches | lr 0.00010 | ms/batch 11.20 | loss 0.00001123
| Epoch  61 |   200/  338 batches | lr 0.00010 | ms/batch 11.20 | loss 0.00001486
| Epoch  61 |   225/  338 batches | lr 0.00010 | ms/batch 11.18 | loss 0.00000477
| Epoch  61 |   250/  338 batches | lr 0.00010 | ms/batch  9.60 | loss 0.00000391
| Epoch  61 |   275/  338 batches | lr 0.00010 | ms/batch 11.23 | loss 0.00001673
| Epoch  61 |   300/  338 batches | lr 0.00010 | ms/batch 11.20 | loss 0.00002602
| Epoch  61 |   325/  338 batches | lr 0.00010 | ms/batch  9.64 | loss 0.00000457
| Epoch  62 |    25/  338 batches | lr 0.00010 | ms/batch 10.04 | loss 0.00001277
| Epoch  62 |    50/  338 batches | lr 0.00010 | ms/batch 11.13 | loss 0.00001061
| Epoch  62 |    75/  338 batches | lr 0.00010 | ms/batch 11.19 | loss 0.00001460
| Epoch  62 |   100/  338 batches | lr 0.00010 | ms/batch  9.65 | loss 0.00000927
| Epoch  62 |   125/  338 batches | lr 0.00010 | ms/batch 11.32 | loss 0.00000497
| Epoch  62 |   

| Epoch  69 |    75/  338 batches | lr 0.00010 | ms/batch  9.61 | loss 0.00000531
| Epoch  69 |   100/  338 batches | lr 0.00010 | ms/batch 11.16 | loss 0.00000540
| Epoch  69 |   125/  338 batches | lr 0.00010 | ms/batch 11.21 | loss 0.00001222
| Epoch  69 |   150/  338 batches | lr 0.00010 | ms/batch  9.67 | loss 0.00002695
| Epoch  69 |   175/  338 batches | lr 0.00010 | ms/batch 11.16 | loss 0.00001651
| Epoch  69 |   200/  338 batches | lr 0.00010 | ms/batch 11.24 | loss 0.00001336
| Epoch  69 |   225/  338 batches | lr 0.00010 | ms/batch  9.60 | loss 0.00000336
| Epoch  69 |   250/  338 batches | lr 0.00010 | ms/batch 11.16 | loss 0.00000333
| Epoch  69 |   275/  338 batches | lr 0.00010 | ms/batch 11.15 | loss 0.00000316
| Epoch  69 |   300/  338 batches | lr 0.00010 | ms/batch  9.63 | loss 0.00001032
| Epoch  69 |   325/  338 batches | lr 0.00010 | ms/batch 11.21 | loss 0.00003540
| Epoch  70 |    25/  338 batches | lr 0.00010 | ms/batch 11.59 | loss 0.00001301
| Epoch  70 |   

| Epoch  76 |   300/  338 batches | lr 0.00010 | ms/batch 11.14 | loss 0.00000388
| Epoch  76 |   325/  338 batches | lr 0.00010 | ms/batch 11.18 | loss 0.00000297
| Epoch  77 |    25/  338 batches | lr 0.00010 | ms/batch 11.59 | loss 0.00005865
| Epoch  77 |    50/  338 batches | lr 0.00010 | ms/batch  9.66 | loss 0.00003013
| Epoch  77 |    75/  338 batches | lr 0.00010 | ms/batch 11.21 | loss 0.00001310
| Epoch  77 |   100/  338 batches | lr 0.00010 | ms/batch 11.15 | loss 0.00000647
| Epoch  77 |   125/  338 batches | lr 0.00010 | ms/batch 11.24 | loss 0.00000583
| Epoch  77 |   150/  338 batches | lr 0.00010 | ms/batch  9.63 | loss 0.00000384
| Epoch  77 |   175/  338 batches | lr 0.00010 | ms/batch 11.23 | loss 0.00000479
| Epoch  77 |   200/  338 batches | lr 0.00010 | ms/batch 11.19 | loss 0.00001344
| Epoch  77 |   225/  338 batches | lr 0.00010 | ms/batch  9.60 | loss 0.00000368
| Epoch  77 |   250/  338 batches | lr 0.00010 | ms/batch 11.13 | loss 0.00000418
| Epoch  77 |   

| Epoch  84 |   200/  338 batches | lr 0.00010 | ms/batch 11.12 | loss 0.00000953
| Epoch  84 |   225/  338 batches | lr 0.00010 | ms/batch 11.17 | loss 0.00000498
| Epoch  84 |   250/  338 batches | lr 0.00010 | ms/batch 11.18 | loss 0.00000656
| Epoch  84 |   275/  338 batches | lr 0.00010 | ms/batch  9.57 | loss 0.00001365
| Epoch  84 |   300/  338 batches | lr 0.00010 | ms/batch 11.24 | loss 0.00001210
| Epoch  84 |   325/  338 batches | lr 0.00010 | ms/batch 11.21 | loss 0.00000692
| Epoch  85 |    25/  338 batches | lr 0.00010 | ms/batch 11.65 | loss 0.00002545
| Epoch  85 |    50/  338 batches | lr 0.00010 | ms/batch  9.62 | loss 0.00002161
| Epoch  85 |    75/  338 batches | lr 0.00010 | ms/batch 11.21 | loss 0.00000908
| Epoch  85 |   100/  338 batches | lr 0.00010 | ms/batch 11.19 | loss 0.00001179
| Epoch  85 |   125/  338 batches | lr 0.00010 | ms/batch  9.75 | loss 0.00001236
| Epoch  85 |   150/  338 batches | lr 0.00010 | ms/batch 11.73 | loss 0.00000877
| Epoch  85 |   

| Epoch  92 |   100/  338 batches | lr 0.00010 | ms/batch  9.75 | loss 0.00000413
| Epoch  92 |   125/  338 batches | lr 0.00010 | ms/batch 11.34 | loss 0.00000376
| Epoch  92 |   150/  338 batches | lr 0.00010 | ms/batch 11.32 | loss 0.00000315
| Epoch  92 |   175/  338 batches | lr 0.00010 | ms/batch  9.66 | loss 0.00000806
| Epoch  92 |   200/  338 batches | lr 0.00010 | ms/batch 11.26 | loss 0.00002200
| Epoch  92 |   225/  338 batches | lr 0.00010 | ms/batch 11.24 | loss 0.00000579
| Epoch  92 |   250/  338 batches | lr 0.00010 | ms/batch  9.69 | loss 0.00000513
| Epoch  92 |   275/  338 batches | lr 0.00010 | ms/batch 11.13 | loss 0.00000509
| Epoch  92 |   300/  338 batches | lr 0.00010 | ms/batch 11.18 | loss 0.00000603
| Epoch  92 |   325/  338 batches | lr 0.00010 | ms/batch  9.59 | loss 0.00000415
| Epoch  93 |    25/  338 batches | lr 0.00010 | ms/batch 11.59 | loss 0.00000613
| Epoch  93 |    50/  338 batches | lr 0.00010 | ms/batch  9.68 | loss 0.00000397
| Epoch  93 |   

| Epoch  99 |   325/  338 batches | lr 0.00010 | ms/batch 11.20 | loss 0.00001112
| Epoch 100 |    25/  338 batches | lr 0.00010 | ms/batch 11.66 | loss 0.00001599
| Epoch 100 |    50/  338 batches | lr 0.00010 | ms/batch 11.47 | loss 0.00000786
| Epoch 100 |    75/  338 batches | lr 0.00010 | ms/batch  9.70 | loss 0.00000297
| Epoch 100 |   100/  338 batches | lr 0.00010 | ms/batch 11.21 | loss 0.00000399
| Epoch 100 |   125/  338 batches | lr 0.00010 | ms/batch 11.18 | loss 0.00000569
| Epoch 100 |   150/  338 batches | lr 0.00010 | ms/batch 11.19 | loss 0.00001172
| Epoch 100 |   175/  338 batches | lr 0.00010 | ms/batch  9.58 | loss 0.00000523
| Epoch 100 |   200/  338 batches | lr 0.00010 | ms/batch 11.16 | loss 0.00000381
| Epoch 100 |   225/  338 batches | lr 0.00010 | ms/batch 11.18 | loss 0.00000189
| Epoch 100 |   250/  338 batches | lr 0.00010 | ms/batch  9.67 | loss 0.00000393
| Epoch 100 |   275/  338 batches | lr 0.00010 | ms/batch 11.16 | loss 0.00002881
| Epoch 100 |   

| Epoch 107 |   225/  338 batches | lr 0.00010 | ms/batch 11.18 | loss 0.00000245
| Epoch 107 |   250/  338 batches | lr 0.00010 | ms/batch 11.23 | loss 0.00000294
| Epoch 107 |   275/  338 batches | lr 0.00010 | ms/batch 11.29 | loss 0.00000454
| Epoch 107 |   300/  338 batches | lr 0.00010 | ms/batch  9.57 | loss 0.00000398
| Epoch 107 |   325/  338 batches | lr 0.00010 | ms/batch 11.16 | loss 0.00001152
| Epoch 108 |    25/  338 batches | lr 0.00010 | ms/batch 11.58 | loss 0.00000560
| Epoch 108 |    50/  338 batches | lr 0.00010 | ms/batch 11.15 | loss 0.00000275
| Epoch 108 |    75/  338 batches | lr 0.00010 | ms/batch  9.60 | loss 0.00000497
| Epoch 108 |   100/  338 batches | lr 0.00010 | ms/batch 11.15 | loss 0.00001945
| Epoch 108 |   125/  338 batches | lr 0.00010 | ms/batch 11.17 | loss 0.00003455
| Epoch 108 |   150/  338 batches | lr 0.00010 | ms/batch  9.60 | loss 0.00000464
| Epoch 108 |   175/  338 batches | lr 0.00010 | ms/batch 11.15 | loss 0.00000348
| Epoch 108 |   

| Epoch 115 |   125/  338 batches | lr 0.00010 | ms/batch  9.61 | loss 0.00000210
| Epoch 115 |   150/  338 batches | lr 0.00010 | ms/batch 11.15 | loss 0.00000224
| Epoch 115 |   175/  338 batches | lr 0.00010 | ms/batch 11.17 | loss 0.00000402
| Epoch 115 |   200/  338 batches | lr 0.00010 | ms/batch  9.59 | loss 0.00000388
| Epoch 115 |   225/  338 batches | lr 0.00010 | ms/batch 11.14 | loss 0.00001195
| Epoch 115 |   250/  338 batches | lr 0.00010 | ms/batch 11.14 | loss 0.00000539
| Epoch 115 |   275/  338 batches | lr 0.00010 | ms/batch  9.57 | loss 0.00000590
| Epoch 115 |   300/  338 batches | lr 0.00010 | ms/batch 11.17 | loss 0.00000432
| Epoch 115 |   325/  338 batches | lr 0.00010 | ms/batch 11.16 | loss 0.00001381
| Epoch 116 |    25/  338 batches | lr 0.00010 | ms/batch 11.57 | loss 0.00004053
| Epoch 116 |    50/  338 batches | lr 0.00010 | ms/batch 11.16 | loss 0.00000988
| Epoch 116 |    75/  338 batches | lr 0.00010 | ms/batch  9.58 | loss 0.00000351
| Epoch 116 |   

| Epoch 123 |    25/  338 batches | lr 0.00010 | ms/batch 10.02 | loss 0.00001098
| Epoch 123 |    50/  338 batches | lr 0.00010 | ms/batch 11.14 | loss 0.00000555
| Epoch 123 |    75/  338 batches | lr 0.00010 | ms/batch 11.16 | loss 0.00000762
| Epoch 123 |   100/  338 batches | lr 0.00010 | ms/batch  9.61 | loss 0.00000815
| Epoch 123 |   125/  338 batches | lr 0.00010 | ms/batch 11.11 | loss 0.00000354
| Epoch 123 |   150/  338 batches | lr 0.00010 | ms/batch 11.16 | loss 0.00001105
| Epoch 123 |   175/  338 batches | lr 0.00010 | ms/batch 11.16 | loss 0.00000324
| Epoch 123 |   200/  338 batches | lr 0.00010 | ms/batch  9.59 | loss 0.00000449
| Epoch 123 |   225/  338 batches | lr 0.00010 | ms/batch 11.20 | loss 0.00000540
| Epoch 123 |   250/  338 batches | lr 0.00010 | ms/batch 11.16 | loss 0.00000868
| Epoch 123 |   275/  338 batches | lr 0.00010 | ms/batch  9.64 | loss 0.00000619
| Epoch 123 |   300/  338 batches | lr 0.00010 | ms/batch 11.21 | loss 0.00000368
| Epoch 123 |   

| Epoch   5 |   225/  338 batches | lr 0.00010 | ms/batch  8.09 | loss 0.00005123
| Epoch   5 |   250/  338 batches | lr 0.00010 | ms/batch  9.76 | loss 0.00005484
| Epoch   5 |   275/  338 batches | lr 0.00010 | ms/batch  9.68 | loss 0.00003836
| Epoch   5 |   300/  338 batches | lr 0.00010 | ms/batch  9.69 | loss 0.00004261
| Epoch   5 |   325/  338 batches | lr 0.00010 | ms/batch  8.06 | loss 0.00005973
| Epoch   6 |    25/  338 batches | lr 0.00010 | ms/batch  8.39 | loss 0.00004319
| Epoch   6 |    50/  338 batches | lr 0.00010 | ms/batch  9.64 | loss 0.00004238
| Epoch   6 |    75/  338 batches | lr 0.00010 | ms/batch  9.70 | loss 0.00007795
| Epoch   6 |   100/  338 batches | lr 0.00010 | ms/batch  8.09 | loss 0.00004951
| Epoch   6 |   125/  338 batches | lr 0.00010 | ms/batch  9.65 | loss 0.00003998
| Epoch   6 |   150/  338 batches | lr 0.00010 | ms/batch  9.65 | loss 0.00004648
| Epoch   6 |   175/  338 batches | lr 0.00010 | ms/batch  8.11 | loss 0.00003515
| Epoch   6 |   

| Epoch  13 |   125/  338 batches | lr 0.00010 | ms/batch  9.87 | loss 0.00003067
| Epoch  13 |   150/  338 batches | lr 0.00010 | ms/batch  8.64 | loss 0.00003320
| Epoch  13 |   175/  338 batches | lr 0.00010 | ms/batch  9.71 | loss 0.00001651
| Epoch  13 |   200/  338 batches | lr 0.00010 | ms/batch  9.80 | loss 0.00004216
| Epoch  13 |   225/  338 batches | lr 0.00010 | ms/batch  8.16 | loss 0.00002489
| Epoch  13 |   250/  338 batches | lr 0.00010 | ms/batch  9.80 | loss 0.00002475
| Epoch  13 |   275/  338 batches | lr 0.00010 | ms/batch  9.73 | loss 0.00001443
| Epoch  13 |   300/  338 batches | lr 0.00010 | ms/batch  8.21 | loss 0.00001491
| Epoch  13 |   325/  338 batches | lr 0.00010 | ms/batch  9.78 | loss 0.00007340
| Epoch  14 |    25/  338 batches | lr 0.00010 | ms/batch 10.25 | loss 0.00002401
| Epoch  14 |    50/  338 batches | lr 0.00010 | ms/batch  9.71 | loss 0.00002293
| Epoch  14 |    75/  338 batches | lr 0.00010 | ms/batch  9.77 | loss 0.00003375
| Epoch  14 |   

| Epoch  21 |    25/  338 batches | lr 0.00010 | ms/batch  9.98 | loss 0.00001765
| Epoch  21 |    50/  338 batches | lr 0.00010 | ms/batch  8.08 | loss 0.00001230
| Epoch  21 |    75/  338 batches | lr 0.00010 | ms/batch  9.73 | loss 0.00001881
| Epoch  21 |   100/  338 batches | lr 0.00010 | ms/batch  9.63 | loss 0.00002015
| Epoch  21 |   125/  338 batches | lr 0.00010 | ms/batch  8.11 | loss 0.00001274
| Epoch  21 |   150/  338 batches | lr 0.00010 | ms/batch  9.72 | loss 0.00002587
| Epoch  21 |   175/  338 batches | lr 0.00010 | ms/batch  9.70 | loss 0.00001932
| Epoch  21 |   200/  338 batches | lr 0.00010 | ms/batch  9.75 | loss 0.00001498
| Epoch  21 |   225/  338 batches | lr 0.00010 | ms/batch  8.08 | loss 0.00004748
| Epoch  21 |   250/  338 batches | lr 0.00010 | ms/batch  9.65 | loss 0.00004386
| Epoch  21 |   275/  338 batches | lr 0.00010 | ms/batch  9.66 | loss 0.00001658
| Epoch  21 |   300/  338 batches | lr 0.00010 | ms/batch  8.13 | loss 0.00001381
| Epoch  21 |   

| Epoch  28 |   250/  338 batches | lr 0.00010 | ms/batch  8.10 | loss 0.00001125
| Epoch  28 |   275/  338 batches | lr 0.00010 | ms/batch  9.64 | loss 0.00000682
| Epoch  28 |   300/  338 batches | lr 0.00010 | ms/batch  9.67 | loss 0.00007228
| Epoch  28 |   325/  338 batches | lr 0.00010 | ms/batch  9.67 | loss 0.00002936
| Epoch  29 |    25/  338 batches | lr 0.00010 | ms/batch  9.97 | loss 0.00001644
| Epoch  29 |    50/  338 batches | lr 0.00010 | ms/batch  8.09 | loss 0.00000828
| Epoch  29 |    75/  338 batches | lr 0.00010 | ms/batch  9.67 | loss 0.00001240
| Epoch  29 |   100/  338 batches | lr 0.00010 | ms/batch  9.67 | loss 0.00001543
| Epoch  29 |   125/  338 batches | lr 0.00010 | ms/batch  8.20 | loss 0.00000915
| Epoch  29 |   150/  338 batches | lr 0.00010 | ms/batch  9.64 | loss 0.00001740
| Epoch  29 |   175/  338 batches | lr 0.00010 | ms/batch  9.64 | loss 0.00001121
| Epoch  29 |   200/  338 batches | lr 0.00010 | ms/batch  8.08 | loss 0.00013050
| Epoch  29 |   

| Epoch  36 |   150/  338 batches | lr 0.00010 | ms/batch  9.76 | loss 0.00002435
| Epoch  36 |   175/  338 batches | lr 0.00010 | ms/batch  8.09 | loss 0.00007402
| Epoch  36 |   200/  338 batches | lr 0.00010 | ms/batch  9.92 | loss 0.00002776
| Epoch  36 |   225/  338 batches | lr 0.00010 | ms/batch  9.72 | loss 0.00001194
| Epoch  36 |   250/  338 batches | lr 0.00010 | ms/batch  8.08 | loss 0.00000690
| Epoch  36 |   275/  338 batches | lr 0.00010 | ms/batch  9.68 | loss 0.00000669
| Epoch  36 |   300/  338 batches | lr 0.00010 | ms/batch  9.66 | loss 0.00000847
| Epoch  36 |   325/  338 batches | lr 0.00010 | ms/batch  8.10 | loss 0.00000977
| Epoch  37 |    25/  338 batches | lr 0.00010 | ms/batch  8.42 | loss 0.00000798
| Epoch  37 |    50/  338 batches | lr 0.00010 | ms/batch  9.63 | loss 0.00001494
| Epoch  37 |    75/  338 batches | lr 0.00010 | ms/batch  9.71 | loss 0.00001575
| Epoch  37 |   100/  338 batches | lr 0.00010 | ms/batch  9.66 | loss 0.00002329
| Epoch  37 |   

| Epoch  44 |    50/  338 batches | lr 0.00010 | ms/batch 10.22 | loss 0.00000893
| Epoch  44 |    75/  338 batches | lr 0.00010 | ms/batch  8.58 | loss 0.00000562
| Epoch  44 |   100/  338 batches | lr 0.00010 | ms/batch 10.35 | loss 0.00000812
| Epoch  44 |   125/  338 batches | lr 0.00010 | ms/batch 10.58 | loss 0.00000600
| Epoch  44 |   150/  338 batches | lr 0.00010 | ms/batch  8.49 | loss 0.00000604
| Epoch  44 |   175/  338 batches | lr 0.00010 | ms/batch 10.18 | loss 0.00000511
| Epoch  44 |   200/  338 batches | lr 0.00010 | ms/batch 10.06 | loss 0.00000971
| Epoch  44 |   225/  338 batches | lr 0.00010 | ms/batch 10.75 | loss 0.00000510
| Epoch  44 |   250/  338 batches | lr 0.00010 | ms/batch  8.59 | loss 0.00002838
| Epoch  44 |   275/  338 batches | lr 0.00010 | ms/batch 10.25 | loss 0.00001108
| Epoch  44 |   300/  338 batches | lr 0.00010 | ms/batch 11.35 | loss 0.00000819
| Epoch  44 |   325/  338 batches | lr 0.00010 | ms/batch 10.19 | loss 0.00003193
| Epoch  45 |   

| Epoch  51 |   275/  338 batches | lr 0.00010 | ms/batch  8.32 | loss 0.00000921
| Epoch  51 |   300/  338 batches | lr 0.00010 | ms/batch  9.81 | loss 0.00000761
| Epoch  51 |   325/  338 batches | lr 0.00010 | ms/batch  9.67 | loss 0.00008740
| Epoch  52 |    25/  338 batches | lr 0.00010 | ms/batch 10.09 | loss 0.00001413
| Epoch  52 |    50/  338 batches | lr 0.00010 | ms/batch  9.72 | loss 0.00000693
| Epoch  52 |    75/  338 batches | lr 0.00010 | ms/batch  8.14 | loss 0.00000528
| Epoch  52 |   100/  338 batches | lr 0.00010 | ms/batch  9.69 | loss 0.00000643
| Epoch  52 |   125/  338 batches | lr 0.00010 | ms/batch  9.67 | loss 0.00000497
| Epoch  52 |   150/  338 batches | lr 0.00010 | ms/batch  8.12 | loss 0.00000734
| Epoch  52 |   175/  338 batches | lr 0.00010 | ms/batch 10.77 | loss 0.00001105
| Epoch  52 |   200/  338 batches | lr 0.00010 | ms/batch  9.91 | loss 0.00001084
| Epoch  52 |   225/  338 batches | lr 0.00010 | ms/batch  8.37 | loss 0.00000670
| Epoch  52 |   

| Epoch  59 |   175/  338 batches | lr 0.00010 | ms/batch  9.66 | loss 0.00001554
| Epoch  59 |   200/  338 batches | lr 0.00010 | ms/batch  8.08 | loss 0.00001003
| Epoch  59 |   225/  338 batches | lr 0.00010 | ms/batch  9.62 | loss 0.00000455
| Epoch  59 |   250/  338 batches | lr 0.00010 | ms/batch  9.62 | loss 0.00000505
| Epoch  59 |   275/  338 batches | lr 0.00010 | ms/batch  8.07 | loss 0.00000513
| Epoch  59 |   300/  338 batches | lr 0.00010 | ms/batch  9.60 | loss 0.00002468
| Epoch  59 |   325/  338 batches | lr 0.00010 | ms/batch  9.70 | loss 0.00001216
| Epoch  60 |    25/  338 batches | lr 0.00010 | ms/batch  9.97 | loss 0.00001951
| Epoch  60 |    50/  338 batches | lr 0.00010 | ms/batch  8.13 | loss 0.00000548
| Epoch  60 |    75/  338 batches | lr 0.00010 | ms/batch  9.62 | loss 0.00000462
| Epoch  60 |   100/  338 batches | lr 0.00010 | ms/batch  9.63 | loss 0.00001908
| Epoch  60 |   125/  338 batches | lr 0.00010 | ms/batch  9.62 | loss 0.00001451
| Epoch  60 |   

| Epoch  67 |    75/  338 batches | lr 0.00010 | ms/batch  9.62 | loss 0.00000422
| Epoch  67 |   100/  338 batches | lr 0.00010 | ms/batch  8.06 | loss 0.00000517
| Epoch  67 |   125/  338 batches | lr 0.00010 | ms/batch  9.61 | loss 0.00000504
| Epoch  67 |   150/  338 batches | lr 0.00010 | ms/batch  9.64 | loss 0.00000578
| Epoch  67 |   175/  338 batches | lr 0.00010 | ms/batch  8.09 | loss 0.00006236
| Epoch  67 |   200/  338 batches | lr 0.00010 | ms/batch  9.60 | loss 0.00001018
| Epoch  67 |   225/  338 batches | lr 0.00010 | ms/batch  9.64 | loss 0.00000641
| Epoch  67 |   250/  338 batches | lr 0.00010 | ms/batch  9.67 | loss 0.00000436
| Epoch  67 |   275/  338 batches | lr 0.00010 | ms/batch  8.07 | loss 0.00000535
| Epoch  67 |   300/  338 batches | lr 0.00010 | ms/batch  9.66 | loss 0.00000923
| Epoch  67 |   325/  338 batches | lr 0.00010 | ms/batch  9.64 | loss 0.00001169
| Epoch  68 |    25/  338 batches | lr 0.00010 | ms/batch 10.02 | loss 0.00000482
| Epoch  68 |   

| Epoch  74 |   300/  338 batches | lr 0.00010 | ms/batch  8.05 | loss 0.00000947
| Epoch  74 |   325/  338 batches | lr 0.00010 | ms/batch  9.63 | loss 0.00003277
| Epoch  75 |    25/  338 batches | lr 0.00010 | ms/batch  9.93 | loss 0.00000644
| Epoch  75 |    50/  338 batches | lr 0.00010 | ms/batch  9.61 | loss 0.00000494
| Epoch  75 |    75/  338 batches | lr 0.00010 | ms/batch  9.63 | loss 0.00000519
| Epoch  75 |   100/  338 batches | lr 0.00010 | ms/batch  8.08 | loss 0.00000383
| Epoch  75 |   125/  338 batches | lr 0.00010 | ms/batch  9.64 | loss 0.00000790
| Epoch  75 |   150/  338 batches | lr 0.00010 | ms/batch  9.72 | loss 0.00005891
| Epoch  75 |   175/  338 batches | lr 0.00010 | ms/batch  8.14 | loss 0.00000551
| Epoch  75 |   200/  338 batches | lr 0.00010 | ms/batch  9.74 | loss 0.00000888
| Epoch  75 |   225/  338 batches | lr 0.00010 | ms/batch  9.71 | loss 0.00000842
| Epoch  75 |   250/  338 batches | lr 0.00010 | ms/batch  8.13 | loss 0.00000454
| Epoch  75 |   

| Epoch  82 |   200/  338 batches | lr 0.00010 | ms/batch 10.22 | loss 0.00000580
| Epoch  82 |   225/  338 batches | lr 0.00010 | ms/batch  8.44 | loss 0.00000385
| Epoch  82 |   250/  338 batches | lr 0.00010 | ms/batch 10.24 | loss 0.00000540
| Epoch  82 |   275/  338 batches | lr 0.00010 | ms/batch 10.24 | loss 0.00000943
| Epoch  82 |   300/  338 batches | lr 0.00010 | ms/batch  8.59 | loss 0.00001073
| Epoch  82 |   325/  338 batches | lr 0.00010 | ms/batch  9.99 | loss 0.00000341
| Epoch  83 |    25/  338 batches | lr 0.00010 | ms/batch 10.30 | loss 0.00002545
| Epoch  83 |    50/  338 batches | lr 0.00010 | ms/batch  9.80 | loss 0.00001058
| Epoch  83 |    75/  338 batches | lr 0.00010 | ms/batch  8.10 | loss 0.00000352
| Epoch  83 |   100/  338 batches | lr 0.00010 | ms/batch  9.63 | loss 0.00000599
| Epoch  83 |   125/  338 batches | lr 0.00010 | ms/batch  9.69 | loss 0.00000385
| Epoch  83 |   150/  338 batches | lr 0.00010 | ms/batch  9.68 | loss 0.00000317
| Epoch  83 |   

| Epoch  90 |   100/  338 batches | lr 0.00010 | ms/batch  9.94 | loss 0.00000512
| Epoch  90 |   125/  338 batches | lr 0.00010 | ms/batch  8.51 | loss 0.00000394
| Epoch  90 |   150/  338 batches | lr 0.00010 | ms/batch 10.05 | loss 0.00000372
| Epoch  90 |   175/  338 batches | lr 0.00010 | ms/batch 10.03 | loss 0.00000472
| Epoch  90 |   200/  338 batches | lr 0.00010 | ms/batch  8.45 | loss 0.00000658
| Epoch  90 |   225/  338 batches | lr 0.00010 | ms/batch 10.01 | loss 0.00000525
| Epoch  90 |   250/  338 batches | lr 0.00010 | ms/batch  9.94 | loss 0.00000488
| Epoch  90 |   275/  338 batches | lr 0.00010 | ms/batch  9.99 | loss 0.00001020
| Epoch  90 |   300/  338 batches | lr 0.00010 | ms/batch  8.33 | loss 0.00000372
| Epoch  90 |   325/  338 batches | lr 0.00010 | ms/batch 10.04 | loss 0.00002243
| Epoch  91 |    25/  338 batches | lr 0.00010 | ms/batch 10.24 | loss 0.00000928
| Epoch  91 |    50/  338 batches | lr 0.00010 | ms/batch  9.93 | loss 0.00000565
| Epoch  91 |   

| Epoch  97 |   325/  338 batches | lr 0.00010 | ms/batch  8.53 | loss 0.00000585
| Epoch  98 |    25/  338 batches | lr 0.00010 | ms/batch  8.84 | loss 0.00000730
| Epoch  98 |    50/  338 batches | lr 0.00010 | ms/batch 10.02 | loss 0.00000630
| Epoch  98 |    75/  338 batches | lr 0.00010 | ms/batch  9.89 | loss 0.00000259
| Epoch  98 |   100/  338 batches | lr 0.00010 | ms/batch  9.93 | loss 0.00000477
| Epoch  98 |   125/  338 batches | lr 0.00010 | ms/batch  8.34 | loss 0.00000343
| Epoch  98 |   150/  338 batches | lr 0.00010 | ms/batch 10.06 | loss 0.00000435
| Epoch  98 |   175/  338 batches | lr 0.00010 | ms/batch 10.17 | loss 0.00001723
| Epoch  98 |   200/  338 batches | lr 0.00010 | ms/batch  8.45 | loss 0.00001156
| Epoch  98 |   225/  338 batches | lr 0.00010 | ms/batch 10.11 | loss 0.00000493
| Epoch  98 |   250/  338 batches | lr 0.00010 | ms/batch  9.91 | loss 0.00000495
| Epoch  98 |   275/  338 batches | lr 0.00010 | ms/batch  8.32 | loss 0.00000540
| Epoch  98 |   

| Epoch 105 |   225/  338 batches | lr 0.00010 | ms/batch 11.18 | loss 0.00000465
| Epoch 105 |   250/  338 batches | lr 0.00010 | ms/batch  8.48 | loss 0.00000383
| Epoch 105 |   275/  338 batches | lr 0.00010 | ms/batch 10.31 | loss 0.00000298
| Epoch 105 |   300/  338 batches | lr 0.00010 | ms/batch 10.00 | loss 0.00000411
| Epoch 105 |   325/  338 batches | lr 0.00010 | ms/batch  8.39 | loss 0.00003144
| Epoch 106 |    25/  338 batches | lr 0.00010 | ms/batch  8.79 | loss 0.00000930
| Epoch 106 |    50/  338 batches | lr 0.00010 | ms/batch 10.29 | loss 0.00000365
| Epoch 106 |    75/  338 batches | lr 0.00010 | ms/batch 10.07 | loss 0.00000273
| Epoch 106 |   100/  338 batches | lr 0.00010 | ms/batch  8.38 | loss 0.00000299
| Epoch 106 |   125/  338 batches | lr 0.00010 | ms/batch 10.11 | loss 0.00000429
| Epoch 106 |   150/  338 batches | lr 0.00010 | ms/batch 10.53 | loss 0.00000845
| Epoch 106 |   175/  338 batches | lr 0.00010 | ms/batch 10.25 | loss 0.00000548
| Epoch 106 |   

| Epoch 113 |   125/  338 batches | lr 0.00010 | ms/batch 10.39 | loss 0.00000638
| Epoch 113 |   150/  338 batches | lr 0.00010 | ms/batch  8.57 | loss 0.00000497
| Epoch 113 |   175/  338 batches | lr 0.00010 | ms/batch 10.79 | loss 0.00000939
| Epoch 113 |   200/  338 batches | lr 0.00010 | ms/batch 10.49 | loss 0.00001634
| Epoch 113 |   225/  338 batches | lr 0.00010 | ms/batch  9.12 | loss 0.00000425
| Epoch 113 |   250/  338 batches | lr 0.00010 | ms/batch 10.01 | loss 0.00000260
| Epoch 113 |   275/  338 batches | lr 0.00010 | ms/batch 10.57 | loss 0.00000371
| Epoch 113 |   300/  338 batches | lr 0.00010 | ms/batch 10.59 | loss 0.00000694
| Epoch 113 |   325/  338 batches | lr 0.00010 | ms/batch  8.85 | loss 0.00003268
| Epoch 114 |    25/  338 batches | lr 0.00010 | ms/batch  9.19 | loss 0.00000799
| Epoch 114 |    50/  338 batches | lr 0.00010 | ms/batch 10.53 | loss 0.00000404
| Epoch 114 |    75/  338 batches | lr 0.00010 | ms/batch 10.89 | loss 0.00000267
| Epoch 114 |   

| Epoch 121 |    25/  338 batches | lr 0.00010 | ms/batch 10.28 | loss 0.00000922
| Epoch 121 |    50/  338 batches | lr 0.00010 | ms/batch  8.30 | loss 0.00000292
| Epoch 121 |    75/  338 batches | lr 0.00010 | ms/batch  9.96 | loss 0.00000232
| Epoch 121 |   100/  338 batches | lr 0.00010 | ms/batch 10.05 | loss 0.00002699
| Epoch 121 |   125/  338 batches | lr 0.00010 | ms/batch  9.99 | loss 0.00001044
| Epoch 121 |   150/  338 batches | lr 0.00010 | ms/batch  8.27 | loss 0.00000701
| Epoch 121 |   175/  338 batches | lr 0.00010 | ms/batch 10.00 | loss 0.00000415
| Epoch 121 |   200/  338 batches | lr 0.00010 | ms/batch  9.97 | loss 0.00001090
| Epoch 121 |   225/  338 batches | lr 0.00010 | ms/batch  8.41 | loss 0.00000735
| Epoch 121 |   250/  338 batches | lr 0.00010 | ms/batch  9.92 | loss 0.00000225
| Epoch 121 |   275/  338 batches | lr 0.00010 | ms/batch  9.83 | loss 0.00000460
| Epoch 121 |   300/  338 batches | lr 0.00010 | ms/batch  8.41 | loss 0.00000716
| Epoch 121 |   

| Epoch   3 |   225/  338 batches | lr 0.00010 | ms/batch 10.19 | loss 0.00007068
| Epoch   3 |   250/  338 batches | lr 0.00010 | ms/batch 10.23 | loss 0.00008005
| Epoch   3 |   275/  338 batches | lr 0.00010 | ms/batch  8.33 | loss 0.00007720
| Epoch   3 |   300/  338 batches | lr 0.00010 | ms/batch 10.21 | loss 0.00007357
| Epoch   3 |   325/  338 batches | lr 0.00010 | ms/batch 10.06 | loss 0.00006917
| Epoch   4 |    25/  338 batches | lr 0.00010 | ms/batch 10.13 | loss 0.00006671
| Epoch   4 |    50/  338 batches | lr 0.00010 | ms/batch  8.23 | loss 0.00006500
| Epoch   4 |    75/  338 batches | lr 0.00010 | ms/batch  9.98 | loss 0.00006645
| Epoch   4 |   100/  338 batches | lr 0.00010 | ms/batch  9.84 | loss 0.00007313
| Epoch   4 |   125/  338 batches | lr 0.00010 | ms/batch  8.77 | loss 0.00005736
| Epoch   4 |   150/  338 batches | lr 0.00010 | ms/batch 10.26 | loss 0.00006471
| Epoch   4 |   175/  338 batches | lr 0.00010 | ms/batch  9.89 | loss 0.00005973
| Epoch   4 |   

| Epoch  11 |   125/  338 batches | lr 0.00010 | ms/batch 10.49 | loss 0.00002570
| Epoch  11 |   150/  338 batches | lr 0.00010 | ms/batch 10.86 | loss 0.00004227
| Epoch  11 |   175/  338 batches | lr 0.00010 | ms/batch  8.94 | loss 0.00002390
| Epoch  11 |   200/  338 batches | lr 0.00010 | ms/batch 10.87 | loss 0.00006055
| Epoch  11 |   225/  338 batches | lr 0.00010 | ms/batch 10.75 | loss 0.00003599
| Epoch  11 |   250/  338 batches | lr 0.00010 | ms/batch  8.59 | loss 0.00003661
| Epoch  11 |   275/  338 batches | lr 0.00010 | ms/batch  9.91 | loss 0.00003421
| Epoch  11 |   300/  338 batches | lr 0.00010 | ms/batch 10.07 | loss 0.00002596
| Epoch  11 |   325/  338 batches | lr 0.00010 | ms/batch 10.04 | loss 0.00002772
| Epoch  12 |    25/  338 batches | lr 0.00010 | ms/batch 10.38 | loss 0.00004522
| Epoch  12 |    50/  338 batches | lr 0.00010 | ms/batch  8.23 | loss 0.00006595
| Epoch  12 |    75/  338 batches | lr 0.00010 | ms/batch 10.11 | loss 0.00001990
| Epoch  12 |   

| Epoch  19 |    25/  338 batches | lr 0.00010 | ms/batch 10.79 | loss 0.00006364
| Epoch  19 |    50/  338 batches | lr 0.00010 | ms/batch 10.26 | loss 0.00002206
| Epoch  19 |    75/  338 batches | lr 0.00010 | ms/batch  8.65 | loss 0.00001487
| Epoch  19 |   100/  338 batches | lr 0.00010 | ms/batch 10.38 | loss 0.00002128
| Epoch  19 |   125/  338 batches | lr 0.00010 | ms/batch 10.31 | loss 0.00002229
| Epoch  19 |   150/  338 batches | lr 0.00010 | ms/batch 10.36 | loss 0.00002567
| Epoch  19 |   175/  338 batches | lr 0.00010 | ms/batch  9.00 | loss 0.00002097
| Epoch  19 |   200/  338 batches | lr 0.00010 | ms/batch 10.18 | loss 0.00003601
| Epoch  19 |   225/  338 batches | lr 0.00010 | ms/batch 10.05 | loss 0.00001666
| Epoch  19 |   250/  338 batches | lr 0.00010 | ms/batch  8.63 | loss 0.00001389
| Epoch  19 |   275/  338 batches | lr 0.00010 | ms/batch 10.16 | loss 0.00002774
| Epoch  19 |   300/  338 batches | lr 0.00010 | ms/batch 10.15 | loss 0.00004217
| Epoch  19 |   

| Epoch  26 |   250/  338 batches | lr 0.00010 | ms/batch 10.28 | loss 0.00004970
| Epoch  26 |   275/  338 batches | lr 0.00010 | ms/batch 10.55 | loss 0.00002421
| Epoch  26 |   300/  338 batches | lr 0.00010 | ms/batch  8.54 | loss 0.00000973
| Epoch  26 |   325/  338 batches | lr 0.00010 | ms/batch 10.67 | loss 0.00001349
| Epoch  27 |    25/  338 batches | lr 0.00010 | ms/batch 10.58 | loss 0.00003378
| Epoch  27 |    50/  338 batches | lr 0.00010 | ms/batch 10.93 | loss 0.00001335
| Epoch  27 |    75/  338 batches | lr 0.00010 | ms/batch  8.54 | loss 0.00002676
| Epoch  27 |   100/  338 batches | lr 0.00010 | ms/batch 10.17 | loss 0.00001606
| Epoch  27 |   125/  338 batches | lr 0.00010 | ms/batch 10.04 | loss 0.00002209
| Epoch  27 |   150/  338 batches | lr 0.00010 | ms/batch  8.69 | loss 0.00002034
| Epoch  27 |   175/  338 batches | lr 0.00010 | ms/batch 10.81 | loss 0.00001267
| Epoch  27 |   200/  338 batches | lr 0.00010 | ms/batch 10.29 | loss 0.00009006
| Epoch  27 |   

| Epoch  34 |   150/  338 batches | lr 0.00010 | ms/batch 10.28 | loss 0.00000684
| Epoch  34 |   175/  338 batches | lr 0.00010 | ms/batch 10.00 | loss 0.00000784
| Epoch  34 |   200/  338 batches | lr 0.00010 | ms/batch  8.54 | loss 0.00001231
| Epoch  34 |   225/  338 batches | lr 0.00010 | ms/batch 10.12 | loss 0.00001138
| Epoch  34 |   250/  338 batches | lr 0.00010 | ms/batch 10.19 | loss 0.00000805
| Epoch  34 |   275/  338 batches | lr 0.00010 | ms/batch  8.43 | loss 0.00001333
| Epoch  34 |   300/  338 batches | lr 0.00010 | ms/batch 10.07 | loss 0.00001278
| Epoch  34 |   325/  338 batches | lr 0.00010 | ms/batch 10.05 | loss 0.00001543
| Epoch  35 |    25/  338 batches | lr 0.00010 | ms/batch 10.58 | loss 0.00006264
| Epoch  35 |    50/  338 batches | lr 0.00010 | ms/batch 10.14 | loss 0.00005718
| Epoch  35 |    75/  338 batches | lr 0.00010 | ms/batch  8.44 | loss 0.00002643
| Epoch  35 |   100/  338 batches | lr 0.00010 | ms/batch 10.24 | loss 0.00000855
| Epoch  35 |   

| Epoch  42 |    50/  338 batches | lr 0.00010 | ms/batch  9.85 | loss 0.00003109
| Epoch  42 |    75/  338 batches | lr 0.00010 | ms/batch  9.76 | loss 0.00001017
| Epoch  42 |   100/  338 batches | lr 0.00010 | ms/batch  8.23 | loss 0.00000607
| Epoch  42 |   125/  338 batches | lr 0.00010 | ms/batch 10.21 | loss 0.00001087
| Epoch  42 |   150/  338 batches | lr 0.00010 | ms/batch 10.00 | loss 0.00001119
| Epoch  42 |   175/  338 batches | lr 0.00010 | ms/batch  9.99 | loss 0.00001370
| Epoch  42 |   200/  338 batches | lr 0.00010 | ms/batch  8.21 | loss 0.00001781
| Epoch  42 |   225/  338 batches | lr 0.00010 | ms/batch  9.88 | loss 0.00001692
| Epoch  42 |   250/  338 batches | lr 0.00010 | ms/batch  9.76 | loss 0.00000633
| Epoch  42 |   275/  338 batches | lr 0.00010 | ms/batch  8.23 | loss 0.00001739
| Epoch  42 |   300/  338 batches | lr 0.00010 | ms/batch  9.77 | loss 0.00000830
| Epoch  42 |   325/  338 batches | lr 0.00010 | ms/batch  9.75 | loss 0.00000889
| Epoch  43 |   

| Epoch  49 |   275/  338 batches | lr 0.00010 | ms/batch  9.78 | loss 0.00001529
| Epoch  49 |   300/  338 batches | lr 0.00010 | ms/batch  9.94 | loss 0.00001457
| Epoch  49 |   325/  338 batches | lr 0.00010 | ms/batch  8.24 | loss 0.00000784
| Epoch  50 |    25/  338 batches | lr 0.00010 | ms/batch  8.44 | loss 0.00001381
| Epoch  50 |    50/  338 batches | lr 0.00010 | ms/batch  9.78 | loss 0.00001876
| Epoch  50 |    75/  338 batches | lr 0.00010 | ms/batch  9.76 | loss 0.00000585
| Epoch  50 |   100/  338 batches | lr 0.00010 | ms/batch  8.17 | loss 0.00002150
| Epoch  50 |   125/  338 batches | lr 0.00010 | ms/batch  9.88 | loss 0.00001265
| Epoch  50 |   150/  338 batches | lr 0.00010 | ms/batch  9.74 | loss 0.00000675
| Epoch  50 |   175/  338 batches | lr 0.00010 | ms/batch  8.28 | loss 0.00002188
| Epoch  50 |   200/  338 batches | lr 0.00010 | ms/batch  9.97 | loss 0.00001366
| Epoch  50 |   225/  338 batches | lr 0.00010 | ms/batch  9.72 | loss 0.00001516
| Epoch  50 |   

| Epoch  57 |   175/  338 batches | lr 0.00010 | ms/batch  9.76 | loss 0.00000723
| Epoch  57 |   200/  338 batches | lr 0.00010 | ms/batch  9.69 | loss 0.00000906
| Epoch  57 |   225/  338 batches | lr 0.00010 | ms/batch  8.17 | loss 0.00001051
| Epoch  57 |   250/  338 batches | lr 0.00010 | ms/batch  9.77 | loss 0.00001141
| Epoch  57 |   275/  338 batches | lr 0.00010 | ms/batch  9.74 | loss 0.00000617
| Epoch  57 |   300/  338 batches | lr 0.00010 | ms/batch  8.10 | loss 0.00002207
| Epoch  57 |   325/  338 batches | lr 0.00010 | ms/batch  9.71 | loss 0.00002269
| Epoch  58 |    25/  338 batches | lr 0.00010 | ms/batch 10.03 | loss 0.00000592
| Epoch  58 |    50/  338 batches | lr 0.00010 | ms/batch  9.67 | loss 0.00000916
| Epoch  58 |    75/  338 batches | lr 0.00010 | ms/batch  9.72 | loss 0.00001563
| Epoch  58 |   100/  338 batches | lr 0.00010 | ms/batch  8.29 | loss 0.00000808
| Epoch  58 |   125/  338 batches | lr 0.00010 | ms/batch 10.08 | loss 0.00001375
| Epoch  58 |   

| Epoch  65 |    75/  338 batches | lr 0.00010 | ms/batch 10.05 | loss 0.00000693
| Epoch  65 |   100/  338 batches | lr 0.00010 | ms/batch 10.18 | loss 0.00000463
| Epoch  65 |   125/  338 batches | lr 0.00010 | ms/batch  8.57 | loss 0.00000680
| Epoch  65 |   150/  338 batches | lr 0.00010 | ms/batch 10.42 | loss 0.00001308
| Epoch  65 |   175/  338 batches | lr 0.00010 | ms/batch 10.38 | loss 0.00001304
| Epoch  65 |   200/  338 batches | lr 0.00010 | ms/batch 10.61 | loss 0.00001217
| Epoch  65 |   225/  338 batches | lr 0.00010 | ms/batch  8.47 | loss 0.00000746
| Epoch  65 |   250/  338 batches | lr 0.00010 | ms/batch 11.07 | loss 0.00001030
| Epoch  65 |   275/  338 batches | lr 0.00010 | ms/batch 10.76 | loss 0.00000999
| Epoch  65 |   300/  338 batches | lr 0.00010 | ms/batch  8.42 | loss 0.00000559
| Epoch  65 |   325/  338 batches | lr 0.00010 | ms/batch 10.32 | loss 0.00003448
| Epoch  66 |    25/  338 batches | lr 0.00010 | ms/batch 11.02 | loss 0.00000955
| Epoch  66 |   

| Epoch  72 |   300/  338 batches | lr 0.00010 | ms/batch 10.07 | loss 0.00000778
| Epoch  72 |   325/  338 batches | lr 0.00010 | ms/batch 10.31 | loss 0.00000496
| Epoch  73 |    25/  338 batches | lr 0.00010 | ms/batch 10.47 | loss 0.00000565
| Epoch  73 |    50/  338 batches | lr 0.00010 | ms/batch  8.39 | loss 0.00000659
| Epoch  73 |    75/  338 batches | lr 0.00010 | ms/batch 10.25 | loss 0.00001354
| Epoch  73 |   100/  338 batches | lr 0.00010 | ms/batch 10.17 | loss 0.00008716
| Epoch  73 |   125/  338 batches | lr 0.00010 | ms/batch  8.35 | loss 0.00001288
| Epoch  73 |   150/  338 batches | lr 0.00010 | ms/batch  9.90 | loss 0.00000449
| Epoch  73 |   175/  338 batches | lr 0.00010 | ms/batch  9.85 | loss 0.00000482
| Epoch  73 |   200/  338 batches | lr 0.00010 | ms/batch  8.30 | loss 0.00000492
| Epoch  73 |   225/  338 batches | lr 0.00010 | ms/batch  9.75 | loss 0.00000609
| Epoch  73 |   250/  338 batches | lr 0.00010 | ms/batch  9.75 | loss 0.00000425
| Epoch  73 |   

| Epoch  80 |   200/  338 batches | lr 0.00010 | ms/batch  9.68 | loss 0.00000465
| Epoch  80 |   225/  338 batches | lr 0.00010 | ms/batch  9.70 | loss 0.00000647
| Epoch  80 |   250/  338 batches | lr 0.00010 | ms/batch  8.05 | loss 0.00003388
| Epoch  80 |   275/  338 batches | lr 0.00010 | ms/batch  9.63 | loss 0.00001098
| Epoch  80 |   300/  338 batches | lr 0.00010 | ms/batch  9.58 | loss 0.00000666
| Epoch  80 |   325/  338 batches | lr 0.00010 | ms/batch  8.08 | loss 0.00000395
| Epoch  81 |    25/  338 batches | lr 0.00010 | ms/batch  8.36 | loss 0.00000439
| Epoch  81 |    50/  338 batches | lr 0.00010 | ms/batch  9.65 | loss 0.00000827
| Epoch  81 |    75/  338 batches | lr 0.00010 | ms/batch  9.69 | loss 0.00001153
| Epoch  81 |   100/  338 batches | lr 0.00010 | ms/batch  9.62 | loss 0.00000934
| Epoch  81 |   125/  338 batches | lr 0.00010 | ms/batch  8.07 | loss 0.00000668
| Epoch  81 |   150/  338 batches | lr 0.00010 | ms/batch  9.64 | loss 0.00002718
| Epoch  81 |   

| Epoch  88 |   100/  338 batches | lr 0.00010 | ms/batch  9.60 | loss 0.00000501
| Epoch  88 |   125/  338 batches | lr 0.00010 | ms/batch  9.62 | loss 0.00000518
| Epoch  88 |   150/  338 batches | lr 0.00010 | ms/batch  8.07 | loss 0.00001495
| Epoch  88 |   175/  338 batches | lr 0.00010 | ms/batch  9.60 | loss 0.00001564
| Epoch  88 |   200/  338 batches | lr 0.00010 | ms/batch  9.60 | loss 0.00000753
| Epoch  88 |   225/  338 batches | lr 0.00010 | ms/batch  9.57 | loss 0.00000355
| Epoch  88 |   250/  338 batches | lr 0.00010 | ms/batch  8.07 | loss 0.00001660
| Epoch  88 |   275/  338 batches | lr 0.00010 | ms/batch  9.72 | loss 0.00001123
| Epoch  88 |   300/  338 batches | lr 0.00010 | ms/batch  9.57 | loss 0.00000678
| Epoch  88 |   325/  338 batches | lr 0.00010 | ms/batch  8.03 | loss 0.00000576
| Epoch  89 |    25/  338 batches | lr 0.00010 | ms/batch  8.31 | loss 0.00001119
| Epoch  89 |    50/  338 batches | lr 0.00010 | ms/batch  9.57 | loss 0.00001687
| Epoch  89 |   

| Epoch  95 |   325/  338 batches | lr 0.00010 | ms/batch 10.04 | loss 0.00000495
| Epoch  96 |    25/  338 batches | lr 0.00010 | ms/batch  9.98 | loss 0.00003130
| Epoch  96 |    50/  338 batches | lr 0.00010 | ms/batch  9.93 | loss 0.00003216
| Epoch  96 |    75/  338 batches | lr 0.00010 | ms/batch  8.18 | loss 0.00000422
| Epoch  96 |   100/  338 batches | lr 0.00010 | ms/batch 10.00 | loss 0.00000390
| Epoch  96 |   125/  338 batches | lr 0.00010 | ms/batch  9.93 | loss 0.00000364
| Epoch  96 |   150/  338 batches | lr 0.00010 | ms/batch  8.26 | loss 0.00000364
| Epoch  96 |   175/  338 batches | lr 0.00010 | ms/batch 10.02 | loss 0.00000329
| Epoch  96 |   200/  338 batches | lr 0.00010 | ms/batch  9.80 | loss 0.00000362
| Epoch  96 |   225/  338 batches | lr 0.00010 | ms/batch  8.16 | loss 0.00000283
| Epoch  96 |   250/  338 batches | lr 0.00010 | ms/batch  9.64 | loss 0.00000941
| Epoch  96 |   275/  338 batches | lr 0.00010 | ms/batch  9.61 | loss 0.00002069
| Epoch  96 |   

| Epoch 103 |   225/  338 batches | lr 0.00010 | ms/batch  9.61 | loss 0.00000338
| Epoch 103 |   250/  338 batches | lr 0.00010 | ms/batch  9.60 | loss 0.00000456
| Epoch 103 |   275/  338 batches | lr 0.00010 | ms/batch  8.10 | loss 0.00000695
| Epoch 103 |   300/  338 batches | lr 0.00010 | ms/batch  9.60 | loss 0.00000711
| Epoch 103 |   325/  338 batches | lr 0.00010 | ms/batch  9.65 | loss 0.00001074
| Epoch 104 |    25/  338 batches | lr 0.00010 | ms/batch  9.97 | loss 0.00001523
| Epoch 104 |    50/  338 batches | lr 0.00010 | ms/batch  8.04 | loss 0.00000394
| Epoch 104 |    75/  338 batches | lr 0.00010 | ms/batch  9.66 | loss 0.00000391
| Epoch 104 |   100/  338 batches | lr 0.00010 | ms/batch  9.64 | loss 0.00000965
| Epoch 104 |   125/  338 batches | lr 0.00010 | ms/batch  9.61 | loss 0.00001080
| Epoch 104 |   150/  338 batches | lr 0.00010 | ms/batch  8.01 | loss 0.00001901
| Epoch 104 |   175/  338 batches | lr 0.00010 | ms/batch  9.58 | loss 0.00000825
| Epoch 104 |   

| Epoch 111 |   125/  338 batches | lr 0.00010 | ms/batch  9.64 | loss 0.00000615
| Epoch 111 |   150/  338 batches | lr 0.00010 | ms/batch  9.69 | loss 0.00001671
| Epoch 111 |   175/  338 batches | lr 0.00010 | ms/batch  8.11 | loss 0.00000843
| Epoch 111 |   200/  338 batches | lr 0.00010 | ms/batch  9.72 | loss 0.00000675
| Epoch 111 |   225/  338 batches | lr 0.00010 | ms/batch  9.70 | loss 0.00000625
| Epoch 111 |   250/  338 batches | lr 0.00010 | ms/batch  9.63 | loss 0.00000582
| Epoch 111 |   275/  338 batches | lr 0.00010 | ms/batch  8.04 | loss 0.00000434
| Epoch 111 |   300/  338 batches | lr 0.00010 | ms/batch  9.56 | loss 0.00000612
| Epoch 111 |   325/  338 batches | lr 0.00010 | ms/batch  9.62 | loss 0.00004704
| Epoch 112 |    25/  338 batches | lr 0.00010 | ms/batch  9.99 | loss 0.00000723
| Epoch 112 |    50/  338 batches | lr 0.00010 | ms/batch  8.09 | loss 0.00000321
| Epoch 112 |    75/  338 batches | lr 0.00010 | ms/batch  9.62 | loss 0.00000326
| Epoch 112 |   

| Epoch 119 |    25/  338 batches | lr 0.00010 | ms/batch  9.97 | loss 0.00000504
| Epoch 119 |    50/  338 batches | lr 0.00010 | ms/batch  9.57 | loss 0.00001547
| Epoch 119 |    75/  338 batches | lr 0.00010 | ms/batch  9.62 | loss 0.00001569
| Epoch 119 |   100/  338 batches | lr 0.00010 | ms/batch  8.07 | loss 0.00000429
| Epoch 119 |   125/  338 batches | lr 0.00010 | ms/batch  9.58 | loss 0.00000496
| Epoch 119 |   150/  338 batches | lr 0.00010 | ms/batch  9.57 | loss 0.00000597
| Epoch 119 |   175/  338 batches | lr 0.00010 | ms/batch  8.01 | loss 0.00000290
| Epoch 119 |   200/  338 batches | lr 0.00010 | ms/batch  9.58 | loss 0.00001255
| Epoch 119 |   225/  338 batches | lr 0.00010 | ms/batch  9.56 | loss 0.00000455
| Epoch 119 |   250/  338 batches | lr 0.00010 | ms/batch  8.06 | loss 0.00000328
| Epoch 119 |   275/  338 batches | lr 0.00010 | ms/batch  9.58 | loss 0.00000644
| Epoch 119 |   300/  338 batches | lr 0.00010 | ms/batch  9.73 | loss 0.00000989
| Epoch 119 |   

| Epoch   1 |   225/  338 batches | lr 0.00010 | ms/batch  8.87 | loss 0.00017101
| Epoch   1 |   250/  338 batches | lr 0.00010 | ms/batch 10.42 | loss 0.00017800
| Epoch   1 |   275/  338 batches | lr 0.00010 | ms/batch 10.41 | loss 0.00015356
| Epoch   1 |   300/  338 batches | lr 0.00010 | ms/batch  8.86 | loss 0.00015907
| Epoch   1 |   325/  338 batches | lr 0.00010 | ms/batch 10.51 | loss 0.00014406
| Epoch   2 |    25/  338 batches | lr 0.00010 | ms/batch 10.90 | loss 0.00013608
| Epoch   2 |    50/  338 batches | lr 0.00010 | ms/batch 10.41 | loss 0.00012084
| Epoch   2 |    75/  338 batches | lr 0.00010 | ms/batch  8.83 | loss 0.00011260
| Epoch   2 |   100/  338 batches | lr 0.00010 | ms/batch 10.39 | loss 0.00010753
| Epoch   2 |   125/  338 batches | lr 0.00010 | ms/batch 10.36 | loss 0.00011799
| Epoch   2 |   150/  338 batches | lr 0.00010 | ms/batch 10.38 | loss 0.00011915
| Epoch   2 |   175/  338 batches | lr 0.00010 | ms/batch  8.82 | loss 0.00011208
| Epoch   2 |   

| Epoch   9 |   125/  338 batches | lr 0.00010 | ms/batch  8.86 | loss 0.00007179
| Epoch   9 |   150/  338 batches | lr 0.00010 | ms/batch 10.35 | loss 0.00003902
| Epoch   9 |   175/  338 batches | lr 0.00010 | ms/batch 10.37 | loss 0.00002227
| Epoch   9 |   200/  338 batches | lr 0.00010 | ms/batch  8.81 | loss 0.00003290
| Epoch   9 |   225/  338 batches | lr 0.00010 | ms/batch 10.37 | loss 0.00009036
| Epoch   9 |   250/  338 batches | lr 0.00010 | ms/batch 10.37 | loss 0.00004451
| Epoch   9 |   275/  338 batches | lr 0.00010 | ms/batch 10.35 | loss 0.00002939
| Epoch   9 |   300/  338 batches | lr 0.00010 | ms/batch  9.01 | loss 0.00001844
| Epoch   9 |   325/  338 batches | lr 0.00010 | ms/batch 10.37 | loss 0.00004656
| Epoch  10 |    25/  338 batches | lr 0.00010 | ms/batch 10.76 | loss 0.00007787
| Epoch  10 |    50/  338 batches | lr 0.00010 | ms/batch 10.37 | loss 0.00003792
| Epoch  10 |    75/  338 batches | lr 0.00010 | ms/batch  8.81 | loss 0.00003464
| Epoch  10 |   

| Epoch  17 |    25/  338 batches | lr 0.00010 | ms/batch  9.38 | loss 0.00002674
| Epoch  17 |    50/  338 batches | lr 0.00010 | ms/batch 10.41 | loss 0.00003547
| Epoch  17 |    75/  338 batches | lr 0.00010 | ms/batch 10.42 | loss 0.00001236
| Epoch  17 |   100/  338 batches | lr 0.00010 | ms/batch 10.50 | loss 0.00006074
| Epoch  17 |   125/  338 batches | lr 0.00010 | ms/batch  8.93 | loss 0.00003046
| Epoch  17 |   150/  338 batches | lr 0.00010 | ms/batch 10.47 | loss 0.00001951
| Epoch  17 |   175/  338 batches | lr 0.00010 | ms/batch 10.63 | loss 0.00006928
| Epoch  17 |   200/  338 batches | lr 0.00010 | ms/batch  8.79 | loss 0.00003429
| Epoch  17 |   225/  338 batches | lr 0.00010 | ms/batch 10.44 | loss 0.00001430
| Epoch  17 |   250/  338 batches | lr 0.00010 | ms/batch 10.39 | loss 0.00001986
| Epoch  17 |   275/  338 batches | lr 0.00010 | ms/batch  8.86 | loss 0.00002518
| Epoch  17 |   300/  338 batches | lr 0.00010 | ms/batch 10.41 | loss 0.00003500
| Epoch  17 |   

| Epoch  24 |   250/  338 batches | lr 0.00010 | ms/batch  8.89 | loss 0.00001266
| Epoch  24 |   275/  338 batches | lr 0.00010 | ms/batch 10.47 | loss 0.00001374
| Epoch  24 |   300/  338 batches | lr 0.00010 | ms/batch 10.84 | loss 0.00001625
| Epoch  24 |   325/  338 batches | lr 0.00010 | ms/batch  9.20 | loss 0.00001989
| Epoch  25 |    25/  338 batches | lr 0.00010 | ms/batch  9.74 | loss 0.00004820
| Epoch  25 |    50/  338 batches | lr 0.00010 | ms/batch 10.70 | loss 0.00003466
| Epoch  25 |    75/  338 batches | lr 0.00010 | ms/batch 10.84 | loss 0.00001821
| Epoch  25 |   100/  338 batches | lr 0.00010 | ms/batch  8.91 | loss 0.00001243
| Epoch  25 |   125/  338 batches | lr 0.00010 | ms/batch 10.42 | loss 0.00000903
| Epoch  25 |   150/  338 batches | lr 0.00010 | ms/batch 10.48 | loss 0.00001121
| Epoch  25 |   175/  338 batches | lr 0.00010 | ms/batch 10.47 | loss 0.00010817
| Epoch  25 |   200/  338 batches | lr 0.00010 | ms/batch  8.82 | loss 0.00002048
| Epoch  25 |   

| Epoch  32 |   150/  338 batches | lr 0.00010 | ms/batch  9.22 | loss 0.00000927
| Epoch  32 |   175/  338 batches | lr 0.00010 | ms/batch 10.79 | loss 0.00000762
| Epoch  32 |   200/  338 batches | lr 0.00010 | ms/batch 11.14 | loss 0.00000682
| Epoch  32 |   225/  338 batches | lr 0.00010 | ms/batch  9.39 | loss 0.00000963
| Epoch  32 |   250/  338 batches | lr 0.00010 | ms/batch 11.00 | loss 0.00000659
| Epoch  32 |   275/  338 batches | lr 0.00010 | ms/batch 11.04 | loss 0.00001128
| Epoch  32 |   300/  338 batches | lr 0.00010 | ms/batch 11.05 | loss 0.00001663
| Epoch  32 |   325/  338 batches | lr 0.00010 | ms/batch  9.40 | loss 0.00005366
| Epoch  33 |    25/  338 batches | lr 0.00010 | ms/batch 10.16 | loss 0.00005604
| Epoch  33 |    50/  338 batches | lr 0.00010 | ms/batch 11.55 | loss 0.00002601
| Epoch  33 |    75/  338 batches | lr 0.00010 | ms/batch 11.14 | loss 0.00000675
| Epoch  33 |   100/  338 batches | lr 0.00010 | ms/batch  9.40 | loss 0.00001489
| Epoch  33 |   

| Epoch  40 |    50/  338 batches | lr 0.00010 | ms/batch  9.82 | loss 0.00000846
| Epoch  40 |    75/  338 batches | lr 0.00010 | ms/batch 10.77 | loss 0.00000720
| Epoch  40 |   100/  338 batches | lr 0.00010 | ms/batch 10.84 | loss 0.00001574
| Epoch  40 |   125/  338 batches | lr 0.00010 | ms/batch 10.80 | loss 0.00001061
| Epoch  40 |   150/  338 batches | lr 0.00010 | ms/batch  9.06 | loss 0.00003961
| Epoch  40 |   175/  338 batches | lr 0.00010 | ms/batch 10.85 | loss 0.00001276
| Epoch  40 |   200/  338 batches | lr 0.00010 | ms/batch 11.32 | loss 0.00001097
| Epoch  40 |   225/  338 batches | lr 0.00010 | ms/batch  9.07 | loss 0.00001066
| Epoch  40 |   250/  338 batches | lr 0.00010 | ms/batch 11.02 | loss 0.00001771
| Epoch  40 |   275/  338 batches | lr 0.00010 | ms/batch 10.88 | loss 0.00000610
| Epoch  40 |   300/  338 batches | lr 0.00010 | ms/batch 13.66 | loss 0.00002303
| Epoch  40 |   325/  338 batches | lr 0.00010 | ms/batch 10.86 | loss 0.00001056
| Epoch  41 |   

| Epoch  47 |   275/  338 batches | lr 0.00010 | ms/batch  8.81 | loss 0.00000661
| Epoch  47 |   300/  338 batches | lr 0.00010 | ms/batch 10.45 | loss 0.00000548
| Epoch  47 |   325/  338 batches | lr 0.00010 | ms/batch 10.38 | loss 0.00006517
| Epoch  48 |    25/  338 batches | lr 0.00010 | ms/batch 10.76 | loss 0.00002976
| Epoch  48 |    50/  338 batches | lr 0.00010 | ms/batch  8.86 | loss 0.00000995
| Epoch  48 |    75/  338 batches | lr 0.00010 | ms/batch 10.37 | loss 0.00001057
| Epoch  48 |   100/  338 batches | lr 0.00010 | ms/batch 10.49 | loss 0.00000441
| Epoch  48 |   125/  338 batches | lr 0.00010 | ms/batch  8.87 | loss 0.00000525
| Epoch  48 |   150/  338 batches | lr 0.00010 | ms/batch 10.54 | loss 0.00000541
| Epoch  48 |   175/  338 batches | lr 0.00010 | ms/batch 10.43 | loss 0.00001061
| Epoch  48 |   200/  338 batches | lr 0.00010 | ms/batch 10.45 | loss 0.00002015
| Epoch  48 |   225/  338 batches | lr 0.00010 | ms/batch  8.85 | loss 0.00002284
| Epoch  48 |   

| Epoch  55 |   175/  338 batches | lr 0.00010 | ms/batch  8.82 | loss 0.00000640
| Epoch  55 |   200/  338 batches | lr 0.00010 | ms/batch 10.42 | loss 0.00001005
| Epoch  55 |   225/  338 batches | lr 0.00010 | ms/batch 10.45 | loss 0.00000966
| Epoch  55 |   250/  338 batches | lr 0.00010 | ms/batch  8.88 | loss 0.00000675
| Epoch  55 |   275/  338 batches | lr 0.00010 | ms/batch 10.46 | loss 0.00000738
| Epoch  55 |   300/  338 batches | lr 0.00010 | ms/batch 10.44 | loss 0.00003047
| Epoch  55 |   325/  338 batches | lr 0.00010 | ms/batch 10.40 | loss 0.00000776
| Epoch  56 |    25/  338 batches | lr 0.00010 | ms/batch 10.87 | loss 0.00000577
| Epoch  56 |    50/  338 batches | lr 0.00010 | ms/batch  8.88 | loss 0.00002587
| Epoch  56 |    75/  338 batches | lr 0.00010 | ms/batch 10.45 | loss 0.00002141
| Epoch  56 |   100/  338 batches | lr 0.00010 | ms/batch 10.45 | loss 0.00002124
| Epoch  56 |   125/  338 batches | lr 0.00010 | ms/batch  8.87 | loss 0.00001465
| Epoch  56 |   

| Epoch  63 |    75/  338 batches | lr 0.00010 | ms/batch  8.81 | loss 0.00002164
| Epoch  63 |   100/  338 batches | lr 0.00010 | ms/batch 10.36 | loss 0.00005989
| Epoch  63 |   125/  338 batches | lr 0.00010 | ms/batch 10.37 | loss 0.00001740
| Epoch  63 |   150/  338 batches | lr 0.00010 | ms/batch 10.40 | loss 0.00000685
| Epoch  63 |   175/  338 batches | lr 0.00010 | ms/batch  8.79 | loss 0.00000516
| Epoch  63 |   200/  338 batches | lr 0.00010 | ms/batch 10.38 | loss 0.00000636
| Epoch  63 |   225/  338 batches | lr 0.00010 | ms/batch 10.39 | loss 0.00000612
| Epoch  63 |   250/  338 batches | lr 0.00010 | ms/batch  8.82 | loss 0.00000336
| Epoch  63 |   275/  338 batches | lr 0.00010 | ms/batch 10.38 | loss 0.00000285
| Epoch  63 |   300/  338 batches | lr 0.00010 | ms/batch 10.36 | loss 0.00000378
| Epoch  63 |   325/  338 batches | lr 0.00010 | ms/batch  8.83 | loss 0.00001636
| Epoch  64 |    25/  338 batches | lr 0.00010 | ms/batch  9.20 | loss 0.00001503
| Epoch  64 |   

| Epoch  70 |   300/  338 batches | lr 0.00010 | ms/batch  9.01 | loss 0.00000432
| Epoch  70 |   325/  338 batches | lr 0.00010 | ms/batch 11.13 | loss 0.00000713
| Epoch  71 |    25/  338 batches | lr 0.00010 | ms/batch 10.73 | loss 0.00001914
| Epoch  71 |    50/  338 batches | lr 0.00010 | ms/batch 10.67 | loss 0.00000963
| Epoch  71 |    75/  338 batches | lr 0.00010 | ms/batch  8.93 | loss 0.00000560
| Epoch  71 |   100/  338 batches | lr 0.00010 | ms/batch 10.37 | loss 0.00000364
| Epoch  71 |   125/  338 batches | lr 0.00010 | ms/batch 10.36 | loss 0.00000323
| Epoch  71 |   150/  338 batches | lr 0.00010 | ms/batch  8.84 | loss 0.00000392
| Epoch  71 |   175/  338 batches | lr 0.00010 | ms/batch 10.40 | loss 0.00000836
| Epoch  71 |   200/  338 batches | lr 0.00010 | ms/batch 10.36 | loss 0.00002496
| Epoch  71 |   225/  338 batches | lr 0.00010 | ms/batch 10.45 | loss 0.00001641
| Epoch  71 |   250/  338 batches | lr 0.00010 | ms/batch  8.85 | loss 0.00000694
| Epoch  71 |   

| Epoch  78 |   200/  338 batches | lr 0.00010 | ms/batch  9.25 | loss 0.00000548
| Epoch  78 |   225/  338 batches | lr 0.00010 | ms/batch 10.98 | loss 0.00000749
| Epoch  78 |   250/  338 batches | lr 0.00010 | ms/batch 10.86 | loss 0.00000510
| Epoch  78 |   275/  338 batches | lr 0.00010 | ms/batch  8.99 | loss 0.00000345
| Epoch  78 |   300/  338 batches | lr 0.00010 | ms/batch 10.50 | loss 0.00000459
| Epoch  78 |   325/  338 batches | lr 0.00010 | ms/batch 10.63 | loss 0.00000900
| Epoch  79 |    25/  338 batches | lr 0.00010 | ms/batch 11.69 | loss 0.00001788
| Epoch  79 |    50/  338 batches | lr 0.00010 | ms/batch 10.60 | loss 0.00000656
| Epoch  79 |    75/  338 batches | lr 0.00010 | ms/batch  9.06 | loss 0.00000546
| Epoch  79 |   100/  338 batches | lr 0.00010 | ms/batch 10.73 | loss 0.00000436
| Epoch  79 |   125/  338 batches | lr 0.00010 | ms/batch 10.35 | loss 0.00000487
| Epoch  79 |   150/  338 batches | lr 0.00010 | ms/batch  8.90 | loss 0.00001363
| Epoch  79 |   

| Epoch  86 |   100/  338 batches | lr 0.00010 | ms/batch  9.31 | loss 0.00002576
| Epoch  86 |   125/  338 batches | lr 0.00010 | ms/batch 10.92 | loss 0.00000540
| Epoch  86 |   150/  338 batches | lr 0.00010 | ms/batch 10.91 | loss 0.00000497
| Epoch  86 |   175/  338 batches | lr 0.00010 | ms/batch 10.81 | loss 0.00000800
| Epoch  86 |   200/  338 batches | lr 0.00010 | ms/batch  9.11 | loss 0.00001124
| Epoch  86 |   225/  338 batches | lr 0.00010 | ms/batch 10.80 | loss 0.00000923
| Epoch  86 |   250/  338 batches | lr 0.00010 | ms/batch 10.96 | loss 0.00000324
| Epoch  86 |   275/  338 batches | lr 0.00010 | ms/batch  9.31 | loss 0.00000334
| Epoch  86 |   300/  338 batches | lr 0.00010 | ms/batch 11.03 | loss 0.00000489
| Epoch  86 |   325/  338 batches | lr 0.00010 | ms/batch 11.04 | loss 0.00003355
| Epoch  87 |    25/  338 batches | lr 0.00010 | ms/batch 11.51 | loss 0.00001238
| Epoch  87 |    50/  338 batches | lr 0.00010 | ms/batch  9.29 | loss 0.00001760
| Epoch  87 |   

| Epoch  93 |   325/  338 batches | lr 0.00010 | ms/batch  9.17 | loss 0.00000308
| Epoch  94 |    25/  338 batches | lr 0.00010 | ms/batch  9.86 | loss 0.00001355
| Epoch  94 |    50/  338 batches | lr 0.00010 | ms/batch 11.25 | loss 0.00000718
| Epoch  94 |    75/  338 batches | lr 0.00010 | ms/batch 10.82 | loss 0.00000299
| Epoch  94 |   100/  338 batches | lr 0.00010 | ms/batch  9.28 | loss 0.00000337
| Epoch  94 |   125/  338 batches | lr 0.00010 | ms/batch 11.53 | loss 0.00000692
| Epoch  94 |   150/  338 batches | lr 0.00010 | ms/batch 11.72 | loss 0.00002058
| Epoch  94 |   175/  338 batches | lr 0.00010 | ms/batch  9.37 | loss 0.00000445
| Epoch  94 |   200/  338 batches | lr 0.00010 | ms/batch 11.38 | loss 0.00001158
| Epoch  94 |   225/  338 batches | lr 0.00010 | ms/batch 11.88 | loss 0.00000609
| Epoch  94 |   250/  338 batches | lr 0.00010 | ms/batch 11.46 | loss 0.00000734
| Epoch  94 |   275/  338 batches | lr 0.00010 | ms/batch  9.45 | loss 0.00001005
| Epoch  94 |   

| Epoch 101 |   225/  338 batches | lr 0.00010 | ms/batch  9.68 | loss 0.00000809
| Epoch 101 |   250/  338 batches | lr 0.00010 | ms/batch 11.31 | loss 0.00000407
| Epoch 101 |   275/  338 batches | lr 0.00010 | ms/batch 11.21 | loss 0.00000665
| Epoch 101 |   300/  338 batches | lr 0.00010 | ms/batch 10.01 | loss 0.00000602
| Epoch 101 |   325/  338 batches | lr 0.00010 | ms/batch 11.41 | loss 0.00002229
| Epoch 102 |    25/  338 batches | lr 0.00010 | ms/batch 11.48 | loss 0.00002159
| Epoch 102 |    50/  338 batches | lr 0.00010 | ms/batch 10.88 | loss 0.00001776
| Epoch 102 |    75/  338 batches | lr 0.00010 | ms/batch 10.84 | loss 0.00000360
| Epoch 102 |   100/  338 batches | lr 0.00010 | ms/batch  9.29 | loss 0.00000238
| Epoch 102 |   125/  338 batches | lr 0.00010 | ms/batch 10.83 | loss 0.00000400
| Epoch 102 |   150/  338 batches | lr 0.00010 | ms/batch 10.88 | loss 0.00000356
| Epoch 102 |   175/  338 batches | lr 0.00010 | ms/batch  9.12 | loss 0.00000419
| Epoch 102 |   

| Epoch 109 |   125/  338 batches | lr 0.00010 | ms/batch  8.91 | loss 0.00000384
| Epoch 109 |   150/  338 batches | lr 0.00010 | ms/batch 10.43 | loss 0.00000469
| Epoch 109 |   175/  338 batches | lr 0.00010 | ms/batch 10.37 | loss 0.00001097
| Epoch 109 |   200/  338 batches | lr 0.00010 | ms/batch 10.42 | loss 0.00000566
| Epoch 109 |   225/  338 batches | lr 0.00010 | ms/batch  8.84 | loss 0.00000541
| Epoch 109 |   250/  338 batches | lr 0.00010 | ms/batch 10.45 | loss 0.00000690
| Epoch 109 |   275/  338 batches | lr 0.00010 | ms/batch 10.42 | loss 0.00000814
| Epoch 109 |   300/  338 batches | lr 0.00010 | ms/batch  8.92 | loss 0.00000477
| Epoch 109 |   325/  338 batches | lr 0.00010 | ms/batch 10.40 | loss 0.00001113
| Epoch 110 |    25/  338 batches | lr 0.00010 | ms/batch 10.88 | loss 0.00000515
| Epoch 110 |    50/  338 batches | lr 0.00010 | ms/batch 10.48 | loss 0.00000633
| Epoch 110 |    75/  338 batches | lr 0.00010 | ms/batch  8.81 | loss 0.00000706
| Epoch 110 |   

| Epoch 117 |    25/  338 batches | lr 0.00010 | ms/batch 10.74 | loss 0.00000571
| Epoch 117 |    50/  338 batches | lr 0.00010 | ms/batch  8.81 | loss 0.00000521
| Epoch 117 |    75/  338 batches | lr 0.00010 | ms/batch 10.36 | loss 0.00000460
| Epoch 117 |   100/  338 batches | lr 0.00010 | ms/batch 10.39 | loss 0.00000268
| Epoch 117 |   125/  338 batches | lr 0.00010 | ms/batch  8.85 | loss 0.00000214
| Epoch 117 |   150/  338 batches | lr 0.00010 | ms/batch 10.44 | loss 0.00000256
| Epoch 117 |   175/  338 batches | lr 0.00010 | ms/batch 10.44 | loss 0.00003112
| Epoch 117 |   200/  338 batches | lr 0.00010 | ms/batch  8.95 | loss 0.00001720
| Epoch 117 |   225/  338 batches | lr 0.00010 | ms/batch 10.39 | loss 0.00000543
| Epoch 117 |   250/  338 batches | lr 0.00010 | ms/batch 10.36 | loss 0.00000553
| Epoch 117 |   275/  338 batches | lr 0.00010 | ms/batch 10.40 | loss 0.00000579
| Epoch 117 |   300/  338 batches | lr 0.00010 | ms/batch  8.82 | loss 0.00000303
| Epoch 117 |   

| Epoch 124 |   250/  338 batches | lr 0.00010 | ms/batch  8.83 | loss 0.00001149
| Epoch 124 |   275/  338 batches | lr 0.00010 | ms/batch 10.37 | loss 0.00000574
| Epoch 124 |   300/  338 batches | lr 0.00010 | ms/batch 10.37 | loss 0.00000658
| Epoch 124 |   325/  338 batches | lr 0.00010 | ms/batch  8.81 | loss 0.00000621
Model:  {'hidden_size': 400, 'n_layers': 8, 'act_fun': 'LeakyReLU', 'init_methods': 'xavier uniform'}
| Epoch   0 |    25/  338 batches | lr 0.00010 | ms/batch  9.98 | loss 0.20924742
| Epoch   0 |    50/  338 batches | lr 0.00010 | ms/batch 11.09 | loss 0.01621773
| Epoch   0 |    75/  338 batches | lr 0.00010 | ms/batch 11.14 | loss 0.00547767
| Epoch   0 |   100/  338 batches | lr 0.00010 | ms/batch 11.16 | loss 0.00280546
| Epoch   0 |   125/  338 batches | lr 0.00010 | ms/batch  9.57 | loss 0.00161236
| Epoch   0 |   150/  338 batches | lr 0.00010 | ms/batch 11.14 | loss 0.00107543
| Epoch   0 |   175/  338 batches | lr 0.00010 | ms/batch 11.18 | loss 0.00081

| Epoch   7 |   125/  338 batches | lr 0.00010 | ms/batch 11.15 | loss 0.00002383
| Epoch   7 |   150/  338 batches | lr 0.00010 | ms/batch  9.59 | loss 0.00002443
| Epoch   7 |   175/  338 batches | lr 0.00010 | ms/batch 11.13 | loss 0.00003434
| Epoch   7 |   200/  338 batches | lr 0.00010 | ms/batch 11.22 | loss 0.00003194
| Epoch   7 |   225/  338 batches | lr 0.00010 | ms/batch 11.23 | loss 0.00002303
| Epoch   7 |   250/  338 batches | lr 0.00010 | ms/batch  9.58 | loss 0.00009100
| Epoch   7 |   275/  338 batches | lr 0.00010 | ms/batch 11.14 | loss 0.00003573
| Epoch   7 |   300/  338 batches | lr 0.00010 | ms/batch 11.35 | loss 0.00005234
| Epoch   7 |   325/  338 batches | lr 0.00010 | ms/batch  9.74 | loss 0.00004729
| Epoch   8 |    25/  338 batches | lr 0.00010 | ms/batch 10.12 | loss 0.00003998
| Epoch   8 |    50/  338 batches | lr 0.00010 | ms/batch 11.29 | loss 0.00033062
| Epoch   8 |    75/  338 batches | lr 0.00010 | ms/batch 11.19 | loss 0.00004189
| Epoch   8 |   

| Epoch  15 |    25/  338 batches | lr 0.00010 | ms/batch 12.03 | loss 0.00003969
| Epoch  15 |    50/  338 batches | lr 0.00010 | ms/batch 11.38 | loss 0.00001736
| Epoch  15 |    75/  338 batches | lr 0.00010 | ms/batch  9.98 | loss 0.00005738
| Epoch  15 |   100/  338 batches | lr 0.00010 | ms/batch 11.45 | loss 0.00005476
| Epoch  15 |   125/  338 batches | lr 0.00010 | ms/batch 11.18 | loss 0.00001567
| Epoch  15 |   150/  338 batches | lr 0.00010 | ms/batch  9.59 | loss 0.00001490
| Epoch  15 |   175/  338 batches | lr 0.00010 | ms/batch 11.18 | loss 0.00005460
| Epoch  15 |   200/  338 batches | lr 0.00010 | ms/batch 11.36 | loss 0.00036945
| Epoch  15 |   225/  338 batches | lr 0.00010 | ms/batch  9.82 | loss 0.00003895
| Epoch  15 |   250/  338 batches | lr 0.00010 | ms/batch 11.40 | loss 0.00001946
| Epoch  15 |   275/  338 batches | lr 0.00010 | ms/batch 11.36 | loss 0.00001369
| Epoch  15 |   300/  338 batches | lr 0.00010 | ms/batch 11.47 | loss 0.00001537
| Epoch  15 |   

| Epoch  22 |   250/  338 batches | lr 0.00010 | ms/batch 11.95 | loss 0.00000851
| Epoch  22 |   275/  338 batches | lr 0.00010 | ms/batch 10.11 | loss 0.00000852
| Epoch  22 |   300/  338 batches | lr 0.00010 | ms/batch 11.70 | loss 0.00001128
| Epoch  22 |   325/  338 batches | lr 0.00010 | ms/batch 11.60 | loss 0.00008735
| Epoch  23 |    25/  338 batches | lr 0.00010 | ms/batch 12.50 | loss 0.00001302
| Epoch  23 |    50/  338 batches | lr 0.00010 | ms/batch 10.07 | loss 0.00001257
| Epoch  23 |    75/  338 batches | lr 0.00010 | ms/batch 12.07 | loss 0.00002068
| Epoch  23 |   100/  338 batches | lr 0.00010 | ms/batch 11.74 | loss 0.00002143
| Epoch  23 |   125/  338 batches | lr 0.00010 | ms/batch 11.68 | loss 0.00002771
| Epoch  23 |   150/  338 batches | lr 0.00010 | ms/batch 10.34 | loss 0.00003448
| Epoch  23 |   175/  338 batches | lr 0.00010 | ms/batch 12.20 | loss 0.00001802
| Epoch  23 |   200/  338 batches | lr 0.00010 | ms/batch 12.66 | loss 0.00000861
| Epoch  23 |   

| Epoch  30 |   150/  338 batches | lr 0.00010 | ms/batch 11.22 | loss 0.00001032
| Epoch  30 |   175/  338 batches | lr 0.00010 | ms/batch  9.68 | loss 0.00004421
| Epoch  30 |   200/  338 batches | lr 0.00010 | ms/batch 11.14 | loss 0.00010579
| Epoch  30 |   225/  338 batches | lr 0.00010 | ms/batch 11.37 | loss 0.00001983
| Epoch  30 |   250/  338 batches | lr 0.00010 | ms/batch 11.16 | loss 0.00001104
| Epoch  30 |   275/  338 batches | lr 0.00010 | ms/batch  9.57 | loss 0.00000978
| Epoch  30 |   300/  338 batches | lr 0.00010 | ms/batch 11.20 | loss 0.00000657
| Epoch  30 |   325/  338 batches | lr 0.00010 | ms/batch 11.19 | loss 0.00001447
| Epoch  31 |    25/  338 batches | lr 0.00010 | ms/batch 12.06 | loss 0.00001448
| Epoch  31 |    50/  338 batches | lr 0.00010 | ms/batch  9.84 | loss 0.00001209
| Epoch  31 |    75/  338 batches | lr 0.00010 | ms/batch 11.13 | loss 0.00003350
| Epoch  31 |   100/  338 batches | lr 0.00010 | ms/batch 11.14 | loss 0.00005755
| Epoch  31 |   

| Epoch  38 |    50/  338 batches | lr 0.00010 | ms/batch 11.14 | loss 0.00001491
| Epoch  38 |    75/  338 batches | lr 0.00010 | ms/batch 11.18 | loss 0.00003089
| Epoch  38 |   100/  338 batches | lr 0.00010 | ms/batch  9.59 | loss 0.00001907
| Epoch  38 |   125/  338 batches | lr 0.00010 | ms/batch 11.15 | loss 0.00001075
| Epoch  38 |   150/  338 batches | lr 0.00010 | ms/batch 11.18 | loss 0.00000587
| Epoch  38 |   175/  338 batches | lr 0.00010 | ms/batch  9.62 | loss 0.00000805
| Epoch  38 |   200/  338 batches | lr 0.00010 | ms/batch 11.17 | loss 0.00012402
| Epoch  38 |   225/  338 batches | lr 0.00010 | ms/batch 11.39 | loss 0.00003056
| Epoch  38 |   250/  338 batches | lr 0.00010 | ms/batch  9.69 | loss 0.00000749
| Epoch  38 |   275/  338 batches | lr 0.00010 | ms/batch 11.49 | loss 0.00000667
| Epoch  38 |   300/  338 batches | lr 0.00010 | ms/batch 11.40 | loss 0.00000843
| Epoch  38 |   325/  338 batches | lr 0.00010 | ms/batch 11.27 | loss 0.00001015
| Epoch  39 |   

| Epoch  45 |   275/  338 batches | lr 0.00010 | ms/batch 11.13 | loss 0.00000916
| Epoch  45 |   300/  338 batches | lr 0.00010 | ms/batch  9.58 | loss 0.00000967
| Epoch  45 |   325/  338 batches | lr 0.00010 | ms/batch 11.10 | loss 0.00002887
| Epoch  46 |    25/  338 batches | lr 0.00010 | ms/batch 11.55 | loss 0.00003560
| Epoch  46 |    50/  338 batches | lr 0.00010 | ms/batch 11.14 | loss 0.00001520
| Epoch  46 |    75/  338 batches | lr 0.00010 | ms/batch  9.58 | loss 0.00001131
| Epoch  46 |   100/  338 batches | lr 0.00010 | ms/batch 11.12 | loss 0.00000783
| Epoch  46 |   125/  338 batches | lr 0.00010 | ms/batch 11.13 | loss 0.00000464
| Epoch  46 |   150/  338 batches | lr 0.00010 | ms/batch 11.11 | loss 0.00000388
| Epoch  46 |   175/  338 batches | lr 0.00010 | ms/batch  9.58 | loss 0.00010939
| Epoch  46 |   200/  338 batches | lr 0.00010 | ms/batch 11.15 | loss 0.00004613
| Epoch  46 |   225/  338 batches | lr 0.00010 | ms/batch 11.23 | loss 0.00001015
| Epoch  46 |   

| Epoch  53 |   175/  338 batches | lr 0.00010 | ms/batch 11.95 | loss 0.00009634
| Epoch  53 |   200/  338 batches | lr 0.00010 | ms/batch 10.28 | loss 0.00001189
| Epoch  53 |   225/  338 batches | lr 0.00010 | ms/batch 12.41 | loss 0.00000542
| Epoch  53 |   250/  338 batches | lr 0.00010 | ms/batch 12.03 | loss 0.00000407
| Epoch  53 |   275/  338 batches | lr 0.00010 | ms/batch 11.85 | loss 0.00000543
| Epoch  53 |   300/  338 batches | lr 0.00010 | ms/batch  9.97 | loss 0.00000554
| Epoch  53 |   325/  338 batches | lr 0.00010 | ms/batch 11.32 | loss 0.00000391
| Epoch  54 |    25/  338 batches | lr 0.00010 | ms/batch 11.63 | loss 0.00001094
| Epoch  54 |    50/  338 batches | lr 0.00010 | ms/batch 11.22 | loss 0.00002271
| Epoch  54 |    75/  338 batches | lr 0.00010 | ms/batch  9.80 | loss 0.00001241
| Epoch  54 |   100/  338 batches | lr 0.00010 | ms/batch 11.19 | loss 0.00001124
| Epoch  54 |   125/  338 batches | lr 0.00010 | ms/batch 11.76 | loss 0.00001154
| Epoch  54 |   

| Epoch  61 |    75/  338 batches | lr 0.00010 | ms/batch 11.19 | loss 0.00000481
| Epoch  61 |   100/  338 batches | lr 0.00010 | ms/batch 11.17 | loss 0.00000796
| Epoch  61 |   125/  338 batches | lr 0.00010 | ms/batch  9.61 | loss 0.00000397
| Epoch  61 |   150/  338 batches | lr 0.00010 | ms/batch 11.16 | loss 0.00002451
| Epoch  61 |   175/  338 batches | lr 0.00010 | ms/batch 11.14 | loss 0.00001420
| Epoch  61 |   200/  338 batches | lr 0.00010 | ms/batch  9.60 | loss 0.00000930
| Epoch  61 |   225/  338 batches | lr 0.00010 | ms/batch 11.19 | loss 0.00001054
| Epoch  61 |   250/  338 batches | lr 0.00010 | ms/batch 11.51 | loss 0.00000677
| Epoch  61 |   275/  338 batches | lr 0.00010 | ms/batch  9.81 | loss 0.00000319
| Epoch  61 |   300/  338 batches | lr 0.00010 | ms/batch 11.22 | loss 0.00006338
| Epoch  61 |   325/  338 batches | lr 0.00010 | ms/batch 12.74 | loss 0.00001209
| Epoch  62 |    25/  338 batches | lr 0.00010 | ms/batch 12.15 | loss 0.00001575
| Epoch  62 |   

| Epoch  68 |   300/  338 batches | lr 0.00010 | ms/batch 11.30 | loss 0.00000526
| Epoch  68 |   325/  338 batches | lr 0.00010 | ms/batch 10.18 | loss 0.00000895
| Epoch  69 |    25/  338 batches | lr 0.00010 | ms/batch 10.23 | loss 0.00001895
| Epoch  69 |    50/  338 batches | lr 0.00010 | ms/batch 11.34 | loss 0.00001023
| Epoch  69 |    75/  338 batches | lr 0.00010 | ms/batch 11.20 | loss 0.00002724
| Epoch  69 |   100/  338 batches | lr 0.00010 | ms/batch  9.65 | loss 0.00001156
| Epoch  69 |   125/  338 batches | lr 0.00010 | ms/batch 11.18 | loss 0.00000473
| Epoch  69 |   150/  338 batches | lr 0.00010 | ms/batch 11.28 | loss 0.00000530
| Epoch  69 |   175/  338 batches | lr 0.00010 | ms/batch 11.16 | loss 0.00001295
| Epoch  69 |   200/  338 batches | lr 0.00010 | ms/batch  9.65 | loss 0.00000689
| Epoch  69 |   225/  338 batches | lr 0.00010 | ms/batch 11.40 | loss 0.00000546
| Epoch  69 |   250/  338 batches | lr 0.00010 | ms/batch 11.54 | loss 0.00003131
| Epoch  69 |   

| Epoch  76 |   200/  338 batches | lr 0.00010 | ms/batch 11.60 | loss 0.00000476
| Epoch  76 |   225/  338 batches | lr 0.00010 | ms/batch  9.85 | loss 0.00001755
| Epoch  76 |   250/  338 batches | lr 0.00010 | ms/batch 11.51 | loss 0.00000673
| Epoch  76 |   275/  338 batches | lr 0.00010 | ms/batch 11.34 | loss 0.00000362
| Epoch  76 |   300/  338 batches | lr 0.00010 | ms/batch 11.32 | loss 0.00001019
| Epoch  76 |   325/  338 batches | lr 0.00010 | ms/batch  9.80 | loss 0.00002766
| Epoch  77 |    25/  338 batches | lr 0.00010 | ms/batch 10.12 | loss 0.00000902
| Epoch  77 |    50/  338 batches | lr 0.00010 | ms/batch 11.36 | loss 0.00000789
| Epoch  77 |    75/  338 batches | lr 0.00010 | ms/batch 11.54 | loss 0.00000328
| Epoch  77 |   100/  338 batches | lr 0.00010 | ms/batch  9.89 | loss 0.00002027
| Epoch  77 |   125/  338 batches | lr 0.00010 | ms/batch 11.48 | loss 0.00001895
| Epoch  77 |   150/  338 batches | lr 0.00010 | ms/batch 11.90 | loss 0.00001212
| Epoch  77 |   

| Epoch  84 |   100/  338 batches | lr 0.00010 | ms/batch 11.37 | loss 0.00000299
| Epoch  84 |   125/  338 batches | lr 0.00010 | ms/batch 11.61 | loss 0.00000278
| Epoch  84 |   150/  338 batches | lr 0.00010 | ms/batch  9.78 | loss 0.00000317
| Epoch  84 |   175/  338 batches | lr 0.00010 | ms/batch 11.25 | loss 0.00000285
| Epoch  84 |   200/  338 batches | lr 0.00010 | ms/batch 11.79 | loss 0.00000316
| Epoch  84 |   225/  338 batches | lr 0.00010 | ms/batch  9.76 | loss 0.00000255
| Epoch  84 |   250/  338 batches | lr 0.00010 | ms/batch 11.20 | loss 0.00000251
| Epoch  84 |   275/  338 batches | lr 0.00010 | ms/batch 11.67 | loss 0.00000752
| Epoch  84 |   300/  338 batches | lr 0.00010 | ms/batch  9.84 | loss 0.00000712
| Epoch  84 |   325/  338 batches | lr 0.00010 | ms/batch 11.83 | loss 0.00000805
| Epoch  85 |    25/  338 batches | lr 0.00010 | ms/batch 12.07 | loss 0.00001715
| Epoch  85 |    50/  338 batches | lr 0.00010 | ms/batch 11.61 | loss 0.00000649
| Epoch  85 |   

| Epoch  91 |   325/  338 batches | lr 0.00010 | ms/batch 11.57 | loss 0.00001052
| Epoch  92 |    25/  338 batches | lr 0.00010 | ms/batch 12.02 | loss 0.00001414
| Epoch  92 |    50/  338 batches | lr 0.00010 | ms/batch  9.87 | loss 0.00001101
| Epoch  92 |    75/  338 batches | lr 0.00010 | ms/batch 11.59 | loss 0.00000862
| Epoch  92 |   100/  338 batches | lr 0.00010 | ms/batch 11.59 | loss 0.00000865
| Epoch  92 |   125/  338 batches | lr 0.00010 | ms/batch 10.19 | loss 0.00000407
| Epoch  92 |   150/  338 batches | lr 0.00010 | ms/batch 11.65 | loss 0.00000376
| Epoch  92 |   175/  338 batches | lr 0.00010 | ms/batch 11.56 | loss 0.00001549
| Epoch  92 |   200/  338 batches | lr 0.00010 | ms/batch 11.75 | loss 0.00001290
| Epoch  92 |   225/  338 batches | lr 0.00010 | ms/batch  9.98 | loss 0.00001330
| Epoch  92 |   250/  338 batches | lr 0.00010 | ms/batch 11.52 | loss 0.00000403
| Epoch  92 |   275/  338 batches | lr 0.00010 | ms/batch 11.51 | loss 0.00000836
| Epoch  92 |   

| Epoch  99 |   225/  338 batches | lr 0.00010 | ms/batch 11.55 | loss 0.00000389
| Epoch  99 |   250/  338 batches | lr 0.00010 | ms/batch 10.00 | loss 0.00000271
| Epoch  99 |   275/  338 batches | lr 0.00010 | ms/batch 11.55 | loss 0.00000909
| Epoch  99 |   300/  338 batches | lr 0.00010 | ms/batch 11.52 | loss 0.00001245
| Epoch  99 |   325/  338 batches | lr 0.00010 | ms/batch 11.64 | loss 0.00001091
| Epoch 100 |    25/  338 batches | lr 0.00010 | ms/batch 12.04 | loss 0.00001829
| Epoch 100 |    50/  338 batches | lr 0.00010 | ms/batch  9.92 | loss 0.00000893
| Epoch 100 |    75/  338 batches | lr 0.00010 | ms/batch 11.52 | loss 0.00000289
| Epoch 100 |   100/  338 batches | lr 0.00010 | ms/batch 11.61 | loss 0.00000341
| Epoch 100 |   125/  338 batches | lr 0.00010 | ms/batch  9.92 | loss 0.00000702
| Epoch 100 |   150/  338 batches | lr 0.00010 | ms/batch 11.49 | loss 0.00000289
| Epoch 100 |   175/  338 batches | lr 0.00010 | ms/batch 11.48 | loss 0.00000280
| Epoch 100 |   

| Epoch 107 |   125/  338 batches | lr 0.00010 | ms/batch 11.18 | loss 0.00000611
| Epoch 107 |   150/  338 batches | lr 0.00010 | ms/batch 11.19 | loss 0.00000597
| Epoch 107 |   175/  338 batches | lr 0.00010 | ms/batch  9.61 | loss 0.00000346
| Epoch 107 |   200/  338 batches | lr 0.00010 | ms/batch 11.12 | loss 0.00000731
| Epoch 107 |   225/  338 batches | lr 0.00010 | ms/batch 11.21 | loss 0.00000330
| Epoch 107 |   250/  338 batches | lr 0.00010 | ms/batch  9.60 | loss 0.00000278
| Epoch 107 |   275/  338 batches | lr 0.00010 | ms/batch 11.13 | loss 0.00000608
| Epoch 107 |   300/  338 batches | lr 0.00010 | ms/batch 11.20 | loss 0.00000378
| Epoch 107 |   325/  338 batches | lr 0.00010 | ms/batch  9.69 | loss 0.00000483
| Epoch 108 |    25/  338 batches | lr 0.00010 | ms/batch 10.04 | loss 0.00001731
| Epoch 108 |    50/  338 batches | lr 0.00010 | ms/batch 11.12 | loss 0.00000531
| Epoch 108 |    75/  338 batches | lr 0.00010 | ms/batch 11.14 | loss 0.00000483
| Epoch 108 |   

| Epoch 115 |    25/  338 batches | lr 0.00010 | ms/batch 11.49 | loss 0.00000554
| Epoch 115 |    50/  338 batches | lr 0.00010 | ms/batch 11.20 | loss 0.00003255
| Epoch 115 |    75/  338 batches | lr 0.00010 | ms/batch  9.60 | loss 0.00000704
| Epoch 115 |   100/  338 batches | lr 0.00010 | ms/batch 11.12 | loss 0.00000313
| Epoch 115 |   125/  338 batches | lr 0.00010 | ms/batch 11.13 | loss 0.00000261
| Epoch 115 |   150/  338 batches | lr 0.00010 | ms/batch  9.59 | loss 0.00000218
| Epoch 115 |   175/  338 batches | lr 0.00010 | ms/batch 11.10 | loss 0.00000218
| Epoch 115 |   200/  338 batches | lr 0.00010 | ms/batch 11.25 | loss 0.00000483
| Epoch 115 |   225/  338 batches | lr 0.00010 | ms/batch 11.20 | loss 0.00000847
| Epoch 115 |   250/  338 batches | lr 0.00010 | ms/batch  9.58 | loss 0.00000619
| Epoch 115 |   275/  338 batches | lr 0.00010 | ms/batch 11.14 | loss 0.00001278
| Epoch 115 |   300/  338 batches | lr 0.00010 | ms/batch 11.17 | loss 0.00002686
| Epoch 115 |   

| Epoch 122 |   250/  338 batches | lr 0.00010 | ms/batch 11.16 | loss 0.00000359
| Epoch 122 |   275/  338 batches | lr 0.00010 | ms/batch  9.60 | loss 0.00000545
| Epoch 122 |   300/  338 batches | lr 0.00010 | ms/batch 11.13 | loss 0.00000536
| Epoch 122 |   325/  338 batches | lr 0.00010 | ms/batch 11.15 | loss 0.00000341
| Epoch 123 |    25/  338 batches | lr 0.00010 | ms/batch 11.50 | loss 0.00002980
| Epoch 123 |    50/  338 batches | lr 0.00010 | ms/batch 11.21 | loss 0.00001503
| Epoch 123 |    75/  338 batches | lr 0.00010 | ms/batch  9.57 | loss 0.00000348
| Epoch 123 |   100/  338 batches | lr 0.00010 | ms/batch 11.13 | loss 0.00000343
| Epoch 123 |   125/  338 batches | lr 0.00010 | ms/batch 11.20 | loss 0.00000398
| Epoch 123 |   150/  338 batches | lr 0.00010 | ms/batch  9.61 | loss 0.00000428
| Epoch 123 |   175/  338 batches | lr 0.00010 | ms/batch 11.10 | loss 0.00000364
| Epoch 123 |   200/  338 batches | lr 0.00010 | ms/batch 11.15 | loss 0.00000272
| Epoch 123 |   

In [23]:
testing_results

Unnamed: 0,hidden_size,n_layers,act_fun,init_methods,mean_val_result,std_val_result,test_mse,test_mae,test_rmse,test_mape
0,400,8,LeakyReLU,xavier uniform,5.4e-05,0.000105,9e-06,0.002469,0.003063,-0.008389
1,400,4,ReLU,xavier uniform,5.3e-05,8.9e-05,7e-06,0.001971,0.002562,-0.004358
2,400,4,LeakyReLU,xavier uniform,4.5e-05,9.1e-05,3.7e-05,0.00526,0.00606,-0.000668
3,400,6,LeakyReLU,xavier uniform,4.6e-05,8.4e-05,7e-06,0.0022,0.002719,-0.000346
4,400,8,LeakyReLU,xavier uniform,4.9e-05,7.8e-05,1.9e-05,0.003483,0.004334,-0.036244


In [24]:
testing_results.to_csv('../results/testing_results_trinomial.csv')