In [1]:
import os
import sys
import math
import GPUtil
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torchvision
from tqdm import tqdm_notebook as tqdm, tnrange
from transformers.tokenization_bert import BertTokenizer
from transformers import BertModel
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

w_dir = %pwd
work_dir = os.path.dirname(w_dir)
work_dir

I1129 09:16:00.025009 140564403722048 file_utils.py:39] PyTorch version 1.1.0 available.


'/work'

In [2]:
sys.path.append(w_dir+'/fgc_support_retri')

In [3]:
import config
from fgc_preprocess import SerDataset, BertIdx, bert_collate
from sup_model import BertSupSentClassification

In [4]:
from utils import read_fgc, read_hotpot
fgc_items = read_fgc(config.FGC_TRAIN, eval=True)

I1129 09:16:00.624996 140564403722048 corenlp.py:42] Using an existing server http://140.109.19.191:9000
I1129 09:16:01.627814 140564403722048 corenlp.py:118] The server is available.


{'QID': 'D001Q11', 'QTYPE': '申论', 'QTEXT': '苏东坡为何被后人认为是文学艺术史上的通才?', 'ANSWER': [{'ATEXT': '', 'ATOKEN': [{'text': '', 'start': 0}]}], 'ASPAN': [], 'SHINT': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'ATYPE': 'Event', 'AMODE': 'Single-Span-Extraction'}
{'QID': 'D006Q02', 'QTYPE': '申论', 'QTEXT': '「阿拉伯之春」运动中，走上街头的民众的诉求为何?', 'ANSWER': [{'ATEXT': '', 'ATOKEN': [{'text': '', 'start': 0}]}], 'ASPAN': [], 'SHINT': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'ATYPE': 'Object', 'AMODE': 'Single-Span-Extraction'}
{'QID': 'D048Q09', 'QTYPE': '申论', 'QTEXT': '聊天机器人仰赖哪些方法让回答愈来愈准确?', 'ANSWER': [{'ATEXT': '', 'ATOKEN': [{'text': '', 'start': 0}]}], 'ASPAN': [], 'SHINT': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'ATYPE': 'Object', 'AMODE': 'Single-Span-Extraction'}
{'QID': 'D091Q08', 'QTYPE': '进阶题', 'QTEXT': '妻子的叔叔要怎么叫他?', 'ANSWER': [{'ATEXT': '资讯不足无法判定', 'ATOKEN': [{'

In [5]:
hotpot_items = read_hotpot(config.HOTPOT_DEV, eval=True)

7405 questions
7405 documents
47988 sentences
6.480486158001351 sentences/document
7405 questions
18006 supporting evidence sentences
2.4316002700877783 supporting evidence sentences/question


In [6]:
train_items = fgc_items
dev_items = read_fgc(config.FGC_DEV, eval=True)
test_items = read_fgc(config.FGC_TEST, eval=True)

{'QID': 'D009Q03', 'QTYPE': '申论', 'QTEXT': '「占领华尔街」运动的诉求为何?', 'ANSWER': [{'ATEXT': '', 'ATOKEN': [{'text': '', 'start': 0}]}], 'ASPAN': [], 'SHINT': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'ATYPE': 'Object', 'AMODE': 'Single-Span-Extraction'}
{'QID': 'D032Q10', 'QTYPE': '进阶题', 'QTEXT': '第二次签订的北美贸易协定从签署至生效过了几日?', 'ANSWER': [{'ATEXT': '资讯不足无法判定', 'ATOKEN': [{'text': '资讯不足无法判定', 'start': -1}]}], 'ASPAN': [], 'SHINT': [0, 0, 0], 'ATYPE': 'Date-Duration', 'AMODE': 'Date-Duration'}
{'QID': 'D049Q04', 'QTYPE': '申论', 'QTEXT': '「雅婷逐字稿」的命名起源为何?', 'ANSWER': [{'ATEXT': '', 'ATOKEN': [{'text': '', 'start': 0}]}], 'ASPAN': [], 'SHINT': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'ATYPE': 'Event', 'AMODE': 'Single-Span-Extraction'}
{'QID': 'D117Q05', 'QTYPE': '进阶题', 'QTEXT': '是否发现肿瘤就是得到癌症?', 'ANSWER': [{'ATEXT': '否', 'ATOKEN': [{'text': '否', 'start': 109}]}], 'ASPAN': [{'text': '不具入侵能力但失控繁殖的细胞，称为良性肿瘤', 'start': 492, 'end': 513}], 'SHINT': [0, 0, 0, 0, 0, 

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
train_set = SerDataset(train_items, transform=torchvision.transforms.Compose([BertIdx(tokenizer)]))
dev_set = SerDataset(dev_items, transform=torchvision.transforms.Compose([BertIdx(tokenizer)]))
test_set = SerDataset(test_items, transform=torchvision.transforms.Compose([BertIdx(tokenizer)]))

I1129 09:16:02.845812 140564403722048 tokenization_utils.py:375] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt from cache at /root/.cache/torch/transformers/8a0c070123c1f794c42a29c6904beb7c1b8715741e235bee04aca2c7636fc83f.9b42061518a39ca00b8b52059fd2bede8daa613f8a8671500e518a8c29de8c00


In [8]:
len(train_set)

10116

In [9]:
len(dev_set)

6768

In [10]:
len(test_set)

4953

train model

In [11]:
torch.manual_seed(12)
bert_model_name = 'bert-base-chinese'
warmup_proportion = 0.1
learning_rate = 2e-5
num_epochs = 100
eval_frequency = 5
trained_model_path = config.TRAINED_MODELS / "20191129-with_hotpot"

batch_size = 64

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device_num = 0 if torch.cuda.is_available() else -1
n_gpu = torch.cuda.device_count()

bert_encoder = BertModel.from_pretrained(bert_model_name)
model = BertSupSentClassification(bert_encoder)

model.to(device)
if n_gpu > 1:
    model = nn.DataParallel(model)

param_optimizer = list(model.named_parameters())

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

num_train_optimization_steps = int(math.ceil(len(train_set) / batch_size)) * num_epochs

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(num_train_optimization_steps*warmup_proportion),
                                            num_training_steps=num_train_optimization_steps)

print('start training ... ')

I1129 09:16:23.306963 140564403722048 configuration_utils.py:152] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json from cache at /root/.cache/torch/transformers/8a3b1cfe5da58286e12a0f5d7d182b8d6eca88c08e26c332ee3817548cf7e60a.0c16faba8be66db3f02805c912e4cf94d3c9cffc1f12fa1a39906f9270f76d33
I1129 09:16:23.309652 140564403722048 configuration_utils.py:169] Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "po

start training ... 


In [12]:
GPUtil.showUtilization()

| ID | GPU | MEM |
------------------
|  0 |  4% | 12% |
|  1 |  0% |  0% |
|  2 |  0% |  0% |


In [13]:
dataloader_train = DataLoader(train_set, batch_size=batch_size, shuffle=True, collate_fn=bert_collate)
dataloader_dev = DataLoader(dev_set, batch_size=64, collate_fn=bert_collate)

In [14]:
GPUtil.showUtilization()

| ID | GPU | MEM |
------------------
|  0 |  4% | 12% |
|  1 |  0% |  0% |
|  2 |  0% |  0% |


In [15]:
for epoch_i in tnrange(num_epochs+1):
    model.train()
    running_loss = 0.0
    for batch_i, batch in enumerate(tqdm(dataloader_train)):
        optimizer.zero_grad()
        print("batch:{}".format(batch_i))
        GPUtil.showUtilization()
        input_ids = batch['input_ids'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        loss = model(input_ids, token_type_ids=token_type_ids,
                     attention_mask=attention_mask, mode=BertSupSentClassification.ForwardMode.TRAIN,
                     labels=labels)

        if n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu.
            
        loss.backward()
        optimizer.step()
        scheduler.step()
        running_loss += loss.item()
        
    print('epoch %d train_loss: %.3f' % (epoch_i, running_loss/len(dataloader_train)))
            
    if epoch_i % eval_frequency == 0:
        model.eval()

        accum_loss = 0
        with torch.no_grad():
            for batch in dataloader_dev:
                input_ids = batch['input_ids'].to(device)
                token_type_ids = batch['token_type_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)
                loss = model(input_ids, token_type_ids=token_type_ids,
                             attention_mask=attention_mask, mode=BertSupSentClassification.ForwardMode.TRAIN,
                             labels=labels)
                if n_gpu > 1:
                    loss = loss.mean()
                accum_loss += loss
        aver_loss = accum_loss / len(dataloader_dev)
        print('epoch %d eval_loss: %.3f' % (epoch_i, aver_loss))
            
        model_to_save = model.module if hasattr(model, 'module') else model
        torch.save(model_to_save.state_dict(), str(trained_model_path/ "model_epoch{0}_loss_{1:.3f}.m".format(epoch_i, aver_loss)))

HBox(children=(IntProgress(value=0, max=101), HTML(value='')))

HBox(children=(IntProgress(value=0, max=159), HTML(value='')))

batch:0
| ID | GPU | MEM |
------------------
|  0 |  0% | 12% |
|  1 |  0% |  0% |
|  2 |  0% |  0% |




batch:1
| ID | GPU | MEM |
------------------
|  0 | 16% | 30% |
|  1 | 22% | 27% |
|  2 | 21% | 26% |




batch:2
| ID | GPU | MEM |
------------------
|  0 | 40% | 48% |
|  1 | 22% | 39% |
|  2 | 21% | 37% |




batch:3
| ID | GPU | MEM |
------------------
|  0 | 40% | 62% |
|  1 | 30% | 52% |
|  2 | 29% | 49% |




batch:4
| ID | GPU | MEM |
------------------
|  0 | 47% | 74% |
|  1 | 45% | 65% |
|  2 | 44% | 62% |




batch:5
| ID | GPU | MEM |
------------------
|  0 | 43% | 74% |
|  1 | 45% | 65% |
|  2 | 44% | 62% |




batch:6
| ID | GPU | MEM |
------------------
|  0 | 43% | 74% |
|  1 | 36% | 65% |
|  2 | 32% | 62% |




batch:7
| ID | GPU | MEM |
------------------
|  0 | 52% | 74% |
|  1 | 36% | 65% |
|  2 | 32% | 62% |




batch:8
| ID | GPU | MEM |
------------------
|  0 | 52% | 90% |
|  1 | 32% | 82% |
|  2 | 31% | 76% |




batch:9
| ID | GPU | MEM |
------------------
|  0 | 41% | 90% |
|  1 | 50% | 82% |
|  2 | 48% | 76% |




batch:10
| ID | GPU | MEM |
------------------
|  0 | 43% | 91% |
|  1 | 50% | 82% |
|  2 | 48% | 76% |




batch:11
| ID | GPU | MEM |
------------------
|  0 | 43% | 91% |
|  1 | 28% | 82% |
|  2 | 24% | 76% |




batch:12
| ID | GPU | MEM |
------------------
|  0 | 53% | 91% |
|  1 | 28% | 82% |
|  2 | 24% | 76% |




batch:13
| ID | GPU | MEM |
------------------
|  0 | 53% | 91% |
|  1 | 33% | 82% |
|  2 | 33% | 76% |




batch:14
| ID | GPU | MEM |
------------------
|  0 | 38% | 91% |
|  1 | 41% | 82% |
|  2 | 40% | 76% |




batch:15
| ID | GPU | MEM |
------------------
|  0 | 38% | 91% |
|  1 | 41% | 82% |
|  2 | 40% | 76% |




batch:16
| ID | GPU | MEM |
------------------
|  0 | 42% | 91% |
|  1 | 39% | 82% |
|  2 | 33% | 76% |




batch:17
| ID | GPU | MEM |
------------------
|  0 | 47% | 91% |
|  1 | 39% | 82% |
|  2 | 33% | 76% |




batch:18
| ID | GPU | MEM |
------------------
|  0 | 47% | 91% |
|  1 | 26% | 82% |
|  2 | 25% | 76% |




batch:19
| ID | GPU | MEM |
------------------
|  0 | 51% | 91% |
|  1 | 39% | 82% |
|  2 | 40% | 76% |




batch:20
| ID | GPU | MEM |
------------------
|  0 | 51% | 91% |
|  1 | 39% | 82% |
|  2 | 40% | 76% |




batch:21
| ID | GPU | MEM |
------------------
|  0 | 38% | 91% |
|  1 | 47% | 82% |
|  2 | 41% | 76% |




batch:22
| ID | GPU | MEM |
------------------
|  0 | 50% | 91% |
|  1 | 47% | 82% |
|  2 | 41% | 76% |




batch:23
| ID | GPU | MEM |
------------------
|  0 | 50% | 91% |
|  1 | 27% | 82% |
|  2 | 26% | 76% |




batch:24
| ID | GPU | MEM |
------------------
|  0 | 47% | 91% |
|  1 | 27% | 82% |
|  2 | 26% | 76% |




batch:25
| ID | GPU | MEM |
------------------
|  0 | 47% | 91% |
|  1 | 32% | 82% |
|  2 | 31% | 76% |




batch:26
| ID | GPU | MEM |
------------------
|  0 | 40% | 91% |
|  1 | 45% | 82% |
|  2 | 43% | 76% |




batch:27
| ID | GPU | MEM |
------------------
|  0 | 40% | 91% |
|  1 | 45% | 82% |
|  2 | 43% | 76% |




batch:28
| ID | GPU | MEM |
------------------
|  0 | 41% | 91% |
|  1 | 37% | 82% |
|  2 | 31% | 76% |




batch:29
| ID | GPU | MEM |
------------------
|  0 | 52% | 91% |
|  1 | 37% | 82% |
|  2 | 31% | 76% |




batch:30
| ID | GPU | MEM |
------------------
|  0 | 52% | 91% |
|  1 | 27% | 82% |
|  2 | 28% | 76% |




batch:31
| ID | GPU | MEM |
------------------
|  0 | 41% | 91% |
|  1 | 47% | 82% |
|  2 | 44% | 76% |




batch:32
| ID | GPU | MEM |
------------------
|  0 | 52% | 91% |
|  1 | 47% | 82% |
|  2 | 44% | 76% |




batch:33
| ID | GPU | MEM |
------------------
|  0 | 52% | 91% |
|  1 | 36% | 82% |
|  2 | 30% | 76% |




batch:34
| ID | GPU | MEM |
------------------
|  0 | 50% | 91% |
|  1 | 36% | 82% |
|  2 | 35% | 76% |




batch:35
| ID | GPU | MEM |
------------------
|  0 | 50% | 91% |
|  1 | 35% | 82% |
|  2 | 35% | 76% |




batch:36
| ID | GPU | MEM |
------------------
|  0 | 36% | 91% |
|  1 | 43% | 82% |
|  2 | 41% | 76% |




batch:37
| ID | GPU | MEM |
------------------
|  0 | 45% | 91% |
|  1 | 43% | 82% |
|  2 | 41% | 76% |




batch:38
| ID | GPU | MEM |
------------------
|  0 | 45% | 91% |
|  1 | 30% | 82% |
|  2 | 25% | 76% |




batch:39
| ID | GPU | MEM |
------------------
|  0 | 48% | 91% |
|  1 | 30% | 82% |
|  2 | 37% | 76% |




batch:40
| ID | GPU | MEM |
------------------
|  0 | 48% | 91% |
|  1 | 37% | 82% |
|  2 | 37% | 76% |




batch:41
| ID | GPU | MEM |
------------------
|  0 | 37% | 91% |
|  1 | 42% | 82% |
|  2 | 39% | 76% |




batch:42
| ID | GPU | MEM |
------------------
|  0 | 45% | 91% |
|  1 | 42% | 82% |
|  2 | 39% | 76% |




batch:43
| ID | GPU | MEM |
------------------
|  0 | 45% | 91% |
|  1 | 33% | 82% |
|  2 | 26% | 76% |




batch:44
| ID | GPU | MEM |
------------------
|  0 | 51% | 91% |
|  1 | 33% | 82% |
|  2 | 26% | 76% |




batch:45
| ID | GPU | MEM |
------------------
|  0 | 51% | 91% |
|  1 | 31% | 82% |
|  2 | 33% | 76% |




batch:46
| ID | GPU | MEM |
------------------
|  0 | 40% | 91% |
|  1 | 38% | 82% |
|  2 | 36% | 76% |




batch:47
| ID | GPU | MEM |
------------------
|  0 | 40% | 91% |
|  1 | 38% | 82% |
|  2 | 36% | 76% |




batch:48
| ID | GPU | MEM |
------------------
|  0 | 40% | 91% |
|  1 | 40% | 82% |
|  2 | 37% | 76% |




batch:49
| ID | GPU | MEM |
------------------
|  0 | 45% | 91% |
|  1 | 40% | 82% |
|  2 | 37% | 76% |




batch:50
| ID | GPU | MEM |
------------------
|  0 | 45% | 91% |
|  1 | 31% | 82% |
|  2 | 24% | 76% |




batch:51
| ID | GPU | MEM |
------------------
|  0 | 54% | 91% |
|  1 | 40% | 82% |
|  2 | 42% | 76% |




batch:52
| ID | GPU | MEM |
------------------
|  0 | 44% | 91% |
|  1 | 40% | 82% |
|  2 | 42% | 92% |




batch:53
| ID | GPU | MEM |
------------------
|  0 | 44% | 91% |
|  1 | 44% | 82% |
|  2 | 36% | 92% |




batch:54
| ID | GPU | MEM |
------------------
|  0 | 51% | 91% |
|  1 | 43% | 82% |
|  2 | 45% | 92% |




batch:55
| ID | GPU | MEM |
------------------
|  0 | 51% | 91% |
|  1 | 43% | 82% |
|  2 | 45% | 92% |




batch:56
| ID | GPU | MEM |
------------------
|  0 | 39% | 91% |
|  1 | 37% | 82% |
|  2 | 33% | 92% |




batch:57
| ID | GPU | MEM |
------------------
|  0 | 49% | 91% |
|  1 | 37% | 82% |
|  2 | 33% | 92% |




batch:58
| ID | GPU | MEM |
------------------
|  0 | 49% | 91% |
|  1 | 30% | 82% |
|  2 | 25% | 92% |




batch:59
| ID | GPU | MEM |
------------------
|  0 | 49% | 91% |
|  1 | 30% | 82% |
|  2 | 33% | 92% |




batch:60
| ID | GPU | MEM |
------------------
|  0 | 49% | 91% |
|  1 | 32% | 82% |
|  2 | 33% | 92% |




batch:61
| ID | GPU | MEM |
------------------
|  0 | 37% | 91% |
|  1 | 40% | 82% |
|  2 | 39% | 92% |




batch:62
| ID | GPU | MEM |
------------------
|  0 | 37% | 91% |
|  1 | 40% | 82% |
|  2 | 39% | 92% |




batch:63
| ID | GPU | MEM |
------------------
|  0 | 41% | 91% |
|  1 | 40% | 82% |
|  2 | 31% | 92% |




batch:64
| ID | GPU | MEM |
------------------
|  0 | 59% | 91% |
|  1 | 40% | 82% |
|  2 | 31% | 92% |




batch:65
| ID | GPU | MEM |
------------------
|  0 | 59% | 91% |
|  1 | 33% | 82% |
|  2 | 34% | 92% |




batch:66
| ID | GPU | MEM |
------------------
|  0 | 38% | 91% |
|  1 | 36% | 82% |
|  2 | 36% | 92% |




batch:67
| ID | GPU | MEM |
------------------
|  0 | 38% | 91% |
|  1 | 36% | 82% |
|  2 | 36% | 92% |




batch:68
| ID | GPU | MEM |
------------------
|  0 | 40% | 91% |
|  1 | 40% | 82% |
|  2 | 36% | 92% |




batch:69
| ID | GPU | MEM |
------------------
|  0 | 50% | 91% |
|  1 | 40% | 82% |
|  2 | 36% | 92% |




batch:70
| ID | GPU | MEM |
------------------
|  0 | 50% | 91% |
|  1 | 28% | 82% |
|  2 | 25% | 92% |




batch:71
| ID | GPU | MEM |
------------------
|  0 | 40% | 91% |
|  1 | 39% | 82% |
|  2 | 39% | 92% |




batch:72
| ID | GPU | MEM |
------------------
|  0 | 40% | 91% |
|  1 | 39% | 82% |
|  2 | 39% | 92% |




batch:73
| ID | GPU | MEM |
------------------
|  0 | 40% | 91% |
|  1 | 42% | 82% |
|  2 | 33% | 92% |




batch:74
| ID | GPU | MEM |
------------------
|  0 | 56% | 91% |
|  1 | 42% | 82% |
|  2 | 33% | 92% |




batch:75
| ID | GPU | MEM |
------------------
|  0 | 56% | 91% |
|  1 | 30% | 82% |
|  2 | 33% | 92% |




batch:76
| ID | GPU | MEM |
------------------
|  0 | 37% | 91% |
|  1 | 39% | 82% |
|  2 | 38% | 92% |




batch:77
| ID | GPU | MEM |
------------------
|  0 | 37% | 91% |
|  1 | 39% | 82% |
|  2 | 38% | 92% |




batch:78
| ID | GPU | MEM |
------------------
|  0 | 42% | 91% |
|  1 | 43% | 82% |
|  2 | 34% | 92% |




batch:79
| ID | GPU | MEM |
------------------
|  0 | 56% | 91% |
|  1 | 43% | 82% |
|  2 | 34% | 92% |




batch:80
| ID | GPU | MEM |
------------------
|  0 | 56% | 91% |
|  1 | 30% | 82% |
|  2 | 33% | 92% |




batch:81
| ID | GPU | MEM |
------------------
|  0 | 40% | 91% |
|  1 | 42% | 82% |
|  2 | 42% | 92% |




batch:82
| ID | GPU | MEM |
------------------
|  0 | 40% | 91% |
|  1 | 42% | 82% |
|  2 | 42% | 92% |




batch:83
| ID | GPU | MEM |
------------------
|  0 | 41% | 91% |
|  1 | 42% | 82% |
|  2 | 32% | 92% |




batch:84
| ID | GPU | MEM |
------------------
|  0 | 59% | 91% |
|  1 | 42% | 82% |
|  2 | 32% | 92% |




batch:85
| ID | GPU | MEM |
------------------
|  0 | 59% | 91% |
|  1 | 32% | 82% |
|  2 | 34% | 92% |




batch:86
| ID | GPU | MEM |
------------------
|  0 | 37% | 91% |
|  1 | 51% | 82% |
|  2 | 45% | 92% |




batch:87
| ID | GPU | MEM |
------------------
|  0 | 51% | 91% |
|  1 | 51% | 82% |
|  2 | 45% | 92% |




batch:88
| ID | GPU | MEM |
------------------
|  0 | 51% | 91% |
|  1 | 26% | 82% |
|  2 | 26% | 92% |




batch:89
| ID | GPU | MEM |
------------------
|  0 | 48% | 91% |
|  1 | 26% | 82% |
|  2 | 34% | 92% |




batch:90
| ID | GPU | MEM |
------------------
|  0 | 48% | 91% |
|  1 | 32% | 82% |
|  2 | 34% | 92% |




batch:91
| ID | GPU | MEM |
------------------
|  0 | 32% | 91% |
|  1 | 43% | 82% |
|  2 | 42% | 92% |




batch:92
| ID | GPU | MEM |
------------------
|  0 | 47% | 91% |
|  1 | 43% | 82% |
|  2 | 42% | 92% |




batch:93
| ID | GPU | MEM |
------------------
|  0 | 47% | 91% |
|  1 | 36% | 82% |
|  2 | 25% | 92% |




batch:94
| ID | GPU | MEM |
------------------
|  0 | 54% | 91% |
|  1 | 36% | 82% |
|  2 | 36% | 92% |




batch:95
| ID | GPU | MEM |
------------------
|  0 | 54% | 91% |
|  1 | 30% | 82% |
|  2 | 36% | 92% |




batch:96
| ID | GPU | MEM |
------------------
|  0 | 35% | 91% |
|  1 | 44% | 82% |
|  2 | 42% | 92% |




batch:97
| ID | GPU | MEM |
------------------
|  0 | 46% | 91% |
|  1 | 44% | 82% |
|  2 | 42% | 92% |




batch:98
| ID | GPU | MEM  |
-------------------
|  0 | 46% |  62% |
|  1 | 30% | 100% |
|  2 | 23% |  92% |




batch:99
| ID | GPU | MEM  |
-------------------
|  0 | 50% |  62% |
|  1 | 45% | 100% |
|  2 | 39% |  92% |




batch:100
| ID | GPU | MEM  |
-------------------
|  0 | 51% |  62% |
|  1 | 45% | 100% |
|  2 | 27% |  92% |




batch:101
| ID | GPU | MEM  |
-------------------
|  0 | 51% |  62% |
|  1 | 22% | 100% |
|  2 | 27% |  92% |




batch:102
| ID | GPU | MEM  |
-------------------
|  0 | 37% |  62% |
|  1 | 37% | 100% |
|  2 | 36% |  92% |




batch:103
| ID | GPU | MEM  |
-------------------
|  0 | 43% |  62% |
|  1 | 37% | 100% |
|  2 | 36% |  92% |




batch:104
| ID | GPU | MEM  |
-------------------
|  0 | 43% |  63% |
|  1 | 43% | 100% |
|  2 | 31% |  92% |




batch:105
| ID | GPU | MEM  |
-------------------
|  0 | 54% |  63% |
|  1 | 43% | 100% |
|  2 | 35% |  92% |




batch:106
| ID | GPU | MEM  |
-------------------
|  0 | 54% |  63% |
|  1 | 30% | 100% |
|  2 | 35% |  92% |




batch:107
| ID | GPU | MEM  |
-------------------
|  0 | 39% |  63% |
|  1 | 43% | 100% |
|  2 | 42% |  92% |




batch:108
| ID | GPU | MEM  |
-------------------
|  0 | 39% |  63% |
|  1 | 43% | 100% |
|  2 | 42% |  92% |




batch:109
| ID | GPU | MEM  |
-------------------
|  0 | 42% |  63% |
|  1 | 39% | 100% |
|  2 | 27% |  92% |




batch:110
| ID | GPU | MEM  |
-------------------
|  0 | 54% |  63% |
|  1 | 39% | 100% |
|  2 | 27% |  92% |




batch:111
| ID | GPU | MEM  |
-------------------
|  0 | 54% |  63% |
|  1 | 28% | 100% |
|  2 | 34% |  92% |




batch:112
| ID | GPU | MEM  |
-------------------
|  0 | 40% |  63% |
|  1 | 48% | 100% |
|  2 | 47% |  92% |




batch:113
| ID | GPU | MEM  |
-------------------
|  0 | 40% |  63% |
|  1 | 48% | 100% |
|  2 | 47% |  92% |




batch:114
| ID | GPU | MEM  |
-------------------
|  0 | 42% |  63% |
|  1 | 35% | 100% |
|  2 | 25% |  92% |




batch:115
| ID | GPU | MEM  |
-------------------
|  0 | 50% |  63% |
|  1 | 35% | 100% |
|  2 | 25% |  92% |




batch:116
| ID | GPU | MEM  |
-------------------
|  0 | 50% |  63% |
|  1 | 25% | 100% |
|  2 | 31% |  92% |




batch:117
| ID | GPU | MEM  |
-------------------
|  0 | 41% |  63% |
|  1 | 41% | 100% |
|  2 | 40% |  92% |




batch:118
| ID | GPU | MEM  |
-------------------
|  0 | 41% |  63% |
|  1 | 41% | 100% |
|  2 | 40% |  92% |




batch:119
| ID | GPU | MEM  |
-------------------
|  0 | 43% |  63% |
|  1 | 45% | 100% |
|  2 | 34% |  92% |




batch:120
| ID | GPU | MEM  |
-------------------
|  0 | 50% |  63% |
|  1 | 45% | 100% |
|  2 | 34% |  92% |




batch:121
| ID | GPU | MEM  |
-------------------
|  0 | 50% |  63% |
|  1 | 27% | 100% |
|  2 | 28% |  92% |




batch:122
| ID | GPU | MEM  |
-------------------
|  0 | 46% |  63% |
|  1 | 27% | 100% |
|  2 | 40% |  92% |




batch:123
| ID | GPU | MEM  |
-------------------
|  0 | 46% |  63% |
|  1 | 37% | 100% |
|  2 | 40% |  92% |




batch:124
| ID | GPU | MEM  |
-------------------
|  0 | 37% |  63% |
|  1 | 41% | 100% |
|  2 | 36% |  92% |




batch:125
| ID | GPU | MEM  |
-------------------
|  0 | 44% |  63% |
|  1 | 41% | 100% |
|  2 | 36% |  92% |




batch:126
| ID | GPU | MEM  |
-------------------
|  0 | 44% |  63% |
|  1 | 35% | 100% |
|  2 | 25% |  92% |




batch:127
| ID | GPU | MEM  |
-------------------
|  0 | 53% |  63% |
|  1 | 35% | 100% |
|  2 | 25% |  92% |




batch:128
| ID | GPU | MEM  |
-------------------
|  0 | 53% |  63% |
|  1 | 28% | 100% |
|  2 | 35% |  92% |




batch:129
| ID | GPU | MEM  |
-------------------
|  0 | 38% |  63% |
|  1 | 38% | 100% |
|  2 | 37% |  92% |




batch:130
| ID | GPU | MEM  |
-------------------
|  0 | 38% |  63% |
|  1 | 38% | 100% |
|  2 | 37% |  92% |




batch:131
| ID | GPU | MEM  |
-------------------
|  0 | 41% |  63% |
|  1 | 48% | 100% |
|  2 | 35% |  92% |




batch:132
| ID | GPU | MEM  |
-------------------
|  0 | 54% |  63% |
|  1 | 48% | 100% |
|  2 | 35% |  92% |




batch:133
| ID | GPU | MEM  |
-------------------
|  0 | 54% |  63% |
|  1 | 24% | 100% |
|  2 | 30% |  92% |




batch:134
| ID | GPU | MEM  |
-------------------
|  0 | 35% |  63% |
|  1 | 38% | 100% |
|  2 | 38% |  92% |




batch:135
| ID | GPU | MEM  |
-------------------
|  0 | 47% |  63% |
|  1 | 38% | 100% |
|  2 | 38% |  92% |




batch:136




| ID | GPU | MEM  |
-------------------
|  0 | 47% |  63% |
|  1 | 44% | 100% |
|  2 | 30% |  92% |
batch:137
| ID | GPU | MEM  |
-------------------
|  0 | 51% |  63% |
|  1 | 44% | 100% |
|  2 | 30% |  92% |




batch:138
| ID | GPU | MEM  |
-------------------
|  0 | 51% |  63% |
|  1 | 27% | 100% |
|  2 | 34% |  92% |




batch:139
| ID | GPU | MEM  |
-------------------
|  0 | 40% |  63% |
|  1 | 39% | 100% |
|  2 | 39% |  92% |




batch:140
| ID | GPU | MEM  |
-------------------
|  0 | 40% |  63% |
|  1 | 39% | 100% |
|  2 | 39% |  92% |




batch:141
| ID | GPU | MEM  |
-------------------
|  0 | 38% |  63% |
|  1 | 39% | 100% |
|  2 | 34% |  92% |




batch:142
| ID | GPU | MEM  |
-------------------
|  0 | 49% |  63% |
|  1 | 39% | 100% |
|  2 | 34% |  92% |




batch:143
| ID | GPU | MEM  |
-------------------
|  0 | 49% |  63% |
|  1 | 34% | 100% |
|  2 | 27% |  92% |




batch:144
| ID | GPU | MEM  |
-------------------
|  0 | 48% |  63% |
|  1 | 34% | 100% |
|  2 | 40% |  92% |




batch:145
| ID | GPU | MEM  |
-------------------
|  0 | 48% |  63% |
|  1 | 34% | 100% |
|  2 | 40% |  92% |




batch:146
| ID | GPU | MEM  |
-------------------
|  0 | 38% |  64% |
|  1 | 53% | 100% |
|  2 | 38% |  92% |




batch:147
| ID | GPU | MEM  |
-------------------
|  0 | 57% |  64% |
|  1 | 53% | 100% |
|  2 | 38% |  92% |




batch:148
| ID | GPU | MEM  |
-------------------
|  0 | 57% |  64% |
|  1 | 24% | 100% |
|  2 | 32% |  92% |




batch:149
| ID | GPU | MEM  |
-------------------
|  0 | 40% |  64% |
|  1 | 46% | 100% |
|  2 | 46% |  92% |




batch:150
| ID | GPU | MEM  |
-------------------
|  0 | 40% |  64% |
|  1 | 46% | 100% |
|  2 | 46% |  92% |




batch:151
| ID | GPU | MEM  |
-------------------
|  0 | 45% |  64% |
|  1 | 42% | 100% |
|  2 | 27% |  92% |




batch:152
| ID | GPU | MEM  |
-------------------
|  0 | 51% |  64% |
|  1 | 42% | 100% |
|  2 | 32% |  92% |




batch:153
| ID | GPU | MEM  |
-------------------
|  0 | 51% |  64% |
|  1 | 24% | 100% |
|  2 | 32% |  92% |




batch:154
| ID | GPU | MEM  |
-------------------
|  0 | 35% |  64% |
|  1 | 41% | 100% |
|  2 | 41% |  92% |




batch:155
| ID | GPU | MEM  |
-------------------
|  0 | 35% |  64% |
|  1 | 41% | 100% |
|  2 | 41% |  92% |




batch:156
| ID | GPU | MEM  |
-------------------
|  0 | 43% |  64% |
|  1 | 42% | 100% |
|  2 | 26% |  92% |




batch:157
| ID | GPU | MEM  |
-------------------
|  0 | 60% |  64% |
|  1 | 42% | 100% |
|  2 | 40% |  92% |




batch:158
| ID | GPU | MEM  |
-------------------
|  0 | 60% |  64% |
|  1 | 32% | 100% |
|  2 | 40% |  92% |




epoch 0 train_loss: 0.484
epoch 0 eval_loss: 0.262


HBox(children=(IntProgress(value=0, max=159), HTML(value='')))

batch:0
| ID | GPU | MEM |
------------------
|  0 |  0% | 68% |
|  1 |  0% | 18% |
|  2 |  0% | 95% |




batch:1
| ID | GPU | MEM |
------------------
|  0 | 28% | 68% |
|  1 |  0% | 48% |
|  2 | 31% | 95% |




batch:2
| ID | GPU | MEM |
------------------
|  0 | 28% | 68% |
|  1 | 34% | 48% |
|  2 | 31% | 95% |




batch:3
| ID | GPU | MEM |
------------------
|  0 | 37% | 68% |
|  1 | 41% | 48% |
|  2 | 32% | 95% |




batch:4
| ID | GPU | MEM |
------------------
|  0 | 48% | 68% |
|  1 | 41% | 48% |
|  2 | 32% | 95% |




batch:5
| ID | GPU | MEM |
------------------
|  0 | 48% | 68% |
|  1 | 36% | 48% |
|  2 | 29% | 95% |




batch:6
| ID | GPU | MEM |
------------------
|  0 | 51% | 68% |
|  1 | 36% | 48% |
|  2 | 38% | 95% |




batch:7
| ID | GPU | MEM |
------------------
|  0 | 51% | 68% |
|  1 | 30% | 48% |
|  2 | 38% | 95% |




KeyboardInterrupt: 