In [1]:
from __future__ import absolute_import, division, print_function
# 放在第一句，不然会报错

import logging
from operator import index
import os
import random
from matplotlib.pyplot import title
import numpy as np
import pandas as pd
import csv

import torch
import torch.nn as nn
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, Dataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from random import randrange, randint, shuffle, choice, sample

from transformers import BertForMaskedLM,BertConfig,BertTokenizer
from transformers import AdamW
from transformers.optimization import (
    get_constant_schedule,
    get_constant_schedule_with_warmup,
    get_linear_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
    get_cosine_with_hard_restarts_schedule_with_warmup,
    get_polynomial_decay_schedule_with_warmup,
)

import multi_tasks_pretrain_args as args
import json
from sklearn.model_selection import KFold

In [2]:
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [3]:
# Random Mask
def create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq, vocab_list):
    """Creates the predictions for the masked LM objective. This is mostly copied from the Google BERT repo, but
    with several refactors to clean it up and remove a lot of unnecessary variables."""
    # 对一个句子进行MLM
    """
    tokens:一句话的tokens
    masked_lm_prob:覆盖的概率
    max_predictions_per_seq:每个seq，最大预测
    vocab_list:词汇表
    return:经过MLM的tokens，mask的index，mask的label
    """
    cand_indices = []
    for (i, token) in enumerate(tokens):
        if token == "[CLS]" or token == "[SEP]":
            continue
        cand_indices.append(i)
    # 获取除了，特殊的CLS和SEP的词语的token下标
    num_to_mask = min(max_predictions_per_seq,
                      max(1, int(round(len(tokens) * masked_lm_prob))))
    # mask数量，极端情况下，至少为1，最多max_predictions_per_seq
    # print(num_to_mask)
    # print("tokens", len(tokens))
    # print("cand", len(cand_indices))
    shuffle(cand_indices)
    # 打乱cand_indices
    mask_indices = sorted(sample(cand_indices, num_to_mask))
    # 从cand_indices随机抽取num_to_mask个元素，并且以list返回，然后进行排序
    masked_token_labels = []
    for index in mask_indices:
        # 80% of the time, replace with [MASK]
        masked_token = "[MASK]"
        masked_token_labels.append(tokens[index])
        # Once we've saved the true label for that token, we can overwrite it with the masked version
        tokens[index] = masked_token

    return tokens, mask_indices, masked_token_labels

In [4]:
def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens):
    """Truncates a pair of sequences to a maximum sequence length. Lifted from Google's BERT repo."""
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_num_tokens:
            break

        trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        assert len(trunc_tokens) >= 1

        # We want to sometimes truncate from the front and sometimes from the
        # back to add more randomness and avoid biases.
        if random() < 0.5:
            del trunc_tokens[0]
        else:
            trunc_tokens.pop()

In [6]:
def Json_File_Reader(data_name,data_paths,dynamic_mask_times=None):
    all_data = []
    for index,data_path in enumerate(data_paths):
            with open(data_path,'r',encoding='utf-8') as f:
                for item in f.readlines():
                    item = json.loads(item)
                    # item['data_type'] = data_name
                    item['is_val'] = index
                    all_data.append(item)
    
    if dynamic_mask_times:
        temp_data = all_data.copy()
        for i in range(dynamic_mask_times-1):
            all_data = all_data + temp_data
    return all_data

In [7]:
class RTE_Processing:
    def __init__(self,data,tokenizer,max_seq_length,hyperparameter_mask,vocab_list):
        self.data = data
        self.data_type = args.superglue['RTE']
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.masked_lm_prob = hyperparameter_mask['masked_lm_prob']
        self.max_predictions_per_seq = hyperparameter_mask['max_predictions_per_seq']
        self.vocab_list = vocab_list
        self.special_token_length = 2
        # [CLS] and [SEP]
        self.hard_prompt_token_length = 8

        self.max_token_length = self.max_seq_length - self.special_token_length-self.hard_prompt_token_length-args.soft_prompt_length

    def Creat_Input_For_PLMs(self):
        
        data_input = []
        for index,item in tqdm(enumerate(self.data),desc="RTE Data Processing"):
            sentence1 = item['premise'].strip('\n')
            sentence2 = item['hypothesis'].strip('\n').strip('.')
            label = item['label']
            label_ids = self.data_type['label_list'].index(label)

            answer = self.data_type['prompt_answer_list'][label_ids]

            tokens_a = self.tokenizer.tokenize(sentence1)
            tokens_b = self.tokenizer.tokenize(sentence2)
            answer_tokens = self.tokenizer.tokenize(answer)

            if item['is_val']:
                answer_tokens = []
                max_token_length = self.max_token_length + 1
            else:
                max_token_length = self.max_token_length 

            truncate_seq_pair(tokens_a, tokens_b, max_token_length)
            tokens = ["[CLS]"] +["[unused{}]".format(i+1) for i in range(args.soft_prompt_length)]+ tokens_a +["Question",":"] + tokens_b +["?","the","Answer",":"]+ answer_tokens + ["."] + ["[SEP]"]

            start = len(tokens_a)+1+args.soft_prompt_length+2+len(tokens_b)+4
            prompt_positions = [i for i in range(start,start+1)]
            
            tokens, masked_lm_positions, masked_lm_labels = create_masked_lm_predictions(tokens, self.masked_lm_prob, self.max_predictions_per_seq, self.vocab_list)

            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            attention_ids = [1] * len(input_ids)
            segment_ids = [0] * len(input_ids)
            masked_lm_labels = self.tokenizer.convert_tokens_to_ids(masked_lm_labels)
            data_item = {'input_ids':input_ids,
                'segment_ids':segment_ids,
                'attention_ids':attention_ids,
                'masked_lm_positions':masked_lm_positions,
                'masked_lm_labels':masked_lm_labels}
            # no padding
            data_input.append(data_item)

        return data_input

In [8]:
def Divide_Data(data_list,eval_rate):
    """
    data_list: a list contains a dict of item of input for PLMs
    It is not approporite for those task whose contains very few data. In other words, It is unfair.
    """
    kf = KFold(n_splits=int(1/eval_rate),shuffle=True,random_state=args.seed)
    train_data = []
    eval_data = []

    for step,(train_index,eval_index) in enumerate(kf.split(data_list)):
        for i in train_index:
            train_data.append(data_list[i])
        for i in eval_index:
            eval_data.append(data_list[i])
        if step >= 0:
            break
    return train_data,eval_data

In [9]:
class PretrainDataset(Dataset):
    def __init__(self,data,max_seq_length):
        self.max_seq_length = max_seq_length
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self,index):
        data_item = self.data[index]
        input_ids = data_item['input_ids']
        segment_ids = data_item['segment_ids']
        attention_ids = data_item['attention_ids']
        masked_lm_positions = data_item['masked_lm_positions']
        masked_lm_labels = data_item['masked_lm_labels']

        assert len(input_ids)==len(segment_ids)==len(attention_ids)
        padding_length = self.max_seq_length - len(input_ids)

        input_ids += [0] * padding_length
        attention_ids += [0] * padding_length
        segment_ids += [0] * padding_length
        label_ids = np.full(self.max_seq_length, dtype=np.int, fill_value=-100)
        label_ids[masked_lm_positions] = masked_lm_labels

        input_ids = torch.tensor(input_ids,dtype=torch.long)
        segment_ids = torch.tensor(segment_ids,dtype=torch.long)
        attention_ids = torch.tensor(attention_ids,dtype=torch.long)
        label_ids = torch.tensor(label_ids,dtype=torch.long)
        return input_ids, segment_ids, attention_ids, label_ids

In [31]:
def read_data(data_paths,dynamic_mask_times=None,eval_rate=0.1):
        """
        data_path:文件格式为txt，tsv，csv的文件路径
        第一列或者第一二列为文本，最后一列为标签
        dynamic_mask_times:动态掩词
        eval_rate:验证集比例
        """
        sentence1 = []
        sentence2 = []
        label = []
        ID = []
        # ID is a columns to distinguish train data or val data
        for index,data_path in enumerate(data_paths):
            with open(data_path,'r',encoding='utf-8') as f:
                for item in f.readlines():
                    item = json.loads(item)
                    sentence1.append(item['premise'])
                    sentence2.append(item['hypothesis'])
                    label.append(item['label'])
                    ID.append(index)
            
#         all_data = pd.read_csv(data_paths)
#         all_data = pd.read_csv(data_paths, sep='\t', header=None, quoting=csv.QUOTE_NONE,encoding='utf-8')
        # all_data = pd.DataFrame({'premise':sentence1,'hypothesis':sentence2,'label':label})
        all_data = pd.DataFrame({'premise':sentence1,'hypothesis':sentence2,'label':label,'ID':ID})
        
        if dynamic_mask_times:
            temp_data = all_data.copy()
            for _ in range(dynamic_mask_times-1):
                all_data = pd.concat([all_data,temp_data])

        kf = KFold(n_splits=int(1/eval_rate),shuffle=True,random_state=args.seed)
        for step,(train_index,eval_index) in enumerate(kf.split(all_data)):
            eval_data = all_data.iloc[eval_index]
            train_data = all_data.iloc[train_index]
            if step >= 0:
                break
        return train_data,eval_data

In [32]:
class PretrainDataset(Dataset):
    def __init__(self,DATAFRAME,tokenizer,prompt_pattern_list,label_list,max_seq_length,masked_lm_prob,max_predictions_per_seq,vocab_list):
        self.data = DATAFRAME
        self.columns = list(DATAFRAME.columns)
        """
        DATAFRAME:为pandas的DataFrame，需要第一列文本
        """
        self.tokenizer = tokenizer
        self.prompt_pattern_list = prompt_pattern_list
        self.label_list = label_list
        
        self.max_seq_length = max_seq_length
        self.nums_specical_tokens = 2
#         self.max_tokens_nums = max_seq_length - self.nums_specical_tokens - (2+2)-args.soft_prompt_length
        self.max_tokens_nums = max_seq_length - self.nums_specical_tokens - (8)-args.soft_prompt_length
                                                                             
        # 多了两个逗号
        self.masked_lm_prob = masked_lm_prob
        self.max_predictions_per_seq = max_predictions_per_seq
        self.vocab_list = vocab_list
    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self,index):
        text_a = self.data[self.columns[0]].iloc[index].strip('\n')
        text_b = self.data[self.columns[1]].iloc[index].strip('\n')

        label = self.label_list.index(self.data[self.columns[2]].iloc[index])
        ID = self.data[self.columns[3]].iloc[index]
#         label = self.data[self.columns[2]].iloc[index]
        
#         template = choice(self.prompt_pattern_list[label])
        template = self.prompt_pattern_list[label]

#         template = choice(self.prompt_pattern_list[choice([0,1,2])])
        
#         template_label = self.prompt_pattern_list[label]
#         template = self.prompt_pattern[0][0:self.prompt_pattern[1]]+template_label+self.prompt_pattern[0][self.prompt_pattern[1]:]
        
        tokens_template = self.tokenizer.tokenize(template)
        if ID:
            # tokens_template = ["[MASK]"]
            # tokens_template = ["Yes",'or','No']
            tokens_template = []

            # max_tokens_nums = self.max_tokens_nums - 2
            max_tokens_nums = self.max_tokens_nums + 1

        else:
            max_tokens_nums = self.max_tokens_nums
            
        tokens_a = self.tokenizer.tokenize(text_a)
        tokens_b = self.tokenizer.tokenize(text_b)
        truncate_seq_pair(tokens_a, tokens_b, max_tokens_nums)
        # tokens = tokens[:self.max_tokens_nums]

        tokens = ["[CLS]"] +["[unused{}]".format(i+1) for i in range(args.soft_prompt_length)]+ tokens_a +["Question",":"] + tokens_b +["?","The","Answer",":"]+ tokens_template + ["."] + ["[SEP]"]
        # tokens = ["[CLS]"] +["[unused{}]".format(i+1) for i in range(args.soft_prompt_length)]+ tokens_a + tokens_template + [','] + tokens_b+ ["[SEP]"]
        # tokens = ["[CLS]"] +["[unused{}]".format(i+1) for i in range(args.soft_prompt_length)]+ tokens_a + ['，'] + tokens_template + tokens_b+ ["[SEP]"]                                                               
        # masked_labels = tokens.copy()

        start = len(tokens_a)+1+args.soft_prompt_length+2+len(tokens_b)+4
        
        prompt_positions = [i for i in range(start,start+1)]
        
        tokens, masked_lm_positions, masked_lm_labels = create_masked_lm_predictions(tokens, self.masked_lm_prob, self.max_predictions_per_seq, self.vocab_list)

        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        # label_ids = self.tokenizer.conver_tokens_to_ids(masked_labels)

        attention_ids = [1] * len(input_ids)
        # if ID:
        #     attention_ids[start] = 0
            # we do not use attention on the position of answer for validation data set
        padding_length = self.max_seq_length - len(input_ids)

        input_ids += [0] * padding_length
        # label_ids += [0] * padding_length

        attention_ids += [0] * padding_length

        segment_ids = [0] * len(input_ids)
        label_ids = np.full(self.max_seq_length, dtype=np.int, fill_value=-100)
        label_ids[masked_lm_positions] = self.tokenizer.convert_tokens_to_ids(masked_lm_labels)

        input_ids = torch.tensor(input_ids,dtype=torch.long)
        segment_ids = torch.tensor(segment_ids,dtype=torch.long)
        attention_ids = torch.tensor(attention_ids,dtype=torch.long)
        label_ids = torch.tensor(label_ids,dtype=torch.long)

        return input_ids, segment_ids, attention_ids, label_ids

In [33]:
train_data,eval_data = read_data(args.superglue['RTE']['data_path'],10)

In [34]:
TrainDataset = PretrainDataset(train_data,tokenizer,args.superglue['RTE']['prompt_answer_list'],args.superglue['RTE']['label_list'],128,0.25,20,vocab_list)
EvalDataset = PretrainDataset(eval_data,tokenizer,args.superglue['RTE']['prompt_answer_list'],args.superglue['RTE']['label_list'],128,0.25,20,vocab_list)

In [10]:
if not os.path.exists(args.model_save_path):
    os.makedirs(args.model_save_path)

In [11]:
if args.gradient_accumulation_steps < 1:
    raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
        args.gradient_accumulation_steps))

In [2]:
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
# seed everything,python,numpy and torch

np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)

In [51]:
random.seed(42)

In [52]:
random.random()

0.6394267984578837

In [3]:
tokenizer = BertTokenizer.from_pretrained(args.model_name_from_hugging_face)
vocab_list = list(tokenizer.vocab.keys())
config = BertConfig.from_pretrained(args.model_name_from_hugging_face)

In [14]:
RTE_data = Json_File_Reader('RTE',args.superglue['RTE']['data_path'],10)

In [15]:
RTE = RTE_Processing(RTE_data,tokenizer,
                     args.max_seq_length,
                     args.superglue['RTE']['hyperparameter_of_mask'],
                     vocab_list)

In [16]:
RTE_inputs = RTE.Creat_Input_For_PLMs()
# shuffle(RTE_inputs)

RTE Data Processing: 27670it [00:46, 595.65it/s]


In [17]:
train_data,eval_data = Divide_Data(RTE_inputs,0.1)

In [18]:
num_train_optimization_steps = int(len(train_data) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs

In [19]:
model = BertForMaskedLM.from_pretrained(args.model_name_from_hugging_face)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
device = torch.device("cuda")

In [21]:
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [22]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [23]:
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

In [25]:
scheduler = get_linear_schedule_with_warmup(optimizer,num_training_steps=num_train_optimization_steps,num_warmup_steps = num_train_optimization_steps*args.warmup_proportion)

TrainDataset = PretrainDataset(train_data,args.max_seq_length)
EvalDataset = PretrainDataset(eval_data,args.max_seq_length)

TrainDataLoader = DataLoader(TrainDataset,batch_size=args.train_batch_size,shuffle=True)
EvalDataLoader = DataLoader(EvalDataset,batch_size=args.eval_batch_size,shuffle=False)

In [26]:
logger = logging.getLogger(__name__)

In [27]:
global_step = 0
best_loss = 100000
patience = 0

model.train()

for e in trange(int(args.num_train_epochs), desc="Epoch"):
    if patience > args.early_stopping:
        break
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    optimizer.zero_grad()
    for step, batch in enumerate(tqdm(TrainDataLoader, desc="Iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, segment_ids, attention_ids, label_ids = batch
        # masked_lm_loss
        outputs = model(input_ids=input_ids, attention_mask=attention_ids, token_type_ids=segment_ids, labels=label_ids)
        loss = outputs.loss
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps

        if args.fp16:
            optimizer.backward(loss)
        else:
            loss.backward()

        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        if (step + 1) % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

            global_step += 1
        if nb_tr_steps > 0 and nb_tr_steps % 100 == 0:
            logger.info("===================== -epoch %d -train_step %d -train_loss %.4f\n" % (e, nb_tr_steps, tr_loss / nb_tr_steps))
    if nb_tr_steps > 0:
        #################################EVAL#####################################################
        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        for step, batch in enumerate(tqdm(EvalDataLoader, desc="Evaluating")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, segment_ids, attention_ids, label_ids = batch
            with torch.no_grad():
                outputs = model(input_ids=input_ids, attention_mask=attention_ids, token_type_ids=segment_ids, labels=label_ids)
                loss = outputs.loss
            eval_loss += loss.item()
            nb_eval_steps += 1
        print(nb_tr_steps)

        print(tr_loss)
        print(nb_eval_steps)
        print(eval_loss)
        eval_loss = eval_loss / nb_eval_steps
        if eval_loss < best_loss:
            patience = 0
            # Save a trained model, configuration and tokenizer
            print("The eval loss is decreasing!,so we save model!")
            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

            # If we save using the predefined names, we can load using `from_pretrained`
            # output_model_file = os.path.join(model_fold_path, WEIGHTS_NAME)
            output_model_file = os.path.join(args.model_save_path, 'pytorch_model{:.3f}.bin'.format(eval_loss))
            torch.save(model_to_save.state_dict(), output_model_file)
            tokenizer.save_vocabulary(os.path.join(args.model_save_path,'vocab.txt'))
            config.to_json_file(os.path.join(args.model_save_path,'config.json'))
            #torch.save(model.module, output_model_file)

            best_loss = eval_loss

        else:
            patience += 1
        print("============================ -epoch %d -train_loss %.4f -eval_loss %.4f\n"% (e, tr_loss / nb_tr_steps, eval_loss))

    if best_loss<=0.1:
        exit(0)

Epoch:   0%|          | 0/30 [00:00<?, ?it/s]
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  label_ids = np.full(self.max_seq_length, dtype=np.int, fill_value=-100)

Iteration:   0%|          | 1/779 [00:00<03:44,  3.46it/s][A
Iteration:   0%|          | 2/779 [00:00<03:31,  3.68it/s][A
Iteration:   0%|          | 3/779 [00:00<03:03,  4.23it/s][A
Iteration:   1%|          | 4/779 [00:00<02:59,  4.31it/s][A
Iteration:   1%|          | 5/779 [00:01<02:47,  4.61it/s][A
Iteration:   1%|          | 6/779 [00:01<02:51,  4.51it/s][A
Iteration:   1%|          | 7/779 [00:01<02:45,  4.67it/s][A
Iteration:   1%|          | 8/779 [00:01<02:47,  4.60it/s][A
Iteration:   1%|          | 9/779 [00:02<02:50,  4.52it/s][A
Iteration:   1%|▏         | 10/779 [00:02<02:59,  4.28it/s][A
Iteration:   1%|▏         | 11/779 [00:02<02:58,  4.30it/s][A
Iteration:   2%|▏         | 12/779 [00:02<02:55,  4.37it/s][A
Iteration:

Iteration:  32%|███▏      | 247/779 [00:53<01:52,  4.72it/s][A
Iteration:  32%|███▏      | 248/779 [00:53<01:55,  4.60it/s][A
Iteration:  32%|███▏      | 249/779 [00:54<01:54,  4.64it/s][A
Iteration:  32%|███▏      | 250/779 [00:54<01:56,  4.55it/s][A
Iteration:  32%|███▏      | 251/779 [00:54<01:53,  4.65it/s][A
Iteration:  32%|███▏      | 252/779 [00:54<01:57,  4.49it/s][A
Iteration:  32%|███▏      | 253/779 [00:55<01:52,  4.68it/s][A
Iteration:  33%|███▎      | 254/779 [00:55<01:55,  4.54it/s][A
Iteration:  33%|███▎      | 255/779 [00:55<01:53,  4.61it/s][A
Iteration:  33%|███▎      | 256/779 [00:55<01:55,  4.53it/s][A
Iteration:  33%|███▎      | 257/779 [00:55<01:51,  4.67it/s][A
Iteration:  33%|███▎      | 258/779 [00:56<01:52,  4.65it/s][A
Iteration:  33%|███▎      | 259/779 [00:56<01:50,  4.71it/s][A
Iteration:  33%|███▎      | 260/779 [00:56<01:51,  4.64it/s][A
Iteration:  34%|███▎      | 261/779 [00:56<01:50,  4.69it/s][A
Iteration:  34%|███▎      | 262/779 [00:

Iteration:  65%|██████▍   | 503/779 [01:48<00:57,  4.81it/s][A
Iteration:  65%|██████▍   | 504/779 [01:49<01:00,  4.58it/s][A
Iteration:  65%|██████▍   | 505/779 [01:49<00:59,  4.64it/s][A
Iteration:  65%|██████▍   | 506/779 [01:49<00:59,  4.56it/s][A
Iteration:  65%|██████▌   | 507/779 [01:49<00:57,  4.72it/s][A
Iteration:  65%|██████▌   | 508/779 [01:49<00:58,  4.61it/s][A
Iteration:  65%|██████▌   | 509/779 [01:50<00:58,  4.65it/s][A
Iteration:  65%|██████▌   | 510/779 [01:50<00:59,  4.55it/s][A
Iteration:  66%|██████▌   | 511/779 [01:50<00:57,  4.67it/s][A
Iteration:  66%|██████▌   | 512/779 [01:50<00:58,  4.56it/s][A
Iteration:  66%|██████▌   | 513/779 [01:51<00:56,  4.67it/s][A
Iteration:  66%|██████▌   | 514/779 [01:51<00:57,  4.64it/s][A
Iteration:  66%|██████▌   | 515/779 [01:51<00:55,  4.74it/s][A
Iteration:  66%|██████▌   | 516/779 [01:51<00:56,  4.64it/s][A
Iteration:  66%|██████▋   | 517/779 [01:51<00:55,  4.75it/s][A
Iteration:  66%|██████▋   | 518/779 [01:

Iteration:  97%|█████████▋| 759/779 [02:43<00:04,  4.72it/s][A
Iteration:  98%|█████████▊| 760/779 [02:43<00:04,  4.61it/s][A
Iteration:  98%|█████████▊| 761/779 [02:44<00:03,  4.66it/s][A
Iteration:  98%|█████████▊| 762/779 [02:44<00:03,  4.58it/s][A
Iteration:  98%|█████████▊| 763/779 [02:44<00:03,  4.72it/s][A
Iteration:  98%|█████████▊| 764/779 [02:44<00:03,  4.62it/s][A
Iteration:  98%|█████████▊| 765/779 [02:45<00:02,  4.69it/s][A
Iteration:  98%|█████████▊| 766/779 [02:45<00:02,  4.52it/s][A
Iteration:  98%|█████████▊| 767/779 [02:45<00:02,  4.67it/s][A
Iteration:  99%|█████████▊| 768/779 [02:45<00:02,  4.56it/s][A
Iteration:  99%|█████████▊| 769/779 [02:45<00:02,  4.65it/s][A
Iteration:  99%|█████████▉| 770/779 [02:46<00:01,  4.60it/s][A
Iteration:  99%|█████████▉| 771/779 [02:46<00:01,  4.70it/s][A
Iteration:  99%|█████████▉| 772/779 [02:46<00:01,  4.58it/s][A
Iteration:  99%|█████████▉| 773/779 [02:46<00:01,  4.72it/s][A
Iteration:  99%|█████████▉| 774/779 [02:

779
867.3138144612312
87
158.1931689977646
The eval loss is decreasing!,so we save model!


Epoch:   3%|▎         | 1/30 [02:58<1:26:12, 178.35s/it]





Iteration:   0%|          | 0/779 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/779 [00:00<02:36,  4.98it/s][A
Iteration:   0%|          | 2/779 [00:00<02:43,  4.74it/s][A
Iteration:   0%|          | 3/779 [00:00<02:38,  4.90it/s][A
Iteration:   1%|          | 4/779 [00:00<02:45,  4.69it/s][A
Iteration:   1%|          | 5/779 [00:01<02:39,  4.85it/s][A
Iteration:   1%|          | 6/779 [00:01<02:43,  4.72it/s][A
Iteration:   1%|          | 7/779 [00:01<02:40,  4.82it/s][A
Iteration:   1%|          | 8/779 [00:01<02:44,  4.67it/s][A
Iteration:   1%|          | 9/779 [00:01<02:40,  4.80it/s][A
Iteration:   1%|▏         | 10/779 [00:02<02:45,  4.65it/s][A
Iteration:   1%|▏         | 11/779 [00:02<02:40,  4.79it/s][A
Iteration:   2%|▏         | 12/779 [00:02<02:45,  4.64it/s][A
Iteration:   2%|▏         | 13/779 [00:02<02:41,  4.74it/s][A
Iteration:   2%|▏         | 14/779 [00:02<02:47,  4.58it/s][A
Iteration:   2%|▏         | 15/779 [00:03<02:43,  4.68it/s][A
Iteration

Iteration:  33%|███▎      | 257/779 [00:54<01:49,  4.77it/s][A
Iteration:  33%|███▎      | 258/779 [00:54<01:52,  4.62it/s][A
Iteration:  33%|███▎      | 259/779 [00:55<01:52,  4.63it/s][A
Iteration:  33%|███▎      | 260/779 [00:55<01:53,  4.57it/s][A
Iteration:  34%|███▎      | 261/779 [00:55<01:49,  4.71it/s][A
Iteration:  34%|███▎      | 262/779 [00:55<01:51,  4.62it/s][A
Iteration:  34%|███▍      | 263/779 [00:56<01:48,  4.74it/s][A
Iteration:  34%|███▍      | 264/779 [00:56<01:51,  4.61it/s][A
Iteration:  34%|███▍      | 265/779 [00:56<01:49,  4.70it/s][A
Iteration:  34%|███▍      | 266/779 [00:56<01:51,  4.62it/s][A
Iteration:  34%|███▍      | 267/779 [00:56<01:48,  4.72it/s][A
Iteration:  34%|███▍      | 268/779 [00:57<01:50,  4.61it/s][A
Iteration:  35%|███▍      | 269/779 [00:57<01:47,  4.75it/s][A
Iteration:  35%|███▍      | 270/779 [00:57<01:49,  4.63it/s][A
Iteration:  35%|███▍      | 271/779 [00:57<01:47,  4.73it/s][A
Iteration:  35%|███▍      | 272/779 [00:

Iteration:  66%|██████▌   | 513/779 [01:49<00:55,  4.77it/s][A
Iteration:  66%|██████▌   | 514/779 [01:49<00:56,  4.65it/s][A
Iteration:  66%|██████▌   | 515/779 [01:49<00:55,  4.78it/s][A
Iteration:  66%|██████▌   | 516/779 [01:49<00:57,  4.61it/s][A
Iteration:  66%|██████▋   | 517/779 [01:49<00:55,  4.70it/s][A
Iteration:  66%|██████▋   | 518/779 [01:50<00:59,  4.42it/s][A
Iteration:  67%|██████▋   | 519/779 [01:50<00:58,  4.48it/s][A
Iteration:  67%|██████▋   | 520/779 [01:50<00:58,  4.42it/s][A
Iteration:  67%|██████▋   | 521/779 [01:50<00:56,  4.58it/s][A
Iteration:  67%|██████▋   | 522/779 [01:50<00:56,  4.55it/s][A
Iteration:  67%|██████▋   | 523/779 [01:51<00:54,  4.73it/s][A
Iteration:  67%|██████▋   | 524/779 [01:51<00:55,  4.60it/s][A
Iteration:  67%|██████▋   | 525/779 [01:51<00:53,  4.76it/s][A
Iteration:  68%|██████▊   | 526/779 [01:51<00:55,  4.59it/s][A
Iteration:  68%|██████▊   | 527/779 [01:52<00:52,  4.79it/s][A
Iteration:  68%|██████▊   | 528/779 [01:

Iteration:  99%|█████████▊| 769/779 [02:43<00:02,  4.84it/s][A
Iteration:  99%|█████████▉| 770/779 [02:43<00:01,  4.74it/s][A
Iteration:  99%|█████████▉| 771/779 [02:44<00:01,  4.85it/s][A
Iteration:  99%|█████████▉| 772/779 [02:44<00:01,  4.78it/s][A
Iteration:  99%|█████████▉| 773/779 [02:44<00:01,  4.91it/s][A
Iteration:  99%|█████████▉| 774/779 [02:44<00:01,  4.76it/s][A
Iteration:  99%|█████████▉| 775/779 [02:44<00:00,  4.86it/s][A
Iteration: 100%|█████████▉| 776/779 [02:45<00:00,  4.71it/s][A
Iteration: 100%|█████████▉| 777/779 [02:45<00:00,  4.81it/s][A
Iteration: 100%|██████████| 779/779 [02:45<00:00,  4.70it/s][A

Evaluating:   0%|          | 0/87 [00:00<?, ?it/s][A
Evaluating:   1%|          | 1/87 [00:00<00:09,  9.39it/s][A
Evaluating:   2%|▏         | 2/87 [00:00<00:08,  9.58it/s][A
Evaluating:   3%|▎         | 3/87 [00:00<00:08,  9.42it/s][A
Evaluating:   5%|▍         | 4/87 [00:00<00:08,  9.44it/s][A
Evaluating:   6%|▌         | 5/87 [00:00<00:08,  9.29it/s

779
661.6280564665794
87
142.11221885681152
The eval loss is decreasing!,so we save model!


Epoch:   7%|▋         | 2/30 [05:54<1:22:34, 176.96s/it]





Iteration:   0%|          | 0/779 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/779 [00:00<02:32,  5.09it/s][A
Iteration:   0%|          | 2/779 [00:00<02:52,  4.50it/s][A
Iteration:   0%|          | 3/779 [00:00<02:44,  4.71it/s][A
Iteration:   1%|          | 4/779 [00:00<02:49,  4.58it/s][A
Iteration:   1%|          | 5/779 [00:01<02:39,  4.85it/s][A
Iteration:   1%|          | 6/779 [00:01<02:47,  4.60it/s][A
Iteration:   1%|          | 7/779 [00:01<02:44,  4.71it/s][A
Iteration:   1%|          | 8/779 [00:01<02:49,  4.56it/s][A
Iteration:   1%|          | 9/779 [00:01<02:44,  4.67it/s][A
Iteration:   1%|▏         | 10/779 [00:02<02:50,  4.51it/s][A
Iteration:   1%|▏         | 11/779 [00:02<02:44,  4.67it/s][A
Iteration:   2%|▏         | 12/779 [00:02<02:47,  4.57it/s][A
Iteration:   2%|▏         | 13/779 [00:02<02:42,  4.72it/s][A
Iteration:   2%|▏         | 14/779 [00:03<02:48,  4.55it/s][A
Iteration:   2%|▏         | 15/779 [00:03<02:42,  4.70it/s][A
Iteration

Iteration:  33%|███▎      | 257/779 [00:55<01:52,  4.65it/s][A
Iteration:  33%|███▎      | 258/779 [00:55<01:54,  4.54it/s][A
Iteration:  33%|███▎      | 259/779 [00:56<01:51,  4.68it/s][A
Iteration:  33%|███▎      | 260/779 [00:56<01:53,  4.57it/s][A
Iteration:  34%|███▎      | 261/779 [00:56<01:50,  4.70it/s][A
Iteration:  34%|███▎      | 262/779 [00:56<01:53,  4.54it/s][A
Iteration:  34%|███▍      | 263/779 [00:56<01:50,  4.66it/s][A
Iteration:  34%|███▍      | 264/779 [00:57<01:53,  4.53it/s][A
Iteration:  34%|███▍      | 265/779 [00:57<01:50,  4.63it/s][A
Iteration:  34%|███▍      | 266/779 [00:57<01:52,  4.55it/s][A
Iteration:  34%|███▍      | 267/779 [00:57<01:50,  4.65it/s][A
Iteration:  34%|███▍      | 268/779 [00:58<01:52,  4.53it/s][A
Iteration:  35%|███▍      | 269/779 [00:58<01:50,  4.62it/s][A
Iteration:  35%|███▍      | 270/779 [00:58<01:53,  4.50it/s][A
Iteration:  35%|███▍      | 271/779 [00:58<01:48,  4.67it/s][A
Iteration:  35%|███▍      | 272/779 [00:

Iteration:  66%|██████▌   | 513/779 [01:50<00:56,  4.74it/s][A
Iteration:  66%|██████▌   | 514/779 [01:50<00:57,  4.62it/s][A
Iteration:  66%|██████▌   | 515/779 [01:50<00:55,  4.77it/s][A
Iteration:  66%|██████▌   | 516/779 [01:50<00:56,  4.68it/s][A
Iteration:  66%|██████▋   | 517/779 [01:51<00:55,  4.71it/s][A
Iteration:  66%|██████▋   | 518/779 [01:51<00:56,  4.59it/s][A
Iteration:  67%|██████▋   | 519/779 [01:51<00:54,  4.76it/s][A
Iteration:  67%|██████▋   | 520/779 [01:51<00:56,  4.61it/s][A
Iteration:  67%|██████▋   | 521/779 [01:51<00:53,  4.78it/s][A
Iteration:  67%|██████▋   | 522/779 [01:52<00:54,  4.72it/s][A
Iteration:  67%|██████▋   | 523/779 [01:52<00:53,  4.76it/s][A
Iteration:  67%|██████▋   | 524/779 [01:52<00:54,  4.64it/s][A
Iteration:  67%|██████▋   | 525/779 [01:52<00:53,  4.76it/s][A
Iteration:  68%|██████▊   | 526/779 [01:52<00:53,  4.69it/s][A
Iteration:  68%|██████▊   | 527/779 [01:53<00:52,  4.77it/s][A
Iteration:  68%|██████▊   | 528/779 [01:

Iteration:  99%|█████████▊| 769/779 [02:44<00:02,  4.77it/s][A
Iteration:  99%|█████████▉| 770/779 [02:45<00:01,  4.72it/s][A
Iteration:  99%|█████████▉| 771/779 [02:45<00:01,  4.86it/s][A
Iteration:  99%|█████████▉| 772/779 [02:45<00:01,  4.72it/s][A
Iteration:  99%|█████████▉| 773/779 [02:45<00:01,  4.85it/s][A
Iteration:  99%|█████████▉| 774/779 [02:45<00:01,  4.66it/s][A
Iteration:  99%|█████████▉| 775/779 [02:46<00:00,  4.76it/s][A
Iteration: 100%|█████████▉| 776/779 [02:46<00:00,  4.69it/s][A
Iteration: 100%|█████████▉| 777/779 [02:46<00:00,  4.80it/s][A
Iteration: 100%|██████████| 779/779 [02:46<00:00,  4.67it/s][A

Evaluating:   0%|          | 0/87 [00:00<?, ?it/s][A
Evaluating:   1%|          | 1/87 [00:00<00:09,  9.41it/s][A
Evaluating:   2%|▏         | 2/87 [00:00<00:08,  9.70it/s][A
Evaluating:   3%|▎         | 3/87 [00:00<00:08,  9.41it/s][A
Evaluating:   5%|▍         | 4/87 [00:00<00:08,  9.45it/s][A
Evaluating:   6%|▌         | 5/87 [00:00<00:08,  9.20it/s

779
584.4025692343712
87
124.24009490013123
The eval loss is decreasing!,so we save model!


Epoch:  10%|█         | 3/30 [08:51<1:19:40, 177.05s/it]





Iteration:   0%|          | 0/779 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/779 [00:00<02:34,  5.03it/s][A
Iteration:   0%|          | 2/779 [00:00<02:47,  4.64it/s][A
Iteration:   0%|          | 3/779 [00:00<02:39,  4.85it/s][A
Iteration:   1%|          | 4/779 [00:00<02:42,  4.78it/s][A
Iteration:   1%|          | 5/779 [00:01<02:37,  4.93it/s][A
Iteration:   1%|          | 6/779 [00:01<02:43,  4.74it/s][A
Iteration:   1%|          | 7/779 [00:01<02:38,  4.88it/s][A
Iteration:   1%|          | 8/779 [00:01<02:42,  4.75it/s][A
Iteration:   1%|          | 9/779 [00:01<02:39,  4.84it/s][A
Iteration:   1%|▏         | 10/779 [00:02<02:43,  4.71it/s][A
Iteration:   1%|▏         | 11/779 [00:02<02:40,  4.80it/s][A
Iteration:   2%|▏         | 12/779 [00:02<02:42,  4.72it/s][A
Iteration:   2%|▏         | 13/779 [00:02<02:39,  4.81it/s][A
Iteration:   2%|▏         | 14/779 [00:02<02:43,  4.69it/s][A
Iteration:   2%|▏         | 15/779 [00:03<02:39,  4.80it/s][A
Iteration

Iteration:  33%|███▎      | 257/779 [00:54<01:58,  4.42it/s][A
Iteration:  33%|███▎      | 258/779 [00:54<02:00,  4.32it/s][A
Iteration:  33%|███▎      | 259/779 [00:55<01:56,  4.46it/s][A
Iteration:  33%|███▎      | 260/779 [00:55<02:02,  4.22it/s][A
Iteration:  34%|███▎      | 261/779 [00:55<01:59,  4.32it/s][A
Iteration:  34%|███▎      | 262/779 [00:55<02:03,  4.19it/s][A
Iteration:  34%|███▍      | 263/779 [00:56<01:55,  4.45it/s][A
Iteration:  34%|███▍      | 264/779 [00:56<01:55,  4.45it/s][A
Iteration:  34%|███▍      | 265/779 [00:56<01:50,  4.65it/s][A
Iteration:  34%|███▍      | 266/779 [00:56<01:52,  4.57it/s][A
Iteration:  34%|███▍      | 267/779 [00:56<01:49,  4.68it/s][A
Iteration:  34%|███▍      | 268/779 [00:57<01:51,  4.58it/s][A
Iteration:  35%|███▍      | 269/779 [00:57<01:47,  4.72it/s][A
Iteration:  35%|███▍      | 270/779 [00:57<01:50,  4.62it/s][A
Iteration:  35%|███▍      | 271/779 [00:57<01:48,  4.68it/s][A
Iteration:  35%|███▍      | 272/779 [00:

Iteration:  66%|██████▌   | 513/779 [01:49<00:55,  4.77it/s][A
Iteration:  66%|██████▌   | 514/779 [01:49<00:56,  4.73it/s][A
Iteration:  66%|██████▌   | 515/779 [01:49<00:54,  4.81it/s][A
Iteration:  66%|██████▌   | 516/779 [01:49<00:55,  4.72it/s][A
Iteration:  66%|██████▋   | 517/779 [01:50<00:54,  4.85it/s][A
Iteration:  66%|██████▋   | 518/779 [01:50<00:54,  4.76it/s][A
Iteration:  67%|██████▋   | 519/779 [01:50<00:53,  4.85it/s][A
Iteration:  67%|██████▋   | 520/779 [01:50<00:54,  4.71it/s][A
Iteration:  67%|██████▋   | 521/779 [01:51<00:53,  4.83it/s][A
Iteration:  67%|██████▋   | 522/779 [01:51<00:54,  4.70it/s][A
Iteration:  67%|██████▋   | 523/779 [01:51<00:53,  4.80it/s][A
Iteration:  67%|██████▋   | 524/779 [01:51<00:55,  4.56it/s][A
Iteration:  67%|██████▋   | 525/779 [01:51<00:54,  4.68it/s][A
Iteration:  68%|██████▊   | 526/779 [01:52<00:55,  4.59it/s][A
Iteration:  68%|██████▊   | 527/779 [01:52<00:53,  4.72it/s][A
Iteration:  68%|██████▊   | 528/779 [01:

Iteration:  99%|█████████▊| 769/779 [02:43<00:02,  4.82it/s][A
Iteration:  99%|█████████▉| 770/779 [02:44<00:01,  4.68it/s][A
Iteration:  99%|█████████▉| 771/779 [02:44<00:01,  4.82it/s][A
Iteration:  99%|█████████▉| 772/779 [02:44<00:01,  4.70it/s][A
Iteration:  99%|█████████▉| 773/779 [02:44<00:01,  4.83it/s][A
Iteration:  99%|█████████▉| 774/779 [02:44<00:01,  4.71it/s][A
Iteration:  99%|█████████▉| 775/779 [02:45<00:00,  4.85it/s][A
Iteration: 100%|█████████▉| 776/779 [02:45<00:00,  4.74it/s][A
Iteration: 100%|█████████▉| 777/779 [02:45<00:00,  4.80it/s][A
Iteration: 100%|██████████| 779/779 [02:45<00:00,  4.70it/s][A

Evaluating:   0%|          | 0/87 [00:00<?, ?it/s][A
Evaluating:   1%|          | 1/87 [00:00<00:09,  9.35it/s][A
Evaluating:   2%|▏         | 2/87 [00:00<00:08,  9.67it/s][A
Evaluating:   3%|▎         | 3/87 [00:00<00:08,  9.43it/s][A
Evaluating:   5%|▍         | 4/87 [00:00<00:08,  9.48it/s][A
Evaluating:   6%|▌         | 5/87 [00:00<00:08,  9.24it/s

779
503.3688625693321
87
109.18531715869904
The eval loss is decreasing!,so we save model!


Epoch:  13%|█▎        | 4/30 [11:47<1:16:33, 176.69s/it]





Iteration:   0%|          | 0/779 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/779 [00:00<02:31,  5.14it/s][A
Iteration:   0%|          | 2/779 [00:00<02:42,  4.78it/s][A
Iteration:   0%|          | 3/779 [00:00<02:37,  4.93it/s][A
Iteration:   1%|          | 4/779 [00:00<02:43,  4.73it/s][A
Iteration:   1%|          | 5/779 [00:01<02:39,  4.84it/s][A
Iteration:   1%|          | 6/779 [00:01<02:46,  4.64it/s][A
Iteration:   1%|          | 7/779 [00:01<02:42,  4.75it/s][A
Iteration:   1%|          | 8/779 [00:01<02:48,  4.57it/s][A
Iteration:   1%|          | 9/779 [00:01<02:44,  4.69it/s][A
Iteration:   1%|▏         | 10/779 [00:02<02:47,  4.58it/s][A
Iteration:   1%|▏         | 11/779 [00:02<02:42,  4.71it/s][A
Iteration:   2%|▏         | 12/779 [00:02<02:45,  4.64it/s][A
Iteration:   2%|▏         | 13/779 [00:02<02:42,  4.73it/s][A
Iteration:   2%|▏         | 14/779 [00:02<02:43,  4.68it/s][A
Iteration:   2%|▏         | 15/779 [00:03<02:40,  4.75it/s][A
Iteration

Iteration:  33%|███▎      | 257/779 [00:54<01:49,  4.78it/s][A
Iteration:  33%|███▎      | 258/779 [00:54<01:48,  4.79it/s][A
Iteration:  33%|███▎      | 259/779 [00:54<01:46,  4.86it/s][A
Iteration:  33%|███▎      | 260/779 [00:55<01:49,  4.72it/s][A
Iteration:  34%|███▎      | 261/779 [00:55<01:47,  4.81it/s][A
Iteration:  34%|███▎      | 262/779 [00:55<01:50,  4.67it/s][A
Iteration:  34%|███▍      | 263/779 [00:55<01:48,  4.76it/s][A
Iteration:  34%|███▍      | 264/779 [00:55<01:49,  4.68it/s][A
Iteration:  34%|███▍      | 265/779 [00:56<01:47,  4.77it/s][A
Iteration:  34%|███▍      | 266/779 [00:56<01:49,  4.70it/s][A
Iteration:  34%|███▍      | 267/779 [00:56<01:45,  4.87it/s][A
Iteration:  34%|███▍      | 268/779 [00:56<01:48,  4.71it/s][A
Iteration:  35%|███▍      | 269/779 [00:56<01:46,  4.81it/s][A
Iteration:  35%|███▍      | 270/779 [00:57<01:48,  4.68it/s][A
Iteration:  35%|███▍      | 271/779 [00:57<01:45,  4.79it/s][A
Iteration:  35%|███▍      | 272/779 [00:

Iteration:  66%|██████▌   | 513/779 [01:47<00:53,  4.94it/s][A
Iteration:  66%|██████▌   | 514/779 [01:47<00:55,  4.80it/s][A
Iteration:  66%|██████▌   | 515/779 [01:48<00:54,  4.86it/s][A
Iteration:  66%|██████▌   | 516/779 [01:48<00:55,  4.75it/s][A
Iteration:  66%|██████▋   | 517/779 [01:48<00:54,  4.78it/s][A
Iteration:  66%|██████▋   | 518/779 [01:48<00:55,  4.72it/s][A
Iteration:  67%|██████▋   | 519/779 [01:48<00:53,  4.83it/s][A
Iteration:  67%|██████▋   | 520/779 [01:49<00:54,  4.73it/s][A
Iteration:  67%|██████▋   | 521/779 [01:49<00:53,  4.78it/s][A
Iteration:  67%|██████▋   | 522/779 [01:49<00:54,  4.72it/s][A
Iteration:  67%|██████▋   | 523/779 [01:49<00:53,  4.78it/s][A
Iteration:  67%|██████▋   | 524/779 [01:50<00:53,  4.73it/s][A
Iteration:  67%|██████▋   | 525/779 [01:50<00:52,  4.84it/s][A
Iteration:  68%|██████▊   | 526/779 [01:50<00:52,  4.80it/s][A
Iteration:  68%|██████▊   | 527/779 [01:50<00:52,  4.79it/s][A
Iteration:  68%|██████▊   | 528/779 [01:

Iteration:  99%|█████████▊| 769/779 [02:42<00:02,  4.74it/s][A
Iteration:  99%|█████████▉| 770/779 [02:42<00:01,  4.64it/s][A
Iteration:  99%|█████████▉| 771/779 [02:42<00:01,  4.72it/s][A
Iteration:  99%|█████████▉| 772/779 [02:42<00:01,  4.61it/s][A
Iteration:  99%|█████████▉| 773/779 [02:43<00:01,  4.79it/s][A
Iteration:  99%|█████████▉| 774/779 [02:43<00:01,  4.68it/s][A
Iteration:  99%|█████████▉| 775/779 [02:43<00:00,  4.75it/s][A
Iteration: 100%|█████████▉| 776/779 [02:43<00:00,  4.62it/s][A
Iteration: 100%|█████████▉| 777/779 [02:43<00:00,  4.67it/s][A
Iteration: 100%|██████████| 779/779 [02:44<00:00,  4.74it/s][A

Evaluating:   0%|          | 0/87 [00:00<?, ?it/s][A
Evaluating:   1%|          | 1/87 [00:00<00:09,  9.30it/s][A
Evaluating:   3%|▎         | 3/87 [00:00<00:08,  9.62it/s][A
Evaluating:   5%|▍         | 4/87 [00:00<00:08,  9.57it/s][A
Evaluating:   6%|▌         | 5/87 [00:00<00:08,  9.29it/s][A
Evaluating:   7%|▋         | 6/87 [00:00<00:08,  9.20it/s

779
436.06171080470085
87
92.89373207092285
The eval loss is decreasing!,so we save model!


Epoch:  17%|█▋        | 5/30 [14:42<1:13:18, 175.94s/it]





Iteration:   0%|          | 0/779 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/779 [00:00<02:35,  5.02it/s][A
Iteration:   0%|          | 2/779 [00:00<02:47,  4.64it/s][A
Iteration:   0%|          | 3/779 [00:00<02:39,  4.88it/s][A
Iteration:   1%|          | 4/779 [00:00<02:44,  4.70it/s][A
Iteration:   1%|          | 5/779 [00:01<02:40,  4.83it/s][A
Iteration:   1%|          | 6/779 [00:01<02:46,  4.63it/s][A
Iteration:   1%|          | 7/779 [00:01<02:42,  4.76it/s][A
Iteration:   1%|          | 8/779 [00:01<02:48,  4.59it/s][A
Iteration:   1%|          | 9/779 [00:01<02:44,  4.68it/s][A
Iteration:   1%|▏         | 10/779 [00:02<02:48,  4.57it/s][A
Iteration:   1%|▏         | 11/779 [00:02<02:40,  4.78it/s][A
Iteration:   2%|▏         | 12/779 [00:02<02:43,  4.70it/s][A
Iteration:   2%|▏         | 13/779 [00:02<02:39,  4.81it/s][A
Iteration:   2%|▏         | 14/779 [00:02<02:42,  4.70it/s][A
Iteration:   2%|▏         | 15/779 [00:03<02:40,  4.77it/s][A
Iteration

Iteration:  33%|███▎      | 257/779 [00:54<01:48,  4.80it/s][A
Iteration:  33%|███▎      | 258/779 [00:54<01:51,  4.67it/s][A
Iteration:  33%|███▎      | 259/779 [00:55<01:48,  4.80it/s][A
Iteration:  33%|███▎      | 260/779 [00:55<01:50,  4.70it/s][A
Iteration:  34%|███▎      | 261/779 [00:55<01:46,  4.86it/s][A
Iteration:  34%|███▎      | 262/779 [00:55<01:49,  4.70it/s][A
Iteration:  34%|███▍      | 263/779 [00:55<01:47,  4.78it/s][A
Iteration:  34%|███▍      | 264/779 [00:56<01:51,  4.61it/s][A
Iteration:  34%|███▍      | 265/779 [00:56<01:50,  4.65it/s][A
Iteration:  34%|███▍      | 266/779 [00:56<01:52,  4.56it/s][A
Iteration:  34%|███▍      | 267/779 [00:56<01:48,  4.71it/s][A
Iteration:  34%|███▍      | 268/779 [00:57<01:50,  4.61it/s][A
Iteration:  35%|███▍      | 269/779 [00:57<01:48,  4.71it/s][A
Iteration:  35%|███▍      | 270/779 [00:57<01:50,  4.62it/s][A
Iteration:  35%|███▍      | 271/779 [00:57<01:46,  4.76it/s][A
Iteration:  35%|███▍      | 272/779 [00:

Iteration:  66%|██████▌   | 513/779 [01:49<00:56,  4.68it/s][A
Iteration:  66%|██████▌   | 514/779 [01:50<00:57,  4.57it/s][A
Iteration:  66%|██████▌   | 515/779 [01:50<00:55,  4.75it/s][A
Iteration:  66%|██████▌   | 516/779 [01:50<00:57,  4.58it/s][A
Iteration:  66%|██████▋   | 517/779 [01:50<00:55,  4.72it/s][A
Iteration:  66%|██████▋   | 518/779 [01:50<00:56,  4.62it/s][A
Iteration:  67%|██████▋   | 519/779 [01:51<00:54,  4.79it/s][A
Iteration:  67%|██████▋   | 520/779 [01:51<00:55,  4.66it/s][A
Iteration:  67%|██████▋   | 521/779 [01:51<00:53,  4.79it/s][A
Iteration:  67%|██████▋   | 522/779 [01:51<00:55,  4.61it/s][A
Iteration:  67%|██████▋   | 523/779 [01:51<00:53,  4.76it/s][A
Iteration:  67%|██████▋   | 524/779 [01:52<00:55,  4.61it/s][A
Iteration:  67%|██████▋   | 525/779 [01:52<00:52,  4.82it/s][A
Iteration:  68%|██████▊   | 526/779 [01:52<00:54,  4.62it/s][A
Iteration:  68%|██████▊   | 527/779 [01:52<00:52,  4.78it/s][A
Iteration:  68%|██████▊   | 528/779 [01:

Iteration:  99%|█████████▊| 769/779 [02:44<00:02,  4.71it/s][A
Iteration:  99%|█████████▉| 770/779 [02:44<00:01,  4.60it/s][A
Iteration:  99%|█████████▉| 771/779 [02:44<00:01,  4.66it/s][A
Iteration:  99%|█████████▉| 772/779 [02:44<00:01,  4.60it/s][A
Iteration:  99%|█████████▉| 773/779 [02:45<00:01,  4.73it/s][A
Iteration:  99%|█████████▉| 774/779 [02:45<00:01,  4.60it/s][A
Iteration:  99%|█████████▉| 775/779 [02:45<00:00,  4.75it/s][A
Iteration: 100%|█████████▉| 776/779 [02:45<00:00,  4.68it/s][A
Iteration: 100%|█████████▉| 777/779 [02:45<00:00,  4.77it/s][A
Iteration: 100%|██████████| 779/779 [02:46<00:00,  4.69it/s][A

Evaluating:   0%|          | 0/87 [00:00<?, ?it/s][A
Evaluating:   1%|          | 1/87 [00:00<00:09,  9.30it/s][A
Evaluating:   2%|▏         | 2/87 [00:00<00:08,  9.64it/s][A
Evaluating:   3%|▎         | 3/87 [00:00<00:08,  9.36it/s][A
Evaluating:   5%|▍         | 4/87 [00:00<00:08,  9.41it/s][A
Evaluating:   6%|▌         | 5/87 [00:00<00:08,  9.19it/s

779
376.85557466745377
87
82.89273929595947
The eval loss is decreasing!,so we save model!


Epoch:  20%|██        | 6/30 [17:38<1:10:28, 176.17s/it]





Iteration:   0%|          | 0/779 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/779 [00:00<02:30,  5.16it/s][A
Iteration:   0%|          | 2/779 [00:00<02:45,  4.69it/s][A
Iteration:   0%|          | 3/779 [00:00<02:39,  4.87it/s][A
Iteration:   1%|          | 4/779 [00:00<02:46,  4.65it/s][A
Iteration:   1%|          | 5/779 [00:01<02:41,  4.79it/s][A
Iteration:   1%|          | 6/779 [00:01<02:43,  4.72it/s][A
Iteration:   1%|          | 7/779 [00:01<02:40,  4.81it/s][A
Iteration:   1%|          | 8/779 [00:01<02:44,  4.68it/s][A
Iteration:   1%|          | 9/779 [00:01<02:42,  4.73it/s][A
Iteration:   1%|▏         | 10/779 [00:02<02:45,  4.63it/s][A
Iteration:   1%|▏         | 11/779 [00:02<02:40,  4.78it/s][A
Iteration:   2%|▏         | 12/779 [00:02<02:43,  4.70it/s][A
Iteration:   2%|▏         | 13/779 [00:02<02:38,  4.82it/s][A
Iteration:   2%|▏         | 14/779 [00:02<02:42,  4.71it/s][A
Iteration:   2%|▏         | 15/779 [00:03<02:38,  4.82it/s][A
Iteration

Iteration:  33%|███▎      | 257/779 [00:54<01:48,  4.80it/s][A
Iteration:  33%|███▎      | 258/779 [00:55<01:51,  4.69it/s][A
Iteration:  33%|███▎      | 259/779 [00:55<01:50,  4.70it/s][A
Iteration:  33%|███▎      | 260/779 [00:55<01:53,  4.58it/s][A
Iteration:  34%|███▎      | 261/779 [00:55<01:50,  4.68it/s][A
Iteration:  34%|███▎      | 262/779 [00:55<01:53,  4.57it/s][A
Iteration:  34%|███▍      | 263/779 [00:56<01:50,  4.67it/s][A
Iteration:  34%|███▍      | 264/779 [00:56<01:52,  4.60it/s][A
Iteration:  34%|███▍      | 265/779 [00:56<01:49,  4.70it/s][A
Iteration:  34%|███▍      | 266/779 [00:56<01:51,  4.61it/s][A
Iteration:  34%|███▍      | 267/779 [00:56<01:48,  4.74it/s][A
Iteration:  34%|███▍      | 268/779 [00:57<01:52,  4.54it/s][A
Iteration:  35%|███▍      | 269/779 [00:57<01:48,  4.70it/s][A
Iteration:  35%|███▍      | 270/779 [00:57<01:51,  4.57it/s][A
Iteration:  35%|███▍      | 271/779 [00:57<01:48,  4.69it/s][A
Iteration:  35%|███▍      | 272/779 [00:

Iteration:  66%|██████▌   | 513/779 [01:49<00:56,  4.70it/s][A
Iteration:  66%|██████▌   | 514/779 [01:49<00:57,  4.65it/s][A
Iteration:  66%|██████▌   | 515/779 [01:49<00:54,  4.80it/s][A
Iteration:  66%|██████▌   | 516/779 [01:49<00:56,  4.63it/s][A
Iteration:  66%|██████▋   | 517/779 [01:50<00:54,  4.80it/s][A
Iteration:  66%|██████▋   | 518/779 [01:50<00:56,  4.64it/s][A
Iteration:  67%|██████▋   | 519/779 [01:50<00:54,  4.78it/s][A
Iteration:  67%|██████▋   | 520/779 [01:50<00:54,  4.71it/s][A
Iteration:  67%|██████▋   | 521/779 [01:50<00:53,  4.81it/s][A
Iteration:  67%|██████▋   | 522/779 [01:51<00:54,  4.68it/s][A
Iteration:  67%|██████▋   | 523/779 [01:51<00:53,  4.80it/s][A
Iteration:  67%|██████▋   | 524/779 [01:51<00:54,  4.72it/s][A
Iteration:  67%|██████▋   | 525/779 [01:51<00:52,  4.81it/s][A
Iteration:  68%|██████▊   | 526/779 [01:52<00:53,  4.72it/s][A
Iteration:  68%|██████▊   | 527/779 [01:52<00:52,  4.78it/s][A
Iteration:  68%|██████▊   | 528/779 [01:

Iteration:  99%|█████████▊| 769/779 [02:43<00:02,  4.85it/s][A
Iteration:  99%|█████████▉| 770/779 [02:43<00:01,  4.75it/s][A
Iteration:  99%|█████████▉| 771/779 [02:44<00:01,  4.84it/s][A
Iteration:  99%|█████████▉| 772/779 [02:44<00:01,  4.67it/s][A
Iteration:  99%|█████████▉| 773/779 [02:44<00:01,  4.79it/s][A
Iteration:  99%|█████████▉| 774/779 [02:44<00:01,  4.69it/s][A
Iteration:  99%|█████████▉| 775/779 [02:44<00:00,  4.84it/s][A
Iteration: 100%|█████████▉| 776/779 [02:45<00:00,  4.73it/s][A
Iteration: 100%|█████████▉| 777/779 [02:45<00:00,  4.87it/s][A
Iteration: 100%|██████████| 779/779 [02:45<00:00,  4.70it/s][A

Evaluating:   0%|          | 0/87 [00:00<?, ?it/s][A
Evaluating:   1%|          | 1/87 [00:00<00:09,  9.29it/s][A
Evaluating:   2%|▏         | 2/87 [00:00<00:08,  9.57it/s][A
Evaluating:   3%|▎         | 3/87 [00:00<00:09,  9.33it/s][A
Evaluating:   5%|▍         | 4/87 [00:00<00:08,  9.42it/s][A
Evaluating:   6%|▌         | 5/87 [00:00<00:08,  9.20it/s

779
333.30600503087044
87
74.62056630849838
The eval loss is decreasing!,so we save model!


Epoch:  23%|██▎       | 7/30 [20:34<1:07:31, 176.13s/it]





Iteration:   0%|          | 0/779 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/779 [00:00<02:31,  5.15it/s][A
Iteration:   0%|          | 2/779 [00:00<02:47,  4.63it/s][A
Iteration:   0%|          | 3/779 [00:00<02:42,  4.78it/s][A
Iteration:   1%|          | 4/779 [00:00<02:49,  4.56it/s][A
Iteration:   1%|          | 5/779 [00:01<02:44,  4.69it/s][A
Iteration:   1%|          | 6/779 [00:01<02:47,  4.62it/s][A
Iteration:   1%|          | 7/779 [00:01<02:44,  4.70it/s][A
Iteration:   1%|          | 8/779 [00:01<02:46,  4.62it/s][A
Iteration:   1%|          | 9/779 [00:01<02:43,  4.72it/s][A
Iteration:   1%|▏         | 10/779 [00:02<02:45,  4.63it/s][A
Iteration:   1%|▏         | 11/779 [00:02<02:41,  4.75it/s][A
Iteration:   2%|▏         | 12/779 [00:02<02:47,  4.58it/s][A
Iteration:   2%|▏         | 13/779 [00:02<02:42,  4.71it/s][A
Iteration:   2%|▏         | 14/779 [00:03<02:45,  4.61it/s][A
Iteration:   2%|▏         | 15/779 [00:03<02:40,  4.76it/s][A
Iteration

Iteration:  33%|███▎      | 257/779 [00:54<01:59,  4.37it/s][A
Iteration:  33%|███▎      | 258/779 [00:55<02:01,  4.30it/s][A
Iteration:  33%|███▎      | 259/779 [00:55<01:59,  4.34it/s][A
Iteration:  33%|███▎      | 260/779 [00:55<02:00,  4.31it/s][A
Iteration:  34%|███▎      | 261/779 [00:55<02:00,  4.31it/s][A
Iteration:  34%|███▎      | 262/779 [00:55<02:03,  4.18it/s][A
Iteration:  34%|███▍      | 263/779 [00:56<01:56,  4.41it/s][A
Iteration:  34%|███▍      | 264/779 [00:56<01:58,  4.34it/s][A
Iteration:  34%|███▍      | 265/779 [00:56<01:54,  4.51it/s][A
Iteration:  34%|███▍      | 266/779 [00:56<01:55,  4.46it/s][A
Iteration:  34%|███▍      | 267/779 [00:57<01:49,  4.69it/s][A
Iteration:  34%|███▍      | 268/779 [00:57<01:49,  4.65it/s][A
Iteration:  35%|███▍      | 269/779 [00:57<01:46,  4.79it/s][A
Iteration:  35%|███▍      | 270/779 [00:57<01:50,  4.61it/s][A
Iteration:  35%|███▍      | 271/779 [00:57<01:48,  4.70it/s][A
Iteration:  35%|███▍      | 272/779 [00:

Iteration:  66%|██████▌   | 513/779 [01:49<00:56,  4.71it/s][A
Iteration:  66%|██████▌   | 514/779 [01:49<00:57,  4.61it/s][A
Iteration:  66%|██████▌   | 515/779 [01:49<00:55,  4.77it/s][A
Iteration:  66%|██████▌   | 516/779 [01:50<00:56,  4.64it/s][A
Iteration:  66%|██████▋   | 517/779 [01:50<00:55,  4.74it/s][A
Iteration:  66%|██████▋   | 518/779 [01:50<00:55,  4.68it/s][A
Iteration:  67%|██████▋   | 519/779 [01:50<00:54,  4.77it/s][A
Iteration:  67%|██████▋   | 520/779 [01:50<00:56,  4.61it/s][A
Iteration:  67%|██████▋   | 521/779 [01:51<00:54,  4.75it/s][A
Iteration:  67%|██████▋   | 522/779 [01:51<00:55,  4.64it/s][A
Iteration:  67%|██████▋   | 523/779 [01:51<00:54,  4.71it/s][A
Iteration:  67%|██████▋   | 524/779 [01:51<00:56,  4.53it/s][A
Iteration:  67%|██████▋   | 525/779 [01:52<00:54,  4.69it/s][A
Iteration:  68%|██████▊   | 526/779 [01:52<00:54,  4.64it/s][A
Iteration:  68%|██████▊   | 527/779 [01:52<00:53,  4.75it/s][A
Iteration:  68%|██████▊   | 528/779 [01:

Iteration:  99%|█████████▊| 769/779 [02:44<00:02,  4.67it/s][A
Iteration:  99%|█████████▉| 770/779 [02:44<00:01,  4.61it/s][A
Iteration:  99%|█████████▉| 771/779 [02:44<00:01,  4.72it/s][A
Iteration:  99%|█████████▉| 772/779 [02:44<00:01,  4.62it/s][A
Iteration:  99%|█████████▉| 773/779 [02:45<00:01,  4.75it/s][A
Iteration:  99%|█████████▉| 774/779 [02:45<00:01,  4.67it/s][A
Iteration:  99%|█████████▉| 775/779 [02:45<00:00,  4.77it/s][A
Iteration: 100%|█████████▉| 776/779 [02:45<00:00,  4.70it/s][A
Iteration: 100%|█████████▉| 777/779 [02:45<00:00,  4.82it/s][A
Iteration: 100%|██████████| 779/779 [02:46<00:00,  4.69it/s][A

Evaluating:   0%|          | 0/87 [00:00<?, ?it/s][A
Evaluating:   1%|          | 1/87 [00:00<00:09,  9.51it/s][A
Evaluating:   3%|▎         | 3/87 [00:00<00:08,  9.67it/s][A
Evaluating:   5%|▍         | 4/87 [00:00<00:08,  9.59it/s][A
Evaluating:   6%|▌         | 5/87 [00:00<00:08,  9.31it/s][A
Evaluating:   7%|▋         | 6/87 [00:00<00:08,  9.24it/s

779
293.0960951000452
87
65.62185168266296
The eval loss is decreasing!,so we save model!


Epoch:  27%|██▋       | 8/30 [23:31<1:04:38, 176.29s/it]





Iteration:   0%|          | 0/779 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/779 [00:00<02:40,  4.85it/s][A
Iteration:   0%|          | 2/779 [00:00<02:49,  4.59it/s][A
Iteration:   0%|          | 3/779 [00:00<02:42,  4.77it/s][A
Iteration:   1%|          | 4/779 [00:00<02:47,  4.63it/s][A
Iteration:   1%|          | 5/779 [00:01<02:41,  4.78it/s][A
Iteration:   1%|          | 6/779 [00:01<02:46,  4.65it/s][A
Iteration:   1%|          | 7/779 [00:01<02:42,  4.75it/s][A
Iteration:   1%|          | 8/779 [00:01<02:46,  4.63it/s][A
Iteration:   1%|          | 9/779 [00:01<02:44,  4.69it/s][A
Iteration:   1%|▏         | 10/779 [00:02<02:47,  4.60it/s][A
Iteration:   1%|▏         | 11/779 [00:02<02:42,  4.74it/s][A
Iteration:   2%|▏         | 12/779 [00:02<02:45,  4.64it/s][A
Iteration:   2%|▏         | 13/779 [00:02<02:38,  4.84it/s][A
Iteration:   2%|▏         | 14/779 [00:02<02:42,  4.72it/s][A
Iteration:   2%|▏         | 15/779 [00:03<02:39,  4.79it/s][A
Iteration

Iteration:  33%|███▎      | 257/779 [00:54<01:48,  4.80it/s][A
Iteration:  33%|███▎      | 258/779 [00:55<01:51,  4.67it/s][A
Iteration:  33%|███▎      | 259/779 [00:55<01:49,  4.75it/s][A
Iteration:  33%|███▎      | 260/779 [00:55<01:53,  4.58it/s][A
Iteration:  34%|███▎      | 261/779 [00:55<01:48,  4.77it/s][A
Iteration:  34%|███▎      | 262/779 [00:55<01:51,  4.65it/s][A
Iteration:  34%|███▍      | 263/779 [00:56<01:48,  4.77it/s][A
Iteration:  34%|███▍      | 264/779 [00:56<01:50,  4.67it/s][A
Iteration:  34%|███▍      | 265/779 [00:56<01:48,  4.74it/s][A
Iteration:  34%|███▍      | 266/779 [00:56<01:50,  4.64it/s][A
Iteration:  34%|███▍      | 267/779 [00:56<01:46,  4.79it/s][A
Iteration:  34%|███▍      | 268/779 [00:57<01:50,  4.63it/s][A
Iteration:  35%|███▍      | 269/779 [00:57<01:46,  4.79it/s][A
Iteration:  35%|███▍      | 270/779 [00:57<01:49,  4.66it/s][A
Iteration:  35%|███▍      | 271/779 [00:57<01:45,  4.79it/s][A
Iteration:  35%|███▍      | 272/779 [00:

Iteration:  66%|██████▌   | 513/779 [01:49<00:55,  4.78it/s][A
Iteration:  66%|██████▌   | 514/779 [01:49<00:57,  4.62it/s][A
Iteration:  66%|██████▌   | 515/779 [01:49<00:55,  4.78it/s][A
Iteration:  66%|██████▌   | 516/779 [01:49<00:55,  4.73it/s][A
Iteration:  66%|██████▋   | 517/779 [01:50<00:53,  4.87it/s][A
Iteration:  66%|██████▋   | 518/779 [01:50<00:54,  4.76it/s][A
Iteration:  67%|██████▋   | 519/779 [01:50<00:53,  4.90it/s][A
Iteration:  67%|██████▋   | 520/779 [01:50<00:54,  4.79it/s][A
Iteration:  67%|██████▋   | 521/779 [01:50<00:52,  4.88it/s][A
Iteration:  67%|██████▋   | 522/779 [01:51<00:54,  4.70it/s][A
Iteration:  67%|██████▋   | 523/779 [01:51<00:53,  4.80it/s][A
Iteration:  67%|██████▋   | 524/779 [01:51<00:53,  4.73it/s][A
Iteration:  67%|██████▋   | 525/779 [01:51<00:52,  4.80it/s][A
Iteration:  68%|██████▊   | 526/779 [01:52<00:53,  4.73it/s][A
Iteration:  68%|██████▊   | 527/779 [01:52<00:51,  4.85it/s][A
Iteration:  68%|██████▊   | 528/779 [01:

Iteration:  99%|█████████▊| 769/779 [02:44<00:02,  4.74it/s][A
Iteration:  99%|█████████▉| 770/779 [02:44<00:01,  4.64it/s][A
Iteration:  99%|█████████▉| 771/779 [02:44<00:01,  4.80it/s][A
Iteration:  99%|█████████▉| 772/779 [02:44<00:01,  4.68it/s][A
Iteration:  99%|█████████▉| 773/779 [02:45<00:01,  4.81it/s][A
Iteration:  99%|█████████▉| 774/779 [02:45<00:01,  4.69it/s][A
Iteration:  99%|█████████▉| 775/779 [02:45<00:00,  4.81it/s][A
Iteration: 100%|█████████▉| 776/779 [02:45<00:00,  4.75it/s][A
Iteration: 100%|█████████▉| 777/779 [02:45<00:00,  4.77it/s][A
Iteration: 100%|██████████| 779/779 [02:46<00:00,  4.69it/s][A

Evaluating:   0%|          | 0/87 [00:00<?, ?it/s][A
Evaluating:   1%|          | 1/87 [00:00<00:09,  9.33it/s][A
Evaluating:   2%|▏         | 2/87 [00:00<00:08,  9.54it/s][A
Evaluating:   3%|▎         | 3/87 [00:00<00:09,  9.29it/s][A
Evaluating:   5%|▍         | 4/87 [00:00<00:08,  9.33it/s][A
Evaluating:   6%|▌         | 5/87 [00:00<00:09,  9.10it/s

779
262.3829615563154
87
61.07890033721924
The eval loss is decreasing!,so we save model!


Epoch:  30%|███       | 9/30 [26:28<1:01:43, 176.38s/it]





Iteration:   0%|          | 0/779 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/779 [00:00<02:33,  5.05it/s][A
Iteration:   0%|          | 2/779 [00:00<02:49,  4.58it/s][A
Iteration:   0%|          | 3/779 [00:00<02:39,  4.85it/s][A
Iteration:   1%|          | 4/779 [00:00<02:42,  4.77it/s][A
Iteration:   1%|          | 5/779 [00:01<02:40,  4.82it/s][A
Iteration:   1%|          | 6/779 [00:01<02:47,  4.62it/s][A
Iteration:   1%|          | 7/779 [00:01<02:41,  4.77it/s][A
Iteration:   1%|          | 8/779 [00:01<02:47,  4.59it/s][A
Iteration:   1%|          | 9/779 [00:01<02:41,  4.76it/s][A
Iteration:   1%|▏         | 10/779 [00:02<02:44,  4.67it/s][A
Iteration:   1%|▏         | 11/779 [00:02<02:40,  4.79it/s][A
Iteration:   2%|▏         | 12/779 [00:02<02:43,  4.68it/s][A
Iteration:   2%|▏         | 13/779 [00:02<02:38,  4.84it/s][A
Iteration:   2%|▏         | 14/779 [00:02<02:40,  4.75it/s][A
Iteration:   2%|▏         | 15/779 [00:03<02:36,  4.90it/s][A
Iteration

Iteration:  33%|███▎      | 257/779 [00:54<01:48,  4.81it/s][A
Iteration:  33%|███▎      | 258/779 [00:55<01:50,  4.71it/s][A
Iteration:  33%|███▎      | 259/779 [00:55<01:47,  4.82it/s][A
Iteration:  33%|███▎      | 260/779 [00:55<01:50,  4.68it/s][A
Iteration:  34%|███▎      | 261/779 [00:55<01:48,  4.76it/s][A
Iteration:  34%|███▎      | 262/779 [00:56<01:49,  4.70it/s][A
Iteration:  34%|███▍      | 263/779 [00:56<01:47,  4.79it/s][A
Iteration:  34%|███▍      | 264/779 [00:56<01:50,  4.66it/s][A
Iteration:  34%|███▍      | 265/779 [00:56<01:48,  4.74it/s][A
Iteration:  34%|███▍      | 266/779 [00:56<01:50,  4.65it/s][A
Iteration:  34%|███▍      | 267/779 [00:57<01:48,  4.73it/s][A
Iteration:  34%|███▍      | 268/779 [00:57<01:50,  4.63it/s][A
Iteration:  35%|███▍      | 269/779 [00:57<01:47,  4.76it/s][A
Iteration:  35%|███▍      | 270/779 [00:57<01:49,  4.65it/s][A
Iteration:  35%|███▍      | 271/779 [00:57<01:45,  4.84it/s][A
Iteration:  35%|███▍      | 272/779 [00:

Iteration:  66%|██████▌   | 513/779 [01:49<00:59,  4.48it/s][A
Iteration:  66%|██████▌   | 514/779 [01:49<00:59,  4.47it/s][A
Iteration:  66%|██████▌   | 515/779 [01:50<00:56,  4.68it/s][A
Iteration:  66%|██████▌   | 516/779 [01:50<00:57,  4.60it/s][A
Iteration:  66%|██████▋   | 517/779 [01:50<00:55,  4.73it/s][A
Iteration:  66%|██████▋   | 518/779 [01:50<00:56,  4.64it/s][A
Iteration:  67%|██████▋   | 519/779 [01:50<00:54,  4.76it/s][A
Iteration:  67%|██████▋   | 520/779 [01:51<00:55,  4.65it/s][A
Iteration:  67%|██████▋   | 521/779 [01:51<00:54,  4.70it/s][A
Iteration:  67%|██████▋   | 522/779 [01:51<00:55,  4.62it/s][A
Iteration:  67%|██████▋   | 523/779 [01:51<00:55,  4.58it/s][A
Iteration:  67%|██████▋   | 524/779 [01:52<00:57,  4.41it/s][A
Iteration:  67%|██████▋   | 525/779 [01:52<00:55,  4.59it/s][A
Iteration:  68%|██████▊   | 526/779 [01:52<00:55,  4.60it/s][A
Iteration:  68%|██████▊   | 527/779 [01:52<00:53,  4.72it/s][A
Iteration:  68%|██████▊   | 528/779 [01:

Iteration:  99%|█████████▊| 769/779 [02:44<00:02,  4.30it/s][A
Iteration:  99%|█████████▉| 770/779 [02:44<00:02,  4.33it/s][A
Iteration:  99%|█████████▉| 771/779 [02:44<00:01,  4.54it/s][A
Iteration:  99%|█████████▉| 772/779 [02:45<00:01,  4.50it/s][A
Iteration:  99%|█████████▉| 773/779 [02:45<00:01,  4.70it/s][A
Iteration:  99%|█████████▉| 774/779 [02:45<00:01,  4.62it/s][A
Iteration:  99%|█████████▉| 775/779 [02:45<00:00,  4.73it/s][A
Iteration: 100%|█████████▉| 776/779 [02:46<00:00,  4.61it/s][A
Iteration: 100%|█████████▉| 777/779 [02:46<00:00,  4.75it/s][A
Iteration: 100%|██████████| 779/779 [02:46<00:00,  4.68it/s][A

Evaluating:   0%|          | 0/87 [00:00<?, ?it/s][A
Evaluating:   1%|          | 1/87 [00:00<00:09,  9.43it/s][A
Evaluating:   2%|▏         | 2/87 [00:00<00:08,  9.64it/s][A
Evaluating:   3%|▎         | 3/87 [00:00<00:09,  9.32it/s][A
Evaluating:   5%|▍         | 4/87 [00:00<00:08,  9.37it/s][A
Evaluating:   6%|▌         | 5/87 [00:00<00:09,  9.11it/s

779
234.64946827292442
87
54.308121502399445
The eval loss is decreasing!,so we save model!


Epoch:  33%|███▎      | 10/30 [29:24<58:50, 176.53s/it] 





Iteration:   0%|          | 0/779 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/779 [00:00<02:31,  5.12it/s][A
Iteration:   0%|          | 2/779 [00:00<02:48,  4.62it/s][A
Iteration:   0%|          | 3/779 [00:00<02:40,  4.84it/s][A
Iteration:   1%|          | 4/779 [00:00<02:59,  4.31it/s][A
Iteration:   1%|          | 5/779 [00:01<02:53,  4.45it/s][A
Iteration:   1%|          | 6/779 [00:01<02:55,  4.41it/s][A
Iteration:   1%|          | 7/779 [00:01<02:56,  4.37it/s][A
Iteration:   1%|          | 8/779 [00:01<03:01,  4.26it/s][A
Iteration:   1%|          | 9/779 [00:02<02:57,  4.33it/s][A
Iteration:   1%|▏         | 10/779 [00:02<03:06,  4.13it/s][A
Iteration:   1%|▏         | 11/779 [00:02<02:55,  4.37it/s][A
Iteration:   2%|▏         | 12/779 [00:02<03:02,  4.21it/s][A
Iteration:   2%|▏         | 13/779 [00:02<02:56,  4.33it/s][A
Iteration:   2%|▏         | 14/779 [00:03<03:01,  4.21it/s][A
Iteration:   2%|▏         | 15/779 [00:03<02:52,  4.42it/s][A
Iteration

Iteration:  33%|███▎      | 257/779 [00:55<01:51,  4.69it/s][A
Iteration:  33%|███▎      | 258/779 [00:55<01:54,  4.56it/s][A
Iteration:  33%|███▎      | 259/779 [00:55<01:50,  4.72it/s][A
Iteration:  33%|███▎      | 260/779 [00:55<01:51,  4.66it/s][A
Iteration:  34%|███▎      | 261/779 [00:56<01:49,  4.75it/s][A
Iteration:  34%|███▎      | 262/779 [00:56<01:51,  4.63it/s][A
Iteration:  34%|███▍      | 263/779 [00:56<01:47,  4.80it/s][A
Iteration:  34%|███▍      | 264/779 [00:56<01:50,  4.66it/s][A
Iteration:  34%|███▍      | 265/779 [00:57<01:47,  4.77it/s][A
Iteration:  34%|███▍      | 266/779 [00:57<01:53,  4.50it/s][A
Iteration:  34%|███▍      | 267/779 [00:57<01:50,  4.65it/s][A
Iteration:  34%|███▍      | 268/779 [00:57<01:52,  4.54it/s][A
Iteration:  35%|███▍      | 269/779 [00:57<01:49,  4.64it/s][A
Iteration:  35%|███▍      | 270/779 [00:58<01:51,  4.58it/s][A
Iteration:  35%|███▍      | 271/779 [00:58<01:47,  4.72it/s][A
Iteration:  35%|███▍      | 272/779 [00:

Iteration:  66%|██████▌   | 513/779 [01:50<00:57,  4.66it/s][A
Iteration:  66%|██████▌   | 514/779 [01:50<00:57,  4.62it/s][A
Iteration:  66%|██████▌   | 515/779 [01:50<00:55,  4.76it/s][A
Iteration:  66%|██████▌   | 516/779 [01:50<00:57,  4.61it/s][A
Iteration:  66%|██████▋   | 517/779 [01:51<00:55,  4.73it/s][A
Iteration:  66%|██████▋   | 518/779 [01:51<00:56,  4.61it/s][A
Iteration:  67%|██████▋   | 519/779 [01:51<00:54,  4.77it/s][A
Iteration:  67%|██████▋   | 520/779 [01:51<00:55,  4.68it/s][A
Iteration:  67%|██████▋   | 521/779 [01:51<00:53,  4.83it/s][A
Iteration:  67%|██████▋   | 522/779 [01:52<00:54,  4.70it/s][A
Iteration:  67%|██████▋   | 523/779 [01:52<00:53,  4.82it/s][A
Iteration:  67%|██████▋   | 524/779 [01:52<00:55,  4.61it/s][A
Iteration:  67%|██████▋   | 525/779 [01:52<00:53,  4.73it/s][A
Iteration:  68%|██████▊   | 526/779 [01:52<00:54,  4.63it/s][A
Iteration:  68%|██████▊   | 527/779 [01:53<00:52,  4.77it/s][A
Iteration:  68%|██████▊   | 528/779 [01:

Iteration:  99%|█████████▊| 769/779 [02:44<00:02,  4.84it/s][A
Iteration:  99%|█████████▉| 770/779 [02:44<00:01,  4.69it/s][A
Iteration:  99%|█████████▉| 771/779 [02:45<00:01,  4.78it/s][A
Iteration:  99%|█████████▉| 772/779 [02:45<00:01,  4.58it/s][A
Iteration:  99%|█████████▉| 773/779 [02:45<00:01,  4.73it/s][A
Iteration:  99%|█████████▉| 774/779 [02:45<00:01,  4.61it/s][A
Iteration:  99%|█████████▉| 775/779 [02:46<00:00,  4.67it/s][A
Iteration: 100%|█████████▉| 776/779 [02:46<00:00,  4.51it/s][A
Iteration: 100%|█████████▉| 777/779 [02:46<00:00,  4.66it/s][A
Iteration: 100%|██████████| 779/779 [02:46<00:00,  4.67it/s][A

Evaluating:   0%|          | 0/87 [00:00<?, ?it/s][A
Evaluating:   1%|          | 1/87 [00:00<00:09,  9.19it/s][A
Evaluating:   2%|▏         | 2/87 [00:00<00:08,  9.49it/s][A
Evaluating:   3%|▎         | 3/87 [00:00<00:09,  9.27it/s][A
Evaluating:   5%|▍         | 4/87 [00:00<00:08,  9.36it/s][A
Evaluating:   6%|▌         | 5/87 [00:00<00:08,  9.14it/s

779
214.7680231332779
87
48.8370575606823
The eval loss is decreasing!,so we save model!


Epoch:  37%|███▋      | 11/30 [32:22<55:57, 176.71s/it]





Iteration:   0%|          | 0/779 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/779 [00:00<02:34,  5.04it/s][A
Iteration:   0%|          | 2/779 [00:00<02:56,  4.41it/s][A
Iteration:   0%|          | 3/779 [00:00<02:52,  4.50it/s][A
Iteration:   1%|          | 4/779 [00:00<02:59,  4.31it/s][A
Iteration:   1%|          | 5/779 [00:01<02:52,  4.48it/s][A
Iteration:   1%|          | 6/779 [00:01<02:59,  4.31it/s][A
Iteration:   1%|          | 7/779 [00:01<02:47,  4.60it/s][A
Iteration:   1%|          | 8/779 [00:01<02:50,  4.53it/s][A
Iteration:   1%|          | 9/779 [00:01<02:44,  4.68it/s][A
Iteration:   1%|▏         | 10/779 [00:02<02:46,  4.62it/s][A
Iteration:   1%|▏         | 11/779 [00:02<02:43,  4.69it/s][A
Iteration:   2%|▏         | 12/779 [00:02<02:48,  4.56it/s][A
Iteration:   2%|▏         | 13/779 [00:02<02:42,  4.72it/s][A
Iteration:   2%|▏         | 14/779 [00:03<02:47,  4.57it/s][A
Iteration:   2%|▏         | 15/779 [00:03<02:43,  4.67it/s][A
Iteration

Iteration:  33%|███▎      | 257/779 [00:55<01:49,  4.75it/s][A
Iteration:  33%|███▎      | 258/779 [00:55<01:51,  4.66it/s][A
Iteration:  33%|███▎      | 259/779 [00:55<01:49,  4.75it/s][A
Iteration:  33%|███▎      | 260/779 [00:55<01:50,  4.68it/s][A
Iteration:  34%|███▎      | 261/779 [00:55<01:49,  4.75it/s][A
Iteration:  34%|███▎      | 262/779 [00:56<01:51,  4.62it/s][A
Iteration:  34%|███▍      | 263/779 [00:56<01:51,  4.63it/s][A
Iteration:  34%|███▍      | 264/779 [00:56<01:52,  4.58it/s][A
Iteration:  34%|███▍      | 265/779 [00:56<01:48,  4.75it/s][A
Iteration:  34%|███▍      | 266/779 [00:57<01:50,  4.66it/s][A
Iteration:  34%|███▍      | 267/779 [00:57<01:46,  4.81it/s][A
Iteration:  34%|███▍      | 268/779 [00:57<01:48,  4.70it/s][A
Iteration:  35%|███▍      | 269/779 [00:57<01:45,  4.82it/s][A
Iteration:  35%|███▍      | 270/779 [00:57<01:48,  4.69it/s][A
Iteration:  35%|███▍      | 271/779 [00:58<01:45,  4.80it/s][A
Iteration:  35%|███▍      | 272/779 [00:

Iteration:  66%|██████▌   | 513/779 [01:49<00:55,  4.83it/s][A
Iteration:  66%|██████▌   | 514/779 [01:50<00:57,  4.62it/s][A
Iteration:  66%|██████▌   | 515/779 [01:50<00:54,  4.82it/s][A
Iteration:  66%|██████▌   | 516/779 [01:50<00:55,  4.72it/s][A
Iteration:  66%|██████▋   | 517/779 [01:50<00:54,  4.79it/s][A
Iteration:  66%|██████▋   | 518/779 [01:50<00:55,  4.70it/s][A
Iteration:  67%|██████▋   | 519/779 [01:51<00:53,  4.82it/s][A
Iteration:  67%|██████▋   | 520/779 [01:51<00:55,  4.70it/s][A
Iteration:  67%|██████▋   | 521/779 [01:51<00:53,  4.84it/s][A
Iteration:  67%|██████▋   | 522/779 [01:51<00:54,  4.70it/s][A
Iteration:  67%|██████▋   | 523/779 [01:51<00:53,  4.80it/s][A
Iteration:  67%|██████▋   | 524/779 [01:52<00:53,  4.79it/s][A
Iteration:  67%|██████▋   | 525/779 [01:52<00:52,  4.88it/s][A
Iteration:  68%|██████▊   | 526/779 [01:52<00:53,  4.74it/s][A
Iteration:  68%|██████▊   | 527/779 [01:52<00:52,  4.78it/s][A
Iteration:  68%|██████▊   | 528/779 [01:

Iteration:  99%|█████████▊| 769/779 [02:43<00:02,  4.72it/s][A
Iteration:  99%|█████████▉| 770/779 [02:44<00:01,  4.70it/s][A
Iteration:  99%|█████████▉| 771/779 [02:44<00:01,  4.80it/s][A
Iteration:  99%|█████████▉| 772/779 [02:44<00:01,  4.60it/s][A
Iteration:  99%|█████████▉| 773/779 [02:44<00:01,  4.76it/s][A
Iteration:  99%|█████████▉| 774/779 [02:45<00:01,  4.68it/s][A
Iteration:  99%|█████████▉| 775/779 [02:45<00:00,  4.82it/s][A
Iteration: 100%|█████████▉| 776/779 [02:45<00:00,  4.66it/s][A
Iteration: 100%|█████████▉| 777/779 [02:45<00:00,  4.77it/s][A
Iteration: 100%|██████████| 779/779 [02:45<00:00,  4.70it/s][A

Evaluating:   0%|          | 0/87 [00:00<?, ?it/s][A
Evaluating:   1%|          | 1/87 [00:00<00:08,  9.64it/s][A
Evaluating:   3%|▎         | 3/87 [00:00<00:08,  9.69it/s][A
Evaluating:   5%|▍         | 4/87 [00:00<00:08,  9.61it/s][A
Evaluating:   6%|▌         | 5/87 [00:00<00:08,  9.35it/s][A
Evaluating:   7%|▋         | 6/87 [00:00<00:08,  9.27it/s

779
196.6872602701187
87
47.564820766448975
The eval loss is decreasing!,so we save model!


Epoch:  40%|████      | 12/30 [35:18<52:58, 176.57s/it]





Iteration:   0%|          | 0/779 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/779 [00:00<02:37,  4.95it/s][A
Iteration:   0%|          | 2/779 [00:00<02:50,  4.55it/s][A
Iteration:   0%|          | 3/779 [00:00<02:41,  4.80it/s][A
Iteration:   1%|          | 4/779 [00:00<02:48,  4.60it/s][A
Iteration:   1%|          | 5/779 [00:01<02:43,  4.72it/s][A
Iteration:   1%|          | 6/779 [00:01<02:45,  4.66it/s][A
Iteration:   1%|          | 7/779 [00:01<02:42,  4.74it/s][A
Iteration:   1%|          | 8/779 [00:01<02:46,  4.62it/s][A
Iteration:   1%|          | 9/779 [00:01<02:45,  4.66it/s][A
Iteration:   1%|▏         | 10/779 [00:02<02:50,  4.51it/s][A
Iteration:   1%|▏         | 11/779 [00:02<02:44,  4.66it/s][A
Iteration:   2%|▏         | 12/779 [00:02<02:47,  4.59it/s][A
Iteration:   2%|▏         | 13/779 [00:02<02:42,  4.72it/s][A
Iteration:   2%|▏         | 14/779 [00:03<02:46,  4.59it/s][A
Iteration:   2%|▏         | 15/779 [00:03<02:41,  4.73it/s][A
Iteration

Iteration:  33%|███▎      | 257/779 [00:54<01:49,  4.79it/s][A
Iteration:  33%|███▎      | 258/779 [00:54<01:51,  4.68it/s][A
Iteration:  33%|███▎      | 259/779 [00:55<01:50,  4.72it/s][A
Iteration:  33%|███▎      | 260/779 [00:55<01:53,  4.57it/s][A
Iteration:  34%|███▎      | 261/779 [00:55<01:51,  4.65it/s][A
Iteration:  34%|███▎      | 262/779 [00:55<01:53,  4.56it/s][A
Iteration:  34%|███▍      | 263/779 [00:55<01:50,  4.69it/s][A
Iteration:  34%|███▍      | 264/779 [00:56<01:52,  4.60it/s][A
Iteration:  34%|███▍      | 265/779 [00:56<01:49,  4.70it/s][A
Iteration:  34%|███▍      | 266/779 [00:56<01:51,  4.62it/s][A
Iteration:  34%|███▍      | 267/779 [00:56<01:47,  4.77it/s][A
Iteration:  34%|███▍      | 268/779 [00:56<01:50,  4.61it/s][A
Iteration:  35%|███▍      | 269/779 [00:57<01:46,  4.79it/s][A
Iteration:  35%|███▍      | 270/779 [00:57<01:49,  4.64it/s][A
Iteration:  35%|███▍      | 271/779 [00:57<01:46,  4.77it/s][A
Iteration:  35%|███▍      | 272/779 [00:

Iteration:  66%|██████▌   | 513/779 [01:48<00:56,  4.74it/s][A
Iteration:  66%|██████▌   | 514/779 [01:49<00:57,  4.64it/s][A
Iteration:  66%|██████▌   | 515/779 [01:49<00:55,  4.77it/s][A
Iteration:  66%|██████▌   | 516/779 [01:49<00:55,  4.70it/s][A
Iteration:  66%|██████▋   | 517/779 [01:49<00:54,  4.83it/s][A
Iteration:  66%|██████▋   | 518/779 [01:49<00:54,  4.75it/s][A
Iteration:  67%|██████▋   | 519/779 [01:50<00:54,  4.80it/s][A
Iteration:  67%|██████▋   | 520/779 [01:50<00:55,  4.69it/s][A
Iteration:  67%|██████▋   | 521/779 [01:50<00:54,  4.77it/s][A
Iteration:  67%|██████▋   | 522/779 [01:50<00:54,  4.68it/s][A
Iteration:  67%|██████▋   | 523/779 [01:51<00:53,  4.79it/s][A
Iteration:  67%|██████▋   | 524/779 [01:51<00:54,  4.69it/s][A
Iteration:  67%|██████▋   | 525/779 [01:51<00:52,  4.87it/s][A
Iteration:  68%|██████▊   | 526/779 [01:51<00:54,  4.64it/s][A
Iteration:  68%|██████▊   | 527/779 [01:51<00:52,  4.76it/s][A
Iteration:  68%|██████▊   | 528/779 [01:

Iteration:  99%|█████████▊| 769/779 [02:43<00:02,  4.85it/s][A
Iteration:  99%|█████████▉| 770/779 [02:43<00:01,  4.75it/s][A
Iteration:  99%|█████████▉| 771/779 [02:43<00:01,  4.83it/s][A
Iteration:  99%|█████████▉| 772/779 [02:43<00:01,  4.74it/s][A
Iteration:  99%|█████████▉| 773/779 [02:43<00:01,  4.82it/s][A
Iteration:  99%|█████████▉| 774/779 [02:44<00:01,  4.70it/s][A
Iteration:  99%|█████████▉| 775/779 [02:44<00:00,  4.79it/s][A
Iteration: 100%|█████████▉| 776/779 [02:44<00:00,  4.68it/s][A
Iteration: 100%|█████████▉| 777/779 [02:44<00:00,  4.77it/s][A
Iteration: 100%|██████████| 779/779 [02:44<00:00,  4.72it/s][A

Evaluating:   0%|          | 0/87 [00:00<?, ?it/s][A
Evaluating:   1%|          | 1/87 [00:00<00:08,  9.56it/s][A
Evaluating:   2%|▏         | 2/87 [00:00<00:08,  9.79it/s][A
Evaluating:   3%|▎         | 3/87 [00:00<00:08,  9.49it/s][A
Evaluating:   5%|▍         | 4/87 [00:00<00:08,  9.49it/s][A
Evaluating:   6%|▌         | 5/87 [00:00<00:08,  9.25it/s

779
183.14746241271496
87
43.0563078969717
The eval loss is decreasing!,so we save model!


Epoch:  43%|████▎     | 13/30 [38:13<49:55, 176.21s/it]





Iteration:   0%|          | 0/779 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/779 [00:00<02:36,  4.98it/s][A
Iteration:   0%|          | 2/779 [00:00<02:49,  4.57it/s][A
Iteration:   0%|          | 3/779 [00:00<02:39,  4.87it/s][A
Iteration:   1%|          | 4/779 [00:00<02:45,  4.69it/s][A
Iteration:   1%|          | 5/779 [00:01<02:42,  4.76it/s][A
Iteration:   1%|          | 6/779 [00:01<02:47,  4.60it/s][A
Iteration:   1%|          | 7/779 [00:01<02:44,  4.69it/s][A
Iteration:   1%|          | 8/779 [00:01<02:47,  4.59it/s][A
Iteration:   1%|          | 9/779 [00:01<02:44,  4.68it/s][A
Iteration:   1%|▏         | 10/779 [00:02<02:47,  4.58it/s][A
Iteration:   1%|▏         | 11/779 [00:02<02:43,  4.69it/s][A
Iteration:   2%|▏         | 12/779 [00:02<02:47,  4.57it/s][A
Iteration:   2%|▏         | 13/779 [00:02<02:42,  4.72it/s][A
Iteration:   2%|▏         | 14/779 [00:03<02:44,  4.66it/s][A
Iteration:   2%|▏         | 15/779 [00:03<02:38,  4.82it/s][A
Iteration

Iteration:  33%|███▎      | 257/779 [00:54<01:49,  4.78it/s][A
Iteration:  33%|███▎      | 258/779 [00:54<01:51,  4.68it/s][A
Iteration:  33%|███▎      | 259/779 [00:54<01:48,  4.79it/s][A
Iteration:  33%|███▎      | 260/779 [00:55<01:51,  4.64it/s][A
Iteration:  34%|███▎      | 261/779 [00:55<01:49,  4.74it/s][A
Iteration:  34%|███▎      | 262/779 [00:55<01:50,  4.68it/s][A
Iteration:  34%|███▍      | 263/779 [00:55<01:47,  4.79it/s][A
Iteration:  34%|███▍      | 264/779 [00:56<01:52,  4.57it/s][A
Iteration:  34%|███▍      | 265/779 [00:56<01:51,  4.62it/s][A
Iteration:  34%|███▍      | 266/779 [00:56<01:51,  4.59it/s][A
Iteration:  34%|███▍      | 267/779 [00:56<01:49,  4.67it/s][A
Iteration:  34%|███▍      | 268/779 [00:56<01:49,  4.65it/s][A
Iteration:  35%|███▍      | 269/779 [00:57<01:46,  4.78it/s][A
Iteration:  35%|███▍      | 270/779 [00:57<01:47,  4.74it/s][A
Iteration:  35%|███▍      | 271/779 [00:57<01:45,  4.81it/s][A
Iteration:  35%|███▍      | 272/779 [00:

Iteration:  66%|██████▌   | 513/779 [01:47<00:54,  4.85it/s][A
Iteration:  66%|██████▌   | 514/779 [01:47<00:54,  4.86it/s][A
Iteration:  66%|██████▌   | 515/779 [01:47<00:53,  4.94it/s][A
Iteration:  66%|██████▌   | 516/779 [01:48<00:53,  4.90it/s][A
Iteration:  66%|██████▋   | 517/779 [01:48<00:52,  4.97it/s][A
Iteration:  66%|██████▋   | 518/779 [01:48<00:54,  4.83it/s][A
Iteration:  67%|██████▋   | 519/779 [01:48<00:53,  4.89it/s][A
Iteration:  67%|██████▋   | 520/779 [01:48<00:53,  4.88it/s][A
Iteration:  67%|██████▋   | 521/779 [01:49<00:52,  4.93it/s][A
Iteration:  67%|██████▋   | 522/779 [01:49<00:52,  4.90it/s][A
Iteration:  67%|██████▋   | 523/779 [01:49<00:52,  4.92it/s][A
Iteration:  67%|██████▋   | 524/779 [01:49<00:52,  4.84it/s][A
Iteration:  67%|██████▋   | 525/779 [01:49<00:52,  4.88it/s][A
Iteration:  68%|██████▊   | 526/779 [01:50<00:53,  4.76it/s][A
Iteration:  68%|██████▊   | 527/779 [01:50<00:51,  4.85it/s][A
Iteration:  68%|██████▊   | 528/779 [01:

Iteration:  99%|█████████▊| 769/779 [02:40<00:02,  4.78it/s][A
Iteration:  99%|█████████▉| 770/779 [02:40<00:02,  4.48it/s][A
Iteration:  99%|█████████▉| 771/779 [02:40<00:01,  4.41it/s][A
Iteration:  99%|█████████▉| 772/779 [02:41<00:01,  4.19it/s][A
Iteration:  99%|█████████▉| 773/779 [02:41<00:01,  4.24it/s][A
Iteration:  99%|█████████▉| 774/779 [02:41<00:01,  4.32it/s][A
Iteration:  99%|█████████▉| 775/779 [02:41<00:00,  4.51it/s][A
Iteration: 100%|█████████▉| 776/779 [02:41<00:00,  4.47it/s][A
Iteration: 100%|█████████▉| 777/779 [02:42<00:00,  4.57it/s][A
Iteration: 100%|██████████| 779/779 [02:42<00:00,  4.80it/s][A

Evaluating:   0%|          | 0/87 [00:00<?, ?it/s][A
Evaluating:   1%|          | 1/87 [00:00<00:09,  9.21it/s][A
Evaluating:   3%|▎         | 3/87 [00:00<00:08,  9.61it/s][A
Evaluating:   5%|▍         | 4/87 [00:00<00:08,  9.56it/s][A
Evaluating:   6%|▌         | 5/87 [00:00<00:08,  9.30it/s][A
Evaluating:   7%|▋         | 6/87 [00:00<00:08,  9.25it/s

779
167.8810574710369
87
38.82420262694359
The eval loss is decreasing!,so we save model!


Epoch:  47%|████▋     | 14/30 [41:06<46:42, 175.16s/it]





Iteration:   0%|          | 0/779 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/779 [00:00<02:40,  4.85it/s][A
Iteration:   0%|          | 2/779 [00:00<02:47,  4.63it/s][A
Iteration:   0%|          | 3/779 [00:00<02:38,  4.90it/s][A
Iteration:   1%|          | 4/779 [00:00<02:48,  4.59it/s][A
Iteration:   1%|          | 5/779 [00:01<02:42,  4.75it/s][A
Iteration:   1%|          | 6/779 [00:01<02:47,  4.61it/s][A
Iteration:   1%|          | 7/779 [00:01<02:41,  4.77it/s][A
Iteration:   1%|          | 8/779 [00:01<02:45,  4.65it/s][A
Iteration:   1%|          | 9/779 [00:01<02:41,  4.76it/s][A
Iteration:   1%|▏         | 10/779 [00:02<02:45,  4.66it/s][A
Iteration:   1%|▏         | 11/779 [00:02<02:40,  4.79it/s][A
Iteration:   2%|▏         | 12/779 [00:02<02:45,  4.64it/s][A
Iteration:   2%|▏         | 13/779 [00:02<02:40,  4.76it/s][A
Iteration:   2%|▏         | 14/779 [00:03<02:49,  4.52it/s][A
Iteration:   2%|▏         | 15/779 [00:03<02:47,  4.57it/s][A
Iteration

Iteration:  33%|███▎      | 257/779 [00:54<01:50,  4.71it/s][A
Iteration:  33%|███▎      | 258/779 [00:54<01:53,  4.58it/s][A
Iteration:  33%|███▎      | 259/779 [00:55<01:51,  4.67it/s][A
Iteration:  33%|███▎      | 260/779 [00:55<01:52,  4.62it/s][A
Iteration:  34%|███▎      | 261/779 [00:55<01:49,  4.75it/s][A
Iteration:  34%|███▎      | 262/779 [00:55<01:50,  4.67it/s][A
Iteration:  34%|███▍      | 263/779 [00:56<01:48,  4.75it/s][A
Iteration:  34%|███▍      | 264/779 [00:56<01:50,  4.67it/s][A
Iteration:  34%|███▍      | 265/779 [00:56<01:47,  4.80it/s][A
Iteration:  34%|███▍      | 266/779 [00:56<01:50,  4.66it/s][A
Iteration:  34%|███▍      | 267/779 [00:56<01:46,  4.79it/s][A
Iteration:  34%|███▍      | 268/779 [00:57<01:49,  4.66it/s][A
Iteration:  35%|███▍      | 269/779 [00:57<01:46,  4.78it/s][A
Iteration:  35%|███▍      | 270/779 [00:57<01:47,  4.71it/s][A
Iteration:  35%|███▍      | 271/779 [00:57<01:44,  4.84it/s][A
Iteration:  35%|███▍      | 272/779 [00:

Iteration:  66%|██████▌   | 513/779 [01:49<00:56,  4.72it/s][A
Iteration:  66%|██████▌   | 514/779 [01:49<00:56,  4.69it/s][A
Iteration:  66%|██████▌   | 515/779 [01:49<00:55,  4.78it/s][A
Iteration:  66%|██████▌   | 516/779 [01:49<00:56,  4.69it/s][A
Iteration:  66%|██████▋   | 517/779 [01:50<00:54,  4.76it/s][A
Iteration:  66%|██████▋   | 518/779 [01:50<00:55,  4.73it/s][A
Iteration:  67%|██████▋   | 519/779 [01:50<00:55,  4.72it/s][A
Iteration:  67%|██████▋   | 520/779 [01:50<00:55,  4.63it/s][A
Iteration:  67%|██████▋   | 521/779 [01:50<00:53,  4.81it/s][A
Iteration:  67%|██████▋   | 522/779 [01:51<00:54,  4.73it/s][A
Iteration:  67%|██████▋   | 523/779 [01:51<00:53,  4.80it/s][A
Iteration:  67%|██████▋   | 524/779 [01:51<00:54,  4.68it/s][A
Iteration:  67%|██████▋   | 525/779 [01:51<00:53,  4.77it/s][A
Iteration:  68%|██████▊   | 526/779 [01:52<00:53,  4.70it/s][A
Iteration:  68%|██████▊   | 527/779 [01:52<00:52,  4.81it/s][A
Iteration:  68%|██████▊   | 528/779 [01:

KeyboardInterrupt: 

In [29]:
model.eval()
eval_loss = 0
nb_eval_steps = 0
for step, batch in enumerate(tqdm(EvalDataLoader, desc="Evaluating")):
    batch = tuple(t.to(device) for t in batch)
    input_ids, segment_ids, attention_ids, label_ids = batch
    if step >=0 :
        break
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_ids, token_type_ids=segment_ids, labels=label_ids)
        loss = outputs.loss
    eval_loss += loss.item()
    nb_eval_steps += 1

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  label_ids = np.full(self.max_seq_length, dtype=np.int, fill_value=-100)
Evaluating:   0%|          | 0/87 [00:00<?, ?it/s]


In [28]:
input_ids[0]

tensor([  101,   103,   125,  1550,  1482,   103,  1138,  1120,  1655,  1141,
         6486,  1150,  2242,  1103,   103,   119,   103,   119, 19241,  1127,
         1255,   103,  1103,   103,  1311,  1105,   103,   158,   119,   156,
          119,  4037,  1112,   170,  1871,   103,   103,  1106,   103,  2025,
         3303,  1118,  1103,   153,  5773,  6098,  1945,   119,  1337,   112,
          188,  1164,   103,  7541,  1104,  1103,  3555,   126,   119,   126,
         1550,  1482,  1104,  5696,   103,  1656,   103,   103,  1311,   117,
         2452,  1106,  1103,  2025,   119,   103,   122,   103,   129,  1550,
         1482,  1104,  5576, 13335, 15447, 16736,  7162,  1686,  1107,   103,
          117,  1103,  2025,  1276,   119, 22171,   131,  2677,  7541,  1104,
          158,   119,   156,   119,  5696,  7162,   103,  1482,   136,   103,
        26018,   103,  1302,   119,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0],

In [30]:
input_ids[0]

tensor([  101,   103,   125,  1550,  1482,   103,  1138,  1120,  1655,  1141,
         6486,  1150,  2242,  1103,   103,   119,   103,   119, 19241,  1127,
         1255,   103,  1103,   103,  1311,  1105,   103,   158,   119,   156,
          119,  4037,  1112,   170,  1871,   103,   103,  1106,   103,  2025,
         3303,  1118,  1103,   153,  5773,  6098,  1945,   119,  1337,   112,
          188,  1164,   103,  7541,  1104,  1103,  3555,   126,   119,   126,
         1550,  1482,  1104,  5696,   103,  1656,   103,   103,  1311,   117,
         2452,  1106,  1103,  2025,   119,   103,   122,   103,   129,  1550,
         1482,  1104,  5576, 13335, 15447, 16736,  7162,  1686,  1107,   103,
          117,  1103,  2025,  1276,   119, 22171,   131,  2677,  7541,  1104,
          158,   119,   156,   119,  5696,  7162,   103,  1482,   136,   103,
        26018,   103,  1302,   119,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0],

In [61]:
nb_eval_steps

44

In [67]:
len(eval_data)

2767

In [69]:
eval_data[0]

{'input_ids': [101,
  16992,
  125,
  1550,
  1482,
  1150,
  1138,
  1120,
  1655,
  1141,
  6486,
  1150,
  2242,
  1103,
  158,
  119,
  156,
  119,
  103,
  1127,
  1255,
  1107,
  103,
  1244,
  103,
  1105,
  1132,
  158,
  119,
  156,
  103,
  4037,
  1112,
  103,
  1871,
  117,
  2452,
  1106,
  1103,
  2025,
  3303,
  1118,
  1103,
  153,
  5773,
  6098,
  1945,
  119,
  103,
  112,
  188,
  1164,
  1210,
  103,
  103,
  103,
  3555,
  103,
  119,
  126,
  1550,
  1482,
  1104,
  5696,
  7162,
  103,
  1103,
  1244,
  103,
  117,
  2452,
  1106,
  1103,
  2025,
  119,
  3517,
  122,
  119,
  129,
  1550,
  1482,
  1104,
  5576,
  103,
  15447,
  16736,
  7162,
  1686,
  103,
  5224,
  103,
  1103,
  2025,
  1276,
  119,
  22171,
  131,
  2677,
  7541,
  103,
  103,
  119,
  103,
  119,
  103,
  7162,
  1138,
  103,
  136,
  1103,
  26018,
  131,
  1302,
  119,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'segment_ids': [0,
  0,
  0,
  0,
  0,
  0,


In [59]:
test = {'tokens':[1,2,3]}

In [60]:
tokens = test['tokens'].copy()

In [61]:
tokens[0] = 4

In [62]:
tokens

[4, 2, 3]

In [63]:
test

{'tokens': [1, 2, 3]}

In [4]:
tokenizer.tokenize('RL has been successful in many fields.')

['R', '##L', 'has', 'been', 'successful', 'in', 'many', 'fields', '.']