In [1]:
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
models_list = [
    'bert-base-cased','roberta-base','distilroberta-base','distilbert-base-cased'
]

for model in models_list:
    nlp = pipeline(
        task="fill-mask",model=model
    )

    print(type(nlp.model))

    preds = nlp(
        f"If you don't {nlp.tokenizer.mask_token} lying, you will end up as a BBC reporter."
    )

    print(
        f"If you don't *** lying, you will end up as a BBC reporter."
    )

    print(type(preds))


    for p in preds:
        print(
            f"Token:{p['token_str']}. Score: {100*p['score']:,.2f}%"
        )
    
    print('\n\n\n')

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<class 'transformers.models.bert.modeling_bert.BertForMaskedLM'>
If you don't *** lying, you will end up as a BBC reporter.
<class 'list'>
Token:stop. Score: 33.34%
Token:keep. Score: 26.61%
Token:start. Score: 9.33%
Token:quit. Score: 6.34%
Token:like. Score: 5.86%




<class 'transformers.models.roberta.modeling_roberta.RobertaForMaskedLM'>
If you don't *** lying, you will end up as a BBC reporter.
<class 'list'>
Token: stop. Score: 75.30%
Token: start. Score: 19.09%
Token: quit. Score: 2.58%
Token: keep. Score: 0.68%
Token: practice. Score: 0.49%






Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<class 'transformers.models.roberta.modeling_roberta.RobertaForMaskedLM'>
If you don't *** lying, you will end up as a BBC reporter.
<class 'list'>
Token: stop. Score: 78.28%
Token: start. Score: 6.65%
Token: keep. Score: 2.80%
Token: admit. Score: 2.49%
Token: accept. Score: 2.04%




<class 'transformers.models.distilbert.modeling_distilbert.DistilBertForMaskedLM'>
If you don't *** lying, you will end up as a BBC reporter.
<class 'list'>
Token:stop. Score: 16.91%
Token:be. Score: 5.68%
Token:mind. Score: 4.38%
Token:start. Score: 4.07%
Token:want. Score: 3.96%






# BERT for Sequence Classification

In [2]:
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification, DistilBertTokenizerFast, \
    DataCollatorWithPadding, pipeline

from datasets import load_metric, Dataset
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
with open('./data/snips.train.txt','rb') as f:
    snips_rows = f.readlines()

snips_rows[:20]

[b'listen O\r\n',
 b'to O\r\n',
 b'westbam B-artist\r\n',
 b'alumb O\r\n',
 b'allergic B-album\r\n',
 b'on O\r\n',
 b'google B-service\r\n',
 b'music I-service\r\n',
 b'PlayMusic\r\n',
 b'\r\n',
 b'add O\r\n',
 b'step B-entity_name\r\n',
 b'to I-entity_name\r\n',
 b'me I-entity_name\r\n',
 b'to O\r\n',
 b'the O\r\n',
 b'50 B-playlist\r\n',
 b'cl\xc3\xa1sicos I-playlist\r\n',
 b'playlist O\r\n',
 b'AddToPlaylist\r\n']

In [4]:
utterances = []
tokenized_utterances = []
labels_for_tokens_orig = []
sequence_labels_orig = []

utterance, tokenized_utterance, label_for_utterances = '', [], []

for i,snip_row in enumerate(snips_rows):
    if snip_row == b'\r\n': continue;

    if ' ' not in snip_row.decode():
        sequence_labels_orig.append(snip_row.decode().strip())
        utterances.append(utterance.strip())
        tokenized_utterances.append(tokenized_utterance)
        labels_for_tokens_orig.append(label_for_utterances)
        utterance = ''
        tokenized_utterance = []
        label_for_utterances = []
        continue

    token, token_label = snip_row.decode().split(' ')
    token_label = token_label.strip()

    utterance += f'{token} '

    tokenized_utterance.append(token)
    label_for_utterances.append(token_label)

In [5]:
# 13084 utternace (command to do an action)
# Each utternace consists of words

# list/utterance of words labels, list/utterance of words, string/utterance, entry/utterance of each sequence label
len(labels_for_tokens_orig), len(tokenized_utterances), len(utterances), len(sequence_labels_orig)

(13084, 13084, 13084, 13084)

In [6]:
print(tokenized_utterances[0])
print(labels_for_tokens_orig[0])
print(utterances[0])
print(sequence_labels_orig[0])

['listen', 'to', 'westbam', 'alumb', 'allergic', 'on', 'google', 'music']
['O', 'O', 'B-artist', 'O', 'B-album', 'O', 'B-service', 'I-service']
listen to westbam alumb allergic on google music
PlayMusic


In [7]:
from functools import reduce

unique_token_labels = list(
    set(
        reduce(
            lambda x,y:x+y,
            labels_for_tokens_orig
        )
    )
)
labels_for_tokens = [
    [
        unique_token_labels.index(_) for _ in l
    ] for l in labels_for_tokens_orig
]

print(
    "Number of unique token labels:",unique_token_labels.__len__(),end="\n\n"
)

print(
    "Examples:",unique_token_labels[:5]
)

Number of unique token labels: 72

Examples: ['I-timeRange', 'B-object_part_of_series_type', 'B-playlist', 'I-playlist_owner', 'B-artist']


In [8]:
unique_sequence_labels = list(set(sequence_labels_orig))
sequence_labels = [unique_sequence_labels.index(l) for l in sequence_labels_orig]
print("Unique sequence labels:\n\t",unique_sequence_labels,end="",sep="")

Unique sequence labels:
	['SearchScreeningEvent', 'RateBook', 'AddToPlaylist', 'SearchCreativeWork', 'GetWeather', 'PlayMusic', 'BookRestaurant']

In [9]:
print(tokenized_utterances[0])
print(labels_for_tokens[0])
print([ unique_token_labels[l] for l in labels_for_tokens[0] ])
print(utterances[0])
print(sequence_labels[0])
print(unique_sequence_labels[sequence_labels[0]])

['listen', 'to', 'westbam', 'alumb', 'allergic', 'on', 'google', 'music']
[65, 65, 4, 65, 9, 65, 17, 8]
['O', 'O', 'B-artist', 'O', 'B-album', 'O', 'B-service', 'I-service']
listen to westbam alumb allergic on google music
5
PlayMusic


In [10]:
snips_dataset = Dataset.from_dict(
    dict(
        utterance=utterances,
        label=sequence_labels,
        tokens=tokenized_utterances,
        token_labels=labels_for_tokens
    )
)
snips_dataset = snips_dataset.train_test_split(seed=10,test_size=.2)
snips_dataset['train'][111]

{'utterance': 'play tuomas holopainen s the 21 project',
 'label': 5,
 'tokens': ['play', 'tuomas', 'holopainen', 's', 'the', '21', 'project'],
 'token_labels': [65, 4, 24, 65, 9, 38, 38]}

In [11]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [12]:
def preprocess_function(examples):
    return tokenizer(examples["utterance"], truncation=True)

### Example:
ex = snips_dataset['train'][111]
print(ex)
print(preprocess_function(ex)['input_ids'][1:-1])
print(tokenizer.convert_ids_to_tokens(preprocess_function(ex)['input_ids'][1:-1]))
print(
    tokenizer.decode(
        preprocess_function(ex)['input_ids'],
        skip_special_tokens=True
    )
)

{'utterance': 'play tuomas holopainen s the 21 project', 'label': 5, 'tokens': ['play', 'tuomas', 'holopainen', 's', 'the', '21', 'project'], 'token_labels': [65, 4, 24, 65, 9, 38, 38]}
[2377, 10722, 9626, 2015, 7570, 4135, 4502, 21820, 1055, 1996, 2538, 2622]
['play', 'tu', '##oma', '##s', 'ho', '##lo', '##pa', '##inen', 's', 'the', '21', 'project']
play tuomas holopainen s the 21 project


In [13]:
# Create input_ids & attention_mask by mapping the tokenizer to each example
seq_clf_tokenized_snips = snips_dataset.map(
    preprocess_function, batched=True
)

print()
print("Before:",snips_dataset['train'])
print("After:",seq_clf_tokenized_snips['train'])

print("\nIf Padding was used in tokenizer, all the data points will be padded with zeros till the length of the longest data point")
print([x['input_ids'] for x in seq_clf_tokenized_snips['train']][:5])

Map:   0%|          | 0/10467 [00:00<?, ? examples/s]Map: 100%|██████████| 10467/10467 [00:00<00:00, 14949.89 examples/s]
Map: 100%|██████████| 2617/2617 [00:00<00:00, 16797.42 examples/s]



Before: Dataset({
    features: ['utterance', 'label', 'tokens', 'token_labels'],
    num_rows: 10467
})
After: Dataset({
    features: ['utterance', 'label', 'tokens', 'token_labels', 'input_ids', 'attention_mask'],
    num_rows: 10467
})

If Padding was used in tokenizer, all the data points will be padded with zeros till the length of the longest data point
[[101, 1999, 2698, 2847, 2013, 2085, 2097, 2009, 4542, 2012, 2026, 2783, 2173, 102], [101, 5587, 2023, 2053, 6844, 4328, 24529, 23049, 2072, 8694, 2000, 2026, 2980, 2160, 2377, 9863, 102], [101, 1045, 3446, 2023, 16432, 1014, 102], [101, 2507, 2033, 1996, 3185, 6134, 2005, 3152, 1999, 1996, 2181, 102], [101, 2338, 1037, 3962, 2005, 2416, 2012, 1037, 4825, 2008, 4240, 3869, 1998, 11772, 102]]


In [14]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Example 1:
data_collator([
    {'input_ids': [101, 2023, 2003, 1037, 24, 102]},
    {'input_ids': [101, 2054, 3185, 102]},
])

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[ 101, 2023, 2003, 1037,   24,  102],
        [ 101, 2054, 3185,  102,    0,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0]])}

In [15]:
sequence_clf_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels = len(unique_sequence_labels)
)

sequence_clf_model.config.id2label = {i: l for i,l in enumerate(unique_sequence_labels)}
sequence_clf_model.config.id2label

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{0: 'SearchScreeningEvent',
 1: 'RateBook',
 2: 'AddToPlaylist',
 3: 'SearchCreativeWork',
 4: 'GetWeather',
 5: 'PlayMusic',
 6: 'BookRestaurant'}

In [16]:
metric = load_metric("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

metric

  metric = load_metric("accuracy")


Metric(name: "accuracy", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
    predictions (`list` of `int`): Predicted labels.
    references (`list` of `int`): Ground truth labels.
    normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True.
    sample_weight (`list` of `float`): Sample weights Defaults to None.

Returns:
    accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy.

Examples:

    Example 1-A simple example
        >>> accuracy_metric = datasets.load_metric("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
        >>> print(results)
        {'accuracy': 0.5}

   

In [17]:
epochs = 2

training_args = TrainingArguments(
    output_dir="./snips_clf/results",
    num_train_epochs=epochs,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,

    warmup_steps=len(seq_clf_tokenized_snips['train']) // 5,
    weight_decay= 0.05,

    logging_steps=1,
    log_level='info',
    evaluation_strategy='epoch',
    save_strategy='epoch'
)

trainer = Trainer(
    model=sequence_clf_model,
    args=training_args,
    train_dataset=seq_clf_tokenized_snips['train'],
    eval_dataset=seq_clf_tokenized_snips['test'],
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

# Check if the model is using CUDA (GPU)
is_using_cuda = trainer.args.device.type == 'cuda'
print(f"Is using CUDA: "+'\033[92m'+str(is_using_cuda)+'\033[0m')

Is using CUDA: [92mTrue[0m


In [18]:
# Get initial accuracy
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, tokens, utterance. If token_labels, tokens, utterance are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
100%|██████████| 82/82 [00:02<00:00, 33.21it/s]


{'eval_loss': 1.9463006258010864,
 'eval_accuracy': 0.16431027894535727,
 'eval_runtime': 4.4855,
 'eval_samples_per_second': 583.439,
 'eval_steps_per_second': 18.281}

In [19]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, tokens, utterance. If token_labels, tokens, utterance are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10,467
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 656
  Number of trainable parameters = 66,958,855
  0%|          | 3/656 [00:00<01:10,  9.31it/s]

{'loss': 1.9267, 'learning_rate': 2.3889154323936934e-08, 'epoch': 0.0}
{'loss': 1.9457, 'learning_rate': 4.777830864787387e-08, 'epoch': 0.01}


  1%|          | 4/656 [00:00<01:08,  9.48it/s]

{'loss': 1.9709, 'learning_rate': 7.16674629718108e-08, 'epoch': 0.01}
{'loss': 1.9498, 'learning_rate': 9.555661729574773e-08, 'epoch': 0.01}


  1%|          | 6/656 [00:00<01:03, 10.19it/s]

{'loss': 1.9305, 'learning_rate': 1.1944577161968468e-07, 'epoch': 0.02}
{'loss': 1.944, 'learning_rate': 1.433349259436216e-07, 'epoch': 0.02}


  1%|          | 8/656 [00:00<01:04, 10.10it/s]

{'loss': 1.9711, 'learning_rate': 1.6722408026755853e-07, 'epoch': 0.02}
{'loss': 1.954, 'learning_rate': 1.9111323459149547e-07, 'epoch': 0.02}


  2%|▏         | 11/656 [00:01<01:07,  9.49it/s]

{'loss': 1.9735, 'learning_rate': 2.150023889154324e-07, 'epoch': 0.03}
{'loss': 1.97, 'learning_rate': 2.3889154323936937e-07, 'epoch': 0.03}


  2%|▏         | 12/656 [00:01<01:08,  9.46it/s]

{'loss': 1.9669, 'learning_rate': 2.6278069756330625e-07, 'epoch': 0.03}
{'loss': 1.9489, 'learning_rate': 2.866698518872432e-07, 'epoch': 0.04}


  2%|▏         | 15/656 [00:01<01:09,  9.25it/s]

{'loss': 1.9399, 'learning_rate': 3.1055900621118013e-07, 'epoch': 0.04}
{'loss': 1.9396, 'learning_rate': 3.3444816053511706e-07, 'epoch': 0.04}


  2%|▏         | 16/656 [00:01<01:07,  9.42it/s]

{'loss': 1.9659, 'learning_rate': 3.58337314859054e-07, 'epoch': 0.05}
{'loss': 1.9471, 'learning_rate': 3.8222646918299094e-07, 'epoch': 0.05}


  3%|▎         | 19/656 [00:01<01:06,  9.54it/s]

{'loss': 1.9596, 'learning_rate': 4.0611562350692793e-07, 'epoch': 0.05}
{'loss': 1.9388, 'learning_rate': 4.300047778308648e-07, 'epoch': 0.05}


  3%|▎         | 20/656 [00:02<01:07,  9.48it/s]

{'loss': 1.9286, 'learning_rate': 4.5389393215480175e-07, 'epoch': 0.06}
{'loss': 1.9369, 'learning_rate': 4.777830864787387e-07, 'epoch': 0.06}


  3%|▎         | 22/656 [00:02<01:07,  9.44it/s]

{'loss': 1.9697, 'learning_rate': 5.016722408026756e-07, 'epoch': 0.06}
{'loss': 1.9634, 'learning_rate': 5.255613951266125e-07, 'epoch': 0.07}


  4%|▎         | 24/656 [00:02<01:06,  9.56it/s]

{'loss': 1.9474, 'learning_rate': 5.494505494505495e-07, 'epoch': 0.07}
{'loss': 1.94, 'learning_rate': 5.733397037744864e-07, 'epoch': 0.07}


  4%|▍         | 27/656 [00:02<01:06,  9.48it/s]

{'loss': 1.9345, 'learning_rate': 5.972288580984234e-07, 'epoch': 0.08}
{'loss': 1.9414, 'learning_rate': 6.211180124223603e-07, 'epoch': 0.08}


  4%|▍         | 28/656 [00:02<01:09,  9.04it/s]

{'loss': 1.9608, 'learning_rate': 6.450071667462972e-07, 'epoch': 0.08}
{'loss': 1.9563, 'learning_rate': 6.688963210702341e-07, 'epoch': 0.09}


  5%|▍         | 30/656 [00:03<01:08,  9.12it/s]

{'loss': 1.9353, 'learning_rate': 6.92785475394171e-07, 'epoch': 0.09}
{'loss': 1.9541, 'learning_rate': 7.16674629718108e-07, 'epoch': 0.09}


  5%|▍         | 32/656 [00:03<01:10,  8.86it/s]

{'loss': 1.9418, 'learning_rate': 7.405637840420449e-07, 'epoch': 0.09}
{'loss': 1.9302, 'learning_rate': 7.644529383659819e-07, 'epoch': 0.1}


  5%|▌         | 34/656 [00:03<01:09,  8.96it/s]

{'loss': 1.9607, 'learning_rate': 7.883420926899189e-07, 'epoch': 0.1}
{'loss': 1.9453, 'learning_rate': 8.122312470138559e-07, 'epoch': 0.1}


  6%|▌         | 37/656 [00:03<01:06,  9.32it/s]

{'loss': 1.9738, 'learning_rate': 8.361204013377926e-07, 'epoch': 0.11}
{'loss': 1.9188, 'learning_rate': 8.600095556617296e-07, 'epoch': 0.11}


  6%|▌         | 38/656 [00:04<01:07,  9.16it/s]

{'loss': 1.9416, 'learning_rate': 8.838987099856666e-07, 'epoch': 0.11}
{'loss': 1.9699, 'learning_rate': 9.077878643096035e-07, 'epoch': 0.12}


  6%|▋         | 41/656 [00:04<01:03,  9.68it/s]

{'loss': 1.9847, 'learning_rate': 9.316770186335405e-07, 'epoch': 0.12}
{'loss': 1.9309, 'learning_rate': 9.555661729574775e-07, 'epoch': 0.12}


  7%|▋         | 43/656 [00:04<01:02,  9.80it/s]

{'loss': 1.9345, 'learning_rate': 9.794553272814141e-07, 'epoch': 0.12}
{'loss': 1.9472, 'learning_rate': 1.0033444816053512e-06, 'epoch': 0.13}


  7%|▋         | 45/656 [00:04<01:03,  9.62it/s]

{'loss': 1.9503, 'learning_rate': 1.0272336359292883e-06, 'epoch': 0.13}
{'loss': 1.9574, 'learning_rate': 1.051122790253225e-06, 'epoch': 0.13}


  7%|▋         | 46/656 [00:04<01:05,  9.31it/s]

{'loss': 1.9175, 'learning_rate': 1.0750119445771621e-06, 'epoch': 0.14}
{'loss': 1.9349, 'learning_rate': 1.098901098901099e-06, 'epoch': 0.14}


  7%|▋         | 48/656 [00:05<01:08,  8.89it/s]

{'loss': 1.9193, 'learning_rate': 1.1227902532250359e-06, 'epoch': 0.14}
{'loss': 1.9385, 'learning_rate': 1.1466794075489728e-06, 'epoch': 0.15}


  8%|▊         | 51/656 [00:05<01:04,  9.41it/s]

{'loss': 1.9418, 'learning_rate': 1.1705685618729096e-06, 'epoch': 0.15}
{'loss': 1.9292, 'learning_rate': 1.1944577161968467e-06, 'epoch': 0.15}


  8%|▊         | 52/656 [00:05<01:08,  8.79it/s]

{'loss': 1.929, 'learning_rate': 1.2183468705207836e-06, 'epoch': 0.16}
{'loss': 1.9335, 'learning_rate': 1.2422360248447205e-06, 'epoch': 0.16}


  8%|▊         | 55/656 [00:05<01:07,  8.91it/s]

{'loss': 1.9182, 'learning_rate': 1.2661251791686574e-06, 'epoch': 0.16}
{'loss': 1.9426, 'learning_rate': 1.2900143334925945e-06, 'epoch': 0.16}


  9%|▊         | 56/656 [00:06<01:07,  8.91it/s]

{'loss': 1.9506, 'learning_rate': 1.3139034878165314e-06, 'epoch': 0.17}
{'loss': 1.9395, 'learning_rate': 1.3377926421404683e-06, 'epoch': 0.17}


  9%|▉         | 58/656 [00:06<01:06,  8.94it/s]

{'loss': 1.9148, 'learning_rate': 1.3616817964644054e-06, 'epoch': 0.17}
{'loss': 1.9273, 'learning_rate': 1.385570950788342e-06, 'epoch': 0.18}


  9%|▉         | 61/656 [00:06<01:05,  9.12it/s]

{'loss': 1.9379, 'learning_rate': 1.4094601051122791e-06, 'epoch': 0.18}
{'loss': 1.9369, 'learning_rate': 1.433349259436216e-06, 'epoch': 0.18}


 10%|▉         | 63/656 [00:06<01:02,  9.50it/s]

{'loss': 1.9437, 'learning_rate': 1.4572384137601529e-06, 'epoch': 0.19}
{'loss': 1.9239, 'learning_rate': 1.4811275680840898e-06, 'epoch': 0.19}


 10%|▉         | 65/656 [00:06<01:00,  9.72it/s]

{'loss': 1.9553, 'learning_rate': 1.5050167224080269e-06, 'epoch': 0.19}
{'loss': 1.9351, 'learning_rate': 1.5289058767319638e-06, 'epoch': 0.2}


 10%|█         | 67/656 [00:07<01:01,  9.53it/s]

{'loss': 1.9156, 'learning_rate': 1.5527950310559006e-06, 'epoch': 0.2}
{'loss': 1.9048, 'learning_rate': 1.5766841853798377e-06, 'epoch': 0.2}


 11%|█         | 69/656 [00:07<01:02,  9.40it/s]

{'loss': 1.9222, 'learning_rate': 1.6005733397037744e-06, 'epoch': 0.2}
{'loss': 1.9087, 'learning_rate': 1.6244624940277117e-06, 'epoch': 0.21}


 11%|█         | 71/656 [00:07<01:02,  9.33it/s]

{'loss': 1.9182, 'learning_rate': 1.6483516483516484e-06, 'epoch': 0.21}
{'loss': 1.9216, 'learning_rate': 1.6722408026755853e-06, 'epoch': 0.21}


 11%|█         | 72/656 [00:07<01:05,  8.92it/s]

{'loss': 1.9099, 'learning_rate': 1.6961299569995224e-06, 'epoch': 0.22}
{'loss': 1.9108, 'learning_rate': 1.7200191113234592e-06, 'epoch': 0.22}


 11%|█▏        | 75/656 [00:08<01:07,  8.67it/s]

{'loss': 1.9242, 'learning_rate': 1.7439082656473961e-06, 'epoch': 0.22}
{'loss': 1.9169, 'learning_rate': 1.7677974199713332e-06, 'epoch': 0.23}


 12%|█▏        | 77/656 [00:08<01:03,  9.05it/s]

{'loss': 1.949, 'learning_rate': 1.7916865742952701e-06, 'epoch': 0.23}
{'loss': 1.9059, 'learning_rate': 1.815575728619207e-06, 'epoch': 0.23}


 12%|█▏        | 79/656 [00:08<01:03,  9.02it/s]

{'loss': 1.9139, 'learning_rate': 1.839464882943144e-06, 'epoch': 0.23}
{'loss': 1.9295, 'learning_rate': 1.863354037267081e-06, 'epoch': 0.24}


 12%|█▏        | 81/656 [00:08<01:02,  9.25it/s]

{'loss': 1.9342, 'learning_rate': 1.8872431915910176e-06, 'epoch': 0.24}
{'loss': 1.9032, 'learning_rate': 1.911132345914955e-06, 'epoch': 0.24}


 13%|█▎        | 83/656 [00:08<00:59,  9.58it/s]

{'loss': 1.9174, 'learning_rate': 1.935021500238892e-06, 'epoch': 0.25}
{'loss': 1.9161, 'learning_rate': 1.9589106545628283e-06, 'epoch': 0.25}


 13%|█▎        | 85/656 [00:09<01:00,  9.46it/s]

{'loss': 1.9168, 'learning_rate': 1.9827998088867656e-06, 'epoch': 0.25}
{'loss': 1.892, 'learning_rate': 2.0066889632107025e-06, 'epoch': 0.26}


 13%|█▎        | 87/656 [00:09<01:00,  9.39it/s]

{'loss': 1.888, 'learning_rate': 2.0305781175346394e-06, 'epoch': 0.26}
{'loss': 1.9004, 'learning_rate': 2.0544672718585767e-06, 'epoch': 0.26}


 13%|█▎        | 88/656 [00:09<01:00,  9.39it/s]

{'loss': 1.9002, 'learning_rate': 2.078356426182513e-06, 'epoch': 0.27}
{'loss': 1.8859, 'learning_rate': 2.10224558050645e-06, 'epoch': 0.27}


 14%|█▎        | 90/656 [00:09<01:03,  8.98it/s]

{'loss': 1.8914, 'learning_rate': 2.1261347348303873e-06, 'epoch': 0.27}
{'loss': 1.874, 'learning_rate': 2.1500238891543242e-06, 'epoch': 0.27}


 14%|█▍        | 92/656 [00:09<01:02,  9.09it/s]

{'loss': 1.9457, 'learning_rate': 2.173913043478261e-06, 'epoch': 0.28}
{'loss': 1.8917, 'learning_rate': 2.197802197802198e-06, 'epoch': 0.28}


 14%|█▍        | 95/656 [00:10<00:58,  9.61it/s]

{'loss': 1.8862, 'learning_rate': 2.221691352126135e-06, 'epoch': 0.28}
{'loss': 1.8822, 'learning_rate': 2.2455805064500718e-06, 'epoch': 0.29}


 15%|█▍        | 96/656 [00:10<01:02,  8.91it/s]

{'loss': 1.8852, 'learning_rate': 2.269469660774009e-06, 'epoch': 0.29}
{'loss': 1.8689, 'learning_rate': 2.2933588150979455e-06, 'epoch': 0.29}


 15%|█▌        | 99/656 [00:10<01:00,  9.17it/s]

{'loss': 1.8954, 'learning_rate': 2.3172479694218824e-06, 'epoch': 0.3}
{'loss': 1.8876, 'learning_rate': 2.3411371237458193e-06, 'epoch': 0.3}


 15%|█▌        | 101/656 [00:10<00:56,  9.86it/s]

{'loss': 1.8488, 'learning_rate': 2.3650262780697566e-06, 'epoch': 0.3}
{'loss': 1.9036, 'learning_rate': 2.3889154323936935e-06, 'epoch': 0.3}


 16%|█▌        | 102/656 [00:11<00:57,  9.63it/s]

{'loss': 1.855, 'learning_rate': 2.41280458671763e-06, 'epoch': 0.31}
{'loss': 1.8587, 'learning_rate': 2.4366937410415673e-06, 'epoch': 0.31}


 16%|█▌        | 104/656 [00:11<00:57,  9.54it/s]

{'loss': 1.8735, 'learning_rate': 2.460582895365504e-06, 'epoch': 0.31}
{'loss': 1.8862, 'learning_rate': 2.484472049689441e-06, 'epoch': 0.32}


 16%|█▌        | 106/656 [00:11<00:58,  9.38it/s]

{'loss': 1.8048, 'learning_rate': 2.508361204013378e-06, 'epoch': 0.32}
{'loss': 1.8583, 'learning_rate': 2.5322503583373148e-06, 'epoch': 0.32}


 16%|█▋        | 108/656 [00:11<00:59,  9.27it/s]

{'loss': 1.831, 'learning_rate': 2.5561395126612517e-06, 'epoch': 0.33}
{'loss': 1.8617, 'learning_rate': 2.580028666985189e-06, 'epoch': 0.33}


 17%|█▋        | 110/656 [00:11<00:56,  9.66it/s]

{'loss': 1.885, 'learning_rate': 2.603917821309126e-06, 'epoch': 0.33}
{'loss': 1.8256, 'learning_rate': 2.6278069756330627e-06, 'epoch': 0.34}


 17%|█▋        | 113/656 [00:12<00:56,  9.68it/s]

{'loss': 1.8126, 'learning_rate': 2.6516961299569996e-06, 'epoch': 0.34}
{'loss': 1.8079, 'learning_rate': 2.6755852842809365e-06, 'epoch': 0.34}


 17%|█▋        | 114/656 [00:12<00:57,  9.39it/s]

{'loss': 1.7989, 'learning_rate': 2.6994744386048734e-06, 'epoch': 0.34}
{'loss': 1.8544, 'learning_rate': 2.7233635929288107e-06, 'epoch': 0.35}


 18%|█▊        | 116/656 [00:12<00:57,  9.47it/s]

{'loss': 1.8198, 'learning_rate': 2.747252747252747e-06, 'epoch': 0.35}
{'loss': 1.8098, 'learning_rate': 2.771141901576684e-06, 'epoch': 0.35}


 18%|█▊        | 118/656 [00:12<00:57,  9.32it/s]

{'loss': 1.8073, 'learning_rate': 2.7950310559006214e-06, 'epoch': 0.36}
{'loss': 1.801, 'learning_rate': 2.8189202102245582e-06, 'epoch': 0.36}


 18%|█▊        | 121/656 [00:12<00:55,  9.57it/s]

{'loss': 1.7791, 'learning_rate': 2.842809364548495e-06, 'epoch': 0.36}
{'loss': 1.806, 'learning_rate': 2.866698518872432e-06, 'epoch': 0.37}


 19%|█▉        | 123/656 [00:13<00:54,  9.76it/s]

{'loss': 1.7939, 'learning_rate': 2.890587673196369e-06, 'epoch': 0.37}
{'loss': 1.7607, 'learning_rate': 2.9144768275203058e-06, 'epoch': 0.37}


 19%|█▉        | 125/656 [00:13<00:56,  9.41it/s]

{'loss': 1.7814, 'learning_rate': 2.938365981844243e-06, 'epoch': 0.38}
{'loss': 1.7664, 'learning_rate': 2.9622551361681795e-06, 'epoch': 0.38}


 19%|█▉        | 127/656 [00:13<00:54,  9.64it/s]

{'loss': 1.8041, 'learning_rate': 2.9861442904921164e-06, 'epoch': 0.38}
{'loss': 1.7489, 'learning_rate': 3.0100334448160537e-06, 'epoch': 0.38}


 20%|█▉        | 128/656 [00:13<00:56,  9.36it/s]

{'loss': 1.8045, 'learning_rate': 3.0339225991399906e-06, 'epoch': 0.39}
{'loss': 1.7597, 'learning_rate': 3.0578117534639275e-06, 'epoch': 0.39}


 20%|█▉        | 131/656 [00:14<00:54,  9.65it/s]

{'loss': 1.7913, 'learning_rate': 3.0817009077878644e-06, 'epoch': 0.39}
{'loss': 1.6741, 'learning_rate': 3.1055900621118013e-06, 'epoch': 0.4}


 20%|██        | 132/656 [00:14<00:54,  9.67it/s]

{'loss': 1.7284, 'learning_rate': 3.1294792164357386e-06, 'epoch': 0.4}
{'loss': 1.7456, 'learning_rate': 3.1533683707596755e-06, 'epoch': 0.4}


 20%|██        | 134/656 [00:14<00:54,  9.60it/s]

{'loss': 1.7138, 'learning_rate': 3.1772575250836123e-06, 'epoch': 0.41}
{'loss': 1.7537, 'learning_rate': 3.201146679407549e-06, 'epoch': 0.41}


 21%|██        | 136/656 [00:14<00:58,  8.83it/s]

{'loss': 1.7496, 'learning_rate': 3.2250358337314857e-06, 'epoch': 0.41}
{'loss': 1.6861, 'learning_rate': 3.2489249880554234e-06, 'epoch': 0.41}


 21%|██        | 138/656 [00:14<00:58,  8.92it/s]

{'loss': 1.7245, 'learning_rate': 3.2728141423793603e-06, 'epoch': 0.42}
{'loss': 1.6861, 'learning_rate': 3.2967032967032968e-06, 'epoch': 0.42}


 21%|██▏       | 141/656 [00:15<00:54,  9.51it/s]

{'loss': 1.6751, 'learning_rate': 3.3205924510272337e-06, 'epoch': 0.42}
{'loss': 1.6759, 'learning_rate': 3.3444816053511705e-06, 'epoch': 0.43}


 22%|██▏       | 143/656 [00:15<00:55,  9.28it/s]

{'loss': 1.6421, 'learning_rate': 3.3683707596751074e-06, 'epoch': 0.43}
{'loss': 1.657, 'learning_rate': 3.3922599139990447e-06, 'epoch': 0.43}


 22%|██▏       | 144/656 [00:15<00:56,  9.03it/s]

{'loss': 1.6776, 'learning_rate': 3.4161490683229816e-06, 'epoch': 0.44}
{'loss': 1.6624, 'learning_rate': 3.4400382226469185e-06, 'epoch': 0.44}


 22%|██▏       | 147/656 [00:15<00:52,  9.62it/s]

{'loss': 1.653, 'learning_rate': 3.4639273769708554e-06, 'epoch': 0.44}
{'loss': 1.6057, 'learning_rate': 3.4878165312947923e-06, 'epoch': 0.45}


 23%|██▎       | 148/656 [00:15<00:52,  9.65it/s]

{'loss': 1.6894, 'learning_rate': 3.511705685618729e-06, 'epoch': 0.45}
{'loss': 1.6673, 'learning_rate': 3.5355948399426665e-06, 'epoch': 0.45}


 23%|██▎       | 151/656 [00:16<00:50,  9.91it/s]

{'loss': 1.5893, 'learning_rate': 3.5594839942666033e-06, 'epoch': 0.45}
{'loss': 1.6495, 'learning_rate': 3.5833731485905402e-06, 'epoch': 0.46}


 23%|██▎       | 152/656 [00:16<00:57,  8.77it/s]

{'loss': 1.6287, 'learning_rate': 3.607262302914477e-06, 'epoch': 0.46}
{'loss': 1.55, 'learning_rate': 3.631151457238414e-06, 'epoch': 0.46}


 23%|██▎       | 154/656 [00:16<00:59,  8.48it/s]

{'loss': 1.6543, 'learning_rate': 3.6550406115623505e-06, 'epoch': 0.47}
{'loss': 1.6905, 'learning_rate': 3.678929765886288e-06, 'epoch': 0.47}


 24%|██▍       | 157/656 [00:16<00:54,  9.17it/s]

{'loss': 1.5758, 'learning_rate': 3.702818920210225e-06, 'epoch': 0.47}
{'loss': 1.6206, 'learning_rate': 3.726708074534162e-06, 'epoch': 0.48}


 24%|██▍       | 158/656 [00:16<00:53,  9.35it/s]

{'loss': 1.5486, 'learning_rate': 3.7505972288580984e-06, 'epoch': 0.48}
{'loss': 1.6219, 'learning_rate': 3.7744863831820353e-06, 'epoch': 0.48}


 25%|██▍       | 161/656 [00:17<00:51,  9.59it/s]

{'loss': 1.5724, 'learning_rate': 3.798375537505972e-06, 'epoch': 0.48}
{'loss': 1.537, 'learning_rate': 3.82226469182991e-06, 'epoch': 0.49}


 25%|██▍       | 162/656 [00:17<00:51,  9.58it/s]

{'loss': 1.5348, 'learning_rate': 3.846153846153847e-06, 'epoch': 0.49}
{'loss': 1.5125, 'learning_rate': 3.870043000477784e-06, 'epoch': 0.49}


 25%|██▌       | 164/656 [00:17<00:51,  9.55it/s]

{'loss': 1.5312, 'learning_rate': 3.8939321548017206e-06, 'epoch': 0.5}
{'loss': 1.4656, 'learning_rate': 3.917821309125657e-06, 'epoch': 0.5}


 25%|██▌       | 166/656 [00:17<00:54,  9.03it/s]

{'loss': 1.5727, 'learning_rate': 3.9417104634495935e-06, 'epoch': 0.5}
{'loss': 1.4627, 'learning_rate': 3.965599617773531e-06, 'epoch': 0.51}


 26%|██▌       | 168/656 [00:18<00:58,  8.40it/s]

{'loss': 1.3611, 'learning_rate': 3.989488772097468e-06, 'epoch': 0.51}
{'loss': 1.3675, 'learning_rate': 4.013377926421405e-06, 'epoch': 0.51}


 26%|██▌       | 170/656 [00:18<00:59,  8.12it/s]

{'loss': 1.4279, 'learning_rate': 4.037267080745342e-06, 'epoch': 0.52}
{'loss': 1.5083, 'learning_rate': 4.061156235069279e-06, 'epoch': 0.52}


 26%|██▌       | 172/656 [00:18<01:00,  7.95it/s]

{'loss': 1.5383, 'learning_rate': 4.085045389393216e-06, 'epoch': 0.52}
{'loss': 1.3496, 'learning_rate': 4.108934543717153e-06, 'epoch': 0.52}


 27%|██▋       | 174/656 [00:18<01:00,  7.97it/s]

{'loss': 1.4552, 'learning_rate': 4.132823698041089e-06, 'epoch': 0.53}
{'loss': 1.358, 'learning_rate': 4.156712852365026e-06, 'epoch': 0.53}


 27%|██▋       | 176/656 [00:19<00:59,  8.08it/s]

{'loss': 1.4589, 'learning_rate': 4.180602006688963e-06, 'epoch': 0.53}
{'loss': 1.355, 'learning_rate': 4.2044911610129e-06, 'epoch': 0.54}


 27%|██▋       | 178/656 [00:19<00:58,  8.16it/s]

{'loss': 1.4459, 'learning_rate': 4.228380315336837e-06, 'epoch': 0.54}
{'loss': 1.3521, 'learning_rate': 4.252269469660775e-06, 'epoch': 0.54}


 28%|██▊       | 181/656 [00:19<00:53,  8.81it/s]

{'loss': 1.3067, 'learning_rate': 4.2761586239847116e-06, 'epoch': 0.55}
{'loss': 1.4514, 'learning_rate': 4.3000477783086484e-06, 'epoch': 0.55}


 28%|██▊       | 183/656 [00:19<00:50,  9.31it/s]

{'loss': 1.185, 'learning_rate': 4.323936932632585e-06, 'epoch': 0.55}
{'loss': 1.4299, 'learning_rate': 4.347826086956522e-06, 'epoch': 0.55}


 28%|██▊       | 184/656 [00:19<00:51,  9.17it/s]

{'loss': 1.2749, 'learning_rate': 4.371715241280458e-06, 'epoch': 0.56}
{'loss': 1.3263, 'learning_rate': 4.395604395604396e-06, 'epoch': 0.56}


 29%|██▊       | 187/656 [00:20<00:51,  9.13it/s]

{'loss': 1.3305, 'learning_rate': 4.419493549928333e-06, 'epoch': 0.56}
{'loss': 1.3561, 'learning_rate': 4.44338270425227e-06, 'epoch': 0.57}


 29%|██▊       | 188/656 [00:20<00:51,  9.03it/s]

{'loss': 1.3089, 'learning_rate': 4.467271858576207e-06, 'epoch': 0.57}
{'loss': 1.2679, 'learning_rate': 4.4911610129001435e-06, 'epoch': 0.57}


 29%|██▉       | 191/656 [00:20<00:51,  8.99it/s]

{'loss': 1.358, 'learning_rate': 4.51505016722408e-06, 'epoch': 0.58}
{'loss': 1.3667, 'learning_rate': 4.538939321548018e-06, 'epoch': 0.58}


 29%|██▉       | 193/656 [00:20<00:49,  9.40it/s]

{'loss': 1.2514, 'learning_rate': 4.562828475871954e-06, 'epoch': 0.58}
{'loss': 1.2462, 'learning_rate': 4.586717630195891e-06, 'epoch': 0.59}


 30%|██▉       | 195/656 [00:21<00:49,  9.37it/s]

{'loss': 1.3234, 'learning_rate': 4.610606784519828e-06, 'epoch': 0.59}
{'loss': 1.2219, 'learning_rate': 4.634495938843765e-06, 'epoch': 0.59}


 30%|███       | 197/656 [00:21<00:50,  9.11it/s]

{'loss': 1.3101, 'learning_rate': 4.658385093167702e-06, 'epoch': 0.59}
{'loss': 1.2506, 'learning_rate': 4.682274247491639e-06, 'epoch': 0.6}


 30%|███       | 199/656 [00:21<00:48,  9.41it/s]

{'loss': 1.1405, 'learning_rate': 4.706163401815576e-06, 'epoch': 0.6}
{'loss': 1.1816, 'learning_rate': 4.730052556139513e-06, 'epoch': 0.6}


 30%|███       | 200/656 [00:21<00:51,  8.78it/s]

{'loss': 1.2081, 'learning_rate': 4.75394171046345e-06, 'epoch': 0.61}
{'loss': 1.2399, 'learning_rate': 4.777830864787387e-06, 'epoch': 0.61}


 31%|███       | 203/656 [00:22<00:50,  8.92it/s]

{'loss': 1.1376, 'learning_rate': 4.801720019111324e-06, 'epoch': 0.61}
{'loss': 1.1446, 'learning_rate': 4.82560917343526e-06, 'epoch': 0.62}


 31%|███▏      | 205/656 [00:22<00:48,  9.34it/s]

{'loss': 1.1924, 'learning_rate': 4.849498327759198e-06, 'epoch': 0.62}
{'loss': 1.1323, 'learning_rate': 4.8733874820831345e-06, 'epoch': 0.62}


 31%|███▏      | 206/656 [00:22<00:47,  9.43it/s]

{'loss': 1.1225, 'learning_rate': 4.897276636407071e-06, 'epoch': 0.62}
{'loss': 1.1083, 'learning_rate': 4.921165790731008e-06, 'epoch': 0.63}


 32%|███▏      | 209/656 [00:22<00:48,  9.15it/s]

{'loss': 1.2296, 'learning_rate': 4.945054945054945e-06, 'epoch': 0.63}
{'loss': 1.1288, 'learning_rate': 4.968944099378882e-06, 'epoch': 0.63}


 32%|███▏      | 210/656 [00:22<00:49,  9.05it/s]

{'loss': 1.1339, 'learning_rate': 4.99283325370282e-06, 'epoch': 0.64}
{'loss': 1.1024, 'learning_rate': 5.016722408026756e-06, 'epoch': 0.64}


 32%|███▏      | 213/656 [00:23<00:48,  9.22it/s]

{'loss': 1.1125, 'learning_rate': 5.040611562350693e-06, 'epoch': 0.64}
{'loss': 1.174, 'learning_rate': 5.0645007166746296e-06, 'epoch': 0.65}


 33%|███▎      | 215/656 [00:23<00:48,  9.18it/s]

{'loss': 0.9924, 'learning_rate': 5.0883898709985665e-06, 'epoch': 0.65}
{'loss': 0.8769, 'learning_rate': 5.112279025322503e-06, 'epoch': 0.65}


 33%|███▎      | 216/656 [00:23<00:47,  9.28it/s]

{'loss': 1.1046, 'learning_rate': 5.136168179646441e-06, 'epoch': 0.66}
{'loss': 1.0391, 'learning_rate': 5.160057333970378e-06, 'epoch': 0.66}


 33%|███▎      | 218/656 [00:23<00:54,  8.03it/s]

{'loss': 0.9625, 'learning_rate': 5.183946488294315e-06, 'epoch': 0.66}
{'loss': 0.9882, 'learning_rate': 5.207835642618252e-06, 'epoch': 0.66}


 34%|███▎      | 220/656 [00:23<00:49,  8.86it/s]

{'loss': 1.0559, 'learning_rate': 5.231724796942189e-06, 'epoch': 0.67}
{'loss': 1.0434, 'learning_rate': 5.2556139512661255e-06, 'epoch': 0.67}


 34%|███▍      | 223/656 [00:24<00:49,  8.78it/s]

{'loss': 1.0261, 'learning_rate': 5.279503105590062e-06, 'epoch': 0.67}
{'loss': 0.9971, 'learning_rate': 5.303392259913999e-06, 'epoch': 0.68}


 34%|███▍      | 225/656 [00:24<00:46,  9.35it/s]

{'loss': 0.9414, 'learning_rate': 5.327281414237936e-06, 'epoch': 0.68}
{'loss': 0.9474, 'learning_rate': 5.351170568561873e-06, 'epoch': 0.68}


 34%|███▍      | 226/656 [00:24<00:46,  9.30it/s]

{'loss': 0.921, 'learning_rate': 5.37505972288581e-06, 'epoch': 0.69}
{'loss': 0.9763, 'learning_rate': 5.398948877209747e-06, 'epoch': 0.69}


 35%|███▍      | 229/656 [00:24<00:45,  9.33it/s]

{'loss': 0.8748, 'learning_rate': 5.4228380315336845e-06, 'epoch': 0.69}
{'loss': 0.9321, 'learning_rate': 5.446727185857621e-06, 'epoch': 0.7}


 35%|███▌      | 230/656 [00:25<00:45,  9.33it/s]

{'loss': 0.9199, 'learning_rate': 5.4706163401815574e-06, 'epoch': 0.7}
{'loss': 0.9421, 'learning_rate': 5.494505494505494e-06, 'epoch': 0.7}


 35%|███▌      | 232/656 [00:25<00:47,  8.86it/s]

{'loss': 0.866, 'learning_rate': 5.518394648829431e-06, 'epoch': 0.7}
{'loss': 0.8803, 'learning_rate': 5.542283803153368e-06, 'epoch': 0.71}


 36%|███▌      | 235/656 [00:25<00:46,  9.02it/s]

{'loss': 0.8453, 'learning_rate': 5.566172957477306e-06, 'epoch': 0.71}
{'loss': 0.877, 'learning_rate': 5.590062111801243e-06, 'epoch': 0.71}


 36%|███▌      | 237/656 [00:25<00:46,  8.97it/s]

{'loss': 0.9354, 'learning_rate': 5.61395126612518e-06, 'epoch': 0.72}
{'loss': 0.7843, 'learning_rate': 5.6378404204491165e-06, 'epoch': 0.72}


 36%|███▋      | 238/656 [00:25<00:45,  9.12it/s]

{'loss': 0.7139, 'learning_rate': 5.661729574773053e-06, 'epoch': 0.72}
{'loss': 0.9736, 'learning_rate': 5.68561872909699e-06, 'epoch': 0.73}


 37%|███▋      | 240/656 [00:26<00:49,  8.36it/s]

{'loss': 0.9235, 'learning_rate': 5.709507883420927e-06, 'epoch': 0.73}
{'loss': 0.7926, 'learning_rate': 5.733397037744864e-06, 'epoch': 0.73}


 37%|███▋      | 242/656 [00:26<00:51,  8.09it/s]

{'loss': 0.8072, 'learning_rate': 5.757286192068801e-06, 'epoch': 0.73}
{'loss': 0.9472, 'learning_rate': 5.781175346392738e-06, 'epoch': 0.74}


 37%|███▋      | 245/656 [00:26<00:45,  9.03it/s]

{'loss': 0.7191, 'learning_rate': 5.805064500716675e-06, 'epoch': 0.74}
{'loss': 0.8121, 'learning_rate': 5.8289536550406116e-06, 'epoch': 0.74}


 38%|███▊      | 247/656 [00:26<00:45,  9.02it/s]

{'loss': 0.8116, 'learning_rate': 5.852842809364549e-06, 'epoch': 0.75}
{'loss': 0.936, 'learning_rate': 5.876731963688486e-06, 'epoch': 0.75}


 38%|███▊      | 248/656 [00:27<00:47,  8.68it/s]

{'loss': 0.7821, 'learning_rate': 5.900621118012423e-06, 'epoch': 0.75}
{'loss': 0.8738, 'learning_rate': 5.924510272336359e-06, 'epoch': 0.76}


 38%|███▊      | 250/656 [00:27<00:46,  8.67it/s]

{'loss': 0.7646, 'learning_rate': 5.948399426660296e-06, 'epoch': 0.76}
{'loss': 0.6275, 'learning_rate': 5.972288580984233e-06, 'epoch': 0.76}


 38%|███▊      | 252/656 [00:27<00:48,  8.28it/s]

{'loss': 0.7988, 'learning_rate': 5.996177735308171e-06, 'epoch': 0.77}
{'loss': 0.7549, 'learning_rate': 6.0200668896321075e-06, 'epoch': 0.77}


 39%|███▊      | 254/656 [00:27<00:46,  8.71it/s]

{'loss': 0.6499, 'learning_rate': 6.043956043956044e-06, 'epoch': 0.77}
{'loss': 0.6524, 'learning_rate': 6.067845198279981e-06, 'epoch': 0.77}


 39%|███▉      | 257/656 [00:28<00:42,  9.34it/s]

{'loss': 0.6578, 'learning_rate': 6.091734352603918e-06, 'epoch': 0.78}
{'loss': 0.7045, 'learning_rate': 6.115623506927855e-06, 'epoch': 0.78}


 39%|███▉      | 258/656 [00:28<00:44,  8.86it/s]

{'loss': 0.8461, 'learning_rate': 6.139512661251792e-06, 'epoch': 0.78}
{'loss': 0.7144, 'learning_rate': 6.163401815575729e-06, 'epoch': 0.79}


 40%|███▉      | 261/656 [00:28<00:41,  9.50it/s]

{'loss': 0.8192, 'learning_rate': 6.187290969899666e-06, 'epoch': 0.79}
{'loss': 0.7231, 'learning_rate': 6.2111801242236025e-06, 'epoch': 0.79}


 40%|████      | 263/656 [00:28<00:43,  9.02it/s]

{'loss': 0.6767, 'learning_rate': 6.2350692785475394e-06, 'epoch': 0.8}
{'loss': 0.6049, 'learning_rate': 6.258958432871477e-06, 'epoch': 0.8}


 40%|████      | 264/656 [00:28<00:43,  9.00it/s]

{'loss': 0.5289, 'learning_rate': 6.282847587195413e-06, 'epoch': 0.8}
{'loss': 0.7546, 'learning_rate': 6.306736741519351e-06, 'epoch': 0.8}


 41%|████      | 266/656 [00:29<00:44,  8.68it/s]

{'loss': 0.5536, 'learning_rate': 6.330625895843287e-06, 'epoch': 0.81}
{'loss': 0.6977, 'learning_rate': 6.354515050167225e-06, 'epoch': 0.81}


 41%|████      | 269/656 [00:29<00:45,  8.42it/s]

{'loss': 0.6618, 'learning_rate': 6.378404204491162e-06, 'epoch': 0.81}
{'loss': 0.5477, 'learning_rate': 6.402293358815098e-06, 'epoch': 0.82}


 41%|████      | 270/656 [00:29<00:44,  8.71it/s]

{'loss': 0.618, 'learning_rate': 6.426182513139035e-06, 'epoch': 0.82}
{'loss': 0.5348, 'learning_rate': 6.450071667462971e-06, 'epoch': 0.82}


 42%|████▏     | 273/656 [00:29<00:41,  9.21it/s]

{'loss': 0.5572, 'learning_rate': 6.473960821786909e-06, 'epoch': 0.83}
{'loss': 0.561, 'learning_rate': 6.497849976110847e-06, 'epoch': 0.83}


 42%|████▏     | 275/656 [00:30<00:41,  9.16it/s]

{'loss': 0.5105, 'learning_rate': 6.521739130434783e-06, 'epoch': 0.83}
{'loss': 0.5904, 'learning_rate': 6.545628284758721e-06, 'epoch': 0.84}


 42%|████▏     | 276/656 [00:30<00:40,  9.33it/s]

{'loss': 0.6076, 'learning_rate': 6.569517439082657e-06, 'epoch': 0.84}
{'loss': 0.4004, 'learning_rate': 6.5934065934065935e-06, 'epoch': 0.84}


 42%|████▏     | 278/656 [00:30<00:39,  9.54it/s]

{'loss': 0.585, 'learning_rate': 6.61729574773053e-06, 'epoch': 0.84}
{'loss': 0.5642, 'learning_rate': 6.641184902054467e-06, 'epoch': 0.85}


 43%|████▎     | 281/656 [00:30<00:39,  9.58it/s]

{'loss': 0.5159, 'learning_rate': 6.665074056378405e-06, 'epoch': 0.85}
{'loss': 0.5557, 'learning_rate': 6.688963210702341e-06, 'epoch': 0.85}


 43%|████▎     | 282/656 [00:30<00:41,  9.00it/s]

{'loss': 0.543, 'learning_rate': 6.712852365026279e-06, 'epoch': 0.86}
{'loss': 0.5917, 'learning_rate': 6.736741519350215e-06, 'epoch': 0.86}


 43%|████▎     | 284/656 [00:31<00:40,  9.19it/s]

{'loss': 0.4902, 'learning_rate': 6.7606306736741526e-06, 'epoch': 0.86}
{'loss': 0.5242, 'learning_rate': 6.7845198279980895e-06, 'epoch': 0.87}


 44%|████▍     | 287/656 [00:31<00:39,  9.38it/s]

{'loss': 0.4805, 'learning_rate': 6.808408982322026e-06, 'epoch': 0.87}
{'loss': 0.4684, 'learning_rate': 6.832298136645963e-06, 'epoch': 0.87}


 44%|████▍     | 289/656 [00:31<00:38,  9.63it/s]

{'loss': 0.4764, 'learning_rate': 6.856187290969899e-06, 'epoch': 0.88}
{'loss': 0.475, 'learning_rate': 6.880076445293837e-06, 'epoch': 0.88}


 44%|████▍     | 290/656 [00:31<00:39,  9.37it/s]

{'loss': 0.4907, 'learning_rate': 6.903965599617773e-06, 'epoch': 0.88}
{'loss': 0.5197, 'learning_rate': 6.927854753941711e-06, 'epoch': 0.88}


 45%|████▍     | 292/656 [00:31<00:39,  9.30it/s]

{'loss': 0.5681, 'learning_rate': 6.9517439082656485e-06, 'epoch': 0.89}
{'loss': 0.4408, 'learning_rate': 6.9756330625895845e-06, 'epoch': 0.89}


 45%|████▍     | 294/656 [00:32<00:41,  8.81it/s]

{'loss': 0.4852, 'learning_rate': 6.999522216913522e-06, 'epoch': 0.89}
{'loss': 0.5004, 'learning_rate': 7.023411371237458e-06, 'epoch': 0.9}


 45%|████▌     | 296/656 [00:32<00:41,  8.73it/s]

{'loss': 0.4978, 'learning_rate': 7.047300525561395e-06, 'epoch': 0.9}
{'loss': 0.4542, 'learning_rate': 7.071189679885333e-06, 'epoch': 0.9}


 45%|████▌     | 298/656 [00:32<00:39,  9.07it/s]

{'loss': 0.4582, 'learning_rate': 7.095078834209269e-06, 'epoch': 0.91}
{'loss': 0.5422, 'learning_rate': 7.118967988533207e-06, 'epoch': 0.91}


 46%|████▌     | 300/656 [00:32<00:37,  9.39it/s]

{'loss': 0.3877, 'learning_rate': 7.142857142857143e-06, 'epoch': 0.91}
{'loss': 0.4086, 'learning_rate': 7.1667462971810804e-06, 'epoch': 0.91}


 46%|████▌     | 302/656 [00:33<00:38,  9.21it/s]

{'loss': 0.5334, 'learning_rate': 7.1906354515050165e-06, 'epoch': 0.92}
{'loss': 0.409, 'learning_rate': 7.214524605828954e-06, 'epoch': 0.92}


 46%|████▋     | 305/656 [00:33<00:38,  9.06it/s]

{'loss': 0.3972, 'learning_rate': 7.238413760152891e-06, 'epoch': 0.92}
{'loss': 0.4754, 'learning_rate': 7.262302914476828e-06, 'epoch': 0.93}


 47%|████▋     | 306/656 [00:33<00:38,  9.05it/s]

{'loss': 0.3222, 'learning_rate': 7.286192068800765e-06, 'epoch': 0.93}
{'loss': 0.3494, 'learning_rate': 7.310081223124701e-06, 'epoch': 0.93}


 47%|████▋     | 308/656 [00:33<00:40,  8.60it/s]

{'loss': 0.4136, 'learning_rate': 7.333970377448639e-06, 'epoch': 0.94}
{'loss': 0.385, 'learning_rate': 7.357859531772576e-06, 'epoch': 0.94}


 47%|████▋     | 310/656 [00:34<00:39,  8.72it/s]

{'loss': 0.3619, 'learning_rate': 7.381748686096512e-06, 'epoch': 0.94}
{'loss': 0.3704, 'learning_rate': 7.40563784042045e-06, 'epoch': 0.95}


 48%|████▊     | 312/656 [00:34<00:39,  8.68it/s]

{'loss': 0.4076, 'learning_rate': 7.429526994744386e-06, 'epoch': 0.95}
{'loss': 0.3771, 'learning_rate': 7.453416149068324e-06, 'epoch': 0.95}


 48%|████▊     | 314/656 [00:34<00:39,  8.65it/s]

{'loss': 0.267, 'learning_rate': 7.47730530339226e-06, 'epoch': 0.95}
{'loss': 0.3422, 'learning_rate': 7.501194457716197e-06, 'epoch': 0.96}


 48%|████▊     | 316/656 [00:34<00:39,  8.71it/s]

{'loss': 0.3007, 'learning_rate': 7.5250836120401346e-06, 'epoch': 0.96}
{'loss': 0.3206, 'learning_rate': 7.548972766364071e-06, 'epoch': 0.96}


 49%|████▊     | 319/656 [00:34<00:36,  9.17it/s]

{'loss': 0.3344, 'learning_rate': 7.572861920688008e-06, 'epoch': 0.97}
{'loss': 0.4749, 'learning_rate': 7.596751075011944e-06, 'epoch': 0.97}


 49%|████▉     | 321/656 [00:35<00:37,  8.93it/s]

{'loss': 0.2868, 'learning_rate': 7.620640229335882e-06, 'epoch': 0.97}
{'loss': 0.3336, 'learning_rate': 7.64452938365982e-06, 'epoch': 0.98}


 49%|████▉     | 323/656 [00:35<00:37,  9.00it/s]

{'loss': 0.3806, 'learning_rate': 7.668418537983756e-06, 'epoch': 0.98}
{'loss': 0.3628, 'learning_rate': 7.692307692307694e-06, 'epoch': 0.98}


 49%|████▉     | 324/656 [00:35<00:39,  8.51it/s]

{'loss': 0.3891, 'learning_rate': 7.71619684663163e-06, 'epoch': 0.98}
{'loss': 0.3342, 'learning_rate': 7.740086000955567e-06, 'epoch': 0.99}


 50%|████▉     | 326/656 [00:35<00:38,  8.50it/s]

{'loss': 0.3843, 'learning_rate': 7.763975155279503e-06, 'epoch': 0.99}
{'loss': 0.3193, 'learning_rate': 7.787864309603441e-06, 'epoch': 0.99}


 50%|█████     | 328/656 [00:35<00:31, 10.28it/s]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, tokens, utterance. If token_labels, tokens, utterance are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'loss': 0.3105, 'learning_rate': 7.811753463927377e-06, 'epoch': 1.0}
{'loss': 0.371, 'learning_rate': 7.835642618251313e-06, 'epoch': 1.0}



 50%|█████     | 328/656 [00:38<00:31, 10.28it/s]Saving model checkpoint to ./snips_clf/results\checkpoint-328
Configuration saved in ./snips_clf/results\checkpoint-328\config.json


{'eval_loss': 0.2652447819709778, 'eval_accuracy': 0.9709591134887275, 'eval_runtime': 2.5287, 'eval_samples_per_second': 1034.932, 'eval_steps_per_second': 32.428, 'epoch': 1.0}


Model weights saved in ./snips_clf/results\checkpoint-328\pytorch_model.bin
 50%|█████     | 330/656 [00:44<09:16,  1.71s/it]

{'loss': 0.2604, 'learning_rate': 7.859531772575251e-06, 'epoch': 1.0}
{'loss': 0.3013, 'learning_rate': 7.883420926899187e-06, 'epoch': 1.01}


 51%|█████     | 332/656 [00:44<05:44,  1.06s/it]

{'loss': 0.389, 'learning_rate': 7.907310081223125e-06, 'epoch': 1.01}
{'loss': 0.3135, 'learning_rate': 7.931199235547062e-06, 'epoch': 1.01}


 51%|█████     | 335/656 [00:44<02:36,  2.05it/s]

{'loss': 0.3878, 'learning_rate': 7.955088389870998e-06, 'epoch': 1.02}
{'loss': 0.2748, 'learning_rate': 7.978977544194936e-06, 'epoch': 1.02}


 51%|█████▏    | 337/656 [00:45<01:41,  3.14it/s]

{'loss': 0.2845, 'learning_rate': 8.002866698518872e-06, 'epoch': 1.02}
{'loss': 0.2607, 'learning_rate': 8.02675585284281e-06, 'epoch': 1.02}


 52%|█████▏    | 338/656 [00:45<01:41,  3.14it/s]

{'loss': 0.2682, 'learning_rate': 8.050645007166746e-06, 'epoch': 1.03}
{'loss': 0.2425, 'learning_rate': 8.074534161490684e-06, 'epoch': 1.03}


 52%|█████▏    | 340/656 [00:45<01:14,  4.21it/s]

{'loss': 0.2835, 'learning_rate': 8.098423315814621e-06, 'epoch': 1.03}
{'loss': 0.4487, 'learning_rate': 8.122312470138558e-06, 'epoch': 1.04}


 52%|█████▏    | 343/656 [00:45<00:50,  6.22it/s]

{'loss': 0.2192, 'learning_rate': 8.146201624462495e-06, 'epoch': 1.04}
{'loss': 0.2482, 'learning_rate': 8.170090778786431e-06, 'epoch': 1.04}


 53%|█████▎    | 345/656 [00:45<00:43,  7.21it/s]

{'loss': 0.2316, 'learning_rate': 8.193979933110369e-06, 'epoch': 1.05}
{'loss': 0.2707, 'learning_rate': 8.217869087434307e-06, 'epoch': 1.05}


 53%|█████▎    | 347/656 [00:46<00:39,  7.84it/s]

{'loss': 0.2727, 'learning_rate': 8.241758241758243e-06, 'epoch': 1.05}
{'loss': 0.208, 'learning_rate': 8.265647396082179e-06, 'epoch': 1.05}


 53%|█████▎    | 348/656 [00:46<00:38,  7.96it/s]

{'loss': 0.4748, 'learning_rate': 8.289536550406115e-06, 'epoch': 1.06}
{'loss': 0.2997, 'learning_rate': 8.313425704730053e-06, 'epoch': 1.06}


 54%|█████▎    | 351/656 [00:46<00:34,  8.83it/s]

{'loss': 0.2212, 'learning_rate': 8.337314859053989e-06, 'epoch': 1.06}
{'loss': 0.2565, 'learning_rate': 8.361204013377926e-06, 'epoch': 1.07}


 54%|█████▎    | 352/656 [00:46<00:34,  8.90it/s]

{'loss': 0.1939, 'learning_rate': 8.385093167701864e-06, 'epoch': 1.07}
{'loss': 0.2293, 'learning_rate': 8.4089823220258e-06, 'epoch': 1.07}


 54%|█████▍    | 355/656 [00:47<00:33,  9.01it/s]

{'loss': 0.1817, 'learning_rate': 8.432871476349738e-06, 'epoch': 1.08}
{'loss': 0.1563, 'learning_rate': 8.456760630673674e-06, 'epoch': 1.08}


 54%|█████▍    | 356/656 [00:47<00:34,  8.62it/s]

{'loss': 0.2951, 'learning_rate': 8.480649784997612e-06, 'epoch': 1.08}
{'loss': 0.1756, 'learning_rate': 8.50453893932155e-06, 'epoch': 1.09}


 55%|█████▍    | 358/656 [00:47<00:32,  9.24it/s]

{'loss': 0.1915, 'learning_rate': 8.528428093645485e-06, 'epoch': 1.09}
{'loss': 0.1722, 'learning_rate': 8.552317247969423e-06, 'epoch': 1.09}


 55%|█████▍    | 360/656 [00:47<00:32,  9.14it/s]

{'loss': 0.2047, 'learning_rate': 8.576206402293359e-06, 'epoch': 1.09}
{'loss': 0.1943, 'learning_rate': 8.600095556617297e-06, 'epoch': 1.1}


 55%|█████▌    | 363/656 [00:47<00:32,  9.08it/s]

{'loss': 0.1807, 'learning_rate': 8.623984710941233e-06, 'epoch': 1.1}
{'loss': 0.1821, 'learning_rate': 8.64787386526517e-06, 'epoch': 1.1}


 56%|█████▌    | 365/656 [00:48<00:32,  9.07it/s]

{'loss': 0.3204, 'learning_rate': 8.671763019589108e-06, 'epoch': 1.11}
{'loss': 0.2215, 'learning_rate': 8.695652173913044e-06, 'epoch': 1.11}


 56%|█████▌    | 366/656 [00:48<00:31,  9.24it/s]

{'loss': 0.1705, 'learning_rate': 8.71954132823698e-06, 'epoch': 1.11}
{'loss': 0.2266, 'learning_rate': 8.743430482560916e-06, 'epoch': 1.12}


 56%|█████▌    | 368/656 [00:48<00:30,  9.37it/s]

{'loss': 0.1796, 'learning_rate': 8.767319636884854e-06, 'epoch': 1.12}
{'loss': 0.2024, 'learning_rate': 8.791208791208792e-06, 'epoch': 1.12}


 57%|█████▋    | 371/656 [00:48<00:30,  9.45it/s]

{'loss': 0.1975, 'learning_rate': 8.815097945532728e-06, 'epoch': 1.12}
{'loss': 0.1812, 'learning_rate': 8.838987099856666e-06, 'epoch': 1.13}


 57%|█████▋    | 373/656 [00:48<00:29,  9.47it/s]

{'loss': 0.3382, 'learning_rate': 8.862876254180602e-06, 'epoch': 1.13}
{'loss': 0.1592, 'learning_rate': 8.88676540850454e-06, 'epoch': 1.13}


 57%|█████▋    | 374/656 [00:49<00:29,  9.54it/s]

{'loss': 0.2277, 'learning_rate': 8.910654562828476e-06, 'epoch': 1.14}
{'loss': 0.2466, 'learning_rate': 8.934543717152413e-06, 'epoch': 1.14}


 57%|█████▋    | 376/656 [00:49<00:30,  9.10it/s]

{'loss': 0.2524, 'learning_rate': 8.958432871476351e-06, 'epoch': 1.14}
{'loss': 0.1181, 'learning_rate': 8.982322025800287e-06, 'epoch': 1.15}


 58%|█████▊    | 378/656 [00:49<00:33,  8.25it/s]

{'loss': 0.1678, 'learning_rate': 9.006211180124225e-06, 'epoch': 1.15}
{'loss': 0.1495, 'learning_rate': 9.03010033444816e-06, 'epoch': 1.15}


 58%|█████▊    | 380/656 [00:49<00:34,  8.07it/s]

{'loss': 0.2356, 'learning_rate': 9.053989488772099e-06, 'epoch': 1.16}
{'loss': 0.1533, 'learning_rate': 9.077878643096036e-06, 'epoch': 1.16}


 58%|█████▊    | 382/656 [00:50<00:34,  8.05it/s]

{'loss': 0.1786, 'learning_rate': 9.101767797419972e-06, 'epoch': 1.16}
{'loss': 0.1569, 'learning_rate': 9.125656951743908e-06, 'epoch': 1.16}


 59%|█████▊    | 384/656 [00:50<00:30,  8.96it/s]

{'loss': 0.1534, 'learning_rate': 9.149546106067846e-06, 'epoch': 1.17}
{'loss': 0.1385, 'learning_rate': 9.173435260391782e-06, 'epoch': 1.17}


 59%|█████▉    | 386/656 [00:50<00:29,  9.13it/s]

{'loss': 0.1666, 'learning_rate': 9.197324414715718e-06, 'epoch': 1.17}
{'loss': 0.1546, 'learning_rate': 9.221213569039656e-06, 'epoch': 1.18}


 59%|█████▉    | 388/656 [00:50<00:30,  8.79it/s]

{'loss': 0.199, 'learning_rate': 9.245102723363594e-06, 'epoch': 1.18}
{'loss': 0.1852, 'learning_rate': 9.26899187768753e-06, 'epoch': 1.18}


 60%|█████▉    | 391/656 [00:51<00:28,  9.26it/s]

{'loss': 0.1474, 'learning_rate': 9.292881032011467e-06, 'epoch': 1.19}
{'loss': 0.154, 'learning_rate': 9.316770186335403e-06, 'epoch': 1.19}


 60%|█████▉    | 393/656 [00:51<00:27,  9.41it/s]

{'loss': 0.1132, 'learning_rate': 9.340659340659341e-06, 'epoch': 1.19}
{'loss': 0.2411, 'learning_rate': 9.364548494983277e-06, 'epoch': 1.2}


 60%|██████    | 394/656 [00:51<00:28,  9.22it/s]

{'loss': 0.2572, 'learning_rate': 9.388437649307215e-06, 'epoch': 1.2}
{'loss': 0.1199, 'learning_rate': 9.412326803631153e-06, 'epoch': 1.2}


 61%|██████    | 397/656 [00:51<00:27,  9.33it/s]

{'loss': 0.1353, 'learning_rate': 9.436215957955089e-06, 'epoch': 1.2}
{'loss': 0.1569, 'learning_rate': 9.460105112279026e-06, 'epoch': 1.21}


 61%|██████    | 398/656 [00:51<00:27,  9.26it/s]

{'loss': 0.2536, 'learning_rate': 9.483994266602962e-06, 'epoch': 1.21}
{'loss': 0.2308, 'learning_rate': 9.5078834209269e-06, 'epoch': 1.21}


 61%|██████    | 401/656 [00:52<00:27,  9.24it/s]

{'loss': 0.1248, 'learning_rate': 9.531772575250838e-06, 'epoch': 1.22}
{'loss': 0.1925, 'learning_rate': 9.555661729574774e-06, 'epoch': 1.22}


 61%|██████▏   | 402/656 [00:52<00:27,  9.07it/s]

{'loss': 0.1138, 'learning_rate': 9.57955088389871e-06, 'epoch': 1.22}
{'loss': 0.2342, 'learning_rate': 9.603440038222648e-06, 'epoch': 1.23}


 62%|██████▏   | 404/656 [00:52<00:28,  8.95it/s]

{'loss': 0.1132, 'learning_rate': 9.627329192546584e-06, 'epoch': 1.23}
{'loss': 0.0966, 'learning_rate': 9.65121834687052e-06, 'epoch': 1.23}


 62%|██████▏   | 407/656 [00:52<00:27,  9.20it/s]

{'loss': 0.1074, 'learning_rate': 9.675107501194458e-06, 'epoch': 1.23}
{'loss': 0.113, 'learning_rate': 9.698996655518395e-06, 'epoch': 1.24}


 62%|██████▏   | 408/656 [00:52<00:27,  9.05it/s]

{'loss': 0.1652, 'learning_rate': 9.722885809842331e-06, 'epoch': 1.24}
{'loss': 0.0992, 'learning_rate': 9.746774964166269e-06, 'epoch': 1.24}


 62%|██████▎   | 410/656 [00:53<00:27,  9.10it/s]

{'loss': 0.2001, 'learning_rate': 9.770664118490205e-06, 'epoch': 1.25}
{'loss': 0.168, 'learning_rate': 9.794553272814143e-06, 'epoch': 1.25}


 63%|██████▎   | 412/656 [00:53<00:26,  9.23it/s]

{'loss': 0.1251, 'learning_rate': 9.81844242713808e-06, 'epoch': 1.25}
{'loss': 0.1308, 'learning_rate': 9.842331581462017e-06, 'epoch': 1.26}


 63%|██████▎   | 414/656 [00:53<00:27,  8.76it/s]

{'loss': 0.1621, 'learning_rate': 9.866220735785954e-06, 'epoch': 1.26}
{'loss': 0.234, 'learning_rate': 9.89010989010989e-06, 'epoch': 1.26}


 64%|██████▎   | 417/656 [00:53<00:26,  8.88it/s]

{'loss': 0.1495, 'learning_rate': 9.913999044433828e-06, 'epoch': 1.27}
{'loss': 0.135, 'learning_rate': 9.937888198757764e-06, 'epoch': 1.27}


 64%|██████▍   | 419/656 [00:54<00:25,  9.35it/s]

{'loss': 0.2454, 'learning_rate': 9.961777353081702e-06, 'epoch': 1.27}
{'loss': 0.1358, 'learning_rate': 9.98566650740564e-06, 'epoch': 1.27}


 64%|██████▍   | 420/656 [00:54<00:26,  8.96it/s]

{'loss': 0.2036, 'learning_rate': 1.0009555661729576e-05, 'epoch': 1.28}
{'loss': 0.2281, 'learning_rate': 1.0033444816053512e-05, 'epoch': 1.28}


 64%|██████▍   | 422/656 [00:54<00:27,  8.54it/s]

{'loss': 0.1377, 'learning_rate': 1.005733397037745e-05, 'epoch': 1.28}
{'loss': 0.1191, 'learning_rate': 1.0081223124701385e-05, 'epoch': 1.29}


 65%|██████▍   | 425/656 [00:54<00:26,  8.80it/s]

{'loss': 0.1247, 'learning_rate': 1.0105112279025323e-05, 'epoch': 1.29}
{'loss': 0.1239, 'learning_rate': 1.0129001433349259e-05, 'epoch': 1.29}


 65%|██████▌   | 427/656 [00:55<00:25,  8.97it/s]

{'loss': 0.1854, 'learning_rate': 1.0152890587673197e-05, 'epoch': 1.3}
{'loss': 0.1509, 'learning_rate': 1.0176779741997133e-05, 'epoch': 1.3}


 65%|██████▌   | 429/656 [00:55<00:24,  9.38it/s]

{'loss': 0.1007, 'learning_rate': 1.020066889632107e-05, 'epoch': 1.3}
{'loss': 0.1286, 'learning_rate': 1.0224558050645007e-05, 'epoch': 1.3}


 66%|██████▌   | 430/656 [00:55<00:26,  8.42it/s]

{'loss': 0.2167, 'learning_rate': 1.0248447204968944e-05, 'epoch': 1.31}
{'loss': 0.0925, 'learning_rate': 1.0272336359292882e-05, 'epoch': 1.31}


 66%|██████▌   | 432/656 [00:55<00:27,  8.24it/s]

{'loss': 0.0756, 'learning_rate': 1.0296225513616818e-05, 'epoch': 1.31}
{'loss': 0.1776, 'learning_rate': 1.0320114667940756e-05, 'epoch': 1.32}


 66%|██████▌   | 434/656 [00:55<00:24,  9.03it/s]

{'loss': 0.1143, 'learning_rate': 1.0344003822264692e-05, 'epoch': 1.32}
{'loss': 0.1152, 'learning_rate': 1.036789297658863e-05, 'epoch': 1.32}


 67%|██████▋   | 437/656 [00:56<00:23,  9.52it/s]

{'loss': 0.0901, 'learning_rate': 1.0391782130912567e-05, 'epoch': 1.33}
{'loss': 0.0933, 'learning_rate': 1.0415671285236503e-05, 'epoch': 1.33}


 67%|██████▋   | 438/656 [00:56<00:23,  9.19it/s]

{'loss': 0.0881, 'learning_rate': 1.0439560439560441e-05, 'epoch': 1.33}
{'loss': 0.1835, 'learning_rate': 1.0463449593884377e-05, 'epoch': 1.34}


 67%|██████▋   | 440/656 [00:56<00:25,  8.60it/s]

{'loss': 0.242, 'learning_rate': 1.0487338748208313e-05, 'epoch': 1.34}
{'loss': 0.0738, 'learning_rate': 1.0511227902532251e-05, 'epoch': 1.34}


 67%|██████▋   | 442/656 [00:56<00:25,  8.43it/s]

{'loss': 0.1321, 'learning_rate': 1.0535117056856187e-05, 'epoch': 1.34}
{'loss': 0.1029, 'learning_rate': 1.0559006211180125e-05, 'epoch': 1.35}


 68%|██████▊   | 445/656 [00:57<00:24,  8.63it/s]

{'loss': 0.093, 'learning_rate': 1.058289536550406e-05, 'epoch': 1.35}
{'loss': 0.1945, 'learning_rate': 1.0606784519827999e-05, 'epoch': 1.35}


 68%|██████▊   | 446/656 [00:57<00:23,  8.99it/s]

{'loss': 0.0788, 'learning_rate': 1.0630673674151935e-05, 'epoch': 1.36}
{'loss': 0.099, 'learning_rate': 1.0654562828475872e-05, 'epoch': 1.36}


 68%|██████▊   | 449/656 [00:57<00:21,  9.44it/s]

{'loss': 0.1167, 'learning_rate': 1.067845198279981e-05, 'epoch': 1.36}
{'loss': 0.0681, 'learning_rate': 1.0702341137123746e-05, 'epoch': 1.37}


 69%|██████▉   | 451/656 [00:57<00:21,  9.41it/s]

{'loss': 0.066, 'learning_rate': 1.0726230291447684e-05, 'epoch': 1.37}
{'loss': 0.1391, 'learning_rate': 1.075011944577162e-05, 'epoch': 1.37}


 69%|██████▉   | 453/656 [00:57<00:20,  9.67it/s]

{'loss': 0.0997, 'learning_rate': 1.0774008600095558e-05, 'epoch': 1.38}
{'loss': 0.2157, 'learning_rate': 1.0797897754419494e-05, 'epoch': 1.38}


 69%|██████▉   | 454/656 [00:58<00:22,  8.82it/s]

{'loss': 0.1249, 'learning_rate': 1.0821786908743431e-05, 'epoch': 1.38}
{'loss': 0.1265, 'learning_rate': 1.0845676063067369e-05, 'epoch': 1.38}


 70%|██████▉   | 456/656 [00:58<00:24,  8.32it/s]

{'loss': 0.1728, 'learning_rate': 1.0869565217391305e-05, 'epoch': 1.39}
{'loss': 0.0959, 'learning_rate': 1.0893454371715243e-05, 'epoch': 1.39}


 70%|██████▉   | 458/656 [00:58<00:24,  8.15it/s]

{'loss': 0.0739, 'learning_rate': 1.0917343526039179e-05, 'epoch': 1.39}
{'loss': 0.1238, 'learning_rate': 1.0941232680363115e-05, 'epoch': 1.4}


 70%|███████   | 460/656 [00:58<00:23,  8.18it/s]

{'loss': 0.0637, 'learning_rate': 1.0965121834687053e-05, 'epoch': 1.4}
{'loss': 0.0745, 'learning_rate': 1.0989010989010989e-05, 'epoch': 1.4}


 71%|███████   | 463/656 [00:59<00:21,  8.83it/s]

{'loss': 0.1662, 'learning_rate': 1.1012900143334926e-05, 'epoch': 1.41}
{'loss': 0.087, 'learning_rate': 1.1036789297658862e-05, 'epoch': 1.41}


 71%|███████   | 464/656 [00:59<00:21,  8.84it/s]

{'loss': 0.071, 'learning_rate': 1.10606784519828e-05, 'epoch': 1.41}
{'loss': 0.0597, 'learning_rate': 1.1084567606306736e-05, 'epoch': 1.41}


 71%|███████   | 467/656 [00:59<00:21,  8.84it/s]

{'loss': 0.1305, 'learning_rate': 1.1108456760630674e-05, 'epoch': 1.42}
{'loss': 0.0708, 'learning_rate': 1.1132345914954612e-05, 'epoch': 1.42}


 71%|███████▏  | 468/656 [00:59<00:22,  8.52it/s]

{'loss': 0.2166, 'learning_rate': 1.1156235069278548e-05, 'epoch': 1.42}
{'loss': 0.1265, 'learning_rate': 1.1180124223602485e-05, 'epoch': 1.43}


 72%|███████▏  | 470/656 [01:00<00:21,  8.83it/s]

{'loss': 0.1665, 'learning_rate': 1.1204013377926421e-05, 'epoch': 1.43}
{'loss': 0.158, 'learning_rate': 1.122790253225036e-05, 'epoch': 1.43}


 72%|███████▏  | 472/656 [01:00<00:19,  9.48it/s]

{'loss': 0.0537, 'learning_rate': 1.1251791686574297e-05, 'epoch': 1.44}
{'loss': 0.0576, 'learning_rate': 1.1275680840898233e-05, 'epoch': 1.44}


 72%|███████▏  | 475/656 [01:00<00:18,  9.56it/s]

{'loss': 0.1469, 'learning_rate': 1.129956999522217e-05, 'epoch': 1.44}
{'loss': 0.0517, 'learning_rate': 1.1323459149546107e-05, 'epoch': 1.45}


 73%|███████▎  | 477/656 [01:00<00:20,  8.95it/s]

{'loss': 0.0647, 'learning_rate': 1.1347348303870044e-05, 'epoch': 1.45}
{'loss': 0.0622, 'learning_rate': 1.137123745819398e-05, 'epoch': 1.45}


 73%|███████▎  | 478/656 [01:00<00:19,  9.07it/s]

{'loss': 0.0733, 'learning_rate': 1.1395126612517917e-05, 'epoch': 1.45}
{'loss': 0.0537, 'learning_rate': 1.1419015766841854e-05, 'epoch': 1.46}


 73%|███████▎  | 481/656 [01:01<00:18,  9.56it/s]

{'loss': 0.1211, 'learning_rate': 1.144290492116579e-05, 'epoch': 1.46}
{'loss': 0.1072, 'learning_rate': 1.1466794075489728e-05, 'epoch': 1.46}


 73%|███████▎  | 482/656 [01:01<00:18,  9.66it/s]

{'loss': 0.0538, 'learning_rate': 1.1490683229813664e-05, 'epoch': 1.47}
{'loss': 0.1574, 'learning_rate': 1.1514572384137602e-05, 'epoch': 1.47}


 74%|███████▍  | 484/656 [01:01<00:18,  9.32it/s]

{'loss': 0.0591, 'learning_rate': 1.153846153846154e-05, 'epoch': 1.47}
{'loss': 0.0547, 'learning_rate': 1.1562350692785476e-05, 'epoch': 1.48}


 74%|███████▍  | 486/656 [01:01<00:19,  8.87it/s]

{'loss': 0.059, 'learning_rate': 1.1586239847109413e-05, 'epoch': 1.48}
{'loss': 0.057, 'learning_rate': 1.161012900143335e-05, 'epoch': 1.48}


 75%|███████▍  | 489/656 [01:02<00:18,  9.02it/s]

{'loss': 0.0537, 'learning_rate': 1.1634018155757287e-05, 'epoch': 1.48}
{'loss': 0.0713, 'learning_rate': 1.1657907310081223e-05, 'epoch': 1.49}


 75%|███████▍  | 491/656 [01:02<00:17,  9.39it/s]

{'loss': 0.052, 'learning_rate': 1.168179646440516e-05, 'epoch': 1.49}
{'loss': 0.122, 'learning_rate': 1.1705685618729099e-05, 'epoch': 1.49}


 75%|███████▌  | 493/656 [01:02<00:17,  9.28it/s]

{'loss': 0.1216, 'learning_rate': 1.1729574773053035e-05, 'epoch': 1.5}
{'loss': 0.0542, 'learning_rate': 1.1753463927376972e-05, 'epoch': 1.5}


 75%|███████▌  | 494/656 [01:02<00:18,  8.89it/s]

{'loss': 0.1785, 'learning_rate': 1.1777353081700908e-05, 'epoch': 1.5}
{'loss': 0.13, 'learning_rate': 1.1801242236024846e-05, 'epoch': 1.51}


 76%|███████▌  | 497/656 [01:02<00:17,  8.94it/s]

{'loss': 0.0668, 'learning_rate': 1.1825131390348782e-05, 'epoch': 1.51}
{'loss': 0.0479, 'learning_rate': 1.1849020544672718e-05, 'epoch': 1.51}


 76%|███████▌  | 498/656 [01:03<00:17,  9.16it/s]

{'loss': 0.2249, 'learning_rate': 1.1872909698996656e-05, 'epoch': 1.52}
{'loss': 0.0493, 'learning_rate': 1.1896798853320592e-05, 'epoch': 1.52}


 76%|███████▌  | 500/656 [01:03<00:16,  9.26it/s]

{'loss': 0.0407, 'learning_rate': 1.192068800764453e-05, 'epoch': 1.52}
{'loss': 0.0437, 'learning_rate': 1.1944577161968466e-05, 'epoch': 1.52}


 77%|███████▋  | 503/656 [01:03<00:16,  9.42it/s]

{'loss': 0.0429, 'learning_rate': 1.1968466316292403e-05, 'epoch': 1.53}
{'loss': 0.0525, 'learning_rate': 1.1992355470616341e-05, 'epoch': 1.53}


 77%|███████▋  | 505/656 [01:03<00:16,  9.21it/s]

{'loss': 0.0791, 'learning_rate': 1.2016244624940277e-05, 'epoch': 1.53}
{'loss': 0.2912, 'learning_rate': 1.2040133779264215e-05, 'epoch': 1.54}


 77%|███████▋  | 506/656 [01:03<00:16,  9.09it/s]

{'loss': 0.0439, 'learning_rate': 1.2064022933588151e-05, 'epoch': 1.54}
{'loss': 0.0785, 'learning_rate': 1.2087912087912089e-05, 'epoch': 1.54}


 78%|███████▊  | 509/656 [01:04<00:16,  8.76it/s]

{'loss': 0.1932, 'learning_rate': 1.2111801242236026e-05, 'epoch': 1.55}
{'loss': 0.0447, 'learning_rate': 1.2135690396559962e-05, 'epoch': 1.55}


 78%|███████▊  | 511/656 [01:04<00:15,  9.17it/s]

{'loss': 0.0458, 'learning_rate': 1.21595795508839e-05, 'epoch': 1.55}
{'loss': 0.0549, 'learning_rate': 1.2183468705207836e-05, 'epoch': 1.55}


 78%|███████▊  | 513/656 [01:04<00:15,  9.26it/s]

{'loss': 0.1159, 'learning_rate': 1.2207357859531774e-05, 'epoch': 1.56}
{'loss': 0.0636, 'learning_rate': 1.223124701385571e-05, 'epoch': 1.56}


 78%|███████▊  | 514/656 [01:04<00:15,  9.15it/s]

{'loss': 0.0459, 'learning_rate': 1.2255136168179648e-05, 'epoch': 1.56}
{'loss': 0.0381, 'learning_rate': 1.2279025322503584e-05, 'epoch': 1.57}


 79%|███████▉  | 517/656 [01:05<00:14,  9.61it/s]

{'loss': 0.0387, 'learning_rate': 1.230291447682752e-05, 'epoch': 1.57}
{'loss': 0.0762, 'learning_rate': 1.2326803631151458e-05, 'epoch': 1.57}


 79%|███████▉  | 519/656 [01:05<00:14,  9.28it/s]

{'loss': 0.1662, 'learning_rate': 1.2350692785475394e-05, 'epoch': 1.58}
{'loss': 0.0413, 'learning_rate': 1.2374581939799331e-05, 'epoch': 1.58}


 79%|███████▉  | 521/656 [01:05<00:14,  9.19it/s]

{'loss': 0.0551, 'learning_rate': 1.2398471094123269e-05, 'epoch': 1.58}
{'loss': 0.1736, 'learning_rate': 1.2422360248447205e-05, 'epoch': 1.59}


 80%|███████▉  | 522/656 [01:05<00:14,  8.98it/s]

{'loss': 0.0437, 'learning_rate': 1.2446249402771143e-05, 'epoch': 1.59}
{'loss': 0.1354, 'learning_rate': 1.2470138557095079e-05, 'epoch': 1.59}


 80%|███████▉  | 524/656 [01:05<00:14,  9.24it/s]

{'loss': 0.1002, 'learning_rate': 1.2494027711419017e-05, 'epoch': 1.59}
{'loss': 0.0763, 'learning_rate': 1.2517916865742954e-05, 'epoch': 1.6}


 80%|████████  | 527/656 [01:06<00:14,  9.02it/s]

{'loss': 0.0557, 'learning_rate': 1.254180602006689e-05, 'epoch': 1.6}
{'loss': 0.0355, 'learning_rate': 1.2565695174390826e-05, 'epoch': 1.6}


 81%|████████  | 529/656 [01:06<00:13,  9.53it/s]

{'loss': 0.0487, 'learning_rate': 1.2589584328714766e-05, 'epoch': 1.61}
{'loss': 0.1778, 'learning_rate': 1.2613473483038702e-05, 'epoch': 1.61}


 81%|████████  | 530/656 [01:06<00:13,  9.32it/s]

{'loss': 0.1756, 'learning_rate': 1.2637362637362638e-05, 'epoch': 1.61}
{'loss': 0.063, 'learning_rate': 1.2661251791686574e-05, 'epoch': 1.62}


 81%|████████  | 532/656 [01:06<00:13,  8.96it/s]

{'loss': 0.0429, 'learning_rate': 1.2685140946010512e-05, 'epoch': 1.62}
{'loss': 0.0614, 'learning_rate': 1.270903010033445e-05, 'epoch': 1.62}


 81%|████████▏ | 534/656 [01:07<00:14,  8.58it/s]

{'loss': 0.0325, 'learning_rate': 1.2732919254658385e-05, 'epoch': 1.62}
{'loss': 0.0611, 'learning_rate': 1.2756808408982323e-05, 'epoch': 1.63}


 82%|████████▏ | 537/656 [01:07<00:12,  9.45it/s]

{'loss': 0.077, 'learning_rate': 1.278069756330626e-05, 'epoch': 1.63}
{'loss': 0.0482, 'learning_rate': 1.2804586717630195e-05, 'epoch': 1.63}


 82%|████████▏ | 538/656 [01:07<00:12,  9.58it/s]

{'loss': 0.0336, 'learning_rate': 1.2828475871954135e-05, 'epoch': 1.64}
{'loss': 0.0394, 'learning_rate': 1.285236502627807e-05, 'epoch': 1.64}


 82%|████████▏ | 541/656 [01:07<00:11,  9.60it/s]

{'loss': 0.0327, 'learning_rate': 1.2876254180602007e-05, 'epoch': 1.64}
{'loss': 0.0984, 'learning_rate': 1.2900143334925943e-05, 'epoch': 1.65}


 83%|████████▎ | 543/656 [01:07<00:11,  9.57it/s]

{'loss': 0.1718, 'learning_rate': 1.2924032489249882e-05, 'epoch': 1.65}
{'loss': 0.0323, 'learning_rate': 1.2947921643573818e-05, 'epoch': 1.65}


 83%|████████▎ | 545/656 [01:08<00:11,  9.46it/s]

{'loss': 0.0504, 'learning_rate': 1.2971810797897754e-05, 'epoch': 1.66}
{'loss': 0.0376, 'learning_rate': 1.2995699952221694e-05, 'epoch': 1.66}


 83%|████████▎ | 546/656 [01:08<00:12,  9.13it/s]

{'loss': 0.0367, 'learning_rate': 1.301958910654563e-05, 'epoch': 1.66}
{'loss': 0.0861, 'learning_rate': 1.3043478260869566e-05, 'epoch': 1.66}


 84%|████████▎ | 549/656 [01:08<00:11,  9.67it/s]

{'loss': 0.0372, 'learning_rate': 1.3067367415193502e-05, 'epoch': 1.67}
{'loss': 0.1457, 'learning_rate': 1.3091256569517441e-05, 'epoch': 1.67}


 84%|████████▍ | 550/656 [01:08<00:11,  9.50it/s]

{'loss': 0.0494, 'learning_rate': 1.3115145723841377e-05, 'epoch': 1.67}
{'loss': 0.1438, 'learning_rate': 1.3139034878165313e-05, 'epoch': 1.68}


 84%|████████▍ | 553/656 [01:08<00:11,  9.02it/s]

{'loss': 0.0318, 'learning_rate': 1.3162924032489251e-05, 'epoch': 1.68}
{'loss': 0.0315, 'learning_rate': 1.3186813186813187e-05, 'epoch': 1.68}


 85%|████████▍ | 555/656 [01:09<00:10,  9.26it/s]

{'loss': 0.2507, 'learning_rate': 1.3210702341137123e-05, 'epoch': 1.69}
{'loss': 0.0909, 'learning_rate': 1.323459149546106e-05, 'epoch': 1.69}


 85%|████████▍ | 556/656 [01:09<00:11,  8.80it/s]

{'loss': 0.1817, 'learning_rate': 1.3258480649784999e-05, 'epoch': 1.69}
{'loss': 0.0344, 'learning_rate': 1.3282369804108935e-05, 'epoch': 1.7}


 85%|████████▌ | 558/656 [01:09<00:11,  8.25it/s]

{'loss': 0.0567, 'learning_rate': 1.330625895843287e-05, 'epoch': 1.7}
{'loss': 0.1352, 'learning_rate': 1.333014811275681e-05, 'epoch': 1.7}


 86%|████████▌ | 561/656 [01:09<00:10,  8.73it/s]

{'loss': 0.0327, 'learning_rate': 1.3354037267080746e-05, 'epoch': 1.7}
{'loss': 0.1758, 'learning_rate': 1.3377926421404682e-05, 'epoch': 1.71}


 86%|████████▌ | 562/656 [01:10<00:11,  8.39it/s]

{'loss': 0.1317, 'learning_rate': 1.3401815575728622e-05, 'epoch': 1.71}
{'loss': 0.2654, 'learning_rate': 1.3425704730052558e-05, 'epoch': 1.71}


 86%|████████▌ | 564/656 [01:10<00:10,  8.61it/s]

{'loss': 0.1136, 'learning_rate': 1.3449593884376494e-05, 'epoch': 1.72}
{'loss': 0.0669, 'learning_rate': 1.347348303870043e-05, 'epoch': 1.72}


 86%|████████▋ | 566/656 [01:10<00:10,  8.72it/s]

{'loss': 0.0395, 'learning_rate': 1.3497372193024369e-05, 'epoch': 1.72}
{'loss': 0.0399, 'learning_rate': 1.3521261347348305e-05, 'epoch': 1.73}


 87%|████████▋ | 568/656 [01:10<00:10,  8.48it/s]

{'loss': 0.0366, 'learning_rate': 1.3545150501672241e-05, 'epoch': 1.73}
{'loss': 0.174, 'learning_rate': 1.3569039655996179e-05, 'epoch': 1.73}


 87%|████████▋ | 571/656 [01:11<00:08,  9.46it/s]

{'loss': 0.1136, 'learning_rate': 1.3592928810320115e-05, 'epoch': 1.73}
{'loss': 0.2025, 'learning_rate': 1.3616817964644053e-05, 'epoch': 1.74}


 87%|████████▋ | 573/656 [01:11<00:08,  9.70it/s]

{'loss': 0.0285, 'learning_rate': 1.3640707118967989e-05, 'epoch': 1.74}
{'loss': 0.0328, 'learning_rate': 1.3664596273291926e-05, 'epoch': 1.74}


 88%|████████▊ | 575/656 [01:11<00:08,  9.78it/s]

{'loss': 0.0381, 'learning_rate': 1.3688485427615862e-05, 'epoch': 1.75}
{'loss': 0.0729, 'learning_rate': 1.3712374581939799e-05, 'epoch': 1.75}


 88%|████████▊ | 577/656 [01:11<00:08,  9.62it/s]

{'loss': 0.0803, 'learning_rate': 1.3736263736263738e-05, 'epoch': 1.75}
{'loss': 0.0292, 'learning_rate': 1.3760152890587674e-05, 'epoch': 1.76}


 88%|████████▊ | 579/656 [01:11<00:07,  9.93it/s]

{'loss': 0.0523, 'learning_rate': 1.378404204491161e-05, 'epoch': 1.76}
{'loss': 0.0274, 'learning_rate': 1.3807931199235546e-05, 'epoch': 1.76}


 88%|████████▊ | 580/656 [01:11<00:07,  9.90it/s]

{'loss': 0.0759, 'learning_rate': 1.3831820353559485e-05, 'epoch': 1.77}
{'loss': 0.0281, 'learning_rate': 1.3855709507883422e-05, 'epoch': 1.77}


 89%|████████▉ | 583/656 [01:12<00:07,  9.59it/s]

{'loss': 0.0279, 'learning_rate': 1.3879598662207358e-05, 'epoch': 1.77}
{'loss': 0.0294, 'learning_rate': 1.3903487816531297e-05, 'epoch': 1.77}


 89%|████████▉ | 584/656 [01:12<00:07,  9.11it/s]

{'loss': 0.1562, 'learning_rate': 1.3927376970855233e-05, 'epoch': 1.78}
{'loss': 0.0496, 'learning_rate': 1.3951266125179169e-05, 'epoch': 1.78}


 89%|████████▉ | 586/656 [01:12<00:07,  9.19it/s]

{'loss': 0.1913, 'learning_rate': 1.3975155279503105e-05, 'epoch': 1.78}
{'loss': 0.0439, 'learning_rate': 1.3999044433827045e-05, 'epoch': 1.79}


 90%|████████▉ | 589/656 [01:12<00:07,  9.40it/s]

{'loss': 0.0829, 'learning_rate': 1.402293358815098e-05, 'epoch': 1.79}
{'loss': 0.1877, 'learning_rate': 1.4046822742474917e-05, 'epoch': 1.79}


 90%|█████████ | 591/656 [01:13<00:07,  9.23it/s]

{'loss': 0.1483, 'learning_rate': 1.4070711896798854e-05, 'epoch': 1.8}
{'loss': 0.0979, 'learning_rate': 1.409460105112279e-05, 'epoch': 1.8}


 90%|█████████ | 592/656 [01:13<00:07,  9.14it/s]

{'loss': 0.065, 'learning_rate': 1.4118490205446726e-05, 'epoch': 1.8}
{'loss': 0.1875, 'learning_rate': 1.4142379359770666e-05, 'epoch': 1.8}


 91%|█████████ | 594/656 [01:13<00:06,  8.91it/s]

{'loss': 0.0302, 'learning_rate': 1.4166268514094602e-05, 'epoch': 1.81}
{'loss': 0.1143, 'learning_rate': 1.4190157668418538e-05, 'epoch': 1.81}


 91%|█████████ | 596/656 [01:13<00:07,  8.38it/s]

{'loss': 0.0833, 'learning_rate': 1.4214046822742474e-05, 'epoch': 1.81}
{'loss': 0.0236, 'learning_rate': 1.4237935977066413e-05, 'epoch': 1.82}


 91%|█████████ | 598/656 [01:13<00:06,  8.59it/s]

{'loss': 0.023, 'learning_rate': 1.426182513139035e-05, 'epoch': 1.82}
{'loss': 0.0245, 'learning_rate': 1.4285714285714285e-05, 'epoch': 1.82}


 92%|█████████▏| 601/656 [01:14<00:05,  9.17it/s]

{'loss': 0.1417, 'learning_rate': 1.4309603440038225e-05, 'epoch': 1.83}
{'loss': 0.0269, 'learning_rate': 1.4333492594362161e-05, 'epoch': 1.83}


 92%|█████████▏| 602/656 [01:14<00:06,  8.98it/s]

{'loss': 0.0282, 'learning_rate': 1.4357381748686097e-05, 'epoch': 1.83}
{'loss': 0.0313, 'learning_rate': 1.4381270903010033e-05, 'epoch': 1.84}


 92%|█████████▏| 605/656 [01:14<00:05,  8.82it/s]

{'loss': 0.0401, 'learning_rate': 1.4405160057333972e-05, 'epoch': 1.84}
{'loss': 0.2009, 'learning_rate': 1.4429049211657908e-05, 'epoch': 1.84}


 92%|█████████▏| 606/656 [01:14<00:05,  8.83it/s]

{'loss': 0.0272, 'learning_rate': 1.4452938365981844e-05, 'epoch': 1.84}
{'loss': 0.0368, 'learning_rate': 1.4476827520305782e-05, 'epoch': 1.85}


 93%|█████████▎| 608/656 [01:15<00:05,  8.26it/s]

{'loss': 0.1349, 'learning_rate': 1.4500716674629718e-05, 'epoch': 1.85}
{'loss': 0.0232, 'learning_rate': 1.4524605828953656e-05, 'epoch': 1.85}


 93%|█████████▎| 610/656 [01:15<00:05,  9.02it/s]

{'loss': 0.0539, 'learning_rate': 1.4548494983277592e-05, 'epoch': 1.86}
{'loss': 0.0268, 'learning_rate': 1.457238413760153e-05, 'epoch': 1.86}


 93%|█████████▎| 613/656 [01:15<00:04,  8.73it/s]

{'loss': 0.146, 'learning_rate': 1.4596273291925466e-05, 'epoch': 1.86}
{'loss': 0.1555, 'learning_rate': 1.4620162446249402e-05, 'epoch': 1.87}


 94%|█████████▎| 614/656 [01:15<00:04,  8.98it/s]

{'loss': 0.128, 'learning_rate': 1.4644051600573341e-05, 'epoch': 1.87}
{'loss': 0.1609, 'learning_rate': 1.4667940754897277e-05, 'epoch': 1.87}


 94%|█████████▍| 616/656 [01:16<00:04,  9.48it/s]

{'loss': 0.0233, 'learning_rate': 1.4691829909221213e-05, 'epoch': 1.88}
{'loss': 0.0271, 'learning_rate': 1.4715719063545153e-05, 'epoch': 1.88}


 94%|█████████▍| 619/656 [01:16<00:04,  9.11it/s]

{'loss': 0.1078, 'learning_rate': 1.4739608217869089e-05, 'epoch': 1.88}
{'loss': 0.0446, 'learning_rate': 1.4763497372193025e-05, 'epoch': 1.88}


 95%|█████████▍| 620/656 [01:16<00:04,  8.75it/s]

{'loss': 0.0243, 'learning_rate': 1.478738652651696e-05, 'epoch': 1.89}
{'loss': 0.1576, 'learning_rate': 1.48112756808409e-05, 'epoch': 1.89}


 95%|█████████▍| 623/656 [01:16<00:03,  8.69it/s]

{'loss': 0.0267, 'learning_rate': 1.4835164835164836e-05, 'epoch': 1.89}
{'loss': 0.0272, 'learning_rate': 1.4859053989488772e-05, 'epoch': 1.9}


 95%|█████████▌| 624/656 [01:16<00:03,  8.72it/s]

{'loss': 0.055, 'learning_rate': 1.4882943143812712e-05, 'epoch': 1.9}
{'loss': 0.0579, 'learning_rate': 1.4906832298136648e-05, 'epoch': 1.9}


 95%|█████████▌| 626/656 [01:17<00:03,  8.13it/s]

{'loss': 0.0228, 'learning_rate': 1.4930721452460584e-05, 'epoch': 1.91}
{'loss': 0.0358, 'learning_rate': 1.495461060678452e-05, 'epoch': 1.91}


 96%|█████████▌| 629/656 [01:17<00:02,  9.16it/s]

{'loss': 0.1701, 'learning_rate': 1.4978499761108458e-05, 'epoch': 1.91}
{'loss': 0.0291, 'learning_rate': 1.5002388915432394e-05, 'epoch': 1.91}


 96%|█████████▌| 630/656 [01:17<00:02,  9.34it/s]

{'loss': 0.0247, 'learning_rate': 1.502627806975633e-05, 'epoch': 1.92}
{'loss': 0.2705, 'learning_rate': 1.5050167224080269e-05, 'epoch': 1.92}


 96%|█████████▋| 632/656 [01:17<00:02,  9.28it/s]

{'loss': 0.0207, 'learning_rate': 1.5074056378404205e-05, 'epoch': 1.92}
{'loss': 0.0653, 'learning_rate': 1.5097945532728141e-05, 'epoch': 1.93}


 97%|█████████▋| 635/656 [01:18<00:02,  9.64it/s]

{'loss': 0.0262, 'learning_rate': 1.5121834687052077e-05, 'epoch': 1.93}
{'loss': 0.0533, 'learning_rate': 1.5145723841376017e-05, 'epoch': 1.93}


 97%|█████████▋| 637/656 [01:18<00:01, 10.06it/s]

{'loss': 0.1239, 'learning_rate': 1.5169612995699953e-05, 'epoch': 1.94}
{'loss': 0.0441, 'learning_rate': 1.5193502150023889e-05, 'epoch': 1.94}


 97%|█████████▋| 638/656 [01:18<00:01, 10.04it/s]

{'loss': 0.0244, 'learning_rate': 1.5217391304347828e-05, 'epoch': 1.94}
{'loss': 0.0266, 'learning_rate': 1.5241280458671764e-05, 'epoch': 1.95}


 98%|█████████▊| 640/656 [01:18<00:01,  9.89it/s]

{'loss': 0.0866, 'learning_rate': 1.52651696129957e-05, 'epoch': 1.95}
{'loss': 0.1029, 'learning_rate': 1.528905876731964e-05, 'epoch': 1.95}


 98%|█████████▊| 642/656 [01:18<00:01,  9.92it/s]

{'loss': 0.1387, 'learning_rate': 1.5312947921643576e-05, 'epoch': 1.95}
{'loss': 0.1501, 'learning_rate': 1.5336837075967512e-05, 'epoch': 1.96}


 98%|█████████▊| 644/656 [01:19<00:01,  9.92it/s]

{'loss': 0.0213, 'learning_rate': 1.5360726230291448e-05, 'epoch': 1.96}
{'loss': 0.019, 'learning_rate': 1.5384615384615387e-05, 'epoch': 1.96}


 98%|█████████▊| 646/656 [01:19<00:00, 10.07it/s]

{'loss': 0.0179, 'learning_rate': 1.5408504538939323e-05, 'epoch': 1.97}
{'loss': 0.0203, 'learning_rate': 1.543239369326326e-05, 'epoch': 1.97}


 99%|█████████▉| 649/656 [01:19<00:00,  9.56it/s]

{'loss': 0.0285, 'learning_rate': 1.54562828475872e-05, 'epoch': 1.97}
{'loss': 0.0927, 'learning_rate': 1.5480172001911135e-05, 'epoch': 1.98}


 99%|█████████▉| 650/656 [01:19<00:00,  8.84it/s]

{'loss': 0.0344, 'learning_rate': 1.550406115623507e-05, 'epoch': 1.98}
{'loss': 0.02, 'learning_rate': 1.5527950310559007e-05, 'epoch': 1.98}


 99%|█████████▉| 652/656 [01:19<00:00,  8.48it/s]

{'loss': 0.0326, 'learning_rate': 1.5551839464882946e-05, 'epoch': 1.98}
{'loss': 0.0187, 'learning_rate': 1.5575728619206882e-05, 'epoch': 1.99}


100%|█████████▉| 655/656 [01:20<00:00,  9.28it/s]

{'loss': 0.0226, 'learning_rate': 1.5599617773530818e-05, 'epoch': 1.99}
{'loss': 0.1768, 'learning_rate': 1.5623506927854754e-05, 'epoch': 1.99}


100%|██████████| 656/656 [01:20<00:00,  9.28it/s]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, tokens, utterance. If token_labels, tokens, utterance are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'loss': 0.0177, 'learning_rate': 1.564739608217869e-05, 'epoch': 2.0}
{'loss': 0.0158, 'learning_rate': 1.5671285236502626e-05, 'epoch': 2.0}


                                                 
100%|██████████| 656/656 [01:22<00:00,  9.28it/s]Saving model checkpoint to ./snips_clf/results\checkpoint-656
Configuration saved in ./snips_clf/results\checkpoint-656\config.json


{'eval_loss': 0.060164548456668854, 'eval_accuracy': 0.9847153228888039, 'eval_runtime': 2.5331, 'eval_samples_per_second': 1033.119, 'eval_steps_per_second': 32.371, 'epoch': 2.0}


Model weights saved in ./snips_clf/results\checkpoint-656\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./snips_clf/results\checkpoint-656 (score: 0.060164548456668854).
100%|██████████| 656/656 [01:28<00:00,  7.39it/s]

{'train_runtime': 88.7146, 'train_samples_per_second': 235.97, 'train_steps_per_second': 7.395, 'train_loss': 0.7290181125934441, 'epoch': 2.0}





TrainOutput(global_step=656, training_loss=0.7290181125934441, metrics={'train_runtime': 88.7146, 'train_samples_per_second': 235.97, 'train_steps_per_second': 7.395, 'train_loss': 0.7290181125934441, 'epoch': 2.0})

In [20]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, tokens, utterance. If token_labels, tokens, utterance are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
100%|██████████| 82/82 [00:02<00:00, 33.90it/s]


{'eval_loss': 0.060164548456668854,
 'eval_accuracy': 0.9847153228888039,
 'eval_runtime': 2.4882,
 'eval_samples_per_second': 1051.75,
 'eval_steps_per_second': 32.955,
 'epoch': 2.0}

In [29]:
pipe = pipeline(
    task="text-classification",
    model=sequence_clf_model, 
    tokenizer=tokenizer,
    device=0
    )
pipe([
    'Add Morph by Twenty One Pilot to my TOP playlist',
    'Rate A thousand splendid suns 9 out of 10 stars'
    ])

[{'label': 'AddToPlaylist', 'score': 0.9911097288131714},
 {'label': 'RateBook', 'score': 0.9908831715583801}]

In [30]:
# Save the best model to the paths specified previously
trainer.save_model()

Saving model checkpoint to ./snips_clf/results
Configuration saved in ./snips_clf/results\config.json
Model weights saved in ./snips_clf/results\pytorch_model.bin


In [31]:
pipe = pipeline(
    task="text-classification",
    model="./snips_clf/results", 
    tokenizer=tokenizer,
    device=0
    )
pipe([
    'Add Morph by Twenty One Pilot to my TOP playlist',
    'Rate A thousand splendid suns 9 out of 10 stars'
    ])

loading configuration file ./snips_clf/results\config.json
Model config DistilBertConfig {
  "_name_or_path": "./snips_clf/results",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "SearchScreeningEvent",
    "1": "RateBook",
    "2": "AddToPlaylist",
    "3": "SearchCreativeWork",
    "4": "GetWeather",
    "5": "PlayMusic",
    "6": "BookRestaurant"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transfor

[{'label': 'AddToPlaylist', 'score': 0.9911097288131714},
 {'label': 'RateBook', 'score': 0.9908831715583801}]

In [32]:
## Freezing some layers
frozen_sequence_clf_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels = len(unique_sequence_labels)
)

for param in frozen_sequence_clf_model.distilbert.parameters():
    param.requires_grad = False

loading configuration file config.json from cache at C:\Users\Muham/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411\config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.35.0",
  "vocab_size": 30522
}


In [64]:
epochs = 2

training_args = TrainingArguments(
    output_dir="./snips_clf/results",
    num_train_epochs=epochs,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,

    warmup_steps=len(seq_clf_tokenized_snips['train']) // 5,
    weight_decay= 0.05,

    logging_steps=1,
    log_level='info',
    evaluation_strategy='epoch',
    save_strategy='epoch'
)

trainer = Trainer(
    model=frozen_sequence_clf_model,
    args=training_args,
    train_dataset=seq_clf_tokenized_snips['train'],
    eval_dataset=seq_clf_tokenized_snips['test'],
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

# Check if the model is using CUDA (GPU)
is_using_cuda = trainer.args.device.type == 'cuda'
print(f"Is using CUDA: "+'\033[92m'+str(is_using_cuda)+'\033[0m')

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


Is using CUDA: [92mTrue[0m


In [65]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, tokens, utterance. If token_labels, tokens, utterance are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
100%|██████████| 82/82 [00:02<00:00, 32.17it/s]


{'eval_loss': 1.9450441598892212,
 'eval_accuracy': 0.1700420328620558,
 'eval_runtime': 2.6976,
 'eval_samples_per_second': 970.13,
 'eval_steps_per_second': 30.398}

In [66]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, tokens, utterance. If token_labels, tokens, utterance are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10,467
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 656
  Number of trainable parameters = 595,975
  1%|          | 6/656 [00:00<00:22, 29.08it/s]

{'loss': 1.953, 'learning_rate': 2.3889154323936934e-08, 'epoch': 0.0}
{'loss': 1.9445, 'learning_rate': 4.777830864787387e-08, 'epoch': 0.01}
{'loss': 1.946, 'learning_rate': 7.16674629718108e-08, 'epoch': 0.01}
{'loss': 1.9548, 'learning_rate': 9.555661729574773e-08, 'epoch': 0.01}
{'loss': 1.9551, 'learning_rate': 1.1944577161968468e-07, 'epoch': 0.02}
{'loss': 1.9433, 'learning_rate': 1.433349259436216e-07, 'epoch': 0.02}


  2%|▏         | 12/656 [00:00<00:22, 28.09it/s]

{'loss': 1.9409, 'learning_rate': 1.6722408026755853e-07, 'epoch': 0.02}
{'loss': 1.9594, 'learning_rate': 1.9111323459149547e-07, 'epoch': 0.02}
{'loss': 1.9559, 'learning_rate': 2.150023889154324e-07, 'epoch': 0.03}
{'loss': 1.9397, 'learning_rate': 2.3889154323936937e-07, 'epoch': 0.03}
{'loss': 1.9657, 'learning_rate': 2.6278069756330625e-07, 'epoch': 0.03}


  3%|▎         | 17/656 [00:00<00:23, 27.60it/s]

{'loss': 1.9449, 'learning_rate': 2.866698518872432e-07, 'epoch': 0.04}
{'loss': 1.931, 'learning_rate': 3.1055900621118013e-07, 'epoch': 0.04}
{'loss': 1.9479, 'learning_rate': 3.3444816053511706e-07, 'epoch': 0.04}
{'loss': 1.9526, 'learning_rate': 3.58337314859054e-07, 'epoch': 0.05}
{'loss': 1.9279, 'learning_rate': 3.8222646918299094e-07, 'epoch': 0.05}
{'loss': 1.9521, 'learning_rate': 4.0611562350692793e-07, 'epoch': 0.05}


  4%|▎         | 23/656 [00:00<00:21, 29.07it/s]

{'loss': 1.9261, 'learning_rate': 4.300047778308648e-07, 'epoch': 0.05}
{'loss': 1.9404, 'learning_rate': 4.5389393215480175e-07, 'epoch': 0.06}
{'loss': 1.9522, 'learning_rate': 4.777830864787387e-07, 'epoch': 0.06}
{'loss': 1.953, 'learning_rate': 5.016722408026756e-07, 'epoch': 0.06}
{'loss': 1.9609, 'learning_rate': 5.255613951266125e-07, 'epoch': 0.07}
{'loss': 1.9621, 'learning_rate': 5.494505494505495e-07, 'epoch': 0.07}


  4%|▍         | 28/656 [00:00<00:21, 29.19it/s]

{'loss': 1.9501, 'learning_rate': 5.733397037744864e-07, 'epoch': 0.07}
{'loss': 1.9766, 'learning_rate': 5.972288580984234e-07, 'epoch': 0.08}
{'loss': 1.9585, 'learning_rate': 6.211180124223603e-07, 'epoch': 0.08}
{'loss': 1.9221, 'learning_rate': 6.450071667462972e-07, 'epoch': 0.08}
{'loss': 1.9198, 'learning_rate': 6.688963210702341e-07, 'epoch': 0.09}


  5%|▌         | 33/656 [00:01<00:22, 27.24it/s]

{'loss': 1.9388, 'learning_rate': 6.92785475394171e-07, 'epoch': 0.09}
{'loss': 1.9819, 'learning_rate': 7.16674629718108e-07, 'epoch': 0.09}
{'loss': 1.9256, 'learning_rate': 7.405637840420449e-07, 'epoch': 0.09}
{'loss': 1.9359, 'learning_rate': 7.644529383659819e-07, 'epoch': 0.1}
{'loss': 1.9263, 'learning_rate': 7.883420926899189e-07, 'epoch': 0.1}


  6%|▌         | 39/656 [00:01<00:21, 28.18it/s]

{'loss': 1.9226, 'learning_rate': 8.122312470138559e-07, 'epoch': 0.1}
{'loss': 1.9508, 'learning_rate': 8.361204013377926e-07, 'epoch': 0.11}
{'loss': 1.957, 'learning_rate': 8.600095556617296e-07, 'epoch': 0.11}
{'loss': 1.944, 'learning_rate': 8.838987099856666e-07, 'epoch': 0.11}
{'loss': 1.9461, 'learning_rate': 9.077878643096035e-07, 'epoch': 0.12}
{'loss': 1.9406, 'learning_rate': 9.316770186335405e-07, 'epoch': 0.12}


  7%|▋         | 45/656 [00:01<00:20, 29.10it/s]

{'loss': 1.9451, 'learning_rate': 9.555661729574775e-07, 'epoch': 0.12}
{'loss': 1.9612, 'learning_rate': 9.794553272814141e-07, 'epoch': 0.12}
{'loss': 1.934, 'learning_rate': 1.0033444816053512e-06, 'epoch': 0.13}
{'loss': 1.9436, 'learning_rate': 1.0272336359292883e-06, 'epoch': 0.13}
{'loss': 1.934, 'learning_rate': 1.051122790253225e-06, 'epoch': 0.13}
{'loss': 1.936, 'learning_rate': 1.0750119445771621e-06, 'epoch': 0.14}


  8%|▊         | 51/656 [00:01<00:21, 27.95it/s]

{'loss': 1.9274, 'learning_rate': 1.098901098901099e-06, 'epoch': 0.14}
{'loss': 1.9711, 'learning_rate': 1.1227902532250359e-06, 'epoch': 0.14}
{'loss': 1.959, 'learning_rate': 1.1466794075489728e-06, 'epoch': 0.15}
{'loss': 1.953, 'learning_rate': 1.1705685618729096e-06, 'epoch': 0.15}
{'loss': 1.9281, 'learning_rate': 1.1944577161968467e-06, 'epoch': 0.15}


  8%|▊         | 55/656 [00:01<00:22, 26.80it/s]

{'loss': 1.9573, 'learning_rate': 1.2183468705207836e-06, 'epoch': 0.16}
{'loss': 1.9546, 'learning_rate': 1.2422360248447205e-06, 'epoch': 0.16}
{'loss': 1.9315, 'learning_rate': 1.2661251791686574e-06, 'epoch': 0.16}
{'loss': 1.9296, 'learning_rate': 1.2900143334925945e-06, 'epoch': 0.16}
{'loss': 1.9611, 'learning_rate': 1.3139034878165314e-06, 'epoch': 0.17}


  9%|▉         | 60/656 [00:02<00:22, 26.82it/s]

{'loss': 1.9292, 'learning_rate': 1.3377926421404683e-06, 'epoch': 0.17}
{'loss': 1.9512, 'learning_rate': 1.3616817964644054e-06, 'epoch': 0.17}
{'loss': 1.9497, 'learning_rate': 1.385570950788342e-06, 'epoch': 0.18}
{'loss': 1.9762, 'learning_rate': 1.4094601051122791e-06, 'epoch': 0.18}
{'loss': 1.9082, 'learning_rate': 1.433349259436216e-06, 'epoch': 0.18}


 10%|█         | 66/656 [00:02<00:20, 28.66it/s]

{'loss': 1.953, 'learning_rate': 1.4572384137601529e-06, 'epoch': 0.19}
{'loss': 1.9548, 'learning_rate': 1.4811275680840898e-06, 'epoch': 0.19}
{'loss': 1.935, 'learning_rate': 1.5050167224080269e-06, 'epoch': 0.19}
{'loss': 1.9399, 'learning_rate': 1.5289058767319638e-06, 'epoch': 0.2}
{'loss': 1.9674, 'learning_rate': 1.5527950310559006e-06, 'epoch': 0.2}
{'loss': 1.951, 'learning_rate': 1.5766841853798377e-06, 'epoch': 0.2}


 11%|█         | 71/656 [00:02<00:20, 28.91it/s]

{'loss': 1.9525, 'learning_rate': 1.6005733397037744e-06, 'epoch': 0.2}
{'loss': 1.9383, 'learning_rate': 1.6244624940277117e-06, 'epoch': 0.21}
{'loss': 1.9594, 'learning_rate': 1.6483516483516484e-06, 'epoch': 0.21}
{'loss': 1.9529, 'learning_rate': 1.6722408026755853e-06, 'epoch': 0.21}
{'loss': 1.907, 'learning_rate': 1.6961299569995224e-06, 'epoch': 0.22}


 12%|█▏        | 77/656 [00:02<00:20, 28.07it/s]

{'loss': 1.929, 'learning_rate': 1.7200191113234592e-06, 'epoch': 0.22}
{'loss': 1.9399, 'learning_rate': 1.7439082656473961e-06, 'epoch': 0.22}
{'loss': 1.9411, 'learning_rate': 1.7677974199713332e-06, 'epoch': 0.23}
{'loss': 1.9281, 'learning_rate': 1.7916865742952701e-06, 'epoch': 0.23}
{'loss': 1.9516, 'learning_rate': 1.815575728619207e-06, 'epoch': 0.23}


 12%|█▎        | 82/656 [00:02<00:20, 28.44it/s]

{'loss': 1.9224, 'learning_rate': 1.839464882943144e-06, 'epoch': 0.23}
{'loss': 1.9384, 'learning_rate': 1.863354037267081e-06, 'epoch': 0.24}
{'loss': 1.9161, 'learning_rate': 1.8872431915910176e-06, 'epoch': 0.24}
{'loss': 1.9482, 'learning_rate': 1.911132345914955e-06, 'epoch': 0.24}
{'loss': 1.9567, 'learning_rate': 1.935021500238892e-06, 'epoch': 0.25}
{'loss': 1.9106, 'learning_rate': 1.9589106545628283e-06, 'epoch': 0.25}


 13%|█▎        | 88/656 [00:03<00:19, 29.22it/s]

{'loss': 1.9701, 'learning_rate': 1.9827998088867656e-06, 'epoch': 0.25}
{'loss': 1.942, 'learning_rate': 2.0066889632107025e-06, 'epoch': 0.26}
{'loss': 1.9489, 'learning_rate': 2.0305781175346394e-06, 'epoch': 0.26}
{'loss': 1.963, 'learning_rate': 2.0544672718585767e-06, 'epoch': 0.26}
{'loss': 1.9406, 'learning_rate': 2.078356426182513e-06, 'epoch': 0.27}
{'loss': 1.9509, 'learning_rate': 2.10224558050645e-06, 'epoch': 0.27}


 14%|█▍        | 94/656 [00:03<00:18, 29.75it/s]

{'loss': 1.9381, 'learning_rate': 2.1261347348303873e-06, 'epoch': 0.27}
{'loss': 1.9416, 'learning_rate': 2.1500238891543242e-06, 'epoch': 0.27}
{'loss': 1.9583, 'learning_rate': 2.173913043478261e-06, 'epoch': 0.28}
{'loss': 1.9584, 'learning_rate': 2.197802197802198e-06, 'epoch': 0.28}
{'loss': 1.9284, 'learning_rate': 2.221691352126135e-06, 'epoch': 0.28}
{'loss': 1.9356, 'learning_rate': 2.2455805064500718e-06, 'epoch': 0.29}


 15%|█▌        | 99/656 [00:03<00:20, 27.16it/s]

{'loss': 1.9367, 'learning_rate': 2.269469660774009e-06, 'epoch': 0.29}
{'loss': 1.9497, 'learning_rate': 2.2933588150979455e-06, 'epoch': 0.29}
{'loss': 1.9368, 'learning_rate': 2.3172479694218824e-06, 'epoch': 0.3}
{'loss': 1.95, 'learning_rate': 2.3411371237458193e-06, 'epoch': 0.3}
{'loss': 1.9469, 'learning_rate': 2.3650262780697566e-06, 'epoch': 0.3}


 16%|█▌        | 103/656 [00:03<00:20, 27.62it/s]

{'loss': 1.9466, 'learning_rate': 2.3889154323936935e-06, 'epoch': 0.3}
{'loss': 1.9466, 'learning_rate': 2.41280458671763e-06, 'epoch': 0.31}
{'loss': 1.9506, 'learning_rate': 2.4366937410415673e-06, 'epoch': 0.31}
{'loss': 1.9265, 'learning_rate': 2.460582895365504e-06, 'epoch': 0.31}


 16%|█▋        | 107/656 [00:03<00:25, 21.30it/s]

{'loss': 1.9381, 'learning_rate': 2.484472049689441e-06, 'epoch': 0.32}
{'loss': 1.9477, 'learning_rate': 2.508361204013378e-06, 'epoch': 0.32}
{'loss': 1.9448, 'learning_rate': 2.5322503583373148e-06, 'epoch': 0.32}


 17%|█▋        | 110/656 [00:04<00:26, 20.88it/s]

{'loss': 1.9318, 'learning_rate': 2.5561395126612517e-06, 'epoch': 0.33}
{'loss': 1.9413, 'learning_rate': 2.580028666985189e-06, 'epoch': 0.33}
{'loss': 1.9247, 'learning_rate': 2.603917821309126e-06, 'epoch': 0.33}
{'loss': 1.9504, 'learning_rate': 2.6278069756330627e-06, 'epoch': 0.34}


 17%|█▋        | 113/656 [00:04<00:27, 20.10it/s]

{'loss': 1.9425, 'learning_rate': 2.6516961299569996e-06, 'epoch': 0.34}
{'loss': 1.9456, 'learning_rate': 2.6755852842809365e-06, 'epoch': 0.34}
{'loss': 1.9216, 'learning_rate': 2.6994744386048734e-06, 'epoch': 0.34}


 18%|█▊        | 118/656 [00:04<00:27, 19.24it/s]

{'loss': 1.9498, 'learning_rate': 2.7233635929288107e-06, 'epoch': 0.35}
{'loss': 1.9654, 'learning_rate': 2.747252747252747e-06, 'epoch': 0.35}
{'loss': 1.948, 'learning_rate': 2.771141901576684e-06, 'epoch': 0.35}
{'loss': 1.9445, 'learning_rate': 2.7950310559006214e-06, 'epoch': 0.36}


 18%|█▊        | 121/656 [00:04<00:27, 19.53it/s]

{'loss': 1.9245, 'learning_rate': 2.8189202102245582e-06, 'epoch': 0.36}
{'loss': 1.9564, 'learning_rate': 2.842809364548495e-06, 'epoch': 0.36}
{'loss': 1.9228, 'learning_rate': 2.866698518872432e-06, 'epoch': 0.37}
{'loss': 1.9401, 'learning_rate': 2.890587673196369e-06, 'epoch': 0.37}


 19%|█▉        | 125/656 [00:04<00:27, 19.63it/s]

{'loss': 1.9823, 'learning_rate': 2.9144768275203058e-06, 'epoch': 0.37}
{'loss': 1.9473, 'learning_rate': 2.938365981844243e-06, 'epoch': 0.38}
{'loss': 1.9283, 'learning_rate': 2.9622551361681795e-06, 'epoch': 0.38}
{'loss': 1.9205, 'learning_rate': 2.9861442904921164e-06, 'epoch': 0.38}


 20%|█▉        | 129/656 [00:05<00:27, 18.86it/s]

{'loss': 1.9367, 'learning_rate': 3.0100334448160537e-06, 'epoch': 0.38}
{'loss': 1.928, 'learning_rate': 3.0339225991399906e-06, 'epoch': 0.39}
{'loss': 1.9255, 'learning_rate': 3.0578117534639275e-06, 'epoch': 0.39}
{'loss': 1.9223, 'learning_rate': 3.0817009077878644e-06, 'epoch': 0.39}


 20%|██        | 134/656 [00:05<00:25, 20.66it/s]

{'loss': 1.9049, 'learning_rate': 3.1055900621118013e-06, 'epoch': 0.4}
{'loss': 1.9304, 'learning_rate': 3.1294792164357386e-06, 'epoch': 0.4}
{'loss': 1.9057, 'learning_rate': 3.1533683707596755e-06, 'epoch': 0.4}
{'loss': 1.9125, 'learning_rate': 3.1772575250836123e-06, 'epoch': 0.41}


 21%|██        | 137/656 [00:05<00:26, 19.42it/s]

{'loss': 1.9407, 'learning_rate': 3.201146679407549e-06, 'epoch': 0.41}
{'loss': 1.9206, 'learning_rate': 3.2250358337314857e-06, 'epoch': 0.41}
{'loss': 1.9478, 'learning_rate': 3.2489249880554234e-06, 'epoch': 0.41}
{'loss': 1.9202, 'learning_rate': 3.2728141423793603e-06, 'epoch': 0.42}


 22%|██▏       | 142/656 [00:05<00:26, 19.76it/s]

{'loss': 1.9353, 'learning_rate': 3.2967032967032968e-06, 'epoch': 0.42}
{'loss': 1.915, 'learning_rate': 3.3205924510272337e-06, 'epoch': 0.42}
{'loss': 1.9352, 'learning_rate': 3.3444816053511705e-06, 'epoch': 0.43}
{'loss': 1.9415, 'learning_rate': 3.3683707596751074e-06, 'epoch': 0.43}


 22%|██▏       | 145/656 [00:05<00:26, 19.31it/s]

{'loss': 1.9597, 'learning_rate': 3.3922599139990447e-06, 'epoch': 0.43}
{'loss': 1.9507, 'learning_rate': 3.4161490683229816e-06, 'epoch': 0.44}
{'loss': 1.9363, 'learning_rate': 3.4400382226469185e-06, 'epoch': 0.44}
{'loss': 1.9282, 'learning_rate': 3.4639273769708554e-06, 'epoch': 0.44}


 23%|██▎       | 150/656 [00:06<00:23, 21.43it/s]

{'loss': 1.9316, 'learning_rate': 3.4878165312947923e-06, 'epoch': 0.45}
{'loss': 1.9539, 'learning_rate': 3.511705685618729e-06, 'epoch': 0.45}
{'loss': 1.925, 'learning_rate': 3.5355948399426665e-06, 'epoch': 0.45}
{'loss': 1.9214, 'learning_rate': 3.5594839942666033e-06, 'epoch': 0.45}
{'loss': 1.9266, 'learning_rate': 3.5833731485905402e-06, 'epoch': 0.46}


 23%|██▎       | 153/656 [00:06<00:25, 19.56it/s]

{'loss': 1.9282, 'learning_rate': 3.607262302914477e-06, 'epoch': 0.46}
{'loss': 1.9037, 'learning_rate': 3.631151457238414e-06, 'epoch': 0.46}
{'loss': 1.9078, 'learning_rate': 3.6550406115623505e-06, 'epoch': 0.47}


 24%|██▍       | 158/656 [00:06<00:24, 20.00it/s]

{'loss': 1.9215, 'learning_rate': 3.678929765886288e-06, 'epoch': 0.47}
{'loss': 1.9611, 'learning_rate': 3.702818920210225e-06, 'epoch': 0.47}
{'loss': 1.9158, 'learning_rate': 3.726708074534162e-06, 'epoch': 0.48}
{'loss': 1.9209, 'learning_rate': 3.7505972288580984e-06, 'epoch': 0.48}


 25%|██▍       | 162/656 [00:06<00:24, 20.26it/s]

{'loss': 1.9221, 'learning_rate': 3.7744863831820353e-06, 'epoch': 0.48}
{'loss': 1.9323, 'learning_rate': 3.798375537505972e-06, 'epoch': 0.48}
{'loss': 1.9514, 'learning_rate': 3.82226469182991e-06, 'epoch': 0.49}
{'loss': 1.8994, 'learning_rate': 3.846153846153847e-06, 'epoch': 0.49}
{'loss': 1.932, 'learning_rate': 3.870043000477784e-06, 'epoch': 0.49}


 25%|██▌       | 165/656 [00:06<00:23, 20.66it/s]

{'loss': 1.9313, 'learning_rate': 3.8939321548017206e-06, 'epoch': 0.5}
{'loss': 1.931, 'learning_rate': 3.917821309125657e-06, 'epoch': 0.5}
{'loss': 1.9341, 'learning_rate': 3.9417104634495935e-06, 'epoch': 0.5}


 26%|██▌       | 169/656 [00:07<00:27, 17.94it/s]

{'loss': 1.9227, 'learning_rate': 3.965599617773531e-06, 'epoch': 0.51}
{'loss': 1.9263, 'learning_rate': 3.989488772097468e-06, 'epoch': 0.51}
{'loss': 1.9323, 'learning_rate': 4.013377926421405e-06, 'epoch': 0.51}


 26%|██▌       | 172/656 [00:07<00:26, 18.37it/s]

{'loss': 1.9161, 'learning_rate': 4.037267080745342e-06, 'epoch': 0.52}
{'loss': 1.927, 'learning_rate': 4.061156235069279e-06, 'epoch': 0.52}
{'loss': 1.9147, 'learning_rate': 4.085045389393216e-06, 'epoch': 0.52}
{'loss': 1.9341, 'learning_rate': 4.108934543717153e-06, 'epoch': 0.52}


 27%|██▋       | 176/656 [00:07<00:27, 17.17it/s]

{'loss': 1.9402, 'learning_rate': 4.132823698041089e-06, 'epoch': 0.53}
{'loss': 1.9229, 'learning_rate': 4.156712852365026e-06, 'epoch': 0.53}
{'loss': 1.9291, 'learning_rate': 4.180602006688963e-06, 'epoch': 0.53}
{'loss': 1.9253, 'learning_rate': 4.2044911610129e-06, 'epoch': 0.54}


 27%|██▋       | 180/656 [00:07<00:25, 18.53it/s]

{'loss': 1.923, 'learning_rate': 4.228380315336837e-06, 'epoch': 0.54}
{'loss': 1.9376, 'learning_rate': 4.252269469660775e-06, 'epoch': 0.54}
{'loss': 1.9295, 'learning_rate': 4.2761586239847116e-06, 'epoch': 0.55}
{'loss': 1.9285, 'learning_rate': 4.3000477783086484e-06, 'epoch': 0.55}


 28%|██▊       | 184/656 [00:07<00:24, 19.36it/s]

{'loss': 1.9321, 'learning_rate': 4.323936932632585e-06, 'epoch': 0.55}
{'loss': 1.9195, 'learning_rate': 4.347826086956522e-06, 'epoch': 0.55}
{'loss': 1.95, 'learning_rate': 4.371715241280458e-06, 'epoch': 0.56}
{'loss': 1.9021, 'learning_rate': 4.395604395604396e-06, 'epoch': 0.56}


 29%|██▊       | 188/656 [00:08<00:23, 19.50it/s]

{'loss': 1.9184, 'learning_rate': 4.419493549928333e-06, 'epoch': 0.56}
{'loss': 1.9372, 'learning_rate': 4.44338270425227e-06, 'epoch': 0.57}
{'loss': 1.9275, 'learning_rate': 4.467271858576207e-06, 'epoch': 0.57}
{'loss': 1.9149, 'learning_rate': 4.4911610129001435e-06, 'epoch': 0.57}


 29%|██▉       | 193/656 [00:08<00:23, 20.11it/s]

{'loss': 1.9578, 'learning_rate': 4.51505016722408e-06, 'epoch': 0.58}
{'loss': 1.921, 'learning_rate': 4.538939321548018e-06, 'epoch': 0.58}
{'loss': 1.9481, 'learning_rate': 4.562828475871954e-06, 'epoch': 0.58}
{'loss': 1.933, 'learning_rate': 4.586717630195891e-06, 'epoch': 0.59}
{'loss': 1.9242, 'learning_rate': 4.610606784519828e-06, 'epoch': 0.59}


 30%|███       | 197/656 [00:08<00:22, 20.54it/s]

{'loss': 1.9084, 'learning_rate': 4.634495938843765e-06, 'epoch': 0.59}
{'loss': 1.8956, 'learning_rate': 4.658385093167702e-06, 'epoch': 0.59}
{'loss': 1.924, 'learning_rate': 4.682274247491639e-06, 'epoch': 0.6}
{'loss': 1.9329, 'learning_rate': 4.706163401815576e-06, 'epoch': 0.6}


 31%|███       | 201/656 [00:08<00:21, 20.82it/s]

{'loss': 1.9099, 'learning_rate': 4.730052556139513e-06, 'epoch': 0.6}
{'loss': 1.9111, 'learning_rate': 4.75394171046345e-06, 'epoch': 0.61}
{'loss': 1.9207, 'learning_rate': 4.777830864787387e-06, 'epoch': 0.61}
{'loss': 1.9353, 'learning_rate': 4.801720019111324e-06, 'epoch': 0.61}


 31%|███▏      | 206/656 [00:09<00:22, 20.10it/s]

{'loss': 1.915, 'learning_rate': 4.82560917343526e-06, 'epoch': 0.62}
{'loss': 1.9239, 'learning_rate': 4.849498327759198e-06, 'epoch': 0.62}
{'loss': 1.9028, 'learning_rate': 4.8733874820831345e-06, 'epoch': 0.62}
{'loss': 1.9041, 'learning_rate': 4.897276636407071e-06, 'epoch': 0.62}
{'loss': 1.9241, 'learning_rate': 4.921165790731008e-06, 'epoch': 0.63}


 32%|███▏      | 210/656 [00:09<00:22, 19.68it/s]

{'loss': 1.9235, 'learning_rate': 4.945054945054945e-06, 'epoch': 0.63}
{'loss': 1.93, 'learning_rate': 4.968944099378882e-06, 'epoch': 0.63}
{'loss': 1.9207, 'learning_rate': 4.99283325370282e-06, 'epoch': 0.64}
{'loss': 1.9298, 'learning_rate': 5.016722408026756e-06, 'epoch': 0.64}


 33%|███▎      | 214/656 [00:09<00:22, 19.38it/s]

{'loss': 1.9144, 'learning_rate': 5.040611562350693e-06, 'epoch': 0.64}
{'loss': 1.9434, 'learning_rate': 5.0645007166746296e-06, 'epoch': 0.65}
{'loss': 1.9017, 'learning_rate': 5.0883898709985665e-06, 'epoch': 0.65}
{'loss': 1.9081, 'learning_rate': 5.112279025322503e-06, 'epoch': 0.65}


 33%|███▎      | 217/656 [00:09<00:23, 18.94it/s]

{'loss': 1.9184, 'learning_rate': 5.136168179646441e-06, 'epoch': 0.66}
{'loss': 1.8961, 'learning_rate': 5.160057333970378e-06, 'epoch': 0.66}
{'loss': 1.9287, 'learning_rate': 5.183946488294315e-06, 'epoch': 0.66}


 34%|███▎      | 221/656 [00:09<00:24, 17.69it/s]

{'loss': 1.932, 'learning_rate': 5.207835642618252e-06, 'epoch': 0.66}
{'loss': 1.9264, 'learning_rate': 5.231724796942189e-06, 'epoch': 0.67}
{'loss': 1.9098, 'learning_rate': 5.2556139512661255e-06, 'epoch': 0.67}
{'loss': 1.9251, 'learning_rate': 5.279503105590062e-06, 'epoch': 0.67}


 34%|███▍      | 226/656 [00:10<00:22, 19.25it/s]

{'loss': 1.9328, 'learning_rate': 5.303392259913999e-06, 'epoch': 0.68}
{'loss': 1.9165, 'learning_rate': 5.327281414237936e-06, 'epoch': 0.68}
{'loss': 1.9266, 'learning_rate': 5.351170568561873e-06, 'epoch': 0.68}
{'loss': 1.8974, 'learning_rate': 5.37505972288581e-06, 'epoch': 0.69}


 35%|███▍      | 229/656 [00:10<00:21, 20.09it/s]

{'loss': 1.927, 'learning_rate': 5.398948877209747e-06, 'epoch': 0.69}
{'loss': 1.9233, 'learning_rate': 5.4228380315336845e-06, 'epoch': 0.69}
{'loss': 1.9066, 'learning_rate': 5.446727185857621e-06, 'epoch': 0.7}
{'loss': 1.9084, 'learning_rate': 5.4706163401815574e-06, 'epoch': 0.7}


 36%|███▌      | 233/656 [00:10<00:21, 19.33it/s]

{'loss': 1.928, 'learning_rate': 5.494505494505494e-06, 'epoch': 0.7}
{'loss': 1.9037, 'learning_rate': 5.518394648829431e-06, 'epoch': 0.7}
{'loss': 1.9301, 'learning_rate': 5.542283803153368e-06, 'epoch': 0.71}
{'loss': 1.9178, 'learning_rate': 5.566172957477306e-06, 'epoch': 0.71}


 36%|███▋      | 238/656 [00:10<00:20, 20.00it/s]

{'loss': 1.9098, 'learning_rate': 5.590062111801243e-06, 'epoch': 0.71}
{'loss': 1.9106, 'learning_rate': 5.61395126612518e-06, 'epoch': 0.72}
{'loss': 1.9123, 'learning_rate': 5.6378404204491165e-06, 'epoch': 0.72}
{'loss': 1.9239, 'learning_rate': 5.661729574773053e-06, 'epoch': 0.72}


 37%|███▋      | 241/656 [00:10<00:22, 18.35it/s]

{'loss': 1.8896, 'learning_rate': 5.68561872909699e-06, 'epoch': 0.73}
{'loss': 1.9334, 'learning_rate': 5.709507883420927e-06, 'epoch': 0.73}
{'loss': 1.9253, 'learning_rate': 5.733397037744864e-06, 'epoch': 0.73}


 37%|███▋      | 244/656 [00:11<00:22, 17.97it/s]

{'loss': 1.9048, 'learning_rate': 5.757286192068801e-06, 'epoch': 0.73}
{'loss': 1.9071, 'learning_rate': 5.781175346392738e-06, 'epoch': 0.74}
{'loss': 1.9129, 'learning_rate': 5.805064500716675e-06, 'epoch': 0.74}
{'loss': 1.8909, 'learning_rate': 5.8289536550406116e-06, 'epoch': 0.74}


 38%|███▊      | 248/656 [00:11<00:22, 18.45it/s]

{'loss': 1.919, 'learning_rate': 5.852842809364549e-06, 'epoch': 0.75}
{'loss': 1.9019, 'learning_rate': 5.876731963688486e-06, 'epoch': 0.75}
{'loss': 1.9199, 'learning_rate': 5.900621118012423e-06, 'epoch': 0.75}
{'loss': 1.9107, 'learning_rate': 5.924510272336359e-06, 'epoch': 0.76}


 38%|███▊      | 252/656 [00:11<00:22, 17.79it/s]

{'loss': 1.908, 'learning_rate': 5.948399426660296e-06, 'epoch': 0.76}
{'loss': 1.9255, 'learning_rate': 5.972288580984233e-06, 'epoch': 0.76}
{'loss': 1.9002, 'learning_rate': 5.996177735308171e-06, 'epoch': 0.77}


 39%|███▉      | 255/656 [00:11<00:21, 18.33it/s]

{'loss': 1.9033, 'learning_rate': 6.0200668896321075e-06, 'epoch': 0.77}
{'loss': 1.9215, 'learning_rate': 6.043956043956044e-06, 'epoch': 0.77}
{'loss': 1.8952, 'learning_rate': 6.067845198279981e-06, 'epoch': 0.77}
{'loss': 1.9224, 'learning_rate': 6.091734352603918e-06, 'epoch': 0.78}


 39%|███▉      | 259/656 [00:11<00:21, 18.76it/s]

{'loss': 1.917, 'learning_rate': 6.115623506927855e-06, 'epoch': 0.78}
{'loss': 1.9113, 'learning_rate': 6.139512661251792e-06, 'epoch': 0.78}
{'loss': 1.9253, 'learning_rate': 6.163401815575729e-06, 'epoch': 0.79}
{'loss': 1.9059, 'learning_rate': 6.187290969899666e-06, 'epoch': 0.79}


 40%|████      | 263/656 [00:12<00:20, 19.26it/s]

{'loss': 1.8879, 'learning_rate': 6.2111801242236025e-06, 'epoch': 0.79}
{'loss': 1.892, 'learning_rate': 6.2350692785475394e-06, 'epoch': 0.8}
{'loss': 1.9092, 'learning_rate': 6.258958432871477e-06, 'epoch': 0.8}
{'loss': 1.8992, 'learning_rate': 6.282847587195413e-06, 'epoch': 0.8}


 41%|████      | 267/656 [00:12<00:21, 18.16it/s]

{'loss': 1.9237, 'learning_rate': 6.306736741519351e-06, 'epoch': 0.8}
{'loss': 1.9196, 'learning_rate': 6.330625895843287e-06, 'epoch': 0.81}
{'loss': 1.9156, 'learning_rate': 6.354515050167225e-06, 'epoch': 0.81}
{'loss': 1.8736, 'learning_rate': 6.378404204491162e-06, 'epoch': 0.81}


 41%|████▏     | 272/656 [00:12<00:18, 20.23it/s]

{'loss': 1.9012, 'learning_rate': 6.402293358815098e-06, 'epoch': 0.82}
{'loss': 1.9031, 'learning_rate': 6.426182513139035e-06, 'epoch': 0.82}
{'loss': 1.9171, 'learning_rate': 6.450071667462971e-06, 'epoch': 0.82}
{'loss': 1.8856, 'learning_rate': 6.473960821786909e-06, 'epoch': 0.83}


 42%|████▏     | 275/656 [00:12<00:19, 19.65it/s]

{'loss': 1.9053, 'learning_rate': 6.497849976110847e-06, 'epoch': 0.83}
{'loss': 1.8873, 'learning_rate': 6.521739130434783e-06, 'epoch': 0.83}
{'loss': 1.8964, 'learning_rate': 6.545628284758721e-06, 'epoch': 0.84}
{'loss': 1.8822, 'learning_rate': 6.569517439082657e-06, 'epoch': 0.84}


 43%|████▎     | 279/656 [00:12<00:18, 20.11it/s]

{'loss': 1.8946, 'learning_rate': 6.5934065934065935e-06, 'epoch': 0.84}
{'loss': 1.9025, 'learning_rate': 6.61729574773053e-06, 'epoch': 0.84}
{'loss': 1.9156, 'learning_rate': 6.641184902054467e-06, 'epoch': 0.85}
{'loss': 1.9021, 'learning_rate': 6.665074056378405e-06, 'epoch': 0.85}


 43%|████▎     | 283/656 [00:13<00:18, 20.50it/s]

{'loss': 1.9158, 'learning_rate': 6.688963210702341e-06, 'epoch': 0.85}
{'loss': 1.9086, 'learning_rate': 6.712852365026279e-06, 'epoch': 0.86}
{'loss': 1.9006, 'learning_rate': 6.736741519350215e-06, 'epoch': 0.86}
{'loss': 1.8962, 'learning_rate': 6.7606306736741526e-06, 'epoch': 0.86}


 44%|████▍     | 288/656 [00:13<00:18, 20.26it/s]

{'loss': 1.9027, 'learning_rate': 6.7845198279980895e-06, 'epoch': 0.87}
{'loss': 1.8911, 'learning_rate': 6.808408982322026e-06, 'epoch': 0.87}
{'loss': 1.8965, 'learning_rate': 6.832298136645963e-06, 'epoch': 0.87}
{'loss': 1.8894, 'learning_rate': 6.856187290969899e-06, 'epoch': 0.88}
{'loss': 1.8911, 'learning_rate': 6.880076445293837e-06, 'epoch': 0.88}


 45%|████▍     | 292/656 [00:13<00:17, 20.62it/s]

{'loss': 1.8942, 'learning_rate': 6.903965599617773e-06, 'epoch': 0.88}
{'loss': 1.8889, 'learning_rate': 6.927854753941711e-06, 'epoch': 0.88}
{'loss': 1.9047, 'learning_rate': 6.9517439082656485e-06, 'epoch': 0.89}
{'loss': 1.9123, 'learning_rate': 6.9756330625895845e-06, 'epoch': 0.89}


 45%|████▌     | 296/656 [00:13<00:19, 18.59it/s]

{'loss': 1.8823, 'learning_rate': 6.999522216913522e-06, 'epoch': 0.89}
{'loss': 1.8924, 'learning_rate': 7.023411371237458e-06, 'epoch': 0.9}
{'loss': 1.8984, 'learning_rate': 7.047300525561395e-06, 'epoch': 0.9}
{'loss': 1.9039, 'learning_rate': 7.071189679885333e-06, 'epoch': 0.9}


 46%|████▌     | 301/656 [00:13<00:17, 19.96it/s]

{'loss': 1.9054, 'learning_rate': 7.095078834209269e-06, 'epoch': 0.91}
{'loss': 1.8992, 'learning_rate': 7.118967988533207e-06, 'epoch': 0.91}
{'loss': 1.8869, 'learning_rate': 7.142857142857143e-06, 'epoch': 0.91}
{'loss': 1.8925, 'learning_rate': 7.1667462971810804e-06, 'epoch': 0.91}


 46%|████▋     | 304/656 [00:14<00:18, 19.41it/s]

{'loss': 1.9048, 'learning_rate': 7.1906354515050165e-06, 'epoch': 0.92}
{'loss': 1.8738, 'learning_rate': 7.214524605828954e-06, 'epoch': 0.92}
{'loss': 1.8912, 'learning_rate': 7.238413760152891e-06, 'epoch': 0.92}
{'loss': 1.9154, 'learning_rate': 7.262302914476828e-06, 'epoch': 0.93}


 47%|████▋     | 308/656 [00:14<00:19, 18.02it/s]

{'loss': 1.8844, 'learning_rate': 7.286192068800765e-06, 'epoch': 0.93}
{'loss': 1.8574, 'learning_rate': 7.310081223124701e-06, 'epoch': 0.93}
{'loss': 1.8964, 'learning_rate': 7.333970377448639e-06, 'epoch': 0.94}
{'loss': 1.8883, 'learning_rate': 7.357859531772576e-06, 'epoch': 0.94}


 48%|████▊     | 312/656 [00:14<00:18, 18.14it/s]

{'loss': 1.8792, 'learning_rate': 7.381748686096512e-06, 'epoch': 0.94}
{'loss': 1.8865, 'learning_rate': 7.40563784042045e-06, 'epoch': 0.95}
{'loss': 1.8911, 'learning_rate': 7.429526994744386e-06, 'epoch': 0.95}
{'loss': 1.8894, 'learning_rate': 7.453416149068324e-06, 'epoch': 0.95}


 48%|████▊     | 317/656 [00:14<00:18, 18.57it/s]

{'loss': 1.8703, 'learning_rate': 7.47730530339226e-06, 'epoch': 0.95}
{'loss': 1.8998, 'learning_rate': 7.501194457716197e-06, 'epoch': 0.96}
{'loss': 1.8854, 'learning_rate': 7.5250836120401346e-06, 'epoch': 0.96}
{'loss': 1.8667, 'learning_rate': 7.548972766364071e-06, 'epoch': 0.96}


 49%|████▉     | 320/656 [00:15<00:17, 18.97it/s]

{'loss': 1.882, 'learning_rate': 7.572861920688008e-06, 'epoch': 0.97}
{'loss': 1.8904, 'learning_rate': 7.596751075011944e-06, 'epoch': 0.97}
{'loss': 1.8862, 'learning_rate': 7.620640229335882e-06, 'epoch': 0.97}
{'loss': 1.8736, 'learning_rate': 7.64452938365982e-06, 'epoch': 0.98}


 49%|████▉     | 324/656 [00:15<00:18, 18.06it/s]

{'loss': 1.8895, 'learning_rate': 7.668418537983756e-06, 'epoch': 0.98}
{'loss': 1.8754, 'learning_rate': 7.692307692307694e-06, 'epoch': 0.98}
{'loss': 1.8798, 'learning_rate': 7.71619684663163e-06, 'epoch': 0.98}
{'loss': 1.8659, 'learning_rate': 7.740086000955567e-06, 'epoch': 0.99}


 50%|█████     | 328/656 [00:15<00:18, 17.69it/s]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, tokens, utterance. If token_labels, tokens, utterance are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'loss': 1.8787, 'learning_rate': 7.763975155279503e-06, 'epoch': 0.99}
{'loss': 1.8922, 'learning_rate': 7.787864309603441e-06, 'epoch': 0.99}
{'loss': 1.8911, 'learning_rate': 7.811753463927377e-06, 'epoch': 1.0}
{'loss': 1.9111, 'learning_rate': 7.835642618251313e-06, 'epoch': 1.0}


                                                 
 50%|█████     | 328/656 [00:19<00:18, 17.69it/s]Saving model checkpoint to ./snips_clf/results\checkpoint-328
Configuration saved in ./snips_clf/results\checkpoint-328\config.json


{'eval_loss': 1.8689872026443481, 'eval_accuracy': 0.6851356515093618, 'eval_runtime': 3.9093, 'eval_samples_per_second': 669.43, 'eval_steps_per_second': 20.976, 'epoch': 1.0}


Model weights saved in ./snips_clf/results\checkpoint-328\pytorch_model.bin
 50%|█████     | 331/656 [00:22<03:55,  1.38it/s]

{'loss': 1.8947, 'learning_rate': 7.859531772575251e-06, 'epoch': 1.0}
{'loss': 1.8635, 'learning_rate': 7.883420926899187e-06, 'epoch': 1.01}
{'loss': 1.8822, 'learning_rate': 7.907310081223125e-06, 'epoch': 1.01}


 51%|█████     | 335/656 [00:23<02:10,  2.46it/s]

{'loss': 1.8605, 'learning_rate': 7.931199235547062e-06, 'epoch': 1.01}
{'loss': 1.8973, 'learning_rate': 7.955088389870998e-06, 'epoch': 1.02}
{'loss': 1.8648, 'learning_rate': 7.978977544194936e-06, 'epoch': 1.02}
{'loss': 1.8715, 'learning_rate': 8.002866698518872e-06, 'epoch': 1.02}


 52%|█████▏    | 339/656 [00:23<01:26,  3.67it/s]

{'loss': 1.8922, 'learning_rate': 8.02675585284281e-06, 'epoch': 1.02}
{'loss': 1.8703, 'learning_rate': 8.050645007166746e-06, 'epoch': 1.03}
{'loss': 1.8737, 'learning_rate': 8.074534161490684e-06, 'epoch': 1.03}
{'loss': 1.8714, 'learning_rate': 8.098423315814621e-06, 'epoch': 1.03}


 52%|█████▏    | 343/656 [00:23<00:50,  6.21it/s]

{'loss': 1.8696, 'learning_rate': 8.122312470138558e-06, 'epoch': 1.04}
{'loss': 1.863, 'learning_rate': 8.146201624462495e-06, 'epoch': 1.04}
{'loss': 1.8613, 'learning_rate': 8.170090778786431e-06, 'epoch': 1.04}
{'loss': 1.8625, 'learning_rate': 8.193979933110369e-06, 'epoch': 1.05}


 53%|█████▎    | 347/656 [00:23<00:34,  8.92it/s]

{'loss': 1.8717, 'learning_rate': 8.217869087434307e-06, 'epoch': 1.05}
{'loss': 1.87, 'learning_rate': 8.241758241758243e-06, 'epoch': 1.05}
{'loss': 1.8731, 'learning_rate': 8.265647396082179e-06, 'epoch': 1.05}
{'loss': 1.8807, 'learning_rate': 8.289536550406115e-06, 'epoch': 1.06}


 54%|█████▎    | 352/656 [00:24<00:23, 12.87it/s]

{'loss': 1.8709, 'learning_rate': 8.313425704730053e-06, 'epoch': 1.06}
{'loss': 1.8517, 'learning_rate': 8.337314859053989e-06, 'epoch': 1.06}
{'loss': 1.8659, 'learning_rate': 8.361204013377926e-06, 'epoch': 1.07}
{'loss': 1.8398, 'learning_rate': 8.385093167701864e-06, 'epoch': 1.07}


 54%|█████▍    | 355/656 [00:24<00:21, 13.94it/s]

{'loss': 1.8523, 'learning_rate': 8.4089823220258e-06, 'epoch': 1.07}
{'loss': 1.8829, 'learning_rate': 8.432871476349738e-06, 'epoch': 1.08}
{'loss': 1.8631, 'learning_rate': 8.456760630673674e-06, 'epoch': 1.08}
{'loss': 1.8547, 'learning_rate': 8.480649784997612e-06, 'epoch': 1.08}


 55%|█████▍    | 359/656 [00:24<00:17, 17.10it/s]

{'loss': 1.843, 'learning_rate': 8.50453893932155e-06, 'epoch': 1.09}
{'loss': 1.8405, 'learning_rate': 8.528428093645485e-06, 'epoch': 1.09}
{'loss': 1.8462, 'learning_rate': 8.552317247969423e-06, 'epoch': 1.09}
{'loss': 1.8781, 'learning_rate': 8.576206402293359e-06, 'epoch': 1.09}


 55%|█████▌    | 363/656 [00:24<00:16, 17.28it/s]

{'loss': 1.8589, 'learning_rate': 8.600095556617297e-06, 'epoch': 1.1}
{'loss': 1.8719, 'learning_rate': 8.623984710941233e-06, 'epoch': 1.1}
{'loss': 1.8534, 'learning_rate': 8.64787386526517e-06, 'epoch': 1.1}
{'loss': 1.8557, 'learning_rate': 8.671763019589108e-06, 'epoch': 1.11}


 56%|█████▌    | 368/656 [00:24<00:15, 18.94it/s]

{'loss': 1.8515, 'learning_rate': 8.695652173913044e-06, 'epoch': 1.11}
{'loss': 1.8465, 'learning_rate': 8.71954132823698e-06, 'epoch': 1.11}
{'loss': 1.8443, 'learning_rate': 8.743430482560916e-06, 'epoch': 1.12}
{'loss': 1.867, 'learning_rate': 8.767319636884854e-06, 'epoch': 1.12}


 57%|█████▋    | 371/656 [00:25<00:14, 19.76it/s]

{'loss': 1.8431, 'learning_rate': 8.791208791208792e-06, 'epoch': 1.12}
{'loss': 1.8643, 'learning_rate': 8.815097945532728e-06, 'epoch': 1.12}
{'loss': 1.8499, 'learning_rate': 8.838987099856666e-06, 'epoch': 1.13}
{'loss': 1.8721, 'learning_rate': 8.862876254180602e-06, 'epoch': 1.13}


 57%|█████▋    | 375/656 [00:25<00:13, 20.24it/s]

{'loss': 1.8552, 'learning_rate': 8.88676540850454e-06, 'epoch': 1.13}
{'loss': 1.8588, 'learning_rate': 8.910654562828476e-06, 'epoch': 1.14}
{'loss': 1.8578, 'learning_rate': 8.934543717152413e-06, 'epoch': 1.14}
{'loss': 1.838, 'learning_rate': 8.958432871476351e-06, 'epoch': 1.14}


 58%|█████▊    | 379/656 [00:25<00:15, 17.95it/s]

{'loss': 1.8543, 'learning_rate': 8.982322025800287e-06, 'epoch': 1.15}
{'loss': 1.8562, 'learning_rate': 9.006211180124225e-06, 'epoch': 1.15}
{'loss': 1.8555, 'learning_rate': 9.03010033444816e-06, 'epoch': 1.15}


 58%|█████▊    | 382/656 [00:25<00:15, 17.95it/s]

{'loss': 1.8526, 'learning_rate': 9.053989488772099e-06, 'epoch': 1.16}
{'loss': 1.8289, 'learning_rate': 9.077878643096036e-06, 'epoch': 1.16}
{'loss': 1.8552, 'learning_rate': 9.101767797419972e-06, 'epoch': 1.16}
{'loss': 1.8504, 'learning_rate': 9.125656951743908e-06, 'epoch': 1.16}


 59%|█████▉    | 386/656 [00:25<00:14, 18.99it/s]

{'loss': 1.8326, 'learning_rate': 9.149546106067846e-06, 'epoch': 1.17}
{'loss': 1.8751, 'learning_rate': 9.173435260391782e-06, 'epoch': 1.17}
{'loss': 1.8304, 'learning_rate': 9.197324414715718e-06, 'epoch': 1.17}
{'loss': 1.8278, 'learning_rate': 9.221213569039656e-06, 'epoch': 1.18}


 59%|█████▉    | 390/656 [00:26<00:14, 18.54it/s]

{'loss': 1.8389, 'learning_rate': 9.245102723363594e-06, 'epoch': 1.18}
{'loss': 1.8565, 'learning_rate': 9.26899187768753e-06, 'epoch': 1.18}
{'loss': 1.8302, 'learning_rate': 9.292881032011467e-06, 'epoch': 1.19}
{'loss': 1.8629, 'learning_rate': 9.316770186335403e-06, 'epoch': 1.19}


 60%|██████    | 394/656 [00:26<00:13, 19.00it/s]

{'loss': 1.8528, 'learning_rate': 9.340659340659341e-06, 'epoch': 1.19}
{'loss': 1.8554, 'learning_rate': 9.364548494983277e-06, 'epoch': 1.2}
{'loss': 1.8332, 'learning_rate': 9.388437649307215e-06, 'epoch': 1.2}
{'loss': 1.8298, 'learning_rate': 9.412326803631153e-06, 'epoch': 1.2}


 61%|██████    | 398/656 [00:26<00:13, 19.30it/s]

{'loss': 1.8351, 'learning_rate': 9.436215957955089e-06, 'epoch': 1.2}
{'loss': 1.846, 'learning_rate': 9.460105112279026e-06, 'epoch': 1.21}
{'loss': 1.8701, 'learning_rate': 9.483994266602962e-06, 'epoch': 1.21}
{'loss': 1.8589, 'learning_rate': 9.5078834209269e-06, 'epoch': 1.21}


 61%|██████▏   | 403/656 [00:26<00:12, 19.91it/s]

{'loss': 1.8429, 'learning_rate': 9.531772575250838e-06, 'epoch': 1.22}
{'loss': 1.8047, 'learning_rate': 9.555661729574774e-06, 'epoch': 1.22}
{'loss': 1.8263, 'learning_rate': 9.57955088389871e-06, 'epoch': 1.22}
{'loss': 1.8331, 'learning_rate': 9.603440038222648e-06, 'epoch': 1.23}


 62%|██████▏   | 406/656 [00:26<00:13, 19.03it/s]

{'loss': 1.8684, 'learning_rate': 9.627329192546584e-06, 'epoch': 1.23}
{'loss': 1.8106, 'learning_rate': 9.65121834687052e-06, 'epoch': 1.23}
{'loss': 1.8603, 'learning_rate': 9.675107501194458e-06, 'epoch': 1.23}
{'loss': 1.8325, 'learning_rate': 9.698996655518395e-06, 'epoch': 1.24}


 62%|██████▎   | 410/656 [00:27<00:12, 19.31it/s]

{'loss': 1.8213, 'learning_rate': 9.722885809842331e-06, 'epoch': 1.24}
{'loss': 1.8336, 'learning_rate': 9.746774964166269e-06, 'epoch': 1.24}
{'loss': 1.8081, 'learning_rate': 9.770664118490205e-06, 'epoch': 1.25}
{'loss': 1.841, 'learning_rate': 9.794553272814143e-06, 'epoch': 1.25}


 63%|██████▎   | 413/656 [00:27<00:12, 19.15it/s]

{'loss': 1.8352, 'learning_rate': 9.81844242713808e-06, 'epoch': 1.25}
{'loss': 1.8586, 'learning_rate': 9.842331581462017e-06, 'epoch': 1.26}
{'loss': 1.7933, 'learning_rate': 9.866220735785954e-06, 'epoch': 1.26}


 64%|██████▎   | 417/656 [00:27<00:12, 18.40it/s]

{'loss': 1.8488, 'learning_rate': 9.89010989010989e-06, 'epoch': 1.26}
{'loss': 1.8413, 'learning_rate': 9.913999044433828e-06, 'epoch': 1.27}
{'loss': 1.8177, 'learning_rate': 9.937888198757764e-06, 'epoch': 1.27}
{'loss': 1.8317, 'learning_rate': 9.961777353081702e-06, 'epoch': 1.27}


 64%|██████▍   | 421/656 [00:27<00:12, 18.84it/s]

{'loss': 1.8592, 'learning_rate': 9.98566650740564e-06, 'epoch': 1.27}
{'loss': 1.8031, 'learning_rate': 1.0009555661729576e-05, 'epoch': 1.28}
{'loss': 1.8374, 'learning_rate': 1.0033444816053512e-05, 'epoch': 1.28}
{'loss': 1.8549, 'learning_rate': 1.005733397037745e-05, 'epoch': 1.28}


 65%|██████▍   | 425/656 [00:27<00:13, 17.65it/s]

{'loss': 1.8145, 'learning_rate': 1.0081223124701385e-05, 'epoch': 1.29}
{'loss': 1.8225, 'learning_rate': 1.0105112279025323e-05, 'epoch': 1.29}
{'loss': 1.8235, 'learning_rate': 1.0129001433349259e-05, 'epoch': 1.29}
{'loss': 1.8255, 'learning_rate': 1.0152890587673197e-05, 'epoch': 1.3}


 65%|██████▌   | 429/656 [00:28<00:12, 18.50it/s]

{'loss': 1.8562, 'learning_rate': 1.0176779741997133e-05, 'epoch': 1.3}
{'loss': 1.8186, 'learning_rate': 1.020066889632107e-05, 'epoch': 1.3}
{'loss': 1.8202, 'learning_rate': 1.0224558050645007e-05, 'epoch': 1.3}
{'loss': 1.8243, 'learning_rate': 1.0248447204968944e-05, 'epoch': 1.31}


 66%|██████▌   | 433/656 [00:28<00:12, 17.51it/s]

{'loss': 1.8163, 'learning_rate': 1.0272336359292882e-05, 'epoch': 1.31}
{'loss': 1.808, 'learning_rate': 1.0296225513616818e-05, 'epoch': 1.31}
{'loss': 1.8274, 'learning_rate': 1.0320114667940756e-05, 'epoch': 1.32}
{'loss': 1.795, 'learning_rate': 1.0344003822264692e-05, 'epoch': 1.32}


 67%|██████▋   | 438/656 [00:28<00:11, 19.76it/s]

{'loss': 1.8086, 'learning_rate': 1.036789297658863e-05, 'epoch': 1.32}
{'loss': 1.8057, 'learning_rate': 1.0391782130912567e-05, 'epoch': 1.33}
{'loss': 1.8307, 'learning_rate': 1.0415671285236503e-05, 'epoch': 1.33}
{'loss': 1.7971, 'learning_rate': 1.0439560439560441e-05, 'epoch': 1.33}


 67%|██████▋   | 440/656 [00:28<00:11, 18.88it/s]

{'loss': 1.8014, 'learning_rate': 1.0463449593884377e-05, 'epoch': 1.34}
{'loss': 1.8126, 'learning_rate': 1.0487338748208313e-05, 'epoch': 1.34}
{'loss': 1.813, 'learning_rate': 1.0511227902532251e-05, 'epoch': 1.34}


 68%|██████▊   | 444/656 [00:28<00:12, 17.64it/s]

{'loss': 1.8084, 'learning_rate': 1.0535117056856187e-05, 'epoch': 1.34}
{'loss': 1.8323, 'learning_rate': 1.0559006211180125e-05, 'epoch': 1.35}
{'loss': 1.8208, 'learning_rate': 1.058289536550406e-05, 'epoch': 1.35}
{'loss': 1.821, 'learning_rate': 1.0606784519827999e-05, 'epoch': 1.35}


 68%|██████▊   | 449/656 [00:29<00:10, 19.51it/s]

{'loss': 1.8249, 'learning_rate': 1.0630673674151935e-05, 'epoch': 1.36}
{'loss': 1.8111, 'learning_rate': 1.0654562828475872e-05, 'epoch': 1.36}
{'loss': 1.8117, 'learning_rate': 1.067845198279981e-05, 'epoch': 1.36}
{'loss': 1.8085, 'learning_rate': 1.0702341137123746e-05, 'epoch': 1.37}
{'loss': 1.7961, 'learning_rate': 1.0726230291447684e-05, 'epoch': 1.37}


 69%|██████▉   | 453/656 [00:29<00:10, 20.22it/s]

{'loss': 1.8018, 'learning_rate': 1.075011944577162e-05, 'epoch': 1.37}
{'loss': 1.7733, 'learning_rate': 1.0774008600095558e-05, 'epoch': 1.38}
{'loss': 1.8281, 'learning_rate': 1.0797897754419494e-05, 'epoch': 1.38}
{'loss': 1.8184, 'learning_rate': 1.0821786908743431e-05, 'epoch': 1.38}


 70%|██████▉   | 456/656 [00:29<00:11, 17.66it/s]

{'loss': 1.8422, 'learning_rate': 1.0845676063067369e-05, 'epoch': 1.38}
{'loss': 1.784, 'learning_rate': 1.0869565217391305e-05, 'epoch': 1.39}
{'loss': 1.8091, 'learning_rate': 1.0893454371715243e-05, 'epoch': 1.39}


 70%|███████   | 460/656 [00:29<00:11, 17.21it/s]

{'loss': 1.811, 'learning_rate': 1.0917343526039179e-05, 'epoch': 1.39}
{'loss': 1.836, 'learning_rate': 1.0941232680363115e-05, 'epoch': 1.4}
{'loss': 1.7869, 'learning_rate': 1.0965121834687053e-05, 'epoch': 1.4}


 71%|███████   | 463/656 [00:30<00:10, 18.36it/s]

{'loss': 1.7857, 'learning_rate': 1.0989010989010989e-05, 'epoch': 1.4}
{'loss': 1.8243, 'learning_rate': 1.1012900143334926e-05, 'epoch': 1.41}
{'loss': 1.7669, 'learning_rate': 1.1036789297658862e-05, 'epoch': 1.41}
{'loss': 1.8163, 'learning_rate': 1.10606784519828e-05, 'epoch': 1.41}


 71%|███████   | 467/656 [00:30<00:10, 18.27it/s]

{'loss': 1.7853, 'learning_rate': 1.1084567606306736e-05, 'epoch': 1.41}
{'loss': 1.7984, 'learning_rate': 1.1108456760630674e-05, 'epoch': 1.42}
{'loss': 1.7897, 'learning_rate': 1.1132345914954612e-05, 'epoch': 1.42}
{'loss': 1.8121, 'learning_rate': 1.1156235069278548e-05, 'epoch': 1.42}


 72%|███████▏  | 472/656 [00:30<00:09, 19.64it/s]

{'loss': 1.8085, 'learning_rate': 1.1180124223602485e-05, 'epoch': 1.43}
{'loss': 1.7888, 'learning_rate': 1.1204013377926421e-05, 'epoch': 1.43}
{'loss': 1.7968, 'learning_rate': 1.122790253225036e-05, 'epoch': 1.43}
{'loss': 1.8001, 'learning_rate': 1.1251791686574297e-05, 'epoch': 1.44}
{'loss': 1.8012, 'learning_rate': 1.1275680840898233e-05, 'epoch': 1.44}


 73%|███████▎  | 476/656 [00:30<00:08, 20.22it/s]

{'loss': 1.8007, 'learning_rate': 1.129956999522217e-05, 'epoch': 1.44}
{'loss': 1.7849, 'learning_rate': 1.1323459149546107e-05, 'epoch': 1.45}
{'loss': 1.7945, 'learning_rate': 1.1347348303870044e-05, 'epoch': 1.45}
{'loss': 1.8041, 'learning_rate': 1.137123745819398e-05, 'epoch': 1.45}


 73%|███████▎  | 481/656 [00:30<00:08, 20.22it/s]

{'loss': 1.8067, 'learning_rate': 1.1395126612517917e-05, 'epoch': 1.45}
{'loss': 1.7671, 'learning_rate': 1.1419015766841854e-05, 'epoch': 1.46}
{'loss': 1.7953, 'learning_rate': 1.144290492116579e-05, 'epoch': 1.46}
{'loss': 1.8026, 'learning_rate': 1.1466794075489728e-05, 'epoch': 1.46}


 74%|███████▍  | 484/656 [00:31<00:08, 20.02it/s]

{'loss': 1.8103, 'learning_rate': 1.1490683229813664e-05, 'epoch': 1.47}
{'loss': 1.7723, 'learning_rate': 1.1514572384137602e-05, 'epoch': 1.47}
{'loss': 1.7542, 'learning_rate': 1.153846153846154e-05, 'epoch': 1.47}
{'loss': 1.7968, 'learning_rate': 1.1562350692785476e-05, 'epoch': 1.48}


 74%|███████▍  | 488/656 [00:31<00:09, 18.28it/s]

{'loss': 1.7753, 'learning_rate': 1.1586239847109413e-05, 'epoch': 1.48}
{'loss': 1.8154, 'learning_rate': 1.161012900143335e-05, 'epoch': 1.48}
{'loss': 1.758, 'learning_rate': 1.1634018155757287e-05, 'epoch': 1.48}
{'loss': 1.7878, 'learning_rate': 1.1657907310081223e-05, 'epoch': 1.49}


 75%|███████▌  | 493/656 [00:31<00:08, 19.27it/s]

{'loss': 1.7998, 'learning_rate': 1.168179646440516e-05, 'epoch': 1.49}
{'loss': 1.7977, 'learning_rate': 1.1705685618729099e-05, 'epoch': 1.49}
{'loss': 1.8187, 'learning_rate': 1.1729574773053035e-05, 'epoch': 1.5}
{'loss': 1.7595, 'learning_rate': 1.1753463927376972e-05, 'epoch': 1.5}


 76%|███████▌  | 496/656 [00:31<00:08, 18.08it/s]

{'loss': 1.7664, 'learning_rate': 1.1777353081700908e-05, 'epoch': 1.5}
{'loss': 1.7633, 'learning_rate': 1.1801242236024846e-05, 'epoch': 1.51}
{'loss': 1.7844, 'learning_rate': 1.1825131390348782e-05, 'epoch': 1.51}
{'loss': 1.7498, 'learning_rate': 1.1849020544672718e-05, 'epoch': 1.51}


 76%|███████▋  | 501/656 [00:31<00:07, 19.65it/s]

{'loss': 1.7533, 'learning_rate': 1.1872909698996656e-05, 'epoch': 1.52}
{'loss': 1.7604, 'learning_rate': 1.1896798853320592e-05, 'epoch': 1.52}
{'loss': 1.7597, 'learning_rate': 1.192068800764453e-05, 'epoch': 1.52}
{'loss': 1.7691, 'learning_rate': 1.1944577161968466e-05, 'epoch': 1.52}


 77%|███████▋  | 505/656 [00:32<00:07, 19.36it/s]

{'loss': 1.7726, 'learning_rate': 1.1968466316292403e-05, 'epoch': 1.53}
{'loss': 1.7497, 'learning_rate': 1.1992355470616341e-05, 'epoch': 1.53}
{'loss': 1.7661, 'learning_rate': 1.2016244624940277e-05, 'epoch': 1.53}
{'loss': 1.7671, 'learning_rate': 1.2040133779264215e-05, 'epoch': 1.54}


 77%|███████▋  | 507/656 [00:32<00:08, 17.88it/s]

{'loss': 1.7963, 'learning_rate': 1.2064022933588151e-05, 'epoch': 1.54}
{'loss': 1.7491, 'learning_rate': 1.2087912087912089e-05, 'epoch': 1.54}
{'loss': 1.7736, 'learning_rate': 1.2111801242236026e-05, 'epoch': 1.55}


 78%|███████▊  | 512/656 [00:32<00:07, 18.89it/s]

{'loss': 1.7152, 'learning_rate': 1.2135690396559962e-05, 'epoch': 1.55}
{'loss': 1.7432, 'learning_rate': 1.21595795508839e-05, 'epoch': 1.55}
{'loss': 1.7904, 'learning_rate': 1.2183468705207836e-05, 'epoch': 1.55}
{'loss': 1.7669, 'learning_rate': 1.2207357859531774e-05, 'epoch': 1.56}


 79%|███████▊  | 516/656 [00:32<00:07, 19.16it/s]

{'loss': 1.7447, 'learning_rate': 1.223124701385571e-05, 'epoch': 1.56}
{'loss': 1.7671, 'learning_rate': 1.2255136168179648e-05, 'epoch': 1.56}
{'loss': 1.7346, 'learning_rate': 1.2279025322503584e-05, 'epoch': 1.57}
{'loss': 1.7534, 'learning_rate': 1.230291447682752e-05, 'epoch': 1.57}


 79%|███████▉  | 519/656 [00:32<00:07, 19.51it/s]

{'loss': 1.7734, 'learning_rate': 1.2326803631151458e-05, 'epoch': 1.57}
{'loss': 1.788, 'learning_rate': 1.2350692785475394e-05, 'epoch': 1.58}
{'loss': 1.754, 'learning_rate': 1.2374581939799331e-05, 'epoch': 1.58}
{'loss': 1.7735, 'learning_rate': 1.2398471094123269e-05, 'epoch': 1.58}


                                                 

{'loss': 1.7588, 'learning_rate': 1.2422360248447205e-05, 'epoch': 1.59}
{'loss': 1.7561, 'learning_rate': 1.2446249402771143e-05, 'epoch': 1.59}
{'loss': 1.7692, 'learning_rate': 1.2470138557095079e-05, 'epoch': 1.59}
{'loss': 1.7226, 'learning_rate': 1.2494027711419017e-05, 'epoch': 1.59}
{'loss': 1.7826, 'learning_rate': 1.2517916865742954e-05, 'epoch': 1.6}


 81%|████████  | 529/656 [00:33<00:06, 20.17it/s]

{'loss': 1.7261, 'learning_rate': 1.254180602006689e-05, 'epoch': 1.6}
{'loss': 1.6987, 'learning_rate': 1.2565695174390826e-05, 'epoch': 1.6}
{'loss': 1.7481, 'learning_rate': 1.2589584328714766e-05, 'epoch': 1.61}
{'loss': 1.7675, 'learning_rate': 1.2613473483038702e-05, 'epoch': 1.61}
{'loss': 1.7598, 'learning_rate': 1.2637362637362638e-05, 'epoch': 1.61}


 81%|████████▏ | 533/656 [00:33<00:06, 19.33it/s]

{'loss': 1.7655, 'learning_rate': 1.2661251791686574e-05, 'epoch': 1.62}
{'loss': 1.7247, 'learning_rate': 1.2685140946010512e-05, 'epoch': 1.62}
{'loss': 1.739, 'learning_rate': 1.270903010033445e-05, 'epoch': 1.62}
{'loss': 1.7274, 'learning_rate': 1.2732919254658385e-05, 'epoch': 1.62}


 82%|████████▏ | 538/656 [00:33<00:05, 20.55it/s]

{'loss': 1.7472, 'learning_rate': 1.2756808408982323e-05, 'epoch': 1.63}
{'loss': 1.7262, 'learning_rate': 1.278069756330626e-05, 'epoch': 1.63}
{'loss': 1.7375, 'learning_rate': 1.2804586717630195e-05, 'epoch': 1.63}
{'loss': 1.7415, 'learning_rate': 1.2828475871954135e-05, 'epoch': 1.64}
{'loss': 1.7472, 'learning_rate': 1.285236502627807e-05, 'epoch': 1.64}


 83%|████████▎ | 543/656 [00:34<00:05, 20.88it/s]

{'loss': 1.7016, 'learning_rate': 1.2876254180602007e-05, 'epoch': 1.64}
{'loss': 1.7855, 'learning_rate': 1.2900143334925943e-05, 'epoch': 1.65}
{'loss': 1.7632, 'learning_rate': 1.2924032489249882e-05, 'epoch': 1.65}
{'loss': 1.7412, 'learning_rate': 1.2947921643573818e-05, 'epoch': 1.65}
{'loss': 1.7337, 'learning_rate': 1.2971810797897754e-05, 'epoch': 1.66}


 83%|████████▎ | 547/656 [00:34<00:05, 20.41it/s]

{'loss': 1.7248, 'learning_rate': 1.2995699952221694e-05, 'epoch': 1.66}
{'loss': 1.7661, 'learning_rate': 1.301958910654563e-05, 'epoch': 1.66}
{'loss': 1.7262, 'learning_rate': 1.3043478260869566e-05, 'epoch': 1.66}
{'loss': 1.7359, 'learning_rate': 1.3067367415193502e-05, 'epoch': 1.67}


 84%|████████▍ | 551/656 [00:34<00:05, 20.67it/s]

{'loss': 1.6797, 'learning_rate': 1.3091256569517441e-05, 'epoch': 1.67}
{'loss': 1.7653, 'learning_rate': 1.3115145723841377e-05, 'epoch': 1.67}
{'loss': 1.7327, 'learning_rate': 1.3139034878165313e-05, 'epoch': 1.68}
{'loss': 1.7306, 'learning_rate': 1.3162924032489251e-05, 'epoch': 1.68}


 85%|████████▍ | 555/656 [00:34<00:05, 19.96it/s]

{'loss': 1.7273, 'learning_rate': 1.3186813186813187e-05, 'epoch': 1.68}
{'loss': 1.7074, 'learning_rate': 1.3210702341137123e-05, 'epoch': 1.69}
{'loss': 1.7143, 'learning_rate': 1.323459149546106e-05, 'epoch': 1.69}
{'loss': 1.7428, 'learning_rate': 1.3258480649784999e-05, 'epoch': 1.69}


 85%|████████▌ | 559/656 [00:34<00:05, 18.52it/s]

{'loss': 1.7424, 'learning_rate': 1.3282369804108935e-05, 'epoch': 1.7}
{'loss': 1.7547, 'learning_rate': 1.330625895843287e-05, 'epoch': 1.7}
{'loss': 1.7171, 'learning_rate': 1.333014811275681e-05, 'epoch': 1.7}
{'loss': 1.7048, 'learning_rate': 1.3354037267080746e-05, 'epoch': 1.7}


 86%|████████▌ | 563/656 [00:35<00:04, 18.74it/s]

{'loss': 1.74, 'learning_rate': 1.3377926421404682e-05, 'epoch': 1.71}
{'loss': 1.7266, 'learning_rate': 1.3401815575728622e-05, 'epoch': 1.71}
{'loss': 1.7407, 'learning_rate': 1.3425704730052558e-05, 'epoch': 1.71}
{'loss': 1.6912, 'learning_rate': 1.3449593884376494e-05, 'epoch': 1.72}


 86%|████████▋ | 567/656 [00:35<00:04, 18.67it/s]

{'loss': 1.7156, 'learning_rate': 1.347348303870043e-05, 'epoch': 1.72}
{'loss': 1.7286, 'learning_rate': 1.3497372193024369e-05, 'epoch': 1.72}
{'loss': 1.6819, 'learning_rate': 1.3521261347348305e-05, 'epoch': 1.73}
{'loss': 1.6976, 'learning_rate': 1.3545150501672241e-05, 'epoch': 1.73}


 87%|████████▋ | 572/656 [00:35<00:04, 19.66it/s]

{'loss': 1.6698, 'learning_rate': 1.3569039655996179e-05, 'epoch': 1.73}
{'loss': 1.7111, 'learning_rate': 1.3592928810320115e-05, 'epoch': 1.73}
{'loss': 1.7226, 'learning_rate': 1.3616817964644053e-05, 'epoch': 1.74}
{'loss': 1.709, 'learning_rate': 1.3640707118967989e-05, 'epoch': 1.74}
{'loss': 1.6983, 'learning_rate': 1.3664596273291926e-05, 'epoch': 1.74}


 88%|████████▊ | 577/656 [00:35<00:03, 20.99it/s]

{'loss': 1.6782, 'learning_rate': 1.3688485427615862e-05, 'epoch': 1.75}
{'loss': 1.6711, 'learning_rate': 1.3712374581939799e-05, 'epoch': 1.75}
{'loss': 1.6947, 'learning_rate': 1.3736263736263738e-05, 'epoch': 1.75}
{'loss': 1.6998, 'learning_rate': 1.3760152890587674e-05, 'epoch': 1.76}


 89%|████████▊ | 581/656 [00:36<00:03, 21.51it/s]

{'loss': 1.6744, 'learning_rate': 1.378404204491161e-05, 'epoch': 1.76}
{'loss': 1.6607, 'learning_rate': 1.3807931199235546e-05, 'epoch': 1.76}
{'loss': 1.6848, 'learning_rate': 1.3831820353559485e-05, 'epoch': 1.77}
{'loss': 1.6546, 'learning_rate': 1.3855709507883422e-05, 'epoch': 1.77}
{'loss': 1.7242, 'learning_rate': 1.3879598662207358e-05, 'epoch': 1.77}


 89%|████████▉ | 584/656 [00:36<00:03, 21.02it/s]

{'loss': 1.6851, 'learning_rate': 1.3903487816531297e-05, 'epoch': 1.77}
{'loss': 1.6822, 'learning_rate': 1.3927376970855233e-05, 'epoch': 1.78}
{'loss': 1.6655, 'learning_rate': 1.3951266125179169e-05, 'epoch': 1.78}


 90%|████████▉ | 589/656 [00:36<00:03, 20.23it/s]

{'loss': 1.6707, 'learning_rate': 1.3975155279503105e-05, 'epoch': 1.78}
{'loss': 1.7131, 'learning_rate': 1.3999044433827045e-05, 'epoch': 1.79}
{'loss': 1.6643, 'learning_rate': 1.402293358815098e-05, 'epoch': 1.79}
{'loss': 1.6821, 'learning_rate': 1.4046822742474917e-05, 'epoch': 1.79}


 90%|█████████ | 592/656 [00:36<00:03, 19.68it/s]

{'loss': 1.6486, 'learning_rate': 1.4070711896798854e-05, 'epoch': 1.8}
{'loss': 1.7068, 'learning_rate': 1.409460105112279e-05, 'epoch': 1.8}
{'loss': 1.6729, 'learning_rate': 1.4118490205446726e-05, 'epoch': 1.8}
{'loss': 1.7232, 'learning_rate': 1.4142379359770666e-05, 'epoch': 1.8}


 91%|█████████ | 596/656 [00:36<00:03, 18.22it/s]

{'loss': 1.662, 'learning_rate': 1.4166268514094602e-05, 'epoch': 1.81}
{'loss': 1.7109, 'learning_rate': 1.4190157668418538e-05, 'epoch': 1.81}
{'loss': 1.6674, 'learning_rate': 1.4214046822742474e-05, 'epoch': 1.81}
{'loss': 1.5871, 'learning_rate': 1.4237935977066413e-05, 'epoch': 1.82}


 92%|█████████▏| 601/656 [00:37<00:02, 19.18it/s]

{'loss': 1.6684, 'learning_rate': 1.426182513139035e-05, 'epoch': 1.82}
{'loss': 1.6708, 'learning_rate': 1.4285714285714285e-05, 'epoch': 1.82}
{'loss': 1.7027, 'learning_rate': 1.4309603440038225e-05, 'epoch': 1.83}
{'loss': 1.6326, 'learning_rate': 1.4333492594362161e-05, 'epoch': 1.83}


 92%|█████████▏| 604/656 [00:37<00:02, 18.96it/s]

{'loss': 1.6645, 'learning_rate': 1.4357381748686097e-05, 'epoch': 1.83}
{'loss': 1.6915, 'learning_rate': 1.4381270903010033e-05, 'epoch': 1.84}
{'loss': 1.7049, 'learning_rate': 1.4405160057333972e-05, 'epoch': 1.84}
{'loss': 1.7012, 'learning_rate': 1.4429049211657908e-05, 'epoch': 1.84}


 93%|█████████▎| 607/656 [00:37<00:02, 17.60it/s]

{'loss': 1.6576, 'learning_rate': 1.4452938365981844e-05, 'epoch': 1.84}
{'loss': 1.6883, 'learning_rate': 1.4476827520305782e-05, 'epoch': 1.85}
{'loss': 1.7088, 'learning_rate': 1.4500716674629718e-05, 'epoch': 1.85}


 93%|█████████▎| 612/656 [00:37<00:02, 17.95it/s]

{'loss': 1.6119, 'learning_rate': 1.4524605828953656e-05, 'epoch': 1.85}
{'loss': 1.656, 'learning_rate': 1.4548494983277592e-05, 'epoch': 1.86}
{'loss': 1.6264, 'learning_rate': 1.457238413760153e-05, 'epoch': 1.86}
{'loss': 1.6388, 'learning_rate': 1.4596273291925466e-05, 'epoch': 1.86}


 94%|█████████▍| 615/656 [00:37<00:02, 19.10it/s]

{'loss': 1.6882, 'learning_rate': 1.4620162446249402e-05, 'epoch': 1.87}
{'loss': 1.6884, 'learning_rate': 1.4644051600573341e-05, 'epoch': 1.87}
{'loss': 1.6591, 'learning_rate': 1.4667940754897277e-05, 'epoch': 1.87}
{'loss': 1.6074, 'learning_rate': 1.4691829909221213e-05, 'epoch': 1.88}


 94%|█████████▍| 619/656 [00:38<00:01, 18.68it/s]

{'loss': 1.6761, 'learning_rate': 1.4715719063545153e-05, 'epoch': 1.88}
{'loss': 1.6209, 'learning_rate': 1.4739608217869089e-05, 'epoch': 1.88}
{'loss': 1.6425, 'learning_rate': 1.4763497372193025e-05, 'epoch': 1.88}
{'loss': 1.6232, 'learning_rate': 1.478738652651696e-05, 'epoch': 1.89}


 95%|█████████▍| 623/656 [00:38<00:01, 18.24it/s]

{'loss': 1.6331, 'learning_rate': 1.48112756808409e-05, 'epoch': 1.89}
{'loss': 1.6228, 'learning_rate': 1.4835164835164836e-05, 'epoch': 1.89}
{'loss': 1.6398, 'learning_rate': 1.4859053989488772e-05, 'epoch': 1.9}
{'loss': 1.6569, 'learning_rate': 1.4882943143812712e-05, 'epoch': 1.9}


 95%|█████████▌| 626/656 [00:38<00:01, 17.06it/s]

{'loss': 1.6466, 'learning_rate': 1.4906832298136648e-05, 'epoch': 1.9}
{'loss': 1.6161, 'learning_rate': 1.4930721452460584e-05, 'epoch': 1.91}
{'loss': 1.6269, 'learning_rate': 1.495461060678452e-05, 'epoch': 1.91}


 96%|█████████▌| 630/656 [00:38<00:01, 18.44it/s]

{'loss': 1.6292, 'learning_rate': 1.4978499761108458e-05, 'epoch': 1.91}
{'loss': 1.6399, 'learning_rate': 1.5002388915432394e-05, 'epoch': 1.91}
{'loss': 1.6501, 'learning_rate': 1.502627806975633e-05, 'epoch': 1.92}
{'loss': 1.6894, 'learning_rate': 1.5050167224080269e-05, 'epoch': 1.92}


 97%|█████████▋| 634/656 [00:38<00:01, 19.48it/s]

{'loss': 1.6334, 'learning_rate': 1.5074056378404205e-05, 'epoch': 1.92}
{'loss': 1.6596, 'learning_rate': 1.5097945532728141e-05, 'epoch': 1.93}
{'loss': 1.6128, 'learning_rate': 1.5121834687052077e-05, 'epoch': 1.93}
{'loss': 1.6228, 'learning_rate': 1.5145723841376017e-05, 'epoch': 1.93}


 97%|█████████▋| 639/656 [00:39<00:00, 20.79it/s]

{'loss': 1.6606, 'learning_rate': 1.5169612995699953e-05, 'epoch': 1.94}
{'loss': 1.6349, 'learning_rate': 1.5193502150023889e-05, 'epoch': 1.94}
{'loss': 1.6432, 'learning_rate': 1.5217391304347828e-05, 'epoch': 1.94}
{'loss': 1.6114, 'learning_rate': 1.5241280458671764e-05, 'epoch': 1.95}
{'loss': 1.5823, 'learning_rate': 1.52651696129957e-05, 'epoch': 1.95}


 98%|█████████▊| 644/656 [00:39<00:00, 21.29it/s]

{'loss': 1.6439, 'learning_rate': 1.528905876731964e-05, 'epoch': 1.95}
{'loss': 1.6347, 'learning_rate': 1.5312947921643576e-05, 'epoch': 1.95}
{'loss': 1.6161, 'learning_rate': 1.5336837075967512e-05, 'epoch': 1.96}
{'loss': 1.5982, 'learning_rate': 1.5360726230291448e-05, 'epoch': 1.96}
{'loss': 1.5809, 'learning_rate': 1.5384615384615387e-05, 'epoch': 1.96}


 99%|█████████▉| 649/656 [00:39<00:00, 20.56it/s]

{'loss': 1.5849, 'learning_rate': 1.5408504538939323e-05, 'epoch': 1.97}
{'loss': 1.5857, 'learning_rate': 1.543239369326326e-05, 'epoch': 1.97}
{'loss': 1.6424, 'learning_rate': 1.54562828475872e-05, 'epoch': 1.97}
{'loss': 1.5686, 'learning_rate': 1.5480172001911135e-05, 'epoch': 1.98}


 99%|█████████▉| 652/656 [00:39<00:00, 18.15it/s]

{'loss': 1.6423, 'learning_rate': 1.550406115623507e-05, 'epoch': 1.98}
{'loss': 1.6077, 'learning_rate': 1.5527950310559007e-05, 'epoch': 1.98}
{'loss': 1.6088, 'learning_rate': 1.5551839464882946e-05, 'epoch': 1.98}


100%|██████████| 656/656 [00:39<00:00, 18.49it/s]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, tokens, utterance. If token_labels, tokens, utterance are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'loss': 1.5499, 'learning_rate': 1.5575728619206882e-05, 'epoch': 1.99}
{'loss': 1.5711, 'learning_rate': 1.5599617773530818e-05, 'epoch': 1.99}
{'loss': 1.5938, 'learning_rate': 1.5623506927854754e-05, 'epoch': 1.99}
{'loss': 1.5604, 'learning_rate': 1.564739608217869e-05, 'epoch': 2.0}
{'loss': 1.6406, 'learning_rate': 1.5671285236502626e-05, 'epoch': 2.0}


                                                 
100%|██████████| 656/656 [00:43<00:00, 18.49it/s]Saving model checkpoint to ./snips_clf/results\checkpoint-656
Configuration saved in ./snips_clf/results\checkpoint-656\config.json


{'eval_loss': 1.570827603340149, 'eval_accuracy': 0.8868933893771495, 'eval_runtime': 3.9241, 'eval_samples_per_second': 666.898, 'eval_steps_per_second': 20.896, 'epoch': 2.0}


Model weights saved in ./snips_clf/results\checkpoint-656\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./snips_clf/results\checkpoint-656 (score: 1.570827603340149).
100%|██████████| 656/656 [00:45<00:00, 14.43it/s]

{'train_runtime': 45.4625, 'train_samples_per_second': 460.467, 'train_steps_per_second': 14.429, 'train_loss': 1.8436544395801497, 'epoch': 2.0}





TrainOutput(global_step=656, training_loss=1.8436544395801497, metrics={'train_runtime': 45.4625, 'train_samples_per_second': 460.467, 'train_steps_per_second': 14.429, 'train_loss': 1.8436544395801497, 'epoch': 2.0})

In [67]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, tokens, utterance. If token_labels, tokens, utterance are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
100%|██████████| 82/82 [00:02<00:00, 32.01it/s]


{'eval_loss': 1.570827603340149,
 'eval_accuracy': 0.8868933893771495,
 'eval_runtime': 2.7608,
 'eval_samples_per_second': 947.897,
 'eval_steps_per_second': 29.701,
 'epoch': 2.0}

# BERT for Token Classification

In [68]:
from transformers import DataCollatorForTokenClassification, DistilBertForTokenClassification, \
                            DistilBertTokenizerFast, pipeline

In [69]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

loading file vocab.txt from cache at C:\Users\Muham/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411\vocab.txt
loading file tokenizer.json from cache at C:\Users\Muham/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411\tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at C:\Users\Muham/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411\tokenizer_config.json
loading configuration file config.json from cache at C:\Users\Muham/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411\config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_d

In [95]:
snips_dataset['train'][0:2]

# tokenizer(
#         snips_dataset['train'][0:14]['tokens'], truncation=True, is_split_into_words=True
#     ).word_ids(batch_index=13)

{'utterance': ['in seven hours from now will it rain at my current place',
  'add this nozomi tsuji tune to my hot house playlist'],
 'label': [4, 2],
 'tokens': [['in',
   'seven',
   'hours',
   'from',
   'now',
   'will',
   'it',
   'rain',
   'at',
   'my',
   'current',
   'place'],
  ['add',
   'this',
   'nozomi',
   'tsuji',
   'tune',
   'to',
   'my',
   'hot',
   'house',
   'playlist']],
 'token_labels': [[65, 57, 0, 0, 0, 65, 65, 13, 65, 65, 12, 60],
  [65, 65, 4, 24, 47, 65, 36, 2, 23, 65]]}

In [101]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples['tokens'], truncation=True, is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["token_labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100) # Magic number, where the loss is ignored for that label
            elif word_ids != previous_word_idx: # only label the first token of a given word
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Example:
tokenize_and_align_labels(snips_dataset['train'][0:5])

{'input_ids': [[101, 1999, 2698, 2847, 2013, 2085, 2097, 2009, 4542, 2012, 2026, 2783, 2173, 102], [101, 5587, 2023, 2053, 6844, 4328, 24529, 23049, 2072, 8694, 2000, 2026, 2980, 2160, 2377, 9863, 102], [101, 1045, 3446, 2023, 16432, 1014, 102], [101, 2507, 2033, 1996, 3185, 6134, 2005, 3152, 1999, 1996, 2181, 102], [101, 2338, 1037, 3962, 2005, 2416, 2012, 1037, 4825, 2008, 4240, 3869, 1998, 11772, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 65, 57, 0, 0, 0, 65, 65, 13, 65, 65, 12, 60, -100], [-100, 65, 65, 4, 4, 4, 24, 24, 24, 47, 65, 36, 2, 23, 65, 65, -100], [-100, 65, 65, 32, 51, 31, -100], [-100, 65, 65, 65, 51, 45, 65, 69, 44, 18, 18, -100], [-100, 65, 65, 65, 65, 27, 65, 65, 19, 65, 65, 26, 50, 50, -100]]}

In [104]:
tok_clf_tokenized_snips = snips_dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 10467/10467 [00:01<00:00, 7095.04 examples/s]
Map: 100%|██████████| 2617/2617 [00:00<00:00, 7309.02 examples/s]


In [110]:
# Some columns may cause confusion to the token classification model like: label (which is the seq label)
#        so better remove them.
tok_clf_tokenized_snips = tok_clf_tokenized_snips.remove_columns(
    ['utterance', 'label', 'tokens', 'token_labels']
)

In [118]:
[tok_clf_tokenized_snips['train'][0]]

[{'input_ids': [101,
   1999,
   2698,
   2847,
   2013,
   2085,
   2097,
   2009,
   4542,
   2012,
   2026,
   2783,
   2173,
   102],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  'labels': [-100, 65, 57, 0, 0, 0, 65, 65, 13, 65, 65, 12, 60, -100]}]

In [123]:
tok_data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Example
tok_data_collator([tok_clf_tokenized_snips['train'][0],tok_clf_tokenized_snips['train'][1]])

{'input_ids': tensor([[  101,  1999,  2698,  2847,  2013,  2085,  2097,  2009,  4542,  2012,
          2026,  2783,  2173,   102,     0,     0,     0],
        [  101,  5587,  2023,  2053,  6844,  4328, 24529, 23049,  2072,  8694,
          2000,  2026,  2980,  2160,  2377,  9863,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[-100,   65,   57,    0,    0,    0,   65,   65,   13,   65,   65,   12,
           60, -100, -100, -100, -100],
        [-100,   65,   65,    4,    4,    4,   24,   24,   24,   47,   65,   36,
            2,   23,   65,   65, -100]])}

In [111]:
tok_clf_model = DistilBertForTokenClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=len(unique_token_labels)
)

tok_clf_model.config.id2label = {i: l for i, l in enumerate(unique_token_labels)}
tok_clf_model.config.id2label

loading configuration file config.json from cache at C:\Users\Muham/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411\config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",
    "27": "LABEL_27",
    "28": 

{0: 'I-timeRange',
 1: 'B-object_part_of_series_type',
 2: 'B-playlist',
 3: 'I-playlist_owner',
 4: 'B-artist',
 5: 'B-genre',
 6: 'I-cuisine',
 7: 'B-geographic_poi',
 8: 'I-service',
 9: 'B-album',
 10: 'B-track',
 11: 'I-movie_type',
 12: 'B-current_location',
 13: 'B-condition_description',
 14: 'I-restaurant_type',
 15: 'I-genre',
 16: 'B-poi',
 17: 'B-service',
 18: 'I-spatial_relation',
 19: 'B-restaurant_type',
 20: 'I-object_select',
 21: 'I-entity_name',
 22: 'B-restaurant_name',
 23: 'I-playlist',
 24: 'I-artist',
 25: 'B-object_location_type',
 26: 'B-served_dish',
 27: 'B-party_size_number',
 28: 'I-object_location_type',
 29: 'I-city',
 30: 'I-location_name',
 31: 'B-rating_value',
 32: 'B-object_select',
 33: 'I-track',
 34: 'B-movie_name',
 35: 'I-poi',
 36: 'B-playlist_owner',
 37: 'I-state',
 38: 'I-album',
 39: 'B-cuisine',
 40: 'B-year',
 41: 'I-movie_name',
 42: 'B-sort',
 43: 'B-rating_unit',
 44: 'B-spatial_relation',
 45: 'I-object_type',
 46: 'B-city',
 47: 'B

In [124]:
epochs = 2

training_args = TrainingArguments(
    output_dir="./snips_tok_clf/results",
    num_train_epochs=epochs,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,

    logging_steps=10,
    logging_strategy='steps',
    log_level='info',

    evaluation_strategy='epoch',
    save_strategy='epoch'
)

trainer = Trainer(
    model=tok_clf_model,
    args=training_args,
    train_dataset=tok_clf_tokenized_snips['train'],
    eval_dataset=tok_clf_tokenized_snips['test'],
    data_collator=tok_data_collator
)

# Check if the model is using CUDA (GPU)
is_using_cuda = trainer.args.device.type == 'cuda'
print(f"Is using CUDA: "+'\033[92m'+str(is_using_cuda)+'\033[0m')

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


Is using CUDA: [92mTrue[0m


In [125]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
100%|██████████| 82/82 [00:02<00:00, 31.41it/s]


{'eval_loss': 4.430819034576416,
 'eval_runtime': 2.6395,
 'eval_samples_per_second': 991.46,
 'eval_steps_per_second': 31.066}

In [126]:
trainer.train()

***** Running training *****
  Num examples = 10,467
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 656
  Number of trainable parameters = 66,418,248
  2%|▏         | 12/656 [00:01<01:07,  9.49it/s]

{'loss': 3.4594, 'learning_rate': 4.923780487804878e-05, 'epoch': 0.03}


  3%|▎         | 21/656 [00:02<01:06,  9.51it/s]

{'loss': 2.672, 'learning_rate': 4.847560975609756e-05, 'epoch': 0.06}


  5%|▍         | 31/656 [00:03<01:08,  9.17it/s]

{'loss': 2.1733, 'learning_rate': 4.771341463414634e-05, 'epoch': 0.09}


  6%|▋         | 41/656 [00:04<01:02,  9.79it/s]

{'loss': 1.8842, 'learning_rate': 4.695121951219512e-05, 'epoch': 0.12}


  8%|▊         | 52/656 [00:05<01:07,  8.99it/s]

{'loss': 1.512, 'learning_rate': 4.618902439024391e-05, 'epoch': 0.15}


  9%|▉         | 61/656 [00:06<01:04,  9.27it/s]

{'loss': 1.3094, 'learning_rate': 4.542682926829269e-05, 'epoch': 0.18}


 11%|█         | 72/656 [00:07<01:04,  9.07it/s]

{'loss': 1.1487, 'learning_rate': 4.466463414634147e-05, 'epoch': 0.21}


 12%|█▏        | 81/656 [00:08<01:01,  9.40it/s]

{'loss': 1.0367, 'learning_rate': 4.390243902439025e-05, 'epoch': 0.24}


 14%|█▍        | 91/656 [00:09<01:01,  9.18it/s]

{'loss': 0.9074, 'learning_rate': 4.314024390243903e-05, 'epoch': 0.27}


 16%|█▌        | 102/656 [00:10<00:58,  9.54it/s]

{'loss': 0.7869, 'learning_rate': 4.237804878048781e-05, 'epoch': 0.3}


 17%|█▋        | 112/656 [00:11<00:57,  9.46it/s]

{'loss': 0.7246, 'learning_rate': 4.161585365853659e-05, 'epoch': 0.34}


 18%|█▊        | 121/656 [00:12<00:56,  9.51it/s]

{'loss': 0.6654, 'learning_rate': 4.085365853658537e-05, 'epoch': 0.37}


 20%|█▉        | 131/656 [00:13<00:53,  9.87it/s]

{'loss': 0.6087, 'learning_rate': 4.0091463414634153e-05, 'epoch': 0.4}


 22%|██▏       | 142/656 [00:15<00:53,  9.60it/s]

{'loss': 0.5616, 'learning_rate': 3.932926829268293e-05, 'epoch': 0.43}


 23%|██▎       | 150/656 [00:15<00:48, 10.44it/s]

{'loss': 0.5431, 'learning_rate': 3.856707317073171e-05, 'epoch': 0.46}


 25%|██▍       | 162/656 [00:17<00:51,  9.57it/s]

{'loss': 0.491, 'learning_rate': 3.780487804878049e-05, 'epoch': 0.49}


 26%|██▌       | 171/656 [00:18<00:55,  8.80it/s]

{'loss': 0.4443, 'learning_rate': 3.704268292682927e-05, 'epoch': 0.52}


 28%|██▊       | 181/656 [00:19<00:52,  9.00it/s]

{'loss': 0.4163, 'learning_rate': 3.628048780487805e-05, 'epoch': 0.55}


 29%|██▉       | 191/656 [00:20<00:49,  9.37it/s]

{'loss': 0.4513, 'learning_rate': 3.551829268292683e-05, 'epoch': 0.58}


 31%|███       | 201/656 [00:21<00:49,  9.25it/s]

{'loss': 0.4095, 'learning_rate': 3.475609756097561e-05, 'epoch': 0.61}


 32%|███▏      | 212/656 [00:22<00:46,  9.51it/s]

{'loss': 0.3438, 'learning_rate': 3.399390243902439e-05, 'epoch': 0.64}


 34%|███▎      | 221/656 [00:23<00:50,  8.56it/s]

{'loss': 0.4073, 'learning_rate': 3.323170731707317e-05, 'epoch': 0.67}


 35%|███▌      | 231/656 [00:24<00:45,  9.29it/s]

{'loss': 0.3435, 'learning_rate': 3.246951219512195e-05, 'epoch': 0.7}


 37%|███▋      | 241/656 [00:25<00:48,  8.61it/s]

{'loss': 0.333, 'learning_rate': 3.170731707317073e-05, 'epoch': 0.73}


 38%|███▊      | 251/656 [00:26<00:46,  8.78it/s]

{'loss': 0.3229, 'learning_rate': 3.094512195121951e-05, 'epoch': 0.76}


 40%|███▉      | 261/656 [00:28<00:41,  9.59it/s]

{'loss': 0.3357, 'learning_rate': 3.0182926829268294e-05, 'epoch': 0.79}


 41%|████▏     | 271/656 [00:29<00:40,  9.49it/s]

{'loss': 0.2678, 'learning_rate': 2.9420731707317074e-05, 'epoch': 0.82}


 43%|████▎     | 281/656 [00:30<00:38,  9.68it/s]

{'loss': 0.2965, 'learning_rate': 2.8658536585365854e-05, 'epoch': 0.85}


 44%|████▍     | 291/656 [00:31<00:37,  9.76it/s]

{'loss': 0.3198, 'learning_rate': 2.7896341463414637e-05, 'epoch': 0.88}


 46%|████▌     | 301/656 [00:32<00:37,  9.56it/s]

{'loss': 0.2956, 'learning_rate': 2.7134146341463417e-05, 'epoch': 0.91}


 48%|████▊     | 312/656 [00:33<00:40,  8.55it/s]

{'loss': 0.2507, 'learning_rate': 2.6371951219512197e-05, 'epoch': 0.95}


 49%|████▉     | 322/656 [00:34<00:37,  9.00it/s]

{'loss': 0.2421, 'learning_rate': 2.5609756097560977e-05, 'epoch': 0.98}


 50%|█████     | 328/656 [00:35<00:30, 10.61it/s]***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32

 50%|█████     | 328/656 [00:37<00:30, 10.61it/s]Saving model checkpoint to ./snips_tok_clf/results\checkpoint-328
Configuration saved in ./snips_tok_clf/results\checkpoint-328\config.json


{'eval_loss': 0.21123558282852173, 'eval_runtime': 2.576, 'eval_samples_per_second': 1015.9, 'eval_steps_per_second': 31.832, 'epoch': 1.0}


Model weights saved in ./snips_tok_clf/results\checkpoint-328\pytorch_model.bin
 50%|█████     | 331/656 [00:44<08:01,  1.48s/it]

{'loss': 0.2768, 'learning_rate': 2.4847560975609756e-05, 'epoch': 1.01}


 52%|█████▏    | 341/656 [00:45<01:02,  5.03it/s]

{'loss': 0.2234, 'learning_rate': 2.4085365853658536e-05, 'epoch': 1.04}


 54%|█████▎    | 352/656 [00:46<00:33,  9.08it/s]

{'loss': 0.1926, 'learning_rate': 2.332317073170732e-05, 'epoch': 1.07}


 55%|█████▌    | 362/656 [00:47<00:31,  9.40it/s]

{'loss': 0.1608, 'learning_rate': 2.25609756097561e-05, 'epoch': 1.1}


 57%|█████▋    | 371/656 [00:48<00:29,  9.69it/s]

{'loss': 0.2051, 'learning_rate': 2.179878048780488e-05, 'epoch': 1.13}


 58%|█████▊    | 381/656 [00:50<00:32,  8.46it/s]

{'loss': 0.2121, 'learning_rate': 2.103658536585366e-05, 'epoch': 1.16}


 60%|█████▉    | 392/656 [00:51<00:28,  9.42it/s]

{'loss': 0.1916, 'learning_rate': 2.0274390243902442e-05, 'epoch': 1.19}


 61%|██████▏   | 402/656 [00:52<00:27,  9.36it/s]

{'loss': 0.1483, 'learning_rate': 1.9512195121951222e-05, 'epoch': 1.22}


 63%|██████▎   | 412/656 [00:53<00:25,  9.46it/s]

{'loss': 0.1509, 'learning_rate': 1.8750000000000002e-05, 'epoch': 1.25}


 64%|██████▍   | 421/656 [00:54<00:27,  8.57it/s]

{'loss': 0.1595, 'learning_rate': 1.798780487804878e-05, 'epoch': 1.28}


 66%|██████▌   | 431/656 [00:55<00:27,  8.05it/s]

{'loss': 0.1848, 'learning_rate': 1.722560975609756e-05, 'epoch': 1.31}


 67%|██████▋   | 441/656 [00:56<00:25,  8.59it/s]

{'loss': 0.166, 'learning_rate': 1.6463414634146345e-05, 'epoch': 1.34}


 69%|██████▉   | 451/656 [00:57<00:21,  9.43it/s]

{'loss': 0.1679, 'learning_rate': 1.5701219512195124e-05, 'epoch': 1.37}


 70%|███████   | 462/656 [00:58<00:21,  9.19it/s]

{'loss': 0.1401, 'learning_rate': 1.4939024390243902e-05, 'epoch': 1.4}


 72%|███████▏  | 471/656 [00:59<00:19,  9.37it/s]

{'loss': 0.1886, 'learning_rate': 1.4176829268292682e-05, 'epoch': 1.43}


 73%|███████▎  | 482/656 [01:01<00:17,  9.79it/s]

{'loss': 0.1473, 'learning_rate': 1.3414634146341466e-05, 'epoch': 1.46}


 75%|███████▌  | 492/656 [01:02<00:17,  9.47it/s]

{'loss': 0.1356, 'learning_rate': 1.2652439024390245e-05, 'epoch': 1.49}


 77%|███████▋  | 502/656 [01:03<00:16,  9.61it/s]

{'loss': 0.1643, 'learning_rate': 1.1890243902439025e-05, 'epoch': 1.52}


 78%|███████▊  | 511/656 [01:04<00:15,  9.38it/s]

{'loss': 0.1531, 'learning_rate': 1.1128048780487805e-05, 'epoch': 1.55}


 80%|███████▉  | 522/656 [01:05<00:14,  9.41it/s]

{'loss': 0.1382, 'learning_rate': 1.0365853658536585e-05, 'epoch': 1.59}


 81%|████████  | 532/656 [01:06<00:13,  9.24it/s]

{'loss': 0.208, 'learning_rate': 9.603658536585366e-06, 'epoch': 1.62}


 82%|████████▏ | 541/656 [01:07<00:11,  9.93it/s]

{'loss': 0.1347, 'learning_rate': 8.841463414634146e-06, 'epoch': 1.65}


 84%|████████▍ | 552/656 [01:08<00:10,  9.48it/s]

{'loss': 0.1427, 'learning_rate': 8.079268292682928e-06, 'epoch': 1.68}


 86%|████████▌ | 561/656 [01:09<00:10,  9.06it/s]

{'loss': 0.1643, 'learning_rate': 7.317073170731707e-06, 'epoch': 1.71}


 87%|████████▋ | 571/656 [01:10<00:08,  9.71it/s]

{'loss': 0.146, 'learning_rate': 6.554878048780488e-06, 'epoch': 1.74}


 89%|████████▊ | 582/656 [01:11<00:07,  9.69it/s]

{'loss': 0.1496, 'learning_rate': 5.792682926829269e-06, 'epoch': 1.77}


 90%|█████████ | 592/656 [01:12<00:06,  9.30it/s]

{'loss': 0.1378, 'learning_rate': 5.030487804878049e-06, 'epoch': 1.8}


 92%|█████████▏| 602/656 [01:13<00:05,  9.26it/s]

{'loss': 0.1155, 'learning_rate': 4.26829268292683e-06, 'epoch': 1.83}


 93%|█████████▎| 611/656 [01:15<00:05,  8.30it/s]

{'loss': 0.1488, 'learning_rate': 3.5060975609756102e-06, 'epoch': 1.86}


 95%|█████████▍| 621/656 [01:16<00:04,  8.58it/s]

{'loss': 0.1503, 'learning_rate': 2.7439024390243905e-06, 'epoch': 1.89}


 96%|█████████▌| 631/656 [01:17<00:02,  9.04it/s]

{'loss': 0.1503, 'learning_rate': 1.9817073170731707e-06, 'epoch': 1.92}


 98%|█████████▊| 641/656 [01:18<00:01,  9.95it/s]

{'loss': 0.133, 'learning_rate': 1.2195121951219514e-06, 'epoch': 1.95}


 99%|█████████▉| 651/656 [01:19<00:00,  8.50it/s]

{'loss': 0.1293, 'learning_rate': 4.573170731707317e-07, 'epoch': 1.98}


100%|█████████▉| 655/656 [01:19<00:00,  9.11it/s]***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
                                                 
100%|██████████| 656/656 [01:22<00:00,  9.11it/s]Saving model checkpoint to ./snips_tok_clf/results\checkpoint-656
Configuration saved in ./snips_tok_clf/results\checkpoint-656\config.json


{'eval_loss': 0.1553291380405426, 'eval_runtime': 2.5425, 'eval_samples_per_second': 1029.319, 'eval_steps_per_second': 32.252, 'epoch': 2.0}


Model weights saved in ./snips_tok_clf/results\checkpoint-656\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./snips_tok_clf/results\checkpoint-656 (score: 0.1553291380405426).
100%|██████████| 656/656 [01:27<00:00,  7.50it/s]

{'train_runtime': 87.5014, 'train_samples_per_second': 239.242, 'train_steps_per_second': 7.497, 'train_loss': 0.47977145942972926, 'epoch': 2.0}





TrainOutput(global_step=656, training_loss=0.47977145942972926, metrics={'train_runtime': 87.5014, 'train_samples_per_second': 239.242, 'train_steps_per_second': 7.497, 'train_loss': 0.47977145942972926, 'epoch': 2.0})

In [133]:
pipe = pipeline(
    task="token-classification", model=tok_clf_model, tokenizer=tokenizer, device=0
)
pipe([
    'Add Morph by the band Twenty One Pilot to my TOP playlist',
    'Rate A thousand splendid suns 9 out of 10 stars',
    ])

[[{'entity': 'B-entity_name',
   'score': 0.71627563,
   'index': 2,
   'word': 'mor',
   'start': 4,
   'end': 7},
  {'entity': 'B-entity_name',
   'score': 0.6723716,
   'index': 3,
   'word': '##ph',
   'start': 7,
   'end': 9},
  {'entity': 'B-entity_name',
   'score': 0.8071509,
   'index': 7,
   'word': 'twenty',
   'start': 22,
   'end': 28},
  {'entity': 'I-entity_name',
   'score': 0.85959774,
   'index': 8,
   'word': 'one',
   'start': 29,
   'end': 32},
  {'entity': 'I-entity_name',
   'score': 0.88719934,
   'index': 9,
   'word': 'pilot',
   'start': 33,
   'end': 38},
  {'entity': 'B-playlist_owner',
   'score': 0.98588145,
   'index': 11,
   'word': 'my',
   'start': 42,
   'end': 44},
  {'entity': 'B-playlist',
   'score': 0.9719445,
   'index': 12,
   'word': 'top',
   'start': 45,
   'end': 48}],
 [{'entity': 'B-object_name',
   'score': 0.9828287,
   'index': 2,
   'word': 'a',
   'start': 5,
   'end': 6},
  {'entity': 'I-object_name',
   'score': 0.97984385,
   'in

# BERT for question/answering

In [1]:
import torch

# Check if CUDA is available
def show_cuda_space_info():
    if torch.cuda.is_available():
        # Get the CUDA device name
        device = torch.device("cuda")
        print("Using device:", torch.cuda.get_device_name(device))

        # Memory allocation and caching are dynamic in PyTorch, but you can get approximate memory usage as follows
        total_memory = torch.cuda.get_device_properties(device).total_memory
        allocated_memory = torch.cuda.memory_allocated(device)
        cached_memory = torch.cuda.memory_reserved(device)
        free_memory = total_memory - (allocated_memory + cached_memory)

        print(f"Total memory: {total_memory / 1e9:.2f} GB")
        print(f"Allocated memory: {allocated_memory / 1e9:.2f} GB")
        print(f"Cached memory: {cached_memory / 1e9:.2f} GB")
        print(f"Free memory: {free_memory / 1e9:.2f} GB")
    else:
        print("CUDA is not available.")
show_cuda_space_info()

Using device: NVIDIA GeForce GTX 1660 Ti
Total memory: 6.44 GB
Allocated memory: 0.00 GB
Cached memory: 0.00 GB
Free memory: 6.44 GB


In [2]:
from transformers import BertTokenizerFast, BertForQuestionAnswering, pipeline, \
                            DataCollatorWithPadding, TrainingArguments, Trainer, \
                                AutoModelForQuestionAnswering, AutoTokenizer

from datasets import Dataset
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
import requests

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
bert_tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', return_token_type_ids=True)
qa_bert = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
qa_df = pd.read_csv('./data/qa.csv')
qa_df.head()

Unnamed: 0,question,context,start_positions,end_positions,answer
0,What sare the benifts of the blood brain barrir?,Another approach to brain function is to exami...,56,60,isolated from the bloodstream
1,What is surrounded by cerebrospinal fluid?,Another approach to brain function is to exami...,16,16,brain
2,What does the skull protect?,Another approach to brain function is to exami...,11,11,brain
3,What has been injected into rats to produce pr...,Another approach to brain function is to exami...,153,153,chemicals
4,What can cause issues with how the brain works?,Another approach to brain function is to exami...,93,94,brain damage


In [5]:
# Example:
bert_tokenizer.decode(
    bert_tokenizer.encode(
        qa_df.iloc[0].question,qa_df.iloc[0].context
    )[56:61]
)

'isolated from the bloodstream'

In [6]:
qa_dataset = Dataset.from_pandas(
    qa_df.sample(2000,random_state=100)
).train_test_split(test_size=.2)
qa_dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'context', 'start_positions', 'end_positions', 'answer', '__index_level_0__'],
        num_rows: 1600
    })
    test: Dataset({
        features: ['question', 'context', 'start_positions', 'end_positions', 'answer', '__index_level_0__'],
        num_rows: 400
    })
})

In [7]:
def preprocess(data):
    return bert_tokenizer(
        text=data['question'], text_pair=data['context'], truncation=True
    )

qa_dataset = qa_dataset.map(preprocess, batched=True)
qa_dataset

Map: 100%|██████████| 1600/1600 [00:00<00:00, 1993.95 examples/s]
Map: 100%|██████████| 400/400 [00:00<00:00, 1746.73 examples/s]


DatasetDict({
    train: Dataset({
        features: ['question', 'context', 'start_positions', 'end_positions', 'answer', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1600
    })
    test: Dataset({
        features: ['question', 'context', 'start_positions', 'end_positions', 'answer', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 400
    })
})

In [8]:
show_cuda_space_info()

Using device: NVIDIA GeForce GTX 1660 Ti
Total memory: 6.44 GB
Allocated memory: 0.00 GB
Cached memory: 0.00 GB
Free memory: 6.44 GB


In [12]:
# Freeze all but last 2 layers
for name, param in qa_bert.bert.named_parameters():
    if 'encoder.layer.7' in name:
        break
    param.requires_grad = False

In [13]:
data_collator = DataCollatorWithPadding(tokenizer=bert_tokenizer)

In [14]:
batch_size = 32
epochs = 2

training_args = TrainingArguments(
    output_dir='./qa/results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_dir='./qa/logs',
    save_strategy='epoch',
    logging_steps=10,
    evaluation_strategy='epoch',
    load_best_model_at_end=True
)

trainer = Trainer(
    model=qa_bert,
    args=training_args,
    train_dataset=qa_dataset['train'],
    eval_dataset=qa_dataset['test'],
    data_collator=data_collator
)

# Check if the model is using CUDA (GPU)
is_using_cuda = trainer.args.device.type == 'cuda'
print(f"Is using CUDA: "+'\033[92m'+str(is_using_cuda)+'\033[0m'+'\n\n\n\n')



trainer.evaluate()
show_cuda_space_info()

Is using CUDA: [92mTrue[0m






100%|██████████| 13/13 [00:12<00:00,  1.03it/s]

Using device: NVIDIA GeForce GTX 1660 Ti
Total memory: 6.44 GB
Allocated memory: 0.45 GB
Cached memory: 2.60 GB
Free memory: 3.40 GB





In [19]:
torch.cuda.empty_cache()
show_cuda_space_info()

Using device: NVIDIA GeForce GTX 1660 Ti
Total memory: 6.44 GB
Allocated memory: 5.30 GB
Cached memory: 5.50 GB
Free memory: -4.36 GB


In [20]:
# trainer.train()
# trainer.save_model()

In [47]:
PERSON = 'Yahya Sinwar'

google_html = BeautifulSoup(
    requests.get(f"https://www.google.com/search?hl=en&q={PERSON}").text
).get_text().split(f"_".join(PERSON.split()))[-1][:512]


google_html

"Yahya Sinwar also spelled Yehya Sinwar, is a Palestinian politician who has been leader of Hamas, the Sunni Islamist political and military organization\xa0...Khan Yunis refugee camp · Essam al-Da'alis · Far'aIsrael finds tunnels under vacation homes of Yahya Sinwar, other ...www.timesofisrael.com › idf-uncovers-tunnels-under-vacation-homes-of-y...1 day ago · Israeli forces announced Sunday that they had discovered tunnels under Gaza vacation homes used by senior Hamas leaders, as well as another\xa0...Gaza's big"

In [48]:
squad_pipe = pipeline(
    "question-answering", "bert-large-uncased-whole-word-masking-finetuned-squad"
)

config.json: 100%|██████████| 443/443 [00:00<00:00, 110kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
model.safetensors: 100%|██████████| 1.34G/1.34G [04:51<00:00, 4.60MB/s]
Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initiali

In [49]:
squad_pipe(f"Who's {PERSON}?", google_html)

{'score': 0.5514828562736511,
 'start': 43,
 'end': 67,
 'answer': 'a Palestinian politician'}

In [55]:
large_tokenizer = AutoTokenizer.from_pretrained(
    'bert-large-uncased-whole-word-masking-finetuned-squad'
)

qa_input = large_tokenizer(
    f"Who's {PERSON}?", google_html, return_tensors='pt'
)

In [56]:
qa_input

{'input_ids': tensor([[  101,  2040,  1005,  1055,  8038, 17915,  8254,  9028,  1029,   102,
          8038, 17915,  8254,  9028,  2036, 11479,  6300, 17915,  8254,  9028,
          1010,  2003,  1037,  9302,  3761,  2040,  2038,  2042,  3003,  1997,
         22129,  1010,  1996, 18883, 27256,  2576,  1998,  2510,  3029,  1012,
          1012,  1012,  4967, 22854,  2483, 13141,  3409,  1087,  9686, 21559,
          2632,  1011,  4830,  1005,  4862,  2015,  1087,  2521,  1005,  9932,
         21338, 21147,  4858, 10633,  2104, 10885,  5014,  1997,  8038, 17915,
          8254,  9028,  1010,  2060,  1012,  1012,  1012,  7479,  1012,  2335,
         11253,  2483, 16652,  2140,  1012,  4012,  1533, 24011,  1011, 26944,
          2015,  1011, 10633,  1011,  2104,  1011, 10885,  1011,  5014,  1011,
          1997,  1011,  1061,  1012,  1012,  1012,  1015,  2154,  3283,  1087,
          5611,  2749,  2623,  4465,  2008,  2027,  2018,  3603, 10633,  2104,
         14474, 10885,  5014,  2109,  

In [57]:
large_qa_bert =  AutoModelForQuestionAnswering.from_pretrained(
    'bert-large-uncased-whole-word-masking-finetuned-squad'
)
output = large_qa_bert(**qa_input)
output

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-4.4032, -6.5777, -5.8264, -7.7264, -7.3004, -9.1275, -9.0435, -9.4126,
         -9.2898, -4.4032,  0.5009, -4.8267, -3.6733, -6.0305, -4.6577, -5.3955,
         -0.8210, -5.2820, -5.6137, -6.2838, -4.6382,  2.5435,  6.7323,  6.2563,
          2.2676, -3.9064, -2.8994, -3.9998,  0.8833, -5.3907, -0.2169, -5.8393,
         -2.9233, -1.4878, -3.0171, -2.4509, -7.0053, -3.5163, -3.5348, -5.1425,
         -6.8108, -7.1112, -2.4771, -6.6830, -7.4937, -6.2401, -7.1022, -5.7704,
         -3.1855, -6.9959, -6.8450, -9.0561, -7.4292, -9.0235, -8.0169, -7.7524,
         -5.5358, -3.0518, -8.7132, -7.2135, -7.6325, -6.8808, -6.7930, -6.5131,
         -8.3057, -7.0755, -7.5506, -8.1282, -4.9998, -8.1438, -7.7102, -8.9431,
         -8.3114, -6.7733, -6.9179, -8.1168, -8.3249, -5.3237, -8.4998, -5.8672,
         -9.1332, -9.3890, -9.2773, -9.2029, -8.7893, -7.8259, -5.1415, -6.3822,
         -7.7428, -7.5734, -8.8970, -8.1382, -7.1561, -8