# Ray Tune Hyperparameter Search

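This section runs a hyperparameter search for a 🤗 Transformers `Trainer` with Ray Tune as the backend.

Ray Tune documentation: https://docs.ray.io/en/latest/tune/index.html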

In [None]:
!pip install "ray[tune]" transformers datasets -q
!pip list | grep "transformers\|datasets\|torch\|ray" # show versions for reproducibility

[K     |████████████████████████████████| 54.5 MB 116 kB/s 
[K     |████████████████████████████████| 4.7 MB 53.5 MB/s 
[K     |████████████████████████████████| 365 kB 68.3 MB/s 
[K     |████████████████████████████████| 101 kB 12.4 MB/s 
[K     |████████████████████████████████| 6.6 MB 48.7 MB/s 
[K     |████████████████████████████████| 596 kB 73.3 MB/s 
[K     |████████████████████████████████| 141 kB 58.1 MB/s 
[K     |████████████████████████████████| 212 kB 74.0 MB/s 
[K     |████████████████████████████████| 127 kB 74.9 MB/s 
[K     |████████████████████████████████| 8.8 MB 60.9 MB/s 
[K     |████████████████████████████████| 4.1 MB 47.8 MB/s 
[K     |████████████████████████████████| 125 kB 75.0 MB/s 
[K     |████████████████████████████████| 466 kB 75.0 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires

In [None]:
from datasets import load_dataset, load_metric
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)

# A compact checkpoint keeps every trial fast.
model_ckpt = "microsoft/xtremedistil-l6-h256-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# GLUE/MRPC: a small binary text-classification dataset, loaded together
# with the metric of the same name.
dataset = load_dataset("glue", "mrpc")
metric = load_metric("glue", "mrpc")

# This will tokenize the examples
def encode(examples, max_length=512):
    """Tokenize a batch of sentence pairs with the module-level `tokenizer`.

    Args:
        examples: mapping with 'sentence1' and 'sentence2' entries, as handed
            over by `dataset.map(..., batched=True)`.
        max_length: maximum number of tokens kept per encoded pair. The
            default of 512 matches the checkpoint's `max_position_embeddings`.

    Returns:
        The tokenizer output for the sentence pairs.
    """
    # This tokenizer has no predefined maximum length, so `truncation=True`
    # on its own falls back to "no truncation"; an explicit `max_length`
    # makes the truncation actually take effect.
    return tokenizer(
        examples['sentence1'],
        examples['sentence2'],
        truncation=True,
        max_length=max_length,
    )

# Apply `encode` to the dataset, passing it a batch of examples per call.
encoded_dataset = dataset.map(encode, batched=True)

# This function will be called at the beginning
# of each trial to create a new model
def model_init():
    """Load a new sequence-classification model from `model_ckpt`."""
    fresh_model = AutoModelForSequenceClassification.from_pretrained(
        model_ckpt,
        return_dict=True,
    )
    return fresh_model

def compute_metrics(eval_pred):
    """Score an evaluation pass with the module-level `metric`.

    `eval_pred` unpacks into (predictions, labels); the predicted class is
    the argmax over the last axis of the predictions.
    """
    scores, references = eval_pred
    predicted_classes = scores.argmax(axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

# Evaluate during training, and more often than the 500-step default, so that
# intermediate results are reported and bad trials can be pruned early. On this
# small dataset many sampled configurations train for fewer than 500 steps and
# would otherwise only be evaluated once, at the very end.
# Disabling tqdm is a matter of preference.
training_args = TrainingArguments(
    output_dir="ray-hp-search",
    evaluation_strategy="steps",
    eval_steps=50,
    disable_tqdm=True,
)

# `model_init` (rather than a fixed `model`) lets the Trainer build a new
# model for every trial of the hyperparameter search.
trainer = Trainer(
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    model_init=model_init,
    compute_metrics=compute_metrics,
)

def raytune_objective(metrics):
    """Pick the value to optimize out of the evaluation metrics.

    When this function is not handed to `hyperparameter_search`, the search
    falls back to maximizing the sum of all metrics instead.
    """
    accuracy = metrics["eval_accuracy"]
    return accuracy

def raytune_hp_space(trial):
    """Describe the hyperparameter search space with Ray Tune distributions.

    `trial` is accepted for the caller's sake but is not used here.
    """
    from ray import tune

    space = {}
    space["learning_rate"] = tune.loguniform(1e-4, 1e-2)
    space["num_train_epochs"] = tune.randint(lower=1, upper=6)
    space["per_device_train_batch_size"] = tune.choice([4, 8, 16, 32, 64])
    space["weight_decay"] = tune.choice([0.0, 0.01, 0.05, 0.009])
    return space

# No compute_objective is supplied, so each trial is scored with the default
# objective (the sum of all reported metrics) — hence direction="maximize".
best_trial = trainer.hyperparameter_search(
    hp_space=raytune_hp_space,
    backend="ray",
    direction="maximize",
    n_trials=10,  # number of trials
    # compute_objective=raytune_objective,  # enable to optimize eval_accuracy only
    resources_per_trial={"cpu": 1, "gpu": 1},  # can utilize multiple GPUs if available
    # Any additional kwargs are passed on to ray.tune.run
    # (https://docs.ray.io/en/latest/tune/api_docs/execution.html)
)

Downloading config.json:   0%|          | 0.00/525 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.78k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading and preparing dataset glue/mrpc (download: 1.43 MiB, generated: 1.43 MiB, post-processed: Unknown size, total: 2.85 MiB) to /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e545113a9b5e643083a51ba645bed76c8df28ab46010b1ae39650ce7c668fe0c.142acdac8e7631616b334d09a95700d181a33a84fd7eb9ab46593634d0873dda
Model config BertConfig {
  "_name_or_path": "microsoft/xtremedistil-l6-h256-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

https://huggingface.co/microso

Downloading pytorch_model.bin:   0%|          | 0.00/48.7M [00:00<?, ?B/s]

storing https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/5b9e9aa22907609566b731f4ac500e12a8f652024928a0d35bd8cb4de48d04dc.182b458c89816670d9a6a9910ed233b7a10984e821dac6b9a056a9512b4e1ad0
creating metadata file for /root/.cache/huggingface/transformers/5b9e9aa22907609566b731f4ac500e12a8f652024928a0d35bd8cb4de48d04dc.182b458c89816670d9a6a9910ed233b7a10984e821dac6b9a056a9512b4e1ad0
loading weights file https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/5b9e9aa22907609566b731f4ac500e12a8f652024928a0d35bd8cb4de48d04dc.182b458c89816670d9a6a9910ed233b7a10984e821dac6b9a056a9512b4e1ad0
All model checkpoint weights were used when initializing BertForSequenceClassification.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and 

== Status ==
Current time: 2022-07-28 22:02:52 (running for 00:00:00.22)
Memory usage on this node: 2.6/12.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.34 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-28_22-02-51
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+------------------------+----------+----------------+-----------------+--------------------+-------------------------------+----------------+
| Trial name             | status   | loc            |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |
|------------------------+----------+----------------+-----------------+--------------------+-------------------------------+----------------|
| _objective_067ad_00000 | RUNNING  | 172.28.0.2:367 |     0.000561152 |                  5 |                            16 |          0.009 |
| _objective_067ad_00001 | PENDING  |                |

[2m[36m(_objective pid=367)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[2m[36m(_objective pid=367)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-28 22:03:01 (running for 00:00:09.51)
Memory usage on this node: 3.9/12.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.34 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-28_22-02-51
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+------------------------+----------+----------------+-----------------+--------------------+-------------------------------+----------------+
| Trial name             | status   | loc            |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |
|------------------------+----------+----------------+-----------------+--------------------+-------------------------------+----------------|
| _objective_067ad_00000 | RUNNING  | 172.28.0.2:367 |     0.000561152 |                  5 |                            16 |          0.009 |
| _objective_067ad_00001 | PENDING  |                |

[2m[36m(_objective pid=430)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[2m[36m(_objective pid=430)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-28 22:04:13 (running for 00:01:21.57)
Memory usage on this node: 3.9/12.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.34 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-28_22-02-51
Number of trials: 10/10 (8 PENDING, 1 RUNNING, 1 TERMINATED)
+------------------------+------------+----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name             | status     | loc            |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|------------------------+------------+----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_067ad_00001 | RUNNING    | 172.28.0.2:430 |     0.00157513  |                  2 |                            16 |        

[2m[36m(_objective pid=482)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[2m[36m(_objective pid=482)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-28 22:04:45 (running for 00:01:53.57)
Memory usage on this node: 3.9/12.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.34 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-28_22-02-51
Number of trials: 10/10 (7 PENDING, 1 RUNNING, 2 TERMINATED)
+------------------------+------------+----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name             | status     | loc            |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|------------------------+------------+----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_067ad_00002 | RUNNING    | 172.28.0.2:482 |     0.000130667 |                  5 |                            32 |        

[2m[36m(_objective pid=544)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
[2m[36m(_objective pid=544)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-28 22:05:54 (running for 00:03:02.58)
Memory usage on this node: 3.8/12.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.34 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-28_22-02-51
Number of trials: 10/10 (6 PENDING, 1 RUNNING, 3 TERMINATED)
+------------------------+------------+----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name             | status     | loc            |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|------------------------+------------+----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_067ad_00003 | RUNNING    | 172.28.0.2:544 |     0.00260702  |                  5 |                             8 |        

[2m[36m(_objective pid=609)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[2m[36m(_objective pid=609)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-28 22:07:04 (running for 00:04:12.60)
Memory usage on this node: 3.9/12.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.34 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-28_22-02-51
Number of trials: 10/10 (5 PENDING, 1 RUNNING, 4 TERMINATED)
+------------------------+------------+----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name             | status     | loc            |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|------------------------+------------+----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_067ad_00004 | RUNNING    | 172.28.0.2:609 |     0.00462259  |                  2 |                            32 |        

[2m[36m(_objective pid=659)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[2m[36m(_objective pid=659)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-28 22:07:36 (running for 00:04:44.61)
Memory usage on this node: 3.8/12.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.34 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-28_22-02-51
Number of trials: 10/10 (4 PENDING, 1 RUNNING, 5 TERMINATED)
+------------------------+------------+----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name             | status     | loc            |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|------------------------+------------+----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_067ad_00005 | RUNNING    | 172.28.0.2:659 |     0.00171776  |                  2 |                            64 |        

[2m[36m(_objective pid=708)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[2m[36m(_objective pid=708)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-28 22:08:08 (running for 00:05:16.61)
Memory usage on this node: 3.8/12.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.34 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-28_22-02-51
Number of trials: 10/10 (3 PENDING, 1 RUNNING, 6 TERMINATED)
+------------------------+------------+----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name             | status     | loc            |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|------------------------+------------+----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_067ad_00006 | RUNNING    | 172.28.0.2:708 |     0.000111205 |                  3 |                            16 |        

[2m[36m(_objective pid=763)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[2m[36m(_objective pid=763)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-28 22:08:53 (running for 00:06:01.62)
Memory usage on this node: 3.8/12.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.34 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-28_22-02-51
Number of trials: 10/10 (2 PENDING, 1 RUNNING, 7 TERMINATED)
+------------------------+------------+----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name             | status     | loc            |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|------------------------+------------+----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_067ad_00007 | RUNNING    | 172.28.0.2:763 |     0.000190102 |                  4 |                            32 |        

[2m[36m(_objective pid=822)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
[2m[36m(_objective pid=822)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-28 22:09:50 (running for 00:06:58.62)
Memory usage on this node: 3.9/12.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.34 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-28_22-02-51
Number of trials: 10/10 (1 PENDING, 1 RUNNING, 8 TERMINATED)
+------------------------+------------+----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name             | status     | loc            |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|------------------------+------------+----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_067ad_00008 | RUNNING    | 172.28.0.2:822 |     0.000292107 |                  3 |                            32 |        

[2m[36m(_objective pid=876)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
[2m[36m(_objective pid=876)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-28 22:10:35 (running for 00:07:43.10)
Memory usage on this node: 3.9/12.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.34 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-28_22-02-51
Number of trials: 10/10 (1 RUNNING, 9 TERMINATED)
+------------------------+------------+----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name             | status     | loc            |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|------------------------+------------+----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_067ad_00009 | RUNNING    | 172.28.0.2:876 |     0.00925682  |                  1 |                            16 |          0     |  

2022-07-28 22:10:45,463	INFO tune.py:748 -- Total run time: 473.67 seconds (473.41 seconds for the tuning loop).


Result for _objective_067ad_00009:
  date: 2022-07-28_22-10-45
  done: true
  epoch: 1.0
  eval_accuracy: 0.6838235294117647
  eval_f1: 0.8122270742358079
  eval_loss: 0.6276770234107971
  eval_runtime: 0.3749
  eval_samples_per_second: 1088.374
  eval_steps_per_second: 136.047
  experiment_id: 3ab4b8d535c040619fe7bbd16ae02b25
  hostname: f2cea957d83c
  iterations_since_restore: 1
  node_ip: 172.28.0.2
  objective: 1.4960506036475727
  pid: 876
  time_since_restore: 15.309271097183228
  time_this_iter_s: 15.309271097183228
  time_total_s: 15.309271097183228
  timestamp: 1659046245
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 067ad_00009
  warmup_time: 0.0037779808044433594
  
== Status ==
Current time: 2022-07-28 22:10:45 (running for 00:07:53.43)
Memory usage on this node: 3.7/12.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.34 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_ob

In [None]:
# Show the best run found by the search: its run id, objective and hyperparameters.
best_trial

BestRun(run_id='067ad_00002', objective=1.8217900063251107, hyperparameters={'learning_rate': 0.000130667392380533, 'num_train_epochs': 5, 'per_device_train_batch_size': 32, 'weight_decay': 0.009})

# Ray with Hyperopt

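This section repeats the search with Hyperopt as Ray Tune's search algorithm, combined with an ASHA scheduler.

Hyperopt documentation: http://hyperopt.github.io/hyperopt/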

In [None]:
!pip install -U hyperopt -q 
!pip list | grep "transformers\|datasets\|torch\|hyperopt"

[K     |████████████████████████████████| 1.6 MB 6.9 MB/s 
[K     |████████████████████████████████| 199 kB 46.8 MB/s 
[?25hdatasets                      2.4.0
hyperopt                      0.2.7
tensorflow-datasets           4.0.1
torch                         1.12.0+cu113
torchaudio                    0.12.0+cu113
torchsummary                  1.5.1
torchtext                     0.13.0
torchvision                   0.13.0+cu113
transformers                  4.21.0
vega-datasets                 0.9.0


In [None]:
import math

from hyperopt import hp
from ray.tune.schedulers import ASHAScheduler
from ray.tune.suggest.hyperopt import HyperOptSearch

# Search space written with hyperopt's own primitives.
hp_space = {
    # hp.loguniform draws exp(uniform(low, high)), i.e. its bounds are given in
    # log space. Passing math.log(...) yields learning rates in [1e-4, 1e-2];
    # the raw values 1e-4 and 1e-2 would yield rates of roughly 1.0001 to 1.01.
    "learning_rate": hp.loguniform("learning_rate", math.log(1e-4), math.log(1e-2)),
    # hp.uniformint takes (low, high) and includes both ends, so 1..5 matches
    # the Ray Tune section's randint(lower=1, upper=6), whose upper bound is
    # exclusive. `lower`/`upper` are Ray's keyword names, not hyperopt's.
    "num_train_epochs": hp.uniformint("num_train_epochs", 1, 5),
    "per_device_train_batch_size": hp.choice("per_device_train_batch_size", [4, 8, 16, 32, 64]),
    "weight_decay": hp.choice("weight_decay", [0.0, 0.01, 0.05, 0.009]),
}

# NOTE(review): no hp_space is passed below, so the Trainer presumably falls
# back to its default Ray search space for `config` — confirm this does not
# clash with the space already given to HyperOptSearch.
best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="ray",
    # hp_space=raytune_hp_space,
    # Choose among many libraries:
    # https://docs.ray.io/en/latest/tune/api_docs/suggestion.html
    search_alg=HyperOptSearch(metric="objective", mode="max", space=hp_space),
    # Choose among schedulers:
    # https://docs.ray.io/en/latest/tune/api_docs/schedulers.html
    scheduler=ASHAScheduler(metric="objective", mode="max"),
)


loading configuration file https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e545113a9b5e643083a51ba645bed76c8df28ab46010b1ae39650ce7c668fe0c.142acdac8e7631616b334d09a95700d181a33a84fd7eb9ab46593634d0873dda
Model config BertConfig {
  "_name_or_path": "microsoft/xtremedistil-l6-h256-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://h

== Status ==
Current time: 2022-07-27 19:32:39 (running for 00:00:00.28)
Memory usage on this node: 2.9/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 64.000: None | Iter 16.000: None | Iter 4.000: None | Iter 1.000: None
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.36 GiB heap, 0.0/3.68 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-27_19-32-38
Number of trials: 1/20 (1 RUNNING)
+---------------------+----------+-----------------+-----------------+--------------------+-------------------------------+----------------+
| Trial name          | status   | loc             |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |
|---------------------+----------+-----------------+-----------------+--------------------+-------------------------------+----------------|
| _objective_e02bb3a2 | RUNNING  | 172.28.0.2:1836 |       0.0030653 |                  3 |                             8 |    

[2m[36m(_objective pid=1836)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
[2m[36m(_objective pid=1836)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-27 19:32:49 (running for 00:00:10.32)
Memory usage on this node: 4.2/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 64.000: None | Iter 16.000: None | Iter 4.000: None | Iter 1.000: None
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.36 GiB heap, 0.0/3.68 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-27_19-32-38
Number of trials: 2/20 (1 PENDING, 1 RUNNING)
+---------------------+----------+-----------------+-----------------+--------------------+-------------------------------+----------------+
| Trial name          | status   | loc             |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |
|---------------------+----------+-----------------+-----------------+--------------------+-------------------------------+----------------|
| _objective_e02bb3a2 | RUNNING  | 172.28.0.2:1836 |     0.0030653   |                  3 |                         

[2m[36m(_objective pid=1907)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
[2m[36m(_objective pid=1907)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-27 19:33:54 (running for 00:01:15.16)
Memory usage on this node: 4.0/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 64.000: None | Iter 16.000: None | Iter 4.000: None | Iter 1.000: 1.4960506036475727
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.36 GiB heap, 0.0/3.68 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-27_19-32-38
Number of trials: 3/20 (1 PENDING, 1 RUNNING, 1 TERMINATED)
+---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name          | status     | loc             |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_e347d7e6 | RUNNING    | 172

[2m[36m(_objective pid=1971)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[2m[36m(_objective pid=1971)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-27 19:34:41 (running for 00:02:02.08)
Memory usage on this node: 4.0/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 64.000: None | Iter 16.000: None | Iter 4.000: None | Iter 1.000: 1.4960506036475727
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.36 GiB heap, 0.0/3.68 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-27_19-32-38
Number of trials: 4/20 (1 PENDING, 1 RUNNING, 2 TERMINATED)
+---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name          | status     | loc             |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_06e22aa8 | RUNNING    | 172

[2m[36m(_objective pid=2022)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
[2m[36m(_objective pid=2022)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-27 19:35:08 (running for 00:02:29.08)
Memory usage on this node: 4.0/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 64.000: None | Iter 16.000: None | Iter 4.000: None | Iter 1.000: 1.4960506036475727
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.36 GiB heap, 0.0/3.68 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-27_19-32-38
Number of trials: 5/20 (1 PENDING, 1 RUNNING, 3 TERMINATED)
+---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name          | status     | loc             |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_22e7903a | RUNNING    | 172

[2m[36m(_objective pid=2103)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
[2m[36m(_objective pid=2103)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-27 19:36:44 (running for 00:04:05.19)
Memory usage on this node: 4.1/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 64.000: None | Iter 16.000: None | Iter 4.000: None | Iter 1.000: 1.4960506036475727
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.36 GiB heap, 0.0/3.68 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-27_19-32-38
Number of trials: 6/20 (1 PENDING, 1 RUNNING, 4 TERMINATED)
+---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name          | status     | loc             |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_32ff4efe | RUNNING    | 172

[2m[36m(_objective pid=2182)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
[2m[36m(_objective pid=2182)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-27 19:38:34 (running for 00:05:54.86)
Memory usage on this node: 4.0/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 64.000: None | Iter 16.000: None | Iter 4.000: 1.7984368572603866 | Iter 1.000: 1.4960506036475727
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.36 GiB heap, 0.0/3.68 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-27_19-32-38
Number of trials: 7/20 (1 PENDING, 1 RUNNING, 5 TERMINATED)
+---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name          | status     | loc             |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_6c3c96fe | RU

[2m[36m(_objective pid=2244)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
[2m[36m(_objective pid=2244)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-27 19:39:40 (running for 00:07:01.15)
Memory usage on this node: 3.9/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 64.000: None | Iter 16.000: None | Iter 4.000: 1.7984368572603866 | Iter 1.000: 1.4960506036475727
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.36 GiB heap, 0.0/3.68 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-27_19-32-38
Number of trials: 8/20 (1 PENDING, 1 RUNNING, 6 TERMINATED)
+---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name          | status     | loc             |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_ad371a80 | RU

[2m[36m(_objective pid=2292)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
[2m[36m(_objective pid=2292)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-27 19:40:08 (running for 00:07:29.22)
Memory usage on this node: 4.0/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 64.000: None | Iter 16.000: None | Iter 4.000: 1.7984368572603866 | Iter 1.000: 1.6249468535770177
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.36 GiB heap, 0.0/3.68 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-27_19-32-38
Number of trials: 9/20 (1 PENDING, 1 RUNNING, 7 TERMINATED)
+---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name          | status     | loc             |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_d528b94a | RU

[2m[36m(_objective pid=2339)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
[2m[36m(_objective pid=2339)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-27 19:40:35 (running for 00:07:56.14)
Memory usage on this node: 4.0/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 64.000: None | Iter 16.000: None | Iter 4.000: 1.7984368572603866 | Iter 1.000: 1.5604987286122953
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.36 GiB heap, 0.0/3.68 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-27_19-32-38
Number of trials: 10/20 (1 PENDING, 1 RUNNING, 8 TERMINATED)
+---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name          | status     | loc             |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_e5d9e606 | R

[2m[36m(_objective pid=2387)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
[2m[36m(_objective pid=2387)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-27 19:41:03 (running for 00:08:24.25)
Memory usage on this node: 4.0/12.7 GiB
Using AsyncHyperBand: num_stopped=1
Bracket: Iter 64.000: None | Iter 16.000: None | Iter 4.000: 1.7984368572603866 | Iter 1.000: 1.4960506036475727
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.36 GiB heap, 0.0/3.68 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-27_19-32-38
Number of trials: 11/20 (1 PENDING, 1 RUNNING, 9 TERMINATED)
+---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name          | status     | loc             |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_f5f1ce3c | R

[2m[36m(_objective pid=2434)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[2m[36m(_objective pid=2434)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-27 19:41:31 (running for 00:08:52.19)
Memory usage on this node: 4.0/12.7 GiB
Using AsyncHyperBand: num_stopped=1
Bracket: Iter 64.000: None | Iter 16.000: None | Iter 4.000: 1.7984368572603866 | Iter 1.000: 1.4960506036475727
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.36 GiB heap, 0.0/3.68 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-27_19-32-38
Number of trials: 12/20 (1 PENDING, 1 RUNNING, 10 TERMINATED)
+---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name          | status     | loc             |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_06a2e0e0 | 

[2m[36m(_objective pid=2503)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[2m[36m(_objective pid=2503)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-27 19:43:04 (running for 00:10:25.22)
Memory usage on this node: 4.0/12.7 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 64.000: None | Iter 16.000: None | Iter 4.000: 1.7228402938571832 | Iter 1.000: 1.4960506036475727
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.36 GiB heap, 0.0/3.68 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-27_19-32-38
Number of trials: 13/20 (1 PENDING, 1 RUNNING, 11 TERMINATED)
+---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name          | status     | loc             |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_1753f352 | 

[2m[36m(_objective pid=2579)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
[2m[36m(_objective pid=2579)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-27 19:44:48 (running for 00:12:09.37)
Memory usage on this node: 4.0/12.7 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 64.000: None | Iter 16.000: None | Iter 4.000: 1.765661920159184 | Iter 1.000: 1.5450903348455483
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.36 GiB heap, 0.0/3.68 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-27_19-32-38
Number of trials: 14/20 (1 PENDING, 1 RUNNING, 12 TERMINATED)
+---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name          | status     | loc             |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_4ec597c8 | R

[2m[36m(_objective pid=2626)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
[2m[36m(_objective pid=2626)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-27 19:45:17 (running for 00:12:38.28)
Memory usage on this node: 4.0/12.7 GiB
Using AsyncHyperBand: num_stopped=3
Bracket: Iter 64.000: None | Iter 16.000: None | Iter 4.000: 1.765661920159184 | Iter 1.000: 1.4960506036475727
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.36 GiB heap, 0.0/3.68 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-27_19-32-38
Number of trials: 15/20 (1 PENDING, 1 RUNNING, 13 TERMINATED)
+---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name          | status     | loc             |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_8cc7ff66 | R

[2m[36m(_objective pid=2673)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[2m[36m(_objective pid=2673)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-27 19:45:46 (running for 00:13:07.23)
Memory usage on this node: 4.0/12.7 GiB
Using AsyncHyperBand: num_stopped=3
Bracket: Iter 64.000: None | Iter 16.000: None | Iter 4.000: 1.765661920159184 | Iter 1.000: 1.6431697972414994
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.36 GiB heap, 0.0/3.68 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-27_19-32-38
Number of trials: 16/20 (1 PENDING, 1 RUNNING, 14 TERMINATED)
+---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name          | status     | loc             |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_9e151236 | R

[2m[36m(_objective pid=2736)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
[2m[36m(_objective pid=2736)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-27 19:47:10 (running for 00:14:31.39)
Memory usage on this node: 4.0/12.7 GiB
Using AsyncHyperBand: num_stopped=3
Bracket: Iter 64.000: None | Iter 16.000: None | Iter 4.000: 1.765661920159184 | Iter 1.000: 1.7230263159729688
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.36 GiB heap, 0.0/3.68 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-27_19-32-38
Number of trials: 17/20 (1 PENDING, 1 RUNNING, 15 TERMINATED)
+---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name          | status     | loc             |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_af5acbee | R

[2m[36m(_objective pid=2784)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
[2m[36m(_objective pid=2784)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-27 19:47:40 (running for 00:15:01.29)
Memory usage on this node: 4.0/12.7 GiB
Using AsyncHyperBand: num_stopped=4
Bracket: Iter 64.000: None | Iter 16.000: None | Iter 4.000: 1.765661920159184 | Iter 1.000: 1.707617922206222
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.36 GiB heap, 0.0/3.68 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-27_19-32-38
Number of trials: 18/20 (1 PENDING, 1 RUNNING, 16 TERMINATED)
+---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name          | status     | loc             |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_e16fa8e8 | RU

[2m[36m(_objective pid=2832)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[2m[36m(_objective pid=2832)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-27 19:48:09 (running for 00:15:30.28)
Memory usage on this node: 4.0/12.7 GiB
Using AsyncHyperBand: num_stopped=5
Bracket: Iter 64.000: None | Iter 16.000: None | Iter 4.000: 1.765661920159184 | Iter 1.000: 1.692209528439475
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.36 GiB heap, 0.0/3.68 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-27_19-32-38
Number of trials: 19/20 (1 PENDING, 1 RUNNING, 17 TERMINATED)
+---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name          | status     | loc             |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_f352e106 | RU

[2m[36m(_objective pid=2883)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[2m[36m(_objective pid=2883)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-27 19:48:38 (running for 00:15:59.35)
Memory usage on this node: 4.0/12.7 GiB
Using AsyncHyperBand: num_stopped=6
Bracket: Iter 64.000: None | Iter 16.000: None | Iter 4.000: 1.765661920159184 | Iter 1.000: 1.6431697972414994
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.36 GiB heap, 0.0/3.68 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-27_19-32-38
Number of trials: 20/20 (1 PENDING, 1 RUNNING, 18 TERMINATED)
+---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name          | status     | loc             |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_049cdb06 | R

[2m[36m(_objective pid=2933)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[2m[36m(_objective pid=2933)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


== Status ==
Current time: 2022-07-27 19:49:08 (running for 00:16:29.41)
Memory usage on this node: 4.0/12.7 GiB
Using AsyncHyperBand: num_stopped=7
Bracket: Iter 64.000: None | Iter 16.000: None | Iter 4.000: 1.765661920159184 | Iter 1.000: 1.5941300660435238
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.36 GiB heap, 0.0/3.68 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/_objective_2022-07-27_19-32-38
Number of trials: 20/20 (1 RUNNING, 19 TERMINATED)
+---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------+
| Trial name          | status     | loc             |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |   objective |
|---------------------+------------+-----------------+-----------------+--------------------+-------------------------------+----------------+-------------|
| _objective_15e635ce | RUNNING    |

2022-07-27 19:49:27,733	INFO tune.py:748 -- Total run time: 1008.84 seconds (1008.38 seconds for the tuning loop).


Result for _objective_15e635ce:
  date: 2022-07-27_19-49-27
  done: true
  epoch: 1.09
  eval_accuracy: 0.6838235294117647
  eval_f1: 0.8122270742358079
  eval_loss: 0.626849889755249
  eval_runtime: 0.5448
  eval_samples_per_second: 748.883
  eval_steps_per_second: 93.61
  experiment_id: 08f80bd08feb4e958a148ec218993d97
  hostname: 2e9b942bd43a
  iterations_since_restore: 1
  node_ip: 172.28.0.2
  objective: 1.4960506036475727
  pid: 2933
  time_since_restore: 23.955030918121338
  time_this_iter_s: 23.955030918121338
  time_total_s: 23.955030918121338
  timestamp: 1658951367
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 15e635ce
  warmup_time: 0.0037593841552734375
  
[2m[36m(_objective pid=2933)[0m {'eval_loss': 0.626849889755249, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'eval_runtime': 0.5448, 'eval_samples_per_second': 748.883, 'eval_steps_per_second': 93.61, 'epoch': 1.09}
== Status ==
Current time: 2022-07-27 19:49:27 (running for 

In [None]:
# Display the BestRun returned by the Hyperopt-driven search above
# (run id, objective value and the winning hyperparameters).
best_trial

BestRun(run_id='9e151236', objective=1.8322743731638997, hyperparameters={'learning_rate': 0.00010552453166133928, 'num_train_epochs': 4, 'per_device_train_batch_size': 32, 'weight_decay': 0.009})

# Optuna

You should restart the runtime after installing Optuna if you already ran the Ray hyperparameter search.

In [None]:
# Quietly install Optuna, then list the installed versions of the key
# packages so the run can be reproduced.
!pip install optuna -q 
!pip list | grep "transformers\|datasets\|torch\|optuna"

[K     |████████████████████████████████| 308 kB 8.2 MB/s 
[K     |████████████████████████████████| 81 kB 7.6 MB/s 
[K     |████████████████████████████████| 209 kB 46.6 MB/s 
[K     |████████████████████████████████| 78 kB 6.6 MB/s 
[K     |████████████████████████████████| 112 kB 53.1 MB/s 
[K     |████████████████████████████████| 147 kB 55.9 MB/s 
[K     |████████████████████████████████| 49 kB 5.5 MB/s 
[?25h  Building wheel for pyperclip (setup.py) ... [?25l[?25hdone
datasets                      2.4.0
optuna                        2.10.1
tensorflow-datasets           4.0.1
torch                         1.12.0+cu113
torchaudio                    0.12.0+cu113
torchsummary                  1.5.1
torchtext                     0.13.0
torchvision                   0.13.0+cu113
transformers                  4.21.0
vega-datasets                 0.9.0


In [None]:
def optuna_hp_space(trial):
    """Sample one hyperparameter configuration from an Optuna trial.

    Args:
        trial: Object exposing the ``suggest_float``, ``suggest_int`` and
            ``suggest_categorical`` methods (an Optuna trial).

    Returns:
        dict: The sampled values keyed by ``learning_rate``,
        ``num_train_epochs`` and ``per_device_train_batch_size``.
    """
    # Learning rate is searched on a log scale between 1e-6 and 7e-5.
    sampled_lr = trial.suggest_float("learning_rate", 1e-6, 7e-5, log=True)

    # Integer number of training epochs, bounded by 1 and 5.
    sampled_epochs = trial.suggest_int("num_train_epochs", 1, 5)

    # Train batch size is chosen from a fixed list of candidates.
    batch_size_choices = [4, 8, 16, 32, 64]
    sampled_batch_size = trial.suggest_categorical(
        "per_device_train_batch_size", batch_size_choices
    )

    return {
        "learning_rate": sampled_lr,
        "num_train_epochs": sampled_epochs,
        "per_device_train_batch_size": sampled_batch_size,
    }

def optuna_objective(metrics):
    """Objective for the search: the evaluation F1 score on its own.

    Pass this function as ``compute_objective`` to
    ``trainer.hyperparameter_search`` when only F1 should be optimized.
    Without it, the default objective is the sum of the metrics
    (accuracy + F1 in this case).

    Args:
        metrics: Mapping of evaluation metrics; must contain ``"eval_f1"``.

    Returns:
        The value stored under ``"eval_f1"``.
    """
    f1_value = metrics["eval_f1"]
    return f1_value

# Same trainer as above, now searching with the Optuna backend.
# Each trial is scored by optuna_objective, and higher scores are better.
best_trial = trainer.hyperparameter_search(
    hp_space=optuna_hp_space,
    compute_objective=optuna_objective,
    backend="optuna",
    direction="maximize",
)

[32m[I 2022-07-27 20:09:08,277][0m A new study created in memory with name: no-name-1012c61e-f265-47cf-bab6-0aac1e8f9005[0m
Trial:
loading configuration file https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e545113a9b5e643083a51ba645bed76c8df28ab46010b1ae39650ce7c668fe0c.142acdac8e7631616b334d09a95700d181a33a84fd7eb9ab46593634d0873dda
Model config BertConfig {
  "_name_or_path": "microsoft/xtremedistil-l6-h256-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  

{'train_runtime': 18.7338, 'train_samples_per_second': 195.796, 'train_steps_per_second': 24.501, 'train_loss': 0.5082604568248741, 'epoch': 1.0}


[32m[I 2022-07-27 20:09:28,790][0m Trial 0 finished with value: 0.883248730964467 and parameters: {'learning_rate': 3.22581283235675e-05, 'num_train_epochs': 1, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 0.883248730964467.[0m
Trial:


{'eval_loss': 0.43409672379493713, 'eval_accuracy': 0.8308823529411765, 'eval_f1': 0.883248730964467, 'eval_runtime': 0.5985, 'eval_samples_per_second': 681.755, 'eval_steps_per_second': 85.219, 'epoch': 1.0}


loading configuration file https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e545113a9b5e643083a51ba645bed76c8df28ab46010b1ae39650ce7c668fe0c.142acdac8e7631616b334d09a95700d181a33a84fd7eb9ab46593634d0873dda
Model config BertConfig {
  "_name_or_path": "microsoft/xtremedistil-l6-h256-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://h

{'loss': 0.5109, 'learning_rate': 8.642403882308289e-06, 'epoch': 1.09}


Saving model checkpoint to ray-hp-search/run-1/checkpoint-500
Configuration saved in ray-hp-search/run-1/checkpoint-500/config.json
Model weights saved in ray-hp-search/run-1/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ray-hp-search/run-1/checkpoint-500/tokenizer_config.json
Special tokens file saved in ray-hp-search/run-1/checkpoint-500/special_tokens_map.json


{'eval_loss': 0.43625786900520325, 'eval_accuracy': 0.8284313725490197, 'eval_f1': 0.8801369863013697, 'eval_runtime': 0.5821, 'eval_samples_per_second': 700.959, 'eval_steps_per_second': 87.62, 'epoch': 1.09}




Training completed. Do not forget to share your model on huggingface.co/models =)


[32m[I 2022-07-27 20:10:07,430][0m Trial 1 finished with value: 0.8801369863013697 and parameters: {'learning_rate': 1.8980207569279925e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 4}. Best is trial 0 with value: 0.883248730964467.[0m
Trial:


{'train_runtime': 37.8053, 'train_samples_per_second': 194.047, 'train_steps_per_second': 24.282, 'train_loss': 0.4752014924757881, 'epoch': 2.0}


loading configuration file https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e545113a9b5e643083a51ba645bed76c8df28ab46010b1ae39650ce7c668fe0c.142acdac8e7631616b334d09a95700d181a33a84fd7eb9ab46593634d0873dda
Model config BertConfig {
  "_name_or_path": "microsoft/xtremedistil-l6-h256-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://h

{'loss': 0.5581, 'learning_rate': 5.919860367501352e-06, 'epoch': 1.09}


Saving model checkpoint to ray-hp-search/run-2/checkpoint-500
Configuration saved in ray-hp-search/run-2/checkpoint-500/config.json
Model weights saved in ray-hp-search/run-2/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ray-hp-search/run-2/checkpoint-500/tokenizer_config.json
Special tokens file saved in ray-hp-search/run-2/checkpoint-500/special_tokens_map.json


{'eval_loss': 0.4747830927371979, 'eval_accuracy': 0.821078431372549, 'eval_f1': 0.8773109243697478, 'eval_runtime': 0.5952, 'eval_samples_per_second': 685.428, 'eval_steps_per_second': 85.678, 'epoch': 1.09}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence2, idx, sentence1. If sentence2, idx, sentence1 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 8


{'loss': 0.4666, 'learning_rate': 3.7043437628975525e-06, 'epoch': 2.18}


Saving model checkpoint to ray-hp-search/run-2/checkpoint-1000
Configuration saved in ray-hp-search/run-2/checkpoint-1000/config.json
Model weights saved in ray-hp-search/run-2/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ray-hp-search/run-2/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ray-hp-search/run-2/checkpoint-1000/special_tokens_map.json


{'eval_loss': 0.4429112672805786, 'eval_accuracy': 0.8259803921568627, 'eval_f1': 0.8777969018932873, 'eval_runtime': 0.592, 'eval_samples_per_second': 689.216, 'eval_steps_per_second': 86.152, 'epoch': 2.18}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence2, idx, sentence1. If sentence2, idx, sentence1 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 8


{'loss': 0.4325, 'learning_rate': 1.488827158293753e-06, 'epoch': 3.27}


Saving model checkpoint to ray-hp-search/run-2/checkpoint-1500
Configuration saved in ray-hp-search/run-2/checkpoint-1500/config.json
Model weights saved in ray-hp-search/run-2/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ray-hp-search/run-2/checkpoint-1500/tokenizer_config.json
Special tokens file saved in ray-hp-search/run-2/checkpoint-1500/special_tokens_map.json


{'eval_loss': 0.43435677886009216, 'eval_accuracy': 0.8406862745098039, 'eval_f1': 0.8892674616695059, 'eval_runtime': 0.5744, 'eval_samples_per_second': 710.285, 'eval_steps_per_second': 88.786, 'epoch': 3.27}




Training completed. Do not forget to share your model on huggingface.co/models =)


[32m[I 2022-07-27 20:11:25,988][0m Trial 2 finished with value: 0.8892674616695059 and parameters: {'learning_rate': 8.13537697210515e-06, 'num_train_epochs': 4, 'per_device_train_batch_size': 32}. Best is trial 2 with value: 0.8892674616695059.[0m
Trial:


{'train_runtime': 77.6229, 'train_samples_per_second': 189.016, 'train_steps_per_second': 23.653, 'train_loss': 0.4743451386495353, 'epoch': 4.0}


loading configuration file https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e545113a9b5e643083a51ba645bed76c8df28ab46010b1ae39650ce7c668fe0c.142acdac8e7631616b334d09a95700d181a33a84fd7eb9ab46593634d0873dda
Model config BertConfig {
  "_name_or_path": "microsoft/xtremedistil-l6-h256-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://h

{'loss': 0.4914, 'learning_rate': 1.8198506792568653e-05, 'epoch': 1.09}


Saving model checkpoint to ray-hp-search/run-3/checkpoint-500
Configuration saved in ray-hp-search/run-3/checkpoint-500/config.json
Model weights saved in ray-hp-search/run-3/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ray-hp-search/run-3/checkpoint-500/tokenizer_config.json
Special tokens file saved in ray-hp-search/run-3/checkpoint-500/special_tokens_map.json


{'eval_loss': 0.4127797782421112, 'eval_accuracy': 0.8333333333333334, 'eval_f1': 0.8823529411764706, 'eval_runtime': 0.5995, 'eval_samples_per_second': 680.612, 'eval_steps_per_second': 85.077, 'epoch': 1.09}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence2, idx, sentence1. If sentence2, idx, sentence1 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 8


{'loss': 0.3613, 'learning_rate': 7.82307532588185e-06, 'epoch': 2.18}


Saving model checkpoint to ray-hp-search/run-3/checkpoint-1000
Configuration saved in ray-hp-search/run-3/checkpoint-1000/config.json
Model weights saved in ray-hp-search/run-3/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ray-hp-search/run-3/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ray-hp-search/run-3/checkpoint-1000/special_tokens_map.json


{'eval_loss': 0.31737855076789856, 'eval_accuracy': 0.8799019607843137, 'eval_f1': 0.9126559714795008, 'eval_runtime': 0.5832, 'eval_samples_per_second': 699.553, 'eval_steps_per_second': 87.444, 'epoch': 2.18}




Training completed. Do not forget to share your model on huggingface.co/models =)


[32m[I 2022-07-27 20:12:25,421][0m Trial 3 finished with value: 0.9126559714795008 and parameters: {'learning_rate': 2.8573938259255455e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 64}. Best is trial 3 with value: 0.9126559714795008.[0m
Trial:


{'train_runtime': 58.6036, 'train_samples_per_second': 187.77, 'train_steps_per_second': 23.497, 'train_loss': 0.3903980961552373, 'epoch': 3.0}


loading configuration file https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e545113a9b5e643083a51ba645bed76c8df28ab46010b1ae39650ce7c668fe0c.142acdac8e7631616b334d09a95700d181a33a84fd7eb9ab46593634d0873dda
Model config BertConfig {
  "_name_or_path": "microsoft/xtremedistil-l6-h256-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://h

{'loss': 0.5364, 'learning_rate': 5.582178493069879e-06, 'epoch': 1.09}


Saving model checkpoint to ray-hp-search/run-4/checkpoint-500
Configuration saved in ray-hp-search/run-4/checkpoint-500/config.json
Model weights saved in ray-hp-search/run-4/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ray-hp-search/run-4/checkpoint-500/tokenizer_config.json
Special tokens file saved in ray-hp-search/run-4/checkpoint-500/special_tokens_map.json


{'eval_loss': 0.45770591497421265, 'eval_accuracy': 0.8284313725490197, 'eval_f1': 0.8817567567567567, 'eval_runtime': 0.5952, 'eval_samples_per_second': 685.514, 'eval_steps_per_second': 85.689, 'epoch': 1.09}




Training completed. Do not forget to share your model on huggingface.co/models =)


[32m[I 2022-07-27 20:13:04,411][0m Trial 4 finished with value: 0.8817567567567567 and parameters: {'learning_rate': 1.2259425494349638e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 32}. Best is trial 3 with value: 0.9126559714795008.[0m
Trial:


{'train_runtime': 38.1707, 'train_samples_per_second': 192.19, 'train_steps_per_second': 24.05, 'train_loss': 0.5021245952265455, 'epoch': 2.0}


loading configuration file https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e545113a9b5e643083a51ba645bed76c8df28ab46010b1ae39650ce7c668fe0c.142acdac8e7631616b334d09a95700d181a33a84fd7eb9ab46593634d0873dda
Model config BertConfig {
  "_name_or_path": "microsoft/xtremedistil-l6-h256-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://h

{'loss': 0.6305, 'learning_rate': 1.7992773374942384e-06, 'epoch': 1.09}


[32m[I 2022-07-27 20:13:25,863][0m Trial 5 pruned. [0m
Trial:


{'eval_loss': 0.5787696242332458, 'eval_accuracy': 0.7769607843137255, 'eval_f1': 0.8557844690966719, 'eval_runtime': 0.5729, 'eval_samples_per_second': 712.12, 'eval_steps_per_second': 89.015, 'epoch': 1.09}


loading configuration file https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e545113a9b5e643083a51ba645bed76c8df28ab46010b1ae39650ce7c668fe0c.142acdac8e7631616b334d09a95700d181a33a84fd7eb9ab46593634d0873dda
Model config BertConfig {
  "_name_or_path": "microsoft/xtremedistil-l6-h256-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://h

{'loss': 0.5012, 'learning_rate': 1.5871050084836866e-05, 'epoch': 1.09}


[32m[I 2022-07-27 20:13:47,123][0m Trial 6 pruned. [0m
Trial:


{'eval_loss': 0.42514172196388245, 'eval_accuracy': 0.8308823529411765, 'eval_f1': 0.880415944540728, 'eval_runtime': 0.5881, 'eval_samples_per_second': 693.765, 'eval_steps_per_second': 86.721, 'epoch': 1.09}


loading configuration file https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e545113a9b5e643083a51ba645bed76c8df28ab46010b1ae39650ce7c668fe0c.142acdac8e7631616b334d09a95700d181a33a84fd7eb9ab46593634d0873dda
Model config BertConfig {
  "_name_or_path": "microsoft/xtremedistil-l6-h256-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://h

{'train_runtime': 19.1303, 'train_samples_per_second': 191.737, 'train_steps_per_second': 23.993, 'train_loss': 0.6549071600754017, 'epoch': 1.0}


[32m[I 2022-07-27 20:14:07,683][0m Trial 7 finished with value: 0.8341232227488151 and parameters: {'learning_rate': 2.0620097470827714e-06, 'num_train_epochs': 1, 'per_device_train_batch_size': 8}. Best is trial 3 with value: 0.9126559714795008.[0m
Trial:


{'eval_loss': 0.6358305811882019, 'eval_accuracy': 0.7426470588235294, 'eval_f1': 0.8341232227488151, 'eval_runtime': 0.5672, 'eval_samples_per_second': 719.318, 'eval_steps_per_second': 89.915, 'epoch': 1.0}


loading configuration file https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e545113a9b5e643083a51ba645bed76c8df28ab46010b1ae39650ce7c668fe0c.142acdac8e7631616b334d09a95700d181a33a84fd7eb9ab46593634d0873dda
Model config BertConfig {
  "_name_or_path": "microsoft/xtremedistil-l6-h256-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://h

{'loss': 0.5566, 'learning_rate': 5.444346974041073e-06, 'epoch': 1.09}


[32m[I 2022-07-27 20:14:28,520][0m Trial 8 pruned. [0m
Trial:


{'eval_loss': 0.47510120272636414, 'eval_accuracy': 0.821078431372549, 'eval_f1': 0.8773109243697478, 'eval_runtime': 0.572, 'eval_samples_per_second': 713.297, 'eval_steps_per_second': 89.162, 'epoch': 1.09}


loading configuration file https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e545113a9b5e643083a51ba645bed76c8df28ab46010b1ae39650ce7c668fe0c.142acdac8e7631616b334d09a95700d181a33a84fd7eb9ab46593634d0873dda
Model config BertConfig {
  "_name_or_path": "microsoft/xtremedistil-l6-h256-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://h

{'loss': 0.4692, 'learning_rate': 2.5177222936868015e-05, 'epoch': 1.09}


Saving model checkpoint to ray-hp-search/run-9/checkpoint-500
Configuration saved in ray-hp-search/run-9/checkpoint-500/config.json
Model weights saved in ray-hp-search/run-9/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ray-hp-search/run-9/checkpoint-500/tokenizer_config.json
Special tokens file saved in ray-hp-search/run-9/checkpoint-500/special_tokens_map.json


{'eval_loss': 0.33517372608184814, 'eval_accuracy': 0.875, 'eval_f1': 0.9097345132743363, 'eval_runtime': 0.5839, 'eval_samples_per_second': 698.705, 'eval_steps_per_second': 87.338, 'epoch': 1.09}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence2, idx, sentence1. If sentence2, idx, sentence1 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 8


{'loss': 0.3388, 'learning_rate': 1.0823047944354893e-05, 'epoch': 2.18}


Saving model checkpoint to ray-hp-search/run-9/checkpoint-1000
Configuration saved in ray-hp-search/run-9/checkpoint-1000/config.json
Model weights saved in ray-hp-search/run-9/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ray-hp-search/run-9/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ray-hp-search/run-9/checkpoint-1000/special_tokens_map.json


{'eval_loss': 0.31679004430770874, 'eval_accuracy': 0.8946078431372549, 'eval_f1': 0.9249563699825479, 'eval_runtime': 0.5829, 'eval_samples_per_second': 699.907, 'eval_steps_per_second': 87.488, 'epoch': 2.18}




Training completed. Do not forget to share your model on huggingface.co/models =)


[32m[I 2022-07-27 20:15:25,038][0m Trial 9 finished with value: 0.9249563699825479 and parameters: {'learning_rate': 3.9531397929381135e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 32}. Best is trial 9 with value: 0.9249563699825479.[0m
Trial:


{'train_runtime': 55.6982, 'train_samples_per_second': 197.565, 'train_steps_per_second': 24.723, 'train_loss': 0.3671695872675966, 'epoch': 3.0}


loading configuration file https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e545113a9b5e643083a51ba645bed76c8df28ab46010b1ae39650ce7c668fe0c.142acdac8e7631616b334d09a95700d181a33a84fd7eb9ab46593634d0873dda
Model config BertConfig {
  "_name_or_path": "microsoft/xtremedistil-l6-h256-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://h

{'loss': 0.4523, 'learning_rate': 4.883901660922127e-05, 'epoch': 1.09}


Saving model checkpoint to ray-hp-search/run-10/checkpoint-500
Configuration saved in ray-hp-search/run-10/checkpoint-500/config.json
Model weights saved in ray-hp-search/run-10/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ray-hp-search/run-10/checkpoint-500/tokenizer_config.json
Special tokens file saved in ray-hp-search/run-10/checkpoint-500/special_tokens_map.json


{'eval_loss': 0.3306081295013428, 'eval_accuracy': 0.8774509803921569, 'eval_f1': 0.9116607773851589, 'eval_runtime': 0.5558, 'eval_samples_per_second': 734.055, 'eval_steps_per_second': 91.757, 'epoch': 1.09}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence2, idx, sentence1. If sentence2, idx, sentence1 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 8


{'loss': 0.3116, 'learning_rate': 3.5234833709716744e-05, 'epoch': 2.18}


Saving model checkpoint to ray-hp-search/run-10/checkpoint-1000
Configuration saved in ray-hp-search/run-10/checkpoint-1000/config.json
Model weights saved in ray-hp-search/run-10/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ray-hp-search/run-10/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ray-hp-search/run-10/checkpoint-1000/special_tokens_map.json


{'eval_loss': 0.3301372528076172, 'eval_accuracy': 0.9044117647058824, 'eval_f1': 0.9321739130434782, 'eval_runtime': 0.5616, 'eval_samples_per_second': 726.491, 'eval_steps_per_second': 90.811, 'epoch': 2.18}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence2, idx, sentence1. If sentence2, idx, sentence1 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 8


{'loss': 0.2235, 'learning_rate': 2.1630650810212208e-05, 'epoch': 3.27}


Saving model checkpoint to ray-hp-search/run-10/checkpoint-1500
Configuration saved in ray-hp-search/run-10/checkpoint-1500/config.json
Model weights saved in ray-hp-search/run-10/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ray-hp-search/run-10/checkpoint-1500/tokenizer_config.json
Special tokens file saved in ray-hp-search/run-10/checkpoint-1500/special_tokens_map.json


{'eval_loss': 0.416564404964447, 'eval_accuracy': 0.8946078431372549, 'eval_f1': 0.9249563699825479, 'eval_runtime': 0.5686, 'eval_samples_per_second': 717.515, 'eval_steps_per_second': 89.689, 'epoch': 3.27}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence2, idx, sentence1. If sentence2, idx, sentence1 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 8


{'loss': 0.1615, 'learning_rate': 8.026467910707675e-06, 'epoch': 4.36}


Saving model checkpoint to ray-hp-search/run-10/checkpoint-2000
Configuration saved in ray-hp-search/run-10/checkpoint-2000/config.json
Model weights saved in ray-hp-search/run-10/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in ray-hp-search/run-10/checkpoint-2000/tokenizer_config.json
Special tokens file saved in ray-hp-search/run-10/checkpoint-2000/special_tokens_map.json


{'eval_loss': 0.46860161423683167, 'eval_accuracy': 0.8823529411764706, 'eval_f1': 0.915492957746479, 'eval_runtime': 0.574, 'eval_samples_per_second': 710.743, 'eval_steps_per_second': 88.843, 'epoch': 4.36}




Training completed. Do not forget to share your model on huggingface.co/models =)


[32m[I 2022-07-27 20:17:00,593][0m Trial 10 finished with value: 0.915492957746479 and parameters: {'learning_rate': 6.244319950872581e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4}. Best is trial 9 with value: 0.9249563699825479.[0m
Trial:


{'train_runtime': 94.6952, 'train_samples_per_second': 193.674, 'train_steps_per_second': 24.236, 'train_loss': 0.2657350909995617, 'epoch': 5.0}


loading configuration file https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e545113a9b5e643083a51ba645bed76c8df28ab46010b1ae39650ce7c668fe0c.142acdac8e7631616b334d09a95700d181a33a84fd7eb9ab46593634d0873dda
Model config BertConfig {
  "_name_or_path": "microsoft/xtremedistil-l6-h256-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://h

{'loss': 0.4478, 'learning_rate': 5.1342065896239045e-05, 'epoch': 1.09}


Saving model checkpoint to ray-hp-search/run-11/checkpoint-500
Configuration saved in ray-hp-search/run-11/checkpoint-500/config.json
Model weights saved in ray-hp-search/run-11/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ray-hp-search/run-11/checkpoint-500/tokenizer_config.json
Special tokens file saved in ray-hp-search/run-11/checkpoint-500/special_tokens_map.json


{'eval_loss': 0.3349061906337738, 'eval_accuracy': 0.8799019607843137, 'eval_f1': 0.9129662522202486, 'eval_runtime': 0.581, 'eval_samples_per_second': 702.265, 'eval_steps_per_second': 87.783, 'epoch': 1.09}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence2, idx, sentence1. If sentence2, idx, sentence1 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 8


{'loss': 0.3121, 'learning_rate': 3.704065478308054e-05, 'epoch': 2.18}


Saving model checkpoint to ray-hp-search/run-11/checkpoint-1000
Configuration saved in ray-hp-search/run-11/checkpoint-1000/config.json
Model weights saved in ray-hp-search/run-11/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ray-hp-search/run-11/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ray-hp-search/run-11/checkpoint-1000/special_tokens_map.json


{'eval_loss': 0.33052220940589905, 'eval_accuracy': 0.9019607843137255, 'eval_f1': 0.9298245614035087, 'eval_runtime': 0.5795, 'eval_samples_per_second': 704.018, 'eval_steps_per_second': 88.002, 'epoch': 2.18}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence2, idx, sentence1. If sentence2, idx, sentence1 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 8


{'loss': 0.223, 'learning_rate': 2.2739243669922026e-05, 'epoch': 3.27}


Saving model checkpoint to ray-hp-search/run-11/checkpoint-1500
Configuration saved in ray-hp-search/run-11/checkpoint-1500/config.json
Model weights saved in ray-hp-search/run-11/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ray-hp-search/run-11/checkpoint-1500/tokenizer_config.json
Special tokens file saved in ray-hp-search/run-11/checkpoint-1500/special_tokens_map.json


{'eval_loss': 0.4268735945224762, 'eval_accuracy': 0.875, 'eval_f1': 0.9103690685413005, 'eval_runtime': 0.5869, 'eval_samples_per_second': 695.194, 'eval_steps_per_second': 86.899, 'epoch': 3.27}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence2, idx, sentence1. If sentence2, idx, sentence1 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 8


{'loss': 0.1647, 'learning_rate': 8.43783255676352e-06, 'epoch': 4.36}


Saving model checkpoint to ray-hp-search/run-11/checkpoint-2000
Configuration saved in ray-hp-search/run-11/checkpoint-2000/config.json
Model weights saved in ray-hp-search/run-11/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in ray-hp-search/run-11/checkpoint-2000/tokenizer_config.json
Special tokens file saved in ray-hp-search/run-11/checkpoint-2000/special_tokens_map.json


{'eval_loss': 0.4631838798522949, 'eval_accuracy': 0.8799019607843137, 'eval_f1': 0.9135802469135803, 'eval_runtime': 0.5713, 'eval_samples_per_second': 714.171, 'eval_steps_per_second': 89.271, 'epoch': 4.36}




Training completed. Do not forget to share your model on huggingface.co/models =)


[32m[I 2022-07-27 20:18:36,372][0m Trial 11 finished with value: 0.9135802469135803 and parameters: {'learning_rate': 6.564347700939755e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4}. Best is trial 9 with value: 0.9249563699825479.[0m
Trial:


{'train_runtime': 94.9437, 'train_samples_per_second': 193.167, 'train_steps_per_second': 24.172, 'train_loss': 0.265488990667339, 'epoch': 5.0}


loading configuration file https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e545113a9b5e643083a51ba645bed76c8df28ab46010b1ae39650ce7c668fe0c.142acdac8e7631616b334d09a95700d181a33a84fd7eb9ab46593634d0873dda
Model config BertConfig {
  "_name_or_path": "microsoft/xtremedistil-l6-h256-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://h

{'loss': 0.4498, 'learning_rate': 4.9979818251870405e-05, 'epoch': 1.09}


Saving model checkpoint to ray-hp-search/run-12/checkpoint-500
Configuration saved in ray-hp-search/run-12/checkpoint-500/config.json
Model weights saved in ray-hp-search/run-12/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ray-hp-search/run-12/checkpoint-500/tokenizer_config.json
Special tokens file saved in ray-hp-search/run-12/checkpoint-500/special_tokens_map.json


{'eval_loss': 0.33227217197418213, 'eval_accuracy': 0.875, 'eval_f1': 0.9097345132743363, 'eval_runtime': 0.5663, 'eval_samples_per_second': 720.463, 'eval_steps_per_second': 90.058, 'epoch': 1.09}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence2, idx, sentence1. If sentence2, idx, sentence1 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 8


{'loss': 0.3109, 'learning_rate': 3.605786330705971e-05, 'epoch': 2.18}


[32m[I 2022-07-27 20:19:17,850][0m Trial 12 pruned. [0m
Trial:


{'eval_loss': 0.3452945649623871, 'eval_accuracy': 0.8848039215686274, 'eval_f1': 0.9171075837742505, 'eval_runtime': 0.5795, 'eval_samples_per_second': 703.995, 'eval_steps_per_second': 87.999, 'epoch': 2.18}


loading configuration file https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e545113a9b5e643083a51ba645bed76c8df28ab46010b1ae39650ce7c668fe0c.142acdac8e7631616b334d09a95700d181a33a84fd7eb9ab46593634d0873dda
Model config BertConfig {
  "_name_or_path": "microsoft/xtremedistil-l6-h256-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://h

{'loss': 0.4605, 'learning_rate': 2.0593907774260154e-05, 'epoch': 1.09}


Saving model checkpoint to ray-hp-search/run-13/checkpoint-500
Configuration saved in ray-hp-search/run-13/checkpoint-500/config.json
Model weights saved in ray-hp-search/run-13/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ray-hp-search/run-13/checkpoint-500/tokenizer_config.json
Special tokens file saved in ray-hp-search/run-13/checkpoint-500/special_tokens_map.json


{'eval_loss': 0.3278953731060028, 'eval_accuracy': 0.8725490196078431, 'eval_f1': 0.9078014184397164, 'eval_runtime': 0.5858, 'eval_samples_per_second': 696.435, 'eval_steps_per_second': 87.054, 'epoch': 1.09}




Training completed. Do not forget to share your model on huggingface.co/models =)


[32m[I 2022-07-27 20:19:56,391][0m Trial 13 finished with value: 0.9078014184397164 and parameters: {'learning_rate': 4.522776874825555e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 8}. Best is trial 9 with value: 0.9249563699825479.[0m
Trial:


{'train_runtime': 37.7003, 'train_samples_per_second': 194.587, 'train_steps_per_second': 24.35, 'train_loss': 0.4045644190836057, 'epoch': 2.0}


loading configuration file https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e545113a9b5e643083a51ba645bed76c8df28ab46010b1ae39650ce7c668fe0c.142acdac8e7631616b334d09a95700d181a33a84fd7eb9ab46593634d0873dda
Model config BertConfig {
  "_name_or_path": "microsoft/xtremedistil-l6-h256-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://h

{'loss': 0.5858, 'learning_rate': 3.912390510103469e-06, 'epoch': 1.09}


[32m[I 2022-07-27 20:20:17,255][0m Trial 14 pruned. [0m
Trial:


{'eval_loss': 0.5104352831840515, 'eval_accuracy': 0.8063725490196079, 'eval_f1': 0.8685524126455907, 'eval_runtime': 0.5626, 'eval_samples_per_second': 725.224, 'eval_steps_per_second': 90.653, 'epoch': 1.09}


loading configuration file https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e545113a9b5e643083a51ba645bed76c8df28ab46010b1ae39650ce7c668fe0c.142acdac8e7631616b334d09a95700d181a33a84fd7eb9ab46593634d0873dda
Model config BertConfig {
  "_name_or_path": "microsoft/xtremedistil-l6-h256-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://h

{'loss': 0.6601, 'learning_rate': 8.409512249780387e-07, 'epoch': 1.09}


[32m[I 2022-07-27 20:20:38,044][0m Trial 15 pruned. [0m
Trial:


{'eval_loss': 0.6340118050575256, 'eval_accuracy': 0.7475490196078431, 'eval_f1': 0.8403100775193798, 'eval_runtime': 0.5762, 'eval_samples_per_second': 708.038, 'eval_steps_per_second': 88.505, 'epoch': 1.09}


loading configuration file https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e545113a9b5e643083a51ba645bed76c8df28ab46010b1ae39650ce7c668fe0c.142acdac8e7631616b334d09a95700d181a33a84fd7eb9ab46593634d0873dda
Model config BertConfig {
  "_name_or_path": "microsoft/xtremedistil-l6-h256-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://h

{'loss': 0.4713, 'learning_rate': 2.4565831524757642e-05, 'epoch': 1.09}


Saving model checkpoint to ray-hp-search/run-16/checkpoint-500
Configuration saved in ray-hp-search/run-16/checkpoint-500/config.json
Model weights saved in ray-hp-search/run-16/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ray-hp-search/run-16/checkpoint-500/tokenizer_config.json
Special tokens file saved in ray-hp-search/run-16/checkpoint-500/special_tokens_map.json


{'eval_loss': 0.3363688588142395, 'eval_accuracy': 0.8799019607843137, 'eval_f1': 0.9135802469135803, 'eval_runtime': 0.5802, 'eval_samples_per_second': 703.193, 'eval_steps_per_second': 87.899, 'epoch': 1.09}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence2, idx, sentence1. If sentence2, idx, sentence1 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 8


{'loss': 0.3426, 'learning_rate': 1.056022632250129e-05, 'epoch': 2.18}


[32m[I 2022-07-27 20:21:20,522][0m Trial 16 pruned. [0m
Trial:


{'eval_loss': 0.3212238848209381, 'eval_accuracy': 0.8921568627450981, 'eval_f1': 0.9236111111111112, 'eval_runtime': 0.5664, 'eval_samples_per_second': 720.367, 'eval_steps_per_second': 90.046, 'epoch': 2.18}


loading configuration file https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e545113a9b5e643083a51ba645bed76c8df28ab46010b1ae39650ce7c668fe0c.142acdac8e7631616b334d09a95700d181a33a84fd7eb9ab46593634d0873dda
Model config BertConfig {
  "_name_or_path": "microsoft/xtremedistil-l6-h256-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://h

{'loss': 0.5123, 'learning_rate': 8.410005468913953e-06, 'epoch': 1.09}


[32m[I 2022-07-27 20:21:41,533][0m Trial 17 pruned. [0m
Trial:


{'eval_loss': 0.43712273240089417, 'eval_accuracy': 0.8308823529411765, 'eval_f1': 0.882051282051282, 'eval_runtime': 0.625, 'eval_samples_per_second': 652.76, 'eval_steps_per_second': 81.595, 'epoch': 1.09}


loading configuration file https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e545113a9b5e643083a51ba645bed76c8df28ab46010b1ae39650ce7c668fe0c.142acdac8e7631616b334d09a95700d181a33a84fd7eb9ab46593634d0873dda
Model config BertConfig {
  "_name_or_path": "microsoft/xtremedistil-l6-h256-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://h

{'loss': 0.4593, 'learning_rate': 3.581517089819552e-05, 'epoch': 1.09}


Saving model checkpoint to ray-hp-search/run-18/checkpoint-500
Configuration saved in ray-hp-search/run-18/checkpoint-500/config.json
Model weights saved in ray-hp-search/run-18/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ray-hp-search/run-18/checkpoint-500/tokenizer_config.json
Special tokens file saved in ray-hp-search/run-18/checkpoint-500/special_tokens_map.json


{'eval_loss': 0.3443868160247803, 'eval_accuracy': 0.8651960784313726, 'eval_f1': 0.9016100178890876, 'eval_runtime': 0.5744, 'eval_samples_per_second': 710.275, 'eval_steps_per_second': 88.784, 'epoch': 1.09}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence2, idx, sentence1. If sentence2, idx, sentence1 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 8


{'loss': 0.3277, 'learning_rate': 2.241128957402055e-05, 'epoch': 2.18}


[32m[I 2022-07-27 20:22:23,045][0m Trial 18 pruned. [0m
Trial:


{'eval_loss': 0.30488160252571106, 'eval_accuracy': 0.8921568627450981, 'eval_f1': 0.9219858156028369, 'eval_runtime': 0.5796, 'eval_samples_per_second': 703.929, 'eval_steps_per_second': 87.991, 'epoch': 2.18}


loading configuration file https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e545113a9b5e643083a51ba645bed76c8df28ab46010b1ae39650ce7c668fe0c.142acdac8e7631616b334d09a95700d181a33a84fd7eb9ab46593634d0873dda
Model config BertConfig {
  "_name_or_path": "microsoft/xtremedistil-l6-h256-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://h

{'loss': 0.5349, 'learning_rate': 7.576941113638977e-06, 'epoch': 1.09}


[32m[I 2022-07-27 20:22:44,351][0m Trial 19 pruned. [0m


{'eval_loss': 0.4520810544490814, 'eval_accuracy': 0.8284313725490197, 'eval_f1': 0.8813559322033899, 'eval_runtime': 0.5895, 'eval_samples_per_second': 692.087, 'eval_steps_per_second': 86.511, 'epoch': 1.09}


In [None]:
# Show the winning run of the hyperparameter search. Per the recorded output of
# this cell, the value prints as a BestRun carrying `run_id`, `objective` and
# `hyperparameters` fields.
# NOTE(review): `best_trial` is assigned in an earlier cell not visible here;
# the objective appears to be the trial's eval_f1 (the per-trial "value" in the
# search log matches the logged eval_f1) - confirm against the search call.
print(best_trial)

BestRun(run_id='9', objective=0.9249563699825479, hyperparameters={'learning_rate': 3.9531397929381135e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 32})
