# Training a BERT4Rec baseline  

In [None]:
!pip install transformers4rec[pytorch,nvtabular]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers4rec[nvtabular,pytorch]
  Downloading transformers4rec-0.1.16.tar.gz (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m51.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting transformers<4.19
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.0/4.0 MB[0m [31m66.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting betterproto<2.0.0
  Downloading betterproto-1.2.5.tar.gz (26 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting torchmetrics>=0.10.0
  Downloadi

In [None]:
import os
import glob

import torch 
import transformers4rec.torch as tr

from transformers4rec.torch.ranking_metric import NDCGAt, RecallAt
from transformers4rec.torch.utils.examples_utils import wipe_memory

In [None]:
from merlin_standard_lib import Schema

# Location of the schema file (protobuf text format) describing the processed dataset.
SCHEMA_PATH = '/content/drive/MyDrive/dataset_rees46/processed_nvt/schema.pbtxt'

# Load the full schema and keep only the item-id sequence column; the resulting
# single-feature schema is what gets passed to TabularSequenceFeatures.
schema = Schema().from_proto_text(SCHEMA_PATH).select_by_name(['product_id-list'])



In [None]:
# Display the filtered schema to confirm that only 'product_id-list' was selected.
schema

[{'name': 'product_id-list', 'value_count': {'min': '2', 'max': '20'}, 'type': 'INT', 'int_domain': {'name': 'product_id', 'max': '166795', 'is_categorical': True}, 'annotation': {'tag': ['item', 'id', 'list', 'item_id', 'categorical'], 'comment': ['{"freq_threshold": 0.0, "start_index": 1.0, "is_list": true, "embedding_sizes": {"dimension": 512.0, "cardinality": 166796.0}, "cat_path": ".//categories/unique.product_id.parquet", "max_size": 0.0, "is_ragged": true, "num_buckets": null, "dtype_item_size": 64.0}']}}]

In [None]:
!head -50 $SCHEMA_PATH

feature {
  name: "product_id-count"
  type: INT
  int_domain {
    name: "product_id"
    max: 166795
    is_categorical: true
  }
  annotation {
    tag: "categorical"
    extra_metadata {
      type_url: "type.googleapis.com/google.protobuf.Struct"
      value: "\n\034\n\017dtype_item_size\022\t\021\000\000\000\000\000\000@@\nG\n\017embedding_sizes\0224*2\n\026\n\tdimension\022\t\021\000\000\000\000\000\000\200@\n\030\n\013cardinality\022\t\021\000\000\000\000`\\\004A\n\033\n\016freq_threshold\022\t\021\000\000\000\000\000\000\000\000\n\017\n\tis_ragged\022\002 \000\n\r\n\007is_list\022\002 \000\n\021\n\013num_buckets\022\002\010\000\n\025\n\010max_size\022\t\021\000\000\000\000\000\000\000\000\n5\n\010cat_path\022)\032\'.//categories/unique.product_id.parquet\n\030\n\013start_index\022\t\021\000\000\000\000\000\000\360?"
    }
  }
}
feature {
  name: "user_session"
  type: INT
  int_domain {
    name: "user_session"
    max: 9244422
    is_categorical: true
  }
  annotation {
    t

In [None]:
# Input hyperparameters: maximum session length and the model (hidden) dimension.
sequence_length = 20
d_model = 320

# Options for the input module: sequences are capped at `sequence_length`, the
# embedded features are projected to `d_model`, and masked language modeling
# ("mlm") is used to prepare the masked inputs.
input_options = dict(
    max_sequence_length=sequence_length,
    d_output=d_model,
    masking="mlm",
)

# Build the input module that processes the tabular sequence features declared in `schema`.
inputs = tr.TabularSequenceFeatures.from_schema(schema, **input_options)

In [None]:
# Display the input module's layers (embedding table, projection block, masking).
inputs

TabularSequenceFeatures(
  (to_merge): ModuleDict(
    (categorical_module): SequenceEmbeddingFeatures(
      (filter_features): FilterFeatures()
      (embedding_tables): ModuleDict(
        (product_id-list): Embedding(166796, 64, padding_idx=0)
      )
    )
  )
  (_aggregation): ConcatFeatures()
  (projection_module): SequentialBlock(
    (0): DenseBlock(
      (0): Linear(in_features=64, out_features=320, bias=True)
      (1): ReLU(inplace=True)
    )
  )
  (_masking): MaskedLanguageModeling()
)

La tecnica di tying embedding è spesso usata in NLP e "collega" i pesi dell'input embedding con la matrice dell'output projection layer. 
Questo viene fatto perché l'input e l'output dei modelli sono nello stesso spazio, cioè sono delle parole (in NLP) e dovrebbero ricadere nello stesso spazio vettoriale. 
I modelli di rec sys in modo analogo hanno gli item id come input e come output. 

Con questa tecnica si ridurrebbero i parametri del modello, poiché il numero di embedding delle feature categoriche ad alta cardinalità è molto maggiore rispetto all'NLP. 
Un altro beneficio è che si ha una regolarizzazione del modello, adattando la sua complessità ai dati.

I modelli rec sys di deep learning sono generalmente limitati dallo spazio di memoria disponibile durante il training e l'evaluation (sono memory-bound) e molti parametri sono concentrati in grandi tabelle di embedding. Qui subentra la tecnica di tying embedding, che riduce i requisiti di memoria utilizzando soltanto una matrice di proiezione sia per gli item sia per le rappresentazioni dell'output. 
Questa riduzione dei parametri non è il solo motivo per cui si utilizza la tecnica di tying embedding. 
Questa tecnica introduce inoltre un'operazione di fattorizzazione della matrice tra gli item embeddings e la rappresentazione finale della sessione. 



In [None]:
# Transformer configuration (ALBERT architecture) for the sequence encoder.
# NOTE(review): several keyword arguments below (e.g. label_smoothing,
# weight_decay, mlm_probability, stochastic_shared_embeddings_replacement_prob)
# look like training/masking hyperparameters rather than transformer-config
# options - confirm that AlbertConfig.build actually consumes them and that
# they are not silently ignored.
transformer_config = tr.AlbertConfig.build(
    d_model=d_model,
    item_embedding_dim=320,
    n_head=8,
    n_layer=2,
    total_seq_length=sequence_length,
    stochastic_shared_embeddings_replacement_prob=0.06,  # regularization
    input_dropout=0.1,
    dropout=0.0,  # regularization
    label_smoothing=0.2,  # regularization (proved to be useful in train/val accuracy)
    weight_decay=9.565968888623912e-05,  # regularization
    item_id_embeddings_init_std=0.11,
    mlm_probability=0.6,
    eval_on_last_item_seq_only=True,
    mf_constrained_embeddings=True,
    layer_norm_featurewise=True,
    num_hidden_groups=1,
    inner_group_num=1,
)

# Model body: input module (with masking and projection), an MLP block, then the
# transformer block, which shares the masking scheme of the input module.
projection_mlp = tr.MLPBlock([d_model])
transformer_block = tr.TransformerBlock(transformer_config, masking=inputs.masking)
body = tr.SequentialBlock(inputs, projection_mlp, transformer_block)

# Next-item prediction task with weight tying enabled (see the notes on tying
# embeddings above), evaluated with NDCG and Recall at cut-offs 10 and 20.
ranking_metrics = [
    NDCGAt(top_ks=[10, 20], labels_onehot=True),
    RecallAt(top_ks=[10, 20], labels_onehot=True),
]
prediction_task = tr.NextItemPredictionTask(weight_tying=True, metrics=ranking_metrics)

# Head that attaches the prediction task to the body.
head = tr.Head(body, prediction_task)

# End-to-end model wrapping the head.
model = tr.Model(head)



In [None]:
from transformers4rec.config.trainer import T4RecTrainingArguments
from transformers4rec.torch import Trainer
from transformers4rec.torch.utils.data_utils import MerlinDataLoader

# Training configuration for the T4Rec Trainer.
#
# `max_sequence_length` reuses `sequence_length` from the input-module cell
# instead of repeating the literal 20, so the trainer and the model cannot
# drift apart if the sequence length is changed in one place only.
#
# The captured run log shows checkpoints being written under `output_dir` every
# 500 steps, with the same checkpoint-N names appearing again on each training
# day, and the training loss being reported every `logging_steps` (200) steps.
training_args = T4RecTrainingArguments(
    output_dir="/content/drive/MyDrive/dataset_rees46/bert",
    max_sequence_length=sequence_length,
    data_loader_engine='merlin',
    num_train_epochs=10,
    dataloader_drop_last=True,
    compute_metrics_each_n_steps=1,
    per_device_train_batch_size=192,
    per_device_eval_batch_size=512,
    gradient_accumulation_steps=1,
    learning_rate=0.0004904752786458524,
    report_to=[],  # no external experiment-tracking integrations
    logging_steps=200,
)

In [None]:
# Create the T4Rec Trainer that drives both training and evaluation of `model`,
# using the arguments and schema defined above and with metric computation enabled.
trainer = Trainer(model=model, args=training_args, schema=schema, compute_metrics=True)

In [None]:
# Root folder of the per-day session data; can be overridden through the
# OUTPUT_DIR environment variable.
OUTPUT_DIR = os.getenv("OUTPUT_DIR", "/content/drive/MyDrive/dataset_rees46/sessions_by_day")

In [None]:
%%time
start_time_window_index = 1
final_time_window_index = 31
for time_index in range(start_time_window_index, final_time_window_index):
    # Set data 
    time_index_train = time_index
    time_index_eval = time_index + 1
    train_paths = glob.glob(os.path.join(OUTPUT_DIR, f"{time_index_train}/train.parquet"))
    eval_paths = glob.glob(os.path.join(OUTPUT_DIR, f"{time_index_eval}/test.parquet"))
    # Train on day related to time_index 
    print('*'*20)
    print("Launch training for day %s are:" %time_index)
    print('*'*20 + '\n')
    trainer.train_dataset_or_path = train_paths
    trainer.reset_lr_scheduler()
    trainer.train()
    trainer.state.global_step +=1
    # Evaluate on the following day
    trainer.eval_dataset_or_path = eval_paths
    train_metrics = trainer.evaluate(metric_key_prefix='eval')
    print('*'*20)
    print("Eval results for day %s are:\t" %time_index_eval)
    print('\n' + '*'*20 + '\n')
    for key in sorted(train_metrics.keys()):
        print(" %s = %s" % (key, str(train_metrics[key]))) 
    wipe_memory()

********************
Launch training for day 1 are:
********************



***** Running training *****
  Num examples = 111936
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 5830


Step,Training Loss
200,10.4449
400,9.12
600,8.929
800,8.6614
1000,8.5983
1200,8.4678
1400,8.3531
1600,8.2918
1800,8.197
2000,8.0936


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 2 are:	

********************

 eval_/loss = 8.172948837280273
 eval_/next-item/ndcg_at_10 = 0.0967572033405304
 eval_/next-item/ndcg_at_20 = 0.11630704253911972
 eval_/next-item/recall_at_10 = 0.18117186427116394
 eval_/next-item/recall_at_20 = 0.25898435711860657
 eval_runtime = 0.7703
 eval_samples_per_second = 16617.182
 eval_steps_per_second = 32.455
********************
Launch training for day 2 are:
********************



***** Running training *****
  Num examples = 105984
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 5520


Step,Training Loss
200,7.8147
400,7.7511
600,7.6259
800,7.4737
1000,7.4206
1200,7.2945
1400,7.1948
1600,7.1972
1800,7.028
2000,6.9976


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 3 are:	

********************

 eval_/loss = 7.575089931488037
 eval_/next-item/ndcg_at_10 = 0.1283506155014038
 eval_/next-item/ndcg_at_20 = 0.15097562968730927
 eval_/next-item/recall_at_10 = 0.23802649974822998
 eval_/next-item/recall_at_20 = 0.32753056287765503
 eval_runtime = 0.708
 eval_samples_per_second = 16632.994
 eval_steps_per_second = 32.486
********************
Launch training for day 3 are:
********************



***** Running training *****
  Num examples = 97728
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 5090


Step,Training Loss
200,7.2315
400,7.2121
600,7.1183
800,6.9609
1000,6.9743
1200,6.8198
1400,6.8267
1600,6.7453
1800,6.7086
2000,6.6817


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 4 are:	

********************

 eval_/loss = 7.2072367668151855
 eval_/next-item/ndcg_at_10 = 0.14284388720989227
 eval_/next-item/ndcg_at_20 = 0.16713716089725494
 eval_/next-item/recall_at_10 = 0.2571614682674408
 eval_/next-item/recall_at_20 = 0.3532552123069763
 eval_runtime = 0.9195
 eval_samples_per_second = 16704.711
 eval_steps_per_second = 32.626
********************
Launch training for day 4 are:
********************



***** Running training *****
  Num examples = 124416
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 6480


Step,Training Loss
200,6.8972
400,6.8626
600,6.8289
800,6.7237
1000,6.6444
1200,6.6535
1400,6.5909
1600,6.529
1800,6.5036
2000,6.4247


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 5 are:	

********************

 eval_/loss = 7.051671981811523
 eval_/next-item/ndcg_at_10 = 0.15136845409870148
 eval_/next-item/ndcg_at_20 = 0.17786242067813873
 eval_/next-item/recall_at_10 = 0.27531829476356506
 eval_/next-item/recall_at_20 = 0.38035300374031067
 eval_runtime = 0.835
 eval_samples_per_second = 16555.863
 eval_steps_per_second = 32.336
********************
Launch training for day 5 are:
********************



***** Running training *****
  Num examples = 114432
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 5960


Step,Training Loss
200,6.7089
400,6.6542
600,6.6525
800,6.5255
1000,6.4977
1200,6.5259
1400,6.4313
1600,6.3741
1800,6.4019
2000,6.2742


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 6 are:	

********************

 eval_/loss = 6.763752460479736
 eval_/next-item/ndcg_at_10 = 0.17027194797992706
 eval_/next-item/ndcg_at_20 = 0.19678239524364471
 eval_/next-item/recall_at_10 = 0.30743634700775146
 eval_/next-item/recall_at_20 = 0.41218170523643494
 eval_runtime = 0.8305
 eval_samples_per_second = 16644.78
 eval_steps_per_second = 32.509
********************
Launch training for day 6 are:
********************



***** Running training *****
  Num examples = 112704
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 5870


Step,Training Loss
200,6.5616
400,6.5093
600,6.4411
800,6.3887
1000,6.3583
1200,6.3584
1400,6.2565
1600,6.2713
1800,6.2267
2000,6.1949


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 7 are:	

********************

 eval_/loss = 6.939610481262207
 eval_/next-item/ndcg_at_10 = 0.1629038006067276
 eval_/next-item/ndcg_at_20 = 0.1900152713060379
 eval_/next-item/recall_at_10 = 0.29499998688697815
 eval_/next-item/recall_at_20 = 0.4020312428474426
 eval_runtime = 0.7982
 eval_samples_per_second = 16035.94
 eval_steps_per_second = 31.32
********************
Launch training for day 7 are:
********************



***** Running training *****
  Num examples = 105600
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 5500


Step,Training Loss
200,6.6187
400,6.5846
600,6.5305
800,6.4334
1000,6.4563
1200,6.3468
1400,6.3478
1600,6.3457
1800,6.2494
2000,6.2468


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 8 are:	

********************

 eval_/loss = 6.901777744293213
 eval_/next-item/ndcg_at_10 = 0.16624481976032257
 eval_/next-item/ndcg_at_20 = 0.19264362752437592
 eval_/next-item/recall_at_10 = 0.3043619990348816
 eval_/next-item/recall_at_20 = 0.4089193046092987
 eval_runtime = 0.9413
 eval_samples_per_second = 16317.851
 eval_steps_per_second = 31.871
********************
Launch training for day 8 are:
********************



***** Running training *****
  Num examples = 124992
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 6510


Step,Training Loss
200,6.694
400,6.6129
600,6.6113
800,6.5718
1000,6.4826
1200,6.4311
1400,6.397
1600,6.3725
1800,6.3878
2000,6.3354


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 9 are:	

********************

 eval_/loss = 6.901921272277832
 eval_/next-item/ndcg_at_10 = 0.17226850986480713
 eval_/next-item/ndcg_at_20 = 0.19845454394817352
 eval_/next-item/recall_at_10 = 0.31088361144065857
 eval_/next-item/recall_at_20 = 0.4143318831920624
 eval_runtime = 0.9113
 eval_samples_per_second = 16292.646
 eval_steps_per_second = 31.822
********************
Launch training for day 9 are:
********************



***** Running training *****
  Num examples = 120768
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 6290


Step,Training Loss
200,6.61
400,6.65
600,6.5937
800,6.4771
1000,6.4763
1200,6.4465
1400,6.4184
1600,6.362
1800,6.3337
2000,6.2771


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 10 are:	

********************

 eval_/loss = 6.931536674499512
 eval_/next-item/ndcg_at_10 = 0.17533372342586517
 eval_/next-item/ndcg_at_20 = 0.20105977356433868
 eval_/next-item/recall_at_10 = 0.31850406527519226
 eval_/next-item/recall_at_20 = 0.42042824625968933
 eval_runtime = 0.8419
 eval_samples_per_second = 16419.596
 eval_steps_per_second = 32.07
********************
Launch training for day 10 are:
********************



***** Running training *****
  Num examples = 112320
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 5850


Step,Training Loss
200,6.5858
400,6.592
600,6.5145
800,6.4688
1000,6.439
1200,6.4411
1400,6.3456
1600,6.338
1800,6.3338
2000,6.2581


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 11 are:	

********************

 eval_/loss = 6.8886542320251465
 eval_/next-item/ndcg_at_10 = 0.1705126166343689
 eval_/next-item/ndcg_at_20 = 0.19679485261440277
 eval_/next-item/recall_at_10 = 0.31158447265625
 eval_/next-item/recall_at_20 = 0.41552734375
 eval_runtime = 1.0035
 eval_samples_per_second = 16326.135
 eval_steps_per_second = 31.887
********************
Launch training for day 11 are:
********************



***** Running training *****
  Num examples = 132480
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 6900


Step,Training Loss
200,6.5994
400,6.6201
600,6.592
800,6.5015
1000,6.4575
1200,6.4883
1400,6.4341
1600,6.3778
1800,6.3744
2000,6.3778


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 12 are:	

********************

 eval_/loss = 6.781890869140625
 eval_/next-item/ndcg_at_10 = 0.1765429973602295
 eval_/next-item/ndcg_at_20 = 0.20335368812084198
 eval_/next-item/recall_at_10 = 0.3169102668762207
 eval_/next-item/recall_at_20 = 0.42300906777381897
 eval_runtime = 0.9587
 eval_samples_per_second = 16555.297
 eval_steps_per_second = 32.335
********************
Launch training for day 12 are:
********************



***** Running training *****
  Num examples = 128640
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 6700


Step,Training Loss
200,6.4824
400,6.4876
600,6.4915
800,6.4089
1000,6.3534
1200,6.3303
1400,6.2872
1600,6.2822
1800,6.2448
2000,6.2633


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 13 are:	

********************

 eval_/loss = 6.719053745269775
 eval_/next-item/ndcg_at_10 = 0.1803516000509262
 eval_/next-item/ndcg_at_20 = 0.20826466381549835
 eval_/next-item/recall_at_10 = 0.3229549527168274
 eval_/next-item/recall_at_20 = 0.43342140316963196
 eval_runtime = 1.0762
 eval_samples_per_second = 16175.719
 eval_steps_per_second = 31.593
********************
Launch training for day 13 are:
********************



***** Running training *****
  Num examples = 141312
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 7360


Step,Training Loss
200,6.4279
400,6.3694
600,6.3778
800,6.3234
1000,6.299
1200,6.2821
1400,6.2761
1600,6.1932
1800,6.2088
2000,6.1942


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 14 are:	

********************

 eval_/loss = 6.743072032928467
 eval_/next-item/ndcg_at_10 = 0.1836458444595337
 eval_/next-item/ndcg_at_20 = 0.21032269299030304
 eval_/next-item/recall_at_10 = 0.3291666805744171
 eval_/next-item/recall_at_20 = 0.4345052242279053
 eval_runtime = 0.9397
 eval_samples_per_second = 16345.856
 eval_steps_per_second = 31.925
********************
Launch training for day 14 are:
********************



***** Running training *****
  Num examples = 127296
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 6630


Step,Training Loss
200,6.4212
400,6.4248
600,6.3494
800,6.3183
1000,6.2361
1200,6.2717
1400,6.2371
1600,6.1918
1800,6.1482
2000,6.1705


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 15 are:	

********************

 eval_/loss = 6.749814510345459
 eval_/next-item/ndcg_at_10 = 0.1830327808856964
 eval_/next-item/ndcg_at_20 = 0.21049407124519348
 eval_/next-item/recall_at_10 = 0.32891845703125
 eval_/next-item/recall_at_20 = 0.437744140625
 eval_runtime = 0.9939
 eval_samples_per_second = 16484.302
 eval_steps_per_second = 32.196
********************
Launch training for day 15 are:
********************



***** Running training *****
  Num examples = 136320
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 7100


Step,Training Loss
200,6.4401
400,6.431
600,6.4084
800,6.3745
1000,6.3315
1200,6.3053
1400,6.28
1600,6.2144
1800,6.2293
2000,6.187


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 16 are:	

********************

 eval_/loss = 6.754256248474121
 eval_/next-item/ndcg_at_10 = 0.18291352689266205
 eval_/next-item/ndcg_at_20 = 0.20987385511398315
 eval_/next-item/recall_at_10 = 0.3280029296875
 eval_/next-item/recall_at_20 = 0.43487548828125
 eval_runtime = 0.9861
 eval_samples_per_second = 16614.687
 eval_steps_per_second = 32.451
********************
Launch training for day 16 are:
********************



***** Running training *****
  Num examples = 133632
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 6960


Step,Training Loss
200,6.4385
400,6.3843
600,6.3973
800,6.2903
1000,6.3032
1200,6.2579
1400,6.2761
1600,6.2148
1800,6.1876
2000,6.1774


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 17 are:	

********************

 eval_/loss = 6.7532243728637695
 eval_/next-item/ndcg_at_10 = 0.18470366299152374
 eval_/next-item/ndcg_at_20 = 0.21100293099880219
 eval_/next-item/recall_at_10 = 0.3298087418079376
 eval_/next-item/recall_at_20 = 0.4336611032485962
 eval_runtime = 0.9088
 eval_samples_per_second = 16338.18
 eval_steps_per_second = 31.911
********************
Launch training for day 17 are:
********************



***** Running training *****
  Num examples = 122304
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 6370


Step,Training Loss
200,6.441
400,6.4292
600,6.4229
800,6.3307
1000,6.3024
1200,6.2747
1400,6.2267
1600,6.1911
1800,6.2078
2000,6.1706


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 18 are:	

********************

 eval_/loss = 6.943165302276611
 eval_/next-item/ndcg_at_10 = 0.17806600034236908
 eval_/next-item/ndcg_at_20 = 0.2048434466123581
 eval_/next-item/recall_at_10 = 0.31766632199287415
 eval_/next-item/recall_at_20 = 0.4233240783214569
 eval_runtime = 0.979
 eval_samples_per_second = 16212.851
 eval_steps_per_second = 31.666
********************
Launch training for day 18 are:
********************



***** Running training *****
  Num examples = 129216
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 6730


Step,Training Loss
200,6.5562
400,6.5348
600,6.4631
800,6.4182
1000,6.374
1200,6.3429
1400,6.3162
1600,6.2298
1800,6.2797
2000,6.2276


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 19 are:	

********************

 eval_/loss = 6.6992106437683105
 eval_/next-item/ndcg_at_10 = 0.1832534521818161
 eval_/next-item/ndcg_at_20 = 0.21079809963703156
 eval_/next-item/recall_at_10 = 0.32974138855934143
 eval_/next-item/recall_at_20 = 0.4386449456214905
 eval_runtime = 0.8889
 eval_samples_per_second = 16703.902
 eval_steps_per_second = 32.625
********************
Launch training for day 19 are:
********************



***** Running training *****
  Num examples = 122304
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 6370


Step,Training Loss
200,6.3559
400,6.3578
600,6.3214
800,6.2389
1000,6.2149
1200,6.2001
1400,6.1683
1600,6.1381
1800,6.119
2000,6.1074


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 20 are:	

********************

 eval_/loss = 6.460631847381592
 eval_/next-item/ndcg_at_10 = 0.1958910971879959
 eval_/next-item/ndcg_at_20 = 0.2234266698360443
 eval_/next-item/recall_at_10 = 0.3473958373069763
 eval_/next-item/recall_at_20 = 0.45631512999534607
 eval_runtime = 0.9378
 eval_samples_per_second = 16379.63
 eval_steps_per_second = 31.991
********************
Launch training for day 20 are:
********************



***** Running training *****
  Num examples = 126528
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 6590


Step,Training Loss
200,6.2254
400,6.2204
600,6.1759
800,6.1651
1000,6.1244
1200,6.108
1400,6.0427
1600,6.0299
1800,6.0178
2000,6.0217


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 21 are:	

********************

 eval_/loss = 6.49577522277832
 eval_/next-item/ndcg_at_10 = 0.19414766132831573
 eval_/next-item/ndcg_at_20 = 0.22212117910385132
 eval_/next-item/recall_at_10 = 0.34334591031074524
 eval_/next-item/recall_at_20 = 0.45373114943504333
 eval_runtime = 0.8908
 eval_samples_per_second = 16667.383
 eval_steps_per_second = 32.553
********************
Launch training for day 21 are:
********************



***** Running training *****
  Num examples = 120960
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 6300


Step,Training Loss
200,6.2319
400,6.1727
600,6.1613
800,6.0992
1000,6.0845
1200,6.0623
1400,6.0201
1600,5.9664
1800,5.9922
2000,5.9607


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 22 are:	

********************

 eval_/loss = 6.5262908935546875
 eval_/next-item/ndcg_at_10 = 0.19691981375217438
 eval_/next-item/ndcg_at_20 = 0.22496281564235687
 eval_/next-item/recall_at_10 = 0.3478583097457886
 eval_/next-item/recall_at_20 = 0.45858028531074524
 eval_runtime = 0.9089
 eval_samples_per_second = 16335.489
 eval_steps_per_second = 31.905
********************
Launch training for day 22 are:
********************



***** Running training *****
  Num examples = 122496
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 6380


Step,Training Loss
200,6.2254
400,6.1612
600,6.1714
800,6.1
1000,6.0386
1200,6.1032
1400,5.9911
1600,5.9571
1800,5.9927
2000,5.9348


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 23 are:	

********************

 eval_/loss = 6.540719509124756
 eval_/next-item/ndcg_at_10 = 0.1977466642856598
 eval_/next-item/ndcg_at_20 = 0.2259296476840973
 eval_/next-item/recall_at_10 = 0.3477236032485962
 eval_/next-item/recall_at_20 = 0.45932111144065857
 eval_runtime = 0.9183
 eval_samples_per_second = 16169.782
 eval_steps_per_second = 31.582
********************
Launch training for day 23 are:
********************



***** Running training *****
  Num examples = 120384
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 6270


Step,Training Loss
200,6.1955
400,6.1781
600,6.1505
800,6.0438
1000,6.0746
1200,6.0478
1400,5.9633
1600,5.9479
1800,5.9686
2000,5.8874


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 24 are:	

********************

 eval_/loss = 6.449972152709961
 eval_/next-item/ndcg_at_10 = 0.20032663643360138
 eval_/next-item/ndcg_at_20 = 0.22844848036766052
 eval_/next-item/recall_at_10 = 0.35380497574806213
 eval_/next-item/recall_at_20 = 0.46498844027519226
 eval_runtime = 0.8374
 eval_samples_per_second = 16507.756
 eval_steps_per_second = 32.242
********************
Launch training for day 24 are:
********************



***** Running training *****
  Num examples = 113280
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 5900


Step,Training Loss
200,6.1551
400,6.1485
600,6.0961
800,6.0447
1000,6.044
1200,6.0201
1400,5.9292
1600,5.9397
1800,5.9344
2000,5.863


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 25 are:	

********************

 eval_/loss = 6.529910087585449
 eval_/next-item/ndcg_at_10 = 0.19284354150295258
 eval_/next-item/ndcg_at_20 = 0.22051723301410675
 eval_/next-item/recall_at_10 = 0.3443359434604645
 eval_/next-item/recall_at_20 = 0.45423179864883423
 eval_runtime = 0.9525
 eval_samples_per_second = 16126.051
 eval_steps_per_second = 31.496
********************
Launch training for day 25 are:
********************



***** Running training *****
  Num examples = 125376
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 6530


Step,Training Loss
200,6.19
400,6.1655
600,6.1652
800,6.0549
1000,6.0228
1200,6.0468
1400,5.9731
1600,5.9341
1800,5.9458
2000,5.887


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 26 are:	

********************

 eval_/loss = 6.394448280334473
 eval_/next-item/ndcg_at_10 = 0.20124149322509766
 eval_/next-item/ndcg_at_20 = 0.22814559936523438
 eval_/next-item/recall_at_10 = 0.35293692350387573
 eval_/next-item/recall_at_20 = 0.45912906527519226
 eval_runtime = 0.8568
 eval_samples_per_second = 16134.56
 eval_steps_per_second = 31.513
********************
Launch training for day 26 are:
********************



***** Running training *****
  Num examples = 114432
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 5960


Step,Training Loss
200,6.053
400,6.0333
600,6.0324
800,5.961
1000,5.9159
1200,5.9212
1400,5.8414
1600,5.8677
1800,5.8216
2000,5.7984


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 27 are:	

********************

 eval_/loss = 6.301319599151611
 eval_/next-item/ndcg_at_10 = 0.20679794251918793
 eval_/next-item/ndcg_at_20 = 0.2346506118774414
 eval_/next-item/recall_at_10 = 0.36342594027519226
 eval_/next-item/recall_at_20 = 0.47345197200775146
 eval_runtime = 0.8364
 eval_samples_per_second = 16527.925
 eval_steps_per_second = 32.281
********************
Launch training for day 27 are:
********************



***** Running training *****
  Num examples = 115776
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 6030


Step,Training Loss
200,5.9916
400,6.0078
600,6.0109
800,5.8818
1000,5.9163
1200,5.8937
1400,5.8048
1600,5.7975
1800,5.8192
2000,5.7373


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 28 are:	

********************

 eval_/loss = 6.402892589569092
 eval_/next-item/ndcg_at_10 = 0.20498305559158325
 eval_/next-item/ndcg_at_20 = 0.23336003720760345
 eval_/next-item/recall_at_10 = 0.36215445399284363
 eval_/next-item/recall_at_20 = 0.47430890798568726
 eval_runtime = 0.8024
 eval_samples_per_second = 16590.321
 eval_steps_per_second = 32.403
********************
Launch training for day 28 are:
********************



***** Running training *****
  Num examples = 110016
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 5730


Step,Training Loss
200,6.0316
400,6.0691
600,6.0248
800,5.9446
1000,5.9266
1200,5.8932
1400,5.8465
1600,5.8561
1800,5.8217
2000,5.7655


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 29 are:	

********************

 eval_/loss = 6.384833812713623
 eval_/next-item/ndcg_at_10 = 0.20521201193332672
 eval_/next-item/ndcg_at_20 = 0.23246820271015167
 eval_/next-item/recall_at_10 = 0.3602343797683716
 eval_/next-item/recall_at_20 = 0.4678124785423279
 eval_runtime = 0.7791
 eval_samples_per_second = 16428.666
 eval_steps_per_second = 32.087
********************
Launch training for day 29 are:
********************



***** Running training *****
  Num examples = 107328
  Num Epochs = 10
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 5590


Step,Training Loss
200,6.0942
400,6.0755
600,6.0395
800,5.9815
1000,5.9542
1200,5.899
1400,5.8671
1600,5.877
1800,5.8197
2000,5.833


Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-2500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/bert/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to /content/drive/MyDrive/dataset_rees46/

********************
Eval results for day 30 are:	

********************

 eval_/loss = 6.447874546051025
 eval_/next-item/ndcg_at_10 = 0.20332306623458862
 eval_/next-item/ndcg_at_20 = 0.23116347193717957
 eval_/next-item/recall_at_10 = 0.35789060592651367
 eval_/next-item/recall_at_20 = 0.4682031273841858
 eval_runtime = 0.7713
 eval_samples_per_second = 16595.475
 eval_steps_per_second = 32.413
CPU times: user 2h 34min 7s, sys: 1min 43s, total: 2h 35min 50s
Wall time: 1h 17min 15s


In [None]:
# Print every metric returned by model.compute_metrics(), one "name: value" line each.
print("BERT results:")
bert_metrics = model.compute_metrics()
for metric_name in bert_metrics:
    # .item() is called on each metric value before formatting
    # (presumably a scalar tensor -> plain Python number; confirm against the library).
    print('%s: %s' % (metric_name, bert_metrics[metric_name].item()))

BERT results:
next-item/ndcg_at_10: 0.20332306623458862
next-item/ndcg_at_20: 0.23116347193717957
next-item/recall_at_10: 0.35789060592651367
next-item/recall_at_20: 0.4682031273841858


In [None]:
# Persist the metrics returned by model.compute_metrics() to a text file on Drive.
# NOTE(review): mode 'w' truncates results.txt on every run; if results from other
# models are meant to accumulate in this file, 'a' (append) is probably intended — confirm.
with open("/content/drive/MyDrive/dataset_rees46/results.txt", 'w') as f:
    f.write('\n')
    # The header names the model trained in this notebook (BERT), so the saved
    # numbers are not attributed to a different architecture.
    f.write('BERT - MLM:')
    f.write('\n')
    for key, value in model.compute_metrics().items():
        # One "metric_name value" line per metric; .item() is applied to each value
        # before formatting (presumably a scalar tensor — confirm against the library).
        f.write('%s %s\n' % (key, value.item()))