In [1]:
#Imports
from transformers import BertConfig, BertForMaskedLM, BertPreTrainedModel, BertModel, PreTrainedTokenizerFast, DataCollatorForLanguageModeling, BertPreTrainedModel
from transformers import Trainer, TrainingArguments

from packaging import version
import datasets
import torch.nn as nn
from tokenizers import Tokenizer
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
from pathlib import Path
import wandb
import time
import os
from typing import Any, Optional, Tuple, Union
from collections import OrderedDict
import pandas as pd
import pickle

from torch.nn import CrossEntropyLoss

from utils.NSP_source_code import *
from utils.computeMDE import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#Compatability is tuned on Chopin only
dataset_choice = 'Chopin43'

In [3]:
dataset_map = {"Chopin43": '_C', 
               "ChopinAndHannds": '_CH',
               "Maestro": '_M'}
Key = dataset_map[dataset_choice]
handConfigNumsMap = {"Chopin43": 110, 
               "ChopinAndHannds": 136,
               "Maestro": 12047}


#There is only a compat dataset for Maestro currently
MDEDir = './Extracted_Repns/MDE' + Key
NSPDir = './Datasets/NSP'

#### Parse the Dataset's text file

In [4]:
df = pd.read_csv('/home/mconati/ttmp/styletransfer/NSP/measures.txt')

In [5]:
with open(MDEDir +'/dict/handConf_dict', 'rb') as handle:
    hands = pickle.load(handle)

In [6]:
df.keys()

Index(['MEASURE', 'NEXT MEASURE', 'RANDOM MEASURE'], dtype='object')

# Modified huggingface code

Here are the three elements that I modified from Huggingface. Modified elements are commented

Modified BERT, the forward function is changed to calculate loss based only on NSP.

### It wasn't 100% clear if we should finetune this task on MLM+NSP objectives or just NSP(or maybe MLM+NSP then NSP). The following code is the implementation to finetune on just NSP, but uncommenting the MLM loss calculation code will add that objective back in

In [28]:
class BertForPreTraining(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.bert = BertModel(config)
        self.cls = BertPreTrainingHeads(config)

        # Initialize weights and apply final processing
        self.post_init()
        self.decoder = {value:key for key, value in config.decoder.items()}
        self.maskToken = config.decoder['[MASK]']
        self.unkToken = config.decoder['[UNK]']
        self.sepToken = config.decoder['[SEP]']
        self.padToken = config.decoder['[PAD]']
        self.clsToken = config.decoder['[CLS]']
        self.specialTokens = [self.maskToken, self.unkToken, self.sepToken, self.padToken, self.clsToken]

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings


    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        next_sentence_label: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], BertForPreTrainingOutput]:
        r"""
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
                config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked),
                the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
            next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                Labels for computing the next sequence prediction (classification) loss. Input should be a sequence
                pair (see `input_ids` docstring) Indices should be in `[0, 1]`:
                - 0 indicates sequence B is a continuation of sequence A,
                - 1 indicates sequence B is a random sequence.
            kwargs (`Dict[str, any]`, optional, defaults to *{}*):
                Used to hide legacy arguments that have been deprecated.
        Returns:
        Example:
        ```python
        >>> from transformers import BertTokenizer, BertForPreTraining
        >>> import torch
        >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        >>> model = BertForPreTraining.from_pretrained("bert-base-uncased")
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> prediction_logits = outputs.prediction_logits
        >>> seq_relationship_logits = outputs.seq_relationship_logits
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output, pooled_output = outputs[:2]
        pitch_p, octave_p, hand_p, seq_relationship_score = self.cls(sequence_output, pooled_output)
        
        masked_lm_loss = None
        
        #For calculating loss, decode the labels from MDE representation (ex 5s3s31) into three sequences
        #of pitch, hand, octave
#         if labels is not None:
#             octaves = []
#             pitches = []
#             handConfs = []
#             #Iterate through the batch
#             for x in labels:
#                 #For each sequence, make a list to store the octave, pitch, and handConf ids
#                 octave = []
#                 pitch = []
#                 handConf = []
#                 #Iterate through the sequence
#                 for y in x:
#                     #If the token is not a mask token, decode into the octave_pitch_handConf representation
#                     if y.item() != -100 and y.item() not in self.specialTokens:
#                         #Split on s
#                         code = [int(x) for x in self.decoder[y.item()].split('s')]
#                         #Add each element to the correct list
#                         octave.append(code[0])
#                         pitch.append(code[1])
#                         handConf.append(code[2])
#                     else:
#                         #Otherwise, make a representation from the mask token ie, -100, -100, -100
#                         octave.append(y.item())
#                         pitch.append(y.item())
#                         handConf.append(y.item())
#                 #Aggregate the samples in the batch
#                 octaves.append(octave)
#                 pitches.append(pitch)
#                 handConfs.append(handConf)
            
            
#       Loss is cross entropy
        loss_fct = CrossEntropyLoss()  # -100 index = padding token
            
#             device = input_ids.device
#             #Put the new labels on the gpu
#             octaves = torch.LongTensor(octaves).to(device)
#             pitches = torch.LongTensor(pitches).to(device)
#             handConfs = torch.LongTensor(handConfs).to(device)
            
#             #Calculate a loss for each
#             octave_loss = loss_fct(octave_p.view(-1, octave_p.shape[2]), octaves.view(-1))
#             pitch_loss = loss_fct(pitch_p.view(-1, pitch_p.shape[2]), pitches.view(-1))
#             hand_loss = loss_fct(hand_p.view(-1, hand_p.shape[2]), handConfs.view(-1))
            
#             #The returned loss is the sum of the three losses
#             masked_lm_loss = octave_loss + pitch_loss + hand_loss

        total_loss = 0# masked_lm_loss
        if next_sentence_label is not None:
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
            total_loss = next_sentence_loss# + masked_lm_loss
            
            

            

        if not return_dict:
            output = (prediction_scores, seq_relationship_score) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return BertForPreTrainingOutput(
            loss=total_loss,
            prediction_logits=[pitch_p, octave_p, hand_p],
            seq_relationship_logits=seq_relationship_score,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )



### Custom encoder

In [29]:
class CustomBertEmbeddings(nn.Module):
    """Construct the embeddings from octave, pitch, hand configuration, and position."""

    def __init__(self, config):
        super().__init__()
        #Get the mapping from token to encoded representation
        self.decoder = {value:key for key, value in config.decoder.items()}
        
        #Aggregate special tokens
        self.maskToken = config.decoder['[MASK]']
        self.unkToken = config.decoder['[UNK]']
        self.sepToken = config.decoder['[SEP]']
        self.padToken = config.decoder['[PAD]']
        self.clsToken = config.decoder['[CLS]']
        self.specialTokens = [self.maskToken, self.unkToken, self.sepToken, self.padToken, self.clsToken]
        
        #Declare embedding layers
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.pitch_embeddings = nn.Embedding(config.numPitches, config.hidden_size)
        self.handConfig_embeddings = nn.Embedding(config.numConfigs, config.hidden_size)
        self.octave_embeddings = nn.Embedding(config.numOctaves, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
        if version.parse(torch.__version__) > version.parse("1.6.0"):
            self.register_buffer(
                "token_type_ids",
                torch.zeros(self.position_ids.size(), dtype=torch.long),
                persistent=False,
            )

    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        #Custom code to use 3 embedding layers
        #Convert the tokenized MDE representation ie 9s2s55 to pitch=9 octave=2 hand=55 for all tokens in the batch
        octaves = []
        pitches = []
        handConfs = []
        #Iterate through the batch
        for x in input_ids:
            #For each sequence, make a list to store the octave, pitch, and handConf ids
            octave = []
            pitch = []
            handConf = []
            #Iterate through the sequence
            for y in x:
                #If the token is not a special token, decode into the octave_pitch_handConf representation
                if y.item() not in self.specialTokens:
                    #Split on s
                    try:
                        code = [int(x) for x in self.decoder[y.item()].split('s')]
                    except:
                        code = [x for x in self.decoder[y.item()].split('s')]
                        print(code)
                    #Add each element to the correct list
                    octave.append(code[0])
                    pitch.append(code[1])
                    handConf.append(code[2])
                else:
                    #Otherwise, make a representation from the special token. ie: a cls token(1) becomes 1_1_1
                    octave.append(y.item())
                    pitch.append(y.item())
                    handConf.append(y.item())
            #Aggregate the samples in the batch
            octaves.append(octave)
            pitches.append(pitch)
            handConfs.append(handConf)
            
        device = input_ids.device if input_ids is not None else inputs_embeds.device
        
        #Convert the lists to tensors and put them on the gpu
        octTensor = torch.LongTensor(octaves).to(device)
        pitchTensor = torch.LongTensor(pitches).to(device)
        handConfTensor = torch.LongTensor(handConfs).to(device)
        
        #Sum the three embeddings
        input_embeds = self.handConfig_embeddings(handConfTensor)\
                       +self.octave_embeddings(octTensor)\
                       +self.pitch_embeddings(pitchTensor)
        embeddings = input_embeds

        #Standard BertEmbeddings code
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]
        
        if position_ids is None:
            position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).expand(input_shape)
        position_embeddings = self.position_embeddings(position_ids)
        embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

### Custom Configuration

In order for the encoder and MaskedLM to access the dictionary between MDE representation and tokens, we need to pass that in the model's config

In [9]:
class CustomBertConfig(BertConfig):
    def __init__(self,**kwargs):
        super().__init__(**kwargs)
        #The decoder holds the conversion back to the coded representation for the customEmbeddings layer
        self.decoder = kwargs.get('decoder')
        self.numOctaves = 9
        self.numConfigs = handConfigNumsMap[dataset_choice]
        self.numPitches = 12

Setup training

In [10]:
wandb.login()
#1dd35d404a289e1e49f18069e4fe0a51d28d52c7

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmconati[0m (use `wandb login --relogin` to force relogin)


True

In [11]:
#Load the tokenizer
TOKENIZER_SAVEDIR = Path(MDEDir + '/tokenizer')
LM_MODEL_SAVEDIR = Path(MDEDir + '/model/NSP')
Path(LM_MODEL_SAVEDIR).mkdir(exist_ok=True)

In [12]:
from transformers import BertTokenizer

Load the tokenizer

In [13]:
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_SAVEDIR)
tokenizer

PreTrainedTokenizer(name_or_path='Extracted_Repns/MDE_C/tokenizer', vocab_size=1605, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

Create the Sentence pairings, and save them as a file to be read by the dataset creation code. This can be commented out if ian inputs.pickle exists

In [30]:
sentence_a = []
sentence_b = []
label = []

for idx, measure in enumerate(df['MEASURE']):
    
    #If they are consecutive add a 0 to the labels
    sentence_a.append(computeMDE(NSPDir + '/' + df['MEASURE'][idx], hands))
    sentence_b.append(computeMDE(NSPDir + '/' + df['NEXT MEASURE'][idx], hands))
    label.append(0)
    
    #If not add a 1
    sentence_a.append(computeMDE(NSPDir + '/' + df['MEASURE'][idx], hands))
    sentence_b.append(computeMDE(NSPDir + '/' + df['RANDOM MEASURE'][idx], hands))
    label.append(1)

inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt', max_length=64, truncation=True, padding='max_length')
inputs['labels'] = torch.LongTensor([label]).T
with open (NSPDir + '/inputs.pickle', 'wb') as handle:
    pickle.dump(inputs, handle, protocol=pickle.HIGHEST_PROTOCOL)

Validate that inputs look as expected

In [31]:
with open(NSPDir + '/inputs.pickle', 'rb') as handle:
    inputs = pickle.load(handle)
inputs

{'input_ids': tensor([[  1, 267, 312,  ...,   3,   3,   3],
        [  1, 267, 312,  ...,   3,   3,   3],
        [  1, 501, 547,  ...,   3,   3,   3],
        ...,
        [  1,  50,  29,  ...,   3,   3,   3],
        [  1,  44,   0,  ...,   3,   3,   3],
        [  1,  44,   0,  ...,   3,   3,   3]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[0],
        [1],
        [0],
        ...,
        [1],
        [0],
        [1]])}

In [17]:
inputs.labels[:10]

tensor([[0],
        [1],
        [0],
        [1],
        [0],
        [1],
        [0],
        [1],
        [0],
        [1]])

Load the inputs pickle into a dataset form

In [18]:
class NSPDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        returnee = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        returnee['next_sentence_label'] = returnee.pop('labels')
        return returnee
    def __len__(self):
        return len(self.encodings.input_ids)


In [19]:
dataset = NSPDataset(inputs)
dataset[0]

  """


{'input_ids': tensor([  1, 267, 312, 267, 312,   2, 267, 312, 267, 312,   2,   3,   3,   3,
           3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
           3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
           3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
           3,   3,   3,   3,   3,   3,   3,   3]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'next_sentence_label': tensor([0])}

Using a data collator for language modeling, but the MLM objective is disabled in the forward function. This can be easily reenabled by changing the forward function and mlm_probability

In [20]:
#BERT training code basically copied from the Huggingface Esperanto Tutorial from here on out
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.0
)

In [21]:
config = CustomBertConfig(
    #The decoder holds the conversion back to the coded representation for the customEmbeddings layer
    decoder = tokenizer.vocab,
    vocab_size=len(tokenizer.vocab),
)
len(tokenizer.vocab)

1605

In [22]:
#Create a standard BERT model
model = BertForPreTraining(config=config)
print('Num parameters:', model.num_parameters())


#Create a custom embedding class
temp = CustomBertEmbeddings(config)
#Replace the model's embedding layer
model.bert.embeddings = temp


#As a sanity check, make sure that the custom embedding layers exist
model.bert.embeddings.handConfig_embeddings.weight
print('Num parameters:', model.num_parameters())

Num parameters: 87968389
Num parameters: 86834821


In [23]:
NUM_EPOCHS = 100
BATCH_SIZE = 128

In [24]:
training_args = TrainingArguments(
    output_dir=LM_MODEL_SAVEDIR,
    overwrite_output_dir=True,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    save_steps=10,
    logging_steps=5,
    evaluation_strategy="steps",
    eval_steps=5,
    save_total_limit=1,
    prediction_loss_only=False,
    report_to="wandb",
    load_best_model_at_end = True
)

In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    eval_dataset=dataset,
)
trainer.args

TrainingArguments(
_n_gpu=2,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_steps=5,
evaluation_strategy=IntervalStrategy.STEPS,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=False,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=True,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=Extracted_Repns/MDE_C/m

In [26]:
trainer.train()

***** Running training *****
  Num examples = 1612
  Num Epochs = 100
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 1
  Total optimization steps = 700
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


  """


Step,Training Loss,Validation Loss
5,1.0962,0.689439
10,0.7674,0.690972
15,0.6986,0.693431
20,0.6933,0.690141
25,0.7059,0.689278
30,0.6926,0.682641
35,0.6819,0.67765
40,0.6757,0.666909
45,0.6791,0.667661
50,0.6705,0.650907


***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
Saving model checkpoint to Extracted_Repns/MDE_C/model/NSP/checkpoint-10
Configuration saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-10/config.json
Model weights saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-10/pytorch_model.bin
Deleting older checkpoint [Extracted_Repns/MDE_C/model/NSP/checkpoint-50] due to args.save_total_limit
  """
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
Saving model checkpoint to Extracted_Repns/MDE_C/model/NSP/checkpoint-20
Configuration saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-20/config.json
Model weights saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-20/pytorch_model.bin
Deleting older checkpoint [Extracted_Repns/MDE_C/model/NSP/checkpoint-10] due to args.save_total_limit
  """
***** Running Ev

***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
Saving model checkpoint to Extracted_Repns/MDE_C/model/NSP/checkpoint-90
Configuration saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-90/config.json
Model weights saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-90/pytorch_model.bin
Deleting older checkpoint [Extracted_Repns/MDE_C/model/NSP/checkpoint-80] due to args.save_total_limit
  """
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
Saving model checkpoint to Extracted_Repns/MDE_C/model/NSP/checkpoint-100
Configuration saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-100/config.json
Model weights saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-100/pytorch_model.bin
Deleting older checkpoint [Extracted_Repns/MDE_C/model/NSP/checkpoint-90] due to args.save_total_limit
  """
***** Running

Model weights saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-160/pytorch_model.bin
Deleting older checkpoint [Extracted_Repns/MDE_C/model/NSP/checkpoint-150] due to args.save_total_limit
  """
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
Saving model checkpoint to Extracted_Repns/MDE_C/model/NSP/checkpoint-170
Configuration saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-170/config.json
Model weights saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-170/pytorch_model.bin
Deleting older checkpoint [Extracted_Repns/MDE_C/model/NSP/checkpoint-160] due to args.save_total_limit
  """
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
Saving model checkpoint to Extracted_Repns/MDE_C/model/NSP/checkpoint-180
Configuration saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-180/config.json
Model we

***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
Saving model checkpoint to Extracted_Repns/MDE_C/model/NSP/checkpoint-240
Configuration saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-240/config.json
Model weights saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-240/pytorch_model.bin
Deleting older checkpoint [Extracted_Repns/MDE_C/model/NSP/checkpoint-230] due to args.save_total_limit
  """
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
Saving model checkpoint to Extracted_Repns/MDE_C/model/NSP/checkpoint-250
Configuration saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-250/config.json
Model weights saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-250/pytorch_model.bin
  """
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
***** Running Evaluation *****
  Num exam

***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
Saving model checkpoint to Extracted_Repns/MDE_C/model/NSP/checkpoint-320
Configuration saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-320/config.json
Model weights saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-320/pytorch_model.bin
Deleting older checkpoint [Extracted_Repns/MDE_C/model/NSP/checkpoint-310] due to args.save_total_limit
  """
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
Saving model checkpoint to Extracted_Repns/MDE_C/model/NSP/checkpoint-330
Configuration saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-330/config.json
Model weights saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-330/pytorch_model.bin
  """
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
***** Running Evaluation *****
  Num exam

Model weights saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-390/pytorch_model.bin
Deleting older checkpoint [Extracted_Repns/MDE_C/model/NSP/checkpoint-380] due to args.save_total_limit
  """
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
Saving model checkpoint to Extracted_Repns/MDE_C/model/NSP/checkpoint-400
Configuration saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-400/config.json
Model weights saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-400/pytorch_model.bin
  """
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
Saving model checkpoint to Extracted_Repns/MDE_C/model/NSP/checkpoint-410
Configuration saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-410/config.json
Model weights saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-410/pytorch_model.bin
Deleting older checkpoin

***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
Saving model checkpoint to Extracted_Repns/MDE_C/model/NSP/checkpoint-470
Configuration saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-470/config.json
Model weights saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-470/pytorch_model.bin
Deleting older checkpoint [Extracted_Repns/MDE_C/model/NSP/checkpoint-440] due to args.save_total_limit
Deleting older checkpoint [Extracted_Repns/MDE_C/model/NSP/checkpoint-460] due to args.save_total_limit
  """
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
Saving model checkpoint to Extracted_Repns/MDE_C/model/NSP/checkpoint-480
Configuration saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-480/config.json
Model weights saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-480/pytorch_model.bin
Deleting older

***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
Saving model checkpoint to Extracted_Repns/MDE_C/model/NSP/checkpoint-550
Configuration saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-550/config.json
Model weights saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-550/pytorch_model.bin
Deleting older checkpoint [Extracted_Repns/MDE_C/model/NSP/checkpoint-520] due to args.save_total_limit
Deleting older checkpoint [Extracted_Repns/MDE_C/model/NSP/checkpoint-540] due to args.save_total_limit
  """
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
Saving model checkpoint to Extracted_Repns/MDE_C/model/NSP/checkpoint-560
Configuration saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-560/config.json
Model weights saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-560/pytorch_model.bin
  """
***** Ru

Model weights saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-620/pytorch_model.bin
Deleting older checkpoint [Extracted_Repns/MDE_C/model/NSP/checkpoint-600] due to args.save_total_limit
Deleting older checkpoint [Extracted_Repns/MDE_C/model/NSP/checkpoint-610] due to args.save_total_limit
  """
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
Saving model checkpoint to Extracted_Repns/MDE_C/model/NSP/checkpoint-630
Configuration saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-630/config.json
Model weights saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-630/pytorch_model.bin
Deleting older checkpoint [Extracted_Repns/MDE_C/model/NSP/checkpoint-620] due to args.save_total_limit
  """
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
Saving model checkpoint to Extracted_Repns/MDE_C/model/NSP/c

***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1612
  Batch size = 16
Saving model checkpoint to Extracted_Repns/MDE_C/model/NSP/checkpoint-700
Configuration saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-700/config.json
Model weights saved in Extracted_Repns/MDE_C/model/NSP/checkpoint-700/pytorch_model.bin
Deleting older checkpoint [Extracted_Repns/MDE_C/model/NSP/checkpoint-690] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from Extracted_Repns/MDE_C/model/NSP/checkpoint-660 (score: 0.024689989164471626).


TrainOutput(global_step=700, training_loss=0.22964170221771513, metrics={'train_runtime': 1434.6605, 'train_samples_per_second': 112.361, 'train_steps_per_second': 0.488, 'total_flos': 5344576787097600.0, 'train_loss': 0.22964170221771513, 'epoch': 100.0})

# I'm doubtful of how low the validation loss goes on this training. It needs further investigation.