In [1]:
# RUN 3 lines below in a seperate cell in Google Colab
!pip install transformers tokenizers wandb huggingface_hub datasets datetime nvidia-ml-py3
from huggingface_hub import notebook_login
notebook_login()
hf_repo = "misnaej/the-jam-machine-elec-famil"

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


In [7]:
# RUN 3 lines below in a separate cell in Google Colab
# !pip install transformers tokenizers wandb huggingface_hub datasets datetime nvidia-ml-py3
# from huggingface_hub import notebook_login
# notebook_login()

import os
from pathlib import Path
from transformers import (
    PreTrainedTokenizerFast,
    DataCollatorForLanguageModeling,
    GPT2Config,
    GPT2LMHeadModel,
    TrainingArguments,
    Trainer,
)
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.trainers import WordLevelTrainer
from datetime import datetime
import wandb
from datasets import load_dataset
from pynvml import *

# CONFIG: values a reader is expected to tune for a training run.

# Checkpoint directory handed to trainer.train(); None trains from scratch.
# Example: checkpoint-80000
TRAIN_FROM_CHECKPOINT = "/content/gdrive/MyDrive/the_jam_machine/model_2048_elec_familiarised/checkpoint-15000"

# Number of passes over the training split.
TRAIN_EPOCHS = 5

# Samples per device per step, and how many steps are accumulated before an
# optimizer update.
PER_DEVICE_TRAIN_BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 2

# Interval (in steps) used for evaluation and logging; checkpoints are saved
# every EVAL_STEPS * 5 steps.
EVAL_STEPS = 1000

"""Set paths either from Google Drive or locally"""
# Timestamp that makes the model directory unique for this run.
formattedtime = datetime.now().strftime("%d-%m__%H-%M-%S")
try:
    from google.colab import drive
except ImportError:
    # Not running in Colab: use paths relative to the working directory.
    dataset_path = "./midi_encoded"
    model_path = f"./models/model_{formattedtime}"
else:
    # Only the import decides Colab-vs-local. A failure in wandb.init or
    # drive.mount now surfaces instead of silently switching to local paths.
    wandb.init(project="the-jammy-machine", resume=True)
    drive.mount("/content/gdrive")
    drive_path = "/content/gdrive/MyDrive/the_jam_machine"
    dataset_path = f"{drive_path}/data_familiarised"
    model_path = f"{drive_path}/model_{formattedtime}"
tokenizer_path = f"{model_path}/tokenizer.json"
# makedirs also creates missing parents (e.g. ./models for the local path)
# and does not fail when the directory already exists.
os.makedirs(model_path, exist_ok=True)

"""Load dataset from gzip files"""
def _load_split(split_name, pattern):
    """Load the files matching `pattern` under dataset_path as one split.

    Note that the patterns used below select `.zip` archives.
    """
    return load_dataset(dataset_path, data_files={split_name: pattern})[split_name]


train_data = _load_split("train", "train/*.zip")
validate_data = _load_split("val", "validate/*.zip")

# TODO: Move tokenizer logic to encoder and use its json here only.
"""Get tokenizer from scratch or saved tokenizer.json"""
# NOTE(review): tokenizer_path lives inside the timestamped model_path, so a new
# run presumably never finds an existing file and always retrains. Confirm
# the resulting vocabulary matches any checkpoint that is being resumed.
if not Path(tokenizer_path).is_file():
    # Word-level model: whitespace-separated tokens, unknown tokens map to [UNK].
    tokenizer_trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[MASK]"])
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = WhitespaceSplit()
    tokenizer.train_from_iterator(train_data["text"], trainer=tokenizer_trainer)
    tokenizer.save(tokenizer_path)

# Always go through the saved json so both paths end up with the same wrapper.
tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
print("Vocabulary size: ", tokenizer.vocab_size)


def tokenize(data, max_length=2048):
    """Tokenize the "text" entry of a batch with the module-level tokenizer.

    Args:
        data: mapping whose "text" entry is passed to the tokenizer (with
            `Dataset.map(..., batched=True)` this is a batch of rows).
        max_length: truncation length forwarded to the tokenizer. The default
            of 2048 equals the `n_positions` value used for the model config.

    Returns:
        The tokenizer's output for the batch, truncated to `max_length` and
        padded (`padding=True`).
    """
    return tokenizer(
        data["text"],
        truncation=True,
        padding=True,
        max_length=max_length,
    )


# Tokenize both splits batch-wise (train first, then validation); the raw
# "text" column is removed from the result.
train_data_tokenized, validate_data_tokenized = (
    split.map(tokenize, batched=True, remove_columns=["text"])
    for split in (train_data, validate_data)
)

"""Make sure the tokenized dataset structure is correct and check a few examples"""
# The first tokenized row must expose "input_ids"; on failure the assertion
# message lists the columns that are actually present.
first_row_columns = list(train_data_tokenized[0])
assert "input_ids" in first_row_columns, first_row_columns

# Slice the rows before selecting the column so only the previewed examples are
# materialised, not the whole "text" column.
for i, data in enumerate(train_data[:3]["text"]):
    print("----")
    print(data)
    print(train_data_tokenized[i]["input_ids"])


"""Create model and trainer"""
# GPT-2 architecture built from a config (no pretrained weights are loaded here).
# n_positions equals the max_length used when tokenizing.
model_config = GPT2Config(
    n_layer=6,
    n_head=8,
    n_embd=512,
    n_positions=2048,
    vocab_size=tokenizer.vocab_size,
    pad_token_id=tokenizer.pad_token_id,
)
model = GPT2LMHeadModel(model_config)
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir=model_path,
    overwrite_output_dir=True,
    logging_dir=os.path.join(model_path, "logs"),
    # Schedule and optimisation.
    num_train_epochs=TRAIN_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    warmup_steps=1_000,
    weight_decay=0.1,
    fp16=True,
    seed=42,
    # Evaluate and log every EVAL_STEPS steps; save every EVAL_STEPS * 5 steps.
    evaluation_strategy="steps",
    eval_steps=EVAL_STEPS,
    logging_steps=EVAL_STEPS,
    logging_first_step=True,
    save_strategy="steps",
    save_steps=EVAL_STEPS * 5,
    save_total_limit=5,
    # Reporting and Hugging Face Hub settings; hf_repo is assigned in the
    # install/login cell.
    report_to="wandb",
    push_to_hub=True,
    hub_strategy="end",
    hub_model_id=hf_repo,
)
# mlm=False: the collator prepares batches for causal (not masked) language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data_tokenized,
    eval_dataset=validate_data_tokenized,
    data_collator=data_collator,
)

"""Train the model from scratch or from checkpoint"""
# resume_from_checkpoint is the first parameter of Trainer.train and defaults to
# None, so a single call covers both "from scratch" and "from checkpoint".
result = trainer.train(resume_from_checkpoint=TRAIN_FROM_CHECKPOINT)

print("Training finished")
print(result)

"""Save the tokenizer, latest status of trained model and push it to hugging face."""
# The two calls below only write to model_path; the explicit push is done by
# trainer.push_to_hub in the following cell.
tokenizer.save_pretrained(model_path)
model.save_pretrained(model_path)




Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]

Assigning [PAD] to the pad_token key of the tokenizer


Vocabulary size:  285


  0%|          | 0/1 [00:00<?, ?ba/s]

----
PIECE_START TRACK_START INST=8 DENSITY=3 BAR_START BAR_END BAR_START TIME_DELTA=2 NOTE_ON=67 TIME_DELTA=1 NOTE_OFF=67 TIME_DELTA=1 NOTE_ON=64 TIME_DELTA=3 NOTE_OFF=64 TIME_DELTA=1 NOTE_ON=67 TIME_DELTA=1 NOTE_OFF=67 TIME_DELTA=1 NOTE_ON=64 TIME_DELTA=1 NOTE_OFF=64 TIME_DELTA=1 NOTE_ON=62 TIME_DELTA=1 NOTE_OFF=62 TIME_DELTA=1 NOTE_ON=64 TIME_DELTA=1 NOTE_OFF=64 BAR_END BAR_START TIME_DELTA=2 NOTE_ON=64 TIME_DELTA=1 NOTE_OFF=64 TIME_DELTA=1 NOTE_ON=64 TIME_DELTA=1 NOTE_OFF=64 TIME_DELTA=1 NOTE_ON=59 TIME_DELTA=1 NOTE_OFF=59 TIME_DELTA=1 NOTE_ON=64 TIME_DELTA=5 NOTE_OFF=64 BAR_END BAR_START TIME_DELTA=2 NOTE_ON=67 TIME_DELTA=1 NOTE_OFF=67 TIME_DELTA=1 NOTE_ON=64 TIME_DELTA=3 NOTE_OFF=64 TIME_DELTA=1 NOTE_ON=67 TIME_DELTA=1 NOTE_OFF=67 TIME_DELTA=1 NOTE_ON=64 TIME_DELTA=1 NOTE_OFF=64 TIME_DELTA=1 NOTE_ON=62 TIME_DELTA=1 NOTE_OFF=62 TIME_DELTA=1 NOTE_ON=64 TIME_DELTA=1 NOTE_OFF=64 BAR_END BAR_START TIME_DELTA=2 NOTE_ON=64 TIME_DELTA=1 NOTE_OFF=64 TIME_DELTA=1 NOTE_ON=64 TIME_DELTA=1 NO

PyTorch: setting up devices
Cloning https://huggingface.co/misnaej/the-jam-machine-elec-famil into local empty directory.
Using cuda_amp half precision backend
Loading model from /content/gdrive/MyDrive/the_jam_machine/model_2048_elec_familiarised/checkpoint-15000.
***** Running training *****
  Num examples = 33721
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 21075
  Number of trainable parameters = 20109824
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 3
  Continuing training from global step 15000
  Will skip the first 3 epochs then the first 4710 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/4710 [00:00<?, ?it/s]

Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
16000,0.2358,0.883431
17000,0.2278,0.894813
18000,0.2115,0.915353
19000,0.2038,0.927088
20000,0.1993,0.931696
21000,0.2003,0.932842


***** Running Evaluation *****
  Num examples = 386
  Batch size = 8
***** Running Evaluation *****
  Num examples = 386
  Batch size = 8
***** Running Evaluation *****
  Num examples = 386
  Batch size = 8
***** Running Evaluation *****
  Num examples = 386
  Batch size = 8
***** Running Evaluation *****
  Num examples = 386
  Batch size = 8
Saving model checkpoint to /content/gdrive/MyDrive/the_jam_machine/model_08-12__20-34-42/checkpoint-20000
Configuration saved in /content/gdrive/MyDrive/the_jam_machine/model_08-12__20-34-42/checkpoint-20000/config.json
Model weights saved in /content/gdrive/MyDrive/the_jam_machine/model_08-12__20-34-42/checkpoint-20000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 386
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


tokenizer config file saved in /content/gdrive/MyDrive/the_jam_machine/model_08-12__20-34-42/tokenizer_config.json
Special tokens file saved in /content/gdrive/

Training finished
TrainOutput(global_step=21075, training_loss=0.061333450778947605, metrics={'train_runtime': 7640.7425, 'train_samples_per_second': 22.067, 'train_steps_per_second': 2.758, 'total_flos': 3.918026771708314e+16, 'train_loss': 0.061333450778947605, 'epoch': 5.0})


Model weights saved in /content/gdrive/MyDrive/the_jam_machine/model_08-12__20-34-42/pytorch_model.bin
Saving model checkpoint to /content/gdrive/MyDrive/the_jam_machine/model_08-12__20-34-42
Configuration saved in /content/gdrive/MyDrive/the_jam_machine/model_08-12__20-34-42/config.json
Model weights saved in /content/gdrive/MyDrive/the_jam_machine/model_08-12__20-34-42/pytorch_model.bin


KeyboardInterrupt: ignored

In [8]:
# The target repository is already configured through `hub_model_id=hf_repo` in
# the TrainingArguments. The first positional parameter of Trainer.push_to_hub
# is the commit message, so passing hf_repo there only mislabels the commit.
# I think this does not work from a gdrive folder - it needs to be in the collab hard drive probably
trainer.push_to_hub()

Saving model checkpoint to /content/gdrive/MyDrive/the_jam_machine/model_08-12__20-34-42
Configuration saved in /content/gdrive/MyDrive/the_jam_machine/model_08-12__20-34-42/config.json
Model weights saved in /content/gdrive/MyDrive/the_jam_machine/model_08-12__20-34-42/pytorch_model.bin
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


KeyboardInterrupt: ignored