In [1]:
from transformers import RobertaConfig, RobertaForMaskedLM, Trainer, TrainingArguments, RobertaTokenizerFast, DataCollatorForLanguageModeling
import evaluate
from torch.cuda import is_available as cuda_available, is_bf16_supported
from tqdm import tqdm
import pickle

  from .autonotebook import tqdm as notebook_tqdm

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/maximos/miniconda3/envs/torch/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/maximos/miniconda3/envs/torch/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/home/maximos/miniconda3/envs/torch/lib/python3.12/site-packages

In [2]:
MAX_LENGTH = 1024

In [3]:
# Load the trained tokenizer
# tokenizer = RobertaTokenizerFast.from_pretrained('../data/chroma_tokenizer_1/' , max_len=MAX_LENGTH)
tokenizer = RobertaTokenizerFast.from_pretrained('../data/chroma_wordlevel_tokenizer/' , max_len=MAX_LENGTH)

# Data collator for MLM (will handle random masking)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

# Roberta-MED
model_config = RobertaConfig(
    vocab_size=tokenizer.vocab_size,
    hidden_size=512,
    num_hidden_layers=8,
    num_attention_heads=8,
    pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    mask_token_id=tokenizer.mask_token_id,
    max_position_embeddings=MAX_LENGTH,
)

model = RobertaForMaskedLM(model_config)

In [4]:
# Open and read the text file where each line is a new instance
with open('../data/chroma_accompaniment_sentences.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Remove trailing newline characters and strip extra spaces
sentences = [line.strip() for line in lines if line.strip()]

In [6]:
print(len(sentences))

159139


In [7]:
# Tokenize each sentence
tokenized_sentences = []
for sentence in tqdm(sentences[:10]):
    tokenized = tokenizer(
        sentence,
        max_length=MAX_LENGTH,
        truncation=True,
        return_overflowing_tokens=True,
        padding=False
    )
    
    # Collect all parts of the tokenized sentence
    input_ids = tokenized["input_ids"]
    attention_mask = tokenized['attention_mask']
    for i_segment in range(len(input_ids)):
        tokenized_sentences.append( {
            'input_ids': input_ids[i_segment],
            'attention_mask': attention_mask[i_segment]
            } )

100%|██████████| 10/10 [00:00<00:00, 90.71it/s]


In [4]:
print(len(tokenized_sentences))
print(tokenized_sentences)

NameError: name 'tokenized_sentences' is not defined

In [9]:
from sklearn.model_selection import train_test_split

# Split the dataset (80% training, 20% testing)
train_texts, test_texts = train_test_split(tokenized_sentences, test_size=0.2, random_state=42)

# Extract input_ids and attention_mask for training and testing datasets
train_input_ids = [example['input_ids'] for example in train_texts]
train_attention_mask = [example['attention_mask'] for example in train_texts]

test_input_ids = [example['input_ids'] for example in test_texts]
test_attention_mask = [example['attention_mask'] for example in test_texts]


In [21]:
print(train_texts)

[{'input_ids': [269, 26, 261, 67, 27, 261, 67, 29, 261, 67, 265, 264, 67, 20, 261, 67, 21, 261, 67, 22, 261, 67, 24, 261, 67, 26, 261, 67, 27, 261, 67, 29, 261, 67, 265, 264, 67, 21, 261, 67, 21, 261, 67, 22, 261, 67, 24, 261, 67, 26, 261, 67, 27, 261, 67, 29, 261, 67, 265, 264, 67, 22, 261, 67, 21, 261, 67, 22, 261, 67, 24, 261, 67, 26, 261, 67, 27, 261, 67, 29, 261, 67, 265, 264, 67, 23, 261, 67, 21, 261, 67, 22, 261, 67, 24, 261, 67, 26, 261, 67, 27, 261, 67, 29, 261, 67, 265, 264, 67, 24, 261, 67, 21, 261, 67, 22, 261, 67, 24, 261, 67, 26, 261, 67, 27, 261, 67, 29, 261, 67, 265, 264, 67, 25, 261, 67, 21, 261, 67, 22, 261, 67, 24, 261, 67, 26, 261, 67, 27, 261, 67, 29, 261, 67, 265, 264, 67, 26, 261, 67, 21, 261, 67, 22, 261, 67, 24, 261, 67, 26, 261, 67, 27, 261, 67, 29, 261, 67, 265, 264, 67, 27, 261, 67, 21, 261, 67, 22, 261, 67, 24, 261, 67, 26, 261, 67, 27, 261, 67, 29, 261, 67, 265, 264, 67, 20, 261, 67, 21, 261, 67, 22, 261, 67, 24, 261, 67, 26, 261, 67, 27, 261, 67, 29, 261,

In [10]:
import torch
from datasets import Dataset

# Create a Hugging Face Dataset from tokenized inputs
def create_dataset(input_ids, attention_mask):
    dataset_dict = {
        "input_ids": input_ids,
        "attention_mask": attention_mask
    }
    dataset = Dataset.from_dict(dataset_dict)
    return dataset

train_dataset = create_dataset(train_input_ids, train_attention_mask)
test_dataset = create_dataset(test_input_ids, test_attention_mask)

# Check the size of the datasets
print("Training dataset size:", len(train_dataset))
print("Testing dataset size:", len(test_dataset))


Training dataset size: 41
Testing dataset size: 11


In [4]:
with open('../data/chroma_dataset.pickle', "rb") as input_file:
    chroma_dataset = pickle.load(input_file)

In [7]:
print(chroma_dataset.keys())
train_dataset = chroma_dataset['train_dataset']
test_dataset = chroma_dataset['test_dataset']

dict_keys(['train_dataset', 'test_dataset'])


In [6]:
print(train_dataset)

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 709356
})


In [8]:
def preprocess_logits_for_metrics(logits, labels):
    if isinstance(logits, tuple):
        print(f"is indeed tuple")
        # Depending on the model and config, logits may contain extra tensors,
        # like past_key_values, but logits always come first
        logits = logits[0]
    # import pdb; pdb.set_trace()
    return logits.argmax(dim=-1)

metric = evaluate.load("accuracy")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # preds have the same shape as the labels, after the argmax(-1) has been calculated
    # by preprocess_logits_for_metrics
    labels = labels.reshape(-1)
    preds = preds.reshape(-1)
    mask = labels != -100
    labels = labels[mask]
    preds = preds[mask]
    return metric.compute(predictions=preds, references=labels)

In [10]:
# Trainer
USE_CUDA = cuda_available()
if not cuda_available():
    print(f"NO 16-floats used")
    FP16 = FP16_EVAL = BF16 = BF16_EVAL = False
elif is_bf16_supported():
    BF16 = BF16_EVAL = True
    FP16 = FP16_EVAL = False
    print(f"BF16_float used")
else:
    BF16 = BF16_EVAL = False
    FP16 = FP16_EVAL = True
    (f"FP16_float used")
print(f"Using CUDA as {USE_CUDA}")

training_config = TrainingArguments(
    output_dir="../data/chroma_mlm_medium_3e-4_100",
    overwrite_output_dir=False,
    do_train=True,
    do_eval=True,
    do_predict=False,
    evaluation_strategy="steps",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=8,  # for 1 device results in 512 bsz,
    eval_accumulation_steps=1,
    # gradient_accumulation_steps=8, # for 2 devices results in 1024 bsz,
    learning_rate=3e-4,
    weight_decay=0.01,
    max_grad_norm=0,
    warmup_steps=1300,
    log_level="debug",
    lr_scheduler_type="linear",
    logging_strategy="steps",
    max_steps=100*(673191//(64*8)),
    # num_train_epochs=10,
    logging_steps=16,
    logging_dir='./logs',
    save_strategy="steps",
    save_steps=500,
    save_total_limit=5,  # keeps 5 checkpoints only
    no_cuda=not USE_CUDA,
    seed=1993,
    fp16=FP16,
    fp16_full_eval=FP16_EVAL,
    bf16=BF16,
    bf16_full_eval=BF16_EVAL,
    load_best_model_at_end=True,
    report_to=["tensorboard"],
    gradient_checkpointing=False, # when True saves memory in the expense of slower backward
)

trainer = Trainer(
    model=model,
    args=training_config,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    callbacks=None
)

using `logging_steps` to initialize `eval_steps` to 250
PyTorch: setting up devices
max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend


BF16_float used
Using CUDA as True


In [11]:
train_result = trainer.train()

Currently training with a batch size of: 192
***** Running training *****
  Num examples = 709,356
  Num Epochs = 286
  Instantaneous batch size per device = 64
  Training with DataParallel so batch size has been adjusted to: 192
  Total train batch size (w. parallel, distributed & accumulation) = 1,536
  Gradient Accumulation steps = 8
  Total optimization steps = 131,400
  Number of trainable parameters = 34,419,226
