In [None]:
# Install optimum-neuron from source. `%pip` (rather than `!pip`) installs into the
# environment of the running kernel instead of whichever `pip` is first on PATH.
# NOTE(review): this tracks the repository's default branch; append `@<tag-or-commit>`
# to the URL to pin a known-good revision for reproducible runs.
%pip install git+https://github.com/huggingface/optimum-neuron.git

In [1]:
# List the Neuron devices on this instance (device index, core count, memory, PCI BDF).
!neuron-ls

instance-type: trn1.2xlarge
instance-id: i-0570615e41700a481
+--------+--------+--------+---------+
| NEURON | NEURON | NEURON |   PCI   |
| DEVICE | CORES  | MEMORY |   BDF   |
+--------+--------+--------+---------+
| 0      | 2      | 32 GB  | 00:1e.0 |
+--------+--------+--------+---------+


## Preprocess dataset

In [2]:
import os
from random import Random, randrange  # randrange kept in scope for other cells

from datasets import load_dataset
from transformers import AutoTokenizer

# Dataset id from huggingface.co/datasets
dataset_id = "banking77"
# Model id to load the tokenizer
# NOTE(review): the training cells pass `--model_id bert-large-uncased` to scripts/train.py;
# presumably both checkpoints share the same uncased vocabulary -- confirm.
model_id = "bert-base-uncased"
# Directory the tokenized splits are written to
save_dataset_path = "dataset"
# Fixed seed so the example printed below is the same on every Restart & Run All
sample_seed = 42


# Load raw dataset
raw_dataset = load_dataset(dataset_id)

print(f"Train dataset size: {len(raw_dataset['train'])}")
print(f"Test dataset size: {len(raw_dataset['test'])}")

# Train dataset size: 10003
# Test dataset size: 3080

# Print one training example. A private Random instance is used so the pick is
# reproducible without reseeding the global random state.
random_id = Random(sample_seed).randrange(len(raw_dataset['train']))
print(raw_dataset['train'][random_id])
# e.g. {'text': 'How can I change my PIN without going to the bank?', 'label': 21}


# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Batch tokenization helper used with `Dataset.map(..., batched=True)`
def tokenize(batch):
    """Tokenize the ``text`` column of a batch with the module-level ``tokenizer``.

    Args:
        batch: mapping that provides the raw strings under the ``'text'`` key.

    Returns:
        Whatever ``tokenizer`` returns for the batch, requested as PyTorch
        tensors (``return_tensors="pt"``), with truncation enabled and padding
        set to ``'max_length'``. No explicit ``max_length`` is passed, so the
        padded length is presumably the tokenizer's own model maximum -- confirm.
    """
    encoded = tokenizer(
        batch['text'],
        padding='max_length',
        truncation=True,
        return_tensors="pt",
    )
    return encoded

# The Trainer expects the target column to be called "labels"
raw_dataset = raw_dataset.rename_column("label", "labels")

# Tokenize every split, drop the raw text column, and expose the result as torch tensors
tokenized_dataset = (
    raw_dataset
    .map(tokenize, batched=True, remove_columns=["text"])
    .with_format("torch")
)

print(tokenized_dataset["train"].features.keys())
# dict_keys(['labels', 'input_ids', 'token_type_ids', 'attention_mask'])

# Persist the splits to disk; the "test" split is stored under the "eval" directory
for split_name, out_subdir in (("train", "train"), ("test", "eval")):
    tokenized_dataset[split_name].save_to_disk(os.path.join(save_dataset_path, out_subdir))

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset banking77 (/home/ubuntu/.cache/huggingface/datasets/banking77/default/1.1.0/ff44c4421d7e70aa810b0fa79d36908a38b87aff8125d002cd44f7fcd31f493c)
100%|██████████| 2/2 [00:00<00:00, 130.36it/s]
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/banking77/default/1.1.0/ff44c4421d7e70aa810b0fa79d36908a38b87aff8125d002cd44f7fcd31f493c/cache-084cb9babe899b20.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/banking77/default/1.1.0/ff44c4421d7e70aa810b0fa79d36908a38b87aff8125d002cd44f7fcd31f493c/cache-5f7f794fe0ef7b57.arrow


Train dataset size: 10003
Test dataset size: 3080
{'text': 'Can I change my address?', 'label': 30}
dict_keys(['labels', 'input_ids', 'token_type_ids', 'attention_mask'])


                                                                                                

## Precompile

In [3]:
# Run scripts/train.py under neuron_parallel_compile. Per its log this is a trial run
# whose generated outputs (loss, checkpoints) are ignored.
# NOTE(review): the arguments are identical to the real training command; presumably
# they must stay in sync so the precompiled graphs are reused -- confirm.
!neuron_parallel_compile python3 scripts/train.py --model_id bert-large-uncased --per_device_train_batch_size 8 

2023-03-07 13:08:19.000831: INFO ||PARALLEL_COMPILE||: Removing existing workdir /tmp/parallel_compile_workdir
2023-03-07 13:08:19.000858: INFO ||PARALLEL_COMPILE||: Running trial run (add option to terminate trial run early; also ignore trial run's generated outputs, i.e. loss, checkpoints)
is precompilation: 1
Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if

## Train

In [None]:
# Launch the actual training run: same script, model id and per-device batch size
# as the precompile command.
!python3 scripts/train.py --model_id bert-large-uncased --per_device_train_batch_size 8 