In [None]:
%%capture
!pip install sagemaker
!pip install transformers
!pip install datasets

In [None]:
from fastformer import FastformerForCausalLM, FastformerLMConfig
from transformers import TrainingArguments, Trainer, default_data_collator
from datasets import load_dataset

from itertools import chain

In [None]:
import sagemaker
# SageMaker session; used further down to look up the default S3 bucket.
sess = sagemaker.Session()
# Execution role that is handed to the HuggingFace estimator (`role=role`).
role = sagemaker.get_execution_role()

In [None]:
# Load the pretrained "t5-small" tokenizer; `tokenize_function` below relies on
# this module-level `tokenizer` object.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-small")

In [None]:
# Load the raw WikiText-103 dataset.
raw_datasets = load_dataset("wikitext", "wikitext-103-raw-v1")
# Column names of the train split; these columns are dropped after tokenization.
column_names = raw_datasets["train"].column_names
# Prefer a column literally named "text"; otherwise fall back to the first column.
text_column_name = "text" if "text" in column_names else column_names[0]
# Number of tokens per training example produced by `group_texts`.
block_size = 1024

def tokenize_function(examples):
    """Tokenize the text column of a batch of examples.

    Args:
        examples: Batch mapping column names to lists of values; only the
            column named by the module-level `text_column_name` is read.

    Returns:
        Whatever the module-level `tokenizer` returns for that column.
    """
    return tokenizer(examples[text_column_name])

def group_texts(examples):
    """Concatenate a batch of tokenized sequences and re-split into fixed-size blocks.

    Args:
        examples: Batch mapping each feature name to a list of sequences.
            Must contain an "input_ids" feature.

    Returns:
        A dict with the same features, each split into chunks of the
        module-level `block_size`, plus a "labels" feature that is a shallow
        copy of the chunked "input_ids".
    """
    # Flatten every feature into one long sequence.
    flattened = {}
    for feature in examples.keys():
        flattened[feature] = list(chain.from_iterable(examples[feature]))

    first_feature = list(examples.keys())[0]
    usable_length = len(flattened[first_feature])

    # Drop the trailing remainder so every chunk is exactly `block_size` long.
    # When the whole batch is shorter than one block, it is kept as a single
    # short chunk instead of being discarded.
    if usable_length >= block_size:
        usable_length -= usable_length % block_size

    # Cut each flattened feature at the same block boundaries.
    chunked = {}
    for feature, tokens in flattened.items():
        chunked[feature] = [
            tokens[start : start + block_size]
            for start in range(0, usable_length, block_size)
        ]

    # Causal LM: the labels are the inputs themselves.
    chunked["labels"] = chunked["input_ids"].copy()
    return chunked

# Step 1: tokenize every split, dropping the original raw columns.
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=column_names,
    num_proc=2,
    desc="tokenization",
)

# Step 2: pack the tokenized sequences into fixed-length blocks with labels.
lm_dataset = tokenized_datasets.map(
    group_texts,
    batched=True,
    num_proc=2,
    desc=f"Grouping texts in chunks of {block_size}",
)

# Return PyTorch tensors when the dataset is indexed.
lm_dataset.set_format('pt')

In [None]:
# NOTE(review): `botocore` is imported but not referenced anywhere in this cell.
import botocore
from datasets.filesystems import S3FileSystem

# S3 filesystem handle used as the destination for `save_to_disk`.
s3 = S3FileSystem()
# Key prefix inside the session's default bucket.
s3_prefix = "wikitext103"

# Save the processed dataset to S3.
# NOTE(review): the path ends in "/train", but the object written is the whole
# `lm_dataset` (derived from every split of `raw_datasets`), not only the
# train split -- confirm this is what the training script expects.
training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train'
lm_dataset.save_to_disk(training_input_path,fs=s3)

In [None]:
from sagemaker.huggingface import HuggingFace

# Hyperparameters, which are passed into the training job.
hyperparameters = {
    'epochs': 3,
    'train_batch_size': 32,
    'eval_batch_size': 64,
    'learning_rate': 1e-3,
    'subset': 200,
    # The comma after this entry was missing, which made the whole cell a
    # SyntaxError.
    'output_data_dir': "/results/outputs",
    'model_dir': "/results/models",
}

In [None]:
# Enable SageMaker's distributed data parallel library for the training job.
distribution = {'smdistributed': {'dataparallel': {'enabled': True}}}

# Estimator describing the training job: which script to run, on what
# hardware, and with which framework versions.
huggingface_estimator = HuggingFace(
    entry_point='Sagemaker_Train.py',
    source_dir='Additive-Attention-Is-All-You-Need/',
    # smdistributed dataparallel only runs on ml.p3.16xlarge, ml.p3dn.24xlarge
    # and ml.p4d.24xlarge; the previous 'ml.p3.8xlarge' is rejected when that
    # distribution strategy is enabled.
    instance_type='ml.p3.16xlarge',
    instance_count=4,
    role=role,
    py_version='py38',
    transformers_version='4.12',
    pytorch_version='1.9',
    hyperparameters=hyperparameters,
    distribution=distribution,
)

In [None]:
# Launch the training job with the dataset uploaded to S3 attached as the
# "train" input channel; without `inputs`, the data saved at
# `training_input_path` is never made available to the job.
# NOTE(review): assumes Sagemaker_Train.py reads the "train" channel
# (SM_CHANNEL_TRAIN) -- confirm against the entry-point script.
huggingface_estimator.fit({'train': training_input_path})