# Get Data

Set up your Hugging Face credentials first by running `huggingface-cli login`.

Create a Hugging Face account if you don't already have one, or log in to your existing account:
https://huggingface.co/join

Then create an access token:
https://huggingface.co/settings/tokens


In [None]:
# Install the libraries used by the cells below; --quiet suppresses most pip output.
# Only huggingface_hub is passed --upgrade; datasets and transformers are installed without it.
!pip install --upgrade huggingface_hub --quiet
!pip install datasets --quiet
!pip install transformers --quiet

In [None]:
from huggingface_hub import login

# Authenticate this session with the Hugging Face Hub. Presumably this prompts for
# an access token such as the one created at the settings/tokens page — confirm
# against the huggingface_hub documentation.
login()

In [None]:
from datasets import load_dataset, load_from_disk

# Notebook-wide settings consumed by the cells below.
model_name_or_path = "EleutherAI/gpt-j-6b"  # checkpoint name passed to AutoTokenizer.from_pretrained
dataset_name = "wikitext"  # dataset passed to load_dataset
dataset_config_name = "wikitext-103-raw-v1"  # dataset configuration passed to load_dataset
cache_dir = "cache"  # cache directory handed to both load_dataset and the tokenizer
streaming = False  # forwarded to load_dataset; also selects which map() arguments are used
# NOTE(review): max_train_samples / max_eval_samples are not referenced in any of the
# visible cells — confirm whether sub-sampling was intended.
max_train_samples = 100
max_eval_samples = 100
tokenized_save_dir = "./data/wiki_tokenized_dataset_chunk"  # local path used by save_to_disk / load_from_disk


In [None]:
# Load the configured dataset; the trailing bare expression displays the result
# in the notebook output.
# NOTE(review): `use_auth_token` is reported as deprecated in favour of `token`
# by recent `datasets` releases — confirm against the installed version.
raw_datasets = load_dataset(
    dataset_name,
    dataset_config_name,
    streaming=streaming,
    cache_dir=cache_dir,
    use_auth_token=False,
)
raw_datasets

In [None]:
# Display the shape reported for the loaded dataset.
# NOTE(review): presumably only meaningful when `streaming` is False (streamed
# datasets may not expose `.shape`) — confirm.
raw_datasets.shape

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)

# Options forwarded to AutoTokenizer.from_pretrained: reuse the shared cache
# directory, request the fast tokenizer, and pass use_auth_token=False.
# NOTE(review): `use_auth_token` is reported as deprecated in favour of `token`
# by recent `transformers` releases — confirm against the installed version.
tokenizer_kwargs = dict(cache_dir=cache_dir, use_fast=True, use_auth_token=False)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, **tokenizer_kwargs)

# Raw column removed by the map() call that tokenizes the dataset.
column_names = ["text"]

def tokenize_function(examples):
    """Tokenize the "text" entry of a batch with the notebook-level tokenizer.

    Args:
        examples: Mapping that provides the raw strings under the "text" key.

    Returns:
        Whatever the tokenizer call returns for ``examples["text"]``.
    """
    return tokenizer(examples["text"])

# Tokenize the dataset in batches and drop the raw text column. The cache flag
# and the progress description are only passed in the non-streaming case.
if streaming:
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        remove_columns=column_names,
        batched=True,
    )
else:
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        remove_columns=column_names,
        batched=True,
        desc="Running tokenizer on dataset",
        load_from_cache_file=True,
    )
tokenized_datasets

In [None]:
from itertools import chain

# Chunk length used by group_texts, taken from the tokenizer's model_max_length.
# NOTE(review): some tokenizers presumably report a very large placeholder value
# here; confirm the value is a sensible sequence length for this checkpoint.
block_size = tokenizer.model_max_length

def group_texts(examples):
    """Concatenate each column of a tokenized batch and re-split it into blocks.

    Every column in ``examples`` (a mapping of column name to a list of token
    lists) is flattened into one long list, then cut into consecutive chunks of
    ``block_size`` items, where ``block_size`` is read from the enclosing
    notebook scope.

    When the flattened length is at least ``block_size`` the trailing partial
    chunk is dropped; when it is shorter, the single short chunk is kept as is.

    Args:
        examples: Batch mapping that must contain an "input_ids" column.

    Returns:
        A dict with the same columns, chunked, plus a "labels" column that is a
        shallow copy of the chunked "input_ids" list.
    """
    # Flatten every column into a single sequence.
    flattened = {}
    for column in examples.keys():
        flattened[column] = list(chain.from_iterable(examples[column]))

    # All columns are measured by the first one.
    usable_length = len(flattened[list(examples.keys())[0]])
    if usable_length >= block_size:
        # Trim to a whole number of blocks.
        usable_length -= usable_length % block_size

    # Slice each flattened column at the same block boundaries.
    result = {}
    for column, tokens in flattened.items():
        result[column] = [
            tokens[start : start + block_size]
            for start in range(0, usable_length, block_size)
        ]

    result["labels"] = result["input_ids"].copy()
    return result

# Regroup the tokenized dataset into fixed-size blocks via group_texts. The
# cache flag and the progress description are only passed when not streaming.
if streaming:
    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
    )
else:
    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        desc=f"Grouping texts in chunks of {block_size}",
        load_from_cache_file=True,
    )
lm_datasets

In [None]:
# Persist the grouped dataset to the local directory named by tokenized_save_dir.
lm_datasets.save_to_disk(tokenized_save_dir)

In [None]:
# Reload the dataset from tokenized_save_dir, rebinding lm_datasets, and display it.
lm_datasets = load_from_disk(tokenized_save_dir)
lm_datasets

## Upload Data to S3

In [None]:
import sagemaker
# Open a SageMaker session and choose where in S3 the dataset will be written.
sess = sagemaker.Session()
bucket = sess.default_bucket()                    # bucket name returned by the session's default_bucket()
prefix = 'demo-clm-finetune'  # key prefix used when building the S3 destination path

In [None]:
# Display the bucket name and key prefix that will be used for the upload.
bucket, prefix

In [None]:
# Write the grouped dataset directly to S3 under s3://<bucket>/<prefix>/.
# NOTE(review): save_to_disk receives an s3:// URI here; presumably this needs an
# S3-capable filesystem backend (e.g. s3fs) that the install cell does not add —
# confirm it is available in the notebook environment.
training_input_path = f"s3://{bucket}/{prefix}/wiki-tokenized-dataset-chunk"
lm_datasets.save_to_disk(training_input_path)

# Report the destination path.
print("uploaded data to:")
print(f"training dataset to: {training_input_path}")