# Get Data

Set up your Hugging Face credentials first by running `huggingface-cli login`.

In [3]:
from datasets import load_dataset, load_from_disk

# --- Model / tokenizer checkpoint --------------------------------------------
model_name_or_path = "EleutherAI/gpt-j-6b"

# --- Source corpus: dataset id and configuration handed to load_dataset() ----
dataset_name, dataset_config_name = "wikitext", "wikitext-103-raw-v1"
streaming = False  # forwarded to load_dataset(); also selects which map() call form is used

# --- Local storage -----------------------------------------------------------
cache_dir = "cache"  # cache directory passed to both the dataset and tokenizer loaders
tokenized_save_dir = "./data/wiki_tokenized_dataset_chunk"  # target of save_to_disk()

# --- Sub-sampling limits -----------------------------------------------------
# NOTE(review): neither limit is referenced by the cells visible in this section.
max_train_samples = 100
max_eval_samples = 100


In [4]:
# Load the configured corpus (no split is requested, so all splits come back),
# caching downloaded files under `cache_dir`.
raw_datasets = load_dataset(
    dataset_name,
    dataset_config_name,
    cache_dir=cache_dir,
    streaming=streaming,
    use_auth_token=False,
)
raw_datasets  # show the split summary

Downloading builder script:   0%|          | 0.00/8.48k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/6.84k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.25k [00:00<?, ?B/s]

Downloading and preparing dataset wikitext/wikitext-103-raw-v1 to /home/ec2-user/SageMaker/workspace/clm-lora/cache/wikitext/wikitext-103-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126...


Downloading data:   0%|          | 0.00/192M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Dataset wikitext downloaded and prepared to /home/ec2-user/SageMaker/workspace/clm-lora/cache/wikitext/wikitext-103-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [5]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)

# Keyword arguments forwarded to AutoTokenizer.from_pretrained().
tokenizer_kwargs = dict(
    cache_dir=cache_dir,
    use_fast=True,
    use_auth_token=False,
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, **tokenizer_kwargs)

# Raw columns removed from the dataset once they have been tokenized.
column_names = ["text"]

def tokenize_function(examples):
    """Run the notebook-level ``tokenizer`` over the ``"text"`` entry of a batch.

    Args:
        examples: Batch mapping that contains a ``"text"`` entry.

    Returns:
        Whatever ``tokenizer`` returns for that batch of texts, unchanged.
    """
    return tokenizer(examples["text"])

# The streaming branch uses the bare map() call; the in-memory branch also
# reuses cached results and labels the progress bar.
if streaming:
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        remove_columns=column_names,
    )
else:
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        remove_columns=column_names,
        load_from_cache_file=True,
        desc="Running tokenizer on dataset",
    )
tokenized_datasets  # show the tokenized split summary

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Downloading (…)okenizer_config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

Running tokenizer on dataset:   0%|          | 0/4358 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/3760 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 3760
    })
})

In [6]:
from itertools import chain

# Chunk length used by group_texts(), taken from the tokenizer's configured
# maximum length (the recorded run below reports chunks of 2048).
block_size = tokenizer.model_max_length

def group_texts(examples):
    """Concatenate a batch of tokenized texts and re-split it into fixed-size blocks.

    Every column in ``examples`` is flattened into one long sequence and then
    cut into consecutive chunks of exactly ``block_size`` items (``block_size``
    is the notebook-level value defined before this function). The trailing
    remainder that does not fill a whole block is dropped, so every returned
    row has the same length.

    Args:
        examples: Batch mapping of column name -> list of per-example lists.
            Must contain an ``"input_ids"`` column.

    Returns:
        dict with the same columns, each a list of ``block_size``-long chunks,
        plus a ``"labels"`` column that is a copy of the ``"input_ids"`` list.
        When the whole batch holds fewer than ``block_size`` items, every
        column is an empty list (the batch contributes no rows).
    """
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    # All columns are flattened from the same examples; measure the first one.
    total_length = len(next(iter(concatenated_examples.values())))
    # Always floor to a multiple of block_size. The previous
    # `if total_length >= block_size` guard let a batch shorter than one block
    # through as a single short chunk, yielding rows of inconsistent length.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# The streaming branch uses the bare map() call; the in-memory branch also
# reuses cached results and labels the progress bar.
if streaming:
    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
    )
else:
    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        load_from_cache_file=True,
        desc=f"Grouping texts in chunks of {block_size}",
    )
lm_datasets  # show the chunked split summary

Grouping texts in chunks of 2048:   0%|          | 0/4358 [00:00<?, ? examples/s]

Grouping texts in chunks of 2048:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Grouping texts in chunks of 2048:   0%|          | 0/3760 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 136
    })
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 56670
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 119
    })
})

In [7]:
# Persist the chunked dataset locally at `tokenized_save_dir`.
lm_datasets.save_to_disk(tokenized_save_dir)

Saving the dataset (0/1 shards):   0%|          | 0/136 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/56670 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/119 [00:00<?, ? examples/s]

In [12]:
# Read the saved dataset back from `tokenized_save_dir`; this rebinds
# `lm_datasets` to the on-disk copy (presumably a round-trip sanity check).
lm_datasets = load_from_disk(tokenized_save_dir)
lm_datasets

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 136
    })
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 56670
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 119
    })
})

## Upload Data to S3

In [13]:
import sagemaker
sess = sagemaker.Session()
# Use the bucket the session reports as its default.
bucket = sess.default_bucket()
# Key prefix placed between the bucket and the dataset folder in the S3 path.
prefix = "demo-clm-finetune"

In [10]:
# Display the resolved bucket name and key prefix.
bucket, prefix

('sagemaker-us-east-1-783128296767', 'demo-clm-finetune')

In [14]:
# Write the whole tokenized DatasetDict (every split, not only train) to S3.
training_input_path = f"s3://{bucket}/{prefix}/wiki-tokenized-dataset-chunk"
lm_datasets.save_to_disk(training_input_path)

# Report the destination so it can be copied into the training job configuration.
print("uploaded data to:", f"training dataset to: {training_input_path}", sep="\n")

Saving the dataset (0/1 shards):   0%|          | 0/136 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/56670 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/119 [00:00<?, ? examples/s]

uploaded data to:
training dataset to: s3://sagemaker-us-east-1-783128296767/demo-clm-finetune/wiki-tokenized-dataset-chunk
