In [None]:
# Install or upgrade the libraries this notebook relies on, into the running kernel's environment.
# NOTE(review): the versions are unpinned, so a later re-run may pick up different releases — consider pinning.
%pip install --quiet --upgrade transformers datasets sagemaker s3fs

In [None]:
import sagemaker
import boto3

In [None]:
# Create the SageMaker session object and show it as the cell output.
sess = sagemaker.Session()
sess

In [None]:
# Look up the execution role through the SageMaker SDK; it is passed to the estimator as `role` further down.
role = sagemaker.get_execution_role()
role

In [None]:
# Display the region name reported by the session.
sess.boto_region_name

In [None]:
from datasets import load_dataset
from random import randrange

# Fetch the "train" split of the Dolly 15k dataset from the Hub.
dataset = load_dataset("databricks/databricks-dolly-15k", split="train")

# Report the row count and show one randomly picked record as a sanity check.
num_rows = len(dataset)
print(f"dataset size: {num_rows}")
random_index = randrange(num_rows)
print(dataset[random_index])

In [None]:
def format_dolly(sample):
    """Build the training prompt for a single Dolly record.

    Args:
        sample: Mapping with the string fields 'instruction', 'context'
            and 'response'.

    Returns:
        The sections "### Instruction", "### Context" and "### Answer", each
        followed by its text and separated by blank lines. The context
        section is left out when the record's context is empty.
    """
    sections = [f"### Instruction\n{sample['instruction']}"]
    # Only records that actually carry context get a context section.
    if len(sample['context']) > 0:
        sections.append(f"### Context\n{sample['context']}")
    sections.append(f"### Answer\n{sample['response']}")
    return "\n\n".join(sections)

In [None]:
# Preview the prompt that format_dolly builds for one randomly chosen record.
random_record = dataset[randrange(len(dataset))]
print(format_dolly(random_record))

In [None]:
import os
from getpass import getpass

# Keep the access token out of the notebook source: reuse HF_TOKEN when the
# environment already provides one (the previous unconditional assignment
# overwrote it with a placeholder), otherwise ask for it without echoing it.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face access token: ")

In [None]:
from transformers import AutoTokenizer

# Hub identifier of the checkpoint; reused later as the "model_id" hyperparameter.
model_id = "mistralai/Mixtral-8x7B-v0.1"

# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Use the end-of-sequence token for padding as well.
# NOTE(review): presumably needed because this tokenizer defines no pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Show the dataset's features (its column definitions).
dataset.features

In [None]:
from random import randint
from itertools import chain
from functools import partial


def template_dataset(sample):
    """Add a 'text' field holding the formatted prompt plus the EOS token.

    The record is modified in place and also returned, which is the form
    ``Dataset.map`` expects from a per-example function.
    """
    prompt = format_dolly(sample)
    # Terminate every example with EOS so example boundaries survive concatenation.
    sample['text'] = f"{prompt}{tokenizer.eos_token}"
    return sample

# Replace the original columns with the single templated 'text' column.
# The guard keeps this cell re-runnable: once the source columns have been
# removed, mapping template_dataset again would fail inside format_dolly.
if "text" not in dataset.features:
    dataset = dataset.map(template_dataset,remove_columns=list(dataset.features))

# Optional preview (randint's upper bound is inclusive, hence the "- 1"):
#print(dataset[randint(0, len(dataset) - 1)]['text'])

# Tokens left over from the previous batch, keyed by column name. Re-created
# whenever this cell runs so a fresh pass starts with nothing carried over.
remainder = {'input_ids':[],'attention_mask':[],'token_type_ids':[]}

def chunk(sample, chunk_length = 2048):
    """Pack a batch of tokenized examples into chunks of exactly ``chunk_length`` tokens.

    For every column in ``sample`` the per-example token lists are flattened
    into one stream, prefixed with the tokens left over from the previous call
    (held in the module-level ``remainder``), and cut into consecutive pieces
    of ``chunk_length`` tokens. Tokens that do not fill a whole chunk are
    written back to ``remainder`` and picked up by the next call.

    Args:
        sample: Mapping of column name to a list of token lists. Must contain
            the key "input_ids".
        chunk_length: Number of tokens in each emitted chunk.

    Returns:
        A dict with the columns of ``sample``, each a list of chunks, plus
        "labels", a shallow copy of the "input_ids" chunk list. The lists are
        empty when fewer than ``chunk_length`` tokens are available so far.

    Note:
        Whatever is still in ``remainder`` after the final batch is never
        emitted by this function.
    """
    global remainder

    # Flatten each column and prepend the carry-over. `.get` tolerates columns
    # that were not pre-seeded in `remainder` instead of raising KeyError.
    concatenated_examples = {
        k: remainder.get(k, []) + list(chain.from_iterable(sample[k]))
        for k in sample.keys()
    }

    # All columns are expected to have the same length, so measure the first one.
    batch_total_length = len(concatenated_examples[list(sample.keys())[0]])

    # Largest multiple of chunk_length that fits. This is 0 for a short batch,
    # so a short batch is carried over whole rather than emitted as an
    # undersized chunk.
    usable_length = (batch_total_length // chunk_length) * chunk_length

    result = {
        k: [t[i: i + chunk_length] for i in range(0, usable_length, chunk_length)]
        for k, t in concatenated_examples.items()
    }

    # Keep the tail for the next batch.
    remainder = {k: t[usable_length:] for k, t in concatenated_examples.items()}

    # Labels are the inputs themselves for causal language modelling.
    result["labels"] = result["input_ids"].copy()
    return result

# Step 1: tokenize the templated text batch-wise, keeping only the tokenizer's outputs.
tokenized_dataset = dataset.map(
    lambda batch: tokenizer(batch["text"]),
    batched=True,
    remove_columns=list(dataset.features),
)

# Step 2: pack the token streams into chunks for language-model training.
lm_dataset = tokenized_dataset.map(partial(chunk, chunk_length=2048), batched=True)

print(f"Total number of samples: {len(lm_dataset)}") # number of chunks produced; each holds up to 2048 tokens


In [None]:
207655-206848 # 807 tokens from the first batch that will be put into the remainder dictionary,
# and that will be processed with the next batch

In [None]:

import s3fs  # not referenced directly; presumably imported so a missing S3 backend fails here — confirm

# Destination prefix for the processed dataset; replace the bucket placeholder before running.
training_input_path = "s3://<YOUR_BUCKET_NAME>/processed/mixtral/dolly/train"

# Announce the upload before it starts (the message used to appear only after it had finished).
print("uploading the dataset to s3")
lm_dataset.save_to_disk(training_input_path)
print(f"dataset uploaded to: {training_input_path}")


In [None]:
import time
from sagemaker.huggingface import HuggingFace


# Timestamped name so that repeated launches stay distinguishable.
job_name = f"mixtral-8x7b-qlora-{time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())}"

# Values handed to the entry-point script as hyperparameters.
# NOTE(review): how each key is interpreted depends on scripts/run_clm.py, which is not shown here.
hyperparameters = {
    "model_id": model_id,
    # Presumably where the "training" input channel is mounted in the container — confirm.
    "dataset_path": "/opt/ml/input/data/training",
    "epochs": 2,
    "per_device_train_batch_size": 2,
    "lr": 2e-4,
    "merge_weights": True,
}

huggingface_estimator = HuggingFace(
    entry_point = "run_clm.py",
    source_dir= "scripts",
    instance_type = "ml.g5.24xlarge",
    instance_count = 1,
    base_job_name = job_name,
    role = role,
    volume_size = 300,
    # NOTE(review): confirm this container version supports the Mixtral
    # architecture, or that the code under scripts/ upgrades transformers.
    transformers_version= "4.28",
    pytorch_version= "2.0",
    py_version= "py310",
    hyperparameters = hyperparameters,
    environment = {
        # Spelling fixed (was "HUGGINFACE_HUB_CACHE"): with the misspelled name
        # the variable was ignored and the cache location never changed.
        "HUGGINGFACE_HUB_CACHE": "/tmp/.cache"
    }
)





In [None]:
# Map the input channel name to the S3 location of the processed dataset,
# then launch the training job and block until it has finished.
data = dict(training=training_input_path)
huggingface_estimator.fit(inputs=data, wait=True)