In [1]:
%pip install -r scripts/requirements.txt

Note: you may need to restart the kernel to use updated packages.


#### Development environment

#### Permissions

In [2]:
import sagemaker
import boto3
sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it not exists
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker role arn: arn:aws:iam::211125449279:role/service-role/AmazonSageMaker-ExecutionRole-20240307T175168
sagemaker bucket: sagemaker-eu-west-1-211125449279
sagemaker session region: eu-west-1


#### Prepare and store the dataset

In [4]:
# Specify the dataset and the model to be downloaded
from datasets import load_dataset
from transformers import AutoTokenizer
from typing import Union
from dotenv import load_dotenv
load_dotenv("../env.txt")
from huggingface_hub import login
login(token=os.getenv("HF_TOKEN"))

# dataset used
data_path = 'MichaelAI23/English_CVs'

# s3 key prefix for the data
s3_prefix = 'samples/datasets/English_CVs'

cutoff_len = 1024 # 512
base_model = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(base_model)
use_special_token = True

if use_special_token:
    # Add the single quote as special token for faster inference (only fill the blanks --> no embedding training necessary, since token exists already)
    tokenizer.add_tokens(["\'"], special_tokens=True) #, "}, "])

    assert len(tokenizer) == 32000

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/ec2-user/.cache/huggingface/token
Login successful




tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [5]:
template = {
    "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\nInstruction:\n{instruction}\n\nInput:\n{input}\n\nResponse:\n",
    "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\nInstruction:\n{instruction}\n\nResponse:\n",
    "response_split": "Response:"
}

In [6]:
def generate_prompt(
    template: str,
    instruction: str,
    input: Union[None, str] = None,
    label: Union[None, str] = None,
) -> str:
    # returns the full prompt from instruction and optional input
    # if a label (=response, =output) is provided, it's also appended.
    if label:
        res = template["prompt_input"].format(
            instruction=instruction, input=input
        )
        messages = [
            {"role": "user", "content": res},
            {"role": "assistant", "content": f" {label}"}
        ]

    else:
        res = template["prompt_input"].format(
            instruction=instruction, input=input
        )
        messages = [
            {"role": "user", "content": res},
        ]

    return messages

def tokenize(messages):
    # Create the final prompt (with the template and tokenize it)
    final_prompt = tokenizer.apply_chat_template(messages, tokenize=False)

    result = tokenizer(
        final_prompt,
        truncation=True,
        add_special_tokens=False, # this is already done by the application of the chat template
        max_length=cutoff_len,
        padding=False,
        return_tensors=None,
    )
    result["full_prompt"] = final_prompt


    result["labels"] = result["input_ids"].copy()

    return result

def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(
        template,
        data_point["instruction"],
        data_point["input"],
        data_point["output"],
    )
    tokenized_full_prompt = tokenize(full_prompt)
    return tokenized_full_prompt

In [7]:
data = load_dataset(data_path)
data["train"]

Downloading readme:   0%|          | 0.00/538 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/877k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['output', 'hobbies', 'personal', 'work_experience', 'skills', 'academia', 'education', 'overall'],
    num_rows: 1000
})

In [16]:
print(data["train"]["overall"][10])

Levi Cardoso can be reached at levi.cardoso@wynnlasvegas.com for any inquiries. His address is 1234 Wynn Way, Las Vegas, NV, USA. You can contact him at (555) 123-4567. Levi Cardoso is a dedicated employee at The Wynn in Las Vegas and always ready to assist with any requests.

Proficient in Python, R, and SQL for data manipulation and analysis. Skilled in machine learning algorithms and statistical modeling to derive actionable insights from complex datasets.

I enjoy skiing during the winter months, exploring new hiking trails in the summer, and practicing yoga for relaxation. Additionally, I have a passion for cooking and experimenting with new recipes in my spare time.

A bachelor's degree in Computer Science from Universidad Complutense de Madrid in 2010 and a master's degree in Data Science from Universidad Politécnica de Madrid in 2014 were successfully completed.

Data Scientist, 2014 - 2016 at Google
- Developed machine learning models to optimize search engine algorithms and i

In [19]:
import json
import re
output = data["train"]["output"][10]
output = re.sub("\'", "\"", output)
output = output.replace("\\n", "")
json.loads(output)

{'pe': {'s': 'Levi', 'e': ' requests.'},
 'ed': {'s': 'A bachelor', 'e': ' completed.'},
 'wo': {'s': 'Data Scientist', 'e': ' processes.'},
 'sk': {'s': 'Proficient', 'e': ' datasets.'}}

In [9]:
data["train"] = data["train"].add_column(
    "instruction",
    ["Extract the start and end sequences for the categories 'personal information', 'work experience', 'education' and 'skills' from the following text in dictionary form"]*len(data["train"])
)

data["train"] = data["train"].rename_column("overall", "input")
data["train"]["input"][0]

'Benjamin Costa is a guest at The Taj Exotica Resort & Spa in the Maldives. He can be reached via email at bencosta@example.com. His address is Villa 12, Oceanfront Drive, South Male Atoll, Maldives. For any inquiries or reservations, you can contact him at +960-123-4567.\nI enjoy swimming as a hobby, as it allows me to relax and stay active outside of my work as a Data Scientist. In addition, I also love hiking in the mountains and playing the guitar in my free time.\nData Scientist, 2014 - 2016 at Google\n- Developed machine learning models to optimize search algorithms.\n- Conducted data analysis and visualization to extract actionable insights from large datasets.\n- Collaborated with cross-functional teams to implement data-driven solutions for product enhancements.\n- Presented findings to stakeholders and provided recommendations for business strategies.\n\nSenior Data Scientist, 2012 - 2014 at Amazon\n- Led a team of data scientists in developing predictive analytics models for

In [10]:
val_set_size= 100

In [None]:
if val_set_size > 0:
    train_val = data["train"].train_test_split(
        test_size=val_set_size, shuffle=True, seed=42
    )
    train_data = (
        train_val["train"].map(generate_and_tokenize_prompt)
    )
    val_data = (
        train_val["test"].map(generate_and_tokenize_prompt)
    )
else:
    train_data = data["train"].shuffle().map(generate_and_tokenize_prompt)
    val_data = None

In [12]:
train_data

Dataset({
    features: ['output', 'hobbies', 'personal', 'work_experience', 'skills', 'academia', 'education', 'input', 'instruction', 'input_ids', 'attention_mask', 'full_prompt', 'labels'],
    num_rows: 900
})

In [15]:
val_data["input"][6]

'- I enjoy playing golf in my free time and often participate in local tournaments.\n- Photography is another passion of mine, capturing beautiful moments and landscapes brings me joy.\n- Cooking is a relaxing hobby for me, experimenting with new recipes and flavors is always exciting.\n- I love hiking and exploring nature trails on the weekends, it helps me unwind and stay active.\n- Reading fiction novels is a favorite pastime of mine, getting lost in a good book is a great escape from reality.\n\nAs a seasoned Business Analyst, I excel in translating complex business requirements into actionable insights that drive strategic decision-making. With a keen eye for detail and a knack for data analysis, I have a proven track record of delivering innovative solutions to enhance operational efficiency and maximize profitability. My strong communication skills enable me to effectively collaborate with cross-functional teams and stakeholders to achieve project objectives seamlessly.\n\n- Bac

In [16]:
tokenizer.decode(train_data["input_ids"][2]) #[:len(train_data["input_ids"][2]) - 64])

2024-05-21 16:31:42.466838: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-21 16:31:42.466996: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-21 16:31:42.468254: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-21 16:31:42.478998: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


"<s> [INST] Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\nInstruction:\nExtract the start and end sequences for the categories ' personal information' , ' work experience' , ' education'  and ' skills'  from the following text in dictionary form\n\nInput:\n\nProficient in statistical analysis and data visualization techniques, with a strong background in machine learning algorithms. Skilled at deriving actionable insights from complex datasets to drive informed decision-making and business growth.\n- I enjoy dancing salsa and bachata in my free time.\n- I am an avid hiker and love exploring new trails in the mountains.\n- Painting is a relaxing hobby of mine, and I often create abstract art pieces.\n- Cooking is another passion of mine, and I love experimenting with different cuisines.\n- I have a green thumb and enjoy gardening, especially growing herbs and vegetables.\n-

In [17]:
train_data["input_ids"][2][:len(train_data["input_ids"][2]) - 63][-1]

28742

In [None]:
train_data = train_data.map(lambda example: {"token_len": len(example["input_ids"])})
val_data = val_data.map(lambda example: {"token_len": len(example["input_ids"])})

In [27]:
max(train_data["token_len"])

1024

In [31]:
if max(train_data["token_len"]) > cutoff_len or max(val_data["token_len"]) > cutoff_len:
    raise ValueError("You have samples that are longer than your cutoff length. This can lead to unintended side consequences")

In [None]:
# save train_dataset to s3
training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train'
train_data.save_to_disk(training_input_path)

# save test_dataset to s3
val_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/val'
val_data.save_to_disk(val_input_path)

#### Fine-tuning & starting Sagemaker Training Job

In [38]:
import time
import re
time_string = time.strftime("%H-%M-%S")

base_job_name=f"LLM-Textmarker-{base_model}-{time_string}"
base_job_name = re.sub(r"[_/\.]", "-", base_job_name)
checkpoint_in_bucket="checkpoints"

# The S3 URI to store the checkpoints
checkpoint_s3_bucket="s3://{}/{}/{}".format(sess.default_bucket(), base_job_name, checkpoint_in_bucket)

# The local path where the model will save its checkpoints in the training container
checkpoint_local_path="/opt/ml/checkpoints"

In [39]:
from sagemaker.huggingface import HuggingFace

# hyperparameters, which are passed into the training job
hyperparameters={
    'epochs': 5,
    # 'batch_size': 8,
    'batch_size': 1,
    'base_model': base_model,
    'gradient_accumulation_steps': 8,
    'num_epochs': 5,
    'learning_rate': 3e-4,
    'cutoff_len': cutoff_len,
    'lora_r': 8,
    'lora_alpha': 16,
    'lora_dropout': 0.05,
    'group_by_length': False,
    'device_map': 'auto',
    'model_dir': checkpoint_local_path,
    'use_special_token': use_special_token

}

In [40]:
huggingface_estimator = HuggingFace(entry_point='train_mistral.py',
                            source_dir='./scripts',
                            instance_type='ml.g5.2xlarge',
                            instance_count=1,
                            base_job_name=base_job_name,
                            role=role,
                            transformers_version='4.36',
                            pytorch_version='2.1',
                            py_version='py310',
                            hyperparameters=hyperparameters,
                            # Parameters required to enable checkpointing
                            checkpoint_s3_uri=checkpoint_s3_bucket,
                            checkpoint_local_path=checkpoint_local_path)

In [None]:
# starting the train job with our uploaded datasets as input
huggingface_estimator.fit({'train': training_input_path, 'val': val_input_path}, job_name=base_job_name)

#### Tar the model files

In [67]:
!aws s3 sync s3://sagemaker-eu-west-1-211125449279/LLM-Textmarker-mistralai-Mistral-7B-Instruct-v0-2-15-24-12/checkpoints/model_files ./model

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


download: s3://sagemaker-eu-west-1-211125449279/LLM-Textmarker-mistralai-Mistral-7B-Instruct-v0-2-15-24-12/checkpoints/model_files/adapter_config.json to model/adapter_config.json
download: s3://sagemaker-eu-west-1-211125449279/LLM-Textmarker-mistralai-Mistral-7B-Instruct-v0-2-15-24-12/checkpoints/model_files/adapter_model.safetensors to model/adapter_model.safetensors


In [None]:
!tar zcvf model.tar.gz ./model

In [None]:
!aws s3 cp model.tar.gz s3://sagemaker-eu-west-1-211125449279/LLM-Textmarker-mistralai-Mistral-7B-Instruct-v0-2-15-24-12/checkpoints/model_files/model.tar.gz

#### Deploying the endpoint

In [None]:
# predictor = huggingface_estimator.deploy(1, "ml.g4dn.2xlarge")

In [None]:
# predictor.delete_model()
# predictor.delete_endpoint()