## imports

In [None]:
%pip install boto3

In [None]:
import datetime
import getpass
import json
import os

import boto3

## setup

Create the clients and the policy for working with buckets.

Prerequisites: the S3 bucket with the raw data and the IAM role with the required policy attached must already exist.

In [None]:
# Collect the AWS credentials interactively so nothing is hardcoded in the
# notebook. The secret key and the session token are read with getpass so the
# typed values are not echoed into the cell output (which is saved together
# with the notebook). Surrounding whitespace from copy-paste is stripped.
AWS_SERVER_PUBLIC_KEY = input("AWS_SERVER_PUBLIC_KEY: ").strip()
AWS_SERVER_SECRET_KEY = getpass.getpass("AWS_SERVER_SECRET_KEY: ").strip()
AWS_SESSION_TOKEN = getpass.getpass("AWS_SESSION_TOKEN: ").strip()
REGION_NAME = input("REGION_NAME: ").strip()

In [None]:
# S3 client authenticated with the credentials and region entered above.
s3 = boto3.client(
    "s3",
    region_name=REGION_NAME,
    aws_access_key_id=AWS_SERVER_PUBLIC_KEY,
    aws_secret_access_key=AWS_SERVER_SECRET_KEY,
    aws_session_token=AWS_SESSION_TOKEN,
)

# Names of the pre-existing AWS resources; replace every placeholder below
# with the actual name before running the rest of the notebook.
raw_bucket_name = "existing-bucket-name"  # bucket holding the raw data
formatted_bucket_name = "existing-bucket-name"  # bucket receiving the formatted data
role_name = "existing-role-name"  # IAM role name

In [None]:
# NOTE: uncomment the code below only if the IAM policy still has to be created and attached to the role.

# iam = boto3.client(
#     "iam",
#     aws_access_key_id=AWS_SERVER_PUBLIC_KEY,
#     aws_secret_access_key=AWS_SERVER_SECRET_KEY,
#     aws_session_token=AWS_SESSION_TOKEN,
#     region_name=REGION_NAME
# )

# policy_arn = iam.create_policy(
#     PolicyName="Bedrock-Finetuning-Role-Policy",
#     PolicyDocument=json.dumps({
#         "Version": "2012-10-17",
#         "Statement": [
#             {
#                 "Effect": "Allow",
#                 "Action": [
#                     "s3:GetObject",
#                     "s3:PutObject",
#                     "s3:ListBucket"
#                 ],
#                 "Resource": [
#                     f"arn:aws:s3:::{raw_bucket_name}",
#                     f"arn:aws:s3:::{raw_bucket_name}/*",
#                     f"arn:aws:s3:::{formatted_bucket_name}",
#                     f"arn:aws:s3:::{formatted_bucket_name}/*"
#                 ]
#             }
#         ]
#     })
# )['Policy']['Arn']

# iam.attach_role_policy(
#     RoleName=role_name,
#     PolicyArn=policy_arn
# )

## data preparation

Download the raw data from S3, format it for the LLM, and save back to S3.

In [None]:
# Local working directory that the download/load/save/upload helpers read
# from and write to.
dataset_dir = "dataset"

In [None]:
def download_from_s3(s3_bucket, s3_key, local_filename):
    """Download one S3 object into the local dataset directory.

    Args:
        s3_bucket: Name of the bucket that holds the object.
        s3_key: Key of the object to download.
        local_filename: File name to store the object under inside
            ``dataset_dir``.
    """
    destination = f"{dataset_dir}/{local_filename}"
    # The dataset directory may not exist yet on a fresh run.
    os.makedirs(dataset_dir, exist_ok=True)
    s3.download_file(Bucket=s3_bucket, Key=s3_key, Filename=destination)
    print(f"Downloaded {s3_key} from {s3_bucket} to {destination}")

# will modify later for the final raw data format
def load_dataset_from_file(local_filename):
    """Load a JSON Lines file from the local dataset directory.

    Every non-blank line is parsed as one JSON document. Blank or
    whitespace-only lines (for example a trailing empty line) are skipped
    instead of raising a decode error.

    Args:
        local_filename: Name of the file inside ``dataset_dir``.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    file_path = f"{dataset_dir}/{local_filename}"
    # Decode explicitly as UTF-8 rather than with the platform default
    # encoding, which differs between systems.
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

# the data formatting for LLaMa 3. will modify later for the final raw data format
def format_data(item: list[dict[str, str]]):
    """Render one conversation in the Llama 3 instruct prompt format.

    Each message is written as a role header, a blank line, the message
    content and a closing ``<|eot_id|>`` token, so that every turn is
    terminated individually. The whole conversation is wrapped in
    ``<|begin_of_text|>`` and ``<|end_of_text|>``.

    Args:
        item: The conversation as an ordered list of messages; every message
            is a dict with the keys ``"role"`` and ``"content"``.

    Returns:
        The formatted conversation as a single string.

    Raises:
        KeyError: If a message lacks the ``"role"`` or ``"content"`` key.
    """
    # Build all turns in one join instead of repeated string concatenation.
    turns = "".join(
        "<|start_header_id|>" + message["role"] + "<|end_header_id|>\n\n"
        + message["content"] + "<|eot_id|>"
        for message in item
    )
    return "<|begin_of_text|>" + turns + "<|end_of_text|>"

def save_formatted_data(local_filename, formatted_dataset):
    """Write the formatted records to a JSON Lines file in the dataset directory.

    Args:
        local_filename: Name of the file to create inside ``dataset_dir``.
        formatted_dataset: Iterable of JSON-serializable records; each one is
            written as a single line.
    """
    target_path = f"{dataset_dir}/{local_filename}"
    # The dataset directory may not exist yet on a fresh run.
    os.makedirs(dataset_dir, exist_ok=True)
    with open(target_path, "w") as out_file:
        for record in formatted_dataset:
            out_file.write(json.dumps(record) + "\n")

def upload_to_s3(local_filename, s3_bucket, s3_key):
    """Upload a file from the local dataset directory to S3.

    Args:
        local_filename: Name of the file inside ``dataset_dir``.
        s3_bucket: Name of the destination bucket.
        s3_key: Key to store the object under.
    """
    source_path = f"{dataset_dir}/{local_filename}"
    s3.upload_file(Filename=source_path, Bucket=s3_bucket, Key=s3_key)
    print(f"Uploaded {local_filename} to {s3_bucket}/{s3_key}")

In [None]:
# Fetch the raw JSONL object from the raw-data bucket and parse it.
input_s3_key = "your-input-key.jsonl"  # Replace with the actual key
download_from_s3(
    s3_bucket=raw_bucket_name,
    s3_key=input_s3_key,
    local_filename="raw_data.jsonl",  # Replace with the actual name
)
dataset = load_dataset_from_file(local_filename="raw_data.jsonl")

In [None]:
# Format every raw item, then write the results to a local JSONL file.
formatted_dataset = list(map(format_data, dataset))
save_formatted_data(
    local_filename="formatted_data.jsonl",
    formatted_dataset=formatted_dataset,
)

In [None]:
# Push the locally saved formatted file to the formatted-data bucket.
output_s3_key = "formatted_data.jsonl"  # Replace with the actual key
upload_to_s3(
    local_filename="formatted_data.jsonl",
    s3_bucket=formatted_bucket_name,
    s3_key=output_s3_key,
)

## training

In [None]:
# One session carries the credentials and region entered in the setup section,
# so every client below authenticates the same way. Previously the STS client
# was created without them and silently fell back to the default credential
# chain, which may be absent or point at a different account.
aws_session = boto3.Session(
    aws_access_key_id=AWS_SERVER_PUBLIC_KEY,
    aws_secret_access_key=AWS_SERVER_SECRET_KEY,
    aws_session_token=AWS_SESSION_TOKEN,
    region_name=REGION_NAME,
)
bedrock = aws_session.client("bedrock")
# Account that owns the credentials; used to build the IAM role ARN.
account_id = aws_session.client("sts").get_caller_identity()["Account"]

# Bedrock custom model names accept only letters, digits, "-" and "_",
# so the "." and ":" of the base model id are replaced.
customModelName = "llama3-8b-instruct-v1-0-therapist"
# The foundation-model ARN must name the same region the Bedrock client uses.
baseModelIdentifier = f"arn:aws:bedrock:{REGION_NAME}::foundation-model/meta.llama3-8b-instruct-v1:0"

# Timestamp suffix that makes the job name distinct per run.
datetime_string = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

In [None]:
# Submit the fine-tuning job. The role, the training file and the output
# location are derived from the names configured earlier in the notebook
# (role_name, formatted_bucket_name, output_s3_key), so the job trains on the
# file that was just uploaded instead of a separately hardcoded bucket/key.
# NOTE(review): confirm the uploaded JSONL matches the record schema Bedrock
# expects for fine-tuning this base model.
response_ft = bedrock.create_model_customization_job(
    jobName=f"Finetune-Job-{datetime_string}",
    customizationType="FINE_TUNING",
    roleArn=f"arn:aws:iam::{account_id}:role/{role_name}",
    # Bedrock takes hyperparameter values as strings.
    hyperParameters={
        "epochCount": "5",
        "batchSize": "1",
        "learningRate": ".0001",
        # "learningRateWarmupSteps": "5"
    },
    trainingDataConfig={"s3Uri": f"s3://{formatted_bucket_name}/{output_s3_key}"},
    outputDataConfig={"s3Uri": f"s3://{formatted_bucket_name}/finetuning-output"},
    customModelName=customModelName,
    baseModelIdentifier=baseModelIdentifier,
)

In [None]:
# Read the ARN of the submitted customization job from the response and show
# it; .get() yields None if the response has no 'jobArn' entry.
jobArn = response_ft.get('jobArn')
print(jobArn)