# Download Model
### Make sure you have enough drive space to get this model (300GB)

In [None]:
# Install the Hugging Face Hub client, which provides the snapshot_download
# function imported in the download cell below.
!pip install huggingface_hub -q

In [None]:
# Hugging Face access token; it is passed as `token` to snapshot_download below.
access_token = "" # Fill in your token here
# Hub repository ID of the model to download. It is also used as the
# sub-directory (under ~/SageMaker) where the files are stored.
hf_model_id = "meta-llama/Llama-3.1-70B-Instruct"

In [None]:
from huggingface_hub import snapshot_download
from pathlib import Path

# Target directory: <home>/SageMaker/<hf_model_id>. It is created up front,
# including any missing parent directories.
models_path = Path.home() / 'SageMaker' / hf_model_id
models_path.mkdir(parents=True, exist_ok=True)

# Download the repository into that directory and print the path that
# snapshot_download reports back.
model_path = snapshot_download(
    repo_id=hf_model_id,
    local_dir=models_path,
    token=access_token,
)
print(model_path)

# Counting Tokens

In [None]:
# Install transformers, which supplies the AutoTokenizer used in the next cell
# to count tokens.
!pip install transformers -q

In [None]:
from pathlib import Path

import transformers

# Load the tokenizer from the directory the download cell wrote to.
# The path is built from Path.home() -- exactly as the download cell does --
# instead of hard-coding /home/ec2-user, so both cells keep pointing at the same
# folder even when the notebook runs under a different home directory.
tokenizer_dir = Path.home() / "SageMaker" / hf_model_id
tokenizer = transformers.AutoTokenizer.from_pretrained(str(tokenizer_dir))

In [None]:
# Sanity check: display how many tokens the tokenizer produces for a short
# sample question.
len(tokenizer.tokenize("what is sagemaker?"))

# Deploy Endpoint

In [None]:
import sagemaker
from sagemaker.jumpstart.model import JumpStartModel
from datetime import datetime

# Derive the endpoint name from the "llama-js" base via the SageMaker SDK helper.
endpoint_name = sagemaker.utils.name_from_base("llama-js")

model_id = "meta-textgeneration-llama-3-1-70b-instruct"
instance_type = "ml.p4d.24xlarge" # This can also be "ml.g5.48xlarge", but will run slower than the p4d version

# Environment passed to the serving container: both timeouts are set to 600
# (presumably seconds, to accommodate the very large prompts sent below -- confirm
# against the container documentation).
serving_env = {
    'SERVING_CHUNKED_READ_TIMEOUT': str(600),
    'SERVING_PREDICT_TIMEOUT': str(600),
}

model = JumpStartModel(
    model_id=model_id,
    instance_type=instance_type,
    env=serving_env,
)
# accept_eula=True is passed to deploy() together with the generated endpoint name.
predictor = model.deploy(endpoint_name=endpoint_name, accept_eula=True)

# Run Inference

In [None]:
# These functions were written by generative AI to create long random strings that simulate large token payloads
import random
import string

def generate_word(min_length=3, max_length=10):
    """Return a random word made of lowercase ASCII letters.

    The word length is drawn uniformly from ``min_length`` to ``max_length``
    (both inclusive); each letter is then picked independently.
    """
    word_length = random.randint(min_length, max_length)
    letters = [random.choice(string.ascii_lowercase) for _ in range(word_length)]
    return ''.join(letters)

def generate_text(num_words):
    """Return ``num_words`` random words joined by single spaces.

    After each word there is a 10% chance that one punctuation character is
    appended to it (drawn from '.,...?!', so a period is the most likely pick)
    and, independently, a 5% chance that a paragraph break ('\\n\\n') is added
    as a separate item after the word.
    """
    pieces = []
    for _ in range(num_words):
        word = generate_word()

        # Occasionally end the word with punctuation.
        if random.random() < 0.1:
            word += random.choice('.,...?!')
        pieces.append(word)

        # Occasionally follow the word with a paragraph break.
        if random.random() < 0.05:
            pieces.append('\n\n')

    return ' '.join(pieces)

In [None]:
import boto3
import json

import botocore
import boto3
# Client configuration with 600-second read and connect timeouts, so a slow
# response to a very large prompt does not hit the client-side timeout first.
config = botocore.config.Config(connect_timeout=600, read_timeout=600)

# Runtime client used by the inference cells below (TLS verification enabled).
sagemaker_runtime = boto3.client(
    "sagemaker-runtime",
    verify=True,
    config=config,
)

In [None]:
# Build a deliberately huge prompt: 30000 random words are embedded in the
# system message to simulate a very large token payload.
payload_text = generate_text(30000) # This seems to produce around 110000 tokens
# The prompt has system, user and assistant sections. The assistant section is
# pre-filled with "It was a dark night" (presumably so the model continues from
# that opening -- confirm against the model's prompt-format documentation).
input_prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful AI assistant for travel tips and recommendations {payload_text}<|eot_id|>

<|start_header_id|>user<|end_header_id|>
Write a movie screenplay about corgis being attacked by aliens<|eot_id|>

<|start_header_id|>assistant<|end_header_id|>It was a dark night"""
# Report the prompt's token count using the tokenizer loaded in the
# "Counting Tokens" section.
print(f"# tokens = {len(tokenizer.tokenize(input_prompt))}")

## Non-Streaming Request

In [None]:
%%time
print(f"Start Time - {datetime.now().strftime('%H:%M:%S')}")       # Hours:Minutes:Seconds
payload = {
    "inputs": input_prompt,
    "parameters": {
        "max_new_tokens": 4096,
        "top_p": 0.9,
        "temperature": 0.6,
    },
}
# print(payload)
try:
    response = sagemaker_runtime.invoke_endpoint(
        EndpointName=f"{endpoint_name}",
        ContentType="application/json",
        Body=json.dumps(payload)
    )
    # print(response)
    t = response['Body']
    # print(t)
    t_read = t.read()
    # print(t_read)
    j = json.loads(t_read)
    print(j['generated_text'])
except Exception as e:
    print(f"Error - {e}")

In [None]:
%%time
print(f"Start Time - {datetime.now().strftime('%H:%M:%S')}")       # Hours:Minutes:Seconds
payload = {
    "inputs": input_prompt,
    "parameters": {
        "max_new_tokens":4096, 
        "top_p":0.9, 
        "temperature":0.6, 
        "stream": True
    }
}
# print(payload)
try:
    streaming_response = sagemaker_runtime.invoke_endpoint_with_response_stream(
        EndpointName=f"{endpoint_name}",
        ContentType="application/json",
        Body=json.dumps(payload)
    )

    chunk = ''
    first_token = False
    for event in streaming_response["Body"]:
        # print(event)
        chunk += event["PayloadPart"]["Bytes"].decode('utf-8')
        try:
            chunk_dict = json.loads(chunk)
            if not first_token:
                first_token = True
                print(f"Time of First Token: {datetime.now().strftime('%H:%M:%S')}")       # Hours:Minutes:Seconds
            chunk = ''
            print(chunk_dict['token'].get("text", ""), end="")
        except Exception as e:
            None
except Exception as e:
    print(f"Exception - {e}")
print('\n')

# Cleanup

In [None]:
# Tear down what the "Deploy Endpoint" section created: first the endpoint
# together with its endpoint configuration, then the model.
predictor.delete_endpoint(delete_endpoint_config=True)
model.delete_model()