# ALPACA-BOT SageMaker

This solution was inspired by this GitLab notebook: https://gitlab.com/juliensimon/huggingface-demos/-/blob/main/langchain/rag-demo-sagemaker-textract/Building%20a%20simple%20RAG%20chatbot.ipynb

In [1]:
# Instaling packages
%%sh
pip install --upgrade langchain==0.1.11
pip install --upgrade langchain-core==0.1.30
pip install SQLAlchemy
pip install sagemaker amazon-textract-caller amazon-textract-textractor sentence-transformers pypdf pip install faiss-cpu -qU

Couldn't find program: 'sh'


In [None]:
#Importing libraries
import boto3, json, sagemaker
from typing import Dict
from langchain import LLMChain
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.llms import SagemakerEndpoint
from langchain.llms.sagemaker_endpoint import LLMContentHandler
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri


**Why use Mistral 7B?** It stands out for its efficiency: although it is a smaller model than others, such as Llama, it has demonstrated outstanding performance. In addition, its status as an open-source project gives users the freedom to adapt and enhance the model according to their specific needs, and the comprehensive documentation available makes it easier to understand and implement.

## Deploy LLM on SageMaker

In [None]:
# Taken from the Hugging Face Mistral documentation
import json
import sagemaker
import boto3
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

# Resolve the IAM role used for the deployment. Outside a SageMaker
# environment `get_execution_role()` raises ValueError, in which case the
# role is looked up by name through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role_description = iam.get_role(RoleName="sagemaker_execution_role")
    role = role_description["Role"]["Arn"]

# Environment passed to the inference container: which Hugging Face Hub model
# to serve (https://huggingface.co/models) and how many GPUs to use.
hub = {
    "HF_MODEL_ID": "mistralai/Mistral-7B-Instruct-v0.2",
    "SM_NUM_GPUS": json.dumps(1),
}

# Describe the model: LLM serving image (pinned version), environment and role.
huggingface_model = HuggingFaceModel(
    image_uri=get_huggingface_llm_image_uri("huggingface", version="1.4.2"),
    env=hub,
    role=role,
)

# Create the SageMaker inference endpoint: one GPU instance, and up to
# 300 seconds for the container to pass its startup health check.
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.2xlarge",
    container_startup_health_check_timeout=300,
)
  
  

In [None]:
# Keep the name of the deployed endpoint; the LangChain wrapper configured
# further down connects to the endpoint by this name.
endpoint_name = predictor.endpoint_name
# Bare expression so the notebook displays the value as the cell output.
endpoint_name

## Configure LLM in LangChain

**Parameters:**
**max_new_tokens**: This parameter specifies the maximum number of new tokens (text units, such as words or subwords) that the model will generate as output. Limiting this quantity can be useful to control the length of the generated output.

**top_p**: Also known as "nucleus sampling" or "nucleus decoding," this parameter restricts generation to the smallest set of most-probable tokens whose cumulative probability reaches a threshold (top_p); the next token is sampled only from that set. Lowering top_p cuts off the unlikely tail of the distribution, which helps prevent the model from generating improbable or irrelevant text.

**temperature**: This parameter affects the smoothness of probability distributions during generation. A higher value of temperature (e.g., 1.0) makes the generation more diverse but potentially less coherent, while a lower value (e.g., 0.8) tends to produce more deterministic and coherent outputs.

In [None]:
# Generation parameters (described in the notes above). They are handed to the
# SagemakerEndpoint wrapper and forwarded by the content handler as the
# "parameters" field of each request.
model_kwargs = {"max_new_tokens": 512, "top_p": 0.8, "temperature": 0.8}

In [None]:
# This class converts between LangChain prompts and the JSON payloads exchanged with the model endpoint.
class ContentHandler(LLMContentHandler):
    """Serialize prompts to, and parse completions from, the endpoint's JSON format."""

    # MIME types of the request body and of the expected response.
    content_type = "application/json"
    accepts = "application/json"

    # Marker that closes the instruction part of a Mistral prompt.
    _INSTRUCTION_END = "[/INST]"

    def transform_input(self, prompt: str, model_kwargs: Dict) -> bytes:
        """Build the UTF-8 encoded JSON request body for ``prompt``.

        The prompt is wrapped in the Mistral instruction format, see
        https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1, and
        ``model_kwargs`` is sent as the generation ``parameters``.
        """
        payload = {
            "inputs": f"<s>[INST] {prompt} {self._INSTRUCTION_END}",
            "parameters": {**model_kwargs},
        }
        return json.dumps(payload).encode("utf-8")

    def transform_output(self, output: bytes) -> str:
        """Extract the model's answer from the endpoint response.

        ``output`` is consumed through ``.read()``, so at runtime it is a
        stream-like object; the ``bytes`` annotation is kept unchanged
        (presumably it mirrors the base-class signature - TODO confirm).

        Returns the text that follows the first ``[/INST]`` marker. When the
        response does not echo the prompt (no marker present), the generated
        text is returned unchanged instead of raising ``IndexError``.
        """
        response_json = json.loads(output.read().decode("utf-8"))
        generated_text = response_json[0]["generated_text"]

        _, marker, completion = generated_text.partition(self._INSTRUCTION_END)
        if not marker:
            # Prompt was not echoed back: the whole text is the completion.
            return generated_text
        # Drop the single space that separates the marker from the answer;
        # any later occurrence of the marker is kept as part of the answer.
        return completion[1:] if completion.startswith(" ") else completion

content_handler = ContentHandler()

In [None]:
# Expose the deployed SageMaker endpoint to LangChain as an LLM object.
# The boto3 "sagemaker-runtime" client is created here and passed in explicitly.
sm_client = boto3.client("sagemaker-runtime") # needed for AWS credentials

# endpoint_name: the endpoint deployed earlier in this notebook
# model_kwargs: generation parameters attached to every request
# content_handler: converts prompts/responses to and from the endpoint's JSON
llm = SagemakerEndpoint(
    endpoint_name=endpoint_name,
    model_kwargs=model_kwargs,
    content_handler=content_handler,
    client=sm_client,
)

## RAG process with md files

In [None]:
# Importing library to process markdown documents
from markdown2 import markdown_path

class MarkdownLoader:
    """Read markdown documents as raw text from local paths or ``s3://`` URIs."""

    _S3_SCHEME = "s3://"

    @staticmethod
    def _split_s3_uri(uri):
        """Split ``s3://bucket/key`` into ``(bucket, key)``.

        Raises:
            ValueError: if the bucket name or the object key is missing.
        """
        bucket, _, key = uri[len(MarkdownLoader._S3_SCHEME):].partition("/")
        if not bucket or not key:
            raise ValueError(f"Not a valid S3 object URI: {uri!r}")
        return bucket, key

    def load_document(self, file_path):
        """Return the UTF-8 decoded content of the markdown document.

        Args:
            file_path: a local filesystem path, or an ``s3://bucket/key`` URI.

        Returns:
            The document content as a single string.
        """
        location = str(file_path)
        if location.startswith(self._S3_SCHEME):
            # Imported lazily so that reading local files does not need boto3.
            import boto3

            bucket, key = self._split_s3_uri(location)
            body = boto3.client("s3").get_object(Bucket=bucket, Key=key)["Body"]
            return body.read().decode("utf-8")

        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()


from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA

### Uploading files to an S3 bucket


In [None]:
# Define S3 bucket and prefix for markdown storage:
# the session's default bucket, and the key prefix the documents are listed under.

bucket = sagemaker.Session().default_bucket()
prefix = "clementine-documentation"

In [None]:
# Build the list of S3 URIs of every object stored under the prefix

s3 = boto3.client("s3")

# A single list_objects_v2 call returns at most 1,000 keys, so walk every page.
# A page has no "Contents" entry when nothing matches the prefix.
pages = s3.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=prefix)
objs = [
    obj
    for page in pages
    for obj in page.get("Contents", [])
    if not obj["Key"].endswith("/")  # skip folder placeholder objects
]
if not objs:
    # Fail with an explicit message instead of a bare KeyError('Contents').
    raise ValueError(f"No objects found under s3://{bucket}/{prefix}")

uris = [f's3://{bucket}/{obj["Key"]}' for obj in objs]
uris

### Load the markdown documents and split them into chunks

In [None]:
# Spliting the markdown files into chunks of a 256 size
%%time


markdown_loader = MarkdownLoader()  
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)

all_chunks = []

for uri in uris:
    document = markdown_loader.load_document(uri)  
    chunks = splitter.split_documents(document)
    all_chunks += chunks
    print(f"Loaded {uri}, {len(chunks)} chunks")


### Embed document chunks and store them in a vector store

In [None]:
# Define embedding model
# The same embeddings object is used below to build the vector store from the chunks.

# Model identifier handed to HuggingFaceEmbeddings
# (presumably resolved through the sentence-transformers package installed above).
embedding_model_id = "all-MiniLM-L6-v2"

embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_id,
)

In [None]:
# Creating the database
%%time
# Embed chunks
embeddings_db = Chroma.from_documents(all_chunks, embeddings)


********

### Configure RAG using chain

In [None]:
# Retriever view over the vector store; k=10 is the number of chunks requested per query.
retriever = embeddings_db.as_retriever(search_kwargs={"k": 10})

In [None]:
# Define prompt template
# The text instructs the model to answer only from the supplied context and to
# admit when it does not know. `{question}` and `{context}` are placeholders.
prompt_template = """
Clementine: please answer the question below, focusing on numerical data and using only the context below.
Don't invent facts. If you can't provide a factual answer, say you don't know what the answer is.

question: {question}

context: {context}
"""

# Declare the two placeholders as the template's input variables.
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

In [None]:
# Creating Chatbot: a retrieval question-answering chain that combines the
# SageMaker-hosted LLM, the retriever and the custom prompt.
# NOTE(review): per the LangChain docs, chain_type="stuff" places all retrieved
# documents into a single prompt - confirm it fits the model's context window.
chain = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff",
    retriever=retriever, 
    chain_type_kwargs = {"prompt": prompt})

### Asking questions

In [None]:
question = "What are all regions where AWS Sagemaker is available?"
# `Chain.run` is deprecated in LangChain 0.1; `invoke` returns a dict whose
# "result" entry holds the generated answer.
answer = chain.invoke({"query": question})["result"]
print(answer)

In [None]:
question = " How to check if an endpoint is KMS encrypted?"
# `Chain.run` is deprecated in LangChain 0.1; `invoke` returns a dict whose
# "result" entry holds the generated answer.
answer = chain.invoke({"query": question})["result"]
print(answer)

## Delete endpoint and model

In [None]:
# Clean up the resources created by `deploy` earlier in this notebook:
# first the model, then the endpoint.
predictor.delete_model()
predictor.delete_endpoint()