In [None]:
!pip install langchain==0.2.5 langchain-community==0.2.5 langchain-core==0.2.9 langchain-openai==0.1.9 bitsandbytes accelerate xformers triton transformers sentence-transformers datasets peft trl

- Llama3-8B is the base model, which simply completes the input prompt, whereas Llama3-8B Instruct is fine-tuned for instruction following and for multi-turn conversation templates, so the assistant's completions read as chat responses.

- If your purpose is chat completion, the Instruct model is the best choice; if you only need plain completion of the input, the base model is fine. Note that the base model may keep generating until max_seq_len is reached.

- Llama 3 8B model has a knowledge cut-off of March, 2023.
- Llama 3 70B model has a knowledge cut-off of December, 2023.

# Use the one directly from meta

## Create the pipeline

### 1. Importing Libraries:

- Various necessary modules from the Transformers library and PyTorch are imported. langchain.llms is also imported for integration with LangChain.

In [None]:
import transformers
import torch
from torch import cuda
from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline, StoppingCriteria, StoppingCriteriaList, AutoConfig, AutoModelForCausalLM
from langchain.llms import HuggingFacePipeline

In [None]:
import os
import configparser


def credential_init(credential_file="credentials.ini"):
    """
    Load API keys from an INI file and export them as environment variables.

    The file is read with ``configparser`` and must contain one section per
    service, each with an ``api_key`` option:

    [openai]
    api_key = your_openai_api_key

    [SERPER_API_KEY]
    api_key = your_serper_api_key

    [TAVILY_API_KEY]
    api_key = your_tavily_api_key

    [HuggingFace_API_KEY]
    api_key = your_huggingface_api_key

    The values are exported as ``OPENAI_API_KEY``, ``SERPER_API_KEY``,
    ``TAVILY_API_KEY`` and ``HuggingFace_API_KEY`` respectively.

    Args:
        credential_file: Path of the INI file. Defaults to 'credentials.ini',
            resolved against the current working directory.

    Raises:
        FileNotFoundError: If ``credential_file`` does not exist or cannot be read.
        KeyError: If a required section or its ``api_key`` option is missing.
            No environment variable is modified in that case.

    Example:
        Call it once at the beginning of the notebook:

        credential_init()
    """
    credentials = configparser.ConfigParser()

    # ConfigParser.read() skips files it cannot open and returns the list of
    # files it actually parsed, so an empty result means the file is missing.
    if not credentials.read(credential_file):
        raise FileNotFoundError(
            f"Credential file not found or unreadable: {credential_file!r}"
        )

    # Environment variable name -> INI section that holds its 'api_key'.
    section_for_env = {
        'OPENAI_API_KEY': 'openai',
        'SERPER_API_KEY': 'SERPER_API_KEY',
        'TAVILY_API_KEY': 'TAVILY_API_KEY',
        'HuggingFace_API_KEY': 'HuggingFace_API_KEY',
    }

    # Resolve every key before touching os.environ: indexing (instead of
    # .get) raises KeyError for a missing section/option, and a failure
    # must not leave the environment half-updated.
    resolved = {
        env_name: credentials[section]['api_key']
        for env_name, section in section_for_env.items()
    }
    os.environ.update(resolved)

credential_init()

In [None]:
stop_token_ids = None
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

HF_TOKEN = os.environ['HuggingFace_API_KEY']

device = f"cuda:{cuda.current_device()}" if cuda.is_available() else 'cpu'

### 2. Bits and Bytes Configuration:

- Configures the model to use 4-bit quantization to reduce memory usage and computation cost.

In [None]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16)

### 3. Creating the Tokenizer:

- Defines a function to create a tokenizer from a pre-trained model. Stop token IDs (sequences of tokens that, when encountered, stop the text generation) are not set up here at present; generation is ended through the `eos_token_id` terminators passed to the pipeline in step 7.

In [None]:
def create_tokenizer():
    """Load the tokenizer for ``model_id`` from the Hugging Face Hub.

    Reads the module-level ``model_id`` and ``HF_TOKEN`` values.

    Returns:
        The tokenizer returned by ``AutoTokenizer.from_pretrained``.
    """
    # `token` replaces the deprecated `use_auth_token` keyword and matches
    # the keyword already used when the model itself is loaded.
    return transformers.AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)

### 4. Stopping Criteria Class:

- Custom stopping criteria class that stops text generation when certain sequences of tokens (stop tokens) are generated.

In [None]:
# class StopOnTokens(StoppingCriteria):

#   def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:

#     global stop_token_ids

#     # print(f"input_ids: {input_ids}")
#     # print(f"content: { tokenizer.decode(input_ids[0])}")
#     for stop_ids in stop_token_ids:
#         if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
#               # print("Stopping")
#             return True
#     return False

### 5. Loading the Model Configuration and Model:

- Loads the configuration and the model from Hugging Face with specified parameters, including the bits and bytes configuration for quantization.

In [None]:
# Model configuration for `model_id`, fetched with the Hugging Face access token.
# `token` replaces the deprecated `use_auth_token` keyword and matches the
# keyword used for the model load below.
model_config = AutoConfig.from_pretrained(
    model_id,
    token=HF_TOKEN,
)

# Load the causal LM with the quantization settings from `bnb_config`;
# device_map="auto" leaves weight placement to the library.
# NOTE(review): trust_remote_code=True allows code shipped in the model
# repository to run locally - confirm it is really needed for this model.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
    token=HF_TOKEN,
    config=model_config,
)

### 6. Setting Up Stopping Criteria:

- Creates a list of stopping criteria with the custom StopOnTokens class.

In [None]:
# stopping_criteria = StoppingCriteriaList([StopOnTokens()])

### 7. Tokenizing and Generating Text:

- Initializes the tokenizer, sets padding token, and defines terminators. Then it sets up a text generation pipeline with specific parameters like temperature, max new tokens, and stopping criteria.

In [None]:
tokenizer = create_tokenizer()

"""
Source Code:

if generation_config.pad_token_id is None and generation_config.eos_token_id is not None:
    if model_kwargs.get("attention_mask") is None:
        logger.warning(
            "The attention mask and the pad token id were not set. As a consequence, you may observe "
            "unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results."
        )
    eos_token_id = generation_config.eos_token_id
    if isinstance(eos_token_id, list):
        eos_token_id = eos_token_id[0]
    logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{eos_token_id} for open-end generation.")
    generation_config.pad_token_id = eos_token_id

Discussion on the eos_token_id:

https://github.com/vllm-project/vllm/issues/4180
"""


# Token ids that end generation: <|end_of_text|> and the end-of-turn
# marker <|eot_id|>.
eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
terminators = [tokenizer.convert_tokens_to_ids("<|end_of_text|>"), eot_token_id]

# Sampling / decoding options forwarded to the pipeline.
generation_settings = dict(
    do_sample=True,
    temperature=0.5,         # sampling temperature ("randomness" of the output)
    top_p=0.5,
    repetition_penalty=1.2,  # without this the output begins repeating
    max_new_tokens=512,      # upper bound on the number of generated tokens
    eos_token_id=terminators,
    pad_token_id=eot_token_id,  # set explicitly instead of relying on the eos fallback
)

generate_text = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    **generation_settings,
)

- stopping_criteria (StoppingCriteriaList, optional)

    - Custom stopping criteria that complements the default stopping criteria built from arguments and a generation config. If a stopping criteria is passed that is already created with the arguments or a generation config an error is thrown. If your stopping criteria depends on the scores input, make sure you pass return_dict_in_generate=True, output_scores=True to generate. This feature is intended for advanced users.

- Special tokens that can be used at generation time

    - pad_token_id (int, optional) — The id of the padding token.
    - bos_token_id (int, optional) — The id of the beginning-of-sequence token.
    - eos_token_id (Union[int, List[int]], optional) — The id of the end-of-sequence token. Optionally, use a list to set multiple end-of-sequence tokens.

### LLama3 Standard Template

- <|begin_of_text|>: This is equivalent to the BOS token
- <|eot_id|>: This signifies the end of the message in a turn
- <|start_header_id|>{role}<|end_header_id|>: These tokens enclose the role for a particular message. The possible roles can be: `system`, `user`, `assistant`
- <|end_of_text|>: This is equivalent to the EOS token. On generating this token, Llama 3 will cease to generate more tokens.

In [None]:
prompt = """
<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
You are a honest and unbiased AI assistant who answer User queries with accurate responses.
<|eot_id|>
<|start_header_id|>user<|end_header_id|>
What's the capital of Australia?
<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
"""

In [None]:
prompt

In [None]:
llm = HuggingFacePipeline(pipeline=generate_text)

In [None]:
output = llm.invoke(prompt)

In [None]:
output

In [None]:
prompt = """
<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
You are a honest and unbiased AI assistant who answer User queries with accurate responses.
<|eot_id|>
<|start_header_id|>user<|end_header_id|>
What's the capital of Taiwan?
<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
"""
output = llm.invoke(prompt)
print(output)

In [None]:
prompt = """
<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
You are a honest and unbiased AI assistant who answer User queries with accurate responses.
<|eot_id|>
<|start_header_id|>user<|end_header_id|>
What's the capital of Taiwan? According to the constitution of Taiwan.
<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
"""
output = llm.invoke(prompt)
print(output)

In [None]:
prompt = """
<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
You are a honest and unbiased AI assistant who answer User queries with accurate responses.
<|eot_id|>
<|start_header_id|>user<|end_header_id|>
What's the territory of Republic of China? According to the constitution of Taiwan.
<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
"""
output = llm.invoke(prompt)
print(output)

### Can we invoke with ChatPromptTemplate directly?

In [None]:
from langchain.prompts import PromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate

system_prompt = PromptTemplate(template="""You are a honest and unbiased AI assistant who answer User queries with accurate responses.""")

system_message = SystemMessagePromptTemplate(prompt=system_prompt)

human_prompt = PromptTemplate(template='{query}', input_variables=["query"])

human_message = HumanMessagePromptTemplate(prompt=human_prompt)

chat_prompt = ChatPromptTemplate.from_messages([system_message, human_message])

In [None]:
chain = chat_prompt|llm

In [None]:
output = chain.invoke("What's the capital of Australia?")

In [None]:
output

In [None]:
prompt = chat_prompt.invoke("What's the capital of Australia?")

In [None]:
prompt.messages

In [None]:
for message in prompt.messages:
    print(message.type, message.content)

### How to construct the prompt properly from messages?

#### Attempt 1

In [None]:
prompt_template = """<|begin_of_text|>"""

for message in prompt.messages:
    if message.type == "system":
        prompt_template += f"<|start_header_id|>system<|end_header_id|>{message.content}<|eot_id|>"
    elif message.type == "human":
        prompt_template += f"<|start_header_id|>user<|end_header_id|>{message.content}<|eot_id|>"

prompt_template += f"<|start_header_id|>assistant<|end_header_id|>"

In [None]:
llm.invoke(prompt_template)

#### Attempt 2

In [None]:
prompt_template = """\n<|begin_of_text|>"""

for message in prompt.messages:
    if message.type == "system":
        prompt_template += f"\n<|start_header_id|>system<|end_header_id|>\n{message.content}\n<|eot_id|>"
    elif message.type == "human":
        prompt_template += f"\n<|start_header_id|>user<|end_header_id|>{message.content}\n<|eot_id|>"

prompt_template += f"\n<|start_header_id|>assistant<|end_header_id|>\n"

In [None]:
llm.invoke(prompt_template)

In [None]:
from langchain_core.runnables import chain

@chain
def llama3_prompt_parser(prompt):
    """Render a chat prompt value as a single Llama 3 prompt string.

    Each message becomes one turn of the form
    ``\\n<|start_header_id|>{role}<|end_header_id|>\\n{content}\\n<|eot_id|>``
    and the string ends with an open assistant header so that the model
    produces the next assistant turn.

    Message types are mapped to Llama 3 roles as follows:
    ``system`` -> ``system``, ``human`` -> ``user``, ``ai`` -> ``assistant``.
    Handling ``ai`` matters for few-shot prompts: without it the example
    answers would be dropped and only the example questions would remain.

    Args:
        prompt: An object exposing ``messages``; every message must provide
            ``type`` and ``content``.

    Returns:
        str: The assembled prompt text.

    Raises:
        ValueError: If a message has a type with no Llama 3 role mapping
            (silently skipping it would corrupt the conversation).
    """
    role_by_type = {"system": "system", "human": "user", "ai": "assistant"}

    parts = ["\n<|begin_of_text|>"]

    for message in prompt.messages:
        role = role_by_type.get(message.type)
        if role is None:
            raise ValueError(
                f"Unsupported message type for the Llama 3 template: {message.type!r}"
            )
        # Same layout for every role: header, newline, content, newline, end-of-turn.
        parts.append(
            f"\n<|start_header_id|>{role}<|end_header_id|>\n{message.content}\n<|eot_id|>"
        )

    # Open assistant turn that the model is expected to complete.
    parts.append("\n<|start_header_id|>assistant<|end_header_id|>\n")

    return "".join(parts)

In [None]:
pipeline_ = chat_prompt|llama3_prompt_parser|llm

pipeline_.invoke("What's the capital of Australia?")

## Retrieval

Because this might be the most frequent functionality you will use in your work.

source: https://github.com/langchain-ai/langgraph/blob/main/examples/rag/langgraph_rag_agent_llama3_local.ipynb

In [None]:
!pip install faiss-cpu

In [None]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

urls = [
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
    "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
    "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
]

docs = [WebBaseLoader(url).load() for url in urls]
docs_list = [item for sublist in docs for item in sublist]

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=250, chunk_overlap=0
)
doc_splits = text_splitter.split_documents(docs_list)

embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Index the document chunks in a FAISS vector store.
vectorstore = FAISS.from_documents(
    documents=doc_splits,
    embedding=embedding,
)

# The number of hits has to be passed through `search_kwargs`; a bare `k=1`
# keyword on as_retriever() is not a search parameter, so the retriever
# would keep returning its default number of documents.
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

In [None]:
system_prompt = PromptTemplate(template="""You are a grader assessing relevance
    of a retrieved document to a user question. If the document contains keywords related to the user question,
    grade it as relevant. It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question. \n
    Provide the binary score as a JSON with a single key 'score' and no premable or explanation.""")

system_message = SystemMessagePromptTemplate(prompt=system_prompt)

human_prompt = PromptTemplate(template="""Here is the retrieved document: \n\n {document} \n\n
Here is the user question: {question} \n """, input_variables=["document", "question"])

human_message = HumanMessagePromptTemplate(prompt=human_prompt)

chat_prompt = ChatPromptTemplate.from_messages([system_message, human_message])

## Create the prompt

In [None]:
from langchain_core.runnables import RunnablePassthrough

@chain
def format_docs(docs):
    """Concatenate the ``page_content`` of every document, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

input_step = {"question": RunnablePassthrough(), "document": retriever|format_docs}

pipeline_ = input_step|chat_prompt|llama3_prompt_parser|llm

In [None]:
pipeline_.invoke("agent memory")

In [None]:
# Let us apply the structure
from langchain.output_parsers import StructuredOutputParser, ResponseSchema

response_schemas = [
        ResponseSchema(name="score", description="a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question")
    ]

output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

format_instructions = output_parser.get_format_instructions()


system_prompt = PromptTemplate.from_template("""You are a grader assessing relevance
    of a retrieved document to a user question. If the document contains keywords related to the user question,
    grade it as relevant. It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.""")

system_message = SystemMessagePromptTemplate(prompt=system_prompt)

human_prompt = PromptTemplate(template="""Here is the retrieved document:
                      {document}
                      Here is the user question: {question}.
                      format instruction: {format_instructions}
                      """,
                      input_variables=["document", "question"],
                      partial_variables={"format_instructions": format_instructions})

human_message = HumanMessagePromptTemplate(prompt=human_prompt)

chat_prompt = ChatPromptTemplate.from_messages([system_message, human_message])

In [None]:
input_step = {"question": RunnablePassthrough(), "document": retriever|format_docs}


pipeline_ = input_step |chat_prompt|llama3_prompt_parser|llm|output_parser
pipeline_.invoke("agent memory")

## N-Shot

In [None]:
import pandas as pd
from langchain.docstore.document import Document

df_query = pd.read_excel("100cases_for testing.xlsx")
df_key = pd.read_excel("HFACS_Benchmark.xlsx")


embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

def _cell_text(value):
    """Return a spreadsheet cell as text, mapping empty cells (NaN/None) to ''."""
    return "" if pd.isna(value) else str(value)


# One Document per benchmark row: the narrative is the searchable text and
# the HFACS labels travel along as metadata.
documents = []

for _, row in df_key.iterrows():
    narrative = _cell_text(row["Combined Narratives"])
    if not narrative:
        # A row without a narrative has nothing to embed or to show as an example.
        continue

    # row.get(..., "") only falls back to "" when the column is absent; an
    # empty cell comes back as NaN, so normalise the value as well.
    documents.append(
        Document(page_content=narrative,
                 metadata={"level_1": _cell_text(row.get('Level 1: Unsafe Acts', "")),
                           "level_2": _cell_text(row.get('Level 2: Preconditions for Unsafe Acts', ""))})
    )

vectorstore = FAISS.from_documents(documents, embedding=embedding)

embedding_retriever = vectorstore.as_retriever(search_type="similarity",
                                              search_kwargs={'k': 5})

In [None]:
from langchain_core.prompts.few_shot import FewShotChatMessagePromptTemplate


system_template = """
You are an expert in aviation safety accident analysis. You are highly analytical and pay close attention to details. I want you to analyse the accident narratives based on The Human Factors Analysis and Classification System (HFACS).
Tell me which categories of errors caused the accident, only identify the factor that belongs to HFACS level 1: Unsafe acts and Level 2: Precondition for unsafe acts.

There could be multiple errors in this level or none at all. I only want errors that can be directly deducted from the narratives, no speculations or guesses.

The candidates of the output are from:
level_1:
- `Decision Errors`
- `Skill-Based Errors`
- `Perceptual Errors`
- `Routine violation`
- `Violation: Exceptional`
level_2:
- `Physical Environment`
- `Technological Environment`
- `Adverse Mental State`
- `Adverse Physiological State`
- `Physical/Mental Limitations`
- `Crew Resource Management`
- `Personal Readiness`
"""


response_schemas = [
        ResponseSchema(name="level_1", description="HFACS Level_1 evaluation as a python list"),
        ResponseSchema(name="level_2", description="HFACS Level_2 evaluation as a python list")
    ]

output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

format_instructions = output_parser.get_format_instructions()


@chain
def few_shot_prompt_fn(data):
    """Assemble the HFACS few-shot chat prompt for one incident narrative.

    Args:
        data: Mapping with two entries:
            'examples' - documents used as worked examples; each provides
            ``page_content`` and ``metadata`` with 'level_1' and 'level_2'.
            'input' - the incident narrative to classify.

    Returns:
        A ChatPromptTemplate made of the system message, the few-shot
        examples and the final human message. The narrative and the format
        instructions are pre-filled as partial variables.
    """
    # One human/ai exchange per example document.
    shots = [
        {"input": doc.page_content,
         "level_1": doc.metadata['level_1'],
         "level_2": doc.metadata['level_2']}
        for doc in data['examples']
    ]

    shot_layout = ChatPromptTemplate.from_messages(
        [('human', '{input}'), ('ai', 'level_1 error: [{level_1}], level_2 error: [{level_2}]')]
    )
    shots_block = FewShotChatMessagePromptTemplate(
        example_prompt=shot_layout,
        examples=shots,
    )

    system_part = SystemMessagePromptTemplate(
        prompt=PromptTemplate.from_template(system_template)
    )

    # Both placeholders are bound here, so the template needs no further input.
    question_part = HumanMessagePromptTemplate(
        prompt=PromptTemplate(
            template='{incident_narrative} \n format instruction: {format_instructions}',
            partial_variables={"format_instructions": format_instructions,
                               "incident_narrative": data['input']},
        )
    )

    return ChatPromptTemplate.from_messages([system_part, shots_block, question_part])

chat_prompt = {"examples": embedding_retriever, "input": RunnablePassthrough()}|few_shot_prompt_fn
pipeline_ = chat_prompt|llama3_prompt_parser|llm|output_parser

In [None]:
df_query.head(5)

In [None]:
df_query.iloc[0]['Incident Narratives']

In [None]:
pipeline_

In [None]:
pipeline_.invoke(df_query.iloc[0]['Incident Narratives'])

## Can we do this faster?

https://ollama.com/library/llama3/tags

*** Under Construction ***

In [None]:
from IPython.display import Image

Image(url='https://ollama.com/public/ollama.png')

## Finetuning

...Under Test...

It is quite complicated ...