In [25]:
from IPython.display import display, Markdown, clear_output

def display_md(text):
    """Clear the current cell output, then render `text` as Markdown."""
    clear_output()
    rendered = Markdown(text)
    display(rendered)

In [1]:
import getpass
import os

if "GROQ_API_KEY" not in os.environ:
    # Prompt for the key instead of embedding it: a literal secret in a notebook
    # ends up in version control and in every shared copy of the file.
    # NOTE(review): a literal key used to be assigned here; treat it as leaked
    # and revoke it with the provider.
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your Groq API key: ")

In [2]:
from langchain_groq import ChatGroq

In [33]:
# General-purpose chat model for free-form answers: temperature is pinned to 0,
# no explicit token or timeout limit is set, and up to 2 retries are requested.
llm = ChatGroq(
    max_retries=2,
    timeout=None,
    max_tokens=None,
    temperature=0,
    model="llama-3.3-70b-versatile",
)

In [None]:
# Same model and settings as `llm`, but every completion is requested as a JSON object.
llm_json_mode = ChatGroq(
    model="llama-3.3-70b-versatile",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    # `response_format` is a request option, not a declared ChatGroq constructor
    # field, so pass it explicitly through `model_kwargs` rather than as a bare
    # keyword argument.
    # NOTE(review): per the langchain-groq docs, undeclared kwargs are moved into
    # `model_kwargs` with a warning; confirm against the installed version.
    model_kwargs={"response_format": {"type": "json_object"}},
)

In [44]:
# Two-turn conversation as (role, text) pairs: a system instruction followed by
# the human question.
system_turn = (
    "system",
    "You are a special kind of agent that when i give you an question, you will get answer for that question by solve that in step-by-step.",
)
human_turn = ("human", "How many `r` letters in the word `strawberry`")

messages = [system_turn, human_turn]
ai_msg = llm.invoke(messages)

In [None]:
# Echo the object returned by `llm.invoke(messages)` as this cell's output.
ai_msg

In [None]:
from IPython.display import display, Markdown

# Render the reply's text content as Markdown in the cell output.
reply_markdown = Markdown(ai_msg.content)
display(reply_markdown)

### Search


In [1]:
import os
import getpass

def _set_env(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"{var}")
        
# Ask for the Tavily key only when the environment does not already provide one.
_set_env("TAVILY_API_KEY")
# NOTE(review): presumably read by the tokenizer library behind the embeddings — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

### Tracing


In [72]:
# Prompt for the LangSmith key if it is missing, then set the tracing flag and
# the project name in the environment.
_set_env("LANGSMITH_API_KEY")
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_PROJECT": "self-correcting-rag-system",
    }
)

### Vectorstore


In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import SKLearnVectorStore
from langchain_nomic.embeddings import NomicEmbeddings

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [3]:
# Source pages to index: three posts from the same blog, identified by their slugs.
urls = [
    f"https://lilianweng.github.io/posts/{post_slug}/"
    for post_slug in (
        "2023-06-23-agent",
        "2023-03-15-prompt-engineering",
        "2023-10-25-adv-attack-llm",
    )
]

In [4]:
# load documents
# One loader call per URL; `docs` keeps the results grouped per URL (a list of lists).
docs = [WebBaseLoader(page_url).load() for page_url in urls]
# Flatten the per-URL groups into one flat list, preserving order.
doc_list = [loaded_doc for per_url_docs in docs for loaded_doc in per_url_docs]

In [5]:
# split documents
# Chunks of size 1000 with an overlap of 200 between neighbours.
# NOTE(review): sizes are presumably counted in tiktoken tokens, given the constructor used — confirm.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_overlap=200, chunk_size=1000)
doc_splits = text_splitter.split_documents(doc_list)

In [9]:
# add to vectorDB
# Embedding model configured for local inference; it is handed to the vector
# store, which is built from the split chunks.
embedding_model = NomicEmbeddings(model="nomic-embed-text-v1", inference_mode="local")
vectorstore = SKLearnVectorStore.from_documents(documents=doc_splits, embedding=embedding_model)



In [16]:
# create retriever
# The number of results must be passed inside `search_kwargs`, which the
# retriever forwards to the vector store's search call.
# NOTE(review): a bare `k=3` keyword is not a retriever field and is not applied
# to the search, so the store's default result count was being used — confirm
# against the installed langchain-core version.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

### Components


#### Router


In [3]:
import json
from langchain_core.messages import HumanMessage, SystemMessage

In [4]:
# System prompt for the router: asks the model to answer with a JSON object whose
# single key `datasource` is either 'websearch' or 'vectorstore'.
# The literal (including its indentation and blank lines) is sent to the model as-is.
router_instructions = """
    You are an expert at routing a user question to a vectorstore or web search.
    
    The vectorstore contains documents related to agents, prompt engineering, and adversarial attacks.
    
    Use the vectorstore for questions on these topics. For all else, and especially for current events, use web-search.
    
    Return JSON with single key, datasource, that is 'websearch' or 'vectorstore' depending on the question.
"""

In [18]:
# test router
def _route_question(question):
    """Ask the JSON-mode model to route `question`.

    Builds the two-message conversation (router instructions as the system
    message, `question` as the human message) and returns whatever
    `llm_json_mode.invoke` returns for it.
    """
    return llm_json_mode.invoke(
        [SystemMessage(content=router_instructions)]
        + [HumanMessage(content=question)]
    )


# Two questions outside the indexed topics (current events / new releases).
test_web_search = _route_question(
    "Who is favored to win the NFC Championship game in the 2024 season?"
)
test_web_search_2 = _route_question(
    "What are the models released today for llama 3.2?"
)

# One question on a topic the router instructions assign to the vectorstore.
test_vector_store = _route_question("What are the types of agent memory?")

In [None]:
# Parse each router reply's JSON text and print the resulting object, one per line.
for router_reply in (test_web_search, test_web_search_2, test_vector_store):
    print(json.loads(router_reply.content))

#### Retrieval Grader

In [13]:
# doc grader instructions
# System prompt for the relevance grader. The spelling of "relevance" and
# "semantic" is corrected so the instruction the model reads is unambiguous.
doc_grader_instructions = """
    You are a grader assessing relevance of a retrieved document to a user question.
    If the document contains keyword(s) or semantic meaning related to the question, grade it as relevant.
"""

# grader prompt
# Template with two placeholders, `{document}` and `{question}`, filled via `str.format`.
# No punctuation follows `{question}`: the question carries its own, and a trailing
# period rendered as "...?." in the formatted prompt.
doc_grader_prompt = """
    Here is the retrieved document: \n\n {document} \n\n Here is the user question: \n\n {question}
    Carefully and objectively assess whether the document contains at least some information that is relevant to the question.
    Return JSON with single key, `binary_score`, that is 'yes' or 'no' score to indicate whether the document contains at least some information that is relevant to the question.
"""

In [22]:
# test
# Retrieve for a sample question, take the retrieved item at index 1, and fill
# the grader template with that item's text and the question.
question = "What is Chain of thought prompting?"
docs = retriever.invoke(question)
doc_text = docs[1].page_content
doc_grader_prompt_formatted = doc_grader_prompt.format(question=question, document=doc_text)




    Here is the retrieved document: 

 Use an iterative Monte Carlo search method to improve the best candidates by proposing semantically similar variants via prompts like Generate a variation of the following instruction while keeping the semantic meaning.\n\nInput: ...\n\nOutput:...


To construct chain-of-thought prompts automatically, Shum et al. (2023) suggested augment-prune-select, a three-step process:

Augment: Generate multiple pseudo-chains of thought given question using few-shot or zero-shot CoT prompts;
Prune: Prune pseudo chains based on whether generated answers match ground truths.
Select: Apply a variance-reduced policy gradient strategy to learn the probability distribution over selected examples, while considering the probability distribution over examples as policy and the validation set accuracy as reward.

Zhang et al. (2023) instead adopted clustering techniques to sample questions and then generates chains. They observed that LLMs tend to make certain types of mistakes. One type of errors can be similar in the emebedding space and thus get grouped together. By only sampling one or a few from frequent-error clusters, we can prevent too many wrong demonstrations of one error type and collect a diverse set of examples.

Question clustering: Embed questions and run $k$-means for clustering.
Demonstration selection: Select a set of representative questions from each cluster; i.e. one demonstration from one cluster. Samples in each cluster are sorted by distance to the cluster centroid and those closer to the centroid are selected first.
Rationale generation: Use zero-shot CoT to generate reasoning chains for selected questions and construct few-shot prompt to run inference.

Augmented Language Models#
A survey on augmented language models by Mialon et al. (2023) has great coverage over multiple categories of language models augmented with reasoning skills and the ability of using external tools. Recommend it.
Retrieval#
Often we need to complete tasks that require latest knowledge after the model pretraining time cutoff or internal/private knowledge base. In that case, the model would not know the context if we don’t explicitly provide it in the prompt. Many methods for Open Domain Question Answering depend on first doing retrieval over a knowledge base and then incorporating the retrieved content as part of the prompt. The accuracy of such a process depends on the quality of both retrieval and generation steps.
Lazaridou et al. (2022) studied how to use Google Search for document retrieval to augment LLMs. Given a question $q$, clean text is extracted out of 20 URLs returned by Google, resulting in a set of documents. Because these documents are long, each document is split into paragraphs of 6 sentences, $\{p\}$. Paragraphs are ranked by TF-IDF based cosine similarity between evidence paragraphs and the query. Only the most relevant paragraph is used in the prompt to produce an answer $a$.
For closed-book QA, each demonstration is formatted as follows to construct few-shot prompts. Swapping the question with the evidence (longer distance between questions and answers) is found to consistently yield lower results across all datasets.
Evidence: ...
Question: ...
Answer: ...
The answer probability is computed in three ways:

RAG style, $p(a_i \mid q) = \sum_{i=1}^n p_\text{tf-idf} (p_i \mid q) \cdot p_\text{LM}(a_i \mid q, p_i)$, where $p_\text{tf-idf} (p_i \mid q)$ is the normalized cosine similarities between the TF-IDF passage and question representations.
Noisy channel inference, $p(a_i\mid q) = \frac{p_\text{LM}(q \mid a_i, p_i) \cdot p_\text{LM}(a_i \mid p_i)}{p_\text{LM}(q \mid p_i)}$
Product-of-Experts (PoE), combines all probabilities used above in addition to $p_\text{LM}(p_i \mid q)$. 

 Here is the user question: 

 What is Chain of thought prompting?.
    This carefully and objectively assess the document contains at least some information that is relevant to the question.
    Return JSON with single key, `binary_score`, that is 'yes' or 'no' score to indicate whether the document is contains at least some information that is relevant to the question.
