# HyDE - Hypothetical Document Embeddings
    -> Instead of running retrieval directly on the user query, an INTERMEDIATE LLM first generates a (hypothetical) answer.
    -> The end user never sees this answer; it is only compared (by embedding similarity) with the source documents.
    -> Earlier we compared the user query with the documents in the RAG store (query-to-answer RAG);
       now we compare the hypothetical answer with the documents in the RAG store (answer-to-answer RAG).
    

In [1]:
from langchain.llms import OpenAI
from langchain.vectorstores import Chroma
import yaml, os, openai, textwrap, langchain
from langchain.prompts import PromptTemplate
from langchain.document_loaders import TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.chat_models import ChatOpenAI, AzureChatOpenAI
from langchain.chains import LLMChain, HypotheticalDocumentEmbedder, RetrievalQA

In [2]:
# Load the Azure OpenAI settings from a local YAML file and publish them as
# OPENAI_* environment variables for the cells below.
# safe_load only constructs plain YAML types (dicts, lists, scalars), which is
# all a flat credentials mapping needs; unlike yaml.load it cannot be made to
# build arbitrary Python objects from a tampered file.
# NOTE(review): the file name 'cadentials.yaml' looks like a misspelling of
# 'credentials.yaml' — confirm against the file on disk before renaming.
with open('cadentials.yaml', encoding='utf-8') as f:
    credentials = yaml.safe_load(f)

# A missing key raises KeyError here, before any API call is attempted.
os.environ["OPENAI_API_TYPE"] = credentials['AD_OPENAI_API_TYPE']
os.environ["OPENAI_API_VERSION"] = credentials['AD_OPENAI_API_VERSION']
os.environ["OPENAI_API_BASE"] = credentials['AD_OPENAI_API_BASE']
os.environ["OPENAI_API_KEY"] = credentials['AD_OPENAI_API_KEY']

# Method 01 - Single Hypothetical Generation

In [3]:
# Base embedding model: the text that finally gets embedded (the hypothetical
# answer) goes through this BGE model, with normalized output vectors.
# NOTE(review): device 'mps' presumably targets Apple-silicon GPUs — switch to
# 'cpu' or 'cuda' on other hardware.
bge_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs={'device': 'mps'},
    encode_kwargs={'normalize_embeddings': True},
)

# Chat model that writes the hypothetical answer (one completion per query).
llm = AzureChatOpenAI(
    deployment_name=credentials['AD_DEPLOYMENT_ID'],
    model_name=credentials['AD_ENGINE'],
    temperature=0.9,
    max_tokens=256,
)

# HyDE embedder built from the LLM and the base embeddings, using the
# built-in "web_search" prompt.
embeddings = HypotheticalDocumentEmbedder.from_llm(
    llm,
    bge_embeddings,
    prompt_key="web_search",
)

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [4]:
# Show the prompt the HyDE embedder sends to the LLM; {QUESTION} is the slot
# for the user query.
print(embeddings.llm_chain.prompt.template)

Please write a passage to answer the question 
Question: {QUESTION}
Passage:


In [5]:
# Enable LangChain's global debug flag so the following cells print their
# [llm/start] / [llm/end] traces, including the generated hypothetical answer.
langchain.debug = True

In [6]:
# Embed a query through HyDE: the debug trace shows the LLM first writing a
# passage for the question; `result` holds the embedder's return value.
result = embeddings.embed_query("What items does McDonalds make?")

[32;1m[1;3m[llm/start][0m [1m[1:llm:AzureChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: Please write a passage to answer the question \nQuestion: What items does McDonalds make?\nPassage:"
  ]
}
[36;1m[1;3m[llm/end][0m [1m[1:llm:AzureChatOpenAI] [3.53s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "McDonald's is a fast-food restaurant chain that is famous for its burgers, fries, and milkshakes. However, their menu offers much more than just those few items. Some of the most popular items that McDonald's makes include chicken sandwiches, breakfast sandwiches, salads, and wraps. In addition to these, McDonald's also offers a wide range of side items such as hash browns, mozzarella sticks, and apple slices. The restaurant also offers various beverages, including sodas, juices, and coffee. Moreover, McDonald's has special menus for kids, which include items like Happy Meals, which are designed to appeal to younger au

# Method 02 - Multiple Hypothetical Generation

Generate multiple hypothetical answers and aggregate all of their embeddings into a single embedding that represents the overall hypothetical answer.

In [7]:
# Same base embedding model as in Method 01.
# NOTE(review): device 'mps' presumably targets Apple-silicon GPUs — switch to
# 'cpu' or 'cuda' on other hardware.
bge_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs={'device': 'mps'},
    encode_kwargs={'normalize_embeddings': True},
)

# The only change from Method 01 is n=4: the model is asked for four
# completions per prompt, i.e. four hypothetical answers per query.
llm = AzureChatOpenAI(
    deployment_name=credentials['AD_DEPLOYMENT_ID'],
    model_name=credentials['AD_ENGINE'],
    temperature=0.9,
    max_tokens=256,
    n=4,
)

# HyDE embedder over the multi-completion LLM, still using the built-in
# "web_search" prompt.
embeddings = HypotheticalDocumentEmbedder.from_llm(
    llm,
    bge_embeddings,
    prompt_key="web_search",
)

In [8]:
# Embed a query with the multi-generation HyDE embedder; `result` holds the
# embedder's return value for this question.
result = embeddings.embed_query("What is McDonalds best selling item?")

[32;1m[1;3m[llm/start][0m [1m[1:llm:AzureChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: Please write a passage to answer the question \nQuestion: What is McDonalds best selling item?\nPassage:"
  ]
}
[36;1m[1;3m[llm/end][0m [1m[1:llm:AzureChatOpenAI] [2.06s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "McDonald's best selling item is none other than their iconic Big Mac sandwich. The Big Mac has been a fan favorite since its creation in 1967 and continues to dominate sales at McDonald's restaurants worldwide. The sandwich consists of two beef patties, special sauce, lettuce, cheese, pickles, and onions, all tucked between a sesame seed bun. It's no surprise that this classic sandwich has remained a top seller for over five decades, as it satisfies cravings for both savory meat and sweet sauce in one delicious bite. Despite McDonald's offering a vast menu of burgers, chicken sandwiches, and breakfast items, the B

# Method 03 - Custom Prompting

In [9]:
# Custom HyDE prompt: constrain the hypothetical answer to a single food item
# instead of a free-form passage.
prompt_template = """Please answer the user's question as a single food item
Question: {question}
Answer:"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["question"],
)

# `llm` is whichever model was bound last; in notebook order that is the
# n=4 model from Method 02.
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [10]:
# Build the HyDE embedder directly from the custom chain (instead of
# from_llm with a built-in prompt key).
embeddings = HypotheticalDocumentEmbedder(
    llm_chain=llm_chain,
    base_embeddings=bge_embeddings,
)

In [11]:
# Embed a query using the custom single-food-item prompt.
# NOTE(review): the query text contains a duplicated "is"; it is left untouched
# because the recorded debug output was produced with exactly this string.
result = embeddings.embed_query(
    "What is is McDonalds best selling item?"
)

[32;1m[1;3m[llm/start][0m [1m[1:llm:AzureChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: Please answer the user's question as a single food item\nQuestion: What is is McDonalds best selling item?\nAnswer:"
  ]
}
[36;1m[1;3m[llm/end][0m [1m[1:llm:AzureChatOpenAI] [834ms] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Big Mac",
        "generation_info": {
          "finish_reason": "stop"
        },
        "type": "ChatGeneration",
        "message": {
          "lc": 1,
          "type": "constructor",
          "id": [
            "langchain",
            "schema",
            "messages",
            "AIMessage"
          ],
          "kwargs": {
            "content": "Big Mac",
            "additional_kwargs": {}
          }
        }
      },
      {
        "text": "Big Mac",
        "generation_info": {
          "finish_reason": "stop"
        },
        "type": "ChatGeneration",
        "message": {
      

## Use HyDE for RAG

In [12]:
# One TextLoader per blog post that makes up the retrieval corpus.
loaders = [
    TextLoader(path)
    for path in (
        'data/langchain_blog_posts/blog.langchain.dev_announcing-langsmith_.txt',
        'data/langchain_blog_posts/blog.langchain.dev_benchmarking-question-answering-over-csv-data_.txt',
        'data/langchain_blog_posts/blog.langchain.dev_chat-loaders-finetune-a-chatmodel-in-your-voice_.txt',
    )
]

# Collect every loaded document into one flat list.
docs = []
for loader in loaders:
    docs += loader.load()

# Split into chunks (chunk_size=1000, no overlap). split_documents is used
# rather than split_text because the inputs are Document objects.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(docs)

In [13]:
# HyDE prompt for the RAG demo: steer the hypothetical answer towards the
# Large Language Model domain of the corpus.
prompt_template = """Please answer the user's question as related to Large Language Models
Question: {question}
Answer:"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["question"],
)

# Fresh single-completion chat model (no n=4 here).
llm = AzureChatOpenAI(
    deployment_name=credentials['AD_DEPLOYMENT_ID'],
    model_name=credentials['AD_ENGINE'],
    temperature=0.9,
    max_tokens=256,
)

# This chain is the hypothetical-answer generator used by the HyDE embedder.
llm_chain = LLMChain(
    prompt=prompt,
    llm=llm,
)

In [14]:
# HyDE embedder backed by the LLM-domain prompt chain.
embeddings = HypotheticalDocumentEmbedder(
    llm_chain=llm_chain,
    base_embeddings=bge_embeddings,
)

# Index the chunks in Chroma with the HyDE embedder as the embedding function.
# NOTE(review): presumably only queries are routed through the LLM while the
# chunks are embedded by the base model directly — confirm against the
# HypotheticalDocumentEmbedder implementation.
docsearch = Chroma.from_documents(texts, embeddings)

In [15]:
# Retrieve the chunks closest to the query; the debug trace shows the LLM
# generating a hypothetical answer for it first.
# Note: this rebinds `docs`, which previously held the raw loaded documents.
query = "What are chat loaders?"
docs = docsearch.similarity_search(query)

[32;1m[1;3m[llm/start][0m [1m[1:llm:AzureChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: Please answer the user's question as related to Large Language Models\nQuestion: What are chat loaders?\nAnswer:"
  ]
}
[36;1m[1;3m[llm/end][0m [1m[1:llm:AzureChatOpenAI] [1.52s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Chat loaders are software components that are used to load, initialize and configure large language models that are used in chatbots or other conversational AI applications. These loaders typically perform a number of functions, such as setting up the model's parameters and hyperparameters, initializing the internal state of the model, and loading its weights and other associated data. Chat loaders are an important part of building and deploying large language models in natural language processing and conversational AI.",
        "generation_info": {
          "finish_reason": "stop"
        },
        "ty

In [16]:
# Display the chunks returned by the similarity search.
docs

[Document(page_content='URL: https://blog.langchain.dev/chat-loaders-finetune-a-chatmodel-in-your-voice/\nTitle: Chat Loaders: Fine-tune a ChatModel in your Voice\n\nSummary\n\nWe are adding a new integration type, ChatLoaders, to make it easier to fine-tune models on your own unique writing style. These utilities help convert data from popular messaging platforms to chat messages compatible with fine-tuning formats like that supported by OpenAI.\n\nThank you to Greg Kamradt for Misbah Syed for their thought leadership on this.\n\nImportant Links:\n\nContext\n\nOn Tuesday, OpenAI announced improved fine-tuning support, extending the service to larger chat models like GPT-3.5-turbo. This enables anyone to customize these larger, more capable models for their own use cases. They also teased support for fine-tuning GPT-4 later this year.\n\nWhile fine-tuning is typically not advised for teaching an LLM substantially new knowledge or for factual recall; it is good for style transfer.', met

In [17]:
# Retrieval-QA chain on top of the HyDE-backed vector store.
# Here `llm` produces the final answer; the hypothetical answer used for
# retrieval comes from the llm_chain inside the HyDE embedder.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    # Fetch the three best-matching chunks per question.
    retriever=docsearch.as_retriever(search_kwargs={"k": 3}),
    # Keep the retrieved chunks in the response so their sources can be listed.
    return_source_documents=True,
)

In [18]:
def wrap_text_preserve_newlines(text, width=110):
    """Wrap *text* to at most *width* columns without losing its line breaks.

    Each newline-separated line is wrapped on its own with textwrap.fill and
    the wrapped pieces are joined back with newlines, so existing line breaks
    (including blank lines) survive.
    """
    return '\n'.join(
        textwrap.fill(paragraph, width=width)
        for paragraph in text.split('\n')
    )

def process_llm_response(llm_response):
    """Print the answer of a QA response followed by its source paths.

    *llm_response* is indexed by 'result' (the answer text, printed wrapped)
    and 'source_documents' (an iterable of objects whose ``metadata['source']``
    is printed one per line).
    """
    answer = wrap_text_preserve_newlines(llm_response['result'])
    print(answer)
    print('\n\nSources:')
    for document in llm_response["source_documents"]:
        print(document.metadata['source'])

In [20]:
# Run the full HyDE + RetrievalQA pipeline for one question and print the
# wrapped answer followed by the source files of the retrieved chunks.
query = "What is Flash attention?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "What is Flash attention?"
}
[32;1m[1;3m[llm/start][0m [1m[1:llm:AzureChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: Please answer the user's question as related to Large Language Models\nQuestion: What is Flash attention?\nAnswer:"
  ]
}
[36;1m[1;3m[llm/end][0m [1m[1:llm:AzureChatOpenAI] [1.90s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Flash attention is a type of attention mechanism used in large language models. It allows the model to selectively focus on certain parts of the input sequence, rather than processing the entire sequence at once. The attention is computed using a small set of key-value pairs, which are learned jointly with the rest of the model parameters. Flash attention is particularly useful for processing long sequences, as it allows the model to attend to the most relevant parts of th