# HyDE - Hypothetical Document Embeddings
    -> Instead of running RAG retrieval on the raw user query, an INTERMEDIATE LLM first generates a (hypothetical) answer to it.
    -> The end user never sees this answer; it is only compared (via embedding similarity) with the source documents.
    -> Earlier we compared the user query with the documents in the RAG store (query-to-answer RAG);
       now we compare the hypothetical answer with the documents in the RAG store (answer-to-answer RAG).
    

In [31]:
from langchain.llms import OpenAI
from langchain.vectorstores import Chroma
import yaml, os, openai, textwrap, langchain
from langchain.prompts import PromptTemplate
from langchain.document_loaders import TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.chat_models import ChatOpenAI, AzureChatOpenAI
from langchain.chains import LLMChain, HypotheticalDocumentEmbedder, RetrievalQA

In [5]:
# Read the Azure OpenAI connection settings from a local YAML file.
# safe_load builds only plain Python types (dicts, lists, strings, numbers),
# which is all that the string-valued lookups below need, so the more
# permissive FullLoader is unnecessary. The explicit encoding avoids
# depending on the platform's default text encoding.
with open('cadentials.yaml', encoding='utf-8') as f:
    credentials = yaml.safe_load(f)

# Export the settings as OPENAI_* environment variables
# (presumably what the Azure OpenAI client reads its configuration from).
os.environ["OPENAI_API_TYPE"] = credentials['AD_OPENAI_API_TYPE']
os.environ["OPENAI_API_VERSION"] = credentials['AD_OPENAI_API_VERSION']
os.environ["OPENAI_API_BASE"] = credentials['AD_OPENAI_API_BASE']
os.environ["OPENAI_API_KEY"] = credentials['AD_OPENAI_API_KEY']

# Method 01 - Single Hypothetical Generation

In [7]:
# Base embedding model: the small English BGE model, with normalized vectors.
bge_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    # 'mps' is PyTorch's Apple-silicon backend; switch to 'cpu' or 'cuda'
    # when running on other hardware.
    model_kwargs={'device': 'mps'},
    encode_kwargs={'normalize_embeddings': True},
)

# Chat model that writes the hypothetical answer for each query.
llm = AzureChatOpenAI(
    deployment_name=credentials['AD_DEPLOYMENT_ID'],
    model_name=credentials['AD_ENGINE'],
    temperature=0.9,
    max_tokens=256,
)

# HyDE wrapper built from the library's predefined "web_search" prompt:
# the LLM answers the query and that answer is what gets embedded.
embeddings = HypotheticalDocumentEmbedder.from_llm(
    llm,
    bge_embeddings,
    prompt_key="web_search",
)

In [9]:
# Show the prompt template the HyDE wrapper sends to the LLM.
print(embeddings.llm_chain.prompt.template)

Please write a passage to answer the question 
Question: {QUESTION}
Passage:


In [10]:
# Turn on LangChain's global debug flag so every LLM/chain call prints its
# inputs and outputs (the [llm/start] / [llm/end] traces in later cells).
langchain.debug = True

In [11]:
# Embedding a query through the HyDE wrapper triggers one LLM call
# (traced below) that writes the hypothetical passage.
result = embeddings.embed_query("What items does McDonalds make?")

[32;1m[1;3m[llm/start][0m [1m[1:llm:AzureChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: Please write a passage to answer the question \nQuestion: What items does McDonalds make?\nPassage:"
  ]
}
[36;1m[1;3m[llm/end][0m [1m[1:llm:AzureChatOpenAI] [4.25s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "McDonald's is a global fast-food chain that prepares and serves a wide variety of food and beverages for its customers. Some of the most famous items that McDonald's is known for include their Big Mac, Quarter Pounder with Cheese, Chicken McNuggets, and Filet-O-Fish. Additionally, McDonald's offers a range of breakfast items such as Egg McMuffin, Sausage Biscuit, and Hotcakes. They also serve various beverages such as freshly brewed coffee, soft drinks, and milkshakes. Apart from these, McDonald's offers seasonal specials and limited-time menu items, such as McRib sandwich, Shamrock Shake, and various McCafe beverages.

# Method 02 - Multiple Hypothetical Generation

Generate multiple hypothetical answers and aggregate all of their embeddings to get a single embedding for the overall hypothetical answer.

In [15]:
# Same base embedding configuration as in Method 01.
bge_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    # 'mps' is PyTorch's Apple-silicon backend; switch to 'cpu' or 'cuda'
    # when running on other hardware.
    model_kwargs={'device': 'mps'},
    encode_kwargs={'normalize_embeddings': True},
)

# Same chat model settings as before, except for `n`.
llm = AzureChatOpenAI(
    deployment_name=credentials['AD_DEPLOYMENT_ID'],
    model_name=credentials['AD_ENGINE'],
    temperature=0.9,
    max_tokens=256,
    # KEY SETTING: request four completions per call, so that several
    # hypothetical answers are produced for a single query.
    n=4,
)

# HyDE wrapper using the predefined "web_search" prompt.
embeddings = HypotheticalDocumentEmbedder.from_llm(
    llm,
    bge_embeddings,
    prompt_key="web_search",
)

In [16]:
# Embed a query with the multi-generation HyDE wrapper (LLM call traced below).
result = embeddings.embed_query("What is McDonalds best selling item?")

[32;1m[1;3m[llm/start][0m [1m[1:llm:AzureChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: Please write a passage to answer the question \nQuestion: What is McDonalds best selling item?\nPassage:"
  ]
}
[36;1m[1;3m[llm/end][0m [1m[1:llm:AzureChatOpenAI] [2.74s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "McDonald's most popular and best-selling item, without a doubt, is the Big Mac. The iconic sandwich, which features two hamburger patties, cheese, lettuce, onions, pickles, and special sauce, has been a staple on the McDonald's menu since its debut in 1967. In fact, the Big Mac is so beloved that it has inspired a signature jingle (\"two all-beef patties, special sauce, lettuce, cheese, pickles, onions on a sesame seed bun\") and has even been the subject of a documentary. The Big Mac has remained a top-seller for decades, and its popularity shows no signs of waning anytime soon.",
        "generation_info": {
   

# Method 03 - Custom Prompting

In [17]:
# Custom HyDE prompt: steer the hypothetical answer towards one food item.
prompt_template = (
    "Please answer the user's question as a single food item\n"
    "Question: {question}\n"
    "Answer:"
)

prompt = PromptTemplate(template=prompt_template, input_variables=["question"])

# Uses whichever `llm` is currently defined (the n=4 model when the cells
# are run in order).
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [18]:
# Build the HyDE embedder directly from the custom chain rather than from
# one of the predefined prompts.
embeddings = HypotheticalDocumentEmbedder(
    base_embeddings=bge_embeddings,
    llm_chain=llm_chain,
)

In [19]:
# NOTE: the doubled "is" belongs to the recorded query; it is kept so the
# trace printed below still matches this call.
result = embeddings.embed_query("What is is McDonalds best selling item?")

[32;1m[1;3m[llm/start][0m [1m[1:llm:AzureChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: Please answer the user's question as a single food item\nQuestion: What is is McDonalds best selling item?\nAnswer:"
  ]
}
[36;1m[1;3m[llm/end][0m [1m[1:llm:AzureChatOpenAI] [883ms] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Big Mac",
        "generation_info": {
          "finish_reason": "stop"
        },
        "type": "ChatGeneration",
        "message": {
          "lc": 1,
          "type": "constructor",
          "id": [
            "langchain",
            "schema",
            "messages",
            "AIMessage"
          ],
          "kwargs": {
            "content": "Big Mac",
            "additional_kwargs": {}
          }
        }
      },
      {
        "text": "Big Mac",
        "generation_info": {
          "finish_reason": "stop"
        },
        "type": "ChatGeneration",
        "message": {
      

## Use HyDE for RAG

In [21]:
# Three LangChain blog posts serve as the retrieval corpus.
loaders = [
    TextLoader('data/langchain_blog_posts/blog.langchain.dev_announcing-langsmith_.txt'),
    TextLoader('data/langchain_blog_posts/blog.langchain.dev_benchmarking-question-answering-over-csv-data_.txt'),
    TextLoader('data/langchain_blog_posts/blog.langchain.dev_chat-loaders-finetune-a-chatmodel-in-your-voice_.txt'),
]

# Load every file and gather the resulting documents in a single list.
docs = []
for loader in loaders:
    docs += loader.load()

# Split the documents into chunks (target size 1000 characters, no overlap).
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(docs)

In [32]:
# HyDE prompt for this corpus: keep the hypothetical answer in the LLM domain.
prompt_template = (
    "Please answer the user's question as related to Large Language Models\n"
    "Question: {question}\n"
    "Answer:"
)

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["question"],
)

# Fresh chat model without `n`, i.e. a single completion per call.
llm = AzureChatOpenAI(
    deployment_name=credentials['AD_DEPLOYMENT_ID'],
    model_name=credentials['AD_ENGINE'],
    temperature=0.9,
    max_tokens=256,
)

# This chain is the hypothetical-answer generator used by the HyDE embedder.
llm_chain = LLMChain(
    prompt=prompt,
    llm=llm,
)

In [33]:
# HyDE embedder: the custom chain writes the answer, BGE embeds it.
embeddings = HypotheticalDocumentEmbedder(
    base_embeddings=bge_embeddings,
    llm_chain=llm_chain,
)

# Index the text chunks in a Chroma vector store that uses the HyDE embedder.
docsearch = Chroma.from_documents(texts, embeddings)

In [28]:
# Retrieve the chunks most similar to the query; the trace below shows the
# HyDE prompt being sent to the LLM as part of this search.
query = "What are chat loaders?"
docs = docsearch.similarity_search(query)

[32;1m[1;3m[llm/start][0m [1m[1:llm:AzureChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: Please answer the user's question as related to Large Language Models\nQuestion: What are chat loaders?\nAnswer:"
  ]
}
[36;1m[1;3m[llm/end][0m [1m[1:llm:AzureChatOpenAI] [2.22s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Chat loaders are modules or components of large language models that are specifically designed to generate conversational responses in natural language. These loaders analyze and interpret the input message/ query and use complex algorithms, machine learning, and natural language processing techniques to generate appropriate responses. These chat loaders enable language models to serve as chatbots, virtual assistants, and other conversational interfaces for various use cases such as customer support, personal assistant, education, and entertainment, among others.",
        "generation_info": {
          "f

In [29]:
# Display the retrieved documents as the cell output.
docs

[Document(page_content='URL: https://blog.langchain.dev/chat-loaders-finetune-a-chatmodel-in-your-voice/\nTitle: Chat Loaders: Fine-tune a ChatModel in your Voice\n\nSummary\n\nWe are adding a new integration type, ChatLoaders, to make it easier to fine-tune models on your own unique writing style. These utilities help convert data from popular messaging platforms to chat messages compatible with fine-tuning formats like that supported by OpenAI.\n\nThank you to Greg Kamradt for Misbah Syed for their thought leadership on this.\n\nImportant Links:\n\nContext\n\nOn Tuesday, OpenAI announced improved fine-tuning support, extending the service to larger chat models like GPT-3.5-turbo. This enables anyone to customize these larger, more capable models for their own use cases. They also teased support for fine-tuning GPT-4 later this year.\n\nWhile fine-tuning is typically not advised for teaching an LLM substantially new knowledge or for factual recall; it is good for style transfer.', met

In [36]:
# Retrieval QA chain on top of the HyDE-backed vector store.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,  # here the LLM writes the final RAG answer
    chain_type="stuff",
    retriever=docsearch.as_retriever(search_kwargs={"k": 3}),  # top 3 chunks
    return_source_documents=True,  # keep the sources so they can be listed
)

In [37]:
def wrap_text_preserve_newlines(text, width=110):
    """Wrap *text* to *width* columns while keeping its existing line breaks.

    Each newline-separated line is wrapped on its own with ``textwrap.fill``
    and the wrapped pieces are joined back together with newlines, so empty
    lines in the input remain empty lines in the output.

    Args:
        text: The string to wrap.
        width: Maximum length of each output line (default 110).

    Returns:
        The wrapped string.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

def process_llm_response(llm_response):
    """Print the answer of a QA chain response followed by its sources.

    Args:
        llm_response: Mapping with a ``'result'`` string and a
            ``'source_documents'`` iterable whose items expose
            ``metadata['source']``.
    """
    answer = wrap_text_preserve_newlines(llm_response['result'])
    print(answer)
    print('\n\nSources:')
    for document in llm_response["source_documents"]:
        print(document.metadata['source'])

In [38]:
# Run the full pipeline: the trace below shows the HyDE prompt being sent to
# the LLM inside the RetrievalQA run. The answer and its sources are then printed.
query = "What is Flash attention?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "What is Flash attention?"
}
[32;1m[1;3m[llm/start][0m [1m[1:llm:AzureChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: Please answer the user's question as related to Large Language Models\nQuestion: What is Flash attention?\nAnswer:"
  ]
}
[36;1m[1;3m[llm/end][0m [1m[1:llm:AzureChatOpenAI] [2.78s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Flash attention is a type of attention mechanism used in large language models such as GPT-3. It is designed to improve the speed and efficiency of the attention network by allowing the model to focus on only a small portion of the input sequence at a time, rather than attending to the entire sequence in each step. This approach has been shown to significantly reduce memory requirements and improve performance on language modeling tasks.",
        "generation_info": {
    