<a href="https://colab.research.google.com/github/mertcan-basut/nlp/blob/main/langchain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's dependencies. Only chromadb and pydantic are pinned
# (the reasons are given inline); every other package is left unpinned.
# Note that the chromadb install is the only one run without -q.

# OpenAI SDK.
!pip install -q openai

# LangChain packages plus the extra libraries installed alongside them.
!pip install -q langchain langchain-openai langchain-experimental langchainhub
!pip install -q docarray
!pip install chromadb==0.4.14 # downgrade for sqlite3.OperationalError: attempt to write a readonly database
!pip install -q pydantic==1.10.9 # downgrade for pydantic and Langchain compatibility: https://python.langchain.com/docs/guides/pydantic_compatibility
!pip install -q wikipedia
!pip install -q pypdf
!pip install -q youtube-transcript-api pytube
!pip install -q lark

# python-dotenv provides load_dotenv/find_dotenv, used in the import cell.
!pip install -q python-dotenv

In [2]:
# Write a placeholder key to .env in the working directory; `>` overwrites any
# existing .env. Replace `editme` with a real key before running the cells
# that call OpenAI.
!echo "OPENAI_API_KEY=editme" > .env

In [3]:
# a framework for developing LM powered applications
from langchain_openai import ChatOpenAI # models
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate, PromptTemplate # input prompts
from langchain_core.messages import HumanMessage, SystemMessage
from langchain.output_parsers import StructuredOutputParser # output parsers
from langchain.output_parsers import ResponseSchema
from langchain.chains import ConversationChain # memory
from langchain.memory import ConversationBufferMemory, ConversationBufferWindowMemory, ConversationTokenBufferMemory, ConversationSummaryBufferMemory
from langchain.chains import LLMChain, SimpleSequentialChain, SequentialChain # chains
from langchain.chains.router import MultiPromptChain
from langchain.chains.router.llm_router import LLMRouterChain, RouterOutputParser
from langchain.chains import RetrievalQA, ConversationalRetrievalChain # question answering
from langchain.document_loaders import CSVLoader, PyPDFLoader, WebBaseLoader # document loaders
from langchain_community.document_loaders.youtube import YoutubeLoader
from langchain_community.document_loaders.text import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, TokenTextSplitter, MarkdownHeaderTextSplitter # splitting
from langchain.vectorstores import DocArrayInMemorySearch, Chroma # vector stores
from langchain.indexes import VectorstoreIndexCreator # indexes
from langchain.embeddings import OpenAIEmbeddings # embeddings
from langchain_community.llms.openai import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever # retrievers
from langchain.chains.query_constructor.schema import AttributeInfo
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers import SVMRetriever, TFIDFRetriever
from langchain.evaluation.qa import QAEvalChain, QAGenerateChain # evaluation
import langchain # debugging
from langchain.agents import load_tools, AgentExecutor, create_react_agent, create_openai_functions_agent, tool # agents
from langchain import hub
from langchain_experimental.tools import PythonREPLTool

import openai # direct API calls to OpenAI

import numpy as np

import json
import os
from datetime import date

from dotenv import load_dotenv, find_dotenv
# Load the variables of the .env file located by find_dotenv(); the return
# value is deliberately discarded.
_ = load_dotenv(find_dotenv()) # read local .env file
# API keys are created at https://platform.openai.com/api-keys

from IPython.display import display, Markdown
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV paths used later point into it.
drive.mount("/content/drive")
# Reference link for the catalog dataset (presumably where the CSV came from -- TODO confirm):
# https://s172-31-11-251p14136.lab-aws-production.deeplearning.ai/edit/OutdoorClothingCatalog_1000.csv

import warnings
# Suppress all warnings from here on, for example:
warnings.filterwarnings('ignore')
# /usr/local/lib/python3.10/dist-packages/langchain/chains/llm.py:316: UserWarning: The predict_and_parse method is deprecated, instead pass an output parser directly to LLMChain.

Mounted at /content/drive


## Models, Prompts, Parsers

In [None]:
# Sample customer review used as the extraction input below. The trailing
# backslashes join the source lines, so the value is one long line.
text = """\
This leaf blower is pretty amazing.  It has four settings: \
candle blower, gentle breeze, windy city, and tornado. \
It arrived in two days, just in time for my wife's \
anniversary present. \
I think my wife liked it so much she was speechless. \
So far I've been the only one using it, and I've been \
using it every other morning to clear the leaves on our lawn. \
It's slightly more expensive than the other leaf blowers \
out there, but I think it's worth it for the extra features.\
"""

# Instructions naming the three fields to extract: gift, delivery_days and
# price_value. Contains no placeholders.
system_message_template = """\
For the following text, extract the following information:

gift: Was the item purchased as a gift for someone else? Answer True if yes, False if not or unknown.

delivery_days: How many days did it take for the product to arrive? If this information is not found, output -1.

price_value: Extract any sentences about the value or price, and output them as a comma separated Python list.
"""

# Wrapper for the review; `{text}` is its only placeholder.
human_message_template = """\
text: {text}
"""

# Hand-written output-format instructions asking for JSON with the three keys.
# Contains no placeholders.
format_instructions_template = """\
Format the output as JSON with the following keys:
gift
delivery_days
price_value
"""

In [None]:
# Direct OpenAI SDK client; the key is read from the environment.
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def get_completion(prompt, model="gpt-3.5-turbo"):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the one user message of the request.
        model: Name of the chat model to query.

    Returns:
        The `content` of the first choice's message.
    """
    user_turn = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=model,
        messages=[user_turn],
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Smoke test; the reply is shown as the cell result.
get_completion("Hi!")

'Hello! How can I assist you today?'

In [None]:
def format_prompt(system_message, human_message, format_instructions):
  """Concatenate the three prompt sections into one prompt string.

  Each section is rendered with default string formatting and followed by an
  empty line, in the order system message, human message, format instructions.

  Returns:
    The combined prompt text.
  """
  sections = (system_message, human_message, format_instructions)
  return "".join(f"{section}\n\n" for section in sections)

# The system and format-instruction templates contain no placeholders, so
# calling format() without arguments returns their text unchanged.
system_message_prompt = system_message_template.format()
# Substitute the sample review into the human message.
human_message_prompt = human_message_template.format(text=text)
format_instructions_prompt = format_instructions_template.format()
# Join the three sections into one prompt string and show it.
prompt = format_prompt(system_message_prompt, human_message_prompt, format_instructions_prompt)
print(prompt)

For the following text, extract the following information:

gift: Was the item purchased as a gift for someone else? Answer True if yes, False if not or unknown.

delivery_days: How many days did it take for the product to arrive? If this information is not found, output -1.

price_value: Extract any sentences about the value or price, and output them as a comma separated Python list.


text: This leaf blower is pretty amazing.  It has four settings: candle blower, gentle breeze, windy city, and tornado. It arrived in two days, just in time for my wife's anniversary present. I think my wife liked it so much she was speechless. So far I've been the only one using it, and I've been using it every other morning to clear the leaves on our lawn. It's slightly more expensive than the other leaf blowers out there, but I think it's worth it for the extra features.


Format the output as JSON with the following keys:
gift
delivery_days
price_value





In [None]:
# Send the assembled prompt through get_completion and show the raw reply.
response = get_completion(prompt)
print(response)

# json.loads only succeeds when the reply is bare JSON text.
output_dict = json.loads(response)
# Shown as the cell result.
output_dict.get('delivery_days')

{
  "gift": true,
  "delivery_days": 2,
  "price_value": ["It's slightly more expensive than the other leaf blowers out there"]
}


2

In [None]:
# LangChain chat-model wrapper for the same model used by get_completion.
chat = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.0)

# Use the Runnable `invoke` entry point: calling the model object directly is
# deprecated in the LangChain releases that ship `langchain_openai`.
# The reply text is shown as the cell result.
chat.invoke([HumanMessage(content="Hi!")]).content

'Hello! How can I assist you today?'

In [None]:
# Wrap the raw template strings as chat message templates.
system_message_prompt = SystemMessagePromptTemplate.from_template(system_message_template)
human_message_prompt = HumanMessagePromptTemplate.from_template(human_message_template)

# One ResponseSchema per field; the descriptions repeat the wording of
# system_message_template.
gift_schema = ResponseSchema(name="gift", description="Was the item purchased as a gift for someone else? Answer True if yes, False if not or unknown.")
delivery_days_schema = ResponseSchema(name="delivery_days", description="How many days did it take for the product to arrive? If this information is not found, output -1.")
price_value_schema = ResponseSchema(name="price_value", description="Extract any sentences about the value or price, and output them as a comma separated Python list.")
response_schemas = [gift_schema, delivery_days_schema, price_value_schema]

# The parser both produces the format instructions and parses the reply later.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

# The generated instructions replace the hand-written
# format_instructions_template; they are sent as a fixed system message.
format_instructions = output_parser.get_format_instructions()
format_instructions_prompt = SystemMessage(content=format_instructions)

# Message order: extraction instructions, review text, format instructions.
prompt_template = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt, format_instructions_prompt])
prompt = prompt_template.format_messages(text=text)
# Print each formatted message's content for inspection.
for message in prompt: print(message.content)

For the following text, extract the following information:

gift: Was the item purchased as a gift for someone else? Answer True if yes, False if not or unknown.

delivery_days: How many days did it take for the product to arrive? If this information is not found, output -1.

price_value: Extract any sentences about the value or price, and output them as a comma separated Python list.

text: This leaf blower is pretty amazing.  It has four settings: candle blower, gentle breeze, windy city, and tornado. It arrived in two days, just in time for my wife's anniversary present. I think my wife liked it so much she was speechless. So far I've been the only one using it, and I've been using it every other morning to clear the leaves on our lawn. It's slightly more expensive than the other leaf blowers out there, but I think it's worth it for the extra features.

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "`

In [None]:
# Send the formatted messages through the Runnable `invoke` entry point
# (calling the model object directly is deprecated) and show the raw reply.
response = chat.invoke(prompt).content
print(response)

# Parse the reply with the parser that produced the format instructions.
output_dict = output_parser.parse(response)
# Shown as the cell result.
output_dict.get('delivery_days')

```json
{
	"gift": true,
	"delivery_days": 2,
	"price_value": "It's slightly more expensive than the other leaf blowers out there, but I think it's worth it for the extra features."
}
```


2

## Memory

In [None]:
# Chat model shared by the memory examples in this section.
llm = ChatOpenAI(temperature=0.0, model="gpt-3.5-turbo")
# Conversation history store handed to the chain below.
memory = ConversationBufferMemory()
# verbose=True prints the fully formatted prompt on every call.
conversation = ConversationChain(llm=llm, memory=memory, verbose=True)

In [None]:
# Three consecutive turns through the same chain; the last question can only
# be answered from the history kept in memory.
for user_input in ("Hi, my name is Mert.", "What is 1+1?", "What is my name?"):
    print(conversation.predict(input=user_input))



[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:

Human: Hi, my name is Mert.
AI:[0m

[1m> Finished chain.[0m
Hello Mert! It's nice to meet you. How can I assist you today?


[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:
Human: Hi, my name is Mert.
AI: Hello Mert! It's nice to meet you. How can I assist you today?
Human: What is 1+1?
AI:[0m

[1m> Finished chain.[0m
1+1 equals 2. Is there anything els

In [None]:
# Print the stored transcript, followed by a blank line.
print(memory.buffer, '\n')
# The same history as a dict, shown as the cell result.
memory.load_memory_variables({})

Human: Hi, my name is Mert.
AI: Hello Mert! It's nice to meet you. How can I assist you today?
Human: What is 1+1?
AI: 1 + 1 equals 2. Is there anything else you would like to know?
Human: What is my name?
AI: Your name is Mert. Is there anything else you would like to know or discuss? 



{'history': "Human: Hi, my name is Mert.\nAI: Hello Mert! It's nice to meet you. How can I assist you today?\nHuman: What is 1+1?\nAI: 1 + 1 equals 2. Is there anything else you would like to know?\nHuman: What is my name?\nAI: Your name is Mert. Is there anything else you would like to know or discuss?"}

In [None]:
# Add an additional exchange to memory by hand, without calling the LLM.
memory.save_context({"input": "Hi!"}, {"output": "What's up?"})
print(memory.buffer)

Human: Hi, my name is Mert.
AI: Hello Mert! It's nice to meet you. How can I assist you today?
Human: What is 1+1?
AI: 1 + 1 equals 2. Is there anything else you would like to know?
Human: What is my name?
AI: Your name is Mert. Is there anything else you would like to know or discuss?
Human: Hi!
AI: What's up?


In [None]:
# Rebinds `memory`; the earlier buffer memory stays attached to `conversation`.
memory = ConversationBufferWindowMemory(k=1) # keeps only the most recent `k` exchanges

# Two exchanges are saved; with k=1 only the second one is printed below.
memory.save_context({"input": "Hi!"}, {"output": "What's up?"})
memory.save_context({"input": "Not much, just hanging."}, {"output": "Cool."})

print(memory.buffer)

Human: Not much, just hanging.
AI: Cool.


In [None]:
# Token counting depends on the LLM passed in; cost is usually driven by the
# number of tokens, hence the limit.
memory = ConversationTokenBufferMemory(llm=llm, max_token_limit=30) # drops the earliest parts of the conversation so the token limit is not exceeded

memory.save_context({"input": "AI is what?!"}, {"output": "Amazing!"})
memory.save_context({"input": "Backpropagation is what?"}, {"output": "Beautiful!"})
memory.save_context({"input": "Chatbots are what?"}, {"output": "Charming!"})

# Shows what is left after trimming to the token limit.
print(memory.buffer)

AI: Beautiful!
Human: Chatbots are what?
AI: Charming!


In [None]:
memory = ConversationSummaryBufferMemory(llm=llm, max_token_limit=30) # uses the given LLM to summarize the part of the chat history that exceeds the token limit

memory.save_context({"input": "AI is what?!"}, {"output": "Amazing!"})
memory.save_context({"input": "Backpropagation is what?"}, {"output": "Beautiful!"})
memory.save_context({"input": "Chatbots are what?"}, {"output": "Charming!"})

print(memory.load_memory_variables({})['history'])
# NOTE: the `System:` line in the output holds the generated summary; it is
# not an official OpenAI system message.

System: The human expresses surprise at the AI's positive view of artificial intelligence. The AI responds with "Amazing!" and the human asks about backpropagation.
AI: Beautiful!
Human: Chatbots are what?
AI: Charming!


## Chains

In [None]:
# Chat model for the chain examples; temperature is 0.9 here rather than the
# 0.0 used in the earlier sections.
llm = ChatOpenAI(temperature=0.9, model="gpt-3.5-turbo")

# Input for the naming/description chains.
product = "Queen Size Sheet Set"
# French-language review; input for the translation/summary chain below.
review = "Je trouve le goût médiocre. La mousse ne tient pas, c'est bizarre. J'achète les mêmes dans le commerce et le goût est bien meilleur...\nVieux lot ou contrefaçon !?"

In [None]:
# Single-placeholder prompt; rebinds `prompt_template` from the earlier section.
prompt_template = ChatPromptTemplate.from_template("What is the best name to describe a company that makes {product}?")
chain = LLMChain(llm=llm, prompt=prompt_template) # LLM + prompt (most basic)

# Pass `product` as the chain's single input; the reply is the cell result.
chain.run(product)

'Royal Comfort Bedding'

In [None]:
# ideal when every chain in the sequence takes 1 input and returns 1 output (the output of each chain is passed as the input to the next)
# Step 1: product -> company name.
prompt_template_1 = ChatPromptTemplate.from_template("What is the best name to describe a company that makes {product}?")
chain_1 = LLMChain(llm=llm, prompt=prompt_template_1)

# Step 2: company name -> short description.
prompt_template_2 = ChatPromptTemplate.from_template("Write a 20 words description for the following company:{company_name}")
chain_2 = LLMChain(llm=llm, prompt=prompt_template_2)

# verbose=True prints each step's intermediate output.
chain = SimpleSequentialChain(chains=[chain_1, chain_2], verbose=True)

chain.run(product)



[1m> Entering new SimpleSequentialChain chain...[0m
[36;1m[1;3mRegal Dreams Co.[0m
[33;1m[1;3mRegal Dreams Co. specializes in luxury home decor items, offering elegant and timeless pieces to elevate any living space.[0m

[1m> Finished chain.[0m


'Regal Dreams Co. specializes in luxury home decor items, offering elegant and timeless pieces to elevate any living space.'

In [None]:
# sequentially executed chains with multiple inputs & multiple outputs (more complicated)
# Each chain's output_key is the variable name later prompts refer to.
# Step 1: Review -> English_Review.
prompt_template_1 = ChatPromptTemplate.from_template("Translate the following review to english:\n\n{Review}")
chain_1 = LLMChain(llm=llm, prompt=prompt_template_1, output_key="English_Review")

# Step 2: English_Review -> summary.
prompt_template_2 = ChatPromptTemplate.from_template("Can you summarize the following review in 1 sentence:\n\n{English_Review}")
chain_2 = LLMChain(llm=llm, prompt=prompt_template_2, output_key="summary")

# Step 3: Review -> language (reads the original input, not step 2's output).
prompt_template_3 = ChatPromptTemplate.from_template("What language is the following review:\n\n{Review}")
chain_3 = LLMChain(llm=llm, prompt=prompt_template_3, output_key="language")

# Step 4: summary + language -> followup_message.
prompt_template_4 = ChatPromptTemplate.from_template(
    "Write a follow up response to the following "
    "summary in the specified language:"
    "\n\nSummary: {summary}\n\nLanguage: {language}"
)
chain_4 = LLMChain(llm=llm, prompt=prompt_template_4, output_key="followup_message")

# "language" is produced by chain_3 but is not listed in output_variables,
# so it does not appear in the returned dict.
chain = SequentialChain(
    chains=[chain_1, chain_2, chain_3, chain_4],
    input_variables=["Review"],
    output_variables=["English_Review", "summary", "followup_message"],
    verbose=True
)

chain(review)
# `run` is not supported when there is not exactly one output key



[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


{'Review': "Je trouve le goût médiocre. La mousse ne tient pas, c'est bizarre. J'achète les mêmes dans le commerce et le goût est bien meilleur...\nVieux lot ou contrefaçon !?",
 'English_Review': "I find the taste mediocre. The foam doesn't hold, it's weird. I buy the same ones in stores and the taste is much better... Old batch or counterfeit!?",
 'summary': 'The reviewer is dissatisfied with the taste and foam of the product, suspecting it may be an old batch or counterfeit.',
 'followup_message': "Je vous remercie pour votre avis sur notre produit. Nous sommes désolés que vous ayez été insatisfait de son goût et de sa mousse. Nous prenons très au sérieux la qualité de nos produits et nous aimerions en savoir plus sur votre expérience pour investiguer davantage. S'il vous plaît, contactez notre service clientèle pour que nous puissions résoudre ce problème au plus vite. Merci de nous avoir informés."}

In [None]:
# Prompt for the "physics" destination; `{input}` is its only placeholder.
# Trailing backslashes join the source lines into one paragraph.
physics_template = """You are a very smart physics professor. \
You are great at answering questions about physics in a concise \
and easy to understand manner. \
When you don't know the answer to a question you admit \
that you don't know.

Here is a question:
{input}"""

# Prompt for the "math" destination; `{input}` is its only placeholder.
# Each continued line ends with a space before the backslash so the joined
# text keeps its word breaks (the original rendered "togetherto answer").
math_template = """You are a very good mathematician. \
You are great at answering math questions. \
You are so good because you are able to break down \
hard problems into their component parts, \
answer the component parts, and then put them together \
to answer the broader question.

Here is a question:
{input}"""

# Prompt for the "History" destination; `{input}` is its only placeholder.
history_template = """You are a very good historian. \
You have an excellent knowledge of and understanding of people, \
events and contexts from a range of historical periods. \
You have the ability to think, reflect, debate, discuss and \
evaluate the past. You have a respect for historical evidence \
and the ability to make use of it to support your explanations \
and judgements.

Here is a question:
{input}"""

# Prompt for the "computer science" destination; `{input}` is its only
# placeholder. Note the text starts with a leading space.
computerscience_template = """ You are a successful computer scientist. \
You have a passion for creativity, collaboration, \
forward-thinking, confidence, strong problem-solving capabilities, \
understanding of theories and algorithms, and excellent communication \
skills. You are great at answering coding questions. \
You are so good because you know how to solve a problem by \
describing the solution in imperative steps \
that a machine can easily interpret and you know how to \
choose a solution that has a good balance between \
time complexity and space complexity.

Here is a question:
{input}"""

# One entry per destination: `name` is the key used in destination_chains and
# shown to the router, `description` tells the router when to pick it.
# Note that "History" is capitalized while the other names are lowercase.
prompt_infos = [
    {
        "name": "physics",
        "description": "Good for answering questions about physics",
        "prompt_template": physics_template
    },
    {
        "name": "math",
        "description": "Good for answering math questions",
        "prompt_template": math_template
    },
    {
        "name": "History",
        "description": "Good for answering history questions",
        "prompt_template": history_template
    },
    {
        "name": "computer science",
        "description": "Good for answering computer science questions",
        "prompt_template": computerscience_template
    }
]

# chains that will be called by the router chain, keyed by destination name
# NOTE: the loop rebinds the notebook-level names `name`, `prompt_template`,
# `prompt` and `chain`.
destination_chains = {}
for p_info in prompt_infos:
    name = p_info["name"]
    prompt_template = p_info["prompt_template"]
    prompt = ChatPromptTemplate.from_template(template=prompt_template)
    chain = LLMChain(llm=llm, prompt=prompt)
    destination_chains[name] = chain

# One "name: description" line per destination, joined into the text that is
# substituted into the router template.
destinations = [f"{p['name']}: {p['description']}" for p in prompt_infos]
destinations_str = "\n".join(destinations)

# called when the router can't decide which sub-chain to use
default_prompt = ChatPromptTemplate.from_template("{input}")
default_chain = LLMChain(llm=llm, prompt=default_prompt)

# template that will be used by the LLM to route between chains
# It is formatted twice: str.format() below fills `{destinations}` and halves
# the doubled braces, so `{{input}}` becomes the `{input}` placeholder of the
# PromptTemplate and `{{{{`/`}}}}` remain escaped literal braces.
MULTI_PROMPT_ROUTER_TEMPLATE = """Given a raw text input to a \
language model select the model prompt best suited for the input. \
You will be given the names of the available prompts and a \
description of what the prompt is best suited for. \
You may also revise the original input if you think that revising \
it will ultimately lead to a better response from the language model.

<< FORMATTING >>
Return a markdown code snippet with a JSON object formatted to look like:
```json
{{{{
    "destination": string \\ name of the prompt to use or "DEFAULT"
    "next_inputs": string \\ a potentially modified version of the original input
}}}}
```

REMEMBER: "destination" MUST be one of the candidate prompt \
names specified below OR it can be "DEFAULT" if the input is not \
well suited for any of the candidate prompts.
REMEMBER: "next_inputs" can just be the original input \
if you don't think any modifications are needed.

<< CANDIDATE PROMPTS >>
{destinations}

<< INPUT >>
{{input}}

<< OUTPUT (remember to include the ```json)>>"""

# First formatting pass: insert the destination list.
router_template = MULTI_PROMPT_ROUTER_TEMPLATE.format(destinations=destinations_str)
router_prompt = PromptTemplate(template=router_template, input_variables=["input"], output_parser=RouterOutputParser()) # parses the LLM output to determine which chain to use and what the input to that chain should be

# for routing between multiple prompt templates
router_chain = LLMRouterChain.from_llm(llm, router_prompt)

# decides which sub-chain receives the input by passing the destination descriptions to an LLM (more complex); falls back to default_chain
chain = MultiPromptChain(router_chain=router_chain, destination_chains=destination_chains, default_chain=default_chain, verbose=True)

chain.run("What is black body radiation?")



[1m> Entering new MultiPromptChain chain...[0m
physics: {'input': 'What is black body radiation?'}
[1m> Finished chain.[0m


"Black body radiation is the electromagnetic radiation emitted by a perfect black body, which absorbs all incoming radiation and emits energy at all wavelengths. The distribution of this radiation follows Planck's law, which describes how the intensity of the radiation changes with temperature. Black body radiation is an important concept in physics and has applications in various fields, including astrophysics and thermodynamics."

## Question and Answer

In [None]:
# Question reused by every retrieval example in this section.
query = "Please list all your shirts with sun protection in a table in markdown and summarize each one."

In [None]:
# load the documents
# because these documents are small, chunking (splitting documents into smaller pieces) isn't necessary
loader = CSVLoader(file_path="./drive/MyDrive/dataset/OutdoorClothingCatalog_500.csv", encoding='utf-8')

# One Document per CSV row (metadata carries `source` and `row`); the first
# one is shown as the cell result.
docs = loader.load()
docs[0]

Document(page_content=": 0\nUnnamed: 0: 0\nname: Women's Campside Oxfords\ndescription: This ultracomfortable lace-to-toe Oxford boasts a super-soft canvas, thick cushioning, and quality construction for a broken-in feel from the first time you put them on. \n\nSize & Fit: Order regular shoe size. For half sizes not offered, order up to next whole size. \n\nSpecs: Approx. weight: 1 lb.1 oz. per pair. \n\nConstruction: Soft canvas material for a broken-in feel and look. Comfortable EVA innersole with Cleansport NXT® antimicrobial odor control. Vintage hunt, fish and camping motif on innersole. Moderate arch contour of innersole. EVA foam midsole for cushioning and support. Chain-tread-inspired molded rubber outsole with modified chain-tread pattern. Imported. \n\nQuestions? Please contact us for any inquiries.", metadata={'source': './drive/MyDrive/dataset/OutdoorClothingCatalog_500.csv', 'row': 0})

In [None]:
# embeddings: numerical representations of pieces of text that capture semantic meaning
embeddings = OpenAIEmbeddings()

# Embed the query once to inspect the vector: print its length, then show
# the first five components as the cell result.
embed = embeddings.embed_query(query)
print(len(embed))
embed[:5]

1536


[0.003285329438041333,
 0.001319983088892754,
 0.023928350673644135,
 -0.032684640238161373,
 -0.006897642983824973]

In [None]:
# basic vector store without needing to connect to an external db
db = DocArrayInMemorySearch.from_documents(docs, embeddings)

# pieces of text with similar contents will have similar embeddings
# NOTE(review): this rebinds `docs` from the loaded catalog to the search
# hits, so re-running this cell alone would index only those hits -- a later
# cell reads `docs` as the search hits, so rename with care.
docs = db.similarity_search(query)
print(len(docs))
docs[0]

4


Document(page_content=": 374\nUnnamed: 0: 374\nname: Men's Plaid Tropic Shirt, Short-Sleeve\ndescription: Our Ultracomfortable sun protection is rated to UPF 50+, helping you stay cool and dry. Originally designed for fishing, this lightest hot-weather shirt offers UPF 50+ coverage and is great for extended travel. SunSmart technology blocks 98% of the sun's harmful UV rays, while the high-performance fabric is wrinkle-free and quickly evaporates perspiration. Made with 52% polyester and 48% nylon, this shirt is machine washable and dryable. Additional features include front and back cape venting, two front bellows pockets and an imported design. With UPF 50+ coverage, you can limit sun exposure and feel secure with the highest rated sun protection available.", metadata={'source': './drive/MyDrive/dataset/OutdoorClothingCatalog_500.csv', 'row': 374})

In [None]:
# Completion-style model (the OpenAI class, not ChatOpenAI) used to answer.
llm = OpenAI(temperature=0.0, model="gpt-3.5-turbo-instruct")
# create vector store
# from_loaders reads the CSV again through `loader` rather than reusing `docs`.
index = VectorstoreIndexCreator(vectorstore_cls=DocArrayInMemorySearch, embedding=embeddings).from_loaders([loader])

# Ask the question against the index and render the answer as Markdown.
response = index.query(query, llm=llm)
display(Markdown(response))



| Name | Description | Sun Protection Rating |
| --- | --- | --- |
| Men's Plaid Tropic Shirt, Short-Sleeve | Ultracomfortable sun protection rated to UPF 50+. Made with 52% polyester and 48% nylon. Features front and back cape venting, two front bellows pockets. | UPF 50+ |
| Sun Shield Shirt by | High-performance sun shirt with UPF 50+ rating. Made with 78% nylon and 22% Lycra Xtra Life fiber. Wicks moisture and abrasion resistant. | UPF 50+ |
| Girls' Ocean Breeze Long-Sleeve Stripe Shirt | Long-sleeve sun-protection rash guard with UPF 50+ rating. Made with Nylon Lycra®-elastane blend. Quick-drying and fade-resistant. | UPF 50+ |

Each of these shirts offers sun protection with a UPF 50+ rating, blocking 98% of the sun's harmful UV rays. They are all made with high-performance fabrics that are quick-drying and recommended by The Skin Cancer Foundation. The Men's Plaid Tropic Shirt and Sun Shield Shirt also have additional features such as venting and pockets

In [None]:
# Manual version of "stuffing": concatenate the text of the retrieved
# documents (no separator, as before) and put it in front of the question.
qdocs = "".join(doc.page_content for doc in docs)
llm = ChatOpenAI(model="gpt-3.5-turbo")
# Use the Runnable `invoke` entry point; calling the model object directly is
# deprecated in the LangChain releases that ship `langchain_openai`.
response = llm.invoke([HumanMessage(content=f"{qdocs}\nQuestion: {query}")]).content
display(Markdown(response))

| Shirt Name                                      | Summary                                                                                                                                                                                                                                                                          |
|-----------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Men's Plaid Tropic Shirt, Short-Sleeve        | This shirt offers UPF 50+ sun protection, blocks 98% of harmful UV rays, is wrinkle-free, and quickly evaporates perspiration. Made with 52% polyester and 48% nylon, it features front and back cape venting and two front bellows pockets.                    |
| Sun Shield Shirt by                           | This high-performance sun shirt provides SPF 50+ sun protection, blocks 98% of harmful rays, and is made of 78% nylon and 22% Lycra Xtra Life fiber. It is quick-drying, abrasion-resistant, and fits comfortably over swimsuits.                                                |
| Girls' Ocean Breeze Long-Sleeve Stripe Shirt | This long-sleeve rash guard offers full-coverage sun protection with UPF 50+. Made of Nylon Lycra-elastane blend, it is quick-drying, fade-resistant, and seawater-resistant. Recommended by The Skin Cancer Foundation for UV protection.                        |

In [None]:
llm = OpenAI(temperature=0.0, model="gpt-3.5-turbo-instruct")
retriever = db.as_retriever() # takes in a query and returns the documents fetched from the vector store

qa_stuff = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff', # stuffs all the documents into the context of a single LLM call (cheap but not ideal for large documents)
    # other methods:
    # -map_reduce: make one call per retrieved document, then a final call to summarize the answers (popular even for summarization, supports large numbers of documents and parallel calls, but is expensive and never sees all of the information at once)
    # -refine: iteratively make one call per retrieved document, building on the previous answer (combines information, but is expensive and slow because each step depends on the previous one)
    # -map_rerank: make one call per retrieved document, ask the LLM for a score, then select the answer with the highest score (experimental, relies on the LLM to produce the score, expensive)
    retriever=retriever,
    verbose=True
)

# Run the question through the chain and render the answer as Markdown.
response = qa_stuff.run(query)
display(Markdown(response))



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m




| Name | Description | Sun Protection Rating |
| --- | --- | --- |
| Men's Plaid Tropic Shirt, Short-Sleeve | Made with UPF 50+ coverage, blocks 98% of harmful UV rays, wrinkle-free, quick-drying | UPF 50+ |
| Sun Shield Shirt by | High-performance fabric with SPF 50+ sun protection, wicks moisture, abrasion resistant | SPF 50+ |
| Girls' Ocean Breeze Long-Sleeve Stripe Shirt | Made with UPF 50+ coverage, blocks 98% of harmful UV rays, quick-drying, fade-resistant | UPF 50+ |
| Classic Plaid Short-Sleeve Shirt | Made with pure European flax, lightweight and breathable | N/A |

## Evaluation

In [None]:
# Load the catalog with the same explicit encoding as the Question and Answer
# section, so decoding does not depend on the platform default.
loader = CSVLoader(file_path="./drive/MyDrive/dataset/OutdoorClothingCatalog_500.csv", encoding='utf-8')
data = loader.load()

# Pass the embedding model explicitly, matching the earlier index cell:
# newer LangChain releases no longer supply a default for `embedding`.
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch,
    embedding=OpenAIEmbeddings()
).from_loaders([loader])

llm = ChatOpenAI(temperature = 0.0, model="gpt-3.5-turbo")

# "stuff" chain over the index's retriever; document_separator is the string
# placed between retrieved documents in the prompt context.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=index.vectorstore.as_retriever(),
    verbose=True,
    chain_type_kwargs = {"document_separator": "<<<<>>>>>"}
)

In [None]:
# Two hand-written question/answer pairs to start the evaluation set.
qa_examples = [
  {"query": "Do the Cozy Comfort Pullover Set have side pockets?", "answer": "Yes"},
  {"query": "What collection is the Ultra-Lofty 850 Stretch Down Hooded Jacket from?", "answer": "The DownTek collection"},
]

qa_gen_chain = QAGenerateChain.from_llm(ChatOpenAI(model="gpt-3.5-turbo"))

# generate question/answer pairs for the first five documents using an LLM
new_examples = qa_gen_chain.apply_and_parse([{"doc": document} for document in data[:5]])
# unwrap each result from its outer 'qa_pairs' dictionary
new_examples = [generated['qa_pairs'] for generated in new_examples]
qa_examples.extend(new_examples)

# Show the last source document next to the pair generated from it.
print(data[4].page_content)
qa_examples[-1]

: 4
Unnamed: 0: 4
name: EcoFlex 3L Storm Pants
description: Our new TEK O2 technology makes our four-season waterproof pants even more breathable. It's guaranteed to keep you dry and comfortable – whatever the activity and whatever the weather. Size & Fit: Slightly Fitted through hip and thigh. 

Why We Love It: Our state-of-the-art TEK O2 technology offers the most breathability we've ever tested. Great as ski pants, they're ideal for a variety of outdoor activities year-round. Plus, they're loaded with features outdoor enthusiasts appreciate, including weather-blocking gaiters and handy side zips. Air In. Water Out. See how our air-permeable TEK O2 technology keeps you dry and comfortable. 

Fabric & Care: 100% nylon, exclusive of trim. Machine wash and dry. 

Additional Features: Three-layer shell delivers waterproof protection. Brand new TEK O2 technology provides enhanced breathability. Interior gaiters keep out rain and snow. Full side zips for easy on/off over boots. Two zippere

{'query': 'What technology is featured in the EcoFlex 3L Storm Pants that makes them more breathable and waterproof?',
 'answer': 'The EcoFlex 3L Storm Pants feature TEK O2 technology, which offers the most breathability ever tested and ensures waterproof protection.'}

In [None]:
# to see the details and debug: turn on LangChain's global debug tracing for a single run
langchain.debug = True
try:
  qa.run(qa_examples[0]["query"])
finally:
  # always switch tracing back off, even if the chain raises,
  # so later cells are not flooded with debug output
  langchain.debug = False

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "Do the Cozy Comfort Pullover Set have side pockets?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Do the Cozy Comfort Pullover Set have side pockets?",
  "context": ": 10\nUnnamed: 0: 10\nname: Cozy Comfort Pullover Set, Stripe\ndescription: Perfect for lounging, this striped knit set lives up to its name. We used ultrasoft fabric and an easy design that's as comfortable at bedtime as it is when we have to make a quick run out.\n\nSize & Fit\n- Pants are Favorite Fit: Sits lower on the waist.\n- Relaxed Fit: Our most generous fit sits farthest from the body.\n\nFabric & Care\n- In the softest blend of 63% polyester, 35% rayon and 2% spandex

In [None]:
# generate predictions for each question
# the returned dicts are read below via their 'query', 'answer' and 'result' keys
predictions = qa.apply(qa_examples) # takes 'query' from the dictionaries automatically



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [None]:
# evaluate whether the predicted answers are true or false according to the real answers
# use another LLM to compare the semantic alignment between two strings
eval_chain = QAEvalChain.from_llm(ChatOpenAI(temperature=0, model="gpt-3.5-turbo"))

# one graded output per (example, prediction) pair; the grade is read from its 'results' key below
graded_outputs = eval_chain.evaluate(qa_examples, predictions)

In [None]:
# report each example: question, reference answer, model answer and the grader's verdict
for idx, (predicted, grade) in enumerate(zip(predictions, graded_outputs)):
    print(f"Example {idx}:")
    # (label, source dict, key) triples, looked up lazily in display order
    fields = (
        ("Question: ", predicted, 'query'),
        ("Real Answer: ", predicted, 'answer'),
        ("Predicted Answer: ", predicted, 'result'),
        ("Predicted Grade: ", grade, 'results'),
    )
    for label, source, key in fields:
        print(label + source[key])
    print()

Example 0:
Question: Do the Cozy Comfort Pullover Set have side pockets?
Real Answer: Yes
Predicted Answer: Yes, the Cozy Comfort Pullover Set does have side pockets.
Predicted Grade: CORRECT

Example 1:
Question: What collection is the Ultra-Lofty 850 Stretch Down Hooded Jacket from?
Real Answer: The DownTek collection
Predicted Answer: The Ultra-Lofty 850 Stretch Down Hooded Jacket is from the DownTek collection.
Predicted Grade: CORRECT

Example 2:
Question: What are the key features of the Women's Campside Oxfords as described in the document?
Real Answer: The key features of the Women's Campside Oxfords include a super-soft canvas material for a broken-in feel, thick cushioning for comfort, quality construction, comfortable EVA innersole with Cleansport NXT® antimicrobial odor control, vintage hunt, fish and camping motif on the innersole, moderate arch contour, EVA foam midsole for cushioning and support, and a chain-tread-inspired molded rubber outsole with a modified chain-trea

## Agents

In [None]:
# NOTE: the docstring below is the description the agent sees for this tool, so it is runtime text;
# edit it only if you intend to change how the LLM uses the tool
@tool # custom tool (docstring helps LLM to determine when to use this tool)
def get_time(text: str) -> str:
  """Returns todays date, use this for any questions related to knowing todays date. \
  The input should always be an empty string, and this function will always return todays \
  date - any date mathmatics should occur outside this function.
  """
  # `text` is never read; the result is today's date rendered as a string (e.g. 2024-03-04)
  return str(date.today())

llm = ChatOpenAI(temperature=0) # LLMs as precise reasoning engines
# built-in tools plus the Python REPL and the custom get_time tool defined above
# NOTE: PythonREPLTool runs model-generated Python code in this kernel - only use it in a trusted environment
tools = load_tools(["llm-math", "wikipedia"], llm=llm) + [PythonREPLTool(), get_time]
# llm-math tool: a chain (LLM + calculator) solves math problems
# wikipedia tool: API that allows to run search queries against Wikipedia and get back results
# Python REPL tool: a way to interact with code and run it, the interface will only return things that are printed - therefore, it is important to make sure have it print out the answer
# prompt template for the agent, pulled from the LangChain hub
prompt = hub.pull("hwchase17/openai-functions-agent")

agent = create_openai_functions_agent(llm, tools, prompt) # chain of thought reasoning + action planning = ReAct (generate reasoning traces and task-specific actions, leveraging the synergy)
# the executor runs the agent loop with the given tools
agent_executor = AgentExecutor.from_agent_and_tools(
  agent=agent,
  tools=tools,
  handle_parsing_errors=True, # when the output can't be parsed as desired, it is passed back to LLM to correct itself
  verbose=True
)

In [None]:
# math question: the trace below shows the agent delegating to the `Calculator` tool
agent_executor.invoke({"input": "What is the answer to the math problem: %25 of 300?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `Calculator` with `25% of 300`


[0m[36;1m[1;3mAnswer: 75.0[0m[32;1m[1;3mThe answer to the math problem is 75.[0m

[1m> Finished chain.[0m


{'input': 'What is the answer to the math problem: %25 of 300?',
 'output': 'The answer to the math problem is 75.'}

In [None]:
# factual question: the trace below shows the agent querying the `wikipedia` tool
agent_executor.invoke({"input": "Who is the founder of the Turkish Republic?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `wikipedia` with `Founder of the Turkish Republic`


[0m[33;1m[1;3mPage: Kamal (name)
Summary: Kamal is a male given name used in several languages. 

In Sanskrit, it is usually spelled Kamal for males and Kamala for females, meaning "lotus" or "pale red".
Kamal or Kamaal (Arabic: كمال‌ kamāl) or Turkish Kemal. The Arabic name which is also a noun means "perfection, superiority, distinction" and "completion, conclusion, accomplishment". The name bears the notion of "completeness of a thing without any deficiency" and "perfection of morals and ethics (adjective: اِكْتِمال iktimāl)". Also the name may be used as an abbreviation of Kamal ad-Din.
In Persian, it means "beauty, perfection, excellence, completion, utmost level".
Azerbaijanis use it as a male name in the meaning of "competent, mature".
In Turkish, it is the misspelling of Kamâl which means "siege, blockade, encirclement" (from the Uzbek qamal) and "cast

{'input': 'Who is the founder of the Turkish Republic?',
 'output': 'The founder of the Turkish Republic is Mustafa Kemal Atatürk. He initiated the reforms that led to the establishment of the Republic of Turkey on October 29, 1923.'}

In [None]:
# [first name, last name] pairs for the agent to sort
customer_list = [["Harrison", "Chase"], ["Lang", "Chain"], ["Dolly", "Too"], ["Elle", "Elem"], ["Geoff","Fusion"], ["Trance","Former"], ["Jen","Ayai"]]
# the prompt asks for print() explicitly because the Python REPL tool only returns what is printed
agent_executor.invoke({"input": f"""Sort these customers by last name and then first name and then use the print function to list the sorted as output: {customer_list}"""})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `Python_REPL` with `customers = [['Harrison', 'Chase'], ['Lang', 'Chain'], ['Dolly', 'Too'], ['Elle', 'Elem'], ['Geoff', 'Fusion'], ['Trance', 'Former'], ['Jen', 'Ayai']]
sorted_customers = sorted(customers, key=lambda x: (x[1], x[0]))
print(sorted_customers)`


[0m[38;5;200m[1;3m[['Jen', 'Ayai'], ['Lang', 'Chain'], ['Harrison', 'Chase'], ['Elle', 'Elem'], ['Trance', 'Former'], ['Geoff', 'Fusion'], ['Dolly', 'Too']]
[0m[32;1m[1;3mThe customers sorted by last name and then first name are:
1. ['Jen', 'Ayai']
2. ['Lang', 'Chain']
3. ['Harrison', 'Chase']
4. ['Elle', 'Elem']
5. ['Trance', 'Former']
6. ['Geoff', 'Fusion']
7. ['Dolly', 'Too'][0m

[1m> Finished chain.[0m


{'input': "Sort these customers by last name and then first name and then use the print function to list the sorted as output: [['Harrison', 'Chase'], ['Lang', 'Chain'], ['Dolly', 'Too'], ['Elle', 'Elem'], ['Geoff', 'Fusion'], ['Trance', 'Former'], ['Jen', 'Ayai']]",
 'output': "The customers sorted by last name and then first name are:\n1. ['Jen', 'Ayai']\n2. ['Lang', 'Chain']\n3. ['Harrison', 'Chase']\n4. ['Elle', 'Elem']\n5. ['Trance', 'Former']\n6. ['Geoff', 'Fusion']\n7. ['Dolly', 'Too']"}

In [None]:
# date question: the trace below shows the agent calling the custom `get_time` tool
agent_executor.invoke({"input": "whats the date today?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `get_time` with `{'text': ''}`


[0m[36;1m[1;3m2024-03-04[0m[32;1m[1;3mToday's date is March 4, 2024.[0m

[1m> Finished chain.[0m


{'input': 'whats the date today?', 'output': "Today's date is March 4, 2024."}

# RAG

## Document Loading

In [None]:
# accessing and reading different types of data as standard document format
# here: a PDF, loaded page by page
loader = PyPDFLoader("./drive/MyDrive/dataset/docs/pdf/A Fast, Minimal Memory, Consistent Hash Algorithm (1406.2294).pdf")
pages = loader.load() # a list of documents (each page is a unique document)
page = pages[0] # document

print(page.page_content[:100]) # content of the page
print(page.metadata) # metadata associated with each document

A Fast, Minimal Memory, Consistent Hash Algorithm 
 
John Lamping, Eric Veach 
Google 
 
Abstract 
 
{'source': './drive/MyDrive/dataset/docs/pdf/A Fast, Minimal Memory, Consistent Hash Algorithm (1406.2294).pdf', 'page': 0}


In [None]:
# load a web page as documents
# NOTE(review): this is the GitHub "blob" page, so the loaded text presumably includes page chrome
# around the markdown - confirm, or point the loader at the raw file instead
loader = WebBaseLoader("https://github.com/basecamp/handbook/blob/master/37signals-is-you.md")
docs = loader.load()

In [None]:
# YouTube transcripts
# NOTE(review): presumably the first available transcript among `language` is fetched and
# translated to `translation` - confirm against the YoutubeLoader documentation
loader = YoutubeLoader.from_youtube_url(
  "https://www.youtube.com/watch?v=QsYGlZkevEg",
  add_video_info=True,
  language=["en", "id"],
  translation="en"
)
docs = loader.load()

## Document Splitting

In [None]:
# splitting documents into smaller chunks before storing them in vector stores
# chunks should include semantically relevant and complete sentences!

# sample text used by the splitter demos below (two paragraphs separated by a blank line)
text = """When writing documents, writers will use document structure to group content. \
This can convey to the reader, which idea's are related. For example, closely related ideas \
are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.\n\n\
Paragraphs are often delimited with a carriage return or two carriage returns. \
Carriage returns are the "backslash n" you see embedded in this string. \
Sentences have a period at the end, but also, have a space \
and words are separated by space."""

# a markdown file loaded as documents, also used by the splitter demos below
loader = TextLoader("drive/MyDrive/dataset/docs/md/our-rituals.md")
docs = loader.load()

In [None]:
# splitters try to keep each chunk as close as possible to the maximum size
# by combining splits according to the separators
# while keeping some overlap content from the previous chunk with respect to the maximum overlap size

# sizes are measured with `len`, i.e. in characters; the tiny values are for demonstration only
splitter = CharacterTextSplitter(
  chunk_size=26,
  chunk_overlap=4,
  separator="",
  length_function=len
)

splits = splitter.split_text(text) # splits text into list of text chunks
documents = splitter.create_documents([text]) # splits list of texts and those splits into documents
chunks = splitter.split_documents(docs) # splits list of documents into chunks

In [None]:
# when a piece still exceeds the chunk size, the next separator in the list is tried

splitter = RecursiveCharacterTextSplitter(
  chunk_size=26,
  chunk_overlap=4,
  # separators are regex-escaped by default, so the look-behind below would be searched for
  # literally and never match; treat them as regular expressions instead
  is_separator_regex=True,
  # use look-behind regex to keep the period at the end of the split;
  # "?" is a regex metacharacter and must be escaped once separators are regexes
  separators=["\n\n", "\n", r"(?<=\. )", "!", r"\?", ",", " ", ""],
  keep_separator=True
)

splits = splitter.split_text(text)
documents = splitter.create_documents([text])
chunks = splitter.split_documents(docs)

In [None]:
# useful when considering the context window size of LLMs
# tokens are usually ~4 characters
# here chunk_size and chunk_overlap are counted in tokens rather than characters

splitter = TokenTextSplitter(
  chunk_size=26,
  chunk_overlap=4,
  # encoding name or model name should be specified
  encoding_name="gpt2",
  # model_name="gpt2",
)
# CharacterTextSplitter.from_tiktoken_encoder(encoding_name="gpt2",)

splits = splitter.split_text(text)
documents = splitter.create_documents([text])
chunks = splitter.split_documents(docs)

In [None]:
# splitting based on specific headers to keep common context together
# also adds information to metadata for each chunk

# each pair is (markdown header prefix, metadata key under which that header's text is stored)
splitter = MarkdownHeaderTextSplitter(
  headers_to_split_on=[
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3")
  ]
)

# the splitter takes one string, so the loaded documents are concatenated first
# NOTE(review): joining with a space would glue a header at the start of a second document onto the
# previous document's last line - harmless if `docs` holds a single document; confirm
splits = splitter.split_text(' '.join([d.page_content for d in docs]))

In [None]:
# allows splitting code for supported programming languages

# RecursiveCharacterTextSplitter.from_language(language=Language.CSHARP,)

In [None]:
# splitting text by looking at sentences according to the rules of the language

# NLTKTextSplitter(language="english",)
# SpacyTextSplitter(pipeline="en_core_web_sm",)

## Vectorstores and Embedding

In [4]:
# source documents for the vector store built in this section
loaders = [
  # duplicate documents on purpose - messy data
  TextLoader("drive/MyDrive/dataset/docs/md/our-rituals.md"),
  TextLoader("drive/MyDrive/dataset/docs/md/our-rituals.md"),
  TextLoader("drive/MyDrive/dataset/docs/md/benefits-and-perks.md")
]
docs = []
for loader in loaders:
  docs.extend(loader.load())

# split everything into small overlapping chunks; `splits` is what gets embedded below
splitter = RecursiveCharacterTextSplitter(
  chunk_size = 200,
  chunk_overlap = 20
)
splits = splitter.split_documents(docs)

In [6]:
# numerical representations of text to find text with similar context or semantic meaning

# sentences 1 and 2 mean nearly the same thing; sentence 3 is unrelated
sentence1 = "i like dogs"
sentence2 = "i like canines"
sentence3 = "the weather is ugly outside"

embed = OpenAIEmbeddings()

embedding1 = embed.embed_query(sentence1)
embedding2 = embed.embed_query(sentence2)
embedding3 = embed.embed_query(sentence3)

# dot product as similarity score: higher means more similar
# NOTE(review): this equals cosine similarity only if the embeddings are unit-length - confirm
print(np.dot(embedding1, embedding2))
print(np.dot(embedding1, embedding3))

0.9631227500523626
0.7703257495981698


In [7]:
# start from a clean on-disk store so re-running the cell does not add the same splits again
persist_directory = 'chroma/'
!rm -rf ./chroma  # remove old database files if any

# a database which stores embeddings and allows looking up for similar vectors
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embed,
    persist_directory=persist_directory
)

print(vectordb._collection.count()) # number of individual splits = len(splits)

vectordb.persist() # persist the vector database for future use

133


In [None]:
# basic semantic search
# the duplicated source file shows up as duplicated hits in the output below

question = "When does an All Hands meeting occur?"
retrieved_docs = vectordb.similarity_search(question, k=3) # k = len(retrieved_docs)
retrieved_docs
# retrieved_docs[0].page_content
# retrieved_docs[0].metadata

[Document(page_content='At the end of every cycle, we hold an All Hands meeting. Everyone at the company gathers on a Zoom call (or in person, at meetups) to hear about product development, business operations, new hires,', metadata={'source': 'drive/MyDrive/dataset/docs/md/our-rituals.md'}),
 Document(page_content='At the end of every cycle, we hold an All Hands meeting. Everyone at the company gathers on a Zoom call (or in person, at meetups) to hear about product development, business operations, new hires,', metadata={'source': 'drive/MyDrive/dataset/docs/md/our-rituals.md'}),
 Document(page_content='## All Hands', metadata={'source': 'drive/MyDrive/dataset/docs/md/our-rituals.md'})]

## Retrieval

In [None]:
# tiny corpus to compare retrieval strategies: the first two texts overlap heavily,
# the third carries different information about the same mushroom
texts = [
  """The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).""",
  """A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.""",
  """A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.""",
]
# no persist_directory is given for this store
vectordb_small = Chroma.from_texts(texts, embedding=OpenAIEmbeddings())
question = "Tell me about all-white mushrooms with large fruiting bodies"

In [None]:
# plain similarity search: returns the two overlapping texts (see output)
vectordb_small.similarity_search(question, k=2)

[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.'),
 Document(page_content='The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).')]

In [None]:
# maximum marginal relevance strives to achieve
# both semantically relevant (relevance to the query)
# and distinct (diverse among the results) chunks
# compare with the plain similarity search above: the second hit is now the "Death Cap" text

vectordb_small.max_marginal_relevance_search(question, k=2, fetch_k=3) # from top fetch_k most relevant documents, return top k most diverse documents

[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.'),
 Document(page_content='A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.')]

In [None]:
# reopen the Chroma store from the directory it was persisted to earlier
vectordb = Chroma(
  persist_directory="chroma/",
  embedding_function=OpenAIEmbeddings()
)
question = "when does the all hands meeting, one of the company's rituals, take place?"
vectordb.similarity_search(
  question,
  k=3,
  # metadata filtering based on structured information that is hard to capture semantically
  filter={'source': 'drive/MyDrive/dataset/docs/md/our-rituals.md'}
)

[Document(page_content='At the end of every cycle, we hold an All Hands meeting. Everyone at the company gathers on a Zoom call (or in person, at meetups) to hear about product development, business operations, new hires,', metadata={'source': 'drive/MyDrive/dataset/docs/md/our-rituals.md'}),
 Document(page_content='At the end of every cycle, we hold an All Hands meeting. Everyone at the company gathers on a Zoom call (or in person, at meetups) to hear about product development, business operations, new hires,', metadata={'source': 'drive/MyDrive/dataset/docs/md/our-rituals.md'}),
 Document(page_content='## All Hands', metadata={'source': 'drive/MyDrive/dataset/docs/md/our-rituals.md'})]

In [None]:
# description of the metadata fields, given to the LLM so it can build filters from the query
# NOTE(review): the markdown documents shown in the outputs above only carry a 'source' key;
# 'page' presumably never matches for them - confirm
metadata_field_info = [
  AttributeInfo(
    name="source",
    description="The document name where the chunk is from, should be one of `drive/MyDrive/dataset/docs/md/our-rituals.md`, or `drive/MyDrive/dataset/docs/md/benefits-and-perks.md`.",
    type="string"
  ),
  AttributeInfo(
    name="page",
    description="The page number of the document.",
    type="integer"
  )
]
document_content_description = "Descriptive documents about company information."
# note: this rebinds `llm` to a completion-style OpenAI model
llm = OpenAI(model='gpt-3.5-turbo-instruct', temperature=0)
# inferring the metadata from the query itself
retriever = SelfQueryRetriever.from_llm(
  llm,
  vectordb,
  document_content_description,
  metadata_field_info,
  verbose=True
)

question = "when does the all hands meeting, one of the company's rituals, take place?"
retriever.get_relevant_documents(question)
# [e async for e in retriever.astream_events(question, version="v1")]

In [None]:
# contextual compression gets rid of irrelevant text to improve the quality of
# retrieved documents by revealing the buried information most relevant to the query
# each retrieved document is passed through an LLM and it comes at a cost

# the base retriever uses MMR search; the compressor then extracts the relevant part of each hit
compression_retriever = ContextualCompressionRetriever(
  base_compressor=LLMChainExtractor.from_llm(OpenAI(temperature=0, model="gpt-3.5-turbo-instruct")),
  base_retriever=vectordb.as_retriever(search_type="mmr")
)

question = "when does the all hands meeting, one of the company's rituals, take place?"
compression_retriever.get_relevant_documents(question)

[Document(page_content='At the end of every cycle, we hold an All Hands meeting.', metadata={'source': 'drive/MyDrive/dataset/docs/md/our-rituals.md'}),
 Document(page_content='All Hands', metadata={'source': 'drive/MyDrive/dataset/docs/md/our-rituals.md'}),
 Document(page_content='- Our Rituals\n- Meet-ups', metadata={'source': 'drive/MyDrive/dataset/docs/md/our-rituals.md'})]

In [None]:
# alternative retriever built directly from the small `texts` corpus and an embedding model
retriever = SVMRetriever.from_texts(texts, OpenAIEmbeddings())
question = "Tell me about all-white mushrooms with large fruiting bodies"
retriever.get_relevant_documents(question)

[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.'),
 Document(page_content='The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).'),
 Document(page_content='A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.')]

In [None]:
# TF-IDF retriever over the same texts; note that no embedding model is passed
retriever = TFIDFRetriever.from_texts(texts)
question = "Tell me about all-white mushrooms with large fruiting bodies"
retriever.get_relevant_documents(question)

[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.'),
 Document(page_content='A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.'),
 Document(page_content='The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).')]

## Question Answering

In [8]:
# reopen the persisted Chroma store for the question-answering examples
vectordb = Chroma(
  persist_directory="chroma/",
  embedding_function=OpenAIEmbeddings()
)
question = "when does the all hands meeting, one of the company's rituals, take place?"

In [11]:
# custom prompt: {context} receives the retrieved documents, {question} the user query
template = """Use the following pieces of context to answer the question at the end. \
If you don't know the answer, just say that you don't know, don't try to make up an answer. \
Use three sentences maximum. Keep the answer as concise as possible. \
Always say "thanks for asking!" at the end of the answer.
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

# retrieval QA chain that uses the custom prompt and also returns the documents it was given
qa_chain = RetrievalQA.from_chain_type(
  ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
  retriever=vectordb.as_retriever(),
  return_source_documents=True,
  chain_type="stuff", # stuff | map_reduce | refine | map_rerank
  chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [12]:
# the output below contains 'query', 'result' and 'source_documents'
qa_chain({"query": question})

{'query': "when does the all hands meeting, one of the company's rituals, take place?",
 'result': 'The All Hands meeting takes place at the end of every cycle. Thanks for asking!',
 'source_documents': [Document(page_content='At the end of every cycle, we hold an All Hands meeting. Everyone at the company gathers on a Zoom call (or in person, at meetups) to hear about product development, business operations, new hires,', metadata={'source': 'drive/MyDrive/dataset/docs/md/our-rituals.md'}),
  Document(page_content='At the end of every cycle, we hold an All Hands meeting. Everyone at the company gathers on a Zoom call (or in person, at meetups) to hear about product development, business operations, new hires,', metadata={'source': 'drive/MyDrive/dataset/docs/md/our-rituals.md'}),
  Document(page_content='## All Hands', metadata={'source': 'drive/MyDrive/dataset/docs/md/our-rituals.md'}),
  Document(page_content='## All Hands', metadata={'source': 'drive/MyDrive/dataset/docs/md/our-rit

## Chat

In [13]:
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0) # llm.predict("Hello world!")
embed = OpenAIEmbeddings() # embed.embed_query(question)

vectordb = Chroma(
  persist_directory="chroma/",
  embedding_function=embed
) # vectordb.similarity_search(question, k=3)

# NOTE(review): QA_CHAIN_PROMPT is defined here but is not passed to the ConversationalRetrievalChain
# built in the next cell (its answers lack the "thanks for asking!" suffix) - confirm whether that is intended
template = """Use the following pieces of context to answer the question at the end. \
If you don't know the answer, just say that you don't know, don't try to make up an answer. \
Use three sentences maximum. Keep the answer as concise as possible. \
Always say "thanks for asking!" at the end of the answer.
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [22]:
# conversational retrieval chain with built-in memory: the chat history is stored in the chain itself,
# so follow-up questions can be sent with only the "question" key
qa = ConversationalRetrievalChain.from_llm(
  llm,
  retriever=vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 3}),
  chain_type="stuff",
  return_source_documents=True,
  return_generated_question=True,
  memory=ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key="answer" # returned key to store in the memory
  )
)

In [23]:
# first turn of the conversation
qa({"question": "when does the all hands meeting, one of the company's rituals, take place?"})

{'question': "when does the all hands meeting, one of the company's rituals, take place?",
 'chat_history': [HumanMessage(content="when does the all hands meeting, one of the company's rituals, take place?"),
  AIMessage(content='The All Hands meeting takes place at the end of every cycle.')],
 'answer': 'The All Hands meeting takes place at the end of every cycle.',
 'source_documents': [Document(page_content='At the end of every cycle, we hold an All Hands meeting. Everyone at the company gathers on a Zoom call (or in person, at meetups) to hear about product development, business operations, new hires,', metadata={'source': 'drive/MyDrive/dataset/docs/md/our-rituals.md'}),
  Document(page_content='At the end of every cycle, we hold an All Hands meeting. Everyone at the company gathers on a Zoom call (or in person, at meetups) to hear about product development, business operations, new hires,', metadata={'source': 'drive/MyDrive/dataset/docs/md/our-rituals.md'}),
  Document(page_cont

In [24]:
# follow-up turn: "this type of meetings" is only resolvable through the stored chat history
qa({"question": "what is the goal of this type of meetings?"})

{'question': 'what is the goal of this type of meetings?',
 'chat_history': [HumanMessage(content="when does the all hands meeting, one of the company's rituals, take place?"),
  AIMessage(content='The All Hands meeting takes place at the end of every cycle.'),
  HumanMessage(content='what is the goal of this type of meetings?'),
  AIMessage(content="The goal of the All Hands meeting is to provide updates on product development, business operations, new hires, and other important information to everyone at the company. It is a way to keep all employees informed and aligned with the company's goals and progress.")],
 'answer': "The goal of the All Hands meeting is to provide updates on product development, business operations, new hires, and other important information to everyone at the company. It is a way to keep all employees informed and aligned with the company's goals and progress.",
 'source_documents': [Document(page_content='At the end of every cycle, we hold an All Hands meet

## UI

In [27]:
def load_db(file, chain_type, k):
  """Build a conversational retrieval chain over a single PDF file.

  Args:
    file: path of the PDF to load.
    chain_type: document-combining strategy handed to the chain (e.g. "stuff").
    k: number of chunks the retriever returns for each query.

  Returns:
    A ConversationalRetrievalChain with no memory attached; the caller must pass
    `chat_history` on every call.
  """
  # load documents
  loader = PyPDFLoader(file)
  docs = loader.load()
  # split documents
  splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
  chunks = splitter.split_documents(docs)
  # create vector database from the split chunks and embeddings
  # (indexing `docs` here would embed whole pages and discard the splitting above)
  db = DocArrayInMemorySearch.from_documents(chunks, OpenAIEmbeddings())
  # create a chatbot chain, memory is managed externally!
  qa = ConversationalRetrievalChain.from_llm(
    llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
    chain_type=chain_type,
    retriever=db.as_retriever(search_type="similarity", search_kwargs={"k": k}),
    return_source_documents=True,
    return_generated_question=True
  )
  return qa

import panel as pn
import param

pn.extension()

class cbfs(param.Parameterized):
  """State and callbacks behind the "chat with your data" Panel dashboard.

  Holds the retrieval chain for the currently loaded PDF plus the conversation state,
  and exposes methods that render that state as Panel components. The methods read the
  module-level widgets `file_input`, `button_load` and `inp` when they are called.
  """
  # (question, answer) tuples handed back to the chain on every call
  chat_history = param.List([])
  # answer returned for the last question
  answer = param.String("")
  # question the chain generated for the last database lookup
  db_query = param.String("")
  # source documents returned for the last question
  db_response = param.List([])

  def __init__(self, **params):
    """Create the chain for the default PDF and start with an empty conversation."""
    super(cbfs, self).__init__(**params)
    self.panels = []
    self.loaded_file = "drive/MyDrive/dataset/docs/pdf/A Fast, Minimal Memory, Consistent Hash Algorithm (1406.2294).pdf"
    self.qa = load_db(self.loaded_file, "stuff", 4)

  def call_load_db(self, count):
    """Rebuild the chain from the uploaded PDF; `count` is the load button's click count.

    Returns a Markdown pane naming the loaded file. On the initial render (count == 0) or
    when no file was chosen, nothing is reloaded and the history is kept.
    """
    if count == 0 or file_input.value is None:  # init or no file specified
      return pn.pane.Markdown(f"Loaded File: {self.loaded_file}")
    else:
      file_input.save("temp.pdf")  # local copy
      self.loaded_file = file_input.filename
      # outline style while the database is rebuilt, solid again once it is ready
      button_load.button_style="outline"
      self.qa = load_db("temp.pdf", "stuff", 4)
      button_load.button_style="solid"
    # a new document invalidates the previous conversation
    self.clr_history()
    return pn.pane.Markdown(f"Loaded File: {self.loaded_file}")

  def convchain(self, query):
    """Send `query` to the chain, record the exchange and return the rendered conversation."""
    if not query:
      return pn.WidgetBox(pn.Row('User:', pn.pane.Markdown("", width=600)), scroll=True)
    result = self.qa({"question": query, "chat_history": self.chat_history})
    self.chat_history.append((query, result["answer"]))
    self.db_query = result["generated_question"]
    self.db_response = result["source_documents"]
    self.answer = result['answer']
    self.panels.extend([
      pn.Row('User:', pn.pane.Markdown(query, width=600)),
      # `styles` (not `style`), consistent with the other panes in this class
      pn.Row('ChatBot:', pn.pane.Markdown(self.answer, width=600, styles={'background-color': '#F6F6F6'}))
    ])
    inp.value = ''  #clears loading indicator when cleared
    return pn.WidgetBox(*self.panels,scroll=True)

  # the dependency must name the parameter exactly ('db_query', no trailing space)
  @param.depends('db_query', )
  def get_lquest(self):
    """Render the last question sent to the database, or a placeholder if there is none."""
    if not self.db_query:
      return pn.Column(
        pn.Row(pn.pane.Markdown("Last question to DB:", styles={'background-color': '#F6F6F6'})),
        pn.Row(pn.pane.Str("no DB accesses so far"))
      )
    return pn.Column(
      pn.Row(pn.pane.Markdown("DB query:", styles={'background-color': '#F6F6F6'})),
      pn.pane.Str(self.db_query)
    )

  @param.depends('db_response', )
  def get_sources(self):
    """Render the source documents of the last lookup; returns None when there are none."""
    if not self.db_response:
      return
    rlist=[pn.Row(pn.pane.Markdown("Result of DB lookup:", styles={'background-color': '#F6F6F6'}))]
    for doc in self.db_response:
      rlist.append(pn.Row(pn.pane.Str(doc)))
    return pn.WidgetBox(*rlist, width=600, scroll=True)

  @param.depends('convchain', 'clr_history')
  def get_chats(self):
    """Render the current chat history, one row per (question, answer) exchange."""
    if not self.chat_history:
      return pn.WidgetBox(pn.Row(pn.pane.Str("No History Yet")), width=600, scroll=True)
    rlist=[pn.Row(pn.pane.Markdown("Current Chat History variable", styles={'background-color': '#F6F6F6'}))]
    for exchange in self.chat_history:
      rlist.append(pn.Row(pn.pane.Str(exchange)))
    return pn.WidgetBox(*rlist, width=600, scroll=True)

  def clr_history(self, count=0):
    """Forget the conversation; `count` is accepted so this can be used as a button callback."""
    self.chat_history = []
    return

# the chatbot state is created first; its methods look up the widgets below only when they are called
cb = cbfs()

# widgets
file_input = pn.widgets.FileInput(accept='.pdf')
button_load = pn.widgets.Button(name="Load DB", button_type='primary')
button_clearhistory = pn.widgets.Button(name="Clear History", button_type='warning')
button_clearhistory.on_click(cb.clr_history)
inp = pn.widgets.TextInput( placeholder='Enter text here…')

# bind callbacks to widgets: loading reacts to button clicks, the conversation to the text input
bound_button_load = pn.bind(cb.call_load_db, button_load.param.clicks)
conversation = pn.bind(cb.convchain, inp)

# jpg_pane = pn.pane.Image( './img/convchain.jpg')

# tab 1: text input and the running conversation
tab1 = pn.Column(
    pn.Row(inp),
    pn.layout.Divider(),
    pn.panel(conversation,  loading_indicator=True, height=300),
    pn.layout.Divider(),
)
# tab 2: last database query and the source documents it returned
tab2= pn.Column(
    pn.panel(cb.get_lquest),
    pn.layout.Divider(),
    pn.panel(cb.get_sources ),
)
# tab 3: chat history as currently stored
tab3= pn.Column(
    pn.panel(cb.get_chats),
    pn.layout.Divider(),
)
# tab 4: load another PDF or clear the history
tab4=pn.Column(
    pn.Row( file_input, button_load, bound_button_load),
    pn.Row( button_clearhistory, pn.pane.Markdown("Clears chat history. Can use to start a new topic" )),
    pn.layout.Divider(),
    # pn.Row(jpg_pane.clone(width=400))
)
dashboard = pn.Column(
    pn.Row(pn.pane.Markdown('# ChatWithYourData_Bot')),
    pn.Tabs(('Conversation', tab1), ('Database', tab2), ('Chat History', tab3),('Configure', tab4))
)
# last expression of the cell: renders the dashboard inline
dashboard