# LangChain Playground #3
## Data Augmentation

Our goal is to build an Agent capable of solving queries based on provided contextual knowledge.

In [54]:
%pip install -qU langchain
%pip install -qU requests 
%pip install -qU openai 
%pip install -qU transformers 
%pip install -qU faiss-cpu
%pip install -qU ratelimit

[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
from langchain.llms import OpenAI
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.docstore.document import Document
import requests

### Search agent set up

In [3]:
import os
# openai API key
os.environ["OPENAI_API_KEY"] = "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"

In [4]:
def get_wiki_data(title, first_paragraph_only):
    url = f"https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&explaintext=1&titles={title}"
    if first_paragraph_only:
        url += "&exintro=1"
    data = requests.get(url).json()
    return Document(
        page_content=list(data["query"]["pages"].values())[0]["extract"],
        metadata={"source": f"https://en.wikipedia.org/wiki/{title}"},
    )

In [5]:
sources = [
    get_wiki_data("Unix", True),
    get_wiki_data("Microsoft_Windows", True),
    get_wiki_data("Linux", True),
    get_wiki_data("Seinfeld", True),
]

In [6]:
chain = load_qa_with_sources_chain(OpenAI(temperature=0))

def print_answer(question):
    print(
        chain(
            {
                "input_documents": sources,
                "question": question,
            },
            return_only_outputs=True,
        )["output_text"]
    )

In [7]:
print_answer("Who were the writers of Seinfeld?")

 The writers of Seinfeld were Larry David, Jerry Seinfeld, Larry Charles, Peter Mehlman, Gregg Kavet, Carol Leifer, David Mandel, Jeff Schaffer, Steve Koren, Jennifer Crittenden, Tom Gammill, Max Pross, Dan O'Keefe, Charlie Rubin, Marjorie Gross, Alec Berg, Elaine Pope and Spike Feresten.
SOURCES: https://en.wikipedia.org/wiki/Seinfeld


In [8]:
sources = [
    get_wiki_data("Unix", True),
    get_wiki_data("Microsoft_Windows", True),
    get_wiki_data("Linux", True),
    get_wiki_data("Seinfeld", True),
    get_wiki_data("Matchbox_Twenty", True),
    get_wiki_data("Roman_Empire", True),
    get_wiki_data("London", True),
    get_wiki_data("Python_(programming_language)", True),
    get_wiki_data("Monty_Python", True),
]

In [14]:
print_answer("Who were the writers of Seinfeld?")

InvalidRequestError: This model's maximum context length is 4097 tokens, however you requested 6164 tokens (5908 in your prompt; 256 for the completion). Please reduce your prompt; or completion length.

## Using a multi-step chain

In [22]:
sources = [
    get_wiki_data("Microsoft_Windows", True),
    get_wiki_data("Linux", True),
]
chain = load_qa_with_sources_chain(OpenAI(temperature=0))
print_answer("What are the main differences between Linux and Windows?")

 Linux is an open-source Unix-like operating system based on the Linux kernel, while Windows is a group of proprietary graphical operating system families developed and marketed by Microsoft. Linux is used on servers, embedded systems, and desktop computers, while Windows is primarily used on desktop computers.
SOURCES: 
https://en.wikipedia.org/wiki/Microsoft_Windows
https://en.wikipedia.org/wiki/Linux


# Improving efficiency using a vector space search engine

In [25]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS

In [26]:
sources = [
    get_wiki_data("Unix", True),
    get_wiki_data("Microsoft_Windows", True),
    get_wiki_data("Linux", True),
    get_wiki_data("Seinfeld", True),
    get_wiki_data("Matchbox_Twenty", True),
    get_wiki_data("Roman_Empire", True),
    get_wiki_data("London", True),
    get_wiki_data("Python_(programming_language)", True),
    get_wiki_data("Monty_Python", True),
]

In [27]:
search_index = FAISS.from_documents(sources, OpenAIEmbeddings())

In [28]:
chain = load_qa_with_sources_chain(OpenAI(temperature=0))

def print_answer(question):
    print(
        chain(
            {
                "input_documents": search_index.similarity_search(question, k=4),
                "question": question,
            },
            return_only_outputs=True,
        )["output_text"]
    )

In [29]:
print_answer("Which members of Matchbox 20 play guitar?")

 Rob Thomas, Kyle Cook, and Paul Doucette play guitar in Matchbox 20.
SOURCES: https://en.wikipedia.org/wiki/Matchbox_Twenty


## Dealing with a big document Corpus

In [49]:
sources = [
    get_wiki_data("Unix", False),
    get_wiki_data("Microsoft_Windows", False),
    get_wiki_data("Linux", False),
    get_wiki_data("Seinfeld", False),
    get_wiki_data("Matchbox_Twenty", False),
]

In [50]:
chain = load_qa_with_sources_chain(OpenAI(temperature=0))
search_index = FAISS.from_documents(sources, OpenAIEmbeddings())
print_answer("Which members of Matchbox 20 play guitar?")

InvalidRequestError: This model's maximum context length is 8191 tokens, however you requested 11161 tokens (11161 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.

In [51]:
from langchain.text_splitter import CharacterTextSplitter

In [56]:
import requests
from ratelimit import limits, RateLimitException, sleep_and_retry

In [61]:
ONE_PERIOD = 10
MAX_CALLS_PER_MINUTE = 3

In [73]:
import time
source_chunks = []    
splitter = CharacterTextSplitter(separator=" ", chunk_size=1024, chunk_overlap=0)
for source in sources:
    for chunk in splitter.split_text(source.page_content):
        source_chunks.append(Document(page_content=chunk, metadata=source.metadata))

for i in range(0, len(source_chunks), 5):
    print(i)
    search_index = FAISS.from_documents(source_chunks[i:i+5], OpenAIEmbeddings())
    time.sleep(5.0)

#search_index = FAISS.from_documents(source_chunks, OpenAIEmbeddings())


0
5
10
15
20
25
30
35
40


RateLimitError: Rate limit reached for default-global-with-image-limits in organization org-581hUnehnO5YCYSlq7LTuGW7 on requests per min. Limit: 60.000000 / min. Current: 70.000000 / min. Contact support@openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method.