In [1]:
# Set up environment variables first, before any OpenAI-backed client is created below.
# NOTE(review): init_env_vars is defined in main.py (not shown here); presumably it
# loads the API key(s) into the process environment — confirm.
from main import init_env_vars
init_env_vars()

In [8]:
from time import perf_counter
from langchain.document_loaders import DirectoryLoader
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
# Module-level embedding client; the top-level FAISS.load_local calls further down reuse it.
embeddings = OpenAIEmbeddings()

from main import estimate_token_cost
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [9]:
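# Optional: uncomment to install tqdm if it is missing
# (presumably needed for DirectoryLoader's show_progress=True progress bar).
#%pip install tqdm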

In [10]:
def create_index_docs(loaded_docs, context, batch_num):
    """Chunk one batch of documents, embed the chunks and save a FAISS index.

    Args:
        loaded_docs: The documents that make up this batch.
        context: Label of the index set; appears in the log lines and in the
            save path.
        batch_num: Identifier of the batch; appears in the log lines and as
            the suffix of the saved index directory.

    The documents are split with ``chunk_size=1000`` and ``chunk_overlap=0``,
    the word/token/cost estimate from ``estimate_token_cost`` is printed, and
    the resulting index is written to
    ``temp_index/{context}/batches/{context}_{batch_num}``. Returns nothing.
    """
    print(f'Creating {context} index db for batch: {batch_num}')
    print(f"Loading {len(loaded_docs)} documents...")

    chunks = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
    ).split_documents(loaded_docs)

    # Report the estimate before any embedding request is made.
    n_words, n_tokens, cost_estimate = estimate_token_cost(chunks)
    print(f"Total word count: {n_words}")
    print(f"Total token count: {n_tokens}")
    print(f"Estimated cost: {cost_estimate}\n")

    # The timed section covers client construction and index creation, not the save.
    started_at = perf_counter()
    index = FAISS.from_documents(chunks, OpenAIEmbeddings())
    elapsed = perf_counter() - started_at
    print(f"Creating index took {elapsed:.2f} seconds")

    index.save_local(f"temp_index/{context}/batches/{context}_{batch_num}")
    print(f'Done creating {context} index db for batch: {batch_num}\n')

In [11]:
def merge_indexes(context, documents, batch_size):
    """Combine the per-batch FAISS indexes of ``context`` into a single index.

    Args:
        context: Label of the index set; selects ``temp_index/{context}/``.
        documents: The full document list that was batched. Only its length
            is used, to enumerate the batch offsets.
        batch_size: Stride between batch offsets. Batches ``0, batch_size,
            2 * batch_size, ...`` below ``len(documents)`` are loaded from
            ``temp_index/{context}/batches/{context}_{offset}``.

    The merged index is saved to ``temp_index/{context}/merged``.
    Returns nothing.
    """
    embeddings = OpenAIEmbeddings()
    batches_dir = f'temp_index/{context}/batches'

    # Batch 0 is the base index; every later batch is folded into it in place.
    print(f'Loading {context} index db for batch: 0')
    db = FAISS.load_local(f'{batches_dir}/{context}_0', embeddings)
    for i in range(batch_size, len(documents), batch_size):
        print(f'Loading {context} index db for batch: {i}')
        db.merge_from(FAISS.load_local(f'{batches_dir}/{context}_{i}', embeddings))
    db.save_local(f'temp_index/{context}/merged')

In [12]:
def load_and_index_site(directory, site_name, batch_size):
    """Load every HTML file under ``directory`` and build a merged index for it.

    Args:
        directory: Root directory searched recursively for ``*.html`` files.
        site_name: Label passed to ``create_index_docs`` / ``merge_indexes``
            as the index context.
        batch_size: Number of documents indexed per batch; must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is below 1, or if no document was
            loaded from ``directory``.

    The documents are indexed in slices of ``batch_size`` (each batch is
    identified by the offset of its first document) and the per-batch
    indexes are then merged.
    """
    # Without this check a negative step yields no batches at all and the
    # merge step fails later on a missing batch-0 index.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size!r}")

    html_glob = '**/*.html'
    loader = DirectoryLoader(directory, glob=html_glob, show_progress=True)
    documents = loader.load()

    # Fail fast with a clear message: with nothing loaded, no batch index is
    # written and merging would fail on the missing batch-0 index instead.
    if not documents:
        raise ValueError(
            f"No documents matching {html_glob!r} were loaded from {directory!r}"
        )

    for i in range(0, len(documents), batch_size):
        create_index_docs(documents[i:i+batch_size], site_name, i)
    merge_indexes(site_name, documents, batch_size)

In [13]:
# Build the 'main' index from the HTML files under ./website/python.langchain.com,
# five documents per batch.
load_and_index_site('./website/python.langchain.com', 'main', batch_size=5)

100%|██████████| 993/993 [00:38<00:00, 25.76it/s]


Creating main index db for batch: 0
Loading 5 documents...
Total word count: 8745
Total token count: 42497
Estimated cost: 0.016998799999999998

Creating index took 9.54 seconds
Done creating main index db for batch: 0

Creating main index db for batch: 5
Loading 5 documents...
Total word count: 713
Total token count: 1239
Estimated cost: 0.0004956

Creating index took 1.37 seconds
Done creating main index db for batch: 5

Creating main index db for batch: 10
Loading 5 documents...
Total word count: 714
Total token count: 1115
Estimated cost: 0.000446

Creating index took 0.78 seconds
Done creating main index db for batch: 10

Creating main index db for batch: 15
Loading 5 documents...
Total word count: 489
Total token count: 928
Estimated cost: 0.0003712

Creating index took 1.43 seconds
Done creating main index db for batch: 15

Creating main index db for batch: 20
Loading 5 documents...
Total word count: 585
Total token count: 944
Estimated cost: 0.0003776

Creating index took 1.26 

In [14]:
# Promote the merged 'main' index to the final 'index' directory.
# NOTE: only the main-site index is loaded here; no blog or help index is merged in.
db = FAISS.load_local('temp_index/main/merged', embeddings)

db.save_local('index')

In [44]:
# Reload the saved index from the 'index' directory; this rebinds `db`,
# which the retriever setup below reads.
db = FAISS.load_local('index', embeddings)

In [15]:
from langchain.memory import ConversationBufferMemory, ConversationSummaryBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA, ConversationalRetrievalChain

# Similarity search over the index, returning k=4 results per query.
retriever = db.as_retriever(search_type='similarity', search_kwargs={'k': 4})
# Conversation history is stored under "chat_history" as message objects (return_messages=True).
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Conversational retrieval chain wiring together the chat model (temperature=0),
# the retriever and the memory defined above.
qa = ConversationalRetrievalChain.from_llm(llm=ChatOpenAI(temperature=0), retriever=retriever, memory=memory, verbose=True)

In [16]:
# Ask a first question; the chain returns a dict holding the question,
# the chat history and the answer.
question = "What is LangChain?"
result = qa({"question": question})

# Bare expression so the notebook displays the returned dict.
result

{'question': 'What is LangChain?',
 'chat_history': [HumanMessage(content='What is LangChain?', additional_kwargs={}, example=False),
  AIMessage(content='LangChain is a software development framework designed to simplify the creation of applications using large language models (LLMs). It is written in Python and JavaScript and was initially released in October 2022. LangChain is a powerful tool that can be used to work with Large Language Models (LLMs) and can be used for chatbots, Generative Question-Answering (GQA), summarization, and much more. It is an intuitive framework created to assist in developing applications driven by a language model, such as OpenAI or Hugging Face. LangChain provides a standard interface for chains, enabling developers to create sequences of calls that go beyond a single LLM call.', additional_kwargs={}, example=False)],
 'answer': 'LangChain is a software development framework designed to simplify the creation of applications using large language models