In [32]:
# Set up environment variables first, before any OpenAI-backed object is created.
# NOTE(review): presumably this loads the OpenAI API key — confirm in main.init_env_vars.
from main import init_env_vars
init_env_vars()

In [33]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import UnstructuredURLLoader
import tiktoken
from time import perf_counter
from langchain.document_loaders import DirectoryLoader

from main import estimate_token_cost
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
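# Optional: uncomment to install tqdm, which DirectoryLoader(show_progress=True) needs for its progress bar.
#%pip install tqdm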

In [38]:
def create_index_docs(loaded_docs, context, batch_num):
    """Embed one batch of documents and persist it as its own FAISS index.

    The batch is split into chunks (``chunk_size=1000``, ``chunk_overlap=0``),
    the figures returned by ``estimate_token_cost`` are printed, the chunks are
    embedded with ``OpenAIEmbeddings`` and the resulting index is written to
    ``temp_index/{context}/batches/{context}_{batch_num}``.

    Args:
        loaded_docs: Documents belonging to this batch.
        context: Site/context name; used to build the output path.
        batch_num: Identifier of the batch; shown in the log lines and used as
            the suffix of the saved index folder.

    Returns:
        None. The index is saved to disk as a side effect.
    """
    print(f'Creating index db for batch: {batch_num}')
    print(f"Loading {len(loaded_docs)} documents...")

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    chunks = splitter.split_documents(loaded_docs)

    # estimate_token_cost is a project helper from main; its three results are
    # reported exactly as returned.
    n_words, n_tokens, cost = estimate_token_cost(chunks)
    print(f"Total word count: {n_words}")
    print(f"Total token count: {n_tokens}")
    print(f"Estimated cost: {cost}\n")

    # Time the embedding client construction together with the index build.
    started = perf_counter()
    index = FAISS.from_documents(chunks, OpenAIEmbeddings())
    elapsed = perf_counter() - started
    print(f"Creating index took {elapsed:.2f} seconds")

    batch_dir = f"temp_index/{context}/batches/{context}_{batch_num}"
    index.save_local(batch_dir)
    print(f'Done creating index db for batch: {batch_num}\n')

In [39]:
def merge_indexes(context, documents, batch_size):
    """Merge the per-batch FAISS indexes of one context into a single index.

    Batch indexes are read from ``temp_index/{context}/batches/{context}_{start}``
    for every ``start`` in ``range(0, len(documents), batch_size)``, and the
    combined index is saved to ``temp_index/{context}/merged``.

    Args:
        context: Site/context name used in the index paths.
        documents: The documents that were split into batches. Only its length
            is used, to reconstruct the batch start offsets.
        batch_size: Batch size that was used when the batch indexes were
            created. Must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (a negative step would
            silently skip every batch after the first) or if ``documents`` is
            empty (no batch index was produced, so batch 0 would be stale or
            missing).
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size!r}')
    if len(documents) == 0:
        raise ValueError(f'no documents for context {context!r}; there are no batch indexes to merge')

    embeddings = OpenAIEmbeddings()
    batch_root = f'temp_index/{context}/batches'

    # Batch 0 is the base index; every later batch is merged into it in place.
    print(f'Loading {context} index db for batch: 0')
    db = FAISS.load_local(f'{batch_root}/{context}_0', embeddings)
    for start in range(batch_size, len(documents), batch_size):
        print(f'Loading {context} index db for batch: {start}')
        db.merge_from(FAISS.load_local(f'{batch_root}/{context}_{start}', embeddings))
    db.save_local(f'temp_index/{context}/merged')

In [40]:
def load_and_index_site(directory, site_name, batch_size):
    """Load every HTML file under ``directory`` and build a merged index for it.

    The documents are indexed in consecutive batches of ``batch_size`` via
    ``create_index_docs`` and the batch indexes are then combined with
    ``merge_indexes``. Each batch is identified by the offset of its first
    document (0, batch_size, 2 * batch_size, ...).

    Args:
        directory: Root folder that is searched recursively for ``*.html`` files.
        site_name: Context name passed on to the indexing and merging steps.
        batch_size: Number of documents per batch. Must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or if no HTML
            documents were found under ``directory``.
    """
    # Validate before loading: a non-positive step would otherwise either fail
    # only after all documents were loaded (0) or produce no batches at all
    # (negative), leaving the merge step to pick up stale batch folders.
    if batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size!r}')

    loader = DirectoryLoader(directory, glob='**/*.html', show_progress=True)
    documents = loader.load()
    if len(documents) == 0:
        raise ValueError(f'no HTML documents found under {directory!r}')

    for start in range(0, len(documents), batch_size):
        create_index_docs(documents[start:start + batch_size], site_name, start)
    merge_indexes(site_name, documents, batch_size)

In [42]:
# Build the per-site indexes. Only one site is (re)indexed per run; the other
# calls are left commented out.
# NOTE: the cross-site merge cell further down loads temp_index/main/merged,
# temp_index/blog/merged and temp_index/help/merged, so each of these calls
# must have been run at least once before that cell.

# Index the main site
# load_and_index_site('./website/zorin.com', 'main', batch_size=5)

# Index the blog site
# load_and_index_site('./website/blog.zorin.com', 'blog', batch_size=3)

# Index the help site
load_and_index_site('./website/help.zorin.com', 'help', batch_size=3)

100%|██████████| 67/67 [00:00<00:00, 103.55it/s]


Creating index db for batch: 0
Loading 3 documents...
Total word count: 1271
Total token count: 1735
Estimated cost: 0.0006940000000000001

Creating index took 0.72 seconds
Done creating index db for batch: 0

Creating index db for batch: 3
Loading 3 documents...
Total word count: 1484
Total token count: 2169
Estimated cost: 0.0008676000000000001

Creating index took 1.17 seconds
Done creating index db for batch: 3

Creating index db for batch: 6
Loading 3 documents...
Total word count: 1130
Total token count: 1495
Estimated cost: 0.000598

Creating index took 1.03 seconds
Done creating index db for batch: 6

Creating index db for batch: 9
Loading 3 documents...
Total word count: 592
Total token count: 887
Estimated cost: 0.0003548

Creating index took 0.54 seconds
Done creating index db for batch: 9

Creating index db for batch: 12
Loading 3 documents...
Total word count: 1095
Total token count: 1432
Estimated cost: 0.0005727999999999999

Creating index took 0.95 seconds
Done creating

In [43]:
# Combine the merged per-site indexes (main, blog, help) into one index.
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

# Start from the main-site index and fold the other two sites into it in place.
db = FAISS.load_local('temp_index/main/merged', embeddings)
db.merge_from(FAISS.load_local('temp_index/blog/merged', embeddings))
db.merge_from(FAISS.load_local('temp_index/help/merged', embeddings))

# Persist the combined index in the 'index' folder.
db.save_local('index')

In [44]:
# Reload the combined index from the 'index' folder so the cells below query
# the copy stored on disk.
# NOTE: FAISS.load_local unpickles the saved docstore — only load index folders
# that were produced locally / come from a trusted source.
db = FAISS.load_local('index', embeddings)

In [45]:
from langchain.memory import ConversationBufferMemory, ConversationSummaryBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA, ConversationalRetrievalChain

# Similarity search over the combined index, returning 4 chunks per query.
retriever = db.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 4},
)

# Conversation history is kept as message objects under the "chat_history" key.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
)

# Conversational retrieval chain: chat model at temperature 0 + retriever + memory.
qa = ConversationalRetrievalChain.from_llm(
    llm=ChatOpenAI(temperature=0),
    retriever=retriever,
    memory=memory,
    verbose=False,
)

In [46]:
# Ask a sample question; the chain is called with a dict holding the "question" key.
question = "What is Zorin OS? What is it based on?"
result = qa({"question": question})

# Display the full result dict (question, chat_history, answer).
result

{'question': 'What is Zorin OS? What is it based on?',
 'chat_history': [HumanMessage(content='What is Zorin OS? What is it based on?', additional_kwargs={}, example=False),
  AIMessage(content='Zorin OS is a Linux-based operating system that was created to bring advanced technology to everyone. It is based on Ubuntu and comes pre-installed with the updated Linux Kernel 4.2 which brings bug fixes, improved hardware support, and performance improvements.', additional_kwargs={}, example=False)],
 'answer': 'Zorin OS is a Linux-based operating system that was created to bring advanced technology to everyone. It is based on Ubuntu and comes pre-installed with the updated Linux Kernel 4.2 which brings bug fixes, improved hardware support, and performance improvements.'}