Export Knowledge Worker

A question answering agent that is specialized in answering questions based on the knowledge base(text, files etc.) provided to it.

In [162]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr

In [163]:
# import from langchain
from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain.vectorstores import Chroma
from langchain.schema import Document
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go

In [164]:
MODEL = 'gpt-4o-mini'
db_name = 'vector_db'

In [165]:
load_dotenv(override=True)

# Load environment variables from .env file
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [166]:
# read in documents using Langchain loaders

folders = glob.glob('knowledge-base/*')

documents = []

for folder in folders:
    doc_type = folder.split('/')[-1]
    loader = DirectoryLoader(folder, glob='**/*.md', loader_cls=TextLoader)
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata['doc_type'] = doc_type
        documents.append(doc)
        
len(documents)
documents[0]

Document(metadata={'source': 'knowledge-base/products/Rellm.md', 'doc_type': 'products'}, page_content="# Product Summary\n\n# Rellm: AI-Powered Enterprise Reinsurance Solution\n\n## Summary\n\nRellm is an innovative enterprise reinsurance product developed by Insurellm, designed to transform the way reinsurance companies operate. Harnessing the power of artificial intelligence, Rellm offers an advanced platform that redefines risk management, enhances decision-making processes, and optimizes operational efficiencies within the reinsurance industry. With seamless integrations and robust analytics, Rellm enables insurers to proactively manage their portfolios and respond to market dynamics with agility.\n\n## Features\n\n### AI-Driven Analytics\nRellm utilizes cutting-edge AI algorithms to provide predictive insights into risk exposures, enabling users to forecast trends and make informed decisions. Its real-time data analysis empowers reinsurance professionals with actionable intellige

In [167]:
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200)

chunks = text_splitter.split_documents(documents)

len(chunks)
chunks[0]

Created a chunk of size 1088, which is longer than the specified 1000


Document(metadata={'source': 'knowledge-base/products/Rellm.md', 'doc_type': 'products'}, page_content='# Product Summary\n\n# Rellm: AI-Powered Enterprise Reinsurance Solution\n\n## Summary\n\nRellm is an innovative enterprise reinsurance product developed by Insurellm, designed to transform the way reinsurance companies operate. Harnessing the power of artificial intelligence, Rellm offers an advanced platform that redefines risk management, enhances decision-making processes, and optimizes operational efficiencies within the reinsurance industry. With seamless integrations and robust analytics, Rellm enables insurers to proactively manage their portfolios and respond to market dynamics with agility.\n\n## Features\n\n### AI-Driven Analytics\nRellm utilizes cutting-edge AI algorithms to provide predictive insights into risk exposures, enabling users to forecast trends and make informed decisions. Its real-time data analysis empowers reinsurance professionals with actionable intellige

In [168]:
doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)
print(f"Document types: {doc_types}")

Document types: {'company', 'products', 'employees', 'contracts'}


In [169]:
embeddings = OpenAIEmbeddings()

In [170]:
# check if Chroma vector store already exists, if so delete it and start from scratch.

if os.path.exists(db_name):
    print(f"Delete existing vector store from {db_name}")
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

Delete existing vector store from vector_db


In [171]:
# Create a new Chroma vector store

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name
)

print(f"Persisting vector store to {db_name}")
print(f"Number of documents in vector store: {vectorstore._collection.count()}")

Persisting vector store to vector_db
Number of documents in vector store: 123


In [172]:
# Find one vector and see how many dimentions it has ...

collection = vectorstore._collection
sample_embedding = collection.get(limit=1, include=['embeddings'])['embeddings'][0]
dimentions = len(sample_embedding)
print(f"Sample embedding has {dimentions} dimensions")

Sample embedding has 1536 dimensions


In [173]:
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
doc_types = (metadata['doc_type'] for metadata in result['metadatas'])
colors = [['blue', 'green', 'red', 'orange'][['products', 'employees', 'contracts', 'company'].index(t)] for t in doc_types]
colors

['blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'orange',
 'orange',
 'orange',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 '

In [174]:
# 2D t-SNE visualization of vector store
# Reduce the dimensionality of the vectors to 2D using t-SNE
# Note: t-SNE is computationally expensive and may take a while for large datasets

tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create 2D scatter plot
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=10, color=colors, opacity=0.8),
    text=[
        f"Document: {doc[:50]}...<br>Type: {metadata['doc_type']}"
        for doc, metadata in zip(documents, result['metadatas'])
    ],
    hoverinfo='text',
)])

fig.update_layout(
    title='2D t-SNE Visualization of Vector Store',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(l=10, r=20, t=40, b=10),
)

fig.show()

In [175]:
# 2D t-SNE visualization of vector store
# Reduce the dimensionality of the vectors to 2D using t-SNE
# Note: t-SNE is computationally expensive and may take a while for large datasets

tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create 2D scatter plot
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=10, color=colors, opacity=0.8),
    text=[
        f"Document: {doc[:50]}...<br>Type: {metadata['doc_type']}"
        for doc, metadata in zip(documents, result['metadatas'])
    ],
    hoverinfo='text',
)])

fig.update_layout(
    title='3D t-SNE Visualization of Vector Store',
    scene=dict(
        xaxis_title='x',
        yaxis_title='y',
        zaxis_title='z',
    ),
    width=800,
    height=600,
    margin=dict(l=10, r=20, t=40, b=10),
)

fig.show()

In [177]:
# create a new chat with OpenAI
llm = ChatOpenAI(model_name=MODEL, temperature=0.7)

# set up the conversion memory for chat
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# the retriever is an abstraction over the vectorstore that will be used during RAG
retriever = vectorstore.as_retriever()

# putting langchain all together, we create a ConversationalRetrievalChain
# that will use the LLM and the retriever to answer questions
# and keep track of the conversation history in the memory
# this is the chain that will be used to answer questions
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory
)


Please see the migration guide at: https://python.langchain.com/docs/versions/migrating_memory/



In [178]:
query = "Whats Insurellm?"
result = conversation_chain({"question": query})
print(f"Answer: {result['answer']}")


The method `Chain.__call__` was deprecated in langchain 0.1.0 and will be removed in 1.0. Use :meth:`~invoke` instead.



Answer: Insurellm is an innovative insurance tech firm founded by Avery Lancaster in 2015. It specializes in disrupting the insurance industry with innovative products. Insurellm offers four main software products: Carllm (for auto insurance companies), Homellm (for home insurance companies), Rellm (an enterprise platform for the reinsurance sector), and Marketllm (a marketplace connecting consumers with insurance providers). The company has grown to 200 employees and serves more than 300 clients worldwide.


In [179]:
def chat(message, history):
  response = conversation_chain.invoke({"question": message})
  return response['answer']

In [180]:
view = gr.ChatInterface(chat).launch()


The 'tuples' format for chatbot messages is deprecated and will be removed in a future version of Gradio. Please set type='messages' instead, which uses openai-style 'role' and 'content' keys.



* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.
