Export Knowledge Worker

A question answering agent that is specialized in answering questions based on the knowledge base(text, files etc.) provided to it.

In [131]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr

In [132]:
# import from langchain
from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema import Document
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go

In [133]:
MODEL = 'gpt-4o-mini'
db_name = 'vector_db'

In [153]:
load_dotenv(override=True)

# Load environment variables from .env file
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [135]:
# read in documents using Langchain loaders

folders = glob.glob('knowledge-base/*')

documents = []

for folder in folders:
    doc_type = folder.split('/')[-1]
    loader = DirectoryLoader(folder, glob='**/*.md', loader_cls=TextLoader)
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata['doc_type'] = doc_type
        documents.append(doc)
        
len(documents)
documents[0]

Document(metadata={'source': 'knowledge-base/products/Rellm.md', 'doc_type': 'products'}, page_content="# Product Summary\n\n# Rellm: AI-Powered Enterprise Reinsurance Solution\n\n## Summary\n\nRellm is an innovative enterprise reinsurance product developed by Insurellm, designed to transform the way reinsurance companies operate. Harnessing the power of artificial intelligence, Rellm offers an advanced platform that redefines risk management, enhances decision-making processes, and optimizes operational efficiencies within the reinsurance industry. With seamless integrations and robust analytics, Rellm enables insurers to proactively manage their portfolios and respond to market dynamics with agility.\n\n## Features\n\n### AI-Driven Analytics\nRellm utilizes cutting-edge AI algorithms to provide predictive insights into risk exposures, enabling users to forecast trends and make informed decisions. Its real-time data analysis empowers reinsurance professionals with actionable intellige

In [136]:
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200)

chunks = text_splitter.split_documents(documents)

len(chunks)
chunks[0]

Created a chunk of size 1088, which is longer than the specified 1000


Document(metadata={'source': 'knowledge-base/products/Rellm.md', 'doc_type': 'products'}, page_content='# Product Summary\n\n# Rellm: AI-Powered Enterprise Reinsurance Solution\n\n## Summary\n\nRellm is an innovative enterprise reinsurance product developed by Insurellm, designed to transform the way reinsurance companies operate. Harnessing the power of artificial intelligence, Rellm offers an advanced platform that redefines risk management, enhances decision-making processes, and optimizes operational efficiencies within the reinsurance industry. With seamless integrations and robust analytics, Rellm enables insurers to proactively manage their portfolios and respond to market dynamics with agility.\n\n## Features\n\n### AI-Driven Analytics\nRellm utilizes cutting-edge AI algorithms to provide predictive insights into risk exposures, enabling users to forecast trends and make informed decisions. Its real-time data analysis empowers reinsurance professionals with actionable intellige

In [137]:
doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)
print(f"Document types: {doc_types}")

Document types: {'company', 'products', 'employees', 'contracts'}


In [138]:
embeddings = OpenAIEmbeddings()

In [139]:
# check if Chroma vector store already exists, if so delete it and start from scratch.

if os.path.exists(db_name):
    print(f"Delete existing vector store from {db_name}")
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

Delete existing vector store from vector_db


In [140]:
# Create a new Chroma vector store

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name
)

print(f"Persisting vector store to {db_name}")
print(f"Number of documents in vector store: {vectorstore._collection.count()}")

Persisting vector store to vector_db
Number of documents in vector store: 123


In [141]:
# Find one vector and see how many dimentions it has ...

collection = vectorstore._collection
sample_embedding = collection.get(limit=1, include=['embeddings'])['embeddings'][0]
dimentions = len(sample_embedding)
print(f"Sample embedding has {dimentions} dimensions")

Sample embedding has 1536 dimensions


In [145]:
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
doc_types = (metadata['doc_type'] for metadata in result['metadatas'])
colors = [['blue', 'green', 'red', 'orange'][['products', 'employees', 'contracts', 'company'].index(t)] for t in doc_types]
colors

['blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'blue',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'red',
 'orange',
 'orange',
 'orange',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 'green',
 '

In [148]:
# 2D t-SNE visualization of vector store
# Reduce the dimensionality of the vectors to 2D using t-SNE
# Note: t-SNE is computationally expensive and may take a while for large datasets

tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create 2D scatter plot
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=10, color=colors, opacity=0.8),
    text=[
        f"Document: {doc[:50]}...<br>Type: {metadata['doc_type']}"
        for doc, metadata in zip(documents, result['metadatas'])
    ],
    hoverinfo='text',
)])

fig.update_layout(
    title='2D t-SNE Visualization of Vector Store',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(l=10, r=20, t=40, b=10),
)

fig.show()

In [152]:
# 2D t-SNE visualization of vector store
# Reduce the dimensionality of the vectors to 2D using t-SNE
# Note: t-SNE is computationally expensive and may take a while for large datasets

tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create 2D scatter plot
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=10, color=colors, opacity=0.8),
    text=[
        f"Document: {doc[:50]}...<br>Type: {metadata['doc_type']}"
        for doc, metadata in zip(documents, result['metadatas'])
    ],
    hoverinfo='text',
)])

fig.update_layout(
    title='3D t-SNE Visualization of Vector Store',
    scene=dict(
        xaxis_title='x',
        yaxis_title='y',
        zaxis_title='z',
    ),
    width=800,
    height=600,
    margin=dict(l=10, r=20, t=40, b=10),
)

fig.show()