In [None]:
!{sys.executable} -m pip install python-dotenv gradio langchain langchain-community langchain-openai langchain-chroma chromadb scikit-learn numpy plotly

In [1]:
### Basic imports

import os 
from dotenv import load_dotenv
import glob 
import gradio as gr

from typing import List

In [2]:
### Langchain, Chroma, plotly 
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI 
from langchain_chroma import Chroma 
import numpy as np 
from sklearn.manifold import TSNE 
import plotly.graph_objects as go

In [3]:
OPENAI_MODEL_ID  = "gpt-5-mini"
db_name = "vector_db"

In [4]:
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [5]:
### Append doc_type on each document 
### with folder name as the value on each

folders: List[str] = glob.glob("knowledge-base/*")

documents = []

for folder in folders:
    folder_name: str = os.path.basename(folder)
    loader = DirectoryLoader(
        path=folder,
        glob="**/*.md",
        loader_cls=TextLoader,
    )
    docs = loader.load()
    
    for doc in docs:
        doc.metadata["doc_type"] = folder_name
        documents.append(doc)    


In [6]:
len(documents)

17

In [7]:
documents[2]

Document(metadata={'source': 'knowledge-base/products/underwriteai.md', 'doc_type': 'products'}, page_content='# UnderwriteAI\n\nUnderwriteAI is an intelligent underwriting assistant designed for insurers and reinsurers.\n\n## Core Capabilities\n- Automated risk scoring based on structured + unstructured data\n- AI-powered document review (applications, medical records, financial statements)\n- Adaptive pricing models with continuous learning\n- Compliance-friendly audit trail\n\n## Benefits\n- Faster onboarding of new policies\n- Consistent risk assessments across underwriters\n- Improved profitability through accurate pricing\n\n**Launch Year:** 2023  \n**Target Market:** Life insurers, Reinsurers  \n')

In [8]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)
len(chunks)

20

In [9]:
chunks[2]

Document(metadata={'source': 'knowledge-base/products/underwriteai.md', 'doc_type': 'products'}, page_content='# UnderwriteAI\n\nUnderwriteAI is an intelligent underwriting assistant designed for insurers and reinsurers.\n\n## Core Capabilities\n- Automated risk scoring based on structured + unstructured data\n- AI-powered document review (applications, medical records, financial statements)\n- Adaptive pricing models with continuous learning\n- Compliance-friendly audit trail\n\n## Benefits\n- Faster onboarding of new policies\n- Consistent risk assessments across underwriters\n- Improved profitability through accurate pricing\n\n**Launch Year:** 2023  \n**Target Market:** Life insurers, Reinsurers')

In [10]:
### Check the number of doc_types 
doc_type_kinds = set(chunk.metadata["doc_type"] for chunk in chunks)
print(f"Document types found:", ", ".join(doc_type_kinds))

Document types found: contracts, employees, products, company


In [11]:
### Check if a text mention match fetches the correcpond documents

for i, chunk in enumerate(chunks): 
    if "UnderwriteAI" in chunk.page_content:
        print(f"Chunk {i}: {chunk}")
    else:
        print(f"Chunk {i}: No chunks found.")

Chunk 0: No chunks found.
Chunk 1: No chunks found.
Chunk 2: page_content='# UnderwriteAI

UnderwriteAI is an intelligent underwriting assistant designed for insurers and reinsurers.

## Core Capabilities
- Automated risk scoring based on structured + unstructured data
- AI-powered document review (applications, medical records, financial statements)
- Adaptive pricing models with continuous learning
- Compliance-friendly audit trail

## Benefits
- Faster onboarding of new policies
- Consistent risk assessments across underwriters
- Improved profitability through accurate pricing

**Launch Year:** 2023  
**Target Market:** Life insurers, Reinsurers' metadata={'source': 'knowledge-base/products/underwriteai.md', 'doc_type': 'products'}
Chunk 3: No chunks found.
Chunk 4: No chunks found.
Chunk 5: No chunks found.
Chunk 6: No chunks found.
Chunk 7: No chunks found.
Chunk 8: No chunks found.
Chunk 9: No chunks found.
Chunk 10: No chunks found.
Chunk 11: No chunks found.
Chunk 12: No chunks

In [12]:
### Embed (Vectorize) the text chunks
embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

In [22]:
### Check if a Chroma Datastore already exists
### If so, delete the DB collection to start from scratch 

if os.path.exists(db_name):
    Chroma(
        persist_directory=db_name, 
        embedding_function=embeddings,
    ).delete_collection()

In [23]:
### Create Chroma Database Store 

vectorstore = Chroma.from_documents(
    documents=chunks,
    persist_directory=db_name,
    embedding=embeddings
)

### Check the number of embedded data
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 20 documents


In [24]:
### Get one vector and find how many demensions it has

collection = vectorstore._collection
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)

print(f"The number of dimensions are: ", dimensions)
print(sample_embedding)

NameError: name 'dimentions' is not defined

## Visualizing the Vector Store


In [None]:
### Pre-work

result = collection.get(include=["embeddings", "documents", "metadatas"])
vectors = np.array(result["embeddings"])
documents = result["documents"]
doc_types = [metadata["doc_type"] for metadata in result["metadatas"]]
color_map = {
    "products": "blue",
    "employees": "green", 
    "contracts": "red", 
    "company": "orange"
} 

colors = [color_map[t] for t in doc_types]
print(documents)

In [16]:
import plotly.io as pio
pio.renderers.default = "notebook_connected"

tsne = TSNE(n_components=2, random_state=42, perplexity=5)
reduced_vector = tsne.fit_transform(vectors)

### Create a 2D scatter plot
fig = go.Figure(data=[go.Scatter(
    x=reduced_vector[:, 0],
    y=reduced_vector[:, 1], 
    mode="markers", 
    marker=dict(size=5, color=colors, opacity=0.8), 
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)], 
    hoverinfo="text"
)])

fig.update_layout(
    title="2D Chroma Vector Store Visualization", 
    xaxis_title="x", 
    yaxis_title="y", 
    width=900, 
    height=700,
    margin=dict(r=20, b=10, l=10, t=40) 
)

fig.show()

NameError: name 'vectors' is not defined

## 3D Visualization of Vectors

In [17]:
tsne = TSNE(n_components=3, random_state=42, perplexity=5)
reduced_vector = tsne.fit_transform(vectors)

fig = go.Figure(data=[go.Scatter3d(
    x= reduced_vector[:, 0], 
    y= reduced_vector[:, 1], 
    z= reduced_vector[:, 2], 
    mode="markers", 
    marker=dict(size=5, color=colors, opacity=0.8), 
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)], 
    hoverinfo="text"
)]) 

fig.update_layout(
    title="3D Chroma Vector Store Visualization", 
    scene=dict(xaxis_title="x", yaxis_title="y", zaxis_title="z"), 
    width=900, 
    height=700, 
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

NameError: name 'vectors' is not defined