# Introduction

![image](./imgs/0_title.png)

# Concept Diagram

![image](./imgs/1_vision.png)

# Knowledge Sources

![image](./imgs/2_knowledge_sources.png)

# Handbook Samples

- https://github.com/basecamp/handbook
- https://github.com/madetech/handbook
- https://yolospace.notion.site/Onboarding-Employee-Handbooks-138a98bd392a45da910d74b900300194

In [None]:
# Start from a clean state: delete any existing ./handbook directory, then
# clone the madetech/handbook repository into it.
!rm -rf handbook
!git clone https://github.com/madetech/handbook.git handbook

# Initial Setup

In [None]:
import boto3
import os

def get_openai_api_key(ssm_client, parameter_path):
    """Get the OpenAI API key from the SSM Parameter Store.

    Args:
        ssm_client: Client object exposing ``get_parameter`` and an
            ``exceptions.ParameterNotFound`` exception class (in this
            notebook, a Boto3 SSM client).
        parameter_path: Name of the parameter that holds the key.

    Returns:
        The ``['Parameter']['Value']`` entry of the ``get_parameter``
        response.

    Raises:
        Exception: If the parameter does not exist. The underlying
            ``ParameterNotFound`` error is attached as ``__cause__`` so the
            original failure stays visible in the traceback.
    """
    # Keep the try body limited to the one call that can raise
    # ParameterNotFound.
    try:
        response = ssm_client.get_parameter(
            Name=parameter_path,
            WithDecryption=True,
        )
    except ssm_client.exceptions.ParameterNotFound as err:
        raise Exception(
            f'Parameter {parameter_path} not found in SSM Parameter Store'
        ) from err
    return response['Parameter']['Value']

# SSM parameter that stores the OpenAI key, and the AWS region to query
# (taken from AWS_REGION when set, otherwise us-east-1).
API_KEY_PARAMETER_PATH = '/openai/api_key'
region_name = os.environ.get('AWS_REGION', 'us-east-1')

# Boto3 SSM client used to read the parameter.
ssm = boto3.client('ssm', region_name=region_name)

# Fetch the key and export it through the OPENAI_API_KEY environment variable.
openai_api_key = get_openai_api_key(ssm, API_KEY_PARAMETER_PATH)
os.environ['OPENAI_API_KEY'] = openai_api_key

# Model name; presumably consumed by later cells (not used in the cells shown here).
llm_model = "gpt-3.5-turbo"


# Load Documents

In [None]:
# Load every Markdown file found under ./handbook (recursive "**/*.md" glob),
# using TextLoader for each matched file.

from langchain.document_loaders import DirectoryLoader, TextLoader

loader = DirectoryLoader(
    "./handbook",
    glob="**/*.md",
    loader_cls=TextLoader,
    show_progress=True,
)
documents = loader.load()

In [None]:
# Show the metadata attached to the first loaded document.
documents[0].metadata

In [None]:
# Preview the first 100 characters of the first document's text.
print(documents[0].page_content[:100])

# Split Documents with Context

In [None]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

def split_documents(documents):
    """Split documents into chunks along their Markdown headers.

    Each document's ``page_content`` is passed through a
    ``MarkdownHeaderTextSplitter`` configured for ``#`` through ``####``
    (labelled "Header 1" .. "Header 4"). Every resulting chunk then has the
    source document's metadata merged into its own metadata; on a key clash
    the source document's value wins, because it is applied last.

    Args:
        documents: Iterable of objects with ``page_content`` and
            ``metadata`` attributes.

    Returns:
        A flat list of all chunks, in document order.
    """
    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
            ("####", "Header 4"),
        ]
    )

    chunks = []
    for source_doc in documents:
        for chunk in splitter.split_text(source_doc.page_content):
            # Carry the originating document's metadata over onto the chunk.
            chunk.metadata.update(source_doc.metadata)
            chunks.append(chunk)
    return chunks

In [None]:
# Split all loaded documents into header-based chunks.
splitted_documents = split_documents(documents)

In [None]:
# Number of chunks produced by the split.
print(len(splitted_documents))

In [None]:
from IPython.display import display, Markdown

# Render the first chunk's text as Markdown, then print its metadata below it.
display(Markdown(splitted_documents[0].page_content))
print("# Metadata:")
print(splitted_documents[0].metadata)

# Create Chroma Vector DB

In [None]:
# Remove any vector store directory left over from a previous run.
!rm -rf ./docs/chroma
from langchain.embeddings.openai import OpenAIEmbeddings
# Embedding object handed to Chroma below.
# NOTE(review): presumably authenticates via the OPENAI_API_KEY environment
# variable set during initial setup — confirm.
embedding = OpenAIEmbeddings()

from langchain.vectorstores import Chroma
# Directory passed to Chroma as persist_directory (same location removed above).
persist_directory = 'docs/chroma/'

# Build the Chroma store from the split documents using the embedding above.
vectordb = Chroma.from_documents(
    documents=splitted_documents,
    embedding=embedding,
    persist_directory=persist_directory
)

# Search Similar Documents

In [None]:
def pretty_print_docs(docs):
    """Print each document's text under a numbered "Document N:" heading.

    Consecutive documents are separated by a line of 100 dashes. Numbering
    starts at 1. Each item in ``docs`` must expose a string ``page_content``.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + doc.page_content)
    print(divider.join(sections))

In [None]:
# NOTE(review): the question wording is ungrammatical ("What are ... are
# applied"); it is left as-is here because it is the runtime query text.
question = "What are security measures are applied in the company?"

# Run a similarity search against the vector store with k=5 and print the
# returned documents.
docs = vectordb.similarity_search(question,k=5)
pretty_print_docs(docs)