## Set up the Docker container for the DB and install the library

In [None]:
!pip install -qU langchain_postgres
# Start the Postgres pgvector Docker container before running the cells below: docker compose up


## Set up the Postgres pgvector vector store and the function for importing documents

In [None]:
import os
from app.db.VectorStore import VectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from tqdm import tqdm
from bs4 import BeautifulSoup
from app.db.EnumDocsCollection import EnumDocsCollection

def purify_HTML(html: str) -> str:
    """Reduce an HTML document to its visible text.

    ``<script>`` and ``<style>`` elements are removed before the text is
    extracted. Each line of the extracted text is trimmed and split on
    double spaces; the non-empty fragments are returned one per line.
    """
    document = BeautifulSoup(html, features="html.parser")

    # Script and style bodies are not readable content, so detach them first.
    for tag in document.find_all(["script", "style"]):
        tag.extract()

    fragments = []
    for raw_line in document.get_text().splitlines():
        # Double spaces separate phrases that ended up on one line
        # (e.g. adjacent headline items); put each on its own line.
        for piece in raw_line.strip().split("  "):
            piece = piece.strip()
            if piece:
                fragments.append(piece)

    return '\n'.join(fragments)


def import_source_documents(folder: str) -> None:
    """Chunk every text file under ``../data/<folder>`` and store it in the vector DB.

    The directory tree is walked recursively. Image files (``.png``, ``.jpg``,
    ``.jpeg``) are skipped, ``.html`` files are reduced to plain text with
    ``purify_HTML``, and each file is split into chunks that are stored as
    ``Document`` objects carrying ``source`` (file name) and ``chunk_idx``
    metadata.

    The import is best-effort: a file that cannot be read, cleaned, or stored
    is reported with ``print`` and skipped, and the remaining files are still
    processed.

    Args:
        folder: Sub-directory of ``../data/`` to import; the same value is
            passed to ``VectorStore.get_vector_store`` to select the store.
    """
    skipped_suffixes = (".png", ".jpg", ".jpeg")

    # Split documents and store in vector db
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=10000,
        chunk_overlap=200
    )

    vectordb = VectorStore.get_vector_store(folder)
    source_dir = os.path.join("../data/", folder)
    for root, _d_names, f_names in os.walk(source_dir):
        progress_bar = tqdm(f_names)
        for file in progress_bar:
            progress_bar.set_description(file)

            # Decide on the name alone so images are never opened as text.
            if file.endswith(skipped_suffixes):
                continue

            try:
                # Close the file as soon as it is read; splitting and the DB
                # insert below can take a while.
                with open(os.path.join(root, file), encoding="utf-8") as f:
                    text = f.read()
                if file.endswith(".html"):
                    text = purify_HTML(text)
            except Exception as e:
                print(e)
                # Skip this file: without a fresh `text` the code below would
                # either fail or re-import the previous file's content under
                # this file's name.
                continue

            splits = [
                Document(
                    page_content=chunk,
                    metadata={"source": file, "chunk_idx": idx}
                )
                for idx, chunk in enumerate(text_splitter.split_text(text))
            ]
            if not splits:
                # Nothing to store for an empty file.
                continue

            try:
                vectordb.add_documents(splits)
            except Exception as e:
                print(e)

## Load Base Information

In [None]:
# Open the vector store for the base-information collection.
vectordb = VectorStore.get_vector_store(EnumDocsCollection.BASE_INFO.value)
# NOTE(review): presumably removes the existing vector tables so the imports
# below start from a clean state — confirm against VectorStore.drop_tables
# before running; this looks destructive.
vectordb.drop_tables()

In [None]:
# Walk ../data/<BASE_INFO value> and add each file's chunks to that collection's vector store.
import_source_documents(EnumDocsCollection.BASE_INFO.value)

## Load Companyhouse Project Code

In [None]:
# Walk ../data/<COMPANYHOUSE_PROJ_CODE value> and add each file's chunks to that collection's vector store.
import_source_documents(EnumDocsCollection.COMPANYHOUSE_PROJ_CODE.value)

## Load Framework Documentation

In [None]:
# Walk ../data/<FRAMEWORK_DOCS value> and add each file's chunks to that collection's vector store.
import_source_documents(EnumDocsCollection.FRAMEWORK_DOCS.value)

## Load Companyhouse Documentation

In [None]:
# Walk ../data/<COMPANYHOUSE_PROJ_DOCS value> and add each file's chunks to that collection's vector store.
import_source_documents(EnumDocsCollection.COMPANYHOUSE_PROJ_DOCS.value)