## Setup Docker container for DB and install lib

In [4]:
!pip install -qU langchain_postgres
# run postgres pg_vector docker with command: docker compose up


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## Setup Postgres pg_vector Vector Store and function for importing documents

In [5]:
import re
import os
from app.db.VectorStore import VectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from tqdm import tqdm
from bs4 import BeautifulSoup
from app.db.EnumDocsCollection import EnumDocsCollection

def purify_HTML(html: str)->str:
    soup = BeautifulSoup(html, features="html.parser")

    for script in soup(["script", "style"]):
        script.extract()    # rip it out

    text = soup.get_text()

    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return text


def import_source_documents(folder:str, verbose:bool=True, only_php:bool = False):
    # Split documents and store in vector db
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=10000,
        chunk_overlap=200
    )

    vectordb = VectorStore.get_vector_store(folder)
    dir = os.path.join("../data/", folder)
    for root, d_names, f_names in os.walk(dir):
        if verbose:
            progress_bar = tqdm(f_names)
        else:
            progress_bar = f_names
        for file in progress_bar:
            if verbose:
                progress_bar.set_description(file)
            if not os.path.isdir(file):
                with open(os.path.join(root, file), 'rb') as f:
                    splits = []
                    if only_php and not file.endswith(".php"):
                        continue
                    if file.endswith(".png") or file.endswith(".jpg") or file.endswith(".jpeg") or file.endswith(".mp4"):
                        continue
                    try:
                        text = f.read().decode(errors='replace')
                        if file.endswith(".html"):
                            text = purify_HTML(text)
                    except Exception as e:
                        print(e)

                    text = re.sub(r'\{[\w_-]+\}', '', text)

                    for idx, chunk in enumerate(text_splitter.split_text(text)):
                        splits.append(Document(
                            page_content=chunk,
                            metadata={"source": file, "path": os.path.join(root, file), "chunk_idx": idx}
                        ))
                    try:
                        vectordb.add_documents(splits)
                    except Exception as e:
                        print(e)

## Load Base Information

In [6]:
vectordb = VectorStore.get_vector_store(EnumDocsCollection.BASE_INFO.value)
vectordb.drop_tables()

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [7]:
import_source_documents(EnumDocsCollection.BASE_INFO.value)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

companyhouseWiki.txt: 100%|██████████| 2/2 [00:00<00:00,  3.46it/s]         


## Load Companyhouse Project Code

In [8]:
import_source_documents(EnumDocsCollection.COMPANYHOUSE_PROJ_CODE.value, verbose=False, only_php=True)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

(psycopg.errors.NotNullViolation) null value in column "id" of relation "langchain_pg_embedding" violates not-null constraint
DETAIL:  Failing row contains (null, null, null, null, null).
[SQL: INSERT INTO langchain_pg_embedding DEFAULT VALUES ON CONFLICT (id) DO UPDATE SET embedding = excluded.embedding, document = excluded.document, cmetadata = excluded.cmetadata]
(Background on this error at: https://sqlalche.me/e/20/gkpj)
(psycopg.errors.NotNullViolation) null value in column "id" of relation "langchain_pg_embedding" violates not-null constraint
DETAIL:  Failing row contains (null, null, null, null, null).
[SQL: INSERT INTO langchain_pg_embedding DEFAULT VALUES ON CONFLICT (id) DO UPDATE SET embedding = excluded.embedding, document = excluded.document, cmetadata = excluded.cmetadata]
(Background on this error at: https://sqlalche.me/e/20/gkpj)
(psycopg.errors.NotNullViolation) null value in column "id" of relation "langchain_pg_embedding" violates not-null constraint
DETAIL:  Faili

## Load Framework Documentation

In [9]:
import_source_documents(EnumDocsCollection.FRAMEWORK_DOCS.value)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

guide-tutorial-core-validators.html: 100%|██████████| 858/858 [14:16<00:00,  1.00it/s]                         


## Load Companyhouse Documentation

In [10]:
import_source_documents(EnumDocsCollection.COMPANYHOUSE_PROJ_DOCS.value)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Sitemaps_2239660052.html: 100%|██████████| 122/122 [00:22<00:00,  5.41it/s]                                                               
0it [00:00, ?it/s]
wait.gif:   0%|          | 0/3 [00:00<?, ?it/s]       

(psycopg.DataError) PostgreSQL text fields cannot contain NUL (0x00) bytes
[SQL: INSERT INTO langchain_pg_embedding (id, collection_id, embedding, document, cmetadata) VALUES (%(id_m0)s::VARCHAR, %(collection_id_m0)s::UUID, %(embedding_m0)s, %(document_m0)s::VARCHAR, %(cmetadata_m0)s::JSONB) ON CONFLICT (id) DO UPDATE SET embedding = excluded.embedding, document = excluded.document, cmetadata = excluded.cmetadata]
[parameters: {'id_m0': '6caed318-055f-4178-85fd-bbda4f7e1f50', 'collection_id_m0': UUID('ab3f5a17-d0b5-45ec-a6dd-567660ada528'), 'embedding_m0': '[-0.042048949748277664,-0.0626618042588234,-0.006961728911846876,0.011563229374587536,0.07049935311079025,0.05394190922379494,0.03512005880475044,-0. ... (7779 characters truncated) ... -0.015553394332528114,-0.0005527643370442092,-0.010515833273530006,-0.0378003790974617,0.02900931052863598,0.019590334966778755,-0.04052946716547012]', 'document_m0': 'GIF89a\x08\x00\x08\x00�\x00\x00\x003f����\x01\x02\x00\x00\x00!�\x04\x04\x14\x00�\x

grey_arrow_down.png: 100%|██████████| 3/3 [00:00<00:00,  8.09it/s]


(psycopg.DataError) PostgreSQL text fields cannot contain NUL (0x00) bytes
[SQL: INSERT INTO langchain_pg_embedding (id, collection_id, embedding, document, cmetadata) VALUES (%(id_m0)s::VARCHAR, %(collection_id_m0)s::UUID, %(embedding_m0)s, %(document_m0)s::VARCHAR, %(cmetadata_m0)s::JSONB) ON CONFLICT (id) DO UPDATE SET embedding = excluded.embedding, document = excluded.document, cmetadata = excluded.cmetadata]
[parameters: {'id_m0': '4119ae4a-6f9f-459f-9c4c-d0d7804cd244', 'collection_id_m0': UUID('ab3f5a17-d0b5-45ec-a6dd-567660ada528'), 'embedding_m0': '[-0.027231479063630104,-0.009448573924601078,-0.037720050662755966,-0.01267452072352171,0.06857317686080933,0.04242289438843727,-0.01593729481101036, ... (7826 characters truncated) ... -0.00025657261721789837,-0.04906648024916649,-0.05005236715078354,-0.0427401103079319,-0.060333117842674255,0.02214096300303936,0.007210333365947008]', 'document_m0': 'GIF89a\x10\x00\x10\x00�\x00\x00��������ݻ�����������wwwfffUUUDDD333"""\x11\x11\x11\

smile.png: 100%|██████████| 3/3 [00:00<00:00, 648.57it/s]
1fa99.png: 100%|██████████| 18/18 [00:00<00:00, 1574.93it/s]    
home_page_16.png: 100%|██████████| 1/1 [00:00<00:00, 1664.41it/s]
