## Setup Docker container for DB and install lib

In [33]:
!pip install -qU langchain_postgres

# run postgres pg_vector docker with command: docker compose up


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## Setup Postgres pg_vector Vector Store and function for importing documents

In [34]:
from lib import utils
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_postgres.vectorstores import PGVector
from tqdm import tqdm
from bs4 import BeautifulSoup

def get_vector_store(collection_name: str)->PGVector:
    connection = "postgresql+psycopg://project_solver:project_solver@localhost:6024/project_solver"
    embedding_model = utils.configure_embedding_model()

    vector_db = PGVector(
        embeddings=embedding_model,
        collection_name=collection_name,
        connection=connection,
        use_jsonb=True,
    )
    return vector_db

def purify_HTML(html: str)->str:
    soup = BeautifulSoup(html, features="html.parser")

    for script in soup(["script", "style"]):
        script.extract()    # rip it out

    text = soup.get_text()

    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return text


def import_source_documents(folder: str):
    # Split documents and store in vector db
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=10000,
        chunk_overlap=200
    )

    vectordb = get_vector_store(folder)
    dir = os.path.join("../data/", folder)
    for root, d_names, f_names in os.walk(dir):
        progress_bar = tqdm(f_names)
        for file in progress_bar:
            progress_bar.set_description(file)
            if not os.path.isdir(file):
                with open(os.path.join(root, file)) as f:
                    splits = []
                    text = f.read()
                    if file.endswith(".html"):
                        try:
                            text = purify_HTML(text)
                        except Exception as e:
                            print(e)

                    for idx, chunk in enumerate(text_splitter.split_text(text)):
                        splits.append(Document(
                            page_content=chunk,
                            metadata={"source": file, "chunk_idx": idx}
                        ))
                    try:
                        vectordb.add_documents(splits)
                    except Exception as e:
                        print(e)

## Load Base Information

In [35]:
vectordb = get_vector_store("baseInformation")
vectordb.drop_tables()

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
import_source_documents("baseInformation")

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

projectDirectoryStructure.txt:   0%|          | 0/2 [00:00<?, ?it/s]

## Load Companyhouse Project Code

In [29]:
import_source_documents('companyhouseProjectCode')

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

init: 100%|██████████| 4/4 [00:01<00:00,  3.33it/s]         
gulpfile.js: 100%|██████████| 4/4 [00:00<00:00,  8.27it/s]    
contact.js:   8%|▊         | 3/40 [00:01<00:16,  2.21it/s]               

(psycopg.errors.NotNullViolation) null value in column "id" of relation "langchain_pg_embedding" violates not-null constraint
DETAIL:  Failing row contains (null, null, null, null, null).
[SQL: INSERT INTO langchain_pg_embedding DEFAULT VALUES ON CONFLICT (id) DO UPDATE SET embedding = excluded.embedding, document = excluded.document, cmetadata = excluded.cmetadata]
(Background on this error at: https://sqlalche.me/e/20/gkpj)


date-polyfill.js: 100%|██████████| 40/40 [00:08<00:00,  4.89it/s]             
gallery.js: 100%|██████████| 1/1 [00:00<00:00,  6.88it/s]
network.js: 100%|██████████| 2/2 [00:00<00:00,  3.59it/s]     
CenterGraph.js: 100%|██████████| 14/14 [00:02<00:00,  5.29it/s]                 
sigma.plugins.tooltip.js: 100%|██████████| 1/1 [00:00<00:00,  1.52it/s]
media-site.js: 100%|██████████| 1/1 [00:00<00:00,  4.42it/s]
cmp.js: 100%|██████████| 1/1 [00:00<00:00,  2.15it/s]
set_secondary_value.js: 100%|██████████| 3/3 [00:00<00:00, 12.70it/s]
stickyMenu.js: 100%|██████████| 1/1 [00:00<00:00, 10.01it/s]
premium-highlights.js: 100%|██████████| 1/1 [00:00<00:00, 12.19it/s]
layout.js: 100%|██████████| 5/5 [00:03<00:00,  1.46it/s]              
companyStructure.js: 100%|██████████| 1/1 [00:00<00:00,  4.52it/s]
newAddress.js: 100%|██████████| 25/25 [00:03<00:00,  6.96it/s]                   
tasks-js.js: 100%|██████████| 6/6 [00:01<00:00,  4.11it/s]     
columns.js: 100%|██████████| 3/3 [00:00<00:00,  

(psycopg.errors.NotNullViolation) null value in column "id" of relation "langchain_pg_embedding" violates not-null constraint
DETAIL:  Failing row contains (null, null, null, null, null).
[SQL: INSERT INTO langchain_pg_embedding DEFAULT VALUES ON CONFLICT (id) DO UPDATE SET embedding = excluded.embedding, document = excluded.document, cmetadata = excluded.cmetadata]
(Background on this error at: https://sqlalche.me/e/20/gkpj)


_traffic-light-report.php: 100%|██████████| 4/4 [00:00<00:00,  8.13it/s]
index.php: 100%|██████████| 3/3 [00:00<00:00,  8.40it/s] 
overview.php: 100%|██████████| 1/1 [00:00<00:00,  3.80it/s]
_annual-reports-rows.php: 100%|██████████| 2/2 [00:00<00:00,  7.06it/s]
company-financial-data-overview-row.php: 100%|██████████| 7/7 [00:02<00:00,  2.50it/s]
balance-placeholder.png:  50%|█████     | 1/2 [00:00<00:00,  5.61it/s]


UnicodeDecodeError: 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte

## Load Framework Documentation

In [None]:
import_source_documents("frameworkDocumentation")

## Load Companyhouse Documentation

In [None]:
import_source_documents("companyhouseDocumentation")