In [3]:
from pathlib import Path

from chatbot.chatbot import EmbeddingModel
from common.utils import get_root_dir

# Corpus loading: read every *.txt file found recursively under <root>/data.
# Each file's content is stored under its stem (file name without extension),
# so two files sharing a stem overwrite each other and the last one read wins.
data = {}
root_dir = get_root_dir()
em = EmbeddingModel()  # module-level model instance that get_embeddings calls

for txt_path in Path(root_dir / "data").rglob("*.txt"):
    with txt_path.open("r", encoding="utf-8") as handle:
        data[txt_path.stem] = handle.read()


def get_embeddings(text: str, chunk_size: int = 1000) -> dict[str, list[float]]:
    """Split ``text`` into chunks and embed each chunk with the module-level ``em``.

    Args:
        text: The text to embed.
        chunk_size: Forwarded as the second argument of
            ``em.split_text_to_chunks``. Defaults to 1000, the value that was
            previously hard-coded. Presumably the maximum chunk size; confirm
            the unit (characters vs. tokens) against ``EmbeddingModel``.

    Returns:
        A dict mapping each chunk to the result of ``em.get_embedding`` for
        that chunk, in chunk order. Chunks with identical text collapse into a
        single entry (the embedding computed last is kept).
    """
    chunks = em.split_text_to_chunks(text, chunk_size)
    return {chunk: em.get_embedding(chunk) for chunk in chunks}


# Chunk embeddings per document, shaped {document title: {chunk text: embedding}}.
# The inner dicts are exactly what get_embeddings returns (dict[str, list[float]]),
# so the annotations below use that type.
first_file: dict[str, dict[str, list[float]]] = {
    "The French Revolution A History": get_embeddings(data["The French Revolution A History"])
}
second_file: dict[str, dict[str, list[float]]] = {
    "The world's leading conquerors": get_embeddings(data["The world's leading conquerors"])
}

In [4]:
import json
import os

import psycopg2

# Connection settings default to the local development database but can be
# overridden through the standard libpq environment variables (PGDATABASE,
# PGUSER, PGPASSWORD, PGHOST, PGPORT), so real credentials never have to be
# written into the notebook.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "postgres"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "postgres"),
    host=os.environ.get("PGHOST", "localhost"),
    port=os.environ.get("PGPORT", "5432"),
)
cur = conn.cursor()

# Insert the embeddings into the database: one row per chunk, whose document_id
# column is "<document title>_<chunk index>" (the index restarts at 0 for each
# document) and whose additional_information column is a JSON object holding
# the plain document title.
insert_sql = (
    "INSERT INTO knowledge_base (document_id, embedding, additional_information, text) "
    "VALUES (%s, %s, %s, %s)"
)

try:
    # second_file is inserted before first_file.
    for file_embeddings in (second_file, first_file):
        for document_id, embeddings in file_embeddings.items():
            for id_, (text, embedding) in enumerate(embeddings.items()):
                cur.execute(
                    insert_sql,
                    (f"{document_id}_{id_}", embedding, json.dumps({"document_id": document_id}), text),
                )
    # All rows are committed together, so a failure part-way stores nothing.
    conn.commit()
except Exception:
    # Roll back so the connection is not left inside an aborted transaction,
    # which would make every later statement on it fail until a rollback.
    conn.rollback()
    raise
finally:
    # The cursor is closed on success and on failure. The connection itself is
    # left open; call conn.close() once no later cell needs it.
    cur.close()