In [None]:
# OPEN AI
import os
import openai

from dotenv import load_dotenv


load_dotenv()

# OpenAI setup: read the API key from the OPENAI_API_KEY environment variable.
# os.getenv returns None when the variable is unset, so a missing key does not
# raise on this line.
openai.api_key = os.getenv("OPENAI_API_KEY")
# NOTE(review): `models` is not referenced in the visible cells; presumably
# kept as a quick check that the key works. Confirm before removing.
models = openai.Model.list()


def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding of ``text`` produced by ``openai.Embedding.create``.

    Newline characters in ``text`` are replaced with single spaces before the
    request is made.

    Args:
        text: The string to embed.
        model: Name of the embedding model passed to the API.

    Returns:
        The "embedding" value of the first item in the response's "data" list.
    """
    single_line = " ".join(text.split("\n"))
    response = openai.Embedding.create(input=[single_line], model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

In [1]:
from curses import meta
import chromadb
import os
import datetime
from chromadb.config import Settings
import docx


# Directory passed to Chroma as `persist_directory` in get_client().
LOCAL_CHROMADB_DIR = "local_chroma_db"


def get_client():
    """Build a Chroma client configured for the local on-disk database.

    Returns:
        The object returned by ``chromadb.Client`` for settings that select
        the "duckdb+parquet" implementation and ``LOCAL_CHROMADB_DIR`` as the
        persist directory.
    """
    settings = Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=LOCAL_CHROMADB_DIR,
    )
    return chromadb.Client(settings)


def get_collection(client):
    """Fetch the "journal_entries" collection from ``client``.

    Args:
        client: Object exposing ``get_or_create_collection(name)``.

    Returns:
        Whatever ``client.get_or_create_collection("journal_entries")`` returns.
    """
    return client.get_or_create_collection("journal_entries")


def remove_extension(file_name):
    """Strip the final extension from ``file_name``.

    Args:
        file_name: A file name or path.

    Returns:
        The first element of ``os.path.splitext(file_name)``, i.e. everything
        before the last extension.
    """
    stem, _extension = os.path.splitext(file_name)
    return stem


def read_docx(file_path):
    """Return the text of a Word document, one paragraph per line.

    Args:
        file_path: Path handed to ``docx.Document``.

    Returns:
        The ``text`` of every entry in ``document.paragraphs``, joined with
        newline characters. An empty paragraph list yields an empty string.
    """
    document = docx.Document(file_path)
    # Join with a real newline: the separator used to be the literal letter
    # "n" (missing backslash), which fused paragraphs into "firstnsecond".
    return "\n".join(paragraph.text for paragraph in document.paragraphs)


def get_documents_to_add(directory):
    """Collect ids, documents and metadata for the journal files under ``directory``.

    The tree is walked recursively. Hidden files (names starting with ".")
    and files whose name does not end in ".docx" (case-insensitive) are
    skipped, since every remaining file is parsed with ``read_docx``.
    Directories and files are visited in sorted order so the sequential ids
    are assigned the same way on every run.

    Args:
        directory: Root directory to search.

    Returns:
        A tuple ``(ids, documents, metadatas)`` of three equally long lists:
        ``ids`` holds "Id1", "Id2", ...; ``documents`` holds
        "<title> <text>" strings where the title is the file name without its
        extension; ``metadatas`` holds dicts with a "last_modified_date"
        string built from the file's modification time.
    """
    documents = []
    metadatas = []
    ids = []
    for root, dirs, files in os.walk(directory):
        # Sorting in place steers the order in which os.walk descends.
        dirs.sort()
        for file in sorted(files):
            if file.startswith(".") or not file.lower().endswith(".docx"):
                continue

            file_path = os.path.join(root, file)

            # gather info from journals
            last_modified_date = str(
                datetime.datetime.fromtimestamp(os.path.getmtime(file_path))
            )
            title = remove_extension(file)
            text = read_docx(file_path)
            documents.append(f"{title} {text}")
            metadatas.append({"last_modified_date": last_modified_date})
            ids.append(f"Id{len(ids) + 1}")

    return ids, documents, metadatas


def add_documents_to_collection(collection, ids, documents, metadatas):
    """Add the given entries to ``collection`` in a single ``add`` call.

    Args:
        collection: Object exposing ``add(ids=..., documents=..., metadatas=...)``.
        ids: Identifiers forwarded as the ``ids`` keyword.
        documents: Texts forwarded as the ``documents`` keyword.
        metadatas: Metadata forwarded as the ``metadatas`` keyword.

    Returns:
        None; the result of ``collection.add`` is discarded.
    """
    payload = {"ids": ids, "documents": documents, "metadatas": metadatas}
    collection.add(**payload)


def first_time_batch_load(directory="private-journals", target_collection=None):
    """Load every journal found under ``directory`` into a collection.

    Args:
        directory: Root directory handed to ``get_documents_to_add``.
            Defaults to "private-journals", the previously hard-coded value.
        target_collection: Collection that receives the entries. When None,
            the module-level ``collection`` is used, as before.
    """
    if target_collection is None:
        # Looked up at call time, so the global only has to exist by then.
        target_collection = collection
    ids, documents, metadatas = get_documents_to_add(directory)
    add_documents_to_collection(target_collection, ids, documents, metadatas)


# Module-level handles; first_time_batch_load() reads `collection` as a global
# when it is called.
client = get_client()
collection = get_collection(client)
# One-off bulk import, left disabled. Presumably commented out so re-running
# this cell does not add the journals again; confirm before enabling.
# first_time_batch_load()

In [4]:
from chromadb.utils import embedding_functions

# Embedding function obtained from chromadb.utils.embedding_functions.
basic_ef = embedding_functions.DefaultEmbeddingFunction()

# Embed one query string; the function is called with a list of texts.
query_embeddings = basic_ef(["happiness"])

# Disabled examples: query the collection with the embedding above, or peek
# at its contents.
# collection.query(
#     query_embeddings=query_embeddings,
#     n_results=1,
# )
# collection.peek()