In [4]:
# split_md(file_path)

import hashlib
import json

from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_md(file_path, chunk_size=350, chunk_overlap=80):
    """Split a Markdown file into header-aware chunks ready for indexing.

    The document is first split on ``#`` and ``##`` headers, then every
    section is split again with ``RecursiveCharacterTextSplitter`` using
    ``chunk_size`` / ``chunk_overlap``.

    Args:
        file_path: Path of the UTF-8 encoded Markdown file to read.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        A list with one dict per chunk, in document order:

        * ``"hash"``     - deterministic 64-bit signed int derived from the
          SHA-256 of ``"content"``.
        * ``"content"``  - the chunk text prefixed with its rebuilt headers.
        * ``"metadata"`` - the chunk metadata serialized as a JSON string.
        * ``"document"`` - the chunk object returned by the splitter.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        markdown_document = file.read()

    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2")
    ]

    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    md_header_splits = markdown_splitter.split_text(markdown_document)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    splits = text_splitter.split_documents(md_header_splits)

    results = []
    for split in splits:
        # Rebuild the header lines from the metadata: the trailing digit of
        # the key ("Header 1", "Header 2") gives the number of '#' to emit.
        headers_ = '\n'.join(
            f'{"#" * int(k[-1])} {v}'
            for k, v in split.metadata.items()
            if 'Header' in k
        )
        content_ = f"{headers_}\n\n{split.page_content}"

        # The built-in hash() of a str is salted per interpreter process, so
        # it cannot be compared with values persisted by an earlier run.
        # Derive a stable 64-bit signed int from SHA-256 instead.
        digest = hashlib.sha256(content_.encode('utf-8')).digest()
        stable_hash = int.from_bytes(digest[:8], byteorder='big', signed=True)

        results.append({
            "hash": stable_hash,
            "content": content_,
            "metadata": json.dumps(split.metadata),
            "document": split,
        })

    return results


In [8]:
def index_markdown(reference, vector_size = 1500):
    """Incrementally index one Markdown file into the ``mkd01`` table.

    Steps:

    1. Split ``reference`` into chunks with ``split_md``.
    2. Delete the rows stored for ``reference`` whose hash is not among the
       current chunk hashes.
    3. Embed and insert every chunk whose hash is not stored yet.

    Args:
        reference: Path of the Markdown file; also used as the value of the
            ``reference`` column.
        vector_size: Forwarded unchanged to ``generate_embedding``.

    Returns:
        The list of chunk dicts (as produced by ``split_md``) that were
        embedded and sent to the database during this call.
    """
    # Tokenizes the input document
    splits = split_md(reference)

    # Hashes are compared as text everywhere below (the DELETE casts the
    # array to text[]), so normalize them to str once.
    splits_hashes = [str(item['hash']) for item in splits]

    # Remove lines that have changed
    execute_query("""
    DELETE FROM mkd01
    WHERE reference = %s
    AND hash NOT IN (SELECT unnest(%s::text[]))
    """, (reference, splits_hashes))

    # Hashes already present for this reference; a set keeps the membership
    # test below cheap.
    existing_hashes = {str(row[0]) for row in execute_query(
        "SELECT hash FROM mkd01 WHERE reference = %s",
        (reference,)
    )}

    # Keep each chunk's position in the *whole* document so the stored index
    # stays meaningful when only part of the file is re-indexed.
    # NOTE(review): rows that already exist keep the index they were inserted
    # with, even if earlier chunks were added or removed since.
    pending = [
        (position, item)
        for position, item in enumerate(splits)
        if str(item['hash']) not in existing_hashes
    ]

    # Adds & embeds lines that have changed
    query = """
    INSERT INTO mkd01
           (hash, reference, index, content, metadata, embedding)
    VALUES (%s,   %s,        %s,    %s,      %s,       %s)
    ON CONFLICT DO NOTHING;
    """

    for position, split in pending:
        print(f"embed: {reference}")
        embeddings = generate_embedding(split["content"], vector_size)
        execute_query(query, (
            split["hash"],
            reference,
            position,
            split["content"],
            split["metadata"],
            embeddings
        ))

    return [split for _, split in pending]

In [6]:
import os
import fnmatch

def find_sources(base_path, filter_as='*.md'):
    """Collect the files under ``base_path`` whose name matches ``filter_as``.

    Directories and files whose name starts with a dot are ignored; hidden
    directories are not descended into.

    Args:
        base_path: Root directory to walk.
        filter_as: ``fnmatch`` pattern applied to each file name.

    Returns:
        A list of paths, each built by joining ``base_path`` with the file's
        path relative to ``base_path``, in ``os.walk`` order.
    """
    collected = []
    for current_dir, subdirs, filenames in os.walk(base_path):
        # Prune in place so os.walk never enters hidden directories.
        subdirs[:] = [name for name in subdirs if not name.startswith('.')]

        # Pattern match first, then drop dot-files from the matches.
        visible_matches = (
            name
            for name in fnmatch.filter(filenames, filter_as)
            if not name.startswith('.')
        )

        for name in visible_matches:
            # Re-anchor the path on base_path via its relative form.
            absolute_like = os.path.join(current_dir, name)
            relative = os.path.relpath(absolute_like, start=base_path)
            collected.append(os.path.join(base_path, relative))
    return collected

#find_sources('./docs-onefront-small/')