In [None]:
from dataclasses import dataclass

@dataclass()
class Document:
    """Plain record holding three string fields describing a document."""

    # Title of the document.
    title: str
    # Author of the document.
    author: str
    # NOTE(review): possibly intended to be `date` (compare `Text.date` in the
    # later cell) -- confirm before relying on this field's meaning.
    data: str


In [44]:
import glob
import os
import re
from dataclasses import dataclass

@dataclass()
class Text:
    """One loaded text: filename-derived metadata plus the full file body."""

    # Title, author and date are the three '_'-separated parts of the file name
    # (see `glob_and_parse_texts`); all are kept as strings.
    title: str
    author: str
    date: str
    # Entire content of the file as a single string.
    content: str

@dataclass()
class SearchResult:
    """A single search hit together with a preview snippet around it."""

    # Metadata copied from the text in which the hit was found.
    title: str
    author: str
    date: str
    # Preview snippet surrounding the hit (not the whole text).
    content: str
    # (start, end) offsets of the hit measured inside `content`.
    span: tuple[int, int]
    # As assigned by `search_texts`: offset in the full text at which the
    # preview snippet begins.
    location: int


def parse_filename(filename):
    """Split a path of the form ``<title>_<author>_<date>.<ext>`` into its parts.

    Only the final path component is examined, and its extension is dropped
    before splitting on underscores.

    Args:
        filename: Path (or bare file name) to parse.

    Returns:
        A ``(title, author, date)`` tuple of strings.

    Raises:
        ValueError: If the stem does not consist of exactly three
            underscore-separated parts.
    """
    stem = os.path.splitext(os.path.basename(filename))[0]
    fields = stem.split('_')
    if len(fields) == 3:
        title, author, date = fields
        return title, author, date
    raise ValueError(f"Filename {filename} does not match the expected format 'title_Author_year.md'")

def glob_and_parse_texts(folder_path):
    """Load every ``*.md`` file directly inside *folder_path* as a ``Text``.

    Title, author and date come from the file name via ``parse_filename``;
    the content is the whole file decoded as UTF-8. Files that cannot be
    loaded because of a ``ValueError`` are reported on stdout and skipped,
    so one bad file does not abort the whole load.

    Args:
        folder_path: Directory to scan (not searched recursively).

    Returns:
        A list of ``Text`` objects, in the order ``glob.glob`` yields the files.
    """
    texts = []
    for filepath in glob.glob(os.path.join(folder_path, '*.md')):
        try:
            title, author, date = parse_filename(filepath)
            # Context manager closes the handle deterministically; the explicit
            # encoding avoids depending on the platform's default locale.
            # The read stays inside the try on purpose: UnicodeDecodeError is a
            # ValueError subclass, so an undecodable file is skipped, not fatal.
            with open(filepath, encoding="utf-8") as f:
                content = f.read()
            texts.append(Text(title, author, date, content))
        except ValueError as e:
            print(f"Skipping file {filepath}: {e}")
    return texts

def compile_search_query(query: str) -> re.Pattern:
    """Compile *query* into a case-insensitive, whole-word regular expression.

    The query is inserted as regex source, not as literal text, so regex
    metacharacters in it keep their special meaning. Callers holding literal
    user input should pass it through ``re.escape`` first.

    Args:
        query: Regex source to look for as a whole word or phrase.

    Returns:
        A compiled pattern with ``re.IGNORECASE`` set that only matches where
        the query starts and ends on a word boundary.

    Raises:
        re.error: If the resulting expression is not a valid regex.
    """
    # The non-capturing group makes both boundaries apply to the entire query.
    # Without it an alternation such as "good|bad" would parse as
    # "\bgood" OR "bad\b" and match inside "goodness" / "forbad". A
    # non-capturing group leaves any group numbering in the query untouched.
    pattern = rf"\b(?:{query})\b"
    return re.compile(pattern, re.IGNORECASE)

def search_texts(query: str, texts: list[Text], preview_len: int = 10):
    """Find every occurrence of *query* in *texts* and build preview snippets.

    Args:
        query: Search expression, compiled by ``compile_search_query``.
        texts: Texts whose ``content`` is searched.
        preview_len: Minimum number of characters of context wanted on each
            side of a hit before the snippet is extended to the next space.

    Returns:
        A list of ``SearchResult`` objects, one per match, grouped by text in
        input order. Each result's ``span`` is relative to its snippet and its
        ``location`` is the snippet's start offset within the full text.
    """
    matcher = compile_search_query(query)
    hits = []
    for doc in texts:
        body = doc.content
        for found in matcher.finditer(body):
            hit_start, hit_end = found.span()

            # Left edge: step back `preview_len` characters, then continue back
            # to just after the preceding space (or to offset 0 if none).
            left_limit = max(0, hit_start - preview_len)
            snippet_start = body.rfind(' ', 0, left_limit) + 1

            # Right edge: step forward `preview_len` characters, then continue
            # to the next space (or to the end of the text if none).
            snippet_end = body.find(' ', hit_end + preview_len)
            if snippet_end < 0:
                snippet_end = len(body)

            snippet = body[snippet_start:snippet_end]
            relative_span = (hit_start - snippet_start, hit_end - snippet_start)
            hits.append(
                SearchResult(
                    doc.title,
                    doc.author,
                    doc.date,
                    snippet,
                    relative_span,
                    snippet_start,
                )
            )
    return hits


# Load every parsable text from the corpus folder.
folder_path = './backend/texts'
texts = glob_and_parse_texts(folder_path)

# Demo query: print a header, then one line per hit.
print("\nSearch results:")
hits = search_texts("good", texts)
for result in hits:
    print(result)


Search results:
SearchResult(title='Assyrian Persian Dictionary', author='John Doe', date='1931', content='hello! I am good. Are you good?', span=(12, 16), location=3)
SearchResult(title='Assyrian Persian Dictionary', author='John Doe', date='1931', content="good. Are you good? Let's be", span=(14, 18), location=15)
SearchResult(title='Assyrian Persian Dictionary', author='John Doe', date='1931', content='Okay now, good for you. Good!', span=(10, 14), location=52)
SearchResult(title='Assyrian Persian Dictionary', author='John Doe', date='1931', content='good for you. Good!', span=(14, 18), location=62)
