From 340b9db64be48ac2658fc37a4648cb608a2292f4 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 11 Nov 2025 14:10:31 +0530 Subject: [PATCH] fix: expand the source id regex The ":" in the item id part is required for the mail integration to work; the regex has also been expanded to allow alphanumeric text and hyphens in that part. Signed-off-by: Anupam Kumar --- context_chat_backend/chain/ingest/injest.py | 10 ++++++++++ context_chat_backend/utils.py | 3 ++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py index f65c0ea..5871ebb 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -186,6 +186,16 @@ def embed_sources( f'{source.filename} ({_decode_latin_1(source.headers["title"])})' for source in sources_filtered ], + 'invalid_source_ids': [ + source.filename for source in sources + if not is_valid_source_id(source.filename) # pyright: ignore[reportArgumentType] + ], + 'not_allowed_file_ids': [ + source.filename for source in sources + if not _allowed_file(source) + ], + 'len(source_ids)': len(sources_filtered), + 'len(total_source_ids)': len(sources), }) vectordb = vectordb_loader.load() diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index 4d1344e..0d8466a 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -100,7 +100,8 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem def is_valid_source_id(source_id: str) -> bool: - return re.match(r'^[a-zA-Z0-9_-]+__[a-zA-Z0-9_-]+: \d+$', source_id) is not None + # note the ":" in the item id part + return re.match(r'^[a-zA-Z0-9_-]+__[a-zA-Z0-9_-]+: [a-zA-Z0-9:-]+$', source_id) is not None def is_valid_provider_id(provider_id: str) -> bool: