feat: On the shelf customizable features

clemlesne committed Jun 10, 2024
1 parent df59665 commit 689a433
Showing 4 changed files with 74 additions and 13 deletions.
20 changes: 20 additions & 0 deletions README.md
@@ -294,3 +294,23 @@ Finally, run:
# Start the local API server
make dev
```

## Advanced usage

### Configuration

Features are documented in [features.py](helpers/config_models/features.py). They can all be overridden in the `config.yaml` file:

```yaml
# config.yaml
features:
  extract_lang_confidence_threshold: 0.75
  fact_iterations: 10
  fact_score_threshold: 0.5
  llm_retry_count: 3
  page_split_margin: 100
  page_split_size: 666
  sanitize_pdf_version: "1.4"

  [...]
```
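
For context, here is a minimal sketch of how such an override typically reaches the code through the global `CONFIG` object. The loader below (a plain `yaml.safe_load` fed into the Pydantic models) is an assumption for illustration; only the field names and defaults come from this commit.

```python
# Hypothetical loader: the repository's actual wiring may differ.
import yaml
from pydantic import BaseModel, Field


class FeaturesModel(BaseModel):
    fact_iterations: int = Field(default=10, ge=1)
    fact_score_threshold: float = Field(default=0.5, ge=0, le=1)


class RootModel(BaseModel):
    features: FeaturesModel = FeaturesModel()  # Fully defined by default


with open("config.yaml") as f:
    CONFIG = RootModel(**yaml.safe_load(f))

print(CONFIG.features.fact_iterations)  # Overridden value, or the default of 10
```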
23 changes: 10 additions & 13 deletions function_app.py
@@ -98,14 +98,13 @@ async def raw_to_sanitize(input: BlobClientTrigger) -> None:
         await downloader.readinto(in_bytes)
     if _detect_extension(blob_name) == ".pdf":  # Sanitize PDF
         with pikepdf.open(in_bytes) as pdf:
-            target_version = "1.4"
-            logger.info(f"Sanitizing PDF from v{pdf.pdf_version} to v{target_version} ({blob_name})")
+            logger.info(f"Sanitizing PDF from v{pdf.pdf_version} to v{CONFIG.features.sanitize_pdf_version} ({blob_name})")
             out_stream = BytesIO()
             pdf.save(
                 deterministic_id=True,  # Deterministic document ID for caching
                 filename_or_stream=out_stream,
                 linearize=True,  # Allows compliant readers to begin displaying a PDF file before it is fully downloaded
-                min_version=target_version,  # Note, if a second PDF is created with a higher version, hash will be different and cache won't work
+                min_version=CONFIG.features.sanitize_pdf_version,  # Note, if a second PDF is created with a higher version, hash will be different and cache won't work
             )
         # Store
         out_path = _replace_root_path(blob_name, SANITIZE_FOLDER)
@@ -183,7 +182,7 @@ async def sanitize_to_extract(input: BlobClientTrigger) -> None:
         file_md5=blob_md5,
         file_path=blob_name,
         format="markdown",
-        langs={lang.locale for lang in doc_result.languages or [] if lang.confidence > 0.75},
+        langs={lang.locale for lang in doc_result.languages or [] if lang.confidence >= CONFIG.features.extract_lang_confidence_threshold},
         title=title_paragraph.content if title_paragraph else None,
     )
     # Store
@@ -366,9 +365,9 @@ async def synthesis_to_page(input: BlobClientTrigger) -> None:
     del content
     # Prepare chunks for LLM
     pages = _split_text(
-        max_tokens=int(100 / 75 * 500),  # 100 tokens ~= 75 words, ~500 words per page for a dense book
-        text=synthesis_model.chunk_content,
+        max_tokens=CONFIG.features.page_split_size,
         model=CONFIG.llm.fast.model,  # We will use the fast model next step
+        text=synthesis_model.chunk_content,
     )
     logger.info(f"Split into {len(pages)} pages ({blob_name})")
     # Store
@@ -429,7 +428,7 @@ async def page_to_fact(input: BlobClientTrigger) -> None:
     del content
     # LLM does its magic
     facts: list[FactModel] = []
-    for _ in range(10):  # We will generate facts 10 times
+    for _ in range(CONFIG.features.fact_iterations):  # Number of generation passes is configurable
         def _validate(req: Optional[str]) -> tuple[bool, Optional[str], Optional[FactedLlmModel]]:
             if not req:
                 return False, "Empty response", None
@@ -612,7 +611,7 @@ def _validate(req: Optional[str]) -> tuple[bool, Optional[str], Optional[float]]
     )
     kept_facts = []
     for i, fact_score in enumerate(fact_scores):
-        if fact_score >= 0.5:  # Discard low quality facts
+        if fact_score >= CONFIG.features.fact_score_threshold:  # Discard low quality facts
             kept_facts.append(facted_model.facts[i])
     facted_model.facts = kept_facts
     if not facted_model.facts:
@@ -678,8 +677,6 @@ def _split_text(text: str, max_tokens: int, model: str) -> list[str]:
     The function returns a list of text chunks.
     """
     contents = []
-    first_margin = 100
-    last_margin = 100
     max_chars = int(1048576 * 0.9)  # REST API has a limit of 1MB, with a 10% margin
     token_count = _count_tokens(content=text, model=model)

@@ -693,9 +690,9 @@ def _split_text(text: str, max_tokens: int, model: str) -> list[str]:
     chunks_count = math.ceil(token_count / max_chars)
     chunk_size = math.ceil(len(text) / chunks_count)
     for i in range(chunks_count):  # Iterate over desired chunks count
-        start = max(i * chunk_size - first_margin, 0)  # First chunk with margin
+        start = max(i * chunk_size - CONFIG.features.page_split_margin, 0)  # First chunk with margin
         end = min(
-            (i + 1) * chunk_size + last_margin, len(text)
+            (i + 1) * chunk_size + CONFIG.features.page_split_margin, len(text)
         )  # Last chunk with margin
         contents.append(text[start:end])
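
To make the margin behavior concrete, here is a self-contained sketch of the splitting logic above, with the margin passed explicitly instead of read from `CONFIG.features.page_split_margin`:

```python
import math


def split_with_margin(text: str, chunks_count: int, margin: int) -> list[str]:
    # Each chunk overlaps its neighbours by `margin` characters on both sides,
    # so content that falls on a chunk boundary is never silently cut in half.
    chunk_size = math.ceil(len(text) / chunks_count)
    chunks = []
    for i in range(chunks_count):
        start = max(i * chunk_size - margin, 0)  # First chunk has no left margin
        end = min((i + 1) * chunk_size + margin, len(text))  # Last chunk has no right margin
        chunks.append(text[start:end])
    return chunks


assert len(split_with_margin("a" * 1000, chunks_count=4, margin=100)) == 4
```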

@@ -710,7 +707,7 @@ async def _llm_generate(
     temperature: float = 0,
     max_tokens: Optional[int] = None,
     _previous_result: Optional[str] = None,
-    _retries_remaining: int = 3,
+    _retries_remaining: int = CONFIG.features.llm_retry_count,
     _validation_error: Optional[str] = None,
 ) -> Optional[T]:
     """
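The `_retries_remaining` parameter hints at the retry-with-validation loop inside `_llm_generate`, which this diff truncates. Below is a minimal sketch of that pattern with hypothetical names; only the retry count and the validator's return shape come from the diff.

```python
from typing import Callable, Optional, Tuple


def generate_with_retries(
    call: Callable[[], Optional[str]],
    validate: Callable[[Optional[str]], Tuple[bool, Optional[str], Optional[str]]],
    retries_remaining: int = 3,  # Mirrors CONFIG.features.llm_retry_count
) -> Optional[str]:
    # Call the model, validate the response, and retry until validation
    # passes or the retry budget is exhausted.
    while retries_remaining >= 0:
        ok, _error, result = validate(call())
        if ok:
            return result
        retries_remaining -= 1
    return None


# Usage: accept any non-empty response.
print(generate_with_retries(
    call=lambda: "a fact",
    validate=lambda req: (bool(req), None if req else "Empty response", req),
))
```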
40 changes: 40 additions & 0 deletions helpers/config_models/features.py
@@ -0,0 +1,40 @@
from pydantic import BaseModel, Field


class FeaturesModel(BaseModel):
    sanitize_pdf_version: str = Field(
        default="1.4",
        description="PDF specification version to use when sanitizing PDFs.",
    )
    extract_lang_confidence_threshold: float = Field(
        default=0.75,
        description="The minimum confidence level required to mark a language as detected.",
        ge=0,
        le=1,
    )
    fact_iterations: int = Field(
        default=10,
        description="The number of iterations to run the fact extraction process.",
        ge=1,
    )
    fact_score_threshold: float = Field(
        default=0.5,
        description="The minimum score a fact must have to be considered valid.",
        ge=0,
        le=1,
    )
    page_split_size: int = Field(
        default=int(100 / 75 * 500),  # 100 tokens ~= 75 words, ~500 words per page for a dense book
        description="The maximum number of tokens to allow on a single page.",
        ge=0,
    )
    page_split_margin: int = Field(
        default=100,
        description="The margin in characters to use when splitting pages.",
        ge=0,
    )
    llm_retry_count: int = Field(
        default=3,
        description="The number of times to retry a failed LLM request. This includes the initial request and validation.",
        ge=0,
    )
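
Because every numeric field carries `ge`/`le` bounds, an out-of-range override in `config.yaml` fails at startup rather than producing silent misbehavior. A quick illustration, assuming Pydantic's usual `ValidationError`:

```python
from pydantic import ValidationError

try:
    FeaturesModel(fact_score_threshold=1.5)  # Violates the le=1 bound
except ValidationError as err:
    print(err)  # Reports that fact_score_threshold must be <= 1
```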
4 changes: 4 additions & 0 deletions helpers/config_models/root.py
@@ -1,5 +1,6 @@
 from helpers.config_models.ai_search import AiSearchModel
 from helpers.config_models.document_intelligence import DocumentIntelligenceModel
+from helpers.config_models.features import FeaturesModel
 from helpers.config_models.llm import LlmModel
 from helpers.config_models.monitoring import MonitoringModel
 from pydantic import Field
@@ -19,6 +20,9 @@ class RootModel(BaseSettings):
     # Editable fields
     ai_search: AiSearchModel
     document_intelligence: DocumentIntelligenceModel
+    features: FeaturesModel = (
+        FeaturesModel()
+    )  # Object is fully defined by default
     llm: LlmModel
     monitoring: MonitoringModel = (
         MonitoringModel()
