Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions specs/002-mock-call-real-crm/tasks.md
Original file line number Diff line number Diff line change
Expand Up @@ -167,12 +167,12 @@ Single project — `src/opencloser/`, `tests/` at repository root (per plan.md

### Tests for User Story 6

- [ ] T039 [P] [US6] Tests — US6: `[REDACTED]` replacement, summary-only retention writes no transcript file, no-op policy preserves the Slice 1 artifact contract, and a malformed redaction policy fails readiness, in `tests/unit/test_redaction.py` and `tests/integration/test_us6_redaction.py` (SC-009)
- [X] T039 [P] [US6] Tests — US6: `[REDACTED]` replacement, summary-only retention writes no transcript file, no-op policy preserves the Slice 1 artifact contract, and a malformed redaction policy fails readiness, in `tests/unit/test_redaction.py` and `tests/integration/test_us6_redaction.py` (SC-009)

### Implementation for User Story 6

- [ ] T037 [P] [US6] Implement `src/opencloser/redaction/layer.py` — `RedactionLayer` with `RegexRedactionPolicy` (default `[REDACTED]`), `NoOpPolicy`, and summary-only retention (FR-028–FR-030; contracts/redaction-layer.md)
- [ ] T038 [US6] Route transcript text through `RedactionLayer` in `src/opencloser/artifacts/writer.py` before any transcript disk write, honoring summary-only retention and preserving the Slice 1 summary + transcript-pointer contract
- [X] T037 [P] [US6] Implement `src/opencloser/redaction/layer.py` — `RedactionLayer` with `RegexRedactionPolicy` (default `[REDACTED]`), `NoOpPolicy`, and summary-only retention (FR-028–FR-030; contracts/redaction-layer.md)
- [X] T038 [US6] Route transcript text through `RedactionLayer` in `src/opencloser/artifacts/writer.py` before any transcript disk write, honoring summary-only retention and preserving the Slice 1 summary + transcript-pointer contract

**Checkpoint**: The transcript artifact path is privacy-hardened.

Expand Down
44 changes: 41 additions & 3 deletions src/opencloser/artifacts/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
TaskPayload,
WriteBack,
)
from opencloser.redaction.layer import RedactionLayer

_TRANSCRIPT_FILENAME = "transcript.txt"
_SESSION_RESULT_FILENAME = "session-result.json"
Expand All @@ -31,6 +32,10 @@
_ELIGIBILITY_DECISION_FILENAME = "eligibility-decision.json"
_CONFLICTING_EVENTS_FILENAME = "conflicting-events.json"

# Cached so repeated calls without an explicit layer don't re-compile the built-in
# redaction patterns on every session write.
_DEFAULT_REDACTION_LAYER = RedactionLayer.default()

Comment on lines +35 to +38

@dataclass(frozen=True, slots=True)
class ArtifactPaths:
Expand All @@ -55,11 +60,44 @@ def write_session_artifacts(
transcript_text: str | None = None,
conflicting_events: list[ConflictingEventAuditRecord] | None = None,
task: TaskPayload | None = None,
redaction_layer: RedactionLayer | None = None,
) -> ArtifactPaths:
"""Write all per-session artifacts into ``<artifact_root>/<session_id>/`` atomically."""
"""Write all per-session artifacts into ``<artifact_root>/<session_id>/``.

Each individual file is written atomically (tempfile + ``os.replace``); the set
of artifacts as a whole is not transactional. Atomicity is per-file, which is
what duplicate-event redelivery (FR-019) and idempotent re-emission need.

Transcript text always passes through the configured ``RedactionLayer`` before
disk write (FR-028..FR-030). When the layer's retention mode is
``"summary-only"`` — or no transcript text was supplied — no transcript file
is written; the exported ``transcript_pointer`` is cleared so the
session-result never advertises a file that does not exist. Under
``"summary-only"`` retention, any pre-existing transcript file from an
earlier run is removed so idempotent re-emit cannot leave PII on disk.
"""
session_dir = Path(artifact_root) / session_id
session_dir.mkdir(parents=True, exist_ok=True)

# Decide retention BEFORE writing session-result, so the exported pointer never
# advertises a transcript file that the writer is about to skip
# (contracts/redaction-layer.md §Behavior; FR-030).
effective_layer = redaction_layer or _DEFAULT_REDACTION_LAYER
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Wire artifact writer to configured redaction layer

This fallback always selects RedactionLayer.default() when callers omit redaction_layer, which makes runtime behavior ignore operator-provided [redaction] config. I checked the two production write_session_artifacts call sites in src/opencloser/core/orchestrator.py (lines 256-265 and 601-610), and neither passes a layer; additionally, RedactionLayer.from_config(...) is not used anywhere under src/ (rg -n). In practice, custom patterns, policy = "noop", and retention = "summary-only" from config/slice2.toml will not take effect, and malformed user regexes cannot surface as readiness failures.

Useful? React with 👍 / 👎.

summary_only = (
transcript_text is not None and effective_layer.retention_mode() == "summary-only"
)
transcript_will_be_written = transcript_text is not None and not summary_only
if not transcript_will_be_written and normalized_result.transcript_pointer is not None:
normalized_result = normalized_result.model_copy(update={"transcript_pointer": None})
Comment on lines +82 to +91

# FR-030 + FR-019: when no transcript will be written (summary-only retention OR no
# transcript_text supplied), remove any transcript file from an earlier run BEFORE
# writing session-result.json. If the unlink fails (locked file / permissions), we
# raise here without having advertised a null transcript_pointer that would leave
# the on-disk state both privacy-inconsistent and harder to detect.
if not transcript_will_be_written:
(session_dir / _TRANSCRIPT_FILENAME).unlink(missing_ok=True)

session_result_path = session_dir / _SESSION_RESULT_FILENAME
_write_json_atomic(session_result_path, normalized_result)

Expand All @@ -72,9 +110,9 @@ def write_session_artifacts(
_write_json_atomic(task_path, task)

transcript_path: Path | None = None
if transcript_text is not None:
if transcript_will_be_written:
transcript_path = session_dir / _TRANSCRIPT_FILENAME
_write_text_atomic(transcript_path, transcript_text)
_write_text_atomic(transcript_path, effective_layer.redact(transcript_text))

eligibility_path = session_dir / _ELIGIBILITY_DECISION_FILENAME
_write_json_atomic(eligibility_path, eligibility_decision)
Expand Down
31 changes: 31 additions & 0 deletions src/opencloser/core/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
PersonaOutput,
SessionContext,
)
from opencloser.redaction.layer import RedactionLayer
from opencloser.state import store

if TYPE_CHECKING:
Expand Down Expand Up @@ -114,6 +115,7 @@
conversation_fixture: ConversationFixture | None,
transport_fixture_id: str | None,
clock: Clock | None = None,
redaction_layer: RedactionLayer | None = None,
) -> RunReport:
"""End-to-end Slice 1 orchestration for a single queue record.

Expand All @@ -123,6 +125,13 @@

`conversation_fixture` is required when a call is expected to reach `connected`.
`transport_fixture_id` is the name of the fixture under the transport's fixtures dir.

`redaction_layer` is the configured transcript redaction layer
(``contracts/redaction-layer.md``; FR-028..FR-030). Slice 2 callers should
build one from ``Slice2Config.redaction`` via ``RedactionLayer.from_config``
so operator-configured patterns / policy / retention take effect. Slice 1
callers may omit it; the artifact writer falls back to a default-on layer
so PII never lands on disk unredacted by accident.
"""
clock = clock or SystemClock()
start_ts_monotonic = time.monotonic()
Expand Down Expand Up @@ -188,6 +197,7 @@
clock=clock,
crm=crm,
start_ts_monotonic=start_ts_monotonic,
redaction_layer=redaction_layer,
)
return report

Expand All @@ -206,6 +216,7 @@
conversation_fixture=conversation_fixture,
transport_fixture_id=transport_fixture_id,
start_ts_monotonic=start_ts_monotonic,
redaction_layer=redaction_layer,
)


Expand All @@ -224,6 +235,7 @@
clock: Clock,
crm: WriteBackAdapter,
start_ts_monotonic: float,
redaction_layer: RedactionLayer | None = None,
) -> RunReport:
"""FR-005 block path: queue-status update only, no Phone Call activity, no task."""
new_status = queue_item.callable_status # FR-032: blocked → unchanged
Expand Down Expand Up @@ -273,6 +285,7 @@
task=None,
transcript_text=None,
conflicting_events=None,
redaction_layer=redaction_layer,
)

return RunReport(
Expand All @@ -291,20 +304,21 @@


def _run_allowed_session(
*,
conn: sqlite3.Connection,
session: Session,
queue_item,
decision,
config: SliceConfig,
clock: Clock,
eligibility,
transport: CallTransport,
persona: Persona,
crm: WriteBackAdapter,
conversation_fixture: ConversationFixture | None,
transport_fixture_id: str | None,
start_ts_monotonic: float,
redaction_layer: RedactionLayer | None = None,

Check warning on line 321 in src/opencloser/core/orchestrator.py

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Function "_run_allowed_session" has 14 parameters, which is greater than the 13 authorized.

See more on https://sonarcloud.io/project/issues?id=opensoft_openCareConnect&issues=AZ5TTiSUm5Gz0TQXBFQe&open=AZ5TTiSUm5Gz0TQXBFQe&pullRequest=4
) -> RunReport:
del eligibility # already used; kept for signature symmetry

Expand Down Expand Up @@ -452,6 +466,7 @@
persona_output=persona_output,
transcript_text=transcript_text,
conflicting_events=conflicting_events,
redaction_layer=redaction_layer,
)
return RunReport(
session_id=session.session_id,
Expand Down Expand Up @@ -483,7 +498,7 @@
return Disposition.FAILED


def _finalize_session(

Check failure on line 501 in src/opencloser/core/orchestrator.py

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Refactor this function to reduce its Cognitive Complexity from 19 to the 15 allowed.

See more on https://sonarcloud.io/project/issues?id=opensoft_openCareConnect&issues=AZ5TVKMvqJQ-O8gWr3Gj&open=AZ5TVKMvqJQ-O8gWr3Gj&pullRequest=4
*,
conn: sqlite3.Connection,
session: Session,
Expand All @@ -497,6 +512,7 @@
persona_output: PersonaOutput | None,
transcript_text: str | None,
conflicting_events: list[ConflictingEventAuditRecord],
redaction_layer: RedactionLayer | None = None,
) -> tuple[NormalizedResult, ArtifactPaths]:
"""Finalize the session, emit write-backs per FR-031, write artifacts."""
ended_at = clock.now_utc_ms()
Expand All @@ -511,6 +527,20 @@
ended_at=ended_at,
)

# Apply the redaction layer's retention decision to ``normalized.transcript_pointer``
# BEFORE persisting to the DB. The artifact writer makes the same decision when it
# serializes session-result.json; doing it here too keeps the DB ``normalized_results``
# row consistent with what is actually written to disk (no DB pointer to a file the
# writer is about to skip under summary-only retention or no-transcript-text paths).
no_transcript = transcript_text is None
summary_only = (
not no_transcript
and redaction_layer is not None
and redaction_layer.retention_mode() == "summary-only"
)
if (no_transcript or summary_only) and normalized.transcript_pointer is not None:
normalized = normalized.model_copy(update={"transcript_pointer": None})

# Persist the disposition, normalized result, and queue-item lifecycle update.
# The session's `state` is intentionally NOT flipped to FINALIZED here — that
# flip is the LAST commit, after write-backs and artifacts succeed (N3), so a
Expand Down Expand Up @@ -621,6 +651,7 @@
task=task,
transcript_text=transcript_text,
conflicting_events=conflicting_events or None,
redaction_layer=redaction_layer,
)
Comment on lines 651 to 655

# FINAL commit: flip the session to FINALIZED only now that every write-back row
Expand Down
27 changes: 26 additions & 1 deletion src/opencloser/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -735,14 +735,39 @@ class TaskOwnersConfig(BaseModel):
review: str


# Default ``[redaction] patterns`` for the regex policy: email + NA phone numbers.
#
# Lives on the config so the implicit and config-driven defaults cannot diverge —
# ``RegexRedactionPolicy`` compiles exactly the patterns it is given, no implicit
# prepending. Operators who supply a custom ``patterns`` list replace these
# defaults outright (set ``policy = "noop"`` to disable redaction entirely).
#
# Phone-number regex is privacy-conservative: separators are OPTIONAL so bare
# 10-digit forms (e.g. ``5551234567``) are also redacted (aligned with the
# default in ``config/slice2.toml``). This may over-match true 10-digit IDs in
# transcript text — operators with that concern should configure stricter
# patterns explicitly.
_BUILTIN_REDACTION_PATTERNS: tuple[str, ...] = (
# Email addresses.
r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
# North American phone numbers. Matches all of: 555-123-4567, 5551234567,
# 555.123.4567, (555) 123-4567, +1 555 123 4567.
r"(?:\+?\d{1,2}[\s.-])?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}",
)

BUILTIN_REDACTION_PATTERNS: tuple[str, ...] = _BUILTIN_REDACTION_PATTERNS


class RedactionPolicyConfig(BaseModel):
"""`[redaction]` section (FR-028, FR-029, FR-030)."""

model_config = ConfigDict(extra="forbid")

policy: Literal["regex", "noop"] = "regex"
retention: Literal["full", "summary-only"] = "full"
patterns: list[str] = Field(default_factory=list)
patterns: list[str] = Field(
default_factory=lambda: list(_BUILTIN_REDACTION_PATTERNS),
)


class Slice2Config(BaseModel):
Expand Down
101 changes: 101 additions & 0 deletions src/opencloser/redaction/layer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
"""Transcript redaction layer (FR-028, FR-029, FR-030).

See ``specs/002-mock-call-real-crm/contracts/redaction-layer.md``. The layer is
default-on; disabling redaction requires an explicit ``policy = "noop"`` in
``config/slice2.toml [redaction]``.
"""

from __future__ import annotations

import re
from dataclasses import dataclass
from typing import Literal, Protocol

from opencloser.models import RedactionPolicyConfig

RetentionMode = Literal["full", "summary-only"]

DEFAULT_REPLACEMENT = "[REDACTED]"


class Policy(Protocol):
def redact(self, transcript_text: str) -> str: ...


@dataclass(frozen=True, slots=True)
class NoOpPolicy:
"""Returns transcript text unchanged (FR-029)."""

def redact(self, transcript_text: str) -> str:
return transcript_text


@dataclass(frozen=True, slots=True)
class RegexRedactionPolicy:
"""Replaces every match of the compiled patterns with ``[REDACTED]`` (FR-028).

Compiles exactly the patterns it is given — no implicit built-ins. The
config-driven default set lives in
``opencloser.models.BUILTIN_REDACTION_PATTERNS`` and is wired in via
``RedactionPolicyConfig.patterns`` so config and code cannot drift.
"""

_compiled: tuple[re.Pattern[str], ...]
replacement: str = DEFAULT_REPLACEMENT

@classmethod
def from_patterns(
cls,
patterns: list[str] | tuple[str, ...] = (),
*,
replacement: str = DEFAULT_REPLACEMENT,
) -> RegexRedactionPolicy:
compiled: list[re.Pattern[str]] = []
for raw in patterns:
try:
Comment thread
sourcery-ai[bot] marked this conversation as resolved.
compiled.append(re.compile(raw))
except re.error as exc:
raise ValueError(f"Invalid redaction regex {raw!r}: {exc}") from exc
return cls(_compiled=tuple(compiled), replacement=replacement)

def redact(self, transcript_text: str) -> str:
out = transcript_text
for pattern in self._compiled:
out = pattern.sub(self.replacement, out)
return out
Comment on lines +46 to +65


@dataclass(frozen=True, slots=True)
class RedactionLayer:
"""Transforms transcript text and decides whether a transcript file is written."""

policy: Policy
_retention_mode: RetentionMode

def redact(self, transcript_text: str) -> str:
return self.policy.redact(transcript_text)

def retention_mode(self) -> RetentionMode:
return self._retention_mode

@classmethod
def from_config(cls, config: RedactionPolicyConfig) -> RedactionLayer:
"""Build a layer from validated ``[redaction]`` config (FR-028..FR-030).

Raises ``ValueError`` if any user pattern is not a valid regex — the
orchestrator surfaces this as a readiness failure.
"""
policy: Policy
if config.policy == "regex":
policy = RegexRedactionPolicy.from_patterns(config.patterns)
elif config.policy == "noop":
policy = NoOpPolicy()
else: # pragma: no cover - pydantic Literal guards this
raise ValueError(f"Unknown redaction policy: {config.policy!r}")
return cls(policy=policy, _retention_mode=config.retention)

@classmethod
def default(cls) -> RedactionLayer:
"""Default-on layer (FR-028): mirrors the unconfigured ``[redaction]`` section
so the implicit default and the config-driven default cannot drift apart."""
return cls.from_config(RedactionPolicyConfig())
Loading
Loading