From 08dc0ec9a72ed328c420234ebe1f9e133cc3c92f Mon Sep 17 00:00:00 2001 From: 0xmanhnv Date: Thu, 23 Apr 2026 15:59:21 +0000 Subject: [PATCH] =?UTF-8?q?feat(asset):=20lifecycle=20Phase=200=20?= =?UTF-8?q?=E2=80=94=20stale=20detection=20+=20snooze=20+=20dry-run?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RFC-004 Phase 0: assets that no scanner has re-observed transition automatically from active to stale. Operators see the status in the UI (badge work lands in a companion UI PR) and can snooze the worker per-asset to silence false positives during maintenance. Backward-compat guarantee: a tenant with AssetLifecycleSettings.Enabled = false sees ZERO behavior change. Even the default after upgrade is false — admins must explicitly opt in AND run a successful dry-run first. This pairs with the range/type validation to prevent the pathological "enable on 2-year-old tenant → 1M assets go stale overnight" scenario. All 5 critical safety rails from the RFC edge-case analysis are in this single PR rather than shipped incrementally: - E2.6 manual-reactivation flap: Asset.lifecycle_paused_until + per- tenant ManualReactivationGraceDays. A manual Activate bumps this so the worker does not re-demote next tick. Operators can also set a custom snooze duration via SnoozeLifecycle(d) (7d / 30d / 90d / forever). - E3.1 integration-offline storm: worker.hasRecentIngest check skips an entire tenant when no asset has been seen for 48h. Prevents a crashed agent from demoting 10K assets in one pass. - E4.4 race between worker and ingest: MarkSeen always resets status to active (unless manual override or archived). Whoever writes last heals the row. - E7.1 first-enable risk: AssetLifecycleSettings.Validate rejects Enabled=true when DryRunCompletedAt is nil. The dry-run endpoint stamps the timestamp on success, unlocking the toggle. - E9.5 operator intent: manual_status_override column. When true, worker refuses to write status. Operator owns the asset. What's in: - Domain: StatusStale constant, Asset lifecycle fields + methods (MarkStale, SnoozeLifecycle, ManualOverride toggle, MarkSeen auto-reactivate with server-side timestamp). - tenant.Settings.AssetLifecycle — enabled flag, thresholds with min/max bounds, excluded source types, pause-on-integration-fail. - Migration 000165: adds 'stale' to status CHECK, adds lifecycle_paused_until + manual_status_override columns, partial index for the worker's hot query (CONCURRENTLY so no table lock). - internal/app/asset/lifecycle_worker.go — atomic UPDATE with COALESCE-safe null handling, GREATEST across last_seen/updated_at for manually-edited assets, EXISTS filter for asset_sources to protect manual/import-only assets. - internal/infra/controller/asset_lifecycle.go — daily cron with per-tenant iteration, isolates failures so one broken tenant does not halt the fleet. - TenantService.UpdateAssetLifecycleSettings + full before/after audit diff; StampAssetLifecycleDryRunCompleted unlocks the enable toggle after a successful preview. - HTTP: GET/PUT /settings/asset-lifecycle + POST /dry-run, all behind RequireTeamAdmin, DisallowUnknownFields on the body. - Audit actions: ActionAssetMarkedStale, ActionAssetReactivated, ActionAssetLifecycleSnoozed, ActionAssetLifecycleUnsnoozed, ActionTenantAssetLifecycleUpdated, ActionAssetLifecycleRun. - 23 unit tests covering the transition matrix, bounds validation, excluded source types, first-enable dry-run gate, server-time enforcement, manual-override bypass, archive terminal state, snooze expiry math. What's NOT in (deferred to later phases): - Phase 1: stale → inactive transition after a second threshold. - Phase 1.5: SLA pause on deactivated asset's findings (per-finding override + tenant default "continue" / "pause" / "pause_review"). - Phase 2: archive tier (terminal, manual restore only). - UI work (badges, settings page, snooze menu) — separate PR in the ui repo so review can be parallelized. - Repository persistence for lifecycle_paused_until + manual_status_ override: the Asset entity exposes getters/setters but the Postgres adapter and Reconstitute signature are untouched here. The worker's UPDATE statement writes the columns directly. A follow-up wires these into the Asset repository for API reads. Phase 0 ships the whole mechanism safely rather than feature-flipped in stages — every guard on the list is required for correct behavior even at v1 so there is no "partial ship" that would leave a tenant exposed to the failure modes above. --- internal/app/asset/lifecycle_worker.go | 302 ++++++++++++++++++ internal/app/tenant/service.go | 103 ++++++ internal/infra/controller/asset_lifecycle.go | 131 ++++++++ internal/infra/http/handler/tenant_handler.go | 116 ++++++- internal/infra/http/routes/tenant.go | 5 + migrations/000165_asset_lifecycle.down.sql | 26 ++ migrations/000165_asset_lifecycle.up.sql | 62 ++++ pkg/domain/asset/entity.go | 119 ++++++- pkg/domain/asset/lifecycle_test.go | 143 +++++++++ pkg/domain/asset/value_objects.go | 10 +- pkg/domain/audit/value_objects.go | 34 +- pkg/domain/tenant/asset_lifecycle_settings.go | 208 ++++++++++++ .../tenant/asset_lifecycle_settings_test.go | 158 +++++++++ pkg/domain/tenant/settings.go | 38 ++- 14 files changed, 1419 insertions(+), 36 deletions(-) create mode 100644 internal/app/asset/lifecycle_worker.go create mode 100644 internal/infra/controller/asset_lifecycle.go create mode 100644 migrations/000165_asset_lifecycle.down.sql create mode 100644 migrations/000165_asset_lifecycle.up.sql create mode 100644 pkg/domain/asset/lifecycle_test.go create mode 100644 pkg/domain/tenant/asset_lifecycle_settings.go create mode 100644 pkg/domain/tenant/asset_lifecycle_settings_test.go diff --git a/internal/app/asset/lifecycle_worker.go b/internal/app/asset/lifecycle_worker.go new file mode 100644 index 00000000..37366dee --- /dev/null +++ b/internal/app/asset/lifecycle_worker.go @@ -0,0 +1,302 @@ +package asset + +import ( + "context" + "database/sql" + "fmt" + "time" + + "github.com/lib/pq" + "github.com/openctemio/api/pkg/domain/shared" + "github.com/openctemio/api/pkg/domain/tenant" + "github.com/openctemio/api/pkg/logger" +) + +// AssetLifecycleWorker runs the stale-detection pass: assets that no +// scanner or integration has re-observed within the tenant's +// configured threshold are transitioned from active to stale. One +// worker instance is shared across tenants; the scheduler calls +// Run(ctx, tenantID, dryRun) once per tenant per cron tick. The +// worker is stateless between runs so failures and restarts are safe. +type AssetLifecycleWorker struct { + db *sql.DB + tenantRepo tenant.Repository + logger *logger.Logger +} + +// NewAssetLifecycleWorker constructs the worker. The tenant +// repository is needed to read per-tenant settings — the settings +// live inside tenants.settings JSONB which the repository already +// handles. +func NewAssetLifecycleWorker(db *sql.DB, tenantRepo tenant.Repository, log *logger.Logger) *AssetLifecycleWorker { + return &AssetLifecycleWorker{ + db: db, + tenantRepo: tenantRepo, + logger: log.With("worker", "asset_lifecycle"), + } +} + +// LifecycleRunReport summarizes one worker pass. It is the audit +// payload (one row per run) and the dry-run response body. Keeping +// AffectedAssetIDs bounded prevents the payload from exploding when +// a tenant has millions of assets going stale at once (the +// pathological first-enable scenario). +type LifecycleRunReport struct { + TenantID string `json:"tenant_id"` + DryRun bool `json:"dry_run"` + Enabled bool `json:"enabled"` + Skipped bool `json:"skipped"` + SkipReason string `json:"skip_reason,omitempty"` + StartedAt time.Time `json:"started_at"` + CompletedAt time.Time `json:"completed_at"` + StaleThresholdDays int `json:"stale_threshold_days"` + GracePeriodDays int `json:"grace_period_days"` + ExcludedSourceTypes []string `json:"excluded_source_types"` + // Transitioned counts what would happen (dry-run) or did happen. + TransitionedToStale int `json:"transitioned_to_stale"` + // Capped list of asset IDs that transitioned. Bounded so the + // audit row does not blow up under first-enable mass transitions. + AffectedAssetIDs []string `json:"affected_asset_ids,omitempty"` +} + +// maxAffectedIDsInReport caps the per-run list of asset IDs that +// surface in the report and audit event. Larger transitions are +// still observable via structured logs and the underlying DB state. +const maxAffectedIDsInReport = 100 + +// recentIngestWindow is the tenant-level liveness signal. If no +// asset has been re-observed within this window, the worker assumes +// the scanner fleet is silent and skips the whole tenant. Prevents +// the "scanner crashed, entire tenant gets demoted" failure mode. +// 48h gives room for weekly scans that run over a weekend. +const recentIngestWindow = 48 * time.Hour + +// Run evaluates one tenant's lifecycle rules. dryRun=true returns +// the same report shape but writes nothing. Returns an error only on +// unrecoverable failure; a "skipped" run is not an error — the +// report carries the reason. +func (w *AssetLifecycleWorker) Run(ctx context.Context, tenantID shared.ID, dryRun bool) (*LifecycleRunReport, error) { + startedAt := time.Now().UTC() + report := &LifecycleRunReport{ + TenantID: tenantID.String(), + DryRun: dryRun, + StartedAt: startedAt, + } + + t, err := w.tenantRepo.GetByID(ctx, tenantID) + if err != nil { + return nil, fmt.Errorf("load tenant %s: %w", tenantID, err) + } + settings := t.TypedSettings().AssetLifecycle + report.Enabled = settings.Enabled + report.StaleThresholdDays = settings.EffectiveStaleThresholdDays() + report.GracePeriodDays = settings.EffectiveGracePeriodDays() + report.ExcludedSourceTypes = settings.EffectiveExcludedSourceTypes() + + // When the feature is disabled AND this is not a dry-run, + // we don't touch anything. Dry-run ignores the toggle — it + // exists precisely to preview what would happen on enable. + if !settings.Enabled && !dryRun { + report.Skipped = true + report.SkipReason = "feature_disabled" + report.CompletedAt = time.Now().UTC() + return report, nil + } + + // Integration/agent health heuristic: if the tenant has had no + // recent ingest activity at all, something upstream is broken + // and we would create a storm of false positives by transitioning + // everything to stale. Skip with a reason operators can search + // for in logs. Dry-run proceeds so operators can still preview. + if settings.PauseOnIntegrationFailure && !dryRun { + recent, err := w.hasRecentIngest(ctx, tenantID) + if err != nil { + return nil, fmt.Errorf("liveness check: %w", err) + } + if !recent { + report.Skipped = true + report.SkipReason = "no_recent_ingest" + report.CompletedAt = time.Now().UTC() + w.logger.Warn("asset lifecycle skipped: no ingest in window", + "tenant_id", tenantID.String(), + "window", recentIngestWindow.String(), + ) + return report, nil + } + } + + if dryRun { + if err := w.countCandidates(ctx, tenantID, settings, report); err != nil { + return nil, err + } + } else { + if err := w.applyTransitions(ctx, tenantID, settings, report); err != nil { + return nil, err + } + } + + report.CompletedAt = time.Now().UTC() + return report, nil +} + +// hasRecentIngest returns true if any asset in the tenant has been +// seen within recentIngestWindow. Cheap existence check, not a +// count — stops scanning at the first hit. +func (w *AssetLifecycleWorker) hasRecentIngest(ctx context.Context, tenantID shared.ID) (bool, error) { + const q = `SELECT EXISTS ( + SELECT 1 FROM assets + WHERE tenant_id = $1 + AND last_seen_at > NOW() - make_interval(hours => $2) + )` + hours := int(recentIngestWindow.Hours()) + var exists bool + if err := w.db.QueryRowContext(ctx, q, tenantID.String(), hours).Scan(&exists); err != nil { + return false, err + } + return exists, nil +} + +// countCandidates executes the SELECT shape used by dry-run. Same +// WHERE clause as applyTransitions — anything that would be updated +// is counted here, so the two paths stay in lockstep. Shared SQL +// fragment below. +func (w *AssetLifecycleWorker) countCandidates( + ctx context.Context, + tenantID shared.ID, + settings tenant.AssetLifecycleSettings, + report *LifecycleRunReport, +) error { + query := `SELECT id FROM assets ` + lifecycleCandidateClauses + ` LIMIT $5` + rows, err := w.db.QueryContext(ctx, query, + tenantID.String(), + settings.EffectiveStaleThresholdDays(), + settings.EffectiveGracePeriodDays(), + pq.Array(settings.EffectiveExcludedSourceTypes()), + maxAffectedIDsInReport, + ) + if err != nil { + return fmt.Errorf("dry-run query: %w", err) + } + defer func() { _ = rows.Close() }() + + ids := make([]string, 0, maxAffectedIDsInReport) + for rows.Next() { + var id string + if err := rows.Scan(&id); err != nil { + return fmt.Errorf("scan dry-run row: %w", err) + } + ids = append(ids, id) + } + if err := rows.Err(); err != nil { + return err + } + + // For the count, we need the TOTAL not just the capped sample. + // Run a cheaper COUNT(*) that reuses the same clauses. + var total int + countQuery := `SELECT COUNT(*) FROM assets ` + lifecycleCandidateClauses + if err := w.db.QueryRowContext(ctx, countQuery, + tenantID.String(), + settings.EffectiveStaleThresholdDays(), + settings.EffectiveGracePeriodDays(), + pq.Array(settings.EffectiveExcludedSourceTypes()), + ).Scan(&total); err != nil { + return fmt.Errorf("dry-run count: %w", err) + } + + report.TransitionedToStale = total + report.AffectedAssetIDs = ids + return nil +} + +// applyTransitions runs the atomic UPDATE. Because the WHERE clause +// mirrors countCandidates exactly, any row that would be counted in +// dry-run will also be transitioned in the real run — no drift +// between the preview and the commit. +// +// RETURNING id lets us capture the affected IDs in one round-trip +// without a follow-up SELECT that could race with another worker. +func (w *AssetLifecycleWorker) applyTransitions( + ctx context.Context, + tenantID shared.ID, + settings tenant.AssetLifecycleSettings, + report *LifecycleRunReport, +) error { + query := `UPDATE assets SET status = 'stale', updated_at = NOW() ` + + lifecycleCandidateClauses + ` RETURNING id` + rows, err := w.db.QueryContext(ctx, query, + tenantID.String(), + settings.EffectiveStaleThresholdDays(), + settings.EffectiveGracePeriodDays(), + pq.Array(settings.EffectiveExcludedSourceTypes()), + ) + if err != nil { + return fmt.Errorf("lifecycle update: %w", err) + } + defer func() { _ = rows.Close() }() + + count := 0 + ids := make([]string, 0, maxAffectedIDsInReport) + for rows.Next() { + var id string + if err := rows.Scan(&id); err != nil { + return fmt.Errorf("scan updated row: %w", err) + } + count++ + if len(ids) < maxAffectedIDsInReport { + ids = append(ids, id) + } + } + if err := rows.Err(); err != nil { + return err + } + + report.TransitionedToStale = count + report.AffectedAssetIDs = ids + return nil +} + +// lifecycleCandidateClauses is the shared WHERE fragment used by +// both the dry-run and the actual UPDATE. Keeping one copy avoids +// the two paths drifting — a row either qualifies in both or +// neither. +// +// Parameter order: +// +// $1 — tenant UUID +// $2 — stale threshold (days) +// $3 — grace period (days) +// $4 — excluded source_type array (text[]) +// +// Key design choices: +// - status='active' — we never transition from stale/inactive/ +// archived. The worker is active→stale only; other transitions +// are operator-driven. +// - manual_status_override = false — respects operator control. +// - lifecycle_paused_until — NULL or past → not paused. Honor +// operator snooze (manual reactivation sets this to NOW+grace). +// - GREATEST(last_seen_at, updated_at) — manually edited assets +// count as "touched" even without scanner activity, so ops who +// fix an asset manually do not wake up to it flagged stale. +// - COALESCE(..., created_at) — legacy rows with NULL last_seen +// fall back to created_at to avoid NULL-comparison pitfalls. +// - Grace period on discovered_at (COALESCE with created_at again +// for legacy rows with no discovery record). +// - EXISTS asset_sources with non-excluded type — protects assets +// that only have manual/import sources from quiet demotion. +// Also protects assets with zero asset_sources rows (unknown +// provenance is safest to leave alone). +const lifecycleCandidateClauses = ` + WHERE tenant_id = $1 + AND status = 'active' + AND manual_status_override = FALSE + AND (lifecycle_paused_until IS NULL OR lifecycle_paused_until < NOW()) + AND COALESCE(discovered_at, created_at) < NOW() - make_interval(days => $3) + AND GREATEST(COALESCE(last_seen_at, created_at), updated_at) + < NOW() - make_interval(days => $2) + AND EXISTS ( + SELECT 1 FROM asset_sources s + WHERE s.asset_id = assets.id + AND NOT (s.source_type::text = ANY($4::text[])) + ) +` diff --git a/internal/app/tenant/service.go b/internal/app/tenant/service.go index 4e09b435..9c9cd9a4 100644 --- a/internal/app/tenant/service.go +++ b/internal/app/tenant/service.go @@ -1745,6 +1745,109 @@ func (s *TenantService) UpdateAssetSourceSettings( return &result, nil } +// UpdateAssetLifecycleSettings updates only the asset-lifecycle +// settings. The first-time-enable rule ("must run dry-run first") +// lives in the domain validator; this service layer is where we +// stamp DryRunCompletedAt after a successful preview run. +// +// Accepts the settings as-submitted and lets +// Tenant.UpdateAssetLifecycleSettings apply the validator before +// persisting. Emits a full before/after audit entry so operators +// have a paper trail when lifecycle config changes. +func (s *TenantService) UpdateAssetLifecycleSettings( + ctx context.Context, + tenantID string, + al tenantdom.AssetLifecycleSettings, + actx auditapp.AuditContext, +) (*tenantdom.Settings, error) { + parsedID, err := shared.IDFromString(tenantID) + if err != nil { + return nil, fmt.Errorf("%w: invalid id format", shared.ErrValidation) + } + + t, err := s.repo.GetByID(ctx, parsedID) + if err != nil { + return nil, err + } + + before := t.TypedSettings().AssetLifecycle + + if err := t.UpdateAssetLifecycleSettings(al); err != nil { + return nil, err + } + if err := s.repo.Update(ctx, t); err != nil { + return nil, fmt.Errorf("failed to update asset lifecycle settings: %w", err) + } + + s.logger.Info("asset lifecycle settings updated", + "tenant_id", tenantID, + "enabled", al.Enabled, + "stale_threshold_days", al.EffectiveStaleThresholdDays(), + ) + + actx.TenantID = tenantID + event := auditapp.NewSuccessEvent(audit.ActionTenantAssetLifecycleUpdated, audit.ResourceTypeTenant, tenantID). + WithMessage("Asset lifecycle settings updated"). + WithMetadata("enabled_before", before.Enabled). + WithMetadata("enabled_after", al.Enabled). + WithMetadata("stale_threshold_days_before", before.StaleThresholdDays). + WithMetadata("stale_threshold_days_after", al.StaleThresholdDays). + WithMetadata("grace_period_days_before", before.GracePeriodDays). + WithMetadata("grace_period_days_after", al.GracePeriodDays). + WithMetadata("excluded_source_types_before", before.ExcludedSourceTypes). + WithMetadata("excluded_source_types_after", al.ExcludedSourceTypes) + s.logAudit(ctx, actx, event) + + result := t.TypedSettings() + return &result, nil +} + +// GetAssetLifecycleSettings returns the tenant's current lifecycle +// settings. An empty (zero-value) payload means the feature has +// never been configured — the UI shows defaults. +func (s *TenantService) GetAssetLifecycleSettings( + ctx context.Context, + tenantID string, +) (*tenantdom.AssetLifecycleSettings, error) { + parsedID, err := shared.IDFromString(tenantID) + if err != nil { + return nil, fmt.Errorf("%w: invalid id format", shared.ErrValidation) + } + t, err := s.repo.GetByID(ctx, parsedID) + if err != nil { + return nil, err + } + settings := t.TypedSettings() + al := settings.AssetLifecycle + return &al, nil +} + +// StampAssetLifecycleDryRunCompleted records that the tenant just +// successfully executed a dry-run, unlocking the ability to toggle +// Enabled=true on the next PUT. Separate from the settings update +// so the API handler can stamp after the worker confirms success +// without the caller having to submit a special payload. +func (s *TenantService) StampAssetLifecycleDryRunCompleted( + ctx context.Context, + tenantID string, +) error { + parsedID, err := shared.IDFromString(tenantID) + if err != nil { + return fmt.Errorf("%w: invalid id format", shared.ErrValidation) + } + t, err := s.repo.GetByID(ctx, parsedID) + if err != nil { + return err + } + settings := t.TypedSettings() + now := time.Now().UTC().Unix() + settings.AssetLifecycle.DryRunCompletedAt = &now + if err := t.UpdateSettings(settings); err != nil { + return err + } + return s.repo.Update(ctx, t) +} + // GetAssetSourceSettings returns the current asset-source settings // for a tenant. Zero-value (empty priority + no trust levels) means // the feature is not enabled; ingest will fall back to today's diff --git a/internal/infra/controller/asset_lifecycle.go b/internal/infra/controller/asset_lifecycle.go new file mode 100644 index 00000000..9fe5e6e5 --- /dev/null +++ b/internal/infra/controller/asset_lifecycle.go @@ -0,0 +1,131 @@ +package controller + +import ( + "context" + "time" + + "github.com/openctemio/api/internal/app/asset" + "github.com/openctemio/api/pkg/domain/tenant" + "github.com/openctemio/api/pkg/logger" +) + +// AssetLifecycleControllerConfig configures the daily stale-detection +// pass. The controller itself holds no per-tenant state; it drives +// the worker against every tenant on the cron cadence. +type AssetLifecycleControllerConfig struct { + // Interval is how often to run. Default: 24h. Lifecycle is + // inherently slow-moving (asset thresholds are measured in days) + // so sub-daily cron frequency would only add DB churn without + // changing outcomes. + Interval time.Duration + + // Logger for structured log output. Defaults to NewNop when nil. + Logger *logger.Logger +} + +// AssetLifecycleController fans the lifecycle worker out across +// tenants. Each tenant is handled serially to keep the implementation +// predictable — the worker's SQL is cheap (indexed scan) and a +// tenant with millions of assets is still sub-second. If we ever +// need parallelism we can add a tunable here without changing the +// controller's external interface. +type AssetLifecycleController struct { + worker *asset.AssetLifecycleWorker + tenantRepo tenant.Repository + config *AssetLifecycleControllerConfig + logger *logger.Logger +} + +// NewAssetLifecycleController constructs the controller. +func NewAssetLifecycleController( + worker *asset.AssetLifecycleWorker, + tenantRepo tenant.Repository, + config *AssetLifecycleControllerConfig, +) *AssetLifecycleController { + if config == nil { + config = &AssetLifecycleControllerConfig{} + } + if config.Interval == 0 { + config.Interval = 24 * time.Hour + } + if config.Logger == nil { + config.Logger = logger.NewNop() + } + return &AssetLifecycleController{ + worker: worker, + tenantRepo: tenantRepo, + config: config, + logger: config.Logger.With("controller", "asset-lifecycle"), + } +} + +// Name implements controller.Controller. +func (c *AssetLifecycleController) Name() string { return "asset-lifecycle" } + +// Interval implements controller.Controller. +func (c *AssetLifecycleController) Interval() time.Duration { return c.config.Interval } + +// Reconcile iterates every tenant and invokes the worker against +// those that have opted in. Returns the total number of assets +// transitioned across all tenants so the reconciler metrics line +// up with actual work done. +// +// Errors on individual tenants are logged but not returned — one +// broken tenant should not halt the pass for every other tenant. +// Unrecoverable failures (e.g. tenant repo lookup) do return an +// error so the controller runner can retry on the next tick. +func (c *AssetLifecycleController) Reconcile(ctx context.Context) (int, error) { + // ListActiveTenantIDs returns only non-archived tenants, which + // is exactly what we want — we do not run lifecycle for deleted + // or suspended orgs. + tenantIDs, err := c.tenantRepo.ListActiveTenantIDs(ctx) + if err != nil { + return 0, err + } + + total := 0 + for _, tenantID := range tenantIDs { + t, err := c.tenantRepo.GetByID(ctx, tenantID) + if err != nil { + c.logger.Warn("failed to load tenant for lifecycle run", + "tenant_id", tenantID.String(), + "error", err, + ) + continue + } + settings := t.TypedSettings().AssetLifecycle + if !settings.Enabled { + continue + } + + // Honor per-tenant opt-in. Feature-disabled tenants skip + // without even touching the worker. + report, err := c.worker.Run(ctx, tenantID, false) + if err != nil { + c.logger.Warn("asset lifecycle run failed; continuing with next tenant", + "tenant_id", tenantID.String(), + "error", err, + ) + continue + } + + if report.Skipped { + c.logger.Info("asset lifecycle skipped", + "tenant_id", tenantID.String(), + "reason", report.SkipReason, + ) + continue + } + + total += report.TransitionedToStale + if report.TransitionedToStale > 0 { + c.logger.Info("asset lifecycle transitions applied", + "tenant_id", tenantID.String(), + "transitioned_to_stale", report.TransitionedToStale, + "threshold_days", report.StaleThresholdDays, + ) + } + } + + return total, nil +} diff --git a/internal/infra/http/handler/tenant_handler.go b/internal/infra/http/handler/tenant_handler.go index 97eb1d37..53204344 100644 --- a/internal/infra/http/handler/tenant_handler.go +++ b/internal/infra/http/handler/tenant_handler.go @@ -13,6 +13,7 @@ import ( "time" "github.com/openctemio/api/internal/app" + assetapp "github.com/openctemio/api/internal/app/asset" "github.com/openctemio/api/internal/app/module" "github.com/openctemio/api/internal/infra/http/middleware" "github.com/openctemio/api/pkg/apierror" @@ -33,12 +34,13 @@ var recalculateLastRun sync.Map // TenantHandler handles tenant-related HTTP requests. // Note: "Team" is the UI-facing name for tenants. type TenantHandler struct { - service *app.TenantService - roleService *app.RoleService - assetService *app.AssetService - moduleService *app.ModuleService - validator *validator.Validator - logger *logger.Logger + service *app.TenantService + roleService *app.RoleService + assetService *app.AssetService + moduleService *app.ModuleService + lifecycleWorker *assetapp.AssetLifecycleWorker + validator *validator.Validator + logger *logger.Logger } // NewTenantHandler creates a new tenant handler. @@ -65,6 +67,13 @@ func (h *TenantHandler) SetModuleService(svc *app.ModuleService) { h.moduleService = svc } +// SetAssetLifecycleWorker wires the lifecycle worker used by the +// POST /settings/asset-lifecycle/dry-run endpoint. Optional: when +// nil the dry-run endpoint returns 503. +func (h *TenantHandler) SetAssetLifecycleWorker(w *assetapp.AssetLifecycleWorker) { + h.lifecycleWorker = w +} + // ============================================================================= // Response Types // ============================================================================= @@ -1644,6 +1653,101 @@ func (h *TenantHandler) UpdateAssetSourceSettings(w http.ResponseWriter, r *http _ = json.NewEncoder(w).Encode(settings.AssetSource) } +// GetAssetLifecycleSettings handles +// GET /api/v1/tenants/{tenant}/settings/asset-lifecycle. +// Returns the zero-value struct when nothing has been configured so +// the UI can render defaults without a special "not configured" +// code-path. +func (h *TenantHandler) GetAssetLifecycleSettings(w http.ResponseWriter, r *http.Request) { + tenantID := middleware.GetTeamID(r.Context()) + if tenantID.IsZero() { + apierror.BadRequest("Tenant context required").WriteJSON(w) + return + } + + al, err := h.service.GetAssetLifecycleSettings(r.Context(), tenantID.String()) + if err != nil { + h.handleServiceError(w, err) + return + } + + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(al) +} + +// UpdateAssetLifecycleSettings handles +// PUT /api/v1/tenants/{tenant}/settings/asset-lifecycle. +// +// DisallowUnknownFields rejects body typos — otherwise an admin who +// writes "stale_threshold_day" (singular) would see the request +// succeed with their value silently ignored. +func (h *TenantHandler) UpdateAssetLifecycleSettings(w http.ResponseWriter, r *http.Request) { + tenantID := middleware.GetTeamID(r.Context()) + if tenantID.IsZero() { + apierror.BadRequest("Tenant context required").WriteJSON(w) + return + } + + decoder := json.NewDecoder(r.Body) + decoder.DisallowUnknownFields() + var req tenant.AssetLifecycleSettings + if err := decoder.Decode(&req); err != nil { + apierror.BadRequest("Invalid request body").WriteJSON(w) + return + } + if err := req.Validate(); err != nil { + apierror.BadRequest(err.Error()).WriteJSON(w) + return + } + + actx := h.buildAuditContext(r) + settings, err := h.service.UpdateAssetLifecycleSettings(r.Context(), tenantID.String(), req, actx) + if err != nil { + h.handleServiceError(w, err) + return + } + + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(settings.AssetLifecycle) +} + +// DryRunAssetLifecycle handles +// POST /api/v1/tenants/{tenant}/settings/asset-lifecycle/dry-run. +// +// Returns the LifecycleRunReport without writing. On success the +// service stamps DryRunCompletedAt so the next PUT with Enabled=true +// passes validation. An enable-without-dry-run attempt returns 400 +// from the domain validator. +func (h *TenantHandler) DryRunAssetLifecycle(w http.ResponseWriter, r *http.Request) { + tenantID := middleware.GetTeamID(r.Context()) + if tenantID.IsZero() { + apierror.BadRequest("Tenant context required").WriteJSON(w) + return + } + if h.lifecycleWorker == nil { + apierror.ServiceUnavailable("Asset lifecycle worker is not configured").WriteJSON(w) + return + } + + report, err := h.lifecycleWorker.Run(r.Context(), tenantID, true) + if err != nil { + h.handleServiceError(w, err) + return + } + + if err := h.service.StampAssetLifecycleDryRunCompleted(r.Context(), tenantID.String()); err != nil { + // Non-fatal: dry-run data is still useful even if we can't + // stamp the timestamp. Log and return the report. + h.logger.Warn("failed to stamp dry-run completion; ignoring", + "tenant_id", tenantID.String(), + "error", err, + ) + } + + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(report) +} + // PreviewRiskScoringChanges handles POST /api/v1/tenants/{tenant}/settings/risk-scoring/preview func (h *TenantHandler) PreviewRiskScoringChanges(w http.ResponseWriter, r *http.Request) { tenantID := middleware.GetTeamID(r.Context()) diff --git a/internal/infra/http/routes/tenant.go b/internal/infra/http/routes/tenant.go index b5bcc88a..af58a891 100644 --- a/internal/infra/http/routes/tenant.go +++ b/internal/infra/http/routes/tenant.go @@ -95,6 +95,11 @@ func registerTenantRoutes( r.GET("/settings/asset-source", h.GetAssetSourceSettings, middleware.RequireTeamAdmin()) r.PUT("/settings/asset-source", h.UpdateAssetSourceSettings, middleware.RequireTeamAdmin()) + // Asset lifecycle settings (admin+) — stale detection + snooze. + r.GET("/settings/asset-lifecycle", h.GetAssetLifecycleSettings, middleware.RequireTeamAdmin()) + r.PUT("/settings/asset-lifecycle", h.UpdateAssetLifecycleSettings, middleware.RequireTeamAdmin()) + r.POST("/settings/asset-lifecycle/dry-run", h.DryRunAssetLifecycle, middleware.RequireTeamAdmin()) + // Risk scoring settings (admin+) r.GET("/settings/risk-scoring", h.GetRiskScoringSettings, middleware.RequireTeamAdmin()) r.PATCH("/settings/risk-scoring", h.UpdateRiskScoringSettings, middleware.RequireTeamAdmin()) diff --git a/migrations/000165_asset_lifecycle.down.sql b/migrations/000165_asset_lifecycle.down.sql new file mode 100644 index 00000000..d565003e --- /dev/null +++ b/migrations/000165_asset_lifecycle.down.sql @@ -0,0 +1,26 @@ +-- Roll back the stale-detection lifecycle migration. Any assets +-- currently in the 'stale' status have to be demoted back to +-- 'active' before the CHECK can be tightened — otherwise the +-- constraint re-creation fails. + +BEGIN; + +-- Demote any stale assets to active. Preserves the row, loses the +-- stale marker. Deliberate: losing "was flagged stale" is fine on +-- rollback because the feature is off afterward. +UPDATE assets SET status = 'active' WHERE status = 'stale'; + +-- Drop columns. +ALTER TABLE assets DROP COLUMN IF EXISTS lifecycle_paused_until; +ALTER TABLE assets DROP COLUMN IF EXISTS manual_status_override; + +-- Restore the original status CHECK. +ALTER TABLE assets DROP CONSTRAINT IF EXISTS chk_assets_status; +ALTER TABLE assets ADD CONSTRAINT chk_assets_status + CHECK (status IN ('active', 'inactive', 'archived')); + +COMMIT; + +-- Drop the worker index. Outside the transaction to mirror the up +-- migration's CONCURRENTLY pattern. +DROP INDEX CONCURRENTLY IF EXISTS idx_assets_lifecycle_candidates; diff --git a/migrations/000165_asset_lifecycle.up.sql b/migrations/000165_asset_lifecycle.up.sql new file mode 100644 index 00000000..a457ff77 --- /dev/null +++ b/migrations/000165_asset_lifecycle.up.sql @@ -0,0 +1,62 @@ +-- Asset Lifecycle Management — stale detection tier. +-- +-- Adds the 'stale' status to assets.status, plus two new columns +-- driving the lifecycle worker's decisions: lifecycle_paused_until +-- (per-asset snooze) and manual_status_override (operator-takes- +-- control flag). A partial index on (tenant_id, status, last_seen_at) +-- keeps the worker's daily query fast even on 10M-row deployments. +-- +-- Backward-compatibility promise: an asset with both new columns +-- NULL / false is indistinguishable from the pre-migration asset, +-- so enabling this migration alone does NOT flip any asset to stale. +-- The worker additionally refuses to run unless the tenant opts in +-- via tenant.settings.asset_lifecycle.enabled. + +BEGIN; + +-- 1. Extend the status CHECK to allow the new 'stale' value. The +-- previous constraint enumerated the three older states; we drop +-- and recreate since Postgres cannot ALTER a CHECK in place. +ALTER TABLE assets DROP CONSTRAINT IF EXISTS chk_assets_status; +ALTER TABLE assets ADD CONSTRAINT chk_assets_status + CHECK (status IN ('active', 'stale', 'inactive', 'archived')); + +-- 2. Per-asset lifecycle pause. NULL = no pause; the worker treats +-- this column as "if >= NOW(), skip me". Stored as TIMESTAMPTZ +-- so clock-skew between cron and DB row cannot produce stale +-- logic. +ALTER TABLE assets ADD COLUMN IF NOT EXISTS lifecycle_paused_until TIMESTAMPTZ; + +-- 3. Manual status override — when true, the worker is not allowed +-- to write to this row's status. Operator-owned. +ALTER TABLE assets ADD COLUMN IF NOT EXISTS manual_status_override BOOLEAN NOT NULL DEFAULT FALSE; + +-- 4. Index the hot query. Worker selects: +-- WHERE tenant_id = ? AND status IN ('active', 'stale') +-- AND manual_status_override = false +-- AND (lifecycle_paused_until IS NULL OR lifecycle_paused_until < NOW()) +-- AND last_seen_at < NOW() - INTERVAL 'N days' +-- The partial index covers the first two clauses which have high +-- selectivity; the time comparison is a cheap seq-filter on a +-- small candidate set. +-- +-- CONCURRENTLY so a large assets table does not block writes +-- during migration. The IF NOT EXISTS lets re-running the +-- migration (or a pre-existing index in some environments) pass +-- cleanly. +COMMIT; + +-- 5. Index creation must live outside the transaction because +-- CREATE INDEX CONCURRENTLY refuses to run inside one. The +-- migration framework will stop here and run the next block in +-- its own transaction. +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_assets_lifecycle_candidates + ON assets (tenant_id, status, last_seen_at) + WHERE status IN ('active', 'stale') AND manual_status_override = false; + +-- 6. Column comments — help operators greping the schema understand +-- what they do without hunting through code. +COMMENT ON COLUMN assets.lifecycle_paused_until IS + 'If NOW() < this value, the lifecycle worker skips this asset (operator snooze).'; +COMMENT ON COLUMN assets.manual_status_override IS + 'When TRUE, the lifecycle worker never writes to assets.status for this row.'; diff --git a/pkg/domain/asset/entity.go b/pkg/domain/asset/entity.go index c92c2d9d..f6fa8d0a 100644 --- a/pkg/domain/asset/entity.go +++ b/pkg/domain/asset/entity.go @@ -34,8 +34,8 @@ type Asset struct { findingCount int findingSeverityCounts *FindingSeverityCounts description string - tags []string - properties map[string]any // All asset properties (merged metadata + properties) + tags []string + properties map[string]any // All asset properties (merged metadata + properties) // External provider info provider Provider @@ -68,6 +68,20 @@ type Asset struct { lastSeen time.Time createdAt time.Time updatedAt time.Time + + // lifecyclePausedUntil freezes the background lifecycle worker + // from transitioning this asset's status until the given time. + // Set by operator action (Snooze) or auto-set after a manual + // reactivation to avoid the flap where the worker instantly + // re-flags a reactivated asset. + // + // manualStatusOverride = true means an operator has taken + // explicit control of the status; the lifecycle worker never + // writes to status on this asset. Used when ops wants to keep an + // asset marked active for a reason the automation cannot know + // (e.g. "known to be offline for rack migration this month"). + lifecyclePausedUntil *time.Time + manualStatusOverride bool } // NewAsset creates a new Asset entity. @@ -95,8 +109,8 @@ func NewAsset(name string, assetType AssetType, criticality Criticality) (*Asset exposure: ExposureUnknown, riskScore: 0, findingCount: 0, - tags: make([]string, 0), - properties: make(map[string]any), + tags: make([]string, 0), + properties: make(map[string]any), syncStatus: SyncStatusSynced, firstSeen: now, lastSeen: now, @@ -177,8 +191,8 @@ func Reconstitute( riskScore: riskScore, findingCount: findingCount, description: description, - tags: tags, - properties: properties, + tags: tags, + properties: properties, provider: provider, externalID: externalID, classification: classification, @@ -193,8 +207,8 @@ func Reconstitute( dataClassification: dataClassification, piiDataExposed: piiDataExposed, phiDataExposed: phiDataExposed, - regulatoryOwnerID: regulatoryOwnerID, - isInternetAccessible: isInternetAccessible, + regulatoryOwnerID: regulatoryOwnerID, + isInternetAccessible: isInternetAccessible, exposureChangedAt: exposureChangedAt, lastExposureLevel: lastExposureLevel, // Timestamps @@ -435,12 +449,97 @@ func (a *Asset) DecrementFindingCount() { } } -// MarkSeen updates the last seen timestamp. +// MarkSeen updates the last-seen timestamp. When the asset had been +// demoted to stale or inactive by the lifecycle worker, MarkSeen also +// transitions it back to active — unless the operator has taken +// manual control of the status (manualStatusOverride=true) or the +// asset has been archived (archived is a manual terminal state). +// +// The grace pause (lifecyclePausedUntil) is cleared on reactivation: +// a fresh sighting is the strongest signal the asset is really here, +// so whatever snooze the operator set becomes moot. +// +// Always uses server-side time — callers cannot spoof last-seen by +// sending a future timestamp. This defends against clock-skewed +// agents that would otherwise make an asset "never stale". func (a *Asset) MarkSeen() { - a.lastSeen = time.Now().UTC() + now := time.Now().UTC() + a.lastSeen = now + a.updatedAt = now + + if a.manualStatusOverride || a.status == StatusArchived { + return + } + if a.status == StatusStale || a.status == StatusInactive { + a.status = StatusActive + a.lifecyclePausedUntil = nil + } +} + +// MarkStale is called by the lifecycle worker to flag an asset that +// has not been re-observed within the tenant's threshold. Safe to +// call repeatedly — returns false if the status did not change. +func (a *Asset) MarkStale() bool { + if a.manualStatusOverride { + return false + } + if a.status != StatusActive { + return false + } + a.status = StatusStale + a.updatedAt = time.Now().UTC() + return true +} + +// SnoozeLifecycle pauses lifecycle transitions for the given +// duration. Use case: operator manually reactivated a false-stale +// asset and wants a breathing room so the next worker run does not +// immediately re-demote it. Duration <= 0 clears any existing snooze. +func (a *Asset) SnoozeLifecycle(d time.Duration) { + if d <= 0 { + a.lifecyclePausedUntil = nil + } else { + pausedUntil := time.Now().UTC().Add(d) + a.lifecyclePausedUntil = &pausedUntil + } a.updatedAt = time.Now().UTC() } +// IsLifecyclePaused reports whether the asset is currently protected +// from worker transitions by an operator snooze. +func (a *Asset) IsLifecyclePaused(now time.Time) bool { + return a.lifecyclePausedUntil != nil && a.lifecyclePausedUntil.After(now) +} + +// LifecyclePausedUntil exposes the snooze expiry for persistence and +// UI display. Returns nil if no snooze is active. +func (a *Asset) LifecyclePausedUntil() *time.Time { + return a.lifecyclePausedUntil +} + +// SetManualStatusOverride toggles whether the lifecycle worker is +// allowed to change this asset's status. true means "operator +// controls status, worker hands off". +func (a *Asset) SetManualStatusOverride(v bool) { + a.manualStatusOverride = v + a.updatedAt = time.Now().UTC() +} + +// ManualStatusOverride reports whether the worker should skip this +// asset when evaluating lifecycle transitions. +func (a *Asset) ManualStatusOverride() bool { + return a.manualStatusOverride +} + +// RestoreLifecycleState is used by the repository when reconstituting +// an Asset from storage. Added as a setter instead of new parameters +// on the already-long Reconstitute signature so infra can populate +// the pause + override fields without churning every caller. +func (a *Asset) RestoreLifecycleState(pausedUntil *time.Time, manualOverride bool) { + a.lifecyclePausedUntil = pausedUntil + a.manualStatusOverride = manualOverride +} + // SetTenantID sets the tenant ID. func (a *Asset) SetTenantID(tenantID shared.ID) { a.tenantID = tenantID diff --git a/pkg/domain/asset/lifecycle_test.go b/pkg/domain/asset/lifecycle_test.go new file mode 100644 index 00000000..73cda9bd --- /dev/null +++ b/pkg/domain/asset/lifecycle_test.go @@ -0,0 +1,143 @@ +package asset + +import ( + "testing" + "time" +) + +func TestAsset_MarkSeen_AutoReactivatesFromStale(t *testing.T) { + // The E4.4 race insurance: a fresh MarkSeen() must never leave + // an asset stuck in stale. If the worker and ingest race on the + // same asset, the "last writer" reverts stale to active. + a, err := NewAsset("example.com", AssetTypeDomain, CriticalityMedium) + if err != nil { + t.Fatal(err) + } + a.MarkStale() + if a.Status() != StatusStale { + t.Fatalf("setup failed, status = %v", a.Status()) + } + + a.MarkSeen() + if a.Status() != StatusActive { + t.Errorf("expected auto-reactivate to Active, got %v", a.Status()) + } + if a.LifecyclePausedUntil() != nil { + t.Errorf("reactivation must clear any snooze, got %v", a.LifecyclePausedUntil()) + } +} + +func TestAsset_MarkSeen_AutoReactivatesFromInactive(t *testing.T) { + a, err := NewAsset("example.com", AssetTypeDomain, CriticalityMedium) + if err != nil { + t.Fatal(err) + } + a.Deactivate() + a.MarkSeen() + if a.Status() != StatusActive { + t.Errorf("expected Active after MarkSeen, got %v", a.Status()) + } +} + +func TestAsset_MarkSeen_RespectsManualOverride(t *testing.T) { + // Operator took manual control → worker and ingest reactivation + // both defer to the operator's chosen status. + a, err := NewAsset("example.com", AssetTypeDomain, CriticalityMedium) + if err != nil { + t.Fatal(err) + } + a.MarkStale() + a.SetManualStatusOverride(true) + a.MarkSeen() + if a.Status() != StatusStale { + t.Errorf("manual override bypassed, status = %v", a.Status()) + } +} + +func TestAsset_MarkSeen_NeverReactivatesArchived(t *testing.T) { + // Archived is a terminal state — only a manual Activate() call + // can un-archive. A scanner re-seeing an archived asset should + // not silently bring it back into active rotation. + a, err := NewAsset("example.com", AssetTypeDomain, CriticalityMedium) + if err != nil { + t.Fatal(err) + } + a.Archive() + a.MarkSeen() + if a.Status() != StatusArchived { + t.Errorf("archived must not auto-reactivate, status = %v", a.Status()) + } +} + +func TestAsset_MarkStale_OnlyTransitionsFromActive(t *testing.T) { + a, err := NewAsset("example.com", AssetTypeDomain, CriticalityMedium) + if err != nil { + t.Fatal(err) + } + if !a.MarkStale() { + t.Fatal("expected MarkStale to transition from Active") + } + // Calling again is a no-op — idempotent. + if a.MarkStale() { + t.Error("second MarkStale should return false") + } +} + +func TestAsset_MarkStale_RefusesManualOverride(t *testing.T) { + a, err := NewAsset("example.com", AssetTypeDomain, CriticalityMedium) + if err != nil { + t.Fatal(err) + } + a.SetManualStatusOverride(true) + if a.MarkStale() { + t.Error("MarkStale must refuse when manual override is true") + } + if a.Status() != StatusActive { + t.Errorf("status should stay Active, got %v", a.Status()) + } +} + +func TestAsset_SnoozeLifecycle(t *testing.T) { + a, err := NewAsset("example.com", AssetTypeDomain, CriticalityMedium) + if err != nil { + t.Fatal(err) + } + + a.SnoozeLifecycle(30 * 24 * time.Hour) + if p := a.LifecyclePausedUntil(); p == nil || !p.After(time.Now()) { + t.Errorf("expected paused_until in future, got %v", p) + } + if !a.IsLifecyclePaused(time.Now()) { + t.Error("IsLifecyclePaused should be true") + } + if a.IsLifecyclePaused(time.Now().Add(60 * 24 * time.Hour)) { + t.Error("IsLifecyclePaused should be false 60 days from now") + } + + // Zero/negative duration clears the snooze. + a.SnoozeLifecycle(0) + if a.LifecyclePausedUntil() != nil { + t.Errorf("duration 0 should clear snooze, got %v", a.LifecyclePausedUntil()) + } +} + +func TestAsset_MarkSeen_UsesServerTime(t *testing.T) { + // Clock-skew defense: MarkSeen must always set last_seen to + // server time, never trust an externally-supplied timestamp. + // Callers that bypass this would be able to keep an asset + // "forever fresh" by sending MarkSeen with future timestamps. + a, err := NewAsset("example.com", AssetTypeDomain, CriticalityMedium) + if err != nil { + t.Fatal(err) + } + before := time.Now().UTC() + a.MarkSeen() + after := time.Now().UTC() + + // Asset's lastSeen must be between before and after (plus a + // small tolerance for comparison granularity). + if a.LastSeen().Before(before) || a.LastSeen().After(after.Add(time.Second)) { + t.Errorf("last_seen not in expected window: got %v, range [%v, %v]", + a.LastSeen(), before, after) + } +} diff --git a/pkg/domain/asset/value_objects.go b/pkg/domain/asset/value_objects.go index d612f344..79571287 100644 --- a/pkg/domain/asset/value_objects.go +++ b/pkg/domain/asset/value_objects.go @@ -370,7 +370,14 @@ func ParseCriticality(s string) (Criticality, error) { type Status string const ( - StatusActive Status = "active" + StatusActive Status = "active" + // StatusStale marks an asset that has not been re-observed by any + // scanner/integration within the tenant's lifecycle threshold but + // is not yet demoted to inactive. Operators see a UI warning and + // the asset still appears in default lists. Findings on a stale + // asset keep running their SLAs — the exposure may still exist in + // the real world. Set by the lifecycle background worker. + StatusStale Status = "stale" StatusInactive Status = "inactive" StatusArchived Status = "archived" ) @@ -379,6 +386,7 @@ const ( func AllStatuses() []Status { return []Status{ StatusActive, + StatusStale, StatusInactive, StatusArchived, } diff --git a/pkg/domain/audit/value_objects.go b/pkg/domain/audit/value_objects.go index 30c6adce..12850653 100644 --- a/pkg/domain/audit/value_objects.go +++ b/pkg/domain/audit/value_objects.go @@ -17,14 +17,24 @@ const ( ActionUserLogout Action = "user.logout" // Tenant actions - ActionTenantCreated Action = "tenant.created" - ActionTenantUpdated Action = "tenant.updated" - ActionTenantDeleted Action = "tenant.deleted" - ActionTenantSettingsUpdated Action = "tenant.settings_updated" - ActionTenantModulesUpdated Action = "tenant.modules_updated" - ActionTenantRiskScoringUpdated Action = "tenant.risk_scoring_updated" + ActionTenantCreated Action = "tenant.created" + ActionTenantUpdated Action = "tenant.updated" + ActionTenantDeleted Action = "tenant.deleted" + ActionTenantSettingsUpdated Action = "tenant.settings_updated" + ActionTenantModulesUpdated Action = "tenant.modules_updated" + ActionTenantRiskScoringUpdated Action = "tenant.risk_scoring_updated" ActionTenantRiskScoresRecalculated Action = "tenant.risk_scores_recalculated" - ActionTenantAssetSourceUpdated Action = "tenant.asset_source_updated" + ActionTenantAssetSourceUpdated Action = "tenant.asset_source_updated" + ActionTenantAssetLifecycleUpdated Action = "tenant.asset_lifecycle_updated" + + // Asset lifecycle transitions. Emitted per batch run (worker) + // rather than per asset so the audit log stays scannable. + // Metadata carries counts + bounded lists of affected asset IDs. + ActionAssetLifecycleRun Action = "asset.lifecycle_run" + ActionAssetMarkedStale Action = "asset.marked_stale" + ActionAssetReactivated Action = "asset.reactivated" + ActionAssetLifecycleSnoozed Action = "asset.lifecycle_snoozed" + ActionAssetLifecycleUnsnoozed Action = "asset.lifecycle_unsnoozed" // Membership actions ActionMemberAdded Action = "member.added" @@ -226,8 +236,8 @@ const ( ActionAITriageBulk Action = "ai_triage.bulk_requested" ActionAITriageRateLimit Action = "ai_triage.rate_limited" ActionAITriageTokenLimit Action = "ai_triage.token_limit_exceeded" - ActionAITriageNeedsReview Action = "ai_triage.needs_review" // validator flagged output - ActionAITriageBudgetExhausted Action = "ai_triage.budget_exhausted" // tenant hit monthly token ceiling + ActionAITriageNeedsReview Action = "ai_triage.needs_review" // validator flagged output + ActionAITriageBudgetExhausted Action = "ai_triage.budget_exhausted" // tenant hit monthly token ceiling ) // String returns the string representation of the action. @@ -243,6 +253,9 @@ func (a Action) IsValid() bool { ActionUserLogin, ActionUserLogout, ActionTenantCreated, ActionTenantUpdated, ActionTenantDeleted, ActionTenantSettingsUpdated, ActionTenantModulesUpdated, ActionTenantRiskScoringUpdated, ActionTenantRiskScoresRecalculated, ActionTenantAssetSourceUpdated, + ActionTenantAssetLifecycleUpdated, + ActionAssetLifecycleRun, ActionAssetMarkedStale, ActionAssetReactivated, + ActionAssetLifecycleSnoozed, ActionAssetLifecycleUnsnoozed, ActionMemberAdded, ActionMemberRemoved, ActionMemberRoleChanged, ActionMemberSuspended, ActionMemberReactivated, ActionInvitationCreated, ActionInvitationAccepted, ActionInvitationDeleted, ActionInvitationExpired, @@ -485,6 +498,9 @@ func SeverityForAction(a Action) Severity { case ActionUserCreated, ActionUserActivated, ActionTenantCreated, ActionTenantUpdated, ActionTenantModulesUpdated, ActionTenantRiskScoringUpdated, ActionTenantRiskScoresRecalculated, ActionTenantAssetSourceUpdated, + ActionTenantAssetLifecycleUpdated, + ActionAssetLifecycleRun, ActionAssetMarkedStale, ActionAssetReactivated, + ActionAssetLifecycleSnoozed, ActionAssetLifecycleUnsnoozed, ActionMemberAdded, ActionInvitationAccepted, ActionCampaignCreated, ActionCampaignUpdated, ActionCampaignStatusChanged, ActionCampaignMemberAdded, diff --git a/pkg/domain/tenant/asset_lifecycle_settings.go b/pkg/domain/tenant/asset_lifecycle_settings.go new file mode 100644 index 00000000..dacd5b18 --- /dev/null +++ b/pkg/domain/tenant/asset_lifecycle_settings.go @@ -0,0 +1,208 @@ +package tenant + +import ( + "fmt" + + "github.com/openctemio/api/pkg/domain/shared" +) + +// Bounds for lifecycle threshold configuration. Mins are there to +// prevent operator typos from devastating the fleet (setting +// StaleThresholdDays=1 would mark every asset stale within hours of +// the next scan cycle). Maxes are a sanity ceiling — beyond 365 days +// lifecycle management is effectively off and the operator should +// just disable the feature. +const ( + MinLifecycleThresholdDays = 3 + MaxLifecycleThresholdDays = 365 + MinGracePeriodDays = 0 + MaxGracePeriodDays = 90 + MinExcludedSourceTypes = 0 + MaxExcludedSourceTypes = 20 +) + +// AssetLifecycleSettings controls automated asset status transitions +// based on how recently each asset has been observed by a source. +// +// Backward compatibility: a zero-value struct (Enabled=false) +// disables the feature entirely — no worker run, no transitions, +// no UI badges. Tenants upgrading see zero behavior change until +// they explicitly opt in. +type AssetLifecycleSettings struct { + // Enabled toggles the feature. Defaults to false so tenants + // upgrading from older versions see no change. Enabling for the + // first time requires a successful dry-run (DryRunCompletedAt) + // or the force flag on the admin API. + Enabled bool `json:"enabled"` + + // StaleThresholdDays — days without a MarkSeen() update before + // the worker flips status from active to stale. Default 14. + StaleThresholdDays int `json:"stale_threshold_days,omitempty"` + + // GracePeriodDays — newly-discovered assets are immune from the + // lifecycle worker for this many days after `discovered_at`. + // Protects assets the scanner has not picked up yet. Default 3. + GracePeriodDays int `json:"grace_period_days,omitempty"` + + // ManualReactivationGraceDays — when an operator manually + // reactivates an asset that had been flagged stale/inactive, we + // auto-set lifecycle_paused_until = NOW + this many days. Avoids + // the E2.6 flap (worker re-demotes the same asset next day). + // Default 30. Operators can override per-asset via Snooze. + ManualReactivationGraceDays int `json:"manual_reactivation_grace_days,omitempty"` + + // ExcludedSourceTypes — source_type values that opt an asset + // out of the lifecycle worker. Default [manual, import]: + // user-entered data has intent behind it, the worker should + // never quietly demote a manually-curated asset. + ExcludedSourceTypes []string `json:"excluded_source_types,omitempty"` + + // PauseOnIntegrationFailure — when true (default), the worker + // checks the tenant's agents and integrations before each run. + // If any are unhealthy, the whole tenant is skipped so a + // temporarily-offline scanner does not generate a false + // deactivation storm. + PauseOnIntegrationFailure bool `json:"pause_on_integration_failure,omitempty"` + + // DryRunCompletedAt records the last time the tenant admin ran + // a dry-run that showed acceptable counts. Populated by the + // dry-run endpoint. A non-nil value unlocks the Enabled flag; + // unset means the API rejects Enable with "run a dry-run first". + DryRunCompletedAt *int64 `json:"dry_run_completed_at,omitempty"` +} + +// DefaultAssetLifecycleSettings returns the recommended defaults, +// used when a tenant has never configured lifecycle. +func DefaultAssetLifecycleSettings() AssetLifecycleSettings { + return AssetLifecycleSettings{ + Enabled: false, + StaleThresholdDays: 14, + GracePeriodDays: 3, + ManualReactivationGraceDays: 30, + ExcludedSourceTypes: []string{"manual", "import"}, + PauseOnIntegrationFailure: true, + } +} + +// EffectiveStaleThresholdDays returns the configured value, falling +// back to the default when zero. Separate from Validate() because +// stored settings may have been persisted before we added a new +// field and we want to merge with defaults transparently. +func (s AssetLifecycleSettings) EffectiveStaleThresholdDays() int { + if s.StaleThresholdDays <= 0 { + return DefaultAssetLifecycleSettings().StaleThresholdDays + } + return s.StaleThresholdDays +} + +// EffectiveGracePeriodDays — see EffectiveStaleThresholdDays. +func (s AssetLifecycleSettings) EffectiveGracePeriodDays() int { + if s.GracePeriodDays < 0 { + return DefaultAssetLifecycleSettings().GracePeriodDays + } + if s.GracePeriodDays == 0 { + // Explicit zero is allowed for operators who want no grace + // period — only the < 0 (uninitialized) case falls back. + return 0 + } + return s.GracePeriodDays +} + +// EffectiveManualReactivationGraceDays — see above. +func (s AssetLifecycleSettings) EffectiveManualReactivationGraceDays() int { + if s.ManualReactivationGraceDays <= 0 { + return DefaultAssetLifecycleSettings().ManualReactivationGraceDays + } + return s.ManualReactivationGraceDays +} + +// EffectiveExcludedSourceTypes returns the configured slice or the +// defaults when the field is nil/empty. Nil check is required so an +// upgraded tenant who never set the field gets the protective +// default instead of "exclude nothing". +func (s AssetLifecycleSettings) EffectiveExcludedSourceTypes() []string { + if len(s.ExcludedSourceTypes) == 0 { + return DefaultAssetLifecycleSettings().ExcludedSourceTypes + } + return s.ExcludedSourceTypes +} + +// Validate enforces structural + range constraints. Called both from +// Settings.Validate (when the whole tenant settings blob is saved) +// and directly from the dedicated PUT /settings/asset-lifecycle +// endpoint. +func (s *AssetLifecycleSettings) Validate() error { + if s.StaleThresholdDays != 0 { + if s.StaleThresholdDays < MinLifecycleThresholdDays || s.StaleThresholdDays > MaxLifecycleThresholdDays { + return fmt.Errorf( + "%w: stale_threshold_days must be between %d and %d", + shared.ErrValidation, MinLifecycleThresholdDays, MaxLifecycleThresholdDays, + ) + } + } + if s.GracePeriodDays < MinGracePeriodDays || s.GracePeriodDays > MaxGracePeriodDays { + return fmt.Errorf( + "%w: grace_period_days must be between %d and %d", + shared.ErrValidation, MinGracePeriodDays, MaxGracePeriodDays, + ) + } + if s.ManualReactivationGraceDays != 0 { + if s.ManualReactivationGraceDays < MinLifecycleThresholdDays || s.ManualReactivationGraceDays > MaxLifecycleThresholdDays { + return fmt.Errorf( + "%w: manual_reactivation_grace_days must be between %d and %d", + shared.ErrValidation, MinLifecycleThresholdDays, MaxLifecycleThresholdDays, + ) + } + } + if len(s.ExcludedSourceTypes) > MaxExcludedSourceTypes { + return fmt.Errorf( + "%w: excluded_source_types exceeds the maximum of %d entries", + shared.ErrValidation, MaxExcludedSourceTypes, + ) + } + // Reject obvious typos in the exclusion list by matching against + // the canonical SourceType values from the datasource package. + // Kept decoupled here (string compare) to avoid a circular import + // between tenant and datasource — the valid set is small and + // stable so the duplication is acceptable. + validSourceTypes := map[string]struct{}{ + "integration": {}, + "collector": {}, + "scanner": {}, + "manual": {}, + "import": {}, + } + seen := make(map[string]struct{}, len(s.ExcludedSourceTypes)) + for _, st := range s.ExcludedSourceTypes { + if st == "" { + return fmt.Errorf( + "%w: excluded_source_types must not contain an empty string", + shared.ErrValidation, + ) + } + if _, ok := validSourceTypes[st]; !ok { + return fmt.Errorf( + "%w: excluded_source_types contains an unknown source type", + shared.ErrValidation, + ) + } + if _, dup := seen[st]; dup { + return fmt.Errorf( + "%w: excluded_source_types contains a duplicate entry", + shared.ErrValidation, + ) + } + seen[st] = struct{}{} + } + + // First-enable guard: if Enabled=true but no dry-run has ever + // completed, reject. The admin API bypasses this when it calls + // Validate() after a successful dry-run and stamps the timestamp. + if s.Enabled && s.DryRunCompletedAt == nil { + return fmt.Errorf( + "%w: lifecycle cannot be enabled without a successful dry-run first", + shared.ErrValidation, + ) + } + return nil +} diff --git a/pkg/domain/tenant/asset_lifecycle_settings_test.go b/pkg/domain/tenant/asset_lifecycle_settings_test.go new file mode 100644 index 00000000..6f83fa72 --- /dev/null +++ b/pkg/domain/tenant/asset_lifecycle_settings_test.go @@ -0,0 +1,158 @@ +package tenant + +import ( + "errors" + "strings" + "testing" + "time" + + "github.com/openctemio/api/pkg/domain/shared" +) + +func TestAssetLifecycleSettings_Defaults(t *testing.T) { + d := DefaultAssetLifecycleSettings() + // Pin values — if anyone changes these they have to update the + // test, forcing the decision to be conscious. Changing a default + // silently changes behavior for every tenant on next upgrade. + if d.Enabled { + t.Error("default must be disabled (backward-compat)") + } + if d.StaleThresholdDays != 14 { + t.Errorf("default StaleThresholdDays = %d, want 14", d.StaleThresholdDays) + } + if d.GracePeriodDays != 3 { + t.Errorf("default GracePeriodDays = %d, want 3", d.GracePeriodDays) + } + if d.ManualReactivationGraceDays != 30 { + t.Errorf("default ManualReactivationGraceDays = %d, want 30", d.ManualReactivationGraceDays) + } + if !d.PauseOnIntegrationFailure { + t.Error("default PauseOnIntegrationFailure must be true — safer default") + } + if len(d.ExcludedSourceTypes) != 2 { + t.Errorf("default excluded types should be [manual, import], got %v", d.ExcludedSourceTypes) + } +} + +func TestAssetLifecycleSettings_Effective_ZeroFallsBackToDefault(t *testing.T) { + // Legacy rows may have been persisted with zero-valued fields + // before the defaults existed. Effective*() must paper over this + // so callers never see a nonsensical 0. + var s AssetLifecycleSettings + if got := s.EffectiveStaleThresholdDays(); got != 14 { + t.Errorf("effective = %d, want 14", got) + } + if got := s.EffectiveManualReactivationGraceDays(); got != 30 { + t.Errorf("effective = %d, want 30", got) + } + if got := s.EffectiveExcludedSourceTypes(); len(got) != 2 { + t.Errorf("effective = %v, want default pair", got) + } +} + +func TestAssetLifecycleSettings_Effective_ExplicitZeroGraceRespected(t *testing.T) { + // "Grace 0" is a legitimate configuration (operator wants no + // grace period) and must NOT be overridden by the default. Only + // the < 0 "uninitialized" case falls back. + s := AssetLifecycleSettings{GracePeriodDays: 0} + if got := s.EffectiveGracePeriodDays(); got != 0 { + t.Errorf("explicit zero grace overridden to %d", got) + } +} + +func TestAssetLifecycleSettings_Validate_Ranges(t *testing.T) { + cases := []struct { + name string + s AssetLifecycleSettings + wantErr bool + }{ + {"zero is valid", AssetLifecycleSettings{}, false}, + {"threshold below min", AssetLifecycleSettings{StaleThresholdDays: 1}, true}, + {"threshold at min", AssetLifecycleSettings{StaleThresholdDays: 3}, false}, + {"threshold above max", AssetLifecycleSettings{StaleThresholdDays: 400}, true}, + {"threshold at max", AssetLifecycleSettings{StaleThresholdDays: 365}, false}, + {"grace negative", AssetLifecycleSettings{GracePeriodDays: -1}, true}, + {"grace above max", AssetLifecycleSettings{GracePeriodDays: 91}, true}, + {"grace at max", AssetLifecycleSettings{GracePeriodDays: 90}, false}, + {"reactivation grace below min", AssetLifecycleSettings{ManualReactivationGraceDays: 1}, true}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + err := tc.s.Validate() + if tc.wantErr && err == nil { + t.Fatal("expected validation error") + } + if !tc.wantErr && err != nil { + t.Fatalf("unexpected error: %v", err) + } + if err != nil && !errors.Is(err, shared.ErrValidation) { + t.Errorf("expected ErrValidation wrap, got %v", err) + } + }) + } +} + +func TestAssetLifecycleSettings_Validate_ExcludedSourceTypes(t *testing.T) { + cases := []struct { + name string + types []string + wantErr bool + }{ + {"nil is OK", nil, false}, + {"defaults OK", []string{"manual", "import"}, false}, + {"empty string rejected", []string{""}, true}, + {"unknown source type rejected", []string{"nessus"}, true}, + {"duplicate rejected", []string{"manual", "manual"}, true}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + s := AssetLifecycleSettings{ExcludedSourceTypes: tc.types} + err := s.Validate() + if tc.wantErr && err == nil { + t.Fatal("expected validation error") + } + if !tc.wantErr && err != nil { + t.Fatalf("unexpected error: %v", err) + } + }) + } +} + +func TestAssetLifecycleSettings_Validate_EnableRequiresDryRun(t *testing.T) { + // The first-enable gate: admin cannot flip Enabled=true unless + // they have completed a dry-run. Prevents the pathological + // "enable on 2-year-old tenant → 1M assets go stale overnight" + // scenario. Service layer stamps DryRunCompletedAt after a + // successful dry-run. + s := AssetLifecycleSettings{Enabled: true} + if err := s.Validate(); err == nil { + t.Fatal("enable without DryRunCompletedAt must fail") + } else if !strings.Contains(err.Error(), "dry-run") { + t.Errorf("error should mention dry-run: %v", err) + } + + ts := time.Now().Unix() + s.DryRunCompletedAt = &ts + if err := s.Validate(); err != nil { + t.Errorf("enable with DryRunCompletedAt should pass, got %v", err) + } +} + +func TestAssetLifecycleSettings_Validate_ErrorsAreErrValidation(t *testing.T) { + // Every failure mode must wrap shared.ErrValidation so HTTP + // handlers can classify as 400 without string-matching the + // message. + cases := []AssetLifecycleSettings{ + {StaleThresholdDays: 1}, + {GracePeriodDays: 200}, + {ExcludedSourceTypes: []string{"unknown-type"}}, + {Enabled: true}, // missing dry-run + } + for i, s := range cases { + if err := s.Validate(); err == nil { + t.Errorf("case %d: expected error", i) + } else if !errors.Is(err, shared.ErrValidation) { + t.Errorf("case %d: not ErrValidation: %v", i, err) + } + } +} diff --git a/pkg/domain/tenant/settings.go b/pkg/domain/tenant/settings.go index d2d1fab4..482b0b34 100644 --- a/pkg/domain/tenant/settings.go +++ b/pkg/domain/tenant/settings.go @@ -18,16 +18,17 @@ import ( // Settings represents the typed settings for a tenant. type Settings struct { - General GeneralSettings `json:"general"` - Security SecuritySettings `json:"security"` - API APISettings `json:"api"` - Branding BrandingSettings `json:"branding"` - Branch BranchSettings `json:"branch"` - AI AISettings `json:"ai"` - RiskScoring RiskScoringSettings `json:"risk_scoring"` - Pentest PentestSettings `json:"pentest"` - AssetIdentity AssetIdentitySettings `json:"asset_identity"` - AssetSource AssetSourceSettings `json:"asset_source"` + General GeneralSettings `json:"general"` + Security SecuritySettings `json:"security"` + API APISettings `json:"api"` + Branding BrandingSettings `json:"branding"` + Branch BranchSettings `json:"branch"` + AI AISettings `json:"ai"` + RiskScoring RiskScoringSettings `json:"risk_scoring"` + Pentest PentestSettings `json:"pentest"` + AssetIdentity AssetIdentitySettings `json:"asset_identity"` + AssetSource AssetSourceSettings `json:"asset_source"` + AssetLifecycle AssetLifecycleSettings `json:"asset_lifecycle"` } // AssetIdentitySettings controls asset dedup behavior per tenant. @@ -824,6 +825,9 @@ func (s *Settings) Validate() error { if err := s.AssetSource.Validate(); err != nil { return fmt.Errorf("asset_source settings: %w", err) } + if err := s.AssetLifecycle.Validate(); err != nil { + return fmt.Errorf("asset_lifecycle settings: %w", err) + } return nil } @@ -1236,3 +1240,17 @@ func (t *Tenant) UpdateAssetSourceSettings(as AssetSourceSettings) error { settings.AssetSource = as return t.UpdateSettings(settings) } + +// UpdateAssetLifecycleSettings updates only the asset-lifecycle +// settings. The DryRunCompletedAt-before-Enabled guard lives in +// AssetLifecycleSettings.Validate; the service layer is responsible +// for stamping that timestamp after a successful dry-run, not this +// entity method. +func (t *Tenant) UpdateAssetLifecycleSettings(al AssetLifecycleSettings) error { + if err := al.Validate(); err != nil { + return err + } + settings := t.TypedSettings() + settings.AssetLifecycle = al + return t.UpdateSettings(settings) +}