From 4697a821d5db02dcfac28a7ef8658af35835db3b Mon Sep 17 00:00:00 2001 From: Mark Gordon Date: Thu, 7 May 2026 00:12:16 -0700 Subject: [PATCH] infer_fks and include_dependencies fix --- CHANGELOG.md | 3 +++ subsetter.example.yaml | 10 +++++++--- subsetter/config_model.py | 7 +++++-- subsetter/metadata.py | 23 +++++++++++++++++++---- subsetter/planner.py | 8 ++++++-- 5 files changed, 40 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 95dd19c..e5f8cdf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # v0.4.5 +- Change `infer_foreign_keys` option to `infer_fks` to match other option names (old name still works) +- Added `infer_fks_ignore_tables` to allow skipping inferring fks to some tables +- Fix error when setting `include_dependencies` to false - Improved support for sampling rows when they have null fks - Added support for polymorphic foreign keys - Removed Python 3.8, 3.9 support and added 3.13, 3.14 support diff --git a/subsetter.example.yaml b/subsetter.example.yaml index 76552ca..58fa5d3 100644 --- a/subsetter.example.yaml +++ b/subsetter.example.yaml @@ -116,12 +116,16 @@ planner: # keys. It does this by inferring that a column name that matches the name # of a primary key column should function as a foreign key to that table. # If set to 'schema' will limit matches to tables within the same schema. - infer_foreign_keys: none # can be 'none', 'schema', or 'all' + infer_fks: none # can be 'none', 'schema', or 'all' + + # Do not attempt to infer foreign keys to these tables. + infer_fks_ignore_tables: + - db2.gizmos-foo # By default the subsetter will automatically pull in tables referenced by # tables already being selected to ensure their dependent rows can be pulled - # in with sampling. If this behavior is not desired you can set this value to - # false. + # in with sampling. Generally this behavior is desired otherwise foreign key + # constraints are likely to be violated at sample time. 
include_dependencies: true # Optional sampler config. Will write sample output to a directory named diff --git a/subsetter/config_model.py b/subsetter/config_model.py index 767dc85..32aeef4 100644 --- a/subsetter/config_model.py +++ b/subsetter/config_model.py @@ -1,6 +1,6 @@ from typing import Dict, List, Literal, Optional, Union -from pydantic import BaseModel, ConfigDict, Field, model_validator +from pydantic import AliasChoices, BaseModel, ConfigDict, Field, model_validator from typing_extensions import Annotated from subsetter.common import DatabaseConfig, SQLKnownOperator, SQLLiteralType @@ -82,7 +82,10 @@ class ColumnConstraint(ForbidBaseModel): ignore_fks: List[IgnoreFKConfig] = [] extra_fks: List[ExtraFKConfig] = [] polymorphic_fks: List[PolymorphicFKConfig] = [] - infer_foreign_keys: Literal["none", "schema", "all"] = "none" + infer_fks: Literal["none", "schema", "all"] = Field( + "none", validation_alias=AliasChoices("infer_fks", "infer_foreign_keys") + ) + infer_fks_ignore_tables: List[str] = [] include_dependencies: bool = True diff --git a/subsetter/metadata.py b/subsetter/metadata.py index b471e7b..56cc86f 100644 --- a/subsetter/metadata.py +++ b/subsetter/metadata.py @@ -2,7 +2,7 @@ import dataclasses import logging from fnmatch import fnmatch -from typing import Dict, List, Optional, Set, Tuple +from typing import Dict, Iterable, List, Optional, Set, Tuple import sqlalchemy as sa @@ -40,6 +40,8 @@ class TableMetadata: def __init__( self, table_obj: sa.Table, + *, + table_set: Optional[Set[Tuple[str, str]]] = None, ) -> None: assert table_obj.schema is not None self.table_obj = table_obj @@ -49,7 +51,10 @@ def __init__( column.name for column in table_obj.primary_key.columns ) self.foreign_keys = [ - ForeignKey.from_schema(fk) for fk in table_obj.foreign_key_constraints + ForeignKey.from_schema(fk) + for fk in table_obj.foreign_key_constraints + if table_set is None + or (fk.referred_table.schema, fk.referred_table.name) in table_set ]
self.rev_foreign_keys: List[ForeignKey] = [] @@ -123,7 +128,8 @@ def from_engine( metadata_obj, { (schema, table): TableMetadata( - metadata_obj.tables[f"{schema}.{table}"] + metadata_obj.tables[f"{schema}.{table}"], + table_set=table_set, ) for schema, table in table_queue }, @@ -137,16 +143,25 @@ def track_new_table(self, table_obj: sa.Table) -> None: raise ValueError("Table schema must be set") self.tables[(table_obj.schema, table_obj.name)] = TableMetadata(table_obj) - def infer_missing_foreign_keys(self, *, infer_all: bool = False) -> None: + def infer_missing_foreign_keys( + self, + *, + infer_all: bool = False, + ignore_tables: Iterable[Tuple[str, str]] = (), + ) -> None: def _key_pk(schema: str, pk: Tuple[str, ...]): if infer_all: return pk return (schema, pk) + ignore_tables_st = set(ignore_tables) + pk_map: Dict[Tuple[str, Tuple[str, ...]], Optional[TableMetadata]] = {} for table in self.tables.values(): if not table.primary_key: continue + if (table.schema, table.name) in ignore_tables_st: + continue map_key = _key_pk(table.schema, table.primary_key) if map_key in pk_map: diff --git a/subsetter/planner.py b/subsetter/planner.py index 40dbfd6..f752f00 100644 --- a/subsetter/planner.py +++ b/subsetter/planner.py @@ -63,9 +63,13 @@ def plan(self) -> SubsetPlan: return self._plan_internal() def _plan_internal(self) -> SubsetPlan: - if self.config.infer_foreign_keys != "none": + if self.config.infer_fks != "none": self.meta.infer_missing_foreign_keys( - infer_all=self.config.infer_foreign_keys == "all" + infer_all=self.config.infer_fks == "all", + ignore_tables=( + parse_table_name(table) + for table in self.config.infer_fks_ignore_tables + ), ) self._remove_ignore_fks() self._add_extra_fks()