From 076c011704eacb3f6a717c85e3174961d8a2298d Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Tue, 24 Oct 2023 11:47:58 +0100 Subject: [PATCH 1/9] find blocking rules --- ..._with_comparison_counts_below_threshold.py | 225 ++++++++++++++++++ splink/linker.py | 23 ++ 2 files changed, 248 insertions(+) create mode 100644 splink/find_brs_with_comparison_counts_below_threshold.py diff --git a/splink/find_brs_with_comparison_counts_below_threshold.py b/splink/find_brs_with_comparison_counts_below_threshold.py new file mode 100644 index 0000000000..323dc00b14 --- /dev/null +++ b/splink/find_brs_with_comparison_counts_below_threshold.py @@ -0,0 +1,225 @@ +import logging +import string +from typing import TYPE_CHECKING, Dict, List, Set + +import pandas as pd + +from .input_column import InputColumn + +if TYPE_CHECKING: + from .linker import Linker +logger = logging.getLogger(__name__) + + +def sanitise_column_name(column_name): + allowed_chars = string.ascii_letters + string.digits + "_" + sanitized_name = "".join(c for c in column_name if c in allowed_chars) + return sanitized_name + + +def _generate_output_combinations_table_row( + blocking_columns, splink_blocking_rule, comparison_count, all_columns +): + row = {} + + blocking_columns = [sanitise_column_name(c) for c in blocking_columns] + all_columns = [sanitise_column_name(c) for c in all_columns] + + row["blocking_columns"] = blocking_columns + row["splink_blocking_rule"] = splink_blocking_rule + row["comparison_count"] = comparison_count + row["complexity"] = len(blocking_columns) + + for col in all_columns: + row[f"__fixed__{col}"] = 1 if col in blocking_columns else 0 + + return row + + +def _generate_combinations( + all_columns, current_combination, already_visited: Set[frozenset] +): + """Generate combinations of columns to visit that haven't been visited already + irrespective of order + """ + + combinations = [] + for col in all_columns: + if col not in current_combination: + next_combination = current_combination + [col] + if frozenset(next_combination) not in already_visited: + combinations.append(next_combination) + + return combinations + + +def _generate_blocking_rule(linker: "Linker", cols_as_string): + """Generate a blocking rule given a list of column names as string""" + + dialect = linker._sql_dialect + + module_mapping = { + "presto": "splink.athena.blocking_rule_library", + "duckdb": "splink.duckdb.blocking_rule_library", + "postgres": "splink.postgres.blocking_rule_library", + "spark": "splink.spark.blocking_rule_library", + "sqlite": "splink.sqlite.blocking_rule_library", + } + + if dialect not in module_mapping: + raise ValueError(f"Unsupported SQL dialect: {dialect}") + + module_name = module_mapping[dialect] + block_on_module = __import__(module_name, fromlist=["block_on"]) + block_on = block_on_module.block_on + + if len(cols_as_string) == 0: + return "1 = 1" + + br = block_on(cols_as_string) + + return br + + +def _search_tree_for_blocking_rules_below_threshold_count( + linker: "Linker", + all_columns: List[str], + threshold: float, + current_combination: List[str] = None, + already_visited: Set[frozenset] = None, + results: List[Dict[str, str]] = None, +) -> List[Dict[str, str]]: + """ + Recursively search combinations of fields to find ones that result in a count less + than the threshold. + + Uses the new, fast counting function + linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions + to count + + The tree looks like this, where c1 c2 are columns: + c1 count_comparisons(c1) + ├── c2 count_comparisons(c1, c2) + │ └── c3 count_comparisons(c1, c2, c3) + ├── c3 count_comparisons(c1, c3) + │ └── c2 count_comparisons(c1, c3, c2) + c2 count_comparisons(c2) + ├── c1 count_comparisons(c2, c1) + │ └── c3 count_comparisons(c2, c1, c3) + ├── c3 count_comparisons(c2, c3) + │ └── c1 count_comparisons(c2, c3, c1) + + Once the count is below the threshold, no branches from the node are explored. + + When a count is below the threshold, create a dictionary with the relevant stats + like : + { + 'blocking_columns':['first_name'], + 'splink_blocking_rule':', + comparison_count':4827, + 'complexity':1, + '__fixed__first_name':1, + '__fixed__surname':0, + '__fixed__dob':0, + '__fixed__city':0, + '__fixed__email':0, + '__fixed__cluster':0, + } + + Return a list of these dicts. + + + Args: + linker: splink.Linker + fields (List[str]): List of fields to combine. + threshold (float): The count threshold. + current_combination (List[str], optional): Current combination of fields. + already_visited (Set[frozenset], optional): Set of visited combinations. + results (List[Dict[str, str]], optional): List of results. Defaults to []. + + Returns: + List[Dict]: List of results. Each result is a dict with statistics like + the number of comparisons, the blocking rule etc. + """ + if current_combination is None: + current_combination = [] + if already_visited is None: + already_visited = set() + if results is None: + results = [] + + if len(current_combination) == len(all_columns): + return results # All fields included, meaning we're at a leaf so exit recursion + + br = _generate_blocking_rule(linker, current_combination) + comparison_count = ( + linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions(br) + ) + row = _generate_output_combinations_table_row( + current_combination, + br, + comparison_count, + all_columns, + ) + + already_visited.add(frozenset(current_combination)) + + if comparison_count > threshold: + # Generate all valid combinations and continue the search + combinations = _generate_combinations( + all_columns, current_combination, already_visited + ) + for next_combination in combinations: + _search_tree_for_blocking_rules_below_threshold_count( + linker, + all_columns, + threshold, + next_combination, + already_visited, + results, + ) + else: + results.append(row) + + return results + + +def find_blocking_rules_below_threshold_comparison_count( + linker: "Linker", max_comparisons_per_rule, columns=None +) -> pd.DataFrame: + """ + Finds blocking rules which return a comparison count below a given threshold. + + In addition to returning blocking rules, returns the comparison count and + 'complexity', which refers to the number of equi-joins used by the rule. + + Also returns one-hot encoding that describes which columns are __fixed__ by the + blocking rule + + e.g. equality on first_name and surname is complexity of 2 + + Args: + linker (Linker): The Linker object + max_comparisons_per_rule (int): Max comparisons allowed per blocking rule. + columns: Columns to consider. If None, uses all columns used by the + ComparisonLevels of the Linker. + + Returns: + pd.DataFrame: DataFrame with blocking rules, comparison_count, and complexity. + """ + + if not columns: + columns = linker._column_names_as_input_columns + + columns_as_strings = [] + + for c in columns: + if isinstance(c, InputColumn): + columns_as_strings.append(c.quote().name()) + else: + columns_as_strings.append(c) + + results = _search_tree_for_blocking_rules_below_threshold_count( + linker, columns_as_strings, max_comparisons_per_rule + ) + return pd.DataFrame(results) diff --git a/splink/linker.py b/splink/linker.py index 897dfc9899..a2c9998ba3 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -262,6 +262,29 @@ def _get_input_columns( return column_names + @property + def _column_names_as_input_columns( + self, + include_unique_id_col_names=False, + include_additional_columns_to_retain=False, + ): + """Retrieve the column names from the input dataset(s)""" + df_obj: SplinkDataFrame = next(iter(self._input_tables_dict.values())) + + input_columns = df_obj.columns + remove_columns = [] + if not include_unique_id_col_names: + remove_columns.extend(self._settings_obj._unique_id_input_columns) + if not include_additional_columns_to_retain: + remove_columns.extend(self._settings_obj._additional_columns_to_retain) + + remove_id_cols = [c.unquote().name() for c in remove_columns] + columns = [ + col for col in input_columns if col.unquote().name() not in remove_id_cols + ] + + return columns + @property def _cache_uid(self): if self._settings_dict: From 7f33e9db64f878d6bc6ed97960d442c8eb1aeb2e Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 20 Nov 2023 08:51:28 +0000 Subject: [PATCH 2/9] extend input columns to allow arguments --- ..._with_comparison_counts_below_threshold.py | 9 ++-- splink/linker.py | 51 ++++++++++--------- splink/missingness.py | 2 +- splink/profile_data.py | 2 +- 4 files changed, 35 insertions(+), 29 deletions(-) diff --git a/splink/find_brs_with_comparison_counts_below_threshold.py b/splink/find_brs_with_comparison_counts_below_threshold.py index 323dc00b14..0d2c2db7d1 100644 --- a/splink/find_brs_with_comparison_counts_below_threshold.py +++ b/splink/find_brs_with_comparison_counts_below_threshold.py @@ -11,7 +11,7 @@ logger = logging.getLogger(__name__) -def sanitise_column_name(column_name): +def sanitise_column_name(column_name) -> str: allowed_chars = string.ascii_letters + string.digits + "_" sanitized_name = "".join(c for c in column_name if c in allowed_chars) return sanitized_name @@ -209,13 +209,16 @@ def find_blocking_rules_below_threshold_comparison_count( """ if not columns: - columns = linker._column_names_as_input_columns + columns = linker._input_columns( + include_unique_id_col_names=False, + include_additional_columns_to_retain=False, + ) columns_as_strings = [] for c in columns: if isinstance(c, InputColumn): - columns_as_strings.append(c.quote().name()) + columns_as_strings.append(c.quote().name) else: columns_as_strings.append(c) diff --git a/splink/linker.py b/splink/linker.py index 2fd45e055e..742a03a4df 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -248,11 +248,26 @@ def __init__( self.debug_mode = False - @property def _input_columns( self, + include_unique_id_col_names=True, + include_additional_columns_to_retain=True, ) -> list[InputColumn]: - """Retrieve the column names from the input dataset(s)""" + """Retrieve the column names from the input dataset(s) as InputColumns + + Args: + include_unique_id_col_names (bool, optional): Whether to include unique ID + column names. Defaults to True. + include_additional_columns_to_retain (bool, optional): Whether to include + additional columns to retain. Defaults to True. + + Raises: + SplinkException: If the input frames have different sets of columns. + + Returns: + list[InputColumn] + """ + input_dfs = self._input_tables_dict.values() # get a list of the column names for each input frame @@ -280,38 +295,26 @@ def _input_columns( + ", ".join(problem_names) ) - return next(iter(input_dfs)).columns + columns = next(iter(input_dfs)).columns - @property - def _source_dataset_column_already_exists(self): - if self._settings_obj_ is None: - return False - input_cols = [c.unquote().name for c in self._input_columns] - return self._settings_obj._source_dataset_column_name in input_cols - - @property - def _column_names_as_input_columns( - self, - include_unique_id_col_names=False, - include_additional_columns_to_retain=False, - ): - """Retrieve the column names from the input dataset(s)""" - df_obj: SplinkDataFrame = next(iter(self._input_tables_dict.values())) - - input_columns = df_obj.columns remove_columns = [] if not include_unique_id_col_names: remove_columns.extend(self._settings_obj._unique_id_input_columns) if not include_additional_columns_to_retain: remove_columns.extend(self._settings_obj._additional_columns_to_retain) - remove_id_cols = [c.unquote().name() for c in remove_columns] - columns = [ - col for col in input_columns if col.unquote().name() not in remove_id_cols - ] + remove_id_cols = [c.unquote().name for c in remove_columns] + columns = [col for col in columns if col.unquote().name not in remove_id_cols] return columns + @property + def _source_dataset_column_already_exists(self): + if self._settings_obj_ is None: + return False + input_cols = [c.unquote().name for c in self._input_columns()] + return self._settings_obj._source_dataset_column_name in input_cols + @property def _cache_uid(self): if self._settings_dict: diff --git a/splink/missingness.py b/splink/missingness.py index 607d320a88..8479d951c1 100644 --- a/splink/missingness.py +++ b/splink/missingness.py @@ -40,7 +40,7 @@ def missingness_sqls(columns, input_tablename): def missingness_data(linker, input_tablename): - columns = linker._input_columns + columns = linker._input_columns() if input_tablename is None: splink_dataframe = linker._initialise_df_concat(materialise=True) else: diff --git a/splink/profile_data.py b/splink/profile_data.py index ea035a54c1..93986dfb2f 100644 --- a/splink/profile_data.py +++ b/splink/profile_data.py @@ -232,7 +232,7 @@ def profile_columns(linker, column_expressions=None, top_n=10, bottom_n=10): """ if not column_expressions: - column_expressions = [col.name for col in linker._input_columns] + column_expressions = [col.name for col in linker._input_columns()] df_concat = linker._initialise_df_concat() From 46a79c11e10af05b8d744ab330f2477eddc194a9 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 20 Nov 2023 09:06:22 +0000 Subject: [PATCH 3/9] name things more clearly --- ...brs_with_comparison_counts_below_threshold.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/splink/find_brs_with_comparison_counts_below_threshold.py b/splink/find_brs_with_comparison_counts_below_threshold.py index 0d2c2db7d1..7e895f9f7e 100644 --- a/splink/find_brs_with_comparison_counts_below_threshold.py +++ b/splink/find_brs_with_comparison_counts_below_threshold.py @@ -11,21 +11,23 @@ logger = logging.getLogger(__name__) -def sanitise_column_name(column_name) -> str: +def sanitise_column_name_for_one_hot_encoding(column_name) -> str: allowed_chars = string.ascii_letters + string.digits + "_" - sanitized_name = "".join(c for c in column_name if c in allowed_chars) - return sanitized_name + sanitised_name = "".join(c for c in column_name if c in allowed_chars) + return sanitised_name def _generate_output_combinations_table_row( blocking_columns, splink_blocking_rule, comparison_count, all_columns -): +) -> dict: row = {} - blocking_columns = [sanitise_column_name(c) for c in blocking_columns] - all_columns = [sanitise_column_name(c) for c in all_columns] + blocking_columns = [ + sanitise_column_name_for_one_hot_encoding(c) for c in blocking_columns + ] + all_columns = [sanitise_column_name_for_one_hot_encoding(c) for c in all_columns] - row["blocking_columns"] = blocking_columns + row["blocking_columns_sanitised"] = blocking_columns row["splink_blocking_rule"] = splink_blocking_rule row["comparison_count"] = comparison_count row["complexity"] = len(blocking_columns) From fd89e5eb737f1e471bf016513c3b20481b6fc6e6 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 20 Nov 2023 09:13:39 +0000 Subject: [PATCH 4/9] rename complexity for clarity --- ..._with_comparison_counts_below_threshold.py | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/splink/find_brs_with_comparison_counts_below_threshold.py b/splink/find_brs_with_comparison_counts_below_threshold.py index 7e895f9f7e..2038cd8658 100644 --- a/splink/find_brs_with_comparison_counts_below_threshold.py +++ b/splink/find_brs_with_comparison_counts_below_threshold.py @@ -30,7 +30,7 @@ def _generate_output_combinations_table_row( row["blocking_columns_sanitised"] = blocking_columns row["splink_blocking_rule"] = splink_blocking_rule row["comparison_count"] = comparison_count - row["complexity"] = len(blocking_columns) + row["num_equi_joins"] = len(blocking_columns) for col in all_columns: row[f"__fixed__{col}"] = 1 if col in blocking_columns else 0 @@ -99,7 +99,7 @@ def _search_tree_for_blocking_rules_below_threshold_count( linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions to count - The tree looks like this, where c1 c2 are columns: + The full tree looks like this, where c1 c2 are columns: c1 count_comparisons(c1) ├── c2 count_comparisons(c1, c2) │ └── c3 count_comparisons(c1, c2, c3) @@ -111,12 +111,16 @@ def _search_tree_for_blocking_rules_below_threshold_count( ├── c3 count_comparisons(c2, c3) │ └── c1 count_comparisons(c2, c3, c1) - Once the count is below the threshold, no branches from the node are explored. + But many nodes do not need to be visited: + - Once the count is below the threshold, no branches from the node are explored. + - If a combination has alraedy been evaluated, it is not evaluated again. For + example, c2 -> c1 will not be evaluated because c1 -> c2 has already been + counted When a count is below the threshold, create a dictionary with the relevant stats like : { - 'blocking_columns':['first_name'], + 'blocking_columns_sanitised':['first_name'], 'splink_blocking_rule':', comparison_count':4827, 'complexity':1, @@ -157,12 +161,6 @@ def _search_tree_for_blocking_rules_below_threshold_count( comparison_count = ( linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions(br) ) - row = _generate_output_combinations_table_row( - current_combination, - br, - comparison_count, - all_columns, - ) already_visited.add(frozenset(current_combination)) @@ -181,6 +179,12 @@ def _search_tree_for_blocking_rules_below_threshold_count( results, ) else: + row = _generate_output_combinations_table_row( + current_combination, + br, + comparison_count, + all_columns, + ) results.append(row) return results @@ -193,12 +197,12 @@ def find_blocking_rules_below_threshold_comparison_count( Finds blocking rules which return a comparison count below a given threshold. In addition to returning blocking rules, returns the comparison count and - 'complexity', which refers to the number of equi-joins used by the rule. + 'num_equi_joins', which refers to the number of equi-joins used by the rule. Also returns one-hot encoding that describes which columns are __fixed__ by the blocking rule - e.g. equality on first_name and surname is complexity of 2 + e.g. equality on first_name and surname has num_equi_joins of 2 Args: linker (Linker): The Linker object @@ -207,7 +211,7 @@ def find_blocking_rules_below_threshold_comparison_count( ComparisonLevels of the Linker. Returns: - pd.DataFrame: DataFrame with blocking rules, comparison_count, and complexity. + pd.DataFrame: DataFrame with blocking rules, comparison_count and num_equi_joins """ if not columns: From 9d5f2c13077a86f1ca11e8b69ae507e101aa26af Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 20 Nov 2023 09:18:37 +0000 Subject: [PATCH 5/9] Add error handling for no blocking rules below threshold --- splink/find_brs_with_comparison_counts_below_threshold.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/splink/find_brs_with_comparison_counts_below_threshold.py b/splink/find_brs_with_comparison_counts_below_threshold.py index 2038cd8658..d692355a5b 100644 --- a/splink/find_brs_with_comparison_counts_below_threshold.py +++ b/splink/find_brs_with_comparison_counts_below_threshold.py @@ -231,4 +231,12 @@ def find_blocking_rules_below_threshold_comparison_count( results = _search_tree_for_blocking_rules_below_threshold_count( linker, columns_as_strings, max_comparisons_per_rule ) + + if not results: + raise ValueError( + "No blocking rules could be found that produce a comparison count below " + "your chosen max_comparisons_per_rule threshold of " + f"{max_comparisons_per_rule}. Try increasing the threshold." + ) + return pd.DataFrame(results) From 17e6fec95d056043538a5feead90f91912f782c9 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 20 Nov 2023 09:34:44 +0000 Subject: [PATCH 6/9] clarify naming and docstring --- ..._with_comparison_counts_below_threshold.py | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/splink/find_brs_with_comparison_counts_below_threshold.py b/splink/find_brs_with_comparison_counts_below_threshold.py index d692355a5b..66ff294088 100644 --- a/splink/find_brs_with_comparison_counts_below_threshold.py +++ b/splink/find_brs_with_comparison_counts_below_threshold.py @@ -5,6 +5,7 @@ import pandas as pd from .input_column import InputColumn +from .blocking import BlockingRule if TYPE_CHECKING: from .linker import Linker @@ -40,7 +41,7 @@ def _generate_output_combinations_table_row( def _generate_combinations( all_columns, current_combination, already_visited: Set[frozenset] -): +) -> list: """Generate combinations of columns to visit that haven't been visited already irrespective of order """ @@ -55,8 +56,11 @@ def _generate_combinations( return combinations -def _generate_blocking_rule(linker: "Linker", cols_as_string): - """Generate a blocking rule given a list of column names as string""" +def _generate_blocking_rule( + linker: "Linker", cols_as_string: List[str] +) -> BlockingRule: + """Generate a Splink blocking rule given a list of column names which + are provided as as string""" dialect = linker._sql_dialect @@ -191,7 +195,7 @@ def _search_tree_for_blocking_rules_below_threshold_count( def find_blocking_rules_below_threshold_comparison_count( - linker: "Linker", max_comparisons_per_rule, columns=None + linker: "Linker", max_comparisons_per_rule, column_expressions: List[str] = None ) -> pd.DataFrame: """ Finds blocking rules which return a comparison count below a given threshold. @@ -207,29 +211,32 @@ def find_blocking_rules_below_threshold_comparison_count( Args: linker (Linker): The Linker object max_comparisons_per_rule (int): Max comparisons allowed per blocking rule. - columns: Columns to consider. If None, uses all columns used by the - ComparisonLevels of the Linker. + column_expressions: List[str] = Algorithm will find combinations of these + column expressions to use as blocking rules. If None, uses all columns used + by the ComparisonLevels of the Linker. Column expressions can be SQL + expressions, not just column names i.e. 'substr(surname, 1,1)' is a valid + entry in this list. Returns: pd.DataFrame: DataFrame with blocking rules, comparison_count and num_equi_joins """ - if not columns: - columns = linker._input_columns( + if not column_expressions: + column_expressions = linker._input_columns( include_unique_id_col_names=False, include_additional_columns_to_retain=False, ) - columns_as_strings = [] + column_expressions_as_strings = [] - for c in columns: + for c in column_expressions: if isinstance(c, InputColumn): - columns_as_strings.append(c.quote().name) + column_expressions_as_strings.append(c.quote().name) else: - columns_as_strings.append(c) + column_expressions_as_strings.append(c) results = _search_tree_for_blocking_rules_below_threshold_count( - linker, columns_as_strings, max_comparisons_per_rule + linker, column_expressions_as_strings, max_comparisons_per_rule ) if not results: From 04632698b25de1e9472978cfc3eb005e940ca24d Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 20 Nov 2023 09:39:18 +0000 Subject: [PATCH 7/9] lint --- splink/find_brs_with_comparison_counts_below_threshold.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/splink/find_brs_with_comparison_counts_below_threshold.py b/splink/find_brs_with_comparison_counts_below_threshold.py index 66ff294088..148645ce00 100644 --- a/splink/find_brs_with_comparison_counts_below_threshold.py +++ b/splink/find_brs_with_comparison_counts_below_threshold.py @@ -4,8 +4,8 @@ import pandas as pd -from .input_column import InputColumn from .blocking import BlockingRule +from .input_column import InputColumn if TYPE_CHECKING: from .linker import Linker From ce46245823af97b00c8b02dc5f1d6ba22a8140d6 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 20 Nov 2023 10:48:08 +0000 Subject: [PATCH 8/9] Update splink/find_brs_with_comparison_counts_below_threshold.py Co-authored-by: Tom Hepworth <45356472+ThomasHepworth@users.noreply.github.com> --- splink/find_brs_with_comparison_counts_below_threshold.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/splink/find_brs_with_comparison_counts_below_threshold.py b/splink/find_brs_with_comparison_counts_below_threshold.py index 148645ce00..12fce64028 100644 --- a/splink/find_brs_with_comparison_counts_below_threshold.py +++ b/splink/find_brs_with_comparison_counts_below_threshold.py @@ -117,7 +117,7 @@ def _search_tree_for_blocking_rules_below_threshold_count( But many nodes do not need to be visited: - Once the count is below the threshold, no branches from the node are explored. - - If a combination has alraedy been evaluated, it is not evaluated again. For + - If a combination has alraedy been evaluated, it is not evaluated again. For example, c2 -> c1 will not be evaluated because c1 -> c2 has already been counted From d74db7e1c120dd7f175f1f2d59817f9bbd438dc2 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Mon, 20 Nov 2023 10:49:59 +0000 Subject: [PATCH 9/9] address Tom's final comments --- splink/find_brs_with_comparison_counts_below_threshold.py | 1 + 1 file changed, 1 insertion(+) diff --git a/splink/find_brs_with_comparison_counts_below_threshold.py b/splink/find_brs_with_comparison_counts_below_threshold.py index 12fce64028..b151ab72c4 100644 --- a/splink/find_brs_with_comparison_counts_below_threshold.py +++ b/splink/find_brs_with_comparison_counts_below_threshold.py @@ -62,6 +62,7 @@ def _generate_blocking_rule( """Generate a Splink blocking rule given a list of column names which are provided as as string""" + # TODO: Refactor in Splink4 dialect = linker._sql_dialect module_mapping = {