From 076c011704eacb3f6a717c85e3174961d8a2298d Mon Sep 17 00:00:00 2001
From: Robin Linacre <robin.linacre@digital.justice.gov.uk>
Date: Tue, 24 Oct 2023 11:47:58 +0100
Subject: [PATCH 1/9] find blocking rules

---
 ..._with_comparison_counts_below_threshold.py | 225 ++++++++++++++++++
 splink/linker.py                              |  23 ++
 2 files changed, 248 insertions(+)
 create mode 100644 splink/find_brs_with_comparison_counts_below_threshold.py

diff --git a/splink/find_brs_with_comparison_counts_below_threshold.py b/splink/find_brs_with_comparison_counts_below_threshold.py
new file mode 100644
index 0000000000..323dc00b14
--- /dev/null
+++ b/splink/find_brs_with_comparison_counts_below_threshold.py
@@ -0,0 +1,225 @@
+import logging
+import string
+from typing import TYPE_CHECKING, Dict, List, Set
+
+import pandas as pd
+
+from .input_column import InputColumn
+
+if TYPE_CHECKING:
+    from .linker import Linker
+logger = logging.getLogger(__name__)
+
+
+def sanitise_column_name(column_name):
+    allowed_chars = string.ascii_letters + string.digits + "_"
+    sanitized_name = "".join(c for c in column_name if c in allowed_chars)
+    return sanitized_name
+
+
+def _generate_output_combinations_table_row(
+    blocking_columns, splink_blocking_rule, comparison_count, all_columns
+):
+    row = {}
+
+    blocking_columns = [sanitise_column_name(c) for c in blocking_columns]
+    all_columns = [sanitise_column_name(c) for c in all_columns]
+
+    row["blocking_columns"] = blocking_columns
+    row["splink_blocking_rule"] = splink_blocking_rule
+    row["comparison_count"] = comparison_count
+    row["complexity"] = len(blocking_columns)
+
+    for col in all_columns:
+        row[f"__fixed__{col}"] = 1 if col in blocking_columns else 0
+
+    return row
+
+
+def _generate_combinations(
+    all_columns, current_combination, already_visited: Set[frozenset]
+):
+    """Generate combinations of columns to visit that haven't been visited already
+    irrespective of order
+    """
+
+    combinations = []
+    for col in all_columns:
+        if col not in current_combination:
+            next_combination = current_combination + [col]
+            if frozenset(next_combination) not in already_visited:
+                combinations.append(next_combination)
+
+    return combinations
+
+
+def _generate_blocking_rule(linker: "Linker", cols_as_string):
+    """Generate a blocking rule given a list of column names as string"""
+
+    dialect = linker._sql_dialect
+
+    module_mapping = {
+        "presto": "splink.athena.blocking_rule_library",
+        "duckdb": "splink.duckdb.blocking_rule_library",
+        "postgres": "splink.postgres.blocking_rule_library",
+        "spark": "splink.spark.blocking_rule_library",
+        "sqlite": "splink.sqlite.blocking_rule_library",
+    }
+
+    if dialect not in module_mapping:
+        raise ValueError(f"Unsupported SQL dialect: {dialect}")
+
+    module_name = module_mapping[dialect]
+    block_on_module = __import__(module_name, fromlist=["block_on"])
+    block_on = block_on_module.block_on
+
+    if len(cols_as_string) == 0:
+        return "1 = 1"
+
+    br = block_on(cols_as_string)
+
+    return br
+
+
+def _search_tree_for_blocking_rules_below_threshold_count(
+    linker: "Linker",
+    all_columns: List[str],
+    threshold: float,
+    current_combination: List[str] = None,
+    already_visited: Set[frozenset] = None,
+    results: List[Dict[str, str]] = None,
+) -> List[Dict[str, str]]:
+    """
+    Recursively search combinations of fields to find ones that result in a count less
+    than the threshold.
+
+    Uses the new, fast counting function
+    linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions
+    to count
+
+    The tree looks like this, where c1 c2 are columns:
+    c1                    count_comparisons(c1)
+    ├── c2                count_comparisons(c1, c2)
+    │   └── c3            count_comparisons(c1, c2, c3)
+    ├── c3                count_comparisons(c1, c3)
+    │   └── c2            count_comparisons(c1, c3, c2)
+    c2                    count_comparisons(c2)
+    ├── c1                count_comparisons(c2, c1)
+    │   └── c3            count_comparisons(c2, c1, c3)
+    ├── c3                count_comparisons(c2, c3)
+    │   └── c1            count_comparisons(c2, c3, c1)
+
+    Once the count is below the threshold, no branches from the node are explored.
+
+    When a count is below the threshold, create a dictionary with the relevant stats
+    like :
+    {
+        'blocking_columns':['first_name'],
+        'splink_blocking_rule':<Custom rule>',
+        comparison_count':4827,
+        'complexity':1,
+        '__fixed__first_name':1,
+        '__fixed__surname':0,
+        '__fixed__dob':0,
+        '__fixed__city':0,
+        '__fixed__email':0,
+        '__fixed__cluster':0,
+    }
+
+    Return a list of these dicts.
+
+
+    Args:
+        linker: splink.Linker
+        all_columns (List[str]): List of columns to combine.
+        threshold (float): The count threshold.
+        current_combination (List[str], optional): Current combination of fields.
+        already_visited (Set[frozenset], optional): Set of visited combinations.
+        results (List[Dict[str, str]], optional): List of results. Defaults to [].
+
+    Returns:
+        List[Dict]: List of results.  Each result is a dict with statistics like
+            the number of comparisons, the blocking rule etc.
+    """
+    if current_combination is None:
+        current_combination = []
+    if already_visited is None:
+        already_visited = set()
+    if results is None:
+        results = []
+
+    if len(current_combination) == len(all_columns):
+        return results  # All fields included, meaning we're at a leaf so exit recursion
+
+    br = _generate_blocking_rule(linker, current_combination)
+    comparison_count = (
+        linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions(br)
+    )
+    row = _generate_output_combinations_table_row(
+        current_combination,
+        br,
+        comparison_count,
+        all_columns,
+    )
+
+    already_visited.add(frozenset(current_combination))
+
+    if comparison_count > threshold:
+        # Generate all valid combinations and continue the search
+        combinations = _generate_combinations(
+            all_columns, current_combination, already_visited
+        )
+        for next_combination in combinations:
+            _search_tree_for_blocking_rules_below_threshold_count(
+                linker,
+                all_columns,
+                threshold,
+                next_combination,
+                already_visited,
+                results,
+            )
+    else:
+        results.append(row)
+
+    return results
+
+
+def find_blocking_rules_below_threshold_comparison_count(
+    linker: "Linker", max_comparisons_per_rule, columns=None
+) -> pd.DataFrame:
+    """
+    Finds blocking rules which return a comparison count below a given threshold.
+
+    In addition to returning blocking rules, returns the comparison count and
+    'complexity', which refers to the number of equi-joins used by the rule.
+
+    Also returns one-hot encoding that describes which columns are __fixed__ by the
+    blocking rule
+
+    e.g. equality on first_name and surname is complexity of 2
+
+    Args:
+        linker (Linker): The Linker object
+        max_comparisons_per_rule (int): Max comparisons allowed per blocking rule.
+        columns: Columns to consider. If None, uses all columns used by the
+            ComparisonLevels of the Linker.
+
+    Returns:
+        pd.DataFrame: DataFrame with blocking rules, comparison_count, and complexity.
+    """
+
+    if not columns:
+        columns = linker._column_names_as_input_columns
+
+    columns_as_strings = []
+
+    for c in columns:
+        if isinstance(c, InputColumn):
+            columns_as_strings.append(c.quote().name())
+        else:
+            columns_as_strings.append(c)
+
+    results = _search_tree_for_blocking_rules_below_threshold_count(
+        linker, columns_as_strings, max_comparisons_per_rule
+    )
+    return pd.DataFrame(results)
diff --git a/splink/linker.py b/splink/linker.py
index 897dfc9899..a2c9998ba3 100644
--- a/splink/linker.py
+++ b/splink/linker.py
@@ -262,6 +262,29 @@ def _get_input_columns(
 
         return column_names
 
+    @property
+    def _column_names_as_input_columns(
+        self,
+        include_unique_id_col_names=False,
+        include_additional_columns_to_retain=False,
+    ):
+        """Retrieve the column names from the input dataset(s)"""
+        df_obj: SplinkDataFrame = next(iter(self._input_tables_dict.values()))
+
+        input_columns = df_obj.columns
+        remove_columns = []
+        if not include_unique_id_col_names:
+            remove_columns.extend(self._settings_obj._unique_id_input_columns)
+        if not include_additional_columns_to_retain:
+            remove_columns.extend(self._settings_obj._additional_columns_to_retain)
+
+        remove_id_cols = [c.unquote().name() for c in remove_columns]
+        columns = [
+            col for col in input_columns if col.unquote().name() not in remove_id_cols
+        ]
+
+        return columns
+
     @property
     def _cache_uid(self):
         if self._settings_dict:

From 7f33e9db64f878d6bc6ed97960d442c8eb1aeb2e Mon Sep 17 00:00:00 2001
From: Robin Linacre <robin.linacre@digital.justice.gov.uk>
Date: Mon, 20 Nov 2023 08:51:28 +0000
Subject: [PATCH 2/9] extend input columns to allow arguments

---
 ..._with_comparison_counts_below_threshold.py |  9 ++--
 splink/linker.py                              | 51 ++++++++++---------
 splink/missingness.py                         |  2 +-
 splink/profile_data.py                        |  2 +-
 4 files changed, 35 insertions(+), 29 deletions(-)

diff --git a/splink/find_brs_with_comparison_counts_below_threshold.py b/splink/find_brs_with_comparison_counts_below_threshold.py
index 323dc00b14..0d2c2db7d1 100644
--- a/splink/find_brs_with_comparison_counts_below_threshold.py
+++ b/splink/find_brs_with_comparison_counts_below_threshold.py
@@ -11,7 +11,7 @@
 logger = logging.getLogger(__name__)
 
 
-def sanitise_column_name(column_name):
+def sanitise_column_name(column_name) -> str:
     allowed_chars = string.ascii_letters + string.digits + "_"
     sanitized_name = "".join(c for c in column_name if c in allowed_chars)
     return sanitized_name
@@ -209,13 +209,16 @@ def find_blocking_rules_below_threshold_comparison_count(
     """
 
     if not columns:
-        columns = linker._column_names_as_input_columns
+        columns = linker._input_columns(
+            include_unique_id_col_names=False,
+            include_additional_columns_to_retain=False,
+        )
 
     columns_as_strings = []
 
     for c in columns:
         if isinstance(c, InputColumn):
-            columns_as_strings.append(c.quote().name())
+            columns_as_strings.append(c.quote().name)
         else:
             columns_as_strings.append(c)
 
diff --git a/splink/linker.py b/splink/linker.py
index 2fd45e055e..742a03a4df 100644
--- a/splink/linker.py
+++ b/splink/linker.py
@@ -248,11 +248,26 @@ def __init__(
 
         self.debug_mode = False
 
-    @property
     def _input_columns(
         self,
+        include_unique_id_col_names=True,
+        include_additional_columns_to_retain=True,
     ) -> list[InputColumn]:
-        """Retrieve the column names from the input dataset(s)"""
+        """Retrieve the column names from the input dataset(s) as InputColumns
+
+        Args:
+            include_unique_id_col_names (bool, optional): Whether to include unique ID
+                column names. Defaults to True.
+            include_additional_columns_to_retain (bool, optional): Whether to include
+                additional columns to retain. Defaults to True.
+
+        Raises:
+            SplinkException: If the input frames have different sets of columns.
+
+        Returns:
+            list[InputColumn]
+        """
+
         input_dfs = self._input_tables_dict.values()
 
         # get a list of the column names for each input frame
@@ -280,38 +295,26 @@ def _input_columns(
                 + ", ".join(problem_names)
             )
 
-        return next(iter(input_dfs)).columns
+        columns = next(iter(input_dfs)).columns
 
-    @property
-    def _source_dataset_column_already_exists(self):
-        if self._settings_obj_ is None:
-            return False
-        input_cols = [c.unquote().name for c in self._input_columns]
-        return self._settings_obj._source_dataset_column_name in input_cols
-
-    @property
-    def _column_names_as_input_columns(
-        self,
-        include_unique_id_col_names=False,
-        include_additional_columns_to_retain=False,
-    ):
-        """Retrieve the column names from the input dataset(s)"""
-        df_obj: SplinkDataFrame = next(iter(self._input_tables_dict.values()))
-
-        input_columns = df_obj.columns
         remove_columns = []
         if not include_unique_id_col_names:
             remove_columns.extend(self._settings_obj._unique_id_input_columns)
         if not include_additional_columns_to_retain:
             remove_columns.extend(self._settings_obj._additional_columns_to_retain)
 
-        remove_id_cols = [c.unquote().name() for c in remove_columns]
-        columns = [
-            col for col in input_columns if col.unquote().name() not in remove_id_cols
-        ]
+        remove_id_cols = [c.unquote().name for c in remove_columns]
+        columns = [col for col in columns if col.unquote().name not in remove_id_cols]
 
         return columns
 
+    @property
+    def _source_dataset_column_already_exists(self):
+        if self._settings_obj_ is None:
+            return False
+        input_cols = [c.unquote().name for c in self._input_columns()]
+        return self._settings_obj._source_dataset_column_name in input_cols
+
     @property
     def _cache_uid(self):
         if self._settings_dict:
diff --git a/splink/missingness.py b/splink/missingness.py
index 607d320a88..8479d951c1 100644
--- a/splink/missingness.py
+++ b/splink/missingness.py
@@ -40,7 +40,7 @@ def missingness_sqls(columns, input_tablename):
 
 
 def missingness_data(linker, input_tablename):
-    columns = linker._input_columns
+    columns = linker._input_columns()
     if input_tablename is None:
         splink_dataframe = linker._initialise_df_concat(materialise=True)
     else:
diff --git a/splink/profile_data.py b/splink/profile_data.py
index ea035a54c1..93986dfb2f 100644
--- a/splink/profile_data.py
+++ b/splink/profile_data.py
@@ -232,7 +232,7 @@ def profile_columns(linker, column_expressions=None, top_n=10, bottom_n=10):
     """
 
     if not column_expressions:
-        column_expressions = [col.name for col in linker._input_columns]
+        column_expressions = [col.name for col in linker._input_columns()]
 
     df_concat = linker._initialise_df_concat()
 

From 46a79c11e10af05b8d744ab330f2477eddc194a9 Mon Sep 17 00:00:00 2001
From: Robin Linacre <robin.linacre@digital.justice.gov.uk>
Date: Mon, 20 Nov 2023 09:06:22 +0000
Subject: [PATCH 3/9] name things more clearly

---
 ...brs_with_comparison_counts_below_threshold.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/splink/find_brs_with_comparison_counts_below_threshold.py b/splink/find_brs_with_comparison_counts_below_threshold.py
index 0d2c2db7d1..7e895f9f7e 100644
--- a/splink/find_brs_with_comparison_counts_below_threshold.py
+++ b/splink/find_brs_with_comparison_counts_below_threshold.py
@@ -11,21 +11,23 @@
 logger = logging.getLogger(__name__)
 
 
-def sanitise_column_name(column_name) -> str:
+def sanitise_column_name_for_one_hot_encoding(column_name) -> str:
     allowed_chars = string.ascii_letters + string.digits + "_"
-    sanitized_name = "".join(c for c in column_name if c in allowed_chars)
-    return sanitized_name
+    sanitised_name = "".join(c for c in column_name if c in allowed_chars)
+    return sanitised_name
 
 
 def _generate_output_combinations_table_row(
     blocking_columns, splink_blocking_rule, comparison_count, all_columns
-):
+) -> dict:
     row = {}
 
-    blocking_columns = [sanitise_column_name(c) for c in blocking_columns]
-    all_columns = [sanitise_column_name(c) for c in all_columns]
+    blocking_columns = [
+        sanitise_column_name_for_one_hot_encoding(c) for c in blocking_columns
+    ]
+    all_columns = [sanitise_column_name_for_one_hot_encoding(c) for c in all_columns]
 
-    row["blocking_columns"] = blocking_columns
+    row["blocking_columns_sanitised"] = blocking_columns
     row["splink_blocking_rule"] = splink_blocking_rule
     row["comparison_count"] = comparison_count
     row["complexity"] = len(blocking_columns)

From fd89e5eb737f1e471bf016513c3b20481b6fc6e6 Mon Sep 17 00:00:00 2001
From: Robin Linacre <robin.linacre@digital.justice.gov.uk>
Date: Mon, 20 Nov 2023 09:13:39 +0000
Subject: [PATCH 4/9] rename complexity for clarity

---
 ..._with_comparison_counts_below_threshold.py | 30 +++++++++++--------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/splink/find_brs_with_comparison_counts_below_threshold.py b/splink/find_brs_with_comparison_counts_below_threshold.py
index 7e895f9f7e..2038cd8658 100644
--- a/splink/find_brs_with_comparison_counts_below_threshold.py
+++ b/splink/find_brs_with_comparison_counts_below_threshold.py
@@ -30,7 +30,7 @@ def _generate_output_combinations_table_row(
     row["blocking_columns_sanitised"] = blocking_columns
     row["splink_blocking_rule"] = splink_blocking_rule
     row["comparison_count"] = comparison_count
-    row["complexity"] = len(blocking_columns)
+    row["num_equi_joins"] = len(blocking_columns)
 
     for col in all_columns:
         row[f"__fixed__{col}"] = 1 if col in blocking_columns else 0
@@ -99,7 +99,7 @@ def _search_tree_for_blocking_rules_below_threshold_count(
     linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions
     to count
 
-    The tree looks like this, where c1 c2 are columns:
+    The full tree looks like this, where c1 c2 are columns:
     c1                    count_comparisons(c1)
     ├── c2                count_comparisons(c1, c2)
     │   └── c3            count_comparisons(c1, c2, c3)
@@ -111,12 +111,16 @@ def _search_tree_for_blocking_rules_below_threshold_count(
     ├── c3                count_comparisons(c2, c3)
     │   └── c1            count_comparisons(c2, c3, c1)
 
-    Once the count is below the threshold, no branches from the node are explored.
+    But many nodes do not need to be visited:
+        - Once the count is below the threshold, no branches from the node are explored.
+        - If a combination has alraedy been evaluated,  it is not evaluated again. For
+          example, c2 -> c1 will not be evaluated because c1 -> c2 has already been
+          counted
 
     When a count is below the threshold, create a dictionary with the relevant stats
     like :
     {
-        'blocking_columns':['first_name'],
+        'blocking_columns_sanitised':['first_name'],
         'splink_blocking_rule':<Custom rule>',
         comparison_count':4827,
         'complexity':1,
@@ -157,12 +161,6 @@ def _search_tree_for_blocking_rules_below_threshold_count(
     comparison_count = (
         linker._count_num_comparisons_from_blocking_rule_pre_filter_conditions(br)
     )
-    row = _generate_output_combinations_table_row(
-        current_combination,
-        br,
-        comparison_count,
-        all_columns,
-    )
 
     already_visited.add(frozenset(current_combination))
 
@@ -181,6 +179,12 @@ def _search_tree_for_blocking_rules_below_threshold_count(
                 results,
             )
     else:
+        row = _generate_output_combinations_table_row(
+            current_combination,
+            br,
+            comparison_count,
+            all_columns,
+        )
         results.append(row)
 
     return results
@@ -193,12 +197,12 @@ def find_blocking_rules_below_threshold_comparison_count(
     Finds blocking rules which return a comparison count below a given threshold.
 
     In addition to returning blocking rules, returns the comparison count and
-    'complexity', which refers to the number of equi-joins used by the rule.
+    'num_equi_joins', which refers to the number of equi-joins used by the rule.
 
     Also returns one-hot encoding that describes which columns are __fixed__ by the
     blocking rule
 
-    e.g. equality on first_name and surname is complexity of 2
+    e.g. equality on first_name and surname has num_equi_joins of 2
 
     Args:
         linker (Linker): The Linker object
@@ -207,7 +211,7 @@ def find_blocking_rules_below_threshold_comparison_count(
             ComparisonLevels of the Linker.
 
     Returns:
-        pd.DataFrame: DataFrame with blocking rules, comparison_count, and complexity.
+        pd.DataFrame: DataFrame with blocking rules, comparison_count and num_equi_joins
     """
 
     if not columns:

From 9d5f2c13077a86f1ca11e8b69ae507e101aa26af Mon Sep 17 00:00:00 2001
From: Robin Linacre <robin.linacre@digital.justice.gov.uk>
Date: Mon, 20 Nov 2023 09:18:37 +0000
Subject: [PATCH 5/9] Add error handling for no blocking rules below threshold

---
 splink/find_brs_with_comparison_counts_below_threshold.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/splink/find_brs_with_comparison_counts_below_threshold.py b/splink/find_brs_with_comparison_counts_below_threshold.py
index 2038cd8658..d692355a5b 100644
--- a/splink/find_brs_with_comparison_counts_below_threshold.py
+++ b/splink/find_brs_with_comparison_counts_below_threshold.py
@@ -231,4 +231,12 @@ def find_blocking_rules_below_threshold_comparison_count(
     results = _search_tree_for_blocking_rules_below_threshold_count(
         linker, columns_as_strings, max_comparisons_per_rule
     )
+
+    if not results:
+        raise ValueError(
+            "No blocking rules could be found that produce a comparison count below "
+            "your chosen max_comparisons_per_rule threshold of "
+            f"{max_comparisons_per_rule}. Try increasing the threshold."
+        )
+
     return pd.DataFrame(results)

From 17e6fec95d056043538a5feead90f91912f782c9 Mon Sep 17 00:00:00 2001
From: Robin Linacre <robin.linacre@digital.justice.gov.uk>
Date: Mon, 20 Nov 2023 09:34:44 +0000
Subject: [PATCH 6/9] clarify naming and docstring

---
 ..._with_comparison_counts_below_threshold.py | 33 +++++++++++--------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/splink/find_brs_with_comparison_counts_below_threshold.py b/splink/find_brs_with_comparison_counts_below_threshold.py
index d692355a5b..66ff294088 100644
--- a/splink/find_brs_with_comparison_counts_below_threshold.py
+++ b/splink/find_brs_with_comparison_counts_below_threshold.py
@@ -5,6 +5,7 @@
 import pandas as pd
 
 from .input_column import InputColumn
+from .blocking import BlockingRule
 
 if TYPE_CHECKING:
     from .linker import Linker
@@ -40,7 +41,7 @@ def _generate_output_combinations_table_row(
 
 def _generate_combinations(
     all_columns, current_combination, already_visited: Set[frozenset]
-):
+) -> list:
     """Generate combinations of columns to visit that haven't been visited already
     irrespective of order
     """
@@ -55,8 +56,11 @@ def _generate_combinations(
     return combinations
 
 
-def _generate_blocking_rule(linker: "Linker", cols_as_string):
-    """Generate a blocking rule given a list of column names as string"""
+def _generate_blocking_rule(
+    linker: "Linker", cols_as_string: List[str]
+) -> BlockingRule:
+    """Generate a Splink blocking rule given a list of column names which
+    are provided as as string"""
 
     dialect = linker._sql_dialect
 
@@ -191,7 +195,7 @@ def _search_tree_for_blocking_rules_below_threshold_count(
 
 
 def find_blocking_rules_below_threshold_comparison_count(
-    linker: "Linker", max_comparisons_per_rule, columns=None
+    linker: "Linker", max_comparisons_per_rule, column_expressions: List[str] = None
 ) -> pd.DataFrame:
     """
     Finds blocking rules which return a comparison count below a given threshold.
@@ -207,29 +211,32 @@ def find_blocking_rules_below_threshold_comparison_count(
     Args:
         linker (Linker): The Linker object
         max_comparisons_per_rule (int): Max comparisons allowed per blocking rule.
-        columns: Columns to consider. If None, uses all columns used by the
-            ComparisonLevels of the Linker.
+        column_expressions (List[str], optional): The algorithm will find
+            combinations of these column expressions to use as blocking rules. If
+            None, uses all input columns except the unique id column(s) and any
+            additional columns to retain. Column expressions can be SQL expressions,
+            not just column names, e.g. 'substr(surname, 1,1)' is a valid entry.
 
     Returns:
         pd.DataFrame: DataFrame with blocking rules, comparison_count and num_equi_joins
     """
 
-    if not columns:
-        columns = linker._input_columns(
+    if not column_expressions:
+        column_expressions = linker._input_columns(
             include_unique_id_col_names=False,
             include_additional_columns_to_retain=False,
         )
 
-    columns_as_strings = []
+    column_expressions_as_strings = []
 
-    for c in columns:
+    for c in column_expressions:
         if isinstance(c, InputColumn):
-            columns_as_strings.append(c.quote().name)
+            column_expressions_as_strings.append(c.quote().name)
         else:
-            columns_as_strings.append(c)
+            column_expressions_as_strings.append(c)
 
     results = _search_tree_for_blocking_rules_below_threshold_count(
-        linker, columns_as_strings, max_comparisons_per_rule
+        linker, column_expressions_as_strings, max_comparisons_per_rule
     )
 
     if not results:

From 04632698b25de1e9472978cfc3eb005e940ca24d Mon Sep 17 00:00:00 2001
From: Robin Linacre <robin.linacre@digital.justice.gov.uk>
Date: Mon, 20 Nov 2023 09:39:18 +0000
Subject: [PATCH 7/9] lint

---
 splink/find_brs_with_comparison_counts_below_threshold.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/splink/find_brs_with_comparison_counts_below_threshold.py b/splink/find_brs_with_comparison_counts_below_threshold.py
index 66ff294088..148645ce00 100644
--- a/splink/find_brs_with_comparison_counts_below_threshold.py
+++ b/splink/find_brs_with_comparison_counts_below_threshold.py
@@ -4,8 +4,8 @@
 
 import pandas as pd
 
-from .input_column import InputColumn
 from .blocking import BlockingRule
+from .input_column import InputColumn
 
 if TYPE_CHECKING:
     from .linker import Linker

From ce46245823af97b00c8b02dc5f1d6ba22a8140d6 Mon Sep 17 00:00:00 2001
From: Robin Linacre <robin.linacre@digital.justice.gov.uk>
Date: Mon, 20 Nov 2023 10:48:08 +0000
Subject: [PATCH 8/9] Update
 splink/find_brs_with_comparison_counts_below_threshold.py

Co-authored-by: Tom Hepworth <45356472+ThomasHepworth@users.noreply.github.com>
---
 splink/find_brs_with_comparison_counts_below_threshold.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/splink/find_brs_with_comparison_counts_below_threshold.py b/splink/find_brs_with_comparison_counts_below_threshold.py
index 148645ce00..12fce64028 100644
--- a/splink/find_brs_with_comparison_counts_below_threshold.py
+++ b/splink/find_brs_with_comparison_counts_below_threshold.py
@@ -117,7 +117,7 @@ def _search_tree_for_blocking_rules_below_threshold_count(
 
     But many nodes do not need to be visited:
         - Once the count is below the threshold, no branches from the node are explored.
-        - If a combination has alraedy been evaluated,  it is not evaluated again. For
+        - If a combination has already been evaluated, it is not evaluated again. For
           example, c2 -> c1 will not be evaluated because c1 -> c2 has already been
           counted
 

From d74db7e1c120dd7f175f1f2d59817f9bbd438dc2 Mon Sep 17 00:00:00 2001
From: Robin Linacre <robin.linacre@digital.justice.gov.uk>
Date: Mon, 20 Nov 2023 10:49:59 +0000
Subject: [PATCH 9/9] address Tom's final comments

---
 splink/find_brs_with_comparison_counts_below_threshold.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/splink/find_brs_with_comparison_counts_below_threshold.py b/splink/find_brs_with_comparison_counts_below_threshold.py
index 12fce64028..b151ab72c4 100644
--- a/splink/find_brs_with_comparison_counts_below_threshold.py
+++ b/splink/find_brs_with_comparison_counts_below_threshold.py
@@ -62,6 +62,7 @@ def _generate_blocking_rule(
     """Generate a Splink blocking rule given a list of column names which
     are provided as as string"""
 
+    # TODO: Refactor in Splink4
     dialect = linker._sql_dialect
 
     module_mapping = {