Merge pull request #1668 from moj-analytical-services/optimise_cost_o…

…f_brs Automatically detect blocking rules for prediction and blocking rules for EM training
moj-analytical-services · Nov 22, 2023 · 42de8da · 42de8da
2 parents 269abf2 + 253f237
commit 42de8da
Show file tree

Hide file tree

Showing 4 changed files with 396 additions and 12 deletions.
diff --git a/splink/cost_of_blocking_rules.py b/splink/cost_of_blocking_rules.py
@@ -1,8 +1,6 @@
 import logging
 from typing import Dict, List, Union
 
-import pandas as pd
-
 logger = logging.getLogger(__name__)
 
 
@@ -51,25 +49,25 @@ def calculate_field_freedom_cost(combination_of_brs: List[Dict]) -> int:
 
 
 def calculate_cost_of_combination_of_brs(
-    br_combination: pd.DataFrame,
+    br_combination: List[Dict],
     max_comparison_count: int,
-    complexity_weight: Union[int, float] = 1,
+    num_equi_join_weight: Union[int, float] = 1,
     field_freedom_weight: Union[int, float] = 1,
     num_brs_weight: Union[int, float] = 1,
     num_comparison_weight: Union[int, float] = 1,
 ) -> dict:
     """
     Calculates the cost for a given combination of blocking rules.
 
-    The cost is a weighted sum of the complexity of the rules, the count of rules,
-    the number of fields that are allowed to vary, and the number of rows.
+    The cost is a weighted sum of the number of equi joins in the rules, the count of
+    rules, the number of fields that are allowed to vary, and the number of rows.
 
     Args:
-        br_combination (pd.DataFrame): The combination of rows outputted by
+        br_combination (List[Dict]): The combination of rows outputted by
             find_blocking_rules_below_threshold_comparison_count.
         max_comparison_count (int): The maximum comparison count amongst the rules.
             This is needed to normalise the cost of more or fewer comparison rows.
-        complexity_weight (Union[int, float], optional): The weight for complexity.
+        num_equi_join_weight (Union[int, float], optional): The weight for num_equi_join
             Defaults to 1.
         field_freedom_weight (Union[int, float], optional): The weight for field
             freedom. Defaults to 1.
@@ -81,7 +79,6 @@ def calculate_cost_of_combination_of_brs(
     Returns:
         dict: The calculated cost and individual component costs.
     """
-    br_combination = br_combination.to_dict(orient="records")
 
     num_equi_join_cost = sum(row["num_equi_joins"] for row in br_combination)
     total_row_count = sum(row["comparison_count"] for row in br_combination)
@@ -92,7 +89,7 @@ def calculate_cost_of_combination_of_brs(
     field_freedom_cost = calculate_field_freedom_cost(br_combination)
     num_brs_cost = len(br_combination)
 
-    num_equi_join_cost_weighted = complexity_weight * num_equi_join_cost
+    num_equi_join_cost_weighted = num_equi_join_weight * num_equi_join_cost
     field_freedom_cost_weighted = field_freedom_weight * field_freedom_cost
     num_brs_cost_weighted = num_brs_weight * num_brs_cost
     num_comparison_rows_cost_weighted = num_comparison_weight * normalised_row_count

diff --git a/splink/find_brs_with_comparison_counts_below_threshold.py b/splink/find_brs_with_comparison_counts_below_threshold.py
@@ -81,7 +81,7 @@ def _generate_blocking_rule(
     block_on = block_on_module.block_on
 
     if len(cols_as_string) == 0:
-        return "1 = 1"
+        return block_on("1")
 
     br = block_on(cols_as_string)
 
@@ -128,7 +128,7 @@ def _search_tree_for_blocking_rules_below_threshold_count(
         'blocking_columns_sanitised':['first_name'],
         'splink_blocking_rule':<Custom rule>',
         comparison_count':4827,
-        'complexity':1,
+        'num_equi_join':1,
         '__fixed__first_name':1,
         '__fixed__surname':0,
         '__fixed__dob':0,

diff --git a/splink/linker.py b/splink/linker.py
@@ -65,6 +65,9 @@
 from .em_training_session import EMTrainingSession
 from .estimate_u import estimate_u_values
 from .exceptions import SplinkDeprecated, SplinkException
+from .find_brs_with_comparison_counts_below_threshold import (
+    find_blocking_rules_below_threshold_comparison_count,
+)
 from .find_matches_to_new_records import add_unique_id_and_source_dataset_cols_if_needed
 from .labelling_tool import (
     generate_labelling_tool_comparisons,
@@ -86,6 +89,7 @@
     prob_to_bayes_factor,
 )
 from .missingness import completeness_data, missingness_data
+from .optimise_cost_of_brs import suggest_blocking_rules
 from .pipeline import SQLPipeline
 from .predict import predict_from_comparison_vectors_sqls
 from .profile_data import profile_columns
@@ -3751,3 +3755,172 @@ def _remove_splinkdataframe_from_cache(self, splink_dataframe: SplinkDataFrame):
 
         for k in keys_to_delete:
             del self._intermediate_table_cache[k]
+
+    def _find_blocking_rules_below_threshold(
+        self, max_comparisons_per_rule, blocking_expressions=None
+    ):
+        return find_blocking_rules_below_threshold_comparison_count(
+            self, max_comparisons_per_rule, blocking_expressions
+        )
+
+    def _detect_blocking_rules_for_prediction(
+        self,
+        max_comparisons_per_rule,
+        blocking_expressions=None,
+        min_freedom=1,
+        num_runs=200,
+        num_equi_join_weight=0,
+        field_freedom_weight=1,
+        num_brs_weight=10,
+        num_comparison_weight=10,
+        return_as_df=False,
+    ):
+        """Find blocking rules for prediction below some given threshold of the
+        maximum number of comparisons that can be generated per blocking rule
+        (max_comparisons_per_rule).
+        Uses a heuristic cost algorithm to identify the 'best' set of blocking rules
+        Args:
+            max_comparisons_per_rule (int): The maximum number of comparisons that
+                each blocking rule is allowed to generate
+            blocking_expressions: By default, blocking rules will be equi-joins
+                on the columns used by the Splink model.  This allows you to manually
+                specify sql expressions from which combinations will be created. For
+                example, if you specify ["substr(dob, 1,4)", "surname", "dob"]
+                blocking rules will be chosen by blocking on combinations
+                of those expressions.
+            min_freedom (int, optional): The minimum amount of freedom any column should
+                be allowed.
+            num_runs (int, optional): Each run selects rows using a heuristic and costs
+                them. The more runs, the more likely you are to find the best rule.
+                Defaults to 5.
+            num_equi_join_weight (int, optional): Weight allocated to number of equi
+                joins in the blocking rules.
+                Defaults to 0 since this is cost better captured by other criteria.
+            field_freedom_weight (int, optional): Weight given to the cost of
+                having individual fields which don't havem much flexibility.  Assigning
+                a high weight here makes it more likely you'll generate combinations of
+                blocking rules for which most fields are allowed to vary more than
+                the minimum. Defaults to 1.
+            num_brs_weight (int, optional): Weight assigned to the cost of
+                additional blocking rules.  Higher weight here will result in a
+                 preference for fewer blocking rules. Defaults to 10.
+            num_comparison_weight (int, optional): Weight assigned to the cost of
+                larger numbers of comparisons, which happens when more of the blocking
+                rules are close to the max_comparisons_per_rule.  A higher
+                 weight here prefers sets of rules which generate lower total
+                comparisons. Defaults to 10.
+            return_as_df (bool, optional): If false, assign recommendation to settings.
+                If true, return a dataframe containing details of the weights.
+                Defaults to False.
+        """
+
+        df_br_below_thres = find_blocking_rules_below_threshold_comparison_count(
+            self, max_comparisons_per_rule, blocking_expressions
+        )
+
+        blocking_rule_suggestions = suggest_blocking_rules(
+            df_br_below_thres,
+            min_freedom=min_freedom,
+            num_runs=num_runs,
+            num_equi_join_weight=num_equi_join_weight,
+            field_freedom_weight=field_freedom_weight,
+            num_brs_weight=num_brs_weight,
+            num_comparison_weight=num_comparison_weight,
+        )
+
+        if return_as_df:
+            return blocking_rule_suggestions
+        else:
+            if blocking_rule_suggestions is None or len(blocking_rule_suggestions) == 0:
+                logger.warning("No set of blocking rules found within constraints")
+            else:
+                suggestion = blocking_rule_suggestions[
+                    "suggested_blocking_rules_as_splink_brs"
+                ].iloc[0]
+                self._settings_obj._blocking_rules_to_generate_predictions = suggestion
+
+                suggestion_str = blocking_rule_suggestions[
+                    "suggested_blocking_rules_for_prediction"
+                ].iloc[0]
+                msg = (
+                    "The following blocking_rules_to_generate_predictions were "
+                    "automatically detected and assigned to your settings:\n"
+                )
+                logger.info(f"{msg}{suggestion_str}")
+
+    def _detect_blocking_rules_for_em_training(
+        self,
+        max_comparisons_per_rule,
+        min_freedom=1,
+        num_runs=200,
+        num_equi_join_weight=0,
+        field_freedom_weight=1,
+        num_brs_weight=20,
+        num_comparison_weight=10,
+        return_as_df=False,
+    ):
+        """Find blocking rules for EM training below some given threshold of the
+        maximum number of comparisons that can be generated per blocking rule
+        (max_comparisons_per_rule).
+        Uses a heuristic cost algorithm to identify the 'best' set of blocking rules
+        Args:
+            max_comparisons_per_rule (int): The maximum number of comparisons that
+                each blocking rule is allowed to generate
+            min_freedom (int, optional): The minimum amount of freedom any column should
+                be allowed.
+            num_runs (int, optional): Each run selects rows using a heuristic and costs
+                them.  The more runs, the more likely you are to find the best rule.
+                Defaults to 5.
+            num_equi_join_weight (int, optional): Weight allocated to number of equi
+                joins in the blocking rules.
+                Defaults to 0 since this is cost better captured by other criteria.
+                Defaults to 0 since this is cost better captured by other criteria.
+            field_freedom_weight (int, optional): Weight given to the cost of
+                having individual fields which don't havem much flexibility.  Assigning
+                a high weight here makes it more likely you'll generate combinations of
+                blocking rules for which most fields are allowed to vary more than
+                the minimum. Defaults to 1.
+            num_brs_weight (int, optional): Weight assigned to the cost of
+                additional blocking rules.  Higher weight here will result in a
+                 preference for fewer blocking rules. Defaults to 10.
+            num_comparison_weight (int, optional): Weight assigned to the cost of
+                larger numbers of comparisons, which happens when more of the blocking
+                rules are close to the max_comparisons_per_rule.  A higher
+                 weight here prefers sets of rules which generate lower total
+                comparisons. Defaults to 10.
+            return_as_df (bool, optional): If false, return just the recommendation.
+                If true, return a dataframe containing details of the weights.
+                Defaults to False.
+        """
+
+        df_br_below_thres = find_blocking_rules_below_threshold_comparison_count(
+            self, max_comparisons_per_rule
+        )
+
+        blocking_rule_suggestions = suggest_blocking_rules(
+            df_br_below_thres,
+            min_freedom=min_freedom,
+            num_runs=num_runs,
+            num_equi_join_weight=num_equi_join_weight,
+            field_freedom_weight=field_freedom_weight,
+            num_brs_weight=num_brs_weight,
+            num_comparison_weight=num_comparison_weight,
+        )
+
+        if return_as_df:
+            return blocking_rule_suggestions
+        else:
+            if blocking_rule_suggestions is None or len(blocking_rule_suggestions) == 0:
+                logger.warning("No set of blocking rules found within constraints")
+                return None
+            else:
+                suggestion_str = blocking_rule_suggestions[
+                    "suggested_EM_training_statements"
+                ].iloc[0]
+                msg = "The following EM training strategy was detected:\n"
+                msg = f"{msg}{suggestion_str}"
+                logger.info(msg)
+                suggestion = blocking_rule_suggestions[
+                    "suggested_blocking_rules_as_splink_brs"
+                ].iloc[0]
+                return suggestion