Skip to content

Commit

Permalink
Merge pull request #1668 from moj-analytical-services/optimise_cost_o…
Browse files Browse the repository at this point in the history
…f_brs

Automatically detect blocking rules for prediction and blocking rules for EM training
  • Loading branch information
RobinL committed Nov 22, 2023
2 parents 269abf2 + 253f237 commit 42de8da
Show file tree
Hide file tree
Showing 4 changed files with 396 additions and 12 deletions.
17 changes: 7 additions & 10 deletions splink/cost_of_blocking_rules.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import logging
from typing import Dict, List, Union

import pandas as pd

logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -51,25 +49,25 @@ def calculate_field_freedom_cost(combination_of_brs: List[Dict]) -> int:


def calculate_cost_of_combination_of_brs(
br_combination: pd.DataFrame,
br_combination: List[Dict],
max_comparison_count: int,
complexity_weight: Union[int, float] = 1,
num_equi_join_weight: Union[int, float] = 1,
field_freedom_weight: Union[int, float] = 1,
num_brs_weight: Union[int, float] = 1,
num_comparison_weight: Union[int, float] = 1,
) -> dict:
"""
Calculates the cost for a given combination of blocking rules.
The cost is a weighted sum of the complexity of the rules, the count of rules,
the number of fields that are allowed to vary, and the number of rows.
The cost is a weighted sum of the number of equi joins in the rules, the count of
rules, the number of fields that are allowed to vary, and the number of rows.
Args:
br_combination (pd.DataFrame): The combination of rows outputted by
br_combination (List[Dict]): The combination of rows outputted by
find_blocking_rules_below_threshold_comparison_count.
max_comparison_count (int): The maximum comparison count amongst the rules.
This is needed to normalise the cost of more or fewer comparison rows.
complexity_weight (Union[int, float], optional): The weight for complexity.
num_equi_join_weight (Union[int, float], optional): The weight for num_equi_join
Defaults to 1.
field_freedom_weight (Union[int, float], optional): The weight for field
freedom. Defaults to 1.
Expand All @@ -81,7 +79,6 @@ def calculate_cost_of_combination_of_brs(
Returns:
dict: The calculated cost and individual component costs.
"""
br_combination = br_combination.to_dict(orient="records")

num_equi_join_cost = sum(row["num_equi_joins"] for row in br_combination)
total_row_count = sum(row["comparison_count"] for row in br_combination)
Expand All @@ -92,7 +89,7 @@ def calculate_cost_of_combination_of_brs(
field_freedom_cost = calculate_field_freedom_cost(br_combination)
num_brs_cost = len(br_combination)

num_equi_join_cost_weighted = complexity_weight * num_equi_join_cost
num_equi_join_cost_weighted = num_equi_join_weight * num_equi_join_cost
field_freedom_cost_weighted = field_freedom_weight * field_freedom_cost
num_brs_cost_weighted = num_brs_weight * num_brs_cost
num_comparison_rows_cost_weighted = num_comparison_weight * normalised_row_count
Expand Down
4 changes: 2 additions & 2 deletions splink/find_brs_with_comparison_counts_below_threshold.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def _generate_blocking_rule(
block_on = block_on_module.block_on

if len(cols_as_string) == 0:
return "1 = 1"
return block_on("1")

br = block_on(cols_as_string)

Expand Down Expand Up @@ -128,7 +128,7 @@ def _search_tree_for_blocking_rules_below_threshold_count(
'blocking_columns_sanitised':['first_name'],
'splink_blocking_rule':<Custom rule>',
'comparison_count':4827,
'complexity':1,
'num_equi_join':1,
'__fixed__first_name':1,
'__fixed__surname':0,
'__fixed__dob':0,
Expand Down
173 changes: 173 additions & 0 deletions splink/linker.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@
from .em_training_session import EMTrainingSession
from .estimate_u import estimate_u_values
from .exceptions import SplinkDeprecated, SplinkException
from .find_brs_with_comparison_counts_below_threshold import (
find_blocking_rules_below_threshold_comparison_count,
)
from .find_matches_to_new_records import add_unique_id_and_source_dataset_cols_if_needed
from .labelling_tool import (
generate_labelling_tool_comparisons,
Expand All @@ -86,6 +89,7 @@
prob_to_bayes_factor,
)
from .missingness import completeness_data, missingness_data
from .optimise_cost_of_brs import suggest_blocking_rules
from .pipeline import SQLPipeline
from .predict import predict_from_comparison_vectors_sqls
from .profile_data import profile_columns
Expand Down Expand Up @@ -3751,3 +3755,172 @@ def _remove_splinkdataframe_from_cache(self, splink_dataframe: SplinkDataFrame):

for k in keys_to_delete:
del self._intermediate_table_cache[k]

def _find_blocking_rules_below_threshold(
    self, max_comparisons_per_rule, blocking_expressions=None
):
    """Find candidate blocking rules below a comparison-count threshold.

    Thin wrapper that forwards this linker and both arguments, positionally,
    to ``find_blocking_rules_below_threshold_comparison_count`` and returns
    whatever that function returns, unchanged.

    Args:
        max_comparisons_per_rule (int): The maximum number of comparisons that
            each blocking rule is allowed to generate.
        blocking_expressions (optional): SQL expressions from which
            combinations of blocking rules are created. Passed through as-is;
            defaults to None.

    Returns:
        The result of ``find_blocking_rules_below_threshold_comparison_count``.
    """
    rules_below_threshold = find_blocking_rules_below_threshold_comparison_count(
        self, max_comparisons_per_rule, blocking_expressions
    )
    return rules_below_threshold

def _detect_blocking_rules_for_prediction(
    self,
    max_comparisons_per_rule,
    blocking_expressions=None,
    min_freedom=1,
    num_runs=200,
    num_equi_join_weight=0,
    field_freedom_weight=1,
    num_brs_weight=10,
    num_comparison_weight=10,
    return_as_df=False,
):
    """Find blocking rules for prediction below some given threshold of the
    maximum number of comparisons that can be generated per blocking rule
    (max_comparisons_per_rule).

    Uses a heuristic cost algorithm to identify the 'best' set of blocking rules.

    Args:
        max_comparisons_per_rule (int): The maximum number of comparisons that
            each blocking rule is allowed to generate.
        blocking_expressions: By default, blocking rules will be equi-joins
            on the columns used by the Splink model. This allows you to manually
            specify sql expressions from which combinations will be created. For
            example, if you specify ["substr(dob, 1,4)", "surname", "dob"]
            blocking rules will be chosen by blocking on combinations
            of those expressions.
        min_freedom (int, optional): The minimum amount of freedom any column
            should be allowed. Defaults to 1.
        num_runs (int, optional): Each run selects rows using a heuristic and
            costs them. The more runs, the more likely you are to find the best
            rule. Defaults to 200.
        num_equi_join_weight (int, optional): Weight allocated to number of equi
            joins in the blocking rules.
            Defaults to 0 since this cost is better captured by other criteria.
        field_freedom_weight (int, optional): Weight given to the cost of
            having individual fields which don't have much flexibility. Assigning
            a high weight here makes it more likely you'll generate combinations
            of blocking rules for which most fields are allowed to vary more than
            the minimum. Defaults to 1.
        num_brs_weight (int, optional): Weight assigned to the cost of
            additional blocking rules. Higher weight here will result in a
            preference for fewer blocking rules. Defaults to 10.
        num_comparison_weight (int, optional): Weight assigned to the cost of
            larger numbers of comparisons, which happens when more of the
            blocking rules are close to the max_comparisons_per_rule. A higher
            weight here prefers sets of rules which generate lower total
            comparisons. Defaults to 10.
        return_as_df (bool, optional): If false, assign recommendation to
            settings. If true, return a dataframe containing details of the
            weights. Defaults to False.

    Returns:
        The output of ``suggest_blocking_rules`` when ``return_as_df`` is true.
        Otherwise None: the first suggestion is assigned to
        ``self._settings_obj._blocking_rules_to_generate_predictions`` and
        logged, or a warning is logged if no suggestion was found.
    """

    df_br_below_thres = find_blocking_rules_below_threshold_comparison_count(
        self, max_comparisons_per_rule, blocking_expressions
    )

    blocking_rule_suggestions = suggest_blocking_rules(
        df_br_below_thres,
        min_freedom=min_freedom,
        num_runs=num_runs,
        num_equi_join_weight=num_equi_join_weight,
        field_freedom_weight=field_freedom_weight,
        num_brs_weight=num_brs_weight,
        num_comparison_weight=num_comparison_weight,
    )

    # Caller asked for the full table of suggestions; settings are not touched.
    if return_as_df:
        return blocking_rule_suggestions

    # Nothing satisfied the constraints: warn and leave the settings as they are.
    if blocking_rule_suggestions is None or len(blocking_rule_suggestions) == 0:
        logger.warning("No set of blocking rules found within constraints")
        return None

    # The first row is the one applied to the settings.
    suggestion = blocking_rule_suggestions[
        "suggested_blocking_rules_as_splink_brs"
    ].iloc[0]
    self._settings_obj._blocking_rules_to_generate_predictions = suggestion

    suggestion_str = blocking_rule_suggestions[
        "suggested_blocking_rules_for_prediction"
    ].iloc[0]
    msg = (
        "The following blocking_rules_to_generate_predictions were "
        "automatically detected and assigned to your settings:\n"
    )
    logger.info(f"{msg}{suggestion_str}")
    return None

def _detect_blocking_rules_for_em_training(
    self,
    max_comparisons_per_rule,
    min_freedom=1,
    num_runs=200,
    num_equi_join_weight=0,
    field_freedom_weight=1,
    num_brs_weight=20,
    num_comparison_weight=10,
    return_as_df=False,
):
    """Find blocking rules for EM training below some given threshold of the
    maximum number of comparisons that can be generated per blocking rule
    (max_comparisons_per_rule).

    Uses a heuristic cost algorithm to identify the 'best' set of blocking rules.

    Args:
        max_comparisons_per_rule (int): The maximum number of comparisons that
            each blocking rule is allowed to generate.
        min_freedom (int, optional): The minimum amount of freedom any column
            should be allowed. Defaults to 1.
        num_runs (int, optional): Each run selects rows using a heuristic and
            costs them. The more runs, the more likely you are to find the best
            rule. Defaults to 200.
        num_equi_join_weight (int, optional): Weight allocated to number of equi
            joins in the blocking rules.
            Defaults to 0 since this cost is better captured by other criteria.
        field_freedom_weight (int, optional): Weight given to the cost of
            having individual fields which don't have much flexibility. Assigning
            a high weight here makes it more likely you'll generate combinations
            of blocking rules for which most fields are allowed to vary more than
            the minimum. Defaults to 1.
        num_brs_weight (int, optional): Weight assigned to the cost of
            additional blocking rules. Higher weight here will result in a
            preference for fewer blocking rules. Defaults to 20.
        num_comparison_weight (int, optional): Weight assigned to the cost of
            larger numbers of comparisons, which happens when more of the
            blocking rules are close to the max_comparisons_per_rule. A higher
            weight here prefers sets of rules which generate lower total
            comparisons. Defaults to 10.
        return_as_df (bool, optional): If false, return just the recommendation.
            If true, return a dataframe containing details of the weights.
            Defaults to False.

    Returns:
        The output of ``suggest_blocking_rules`` when ``return_as_df`` is true.
        Otherwise the first entry of its
        ``suggested_blocking_rules_as_splink_brs`` column, or None (with a
        warning logged) if no suggestion was found.
    """

    df_br_below_thres = find_blocking_rules_below_threshold_comparison_count(
        self, max_comparisons_per_rule
    )

    blocking_rule_suggestions = suggest_blocking_rules(
        df_br_below_thres,
        min_freedom=min_freedom,
        num_runs=num_runs,
        num_equi_join_weight=num_equi_join_weight,
        field_freedom_weight=field_freedom_weight,
        num_brs_weight=num_brs_weight,
        num_comparison_weight=num_comparison_weight,
    )

    # Caller asked for the full table of suggestions.
    if return_as_df:
        return blocking_rule_suggestions

    # Nothing satisfied the constraints: warn and return no recommendation.
    if blocking_rule_suggestions is None or len(blocking_rule_suggestions) == 0:
        logger.warning("No set of blocking rules found within constraints")
        return None

    # Log the human-readable training statements for the first suggestion.
    suggestion_str = blocking_rule_suggestions[
        "suggested_EM_training_statements"
    ].iloc[0]
    msg = "The following EM training strategy was detected:\n"
    msg = f"{msg}{suggestion_str}"
    logger.info(msg)

    # Return the same suggestion as blocking rule objects.
    suggestion = blocking_rule_suggestions[
        "suggested_blocking_rules_as_splink_brs"
    ].iloc[0]
    return suggestion
Loading

0 comments on commit 42de8da

Please sign in to comment.