diff --git a/pyproject.toml b/pyproject.toml index b614c65eba..11f809acf8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "splink" -version = "0.4.1" +version = "0.4.2" description = "Implementation in Apache Spark of the EM algorithm to estimate parameters of Fellegi-Sunter's canonical model of record linkage." authors = ["Robin Linacre ", "Sam Lindsay", "Theodore Manassis"] license = "MIT" diff --git a/splink/estimate.py b/splink/estimate.py index 566e0acef6..7282d26983 100644 --- a/splink/estimate.py +++ b/splink/estimate.py @@ -105,6 +105,8 @@ def estimate_u_values( for i, col in enumerate(orig_settings["comparison_columns"]): u_probs = new_settings["comparison_columns"][i]["u_probabilities"] + # Ensure non-zero u (https://github.com/moj-analytical-services/splink/issues/161) + u_probs = [u or 1/target_rows for u in u_probs] col["u_probabilities"] = u_probs return orig_settings diff --git a/splink/settings.py b/splink/settings.py index f9a909cf39..8c4828d07f 100644 --- a/splink/settings.py +++ b/splink/settings.py @@ -162,6 +162,19 @@ def _complete_probabilities(col_settings: dict, setting_name: str): levels = col_settings["num_levels"] probs = col_settings[setting_name] + # Check for m and u manually set to zero (https://github.com/moj-analytical-services/splink/issues/161) + if not all(col_settings[setting_name]): + if "custom_name" in col_settings: + col_name = col_settings["custom_name"] + else: + col_name = col_settings["col_name"] + warnings.warn( + f"Your {setting_name} for {col_name} include zeroes. " + f"Where {letter}=0 for a given level, it remains fixed rather than being estimated " + "along with other model parameters, and all comparisons at this level " + f"are assigned a match score of {1. if letter=='u' else 0.}, regardless of other comparisons columns." + ) + if len(probs) != levels: raise ValueError( f"Number of {setting_name} provided is not equal to number of levels specified"