From afdce1d43dfa2ca9dc49f771587fb29f9e29ecda Mon Sep 17 00:00:00 2001 From: Sam Lindsay Date: Thu, 24 Dec 2020 14:00:36 +0000 Subject: [PATCH 1/4] Non-zero estimate_u_values --- splink/estimate.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/splink/estimate.py b/splink/estimate.py index 566e0acef6..b959e6018b 100644 --- a/splink/estimate.py +++ b/splink/estimate.py @@ -105,6 +105,8 @@ def estimate_u_values( for i, col in enumerate(orig_settings["comparison_columns"]): u_probs = new_settings["comparison_columns"][i]["u_probabilities"] + # Ensure non-zero u + u_probs = [u or 1/target_rows for u in u_probs] col["u_probabilities"] = u_probs return orig_settings From d0fe1d06bf741bae40a19072cd6b2fb76aefa6c7 Mon Sep 17 00:00:00 2001 From: Sam Lindsay Date: Thu, 24 Dec 2020 14:47:31 +0000 Subject: [PATCH 2/4] Warning about zero m or u --- splink/settings.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/splink/settings.py b/splink/settings.py index f9a909cf39..7b3e4f6857 100644 --- a/splink/settings.py +++ b/splink/settings.py @@ -161,6 +161,18 @@ def _complete_probabilities(col_settings: dict, setting_name: str): else: levels = col_settings["num_levels"] probs = col_settings[setting_name] + + if not all(col_settings[setting_name]): + if "custom_name" in col_settings: + col_name = col_settings["custom_name"] + else: + col_name = col_settings["col_name"] + warnings.warn( + f"Your {setting_name} for {col_name} include zeroes.") + f"Where {letter}=0 for a given level, it remains fixed rather than being estimated") + "along with other model parameters, and all comparisons at this level") + f"are assigned a match score of {1. if letter=='u' else 0.}, regardless of other comparisons columns.") + ) if len(probs) != levels: raise ValueError( From a109c043f38ad8fe5318067c4bec3894ac1a70db Mon Sep 17 00:00:00 2001 From: Sam Lindsay Date: Thu, 24 Dec 2020 14:54:32 +0000 Subject: [PATCH 3/4] typos --- splink/settings.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/splink/settings.py b/splink/settings.py index 7b3e4f6857..7ae6d1fc2f 100644 --- a/splink/settings.py +++ b/splink/settings.py @@ -161,18 +161,18 @@ def _complete_probabilities(col_settings: dict, setting_name: str): else: levels = col_settings["num_levels"] probs = col_settings[setting_name] - + if not all(col_settings[setting_name]): if "custom_name" in col_settings: col_name = col_settings["custom_name"] else: col_name = col_settings["col_name"] warnings.warn( - f"Your {setting_name} for {col_name} include zeroes.") - f"Where {letter}=0 for a given level, it remains fixed rather than being estimated") - "along with other model parameters, and all comparisons at this level") - f"are assigned a match score of {1. if letter=='u' else 0.}, regardless of other comparisons columns.") - ) + f"Your {setting_name} for {col_name} include zeroes." + f"Where {letter}=0 for a given level, it remains fixed rather than being estimated" + "along with other model parameters, and all comparisons at this level" + f"are assigned a match score of {1. if letter=='u' else 0.}, regardless of other comparisons columns." + ) if len(probs) != levels: raise ValueError( From f2578f6a9f2d857cf2023d1b04668452845e6422 Mon Sep 17 00:00:00 2001 From: Sam Lindsay Date: Thu, 24 Dec 2020 15:06:21 +0000 Subject: [PATCH 4/4] Comment issue URL and bump version --- pyproject.toml | 2 +- splink/estimate.py | 2 +- splink/settings.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b614c65eba..11f809acf8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "splink" -version = "0.4.1" +version = "0.4.2" description = "Implementation in Apache Spark of the EM algorithm to estimate parameters of Fellegi-Sunter's canonical model of record linkage." authors = ["Robin Linacre ", "Sam Lindsay", "Theodore Manassis"] license = "MIT" diff --git a/splink/estimate.py b/splink/estimate.py index b959e6018b..7282d26983 100644 --- a/splink/estimate.py +++ b/splink/estimate.py @@ -105,7 +105,7 @@ def estimate_u_values( for i, col in enumerate(orig_settings["comparison_columns"]): u_probs = new_settings["comparison_columns"][i]["u_probabilities"] - # Ensure non-zero u + # Ensure non-zero u (https://github.com/moj-analytical-services/splink/issues/161) u_probs = [u or 1/target_rows for u in u_probs] col["u_probabilities"] = u_probs diff --git a/splink/settings.py b/splink/settings.py index 7ae6d1fc2f..8c4828d07f 100644 --- a/splink/settings.py +++ b/splink/settings.py @@ -162,6 +162,7 @@ def _complete_probabilities(col_settings: dict, setting_name: str): levels = col_settings["num_levels"] probs = col_settings[setting_name] + # Check for m and u manually set to zero (https://github.com/moj-analytical-services/splink/issues/161) if not all(col_settings[setting_name]): if "custom_name" in col_settings: col_name = col_settings["custom_name"]