Skip to content

Commit

Permalink
Merge pull request #162 from moj-analytical-services/sticky_probs
Browse files Browse the repository at this point in the history
Non-zero estimate_u_values
  • Loading branch information
samnlindsay committed Dec 24, 2020
2 parents 5852fd8 + f2578f6 commit 19f36ed
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 1 deletion.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "splink"
version = "0.4.1"
version = "0.4.2"
description = "Implementation in Apache Spark of the EM algorithm to estimate parameters of Fellegi-Sunter's canonical model of record linkage."
authors = ["Robin Linacre <robinlinacre@hotmail.com>", "Sam Lindsay", "Theodore Manassis"]
license = "MIT"
Expand Down
2 changes: 2 additions & 0 deletions splink/estimate.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ def estimate_u_values(

for i, col in enumerate(orig_settings["comparison_columns"]):
u_probs = new_settings["comparison_columns"][i]["u_probabilities"]
# Ensure non-zero u (https://github.com/moj-analytical-services/splink/issues/161)
u_probs = [u or 1/target_rows for u in u_probs]
col["u_probabilities"] = u_probs

return orig_settings
13 changes: 13 additions & 0 deletions splink/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,19 @@ def _complete_probabilities(col_settings: dict, setting_name: str):
levels = col_settings["num_levels"]
probs = col_settings[setting_name]

# Check for m and u manually set to zero (https://github.com/moj-analytical-services/splink/issues/161)
if not all(col_settings[setting_name]):
    # Prefer the user-supplied display name when the column defines one.
    if "custom_name" in col_settings:
        col_name = col_settings["custom_name"]
    else:
        col_name = col_settings["col_name"]
    # Adjacent string literals are concatenated verbatim, so every fragment
    # carries its own trailing space; without it the words run together
    # ("zeroes.Where", "estimatedalong", "levelare").
    warnings.warn(
        f"Your {setting_name} for {col_name} include zeroes. "
        f"Where {letter}=0 for a given level, it remains fixed rather than being estimated "
        "along with other model parameters, and all comparisons at this level "
        f"are assigned a match score of {1. if letter=='u' else 0.}, regardless of other comparisons columns."
    )

if len(probs) != levels:
raise ValueError(
f"Number of {setting_name} provided is not equal to number of levels specified"
Expand Down

0 comments on commit 19f36ed

Please sign in to comment.