From afdce1d43dfa2ca9dc49f771587fb29f9e29ecda Mon Sep 17 00:00:00 2001
From: Sam Lindsay <sam.lindsay@digital.justice.gov.uk>
Date: Thu, 24 Dec 2020 14:00:36 +0000
Subject: [PATCH 1/4] Non-zero estimate_u_values

---
 splink/estimate.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/splink/estimate.py b/splink/estimate.py
index 566e0acef6..b959e6018b 100644
--- a/splink/estimate.py
+++ b/splink/estimate.py
@@ -105,6 +105,8 @@ def estimate_u_values(
 
     for i, col in enumerate(orig_settings["comparison_columns"]):
         u_probs = new_settings["comparison_columns"][i]["u_probabilities"]
+        # Ensure non-zero u
+        u_probs = [u or 1/target_rows for u in u_probs]
         col["u_probabilities"] = u_probs
 
     return orig_settings

From d0fe1d06bf741bae40a19072cd6b2fb76aefa6c7 Mon Sep 17 00:00:00 2001
From: Sam Lindsay <sam.lindsay@digital.justice.gov.uk>
Date: Thu, 24 Dec 2020 14:47:31 +0000
Subject: [PATCH 2/4] Warning about zero m or u

---
 splink/settings.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/splink/settings.py b/splink/settings.py
index f9a909cf39..7b3e4f6857 100644
--- a/splink/settings.py
+++ b/splink/settings.py
@@ -161,6 +161,18 @@ def _complete_probabilities(col_settings: dict, setting_name: str):
     else:
         levels = col_settings["num_levels"]
         probs = col_settings[setting_name]
+        
+        if not all(col_settings[setting_name]):
+            if "custom_name" in col_settings:
+                col_name = col_settings["custom_name"]
+            else:
+                col_name = col_settings["col_name"]
+            warnings.warn(
+                f"Your {setting_name} for {col_name} include zeroes.")
+                f"Where {letter}=0 for a given level, it remains fixed rather than being estimated")
+                "along with other model parameters, and all comparisons at this level")
+                f"are assigned a match score of {1. if letter=='u' else 0.}, regardless of other comparisons columns.")
+            )
 
         if len(probs) != levels:
             raise ValueError(

From a109c043f38ad8fe5318067c4bec3894ac1a70db Mon Sep 17 00:00:00 2001
From: Sam Lindsay <sam.lindsay@digital.justice.gov.uk>
Date: Thu, 24 Dec 2020 14:54:32 +0000
Subject: [PATCH 3/4] Fix syntax errors (stray parentheses) in zero m/u warning

---
 splink/settings.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/splink/settings.py b/splink/settings.py
index 7b3e4f6857..7ae6d1fc2f 100644
--- a/splink/settings.py
+++ b/splink/settings.py
@@ -161,18 +161,18 @@ def _complete_probabilities(col_settings: dict, setting_name: str):
     else:
         levels = col_settings["num_levels"]
         probs = col_settings[setting_name]
-        
+
         if not all(col_settings[setting_name]):
             if "custom_name" in col_settings:
                 col_name = col_settings["custom_name"]
             else:
                 col_name = col_settings["col_name"]
             warnings.warn(
-                f"Your {setting_name} for {col_name} include zeroes.")
-                f"Where {letter}=0 for a given level, it remains fixed rather than being estimated")
-                "along with other model parameters, and all comparisons at this level")
-                f"are assigned a match score of {1. if letter=='u' else 0.}, regardless of other comparisons columns.")
-            )
+                f"Your {setting_name} for {col_name} include zeroes. "
+                f"Where {letter}=0 for a given level, it remains fixed rather than being estimated "
+                "along with other model parameters, and all comparisons at this level "
+                f"are assigned a match score of {1. if letter=='u' else 0.}, regardless of other comparison columns."
+                )
 
         if len(probs) != levels:
             raise ValueError(

From f2578f6a9f2d857cf2023d1b04668452845e6422 Mon Sep 17 00:00:00 2001
From: Sam Lindsay <sam.lindsay@digital.justice.gov.uk>
Date: Thu, 24 Dec 2020 15:06:21 +0000
Subject: [PATCH 4/4] Comment issue URL and bump version

---
 pyproject.toml     | 2 +-
 splink/estimate.py | 2 +-
 splink/settings.py | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index b614c65eba..11f809acf8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "splink"
-version = "0.4.1"
+version = "0.4.2"
 description = "Implementation in Apache Spark of the EM algorithm to estimate parameters of Fellegi-Sunter's canonical model of record linkage."
 authors = ["Robin Linacre <robinlinacre@hotmail.com>", "Sam Lindsay", "Theodore Manassis"]
 license = "MIT"
diff --git a/splink/estimate.py b/splink/estimate.py
index b959e6018b..7282d26983 100644
--- a/splink/estimate.py
+++ b/splink/estimate.py
@@ -105,7 +105,7 @@ def estimate_u_values(
 
     for i, col in enumerate(orig_settings["comparison_columns"]):
         u_probs = new_settings["comparison_columns"][i]["u_probabilities"]
-        # Ensure non-zero u
+        # Ensure non-zero u (https://github.com/moj-analytical-services/splink/issues/161)
         u_probs = [u or 1/target_rows for u in u_probs]
         col["u_probabilities"] = u_probs
 
diff --git a/splink/settings.py b/splink/settings.py
index 7ae6d1fc2f..8c4828d07f 100644
--- a/splink/settings.py
+++ b/splink/settings.py
@@ -162,6 +162,7 @@ def _complete_probabilities(col_settings: dict, setting_name: str):
         levels = col_settings["num_levels"]
         probs = col_settings[setting_name]
 
+        # Check for m and u manually set to zero (https://github.com/moj-analytical-services/splink/issues/161)
         if not all(col_settings[setting_name]):
             if "custom_name" in col_settings:
                 col_name = col_settings["custom_name"]