Suggested solution

moj-analytical-services · Jun 24, 2020 · a3db597 · a3db597
1 parent 1384fcb
commit a3db597
Showing 1 changed file with 79 additions and 7 deletions.
diff --git a/settings_with_m_u.ipynb b/settings_with_m_u.ipynb
@@ -167,7 +167,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 1) \"_I'm setting up a new job and I want to use the results of another job as default where applicable_\"\n",
+    "## USE CASE 1) \"_I'm setting up a new job and I want to use the results of another job as default where applicable_\"\n",
     "### Update input settings with saved `m` and `u` probabilities\n",
     "\n",
     "Potential gotchas:\n",
@@ -184,9 +184,7 @@
    "source": [
     "from splink.validate import _get_default_value\n",
     "\n",
-    "def add_saved_m_and_u(settings, spark, json_path):\n",
-    "    \n",
-    "    #settings = complete_settings_dict(settings, spark)\n",
+    "def add_saved_m_and_u(settings, json_path):\n",
     "    saved_params = load_params_from_json(json_path)\n",
     "    \n",
     "    for comp in settings[\"comparison_columns\"]:\n",
@@ -222,7 +220,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "new_settings = add_saved_m_and_u(input_settings2, spark, \"saved_params.json\")\n",
+    "new_settings = add_saved_m_and_u(input_settings2, \"saved_params.json\")\n",
     "new_settings"
    ]
   },
@@ -239,7 +237,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 2) \"_I'm restarting/re-running a job and want to pick up where the parameters finished_\"\n",
+    "## USE CASE 2) \"_I'm restarting/re-running a job and want to pick up where the parameters finished_\"\n",
     "### As above but `settings` also comes from \"saved_params.json\""
    ]
   },
@@ -252,7 +250,81 @@
     "# complete input settings (default m and u probs)\n",
     "saved_settings = saved_params.settings\n",
     "\n",
-    "add_saved_m_and_u(saved_settings, spark, \"saved_params.json\")"
+    "add_saved_m_and_u(saved_settings, \"saved_params.json\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Possible solution\n",
+    "A single function that covers both use cases above (bad naming and bad coding practice notwithstanding)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_or_update_settings(json_path, settings=None):\n",
+    "    \"\"\"Copy saved m and u probabilities into a settings dict.\n",
+    "\n",
+    "    Parameters are read with `load_params_from_json(json_path)`. When\n",
+    "    `settings` is falsy (None or empty), the settings stored on the loaded\n",
+    "    params object are used instead.\n",
+    "\n",
+    "    For each entry in `settings['comparison_columns']` a label is built as\n",
+    "    'gamma_' + `col_name` (or `custom_name` when `col_name` is absent). If the\n",
+    "    saved params hold that label and its `num_levels` equals the column's\n",
+    "    (or the default when unspecified), `m_probabilities` and\n",
+    "    `u_probabilities` are overwritten with the saved values; on a mismatch a\n",
+    "    message is printed and the column is left untouched. Labels with no\n",
+    "    saved entry are skipped silently.\n",
+    "\n",
+    "    Note: `settings` is modified in place and the same object is returned.\n",
+    "    \"\"\"\n",
+    "    loaded = load_params_from_json(json_path)\n",
+    "    settings = settings or loaded.settings\n",
+    "\n",
+    "    for column in settings[\"comparison_columns\"]:\n",
+    "        name_key = \"col_name\" if \"col_name\" in column else \"custom_name\"\n",
+    "        label = \"gamma_\" + column[name_key]\n",
+    "\n",
+    "        num_levels = (\n",
+    "            column[\"num_levels\"]\n",
+    "            if \"num_levels\" in column\n",
+    "            else _get_default_value(\"num_levels\", is_column_setting=True)\n",
+    "        )\n",
+    "\n",
+    "        # Nothing saved for this comparison column: leave it as it is\n",
+    "        if label not in loaded.params[\"π\"]:\n",
+    "            continue\n",
+    "        saved_column = loaded.params[\"π\"][label]\n",
+    "\n",
+    "        if num_levels != saved_column[\"num_levels\"]:\n",
+    "            print(f\"{label}: Saved m and u probabilities do not match the specified number of levels ({num_levels}) - default probabilities will be used\")\n",
+    "            continue\n",
+    "\n",
+    "        # Build both lists before assigning so a lookup failure leaves the column unchanged\n",
+    "        m_probs = [level['probability'] for level in saved_column[\"prob_dist_match\"].values()]\n",
+    "        u_probs = [level['probability'] for level in saved_column[\"prob_dist_non_match\"].values()]\n",
+    "        column[\"m_probabilities\"] = m_probs\n",
+    "        column[\"u_probabilities\"] = u_probs\n",
+    "\n",
+    "    return settings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# update input settings with saved parameters\n",
+    "# (get_or_update_settings modifies the dict it is given and returns that same dict)\n",
+    "updated_settings = get_or_update_settings(\"saved_params.json\", input_settings2)\n",
+    "\n",
+    "# get previous settings and parameters\n",
+    "previous_settings = get_or_update_settings(\"saved_params.json\")\n",
+    "\n",
+    "# keep both results under a name for later cells; display the last one, as before\n",
+    "previous_settings"
    ]
   },
   {