# _m_ and _u_ probabilities
### TO DO:
1. Load settings from saved params
2. Function to add m and u probs to existing settings from saved params (e.g. `settings_with_m_u(settings, params)` with `params` from saved params)

In [None]:
import pandas as pd 
# Raise pandas' display limits for this notebook: up to 500 columns and
# 100 rows are printed before output is truncated.
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 100)

from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import StructType
import pyspark.sql.functions as f

# Build the Spark configuration in one chained expression
# (SparkConf.set returns the conf itself).
conf = (
    SparkConf()
    # The jar provides extended string comparison functions such as
    # Jaro Winkler; they are registered as Spark SQL UDFs further down.
    .set('spark.driver.extraClassPath', 'jars/scala-udf-similarity-0.0.6.jar')
    .set('spark.jars', 'jars/scala-udf-similarity-0.0.6.jar')
    # WARNING:
    # The two options below are appropriate only if you're running Spark
    # locally!
    .set('spark.driver.memory', '4g')
    .set("spark.sql.shuffle.partitions", "8")
)

# Reuse an already-running SparkContext if there is one, otherwise start one
# with the configuration above, then wrap it in a SparkSession.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

 # Register UDFs
from pyspark.sql import types
# Make the Java classes callable from Spark SQL under the names
# `jaro_winkler_sim` (returns a double) and `Dmetaphone` (returns a string).
spark.udf.registerJavaFunction(
    name='jaro_winkler_sim',
    javaClassName='uk.gov.moj.dash.linkage.JaroWinklerSimilarity',
    returnType=types.DoubleType(),
)
spark.udf.registerJavaFunction(
    name='Dmetaphone',
    javaClassName='uk.gov.moj.dash.linkage.DoubleMetaphone',
    returnType=types.StringType(),
)

### Ingredients:
- input settings
- complete settings (with missing fields populated by defaults - incl. m and u probs)
- saved params (including param history, and complete input settings dict)

In [None]:
# Baseline input settings used throughout this notebook.
input_settings = {
    "link_type": "dedupe_only",
    "blocking_rules": [
        "l.first_name = r.first_name",
        "l.surname = r.surname",
        "l.dob = r.dob",
    ],
    "comparison_columns": [
        # Name columns: three levels, term-frequency adjustments switched on.
        {"col_name": "first_name", "num_levels": 3, "term_frequency_adjustments": True},
        {"col_name": "surname", "num_levels": 3, "term_frequency_adjustments": True},
        # Only the column name is given for the remaining comparisons.
        {"col_name": "dob"},
        {"col_name": "city"},
        {"col_name": "email"},
    ],
    "additional_columns_to_retain": ["group"],
}

### Alternative `input_settings` (for testing)
Includes:
- NEW comparison column NOT in the saved params (`full_name`)
- MISSING column that IS in the saved params (`city`)
- "custom column" example (`full_name`)
- different number of levels from saved params (`email`)

In [None]:
# Alternative input settings for testing: adds a custom `full_name`
# comparison, omits `city`, and gives `email` an explicit number of levels.
input_settings2 = {
    "link_type": "dedupe_only",
    "blocking_rules": [
        "l.first_name = r.first_name",
        "l.surname = r.surname",
        "l.dob = r.dob",
    ],
    "comparison_columns": [
        {"col_name": "first_name", "num_levels": 3, "term_frequency_adjustments": True},
        # Custom comparison: 1 when the concatenated first name + surname are
        # equal on both sides, 0 otherwise.
        {
            "custom_name": "full_name",
            "custom_columns_used": ["first_name", "surname"],
            "num_levels": 2,
            "case_expression": """
                case when concat(first_name_l, surname_l) = concat(first_name_r, surname_r) then 1
                else 0 end
            """,
            "term_frequency_adjustments": True,
        },
        {"col_name": "dob"},
        {"col_name": "email", "num_levels": 3},
    ],
    "additional_columns_to_retain": ["group"],
}

In [None]:
from splink.settings import complete_settings_dict
from splink import Params, load_params_from_json

# The three "ingredients" listed above.
# Input settings run through complete_settings_dict (per the notes above,
# missing fields are populated by defaults, incl. m and u probs).
complete_settings = complete_settings_dict(input_settings, spark)
# A Params object constructed directly from the input settings.
generated_params = Params(input_settings, spark)
# Params read back from "saved_params.json" — presumably written by an
# earlier job; the file must exist in the working directory.
saved_params = load_params_from_json("saved_params.json")

## 1) "_I'm setting up a new job and I want to use the results of another job as default where applicable_"
### Update input settings with saved `m` and `u` probabilities

Potential gotchas:
- Column names don't match
- Number of levels is missing or doesn't match the saved params
- Custom columns

In [None]:
from splink.validate import _get_default_value

def add_saved_m_and_u(settings, spark, json_path):
    """Return a copy of ``settings`` with saved m and u probabilities filled in.

    For each entry of ``settings["comparison_columns"]`` the saved parameters
    are looked up under ``"gamma_" + name``, where ``name`` is the entry's
    ``col_name`` or, failing that, its ``custom_name``. If a saved entry exists
    and its ``num_levels`` equals the entry's ``num_levels`` (or the default
    from ``_get_default_value`` when the entry has none), the saved match and
    non-match probabilities are stored under ``m_probabilities`` and
    ``u_probabilities``.

    Entries with no saved counterpart are left untouched. Entries whose
    number of levels differs are left untouched and a message is printed.

    Args:
        settings: Settings dict containing a ``"comparison_columns"`` list.
            It is not modified; the function works on a deep copy.
        spark: Currently unused. Kept so existing calls keep working.
        json_path: Path handed to ``load_params_from_json``.

    Returns:
        A new settings dict with the probabilities added where applicable.

    Raises:
        KeyError: If ``settings`` has no ``"comparison_columns"`` key, or a
            comparison entry has neither ``col_name`` nor ``custom_name``.
    """
    # Function-scope import so the cell does not depend on a separate import.
    from copy import deepcopy

    saved_pi = load_params_from_json(json_path).params["π"]

    # Work on a copy: the caller's dict (and its nested column dicts) must not
    # change as a side effect of calling this function.
    settings = deepcopy(settings)

    for comp in settings["comparison_columns"]:
        name = comp["col_name"] if "col_name" in comp else comp["custom_name"]
        label = "gamma_" + name

        if "num_levels" in comp:
            num_levels = comp["num_levels"]
        else:
            num_levels = _get_default_value("num_levels", is_column_setting=True)

        if label not in saved_pi:
            # Nothing was saved for this comparison; leave it as it is.
            continue

        saved = saved_pi[label]
        if num_levels != saved["num_levels"]:
            print(f"{label}: Saved m and u probabilities do not match the specified number of levels ({num_levels}) - default probabilities will be used")
            continue

        # NOTE(review): this relies on the saved distributions listing their
        # levels in ascending order (dict insertion order) — confirm.
        comp["m_probabilities"] = [
            level["probability"] for level in saved["prob_dist_match"].values()
        ]
        comp["u_probabilities"] = [
            level["probability"] for level in saved["prob_dist_non_match"].values()
        ]

    return settings

In [None]:
# Apply the m/u probabilities stored in "saved_params.json" to the alternative
# test settings, then echo the resulting settings dict as the cell output.
new_settings = add_saved_m_and_u(input_settings2, spark, "saved_params.json")
new_settings

In [None]:
# Run the updated settings through complete_settings_dict and display the
# result (per the notes above, still-missing fields are populated by defaults).
complete_settings_dict(new_settings, spark)

## 2) "_I'm restarting/re-running a job and want to pick up where the parameters finished_"
### As above but `settings` also comes from "saved_params.json"

In [None]:
# Take the settings stored inside the saved params (saved_params is loaded in
# an earlier cell): the complete input settings, with default m and u probs.
saved_settings = saved_params.settings

# Add the saved m/u probabilities from the same file to those settings;
# the returned dict is displayed as the cell output.
add_saved_m_and_u(saved_settings, spark, "saved_params.json")