diff --git a/.gitignore b/.gitignore index 5a20b5a9a3..46f04bd44d 100644 --- a/.gitignore +++ b/.gitignore @@ -143,3 +143,6 @@ dmypy.json # gitignore exception for docs files (otherwise docs/build does not get saved in repo !docs/* + +# some files created by certain tests need to not be uploaded to the repo +saved_model.json diff --git a/saved_model.json b/saved_model.json deleted file mode 100644 index f1ec10c38e..0000000000 --- a/saved_model.json +++ /dev/null @@ -1,229 +0,0 @@ -{ - "current_params": { - "\u03bb": 0.4610420763492584, - "\u03c0": { - "gamma_surname": { - "gamma_index": 0, - "desc": "Comparison of surname", - "column_name": "surname", - "custom_comparison": false, - "num_levels": 2, - "prob_dist_match": { - "level_0": { - "value": 0, - "probability": 0.3785862326622009 - }, - "level_1": { - "value": 1, - "probability": 0.6214137673377991 - } - }, - "prob_dist_non_match": { - "level_0": { - "value": 0, - "probability": 0.6038612723350525 - }, - "level_1": { - "value": 1, - "probability": 0.3961387276649475 - } - } - }, - "gamma_mob": { - "gamma_index": 1, - "desc": "Comparison of mob", - "column_name": "mob", - "custom_comparison": false, - "num_levels": 2, - "prob_dist_match": { - "level_0": { - "value": 0, - "probability": 0.2493358850479126 - }, - "level_1": { - "value": 1, - "probability": 0.7506641149520874 - } - }, - "prob_dist_non_match": { - "level_0": { - "value": 0, - "probability": 0.25056809186935425 - }, - "level_1": { - "value": 1, - "probability": 0.7494319081306458 - } - } - } - } - }, - "historical_params": [ - { - "\u03bb": 0.3, - "\u03c0": { - "gamma_surname": { - "gamma_index": 0, - "desc": "Comparison of surname", - "column_name": "surname", - "custom_comparison": false, - "num_levels": 2, - "prob_dist_match": { - "level_0": { - "value": 0, - "probability": 0.1 - }, - "level_1": { - "value": 1, - "probability": 0.9 - } - }, - "prob_dist_non_match": { - "level_0": { - "value": 0, - "probability": 0.9 - }, - "level_1": { - "value": 1, - "probability": 0.1 - } - } - }, - "gamma_mob": { - "gamma_index": 1, - "desc": "Comparison of mob", - "column_name": "mob", - "custom_comparison": false, - "num_levels": 2, - "prob_dist_match": { - "level_0": { - "value": 0, - "probability": 0.1 - }, - "level_1": { - "value": 1, - "probability": 0.9 - } - }, - "prob_dist_non_match": { - "level_0": { - "value": 0, - "probability": 0.9 - }, - "level_1": { - "value": 1, - "probability": 0.1 - } - } - } - } - }, - { - "\u03bb": 0.46799999475479126, - "\u03c0": { - "gamma_surname": { - "gamma_index": 0, - "desc": "Comparison of surname", - "column_name": "surname", - "custom_comparison": false, - "num_levels": 2, - "prob_dist_match": { - "level_0": { - "value": 0, - "probability": 0.3205128312110901 - }, - "level_1": { - "value": 1, - "probability": 0.6794871687889099 - } - }, - "prob_dist_non_match": { - "level_0": { - "value": 0, - "probability": 0.6578947305679321 - }, - "level_1": { - "value": 1, - "probability": 0.34210526943206787 - } - } - }, - "gamma_mob": { - "gamma_index": 1, - "desc": "Comparison of mob", - "column_name": "mob", - "custom_comparison": false, - "num_levels": 2, - "prob_dist_match": { - "level_0": { - "value": 0, - "probability": 0.16025641560554504 - }, - "level_1": { - "value": 1, - "probability": 0.8397436141967773 - } - }, - "prob_dist_non_match": { - "level_0": { - "value": 0, - "probability": 0.32894736528396606 - }, - "level_1": { - "value": 1, - "probability": 0.6710526347160339 - } - } - } - } - } - ], - "settings": { - "link_type": "dedupe_only", - "comparison_columns": [ - { - "col_name": "surname", - "gamma_index": 0, - "num_levels": 2, - "data_type": "string", - "term_frequency_adjustments": false, - "case_expression": "case when surname_l is null or surname_r is null then -1 when surname_l = surname_r then 1 else 0 end as gamma_surname", - "m_probabilities": [ - 0.1, - 0.9 - ], - "u_probabilities": [ - 0.9, - 0.1 - ] - }, - { - "col_name": "mob", - "gamma_index": 1, - "num_levels": 2, - "data_type": "string", - "term_frequency_adjustments": false, - "case_expression": "case when mob_l is null or mob_r is null then -1 when mob_l = mob_r then 1 else 0 end as gamma_mob", - "m_probabilities": [ - 0.1, - 0.9 - ], - "u_probabilities": [ - 0.9, - 0.1 - ] - } - ], - "blocking_rules": [ - "l.mob = r.mob", - "l.surname = r.surname" - ], - "max_iterations": 2, - "em_convergence": 0.0001, - "unique_id_column_name": "unique_id", - "additional_columns_to_retain": [], - "retain_matching_columns": true, - "retain_intermediate_calculation_columns": true, - "proportion_of_matches": 0.3 - } -} \ No newline at end of file diff --git a/splink/term_frequencies.py b/splink/term_frequencies.py index 203b7ad704..aecdc4ece7 100644 --- a/splink/term_frequencies.py +++ b/splink/term_frequencies.py @@ -3,6 +3,7 @@ import logging import math +import warnings try: from pyspark.sql.dataframe import DataFrame @@ -54,7 +55,7 @@ def sql_gen_generate_adjusted_lambda(column_name, params, table_name="df_e"): m = params.params["π"][f"gamma_{column_name}"]["prob_dist_match"][f"level_{max_level}"]["probability"] u = params.params["π"][f"gamma_{column_name}"]["prob_dist_non_match"][f"level_{max_level}"]["probability"] - # ensure average adj calculation doesnt divide by zero +# ensure average adj calculation doesnt divide by zero (see issue 118) if ( math.isclose((m+u), 0.0, rel_tol=1e-9, abs_tol=0.0)): average_adjustment = 0.5 warnings.warn( f" Is most of column {column_name} or all of it comprised of NULL values??? There are levels where no comparisons are found.") diff --git a/tests/test_adj.py b/tests/test_adj.py index 38c99e2039..66a5125690 100644 --- a/tests/test_adj.py +++ b/tests/test_adj.py @@ -3,6 +3,7 @@ import pandas as pd import pyspark.sql.functions as f import pyspark +import warnings @pytest.fixture(scope="module") @@ -109,6 +110,7 @@ def test_freq_adj_divzero(spark, sparkdf): # create column weird in a way that could trigger a div by zero on the average adj calculation before the fix sparkdf = sparkdf.withColumn("weird",f.lit(None)) + try: linker = Splink(settings, spark, df=sparkdf) notpassing = False