Skip to content

Commit

Permalink
Merge pull request #181 from moj-analytical-services/waterfall_chart
Browse files Browse the repository at this point in the history
Bayes factor waterfall chart (intuition report)
  • Loading branch information
samnlindsay committed Mar 3, 2021
2 parents 3fd95fc + 254fea7 commit 99d0a55
Show file tree
Hide file tree
Showing 3 changed files with 274 additions and 1 deletion.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "splink"
version = "1.0.3"
version = "1.0.4"
description = "Implementation in Apache Spark of the EM algorithm to estimate parameters of Fellegi-Sunter's canonical model of record linkage."
authors = ["Robin Linacre <robinlinacre@hotmail.com>", "Sam Lindsay", "Theodore Manassis"]
license = "MIT"
Expand Down
235 changes: 235 additions & 0 deletions splink/files/chart_defs/bayes_factor_intuition_chart_def.json
@@ -0,0 +1,235 @@
{
"config": {"view": {"continuousWidth": 400, "continuousHeight": 300}},
"title": {
"text": "Bayes factor intuition chart",
"subtitle": "How each comparison column contributes to the final match score"
},
"transform": [
{"filter": "(datum.bayes_factor !== 1.0)"},
{
"window": [
{"op": "sum", "field": "log2_bayes_factor", "as": "sum"},
{"op": "lead", "field": "column_name", "as": "lead"}
],
"frame": [null, 0]
},
{
"calculate": "datum.column_name === \"Final score\" ? datum.sum - datum.log2_bayes_factor : datum.sum",
"as": "sum"
},
{
"calculate": "datum.lead === null ? datum.column_name : datum.lead",
"as": "lead"
},
{
"calculate": "datum.column_name === \"Final score\" || datum.column_name === \"Prior lambda\" ? 0 : datum.sum - datum.log2_bayes_factor",
"as": "previous_sum"
},
{
"calculate": "datum.sum > datum.previous_sum ? datum.column_name : \"\"",
"as": "top_label"
},
{
"calculate": "datum.sum < datum.previous_sum ? datum.column_name : \"\"",
"as": "bottom_label"
},
{
"calculate": "datum.sum > datum.previous_sum ? datum.sum : datum.previous_sum",
"as": "sum_top"
},
{
"calculate": "datum.sum < datum.previous_sum ? datum.sum : datum.previous_sum",
"as": "sum_bottom"
},
{"calculate": "(datum.sum + datum.previous_sum) / 2", "as": "center"},
{
"calculate": "(datum.log2_bayes_factor > 0 ? \"+\" : \"\") + datum.log2_bayes_factor",
"as": "text_log2_bayes_factor"
},
{"calculate": "datum.sum < datum.previous_sum ? 4 : -4", "as": "dy"},
{
"calculate": "datum.sum < datum.previous_sum ? \"top\" : \"bottom\"",
"as": "baseline"
},
{"calculate": "1. / (1 + pow(2, -1.*datum.sum))", "as": "prob"},
{"calculate": "0*datum.sum", "as": "zero"}
],
"layer": [
{
"layer": [
{
"mark": "rule",
"encoding": {
"y": {"field":"zero", "type":"quantitative"},
"size": {"value": 0.5},
"color": {"value": "black"}
}
},
{
"mark": {"type": "bar", "width": 60},
"encoding": {
"color": {
"condition": {
"value": "red",
"test": "(datum.log2_bayes_factor < 0)"
},
"value": "green"
},
"opacity": {
"condition": {
"value": 1,
"test": "datum.column_name == 'Prior lambda' || datum.column_name == 'Final score'"
},
"value": 0.5
},
"tooltip": [
{"type": "nominal", "field": "column_name", "title": "Comparison column"},
{"type": "nominal", "field": "value_l", "title": "Value (L)"},
{"type": "nominal", "field": "value_r", "title": "Value (R)"},
{"type": "nominal", "field": "gamma_index", "title": "Gamma level"},
{"type": "nominal", "field": "max_gamma_index", "title": "Max gamma level"},
{"type": "quantitative", "field": "bayes_factor", "format":".3r", "title": "Bayes factor"},
{"type": "quantitative", "field": "log2_bayes_factor", "format":".3r", "title": "log2(Bayes factor)"},
{"type": "quantitative", "field": "prob", "format":".3r", "title":"Adjusted match score"}
],
"x": {
"type": "nominal",
"axis": {
"labelExpr": "datum.value == 'Prior lambda' || datum.value == 'Final score' ? '' : datum.value",
"labelAngle": -20,
"labelAlign": "center",
"labelPadding": 10,
"title": "Column",
"grid": true,
"tickBand": "extent"
},
"field": "column_name",
"sort": null
},
"y": {
"type": "quantitative",
"axis": {
"grid": false,
"orient": "left",
"title": "log2(Bayes factor)"
},
"field": "previous_sum"
},
"y2": {"field": "sum"}
}
},
{
"mark": {"type": "text", "fontWeight": "bold"},
"encoding": {
"color": {"value": "white"},
"text": {
"condition": {
"type": "nominal",
"field": "log2_bayes_factor",
"format": ".2f",
"test": "abs(datum.log2_bayes_factor) > 1"
},
"value": ""
},
"x": {
"type": "nominal",
"axis": {"labelAngle": 0, "title": "Column"},
"field": "column_name",
"sort": null
},
"y": {
"type": "quantitative",
"axis": {"orient": "left"},
"field": "center"
}
}
},
{
"mark": {
"type": "text",
"baseline": "bottom",
"dy": -5,
"fontWeight": "bold"
},
"encoding": {
"color": {"value": "black"},
"text": {
"condition": {
"type": "nominal",
"field": "top_label",
"test": "abs(datum.log2_bayes_factor) > 1"
},
"value": ""
},
"x": {
"type": "nominal",
"axis": {"labelAngle": 0, "title": "Column"},
"field": "column_name",
"sort": null
},
"y": {"type": "quantitative", "field": "sum_top"}
}
},
{
"mark": {
"type": "text",
"baseline": "top",
"dy": 5,
"fontWeight": "bold"
},
"encoding": {
"color": {"value": "black"},
"text": {
"condition": {
"type": "nominal",
"field": "bottom_label",
"test": "abs(datum.log2_bayes_factor) > 1"
},
"value": ""
},
"x": {
"type": "nominal",
"axis": {"labelAngle": 0, "title": "Column"},
"field": "column_name",
"sort": null
},
"y": {"type": "quantitative", "field": "sum_bottom"}
}
}
]
},
{
"mark": {
"type": "rule",
"color": "black",
"strokeWidth": 2,
"x2Offset": 30,
"xOffset": -30
},
"encoding": {
"x": {
"type": "nominal",
"axis": {"labelAngle": 0, "title": "Column"},
"field": "column_name",
"sort": null
},
"x2": {"field": "lead"},
"y": {
"type": "quantitative",
"axis": {
"labelExpr": "format(1 / (1 + pow(2, -1*datum.value)), '.2r')",
"orient": "right",
"title": "Probability"
},
"field": "sum",
"scale": {"zero": false}
}
}
}
],
"height": 450,
"resolve": {"axis": {"y": "independent"}},
"width": {"step": 75},
"$schema": "https://vega.github.io/schema/vega-lite/v4.8.1.json",
"data": {"values": null}
}
38 changes: 38 additions & 0 deletions splink/intuition.py
Expand Up @@ -2,6 +2,9 @@

from .charts import load_chart_definition, altair_if_installed_else_json

import pandas as pd
from math import log2

initial_template = """
Initial probability of match (prior) = λ = {lam:.4g}
"""
Expand Down Expand Up @@ -99,3 +102,38 @@ def bayes_factor_chart(row_dict, model):
del bayes_factor_chart_def["encoding"]["row"]

return altair_if_installed_else_json(bayes_factor_chart_def)

def bayes_factor_intuition_chart(row_dict, model):
    """Build the Bayes factor waterfall ("intuition") chart for one comparison.

    The chart data has one record per bar, in this order:

    1. a ``"Prior lambda"`` record holding the prior odds ``lam / (1 - lam)``,
    2. the records returned by ``_get_bayes_factors``, sorted by descending
       absolute ``log2_bayes_factor`` (most influential first),
    3. a ``"Final score"`` record holding the sum of all log2 Bayes factors
       plus the log2 prior odds.

    Args:
        row_dict: Passed through to ``_get_bayes_factors``.
        model: Object whose ``current_settings_obj["proportion_of_matches"]``
            gives lambda; also passed through to ``_get_bayes_factors``.

    Returns:
        The result of ``altair_if_installed_else_json`` applied to the chart
        definition with its ``data.values`` populated.
    """
    chart_path = "bayes_factor_intuition_chart_def.json"
    bayes_factor_intuition_chart_def = load_chart_definition(chart_path)

    data = _get_bayes_factors(row_dict, model)

    # Prior odds implied by lambda, and the total score on the log2 scale.
    lam = model.current_settings_obj["proportion_of_matches"]
    bf_init = lam / (1 - lam)
    log2_bf_init = log2(bf_init)
    log2_bf_final = sum(d["log2_bayes_factor"] for d in data) + log2_bf_init

    prior_row = pd.DataFrame(
        [
            {
                "bayes_factor": bf_init,
                "log2_bayes_factor": log2_bf_init,
                "column_name": "Prior lambda",
            }
        ]
    )
    final_row = pd.DataFrame(
        [
            {
                "bayes_factor": 2 ** log2_bf_final,
                "log2_bayes_factor": log2_bf_final,
                "column_name": "Final score",
            }
        ]
    )

    frames = [prior_row]
    if data:
        # Sort records in descending order of influence
        frames.append(
            pd.DataFrame(data).sort_values(
                by="log2_bayes_factor", key=abs, ascending=False
            )
        )
    frames.append(final_row)

    # DataFrame.append was removed in pandas 2.0; pd.concat gives the same
    # row order and column union (prior/final rows get NaN in extra columns).
    # reset_index() (without drop) keeps the positional "index" column that
    # was present in the records before.
    df = pd.concat(frames, ignore_index=True, sort=False).reset_index()

    bayes_factor_intuition_chart_def["data"]["values"] = df.to_dict("records")

    return altair_if_installed_else_json(bayes_factor_intuition_chart_def)

0 comments on commit 99d0a55

Please sign in to comment.