Skip to content

Commit

Permalink
Merge pull request #119 from moj-analytical-services/bayes_factor
Browse files Browse the repository at this point in the history
WIP - Bayes factor charts
  • Loading branch information
samnlindsay committed Aug 4, 2020
2 parents 6127212 + e59f56d commit 8d24349
Show file tree
Hide file tree
Showing 4 changed files with 176 additions and 273 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "splink"
version = "0.2.2"
version = "0.2.3"
description = "[Beta]: Implementation in Apache Spark of the EM algorithm to estimate parameters of Fellegi-Sunter's canonical model of record linkage."
authors = ["Robin Linacre <robinlinacre@hotmail.com>", "Sam Lindsay", "Theodore Manassis"]
license = "MIT"
Expand Down
302 changes: 114 additions & 188 deletions splink/chart_definitions.py
Original file line number Diff line number Diff line change
@@ -1,73 +1,3 @@
# Vega-Lite (v3.4.0) spec for the "pi iteration" chart: two side-by-side bar
# panels ("Non Match" and "Match"), each showing summed probability per
# iteration, coloured by comparison-vector value, with one row per column.
# "data.values" is a None placeholder here (presumably populated before the
# chart is rendered -- confirm against the calling code).
pi_iteration_chart_def = {
    "config": {
        "view": {"width": 400, "height": 300},  # pragma: no cover
        "mark": {"tooltip": None},
        "title": {"anchor": "middle"},
    },
    # The two panels differ only in their title and in the filter applied to
    # `datum.match`, so both are produced from a single template. Every pass
    # of the comprehension builds brand-new nested dicts/lists, so the panels
    # do not share any mutable state.
    "hconcat": [
        {
            "mark": "bar",
            "encoding": {
                "color": {"type": "nominal", "field": "value"},
                "row": {
                    "type": "nominal",
                    "field": "column",
                    "sort": {"field": "gamma"},
                },
                "tooltip": [
                    {"type": tip_type, "field": tip_field}
                    for tip_type, tip_field in (
                        ("quantitative", "probability"),
                        ("ordinal", "iteration"),
                        ("nominal", "column"),
                        ("nominal", "value"),
                    )
                ],
                "x": {"type": "ordinal", "field": "iteration"},
                "y": {
                    "type": "quantitative",
                    "aggregate": "sum",
                    "axis": {"title": "𝛾 value"},
                    "field": "probability",
                },
            },
            "height": 150,
            "resolve": {"scale": {"y": "independent"}},
            "title": panel_title,
            "transform": [{"filter": panel_filter}],
        }
        for panel_title, panel_filter in (
            ("Non Match", "(datum.match === 0)"),
            ("Match", "(datum.match === 1)"),
        )
    ],
    "data": {"values": None},
    "title": "Probability distribution of comparison vector values by iteration number",
    "$schema": "https://vega.github.io/schema/vega-lite/v3.4.0.json",
}


lambda_iteration_chart_def = {
"config": {
"view": {"width": 400, "height": 300},
Expand Down Expand Up @@ -141,7 +71,11 @@
{"type": "quantitative", "field": "probability", "format": ".4f"},
{"type": "ordinal", "field": "value"},
],
"x": {"type": "quantitative", "field": "probability"},
"x": {
"type": "quantitative",
"field": "probability",
"axis": {"title": "proportion of non-matches"}
},
"y": {
"type": "nominal",
"axis": {"title": "𝛾 value"},
Expand Down Expand Up @@ -171,7 +105,11 @@
{"type": "quantitative", "field": "probability", "format": ".4f"},
{"type": "ordinal", "field": "value"},
],
"x": {"type": "quantitative", "field": "probability"},
"x": {
"type": "quantitative",
"field": "probability",
"axis": {"title": "proportion of matches"}
},
"y": {
"type": "nominal",
"axis": {"title": "𝛾 value"},
Expand Down Expand Up @@ -207,7 +145,11 @@
{"type": "quantitative", "field": "level_proportion", "format": ".4f"},
{"type": "ordinal", "field": "level"},
],
"x": {"type": "quantitative", "field": "level_proportion"},
"x": {
"type": "quantitative",
"field": "level_proportion",
"axis": {"title": "proportion of comparisons"}
},
"y": {
"type": "nominal",
"axis": {"title": "𝛾 value"},
Expand All @@ -222,70 +164,50 @@
"$schema": "https://vega.github.io/schema/vega-lite/v3.4.0.json",
}

adjustment_weight_chart_def = {
bayes_factor_chart_def = {
"config": {
"view": {"width": 400, "height": 300},
"mark": {"tooltip": None},
"title": {"anchor": "middle"},
},
"data": {"values": None},
"mark": "bar",
"mark": {"type": "bar", "clip": True},
"encoding": {
"color": {
"type": "quantitative",
"field": "normalised_adjustment",
"field": "logk",
"title": "log2(K)",
"scale": {
"domain": [-0.5, -0.4, 0, 0.4, 0.5],
"range": ["red", "orange", "green", "orange", "red"],
"scheme": "redyellowgreen",
"domainMid": 0.0,
#"domain": [-10, -7, 0, 7, 10],
#"range": ["red", "orange", "green", "orange", "red"],
},
},
"row": {"type": "nominal", "field": "column", "sort": {"field": "column"}},
"tooltip": [
{"type": "nominal", "field": "column"},
{"type": "quantitative", "field": "normalised_adjustment"},
{"type": "quantitative", "field": "bayes_factor", "title": "Bayes factor, K"},
{"type": "quantitative", "field": "logk", "title": "log2(K)"}
],
"x": {
"type": "quantitative",
"axis": {"title": "Influence on match probabiity."},
"field": "normalised_adjustment",
"scale": {"domain": [-0.5, 0.5]},
"axis": {"title": "log2(Bayes factor, K = m/u)",
"values": [-10,-5,0,5,10]},
"field": "logk",
"scale": {"domain": [-10, 10]},
},
"y": {"type": "nominal", "field": "level"},
},
"transform": [
{"calculate": "(log(datum.bayes_factor) / log(2))", "as": "logk"}
],
"height": 50,
"resolve": {"scale": {"y": "independent"}},
"title": "Influence of comparison vector values on match probability",
"$schema": "https://vega.github.io/schema/vega-lite/v3.4.0.json",
}

# Vega-Lite (v3.4.0) bar chart spec: one bar per "column" (y), with the
# "normalised" field driving both the bar length (x) and the bar colour.
# "data.values" is a None placeholder here (presumably populated before the
# chart is rendered -- confirm against the calling code).
adjustment_factor_chart_def = {
    "config": {
        "view": {"width": 400, "height": 300},
        "mark": {"tooltip": None},
    },
    "data": {"values": None},
    "mark": "bar",
    "encoding": {
        "color": {
            "type": "quantitative",
            "field": "normalised",
            # Five-stop colour ramp, symmetric about zero: green at 0,
            # orange at +/-0.4, red at +/-0.5.
            "scale": {
                "domain": [-0.5, -0.4, 0, 0.4, 0.5],
                "range": ["red", "orange", "green", "orange", "red"],
            },
        },
        "tooltip": [
            {"type": "nominal", "field": "field"},
            {"type": "quantitative", "field": "normalised"},
        ],
        # The x scale is pinned to [-0.5, 0.5], matching the ends of the
        # colour ramp above.
        "x": {
            "type": "quantitative",
            "field": "normalised",
            "scale": {"domain": [-0.5, 0.5]},
        },
        "y": {
            "type": "nominal",
            "field": "column",
            "sort": {"field": "gamma"},
        },
    },
    "$schema": "https://vega.github.io/schema/vega-lite/v3.4.0.json",
}


multi_chart_template = """
<!DOCTYPE html>
<html>
Expand All @@ -301,8 +223,6 @@
<div id="vis4"></div><div id="vis5"></div>
<br/>
<div id="vis6"></div>
<br/>
<div id="vis7"></div>
Expand All @@ -313,97 +233,103 @@
vegaEmbed('#vis4', {spec4}).catch(console.error);
vegaEmbed('#vis5', {spec5}).catch(console.error);
vegaEmbed('#vis6', {spec6}).catch(console.error);
vegaEmbed('#vis7', {spec7}).catch(console.error);
</script>
</body>
</html>
""" # pragma: no cover

adjustment_history_chart_def = {
'hconcat': [{
'mark': 'bar',
'encoding': {
'color': {
'type': 'quantitative',
'field': 'level',
'legend': {},
'scale': {'range': ['red', 'orange', 'green']}
},
'tooltip': [
{'type': 'nominal', 'field': 'column'},
{'type': 'ordinal', 'field': 'level'},
{'type': 'quantitative', 'field': 'm'},
{'type': 'quantitative', 'field': 'u'},
{'type': 'quantitative', 'field': 'normalised_adjustment'}],
'x': {
'type': 'ordinal',
'field': 'level'
},
'y': {
'type': 'quantitative',
'axis': {'title': 'Influence on match probability'},
'field': 'normalised_adjustment',
'scale': {'domain': [-0.5, 0.5]}
}
},
bayes_factor_history_chart_def = {
"hconcat": [{
"mark": "bar",
"encoding": {
"color": {
"type": "quantitative",
"field": "level",
"scale": {"range": ["red", "orange", "green"]}
},
"tooltip": [
{"type": "nominal", "field": "column"},
{"type": "ordinal", "field": "level"},
{"type": "quantitative", "field": "m"},
{"type": "quantitative", "field": "u"},
{"type": "quantitative", "field": "bayes_factor", "title": "Bayes factor, K"},
{"type": "quantitative", "field": "logk", "title": "log2(K)"}
],
"x": {"type": "ordinal", "field": "level"},
"y": {
"type": "quantitative",
"axis": {
"title": "log2(Bayes factor, K = m/u)",
"values": [-10,-5,-2,-1,0,1,2,5,10]
},
"field": "logk",
#"scale": {"domain": [-10, 10]},
}
},
"height": 150,
"selection": {
"selector190": {
"type": "single",
"on": "mouseover",
"fields": ["level", "column"]
}
},
"transform": [
{"calculate": "(log(datum.bayes_factor) / log(2))", "as": "logk"},
{"filter": "(datum.final === true)"}
],
'width': 100,
'height': 150,
'selection': {
'selector190': {'type': 'single', 'on': 'mouseover', 'fields': ['level', 'column']}
}
"width": 100
},
{
'layer': [{
'mark': 'line',
'height': 150,
'encoding': {
'color': {
'type': 'quantitative',
'field': 'level',
'legend': {'type': 'symbol', 'tickCount':2},
'scale': {'range': ['red', 'orange', 'green']}
},
'opacity': {
'condition': {
'value': 0.8,
'selection': {'not': 'selector190'}
"layer": [
{
"mark": "line",
"encoding": {
"color": {
"type": "quantitative",
"field": "level",
"legend": {"tickCount": 2, "type": "symbol"},
"scale": {"range": ["red", "orange", "green"]}
},
'value': 1
},
'size': {
'condition': {
'value': 3,
'selection': {'not': 'selector190'}
"opacity": {
"condition": {"selection": {"not": "selector190"}, "value": 0.8},
"value": 1
},
'value': 5
"size": {
"condition": {"selection": {"not": "selector190"}, "value": 3},
"value": 5
},
"tooltip": [
{"type": "nominal", "field": "column"},
{"type": "quantitative", "field": "iteration"},
{"type": "ordinal", "field": "level"},
{"type": "quantitative", "field": "m"},
{"type": "quantitative", "field": "u"},
{"type": "quantitative", "field": "bayes_factor", "title": "Bayes factor, K"},
{"type": "quantitative", "field": "logk", "title": "log2(K)"}
],
"x": {
"type": "ordinal",
"axis": {"title": "Iteration"},
"field": "iteration"
},
"y": {
"type": "quantitative",
"axis": {
"title": "log2(Bayes factor, K = m/u)",
"values": [-10,-5,-2,-1,0,1,2,5,10]
},
"field": "logk",
#"scale": {"domain": [-10, 10]},
}
},
'tooltip': [
{'type': 'nominal', 'field': 'column'},
{'type': 'quantitative', 'field': 'iteration'},
{'type': 'ordinal', 'field': 'level'},
{'type': 'quantitative', 'field': 'm'},
{'type': 'quantitative', 'field': 'u'},
{'type': 'quantitative', 'field': 'normalised_adjustment'}
"transform": [
{"calculate": "(log(datum.bayes_factor) / log(2))", "as": "logk"}
],
'x': {
'type': 'ordinal',
'axis': {'title': 'Iteration'},
'field': 'iteration'
},
'y': {
'type': 'quantitative',
'axis': {'title': 'Influence on match probability'},
'field': 'normalised_adjustment',
'scale': {'domain': [-0.5, 0.5]}
}
"height": 150
}
}
]
}
],
],
}],
'title': {'text': None, 'orient': 'top', 'dx': 200},
'data': {'values': None}
}
Loading

0 comments on commit 8d24349

Please sign in to comment.