In [None]:
import pandas as pd

In [None]:
# Truth space table for model_00_rules_based, read straight from S3.
df_model_00_rules_based = pd.read_parquet(
    path="s3://alpha-splink-synthetic-data/charts/person/v01/uk_citizens_max_groupsize_20/model_00_rules_based/2021-01-01/truth_space_table/"
)

In [None]:
# Add one extra point to the rules-based truth table: a copy of the row labelled 1
# whose count/rate columns are blanked, with truth_threshold 1.0 and
# FP_rate / TP_rate / recall all set to 0.0.
row = df_model_00_rules_based.loc[1].copy()
cols = [
    "P",
    "N",
    "TP",
    "TN",
    "FP",
    "FN",
    "P_rate",
    "N_rate",
    "TP_rate",
    "TN_rate",
    "FN_rate",
    "precision",
]
for col_name in cols:
    row[col_name] = ""

row["truth_threshold"] = 1.0

# TP_rate was blanked by the loop above and is overwritten with 0.0 here.
for zero_col in ("FP_rate", "TP_rate", "recall"):
    row[zero_col] = 0.0

# Stored under index label 2; an existing row with that label would be overwritten.
df_model_00_rules_based.loc[2] = row
df_model_00_rules_based

In [None]:
# Truth space table for model_01_two_levels, read straight from S3.
df_model_01_two_levels = pd.read_parquet(
    path="s3://alpha-splink-synthetic-data/charts/person/v01/uk_citizens_max_groupsize_20/model_01_two_levels/2021-01-01/truth_space_table/"
)
# Preview the first two rows.
df_model_01_two_levels.iloc[:2]

In [None]:
# Truth space table for model_02_fuzzy_simple, read straight from S3.
df_model_02_fuzzy_simple = pd.read_parquet(
    path="s3://alpha-splink-synthetic-data/charts/person/v01/uk_citizens_max_groupsize_20/model_02_fuzzy_simple/2021-01-01/truth_space_table/"
)

In [None]:
# Truth space table for model_03_fuzzy_complex, read straight from S3.
df_model_03_fuzzy_complex = pd.read_parquet(
    path="s3://alpha-splink-synthetic-data/charts/person/v01/uk_citizens_max_groupsize_20/model_03_fuzzy_complex/2021-01-01/truth_space_table/"
)

In [None]:
# Truth space table for model_04_fuzzy_complex_and_tf, read straight from S3.
df_model_04_fuzzy_complex_and_tf = pd.read_parquet(
    path="s3://alpha-splink-synthetic-data/charts/person/v01/uk_citizens_max_groupsize_20/model_04_fuzzy_complex_and_tf/2021-01-01/truth_space_table/"
)

In [None]:
# Truth space table for model_05_fuzzy_complex_and_tf_weights, read straight from S3.
# NOTE: unlike the other frames this variable has no `df_` prefix; the `tts`
# dictionary refers to it by this exact name, so it is kept as is.
model_05_fuzzy_complex_and_tf_weights = pd.read_parquet(
    path="s3://alpha-splink-synthetic-data/charts/person/v01/uk_citizens_max_groupsize_20/model_05_fuzzy_complex_and_tf_weights/2021-01-01/truth_space_table/"
)

In [None]:
# Map each model label to its truth space table; this is the dict form that
# roc_chart_tt accepts for plotting several models on one figure.
tts = dict(
    [
        ("model_00_rules_based", df_model_00_rules_based),
        ("model_01_two_levels", df_model_01_two_levels),
        ("model_02_fuzzy_simple", df_model_02_fuzzy_simple),
        ("model_03_fuzzy_complex", df_model_03_fuzzy_complex),
        ("model_04_fuzzy_complex_and_tf", df_model_04_fuzzy_complex_and_tf),
        ("model_05_fuzzy_complex_and_tf_weights", model_05_fuzzy_complex_and_tf_weights),
    ]
)

In [None]:
# NOTE: roc_chart_tt is defined in a later cell of this notebook; that cell
# must have been executed before this one will run.
# ROC curves for every model in `tts`, x axis limited to FP rate 0 - 0.05.
roc_chart_tt(df_truth_space=tts, x_domain=[0, 0.05])


In [None]:
# NOTE: this is an absolute path on one particular machine; adjust it before
# running the cell anywhere else.
roc_comparison_path = "/Users/robinlinacre/Documents/personal/robinl.github.io/src/mdx/comparing_splink_models/roc_comparison.vl.json"

# Rebuild the 0 - 0.05 FP-rate chart and write it out to the path above.
c = roc_chart_tt(df_truth_space=tts, x_domain=[0, 0.05])
c.save(roc_comparison_path)

In [None]:
# Wider view (FP rate 0 - 0.6) of the same curves, saved next to the notebook
# as a standalone HTML file.
c = roc_chart_tt(df_truth_space=tts, x_domain=[0, 0.6])
c.save("five_model_roc.html")

In [None]:
# Writes whichever chart is currently bound to `c` (set by a previously run
# cell) to a JSON file in the working directory.
c.save("four_model_roc.json")

In [None]:
# ROC curves for every model in `tts`, x axis limited to FP rate 0 - 0.1.
roc_chart_tt(df_truth_space=tts, x_domain=[0, 0.1])

In [None]:
# ROC curves for every model in `tts`, x axis limited to FP rate 0 - 0.01.
roc_chart_tt(df_truth_space=tts, x_domain=[0, 0.01])

In [None]:
# ROC curves for every model in `tts`, x axis limited to FP rate 0 - 0.001.
roc_chart_tt(df_truth_space=tts, x_domain=[0, 0.001])

In [None]:
# Build the 0 - 0.01 FP-rate chart and dump its definition to chart_def.json
# in the working directory.
c = roc_chart_tt(df_truth_space=tts, x_domain=[0, 0.01])
c.save("chart_def.json")

In [None]:
import boto3
from IPython.display import IFrame

# Fetch the pre-rendered ROC page for model_01_two_levels from S3 into the
# working directory.
s3_client = boto3.client("s3")
s3_client.download_file(
    Bucket="alpha-splink-synthetic-data",
    Key="charts/person/v01/uk_citizens_max_groupsize_20/model_01_two_levels/2021-01-01/roc.html",
    Filename="roc1.html",
)

# Show the downloaded html file in an iframe in Jupyter
IFrame(src="./roc1.html", width=1400, height=500)

In [None]:
# Standardised person nodes for the uk_citizens_max_groupsize_20 dataset,
# read straight from S3.
df = pd.read_parquet(
    path="s3://alpha-splink-synthetic-data/nodes/standardised_nodes/version=v01/input_datasets=uk_citizens_max_groupsize_20/job_name=basic/entity=person/snapshot_date=2021-01-01/"
)

In [None]:
# Row count of `df`.
len(df)

In [None]:
# Number of distinct values in the `cluster` column (a missing value, if
# present, is counted as one distinct value).
df["cluster"].nunique(dropna=False)

In [None]:
# Let pandas display up to 1000 columns before truncating.
pd.set_option("display.max_columns", 1000)

# Columns to show in the printed sample.
cols = [
    "dob",
    "birth_place",
    "postcode",
    "gender",
    "occupation",
    "surname_std",
    "forename1_std",
    "forename2_std",
]

# unique_id values of the records to print, all sharing the "Q34743-" prefix.
to_keep = [
    "Q34743-" + suffix for suffix in ("1", "4", "6", "8", "10", "17", "19")
]

# Emit the selected rows as a markdown table (plain text, ready to paste).
f2 = df["unique_id"].isin(to_keep)
print(df.loc[f2, cols].to_markdown(index=False))

In [None]:
from typing import Union

import altair as alt
import pandas as pd


def _default_roc_x_domain(truth_tables: list) -> list:
    """Pick an x-axis domain that covers every table's FP_rate values below 1.0.

    Returns ``[0, 1.5 * m]`` where ``m`` is the largest ``FP_rate`` strictly
    below 1.0 found in any of ``truth_tables``, or ``[0, 1]`` when there is no
    such value (no tables, or every ``FP_rate`` is >= 1.0 or missing).
    """
    per_table_max = [
        tt.loc[tt["FP_rate"] < 1.0, "FP_rate"].max() for tt in truth_tables
    ]
    # .max() of an empty selection is NaN; drop those so they cannot leak
    # into the chart definition.
    usable = [float(value) for value in per_table_max if pd.notna(value)]
    if not usable:
        return [0, 1]
    return [0, max(usable) * 1.5]


def roc_chart_tt(
    df_truth_space: Union[pd.DataFrame, dict],
    threshold_actual: float = 0.5,
    x_domain: list = None,
    width: int = 400,
    height: int = 400,
):
    """Create a ROC chart from one or more truth space tables.

    The input tables are not modified.

    Args:
        df_truth_space (Union[DataFrame, dict]): A dataframe of the truth space, the
            output of the truth.truth_space_table function.  Or, a dict containing
            one such dataframe per key.  {'model 1': df1, 'model 2': df2}.  If a
            dict is provided, the ROC curves for each model are plotted on the
            same figure and coloured by key.
        threshold_actual (float, optional): Not used by this function; kept so
            existing callers that pass it keep working. Defaults to 0.5.
        x_domain (list, optional): [min, max] domain for the x axis. When omitted
            (or empty), the domain is [0, 1.5 * m], where m is the largest FP_rate
            strictly below 1.0 across all supplied tables, falling back to [0, 1]
            if there is no such value. Defaults to None.
        width (int, optional):  Chart width. Defaults to 400.
        height (int, optional):  Chart height. Defaults to 400.

    Returns:
        The chart produced by ``alt.Chart.from_dict`` from the assembled
        chart definition.
    """

    roc_chart_def = {
        "config": {"view": {"continuousWidth": 400, "continuousHeight": 300}},
        "data": {"name": "data-fadd0e93e9546856cbc745a99e65285d", "values": None},
        "mark": {"type": "line", "clip": True, "point": True},
        "encoding": {
            "tooltip": [
                {"type": "quantitative", "field": "truth_threshold"},
                {"type": "quantitative", "field": "FP_rate"},
                {"type": "quantitative", "field": "TP_rate"},
                {"type": "quantitative", "field": "TP"},
                {"type": "quantitative", "field": "TN"},
                {"type": "quantitative", "field": "FP"},
                {"type": "quantitative", "field": "FN"},
                {"type": "quantitative", "field": "precision"},
                {"type": "quantitative", "field": "recall"},
            ],
            "x": {
                "type": "quantitative",
                "field": "FP_rate",
                "sort": ["truth_threshold"],
                "title": "False Positive Rate amongst clerically reviewed records",
            },
            "y": {
                "type": "quantitative",
                "field": "TP_rate",
                "sort": ["truth_threshold"],
                "title": "True Positive Rate amongst clerically reviewed records",
            },
            "color": {
                "type": "nominal",
                "field": "roc_label",
            },
        },
        "selection": {
            "selector076": {
                "type": "interval",
                "bind": "scales",
                "encodings": ["x"],
            }
        },
        "height": height,
        "title": "Receiver operating characteristic curve",
        "width": width,
    }

    if isinstance(df_truth_space, pd.DataFrame):
        # A single table has nothing to tell apart by colour.
        del roc_chart_def["encoding"]["color"]
        df_truth_space = {"model1": df_truth_space}

    # assign() returns a new frame, so the caller's tables do not gain a
    # "roc_label" column as a side effect of plotting.
    labelled_tables = [
        table.assign(roc_label=label) for label, table in df_truth_space.items()
    ]

    if not x_domain:
        # Consider every table, not just the last one, so no curve is clipped
        # by a domain derived from a different model.
        x_domain = _default_roc_x_domain(labelled_tables)

    roc_chart_def["encoding"]["x"]["scale"] = {"domain": x_domain}

    records = []
    for table in labelled_tables:
        records.extend(table.to_dict(orient="records"))

    roc_chart_def["data"]["values"] = records

    return alt.Chart.from_dict(roc_chart_def)

In [None]:
from dataengineeringutils3.s3 import read_json_from_s3
import splink
from splink.settings import Settings
# Models whose final settings are turned into bayes factor charts below.
models = [
    "model_01_two_levels",
    "model_02_fuzzy_simple",
    "model_03_fuzzy_complex",
    "model_04_fuzzy_complex_and_tf",
]

# For each model: read its final settings JSON from S3, build the bayes factor
# chart from those settings and save it under the model's name.
# NOTE: the output location is an absolute path on one particular machine.
for model in models:
    settings_uri = f"s3://alpha-splink-synthetic-data/model_training/person/v01/uk_citizens_max_groupsize_20/{model}/2021-01-01/combined_model/final_settings.json"
    settings = read_json_from_s3(settings_uri)

    s = Settings(settings)
    c = s.bayes_factor_chart()

    bf_chart_path = f"/Users/robinlinacre/Documents/personal/robinl.github.io/src/mdx/comparing_splink_models/{model}_bf.vl.json"
    c.save(bf_chart_path)