# Performance assessment


## Determine binary classification statistics 

For the v3 site filter, vs:
- phase 2 accessibility map
- permissive (100% pass, 0 fail)

based on:
- Mendelian error in all crosses (autosomes)
- male het calls (`n_male_het`, `all_males_called`) will be used instead of `mendel_error`

In [1]:
# Imports:
import gcsfs
import zarr
import dask.array as da
from dask.distributed import Client
import pandas as pd
pd.set_option('display.float_format', '{:.3E}'.format)
import numpy as np
import numba
from pathlib import Path

In [2]:
!pip install -q malariagen-data

In [3]:
import malariagen_data

In [4]:
from dask_kubernetes import KubeCluster
from dask.distributed import Client

In [5]:
import ag3, ag2

  import pandas.util.testing as tm


In [6]:
cluster = KubeCluster(n_workers=40)
cluster

distributed.scheduler - INFO - Clear task state
distributed.scheduler - INFO -   Scheduler at:   tcp://10.34.4.143:41071
distributed.scheduler - INFO -   dashboard at:                     :8787


VBox(children=(HTML(value='<h2>KubeCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .…

In [7]:
client = Client(cluster)
client

distributed.scheduler - INFO - Receive client connection: Client-0e48acf8-8c8e-11eb-841d-6e2ef4cab083
distributed.core - INFO - Starting established connection


0,1
Client  Scheduler: tcp://10.34.4.143:41071  Dashboard: /user/nicholasharding/proxy/8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [8]:
# Settings:
cross_family_ids = ['18-5', '29-2', '36-9', '37-3', '42-4', '45-1', '46-9', '47-6', '73-2', '78-2', '80-2', 'B5', 'K2', 'K4', 'K6']
chrom_arms = ['2R', '2L', '3R', '3L']

In [9]:
eval_family_ids = ['B5', 'K6', '73-2', '45-1', '37-3']

In [10]:
cross_family_ids

['18-5',
 '29-2',
 '36-9',
 '37-3',
 '42-4',
 '45-1',
 '46-9',
 '47-6',
 '73-2',
 '78-2',
 '80-2',
 'B5',
 'K2',
 'K4',
 'K6']

## Determine which crosses stats (inc. Mendelian error) are predicted by each site filter

Populate the confusion matrix, such that:
- If the crosses stats imply that a position is accessible ("good") and the site filter also says it passes, then mark the prediction as a `True Positive` (`TP`)
- If the crosses stats imply that a position is accessible ("good") but the site filter says it fails, then mark the prediction as a `False Negative` (`FN`)
- If the crosses stats imply that a position is inaccessible ("bad") and the site filter also says it fails, then mark the prediction as a `True Negative` (`TN`)
- If the crosses stats imply that a position is inaccessible ("bad") but the site filter says it passes, then mark the prediction as a `False Positive` (`FP`)
- If the crosses stats cannot determine whether a position is "good" or "bad" (e.g. there were no male het calls on the X chromosome, but not all samples were called), then mark the prediction result as undetermined and exclude from subsequent stats.

In [11]:
@numba.njit
def cross_tab_values(eval_arr, test_arr):
    
    out = np.zeros((2, 2))
    
    # TN FP
    # FN TP
    
    # eval arr has 
    for i in range(eval_arr.shape[0]):
        if eval_arr[i] == -1:
            continue
        
        if test_arr[i]:
            out[eval_arr[i], 1] += 1
        else:
            out[eval_arr[i], 0] += 1
    
    return out.reshape((1, 2, 2))

In [12]:
ag3_release_data = ag3.release_data()

def load_phase3_accessibility(chrom, mask_id='gamb_colu'):
    return ag3_release_data.load_mask(chrom, mask_id=mask_id)

In [13]:
ag2_release_data = ag2.release_data()

def load_phase2_accessibility(chrom):
    
    phase2_is_accessible = ag2_release_data.load_is_accessible(chrom)
    filter_n = ag2_release_data.load_filter_n(chrom)
    phase2_is_accessible_nonN = da.compress(~filter_n, phase2_is_accessible, axis=0)

    # Compute chunk sizes avoids this error when concatenating arrays: ValueError: Arrays chunk sizes are unknown: (nan,)
    phase2_is_accessible_nonN.compute_chunk_sizes() 
    return phase2_is_accessible_nonN

In [14]:
def load_ground_truth(chrom, xid):
    
    if chrom == "X":
        raise ValueError("X Not supported as hemizygous")
    
    me = ag3_release_data.load_crosses(chrom, xid, 'mendel_error')
    called = ag3_release_data.load_crosses(chrom, xid, 'n_samples_called')

    training_negative = ag3_release_data.load_mask(chrom, mask_id='gamb_colu', field='training_negative')
    training_positive = ag3_release_data.load_mask(chrom, mask_id='gamb_colu', field='training_positive')
    training_mask = training_negative | training_positive

    # Mark training sites as undetermined (-1)
    return da.where(training_mask, -1, da.where(me > 0, 0, da.where(called == da.max(called), 1, -1)))


In [15]:
def calculate_metrics_of_interest(true_neg, false_pos, false_neg, true_pos):
    
    r = {}
    r["tpr"] = true_pos / (true_pos + false_neg)
    r["fpr"] = false_pos / (false_pos + true_neg)
    r["fdr"] = false_pos / (true_pos + false_pos)
    r["tnr"] = 1 - (false_pos / (false_pos + true_neg))
    r["youden"] = r["tpr"] + r["tnr"] - 1
    r["frac_pass"] = (true_pos + false_pos) / (true_pos + false_pos + true_neg + false_neg)
    
    return pd.Series(r)

In [16]:
chrom_arms

['2R', '2L', '3R', '3L']

In [17]:
cross_family_ids

['18-5',
 '29-2',
 '36-9',
 '37-3',
 '42-4',
 '45-1',
 '46-9',
 '47-6',
 '73-2',
 '78-2',
 '80-2',
 'B5',
 'K2',
 'K4',
 'K6']

In [18]:
desired_chunks = (500_000,)

In [19]:
%%time
data_holder = {}

for chrom_arm in chrom_arms:
        
    # varies by chrom only, but need to be chunked depending on ground_truth
    alternative_calls = {
        "phase2": load_phase2_accessibility(chrom_arm).rechunk(desired_chunks),
        "phase3": load_phase3_accessibility(chrom_arm).rechunk(desired_chunks)
    }
    
    alternative_calls["r100"] = da.ones(alternative_calls["phase2"].shape[0], dtype=np.bool, chunks=desired_chunks)
    
    chunks = ((1, ) * alternative_calls["phase2"].numblocks[0], 2, 2)
#     x = da.map_blocks(
#         cross_tab_values, 
#         alternative_calls["phase2"].astype("int"), 
#         alternative_calls["phase3"], 
#         chunks=chunks,
#         dtype=np.int32,
#         new_axis=[1, 2]).sum(axis=0).compute()

    for cross_id in cross_family_ids:
        
        print("processing", chrom_arm, cross_id)

        # varies by cross and chrom
        ground_truth = load_ground_truth(chrom_arm, cross_id).rechunk(desired_chunks)


        for key, eval_data in alternative_calls.items():
            
            if (key, chrom_arm, cross_id) in data_holder:
                print("skipping", (key, chrom_arm, cross_id))
                continue

            chunks = ((1, ) * ground_truth.numblocks[0], 2, 2)
            q = da.map_blocks(
                cross_tab_values, 
                ground_truth, 
                eval_data, 
                chunks=chunks, 
                dtype=np.int32, 
                new_axis=[1, 2]).sum(axis=0)

            (tn, fp), (fn, tp) = q.compute()

            # frac accessible is simply TPs + FPs. / all
            #res["frac_accessible"] = eval_data.mean().compute()

            data_holder[key, chrom_arm, cross_id] = pd.Series(
                [tn, fp, fn, tp], dtype=np.int32, index=["TN", "FP", "FN", "TP"])


distributed.scheduler - INFO - Register tcp://10.33.119.18:38485
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.33.119.18:38485
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register tcp://10.34.9.120:41671
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.34.9.120:41671
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register tcp://10.34.129.10:35471
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.34.129.10:35471
distributed.core - INFO - Starting established connection


processing 2R 18-5
processing 2R 29-2
processing 2R 36-9
processing 2R 37-3
processing 2R 42-4
processing 2R 45-1
processing 2R 46-9
processing 2R 47-6
processing 2R 73-2


distributed.scheduler - INFO - Register tcp://10.35.8.2:43201
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.35.8.2:43201
distributed.core - INFO - Starting established connection


processing 2R 78-2
processing 2R 80-2
processing 2R B5


distributed.scheduler - INFO - Register tcp://10.35.32.2:43105
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.35.32.2:43105
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register tcp://10.35.10.2:43955
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.35.10.2:43955
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register tcp://10.35.28.2:46605
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.35.28.2:46605
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register tcp://10.35.39.2:45687
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.35.39.2:45687
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register tcp://10.35.20.2:39473
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.35.20.2:39473
distributed.core - INFO

processing 2R K2


distributed.scheduler - INFO - Register tcp://10.35.22.2:41389
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.35.22.2:41389
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register tcp://10.35.30.2:36687
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.35.30.2:36687
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register tcp://10.35.34.2:36749
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.35.34.2:36749
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register tcp://10.35.11.2:43019
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.35.11.2:43019
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Register tcp://10.35.41.2:43131
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.35.41.2:43131
distributed.core - INFO

processing 2R K4


distributed.scheduler - INFO - Register tcp://10.35.44.2:35089
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.35.44.2:35089
distributed.core - INFO - Starting established connection


processing 2R K6
processing 2L 18-5
processing 2L 29-2
processing 2L 36-9
processing 2L 37-3
processing 2L 42-4
processing 2L 45-1
processing 2L 46-9


distributed.scheduler - INFO - Register tcp://10.35.27.2:36847
distributed.scheduler - INFO - Starting worker compute stream, tcp://10.35.27.2:36847
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Remove worker tcp://10.35.22.2:41389
distributed.core - INFO - Removing comms to tcp://10.35.22.2:41389
distributed.batched - INFO - Batched Comm Closed: in <closed TCP>: ConnectionResetError: [Errno 104] Connection reset by peer


processing 2L 47-6
processing 2L 73-2
processing 2L 78-2
processing 2L 80-2




processing 2L B5




processing 2L K2




processing 2L K4




processing 2L K6




processing 3R 18-5




processing 3R 29-2




processing 3R 36-9




processing 3R 37-3




processing 3R 42-4




processing 3R 45-1




processing 3R 46-9




processing 3R 47-6




processing 3R 73-2




processing 3R 78-2




processing 3R 80-2




processing 3R B5




processing 3R K2




processing 3R K4




processing 3R K6




processing 3L 18-5




processing 3L 29-2




processing 3L 36-9




processing 3L 37-3




processing 3L 42-4




processing 3L 45-1




processing 3L 46-9




processing 3L 47-6
processing 3L 73-2
processing 3L 78-2
processing 3L 80-2
processing 3L B5
processing 3L K2
processing 3L K4
processing 3L K6
CPU times: user 10min 22s, sys: 32.7 s, total: 10min 54s
Wall time: 13min 57s


In [20]:
df = pd.concat(data_holder, axis=0, sort=False, names=["description", "chrom", "cross_id", "metric"])
df.name = "value"

## List of tables

1. Autosomes: Phase 3 vs unfiltered. FDR, TNR, ie 4 cols x (5 x 4) rows
2. X: Phase 3 vs unfiltered
3. Autosomes: Phase 2 vs phase 3. FDR / % accessible / Youden.
4. X: Phase 2 va phase 3
5. Accessibility summary

Accessibility is shifted to separate section, where we can pull arab + gamb_colu_arab.

In [21]:
output = pd.pivot_table(
    pd.DataFrame(df).reset_index(level="metric"), 
    index=["description", "chrom", "cross_id"], 
    columns="metric", 
    values="value")

In [22]:
output

Unnamed: 0_level_0,Unnamed: 1_level_0,metric,FN,FP,TN,TP
description,chrom,cross_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
phase2,2L,18-5,15317681,114016,428818,27812440
phase2,2L,29-2,14162876,163874,579996,27716841
phase2,2L,36-9,14892225,46364,426516,27894179
phase2,2L,37-3,14991818,56932,476413,27858836
phase2,2L,42-4,15420518,25529,253127,27959521
...,...,...,...,...,...,...
r100,3R,80-2,0,490412,0,45845877
r100,3R,B5,0,449260,0,45832308
r100,3R,K2,0,487009,0,45992770
r100,3R,K4,0,428585,0,45997772


In [23]:
output_all_chroms = output.groupby(level=[0, 2]).agg(np.sum)

In [24]:
output_summary = output.apply(lambda y: calculate_metrics_of_interest(*y), axis=1)

In [25]:
output_all_chroms_summary = output_all_chroms.apply(lambda y: calculate_metrics_of_interest(*y), axis=1)

## generate table 1.

In [38]:
f = output_summary.reorder_levels([2, 0, 1]).loc[eval_family_ids]
f = f.drop("phase2", level=1)

f.rename({
    "fdr": "FalseDiscoveryRate", 
    "tpr": "TruePositiveRate"}, axis=1, inplace=True)
f.rename({"phase3": "site_filters", "r100": "unfiltered"}, level=1, inplace=True)

tab1 = f.reset_index().pivot(
    index=["chrom", "cross_id"], 
    columns=["description"], 
    values=["FalseDiscoveryRate", "TruePositiveRate"])
tab1.columns = [".".join(s) for s in tab1.columns.to_flat_index()]

## generate table 3

In [42]:
f = output_summary.reorder_levels([2, 0, 1]).loc[eval_family_ids]
f = f.drop("r100", level=1)

f.rename({
    "fdr": "FalseDiscoveryRate", 
    "tpr": "TruePositiveRate",
    "youden": "Youden_J"}, axis=1, inplace=True)
f.rename({"phase3": "Phase3_filters", "phase2": "Phase2_filters"}, level=1, inplace=True)

tab3 = f.reset_index().pivot(
    index=["chrom", "cross_id"], 
    columns=["description"], 
    values=["FalseDiscoveryRate", "TruePositiveRate", "Youden_J"])


tab3.columns = [".".join(s) for s in tab3.columns.to_flat_index()]

In [43]:
tab1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,FalseDiscoveryRate.site_filters,FalseDiscoveryRate.unfiltered,TruePositiveRate.site_filters,TruePositiveRate.unfiltered
chrom,cross_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2L,37-3,0.001093,0.01229,0.9864,1.0
2L,45-1,0.0007734,0.01013,0.9886,1.0
2L,73-2,0.0005383,0.0103,0.9883,1.0
2L,B5,0.0005832,0.007267,0.9917,1.0
2L,K6,0.0004129,0.007422,0.9915,1.0


In [64]:
tab1.to_csv(
    "../content/tables/site-filters/table1_performance.csv", float_format='%.4E')

In [65]:
tab3.to_csv(
    "../content/tables/site-filters/table2_phase2vsphase3.csv", float_format='%.4E')

## Numbers for paper

In [50]:
pd.set_option('display.float_format', '{:.5f}'.format)

In [51]:
output_all_chroms_summary.loc["phase3"].loc[eval_family_ids].apply([np.max, np.min]) * 100

Unnamed: 0,tpr,fpr,fdr,tnr,youden,frac_pass
amax,99.11868,0.44761,0.10352,99.8372,98.95588,81.17613
amin,98.7452,0.1628,0.03709,99.55239,98.29759,80.45451


In [52]:
output_all_chroms_summary.loc["r100"].loc[eval_family_ids].apply([np.max, np.min]) * 100

Unnamed: 0,tpr,fpr,fdr,tnr,youden,frac_pass
amax,100.0,100.0,1.1046,0.0,0.0,100.0
amin,100.0,100.0,0.74891,0.0,0.0,100.0


In [53]:
output_all_chroms_summary.loc["r100"].loc[eval_family_ids].fdr / output_all_chroms_summary.loc["phase3"].loc[eval_family_ids].fdr

cross_id
B5     13.13199
K6     20.19048
73-2   19.16880
45-1   13.49985
37-3   10.67007
Name: fdr, dtype: float64

In [54]:
output_all_chroms_summary.loc["phase2"].loc[eval_family_ids]

Unnamed: 0_level_0,tpr,fpr,fdr,tnr,youden,frac_pass
cross_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
B5,0.98937,0.00181,0.0008,0.99819,0.98756,0.68788
K6,0.98983,0.00134,0.00059,0.99866,0.98849,0.68962
73-2,0.98773,0.00291,0.00126,0.99709,0.98481,0.69223
45-1,0.98824,0.00361,0.00158,0.99639,0.98462,0.69077
37-3,0.98606,0.00447,0.00199,0.99553,0.98159,0.68603


## Check fraction accessible matches up

In [55]:
chromosomes = chrom_arms + ["X",]

In [56]:
contig_shape = pd.DataFrame(
    0,
    index=chromosomes, 
    columns=["phase2", "phase3:gamb_colu", "phase3:arab", "phase3:gamb_colu_arab"],
    dtype=int)

contig_sum_accessible = pd.DataFrame(
    0,
    index=chromosomes, 
    columns=["phase2", "phase3:gamb_colu", "phase3:arab", "phase3:gamb_colu_arab"],
    dtype=int)

In [57]:
for c in chromosomes:

    for mid in ["arab", "gamb_colu", "gamb_colu_arab"]:
        a = load_phase3_accessibility(c, mid)
        contig_sum_accessible.at[c, f"phase3:{mid}"] = a.sum().compute()
        contig_shape.at[c, f"phase3:{mid}"] = a.shape[0]
        
    a = load_phase2_accessibility(c)
    contig_sum_accessible.at[c, "phase2"] = a.sum().compute()
    contig_shape.at[c, "phase2"] = a.shape[0]

In [58]:
cluster.adapt()

<distributed.deploy.adaptive.Adaptive at 0x7f9f7a844d10>

In [59]:
accessibility_sumz = contig_sum_accessible / contig_shape
accessibility_sumz

Unnamed: 0,phase2,phase3:gamb_colu,phase3:arab,phase3:gamb_colu_arab
2R,0.65746,0.73903,0.73597,0.67454
2L,0.58099,0.74198,0.72577,0.67037
3R,0.62107,0.71227,0.6978,0.63935
3L,0.61206,0.70434,0.69681,0.6347
X,0.62462,0.6997,0.53318,0.4797


In [60]:
accessibility_sumz.to_csv("../content/tables/accessibility/table3_summary.csv")

## Phase 2 vs phase 3

In [61]:
p2stats = output_all_chroms_summary.loc["phase2"].loc[eval_family_ids]
p2stats

Unnamed: 0_level_0,tpr,fpr,fdr,tnr,youden,frac_pass
cross_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
B5,0.98937,0.00181,0.0008,0.99819,0.98756,0.68788
K6,0.98983,0.00134,0.00059,0.99866,0.98849,0.68962
73-2,0.98773,0.00291,0.00126,0.99709,0.98481,0.69223
45-1,0.98824,0.00361,0.00158,0.99639,0.98462,0.69077
37-3,0.98606,0.00447,0.00199,0.99553,0.98159,0.68603


In [62]:
p3stats = output_all_chroms_summary.loc["phase3"].loc[eval_family_ids]
p3stats

Unnamed: 0_level_0,tpr,fpr,fdr,tnr,youden,frac_pass
cross_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
B5,0.99084,0.00261,0.0006,0.99739,0.98823,0.80646
K6,0.99119,0.00163,0.00037,0.99837,0.98956,0.80883
73-2,0.98895,0.00223,0.00049,0.99777,0.98672,0.81176
45-1,0.98931,0.00306,0.00069,0.99694,0.98625,0.80965
37-3,0.98745,0.00448,0.00104,0.99552,0.98298,0.80455


In [63]:
p2stats.fdr - p3stats.fdr

cross_id
B5     0.00020
K6     0.00022
73-2   0.00077
45-1   0.00089
37-3   0.00096
Name: fdr, dtype: float64

distributed.scheduler - INFO - Retire worker names (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39)
distributed.scheduler - INFO - Retire workers {<Worker 'tcp://10.35.24.2:43567', memory: 0, processing: 0>, <Worker 'tcp://10.35.32.2:43105', memory: 0, processing: 0>, <Worker 'tcp://10.35.43.2:37911', memory: 0, processing: 0>, <Worker 'tcp://10.35.23.2:33803', memory: 0, processing: 0>, <Worker 'tcp://10.35.28.2:46605', memory: 0, processing: 0>, <Worker 'tcp://10.35.9.2:38957', memory: 0, processing: 0>, <Worker 'tcp://10.33.119.18:38485', memory: 0, processing: 0>, <Worker 'tcp://10.35.10.2:43955', memory: 0, processing: 0>, <Worker 'tcp://10.35.29.2:33017', memory: 0, processing: 0>, <Worker 'tcp://10.35.11.2:43019', memory: 0, processing: 0>, <Worker 'tcp://10.34.9.120:41671', memory: 0, processing: 0>, <Worker 'tcp://10.35.36.2:41741', memory: 0, processing: 0>, <Worker 'tcp://10