## Description

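This notebook calculates the number of samples dropped at each step of the QC pipeline (sequence QC, replicate QC, and the population-level anomaly and PCA filters). Each sample is counted only once, at the first filter it fails.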

In [1]:
import pandas as pd
from ag3 import release_data
from pathlib import Path

In [2]:
# Release-data handle from the `ag3` module; its `all_sample_sets` attribute
# is used below to enumerate the sample sets.
v3 = release_data()

In [58]:
# Root directory under which the per-step QC filter TSVs are looked up.
# NOTE(review): absolute path tied to one user's home directory -- adjust
# when running this notebook elsewhere.
tracking_dir = Path("/home/jovyan/vector-ops/tracking/release/v3")

In [67]:
# Path templates (relative to `tracking_dir`) of the QC filter tables;
# `{sset}` is replaced with the sample set name.
# Order matters: the tables are concatenated column-wise in this order, and
# the "first failing step" logic further down walks the columns left to right.
qc_paths = [
    "wgs_sequence_qc/sequence_qc_filters_{sset}.tsv",
    "wgs_replicate_qc/replicate_qc_filters_{sset}.tsv",
    "wgs_population_qc/anomaly_qc_filters_{sset}.tsv",
    "wgs_population_qc/pca_qc_filters_{sset}.tsv"
]

In [68]:
# Every sample set listed by the release, plus AG1000G-GA-B.
# NOTE(review): presumably AG1000G-GA-B went through QC but is not listed in
# `v3.all_sample_sets` -- confirm why it has to be appended by hand.
all_contrib_sets = v3.all_sample_sets + ["AG1000G-GA-B"]
all_contrib_sets

['AG1000G-AO',
 'AG1000G-BF-A',
 'AG1000G-BF-B',
 'AG1000G-BF-C',
 'AG1000G-CD',
 'AG1000G-CF',
 'AG1000G-CI',
 'AG1000G-CM-A',
 'AG1000G-CM-B',
 'AG1000G-CM-C',
 'AG1000G-FR',
 'AG1000G-GA-A',
 'AG1000G-GH',
 'AG1000G-GM-A',
 'AG1000G-GM-B',
 'AG1000G-GM-C',
 'AG1000G-GN-A',
 'AG1000G-GN-B',
 'AG1000G-GQ',
 'AG1000G-GW',
 'AG1000G-KE',
 'AG1000G-ML-A',
 'AG1000G-ML-B',
 'AG1000G-MW',
 'AG1000G-MZ',
 'AG1000G-TZ',
 'AG1000G-UG',
 'AG1000G-X',
 'AG1000G-GA-B']

In [95]:
def read_qc_data(sample_set, base_dir=None, steps=None):
    """Load every QC filter table for one sample set and join them column-wise.

    Each path template is formatted with ``sset=sample_set``, resolved
    against the base directory, and read as a tab-separated file indexed by
    its ``derived_sample_id`` column. The resulting frames are concatenated
    along the columns in template order.

    Parameters
    ----------
    sample_set : str
        Sample set name substituted into the ``{sset}`` placeholder.
    base_dir : path-like, optional
        Directory the templates are relative to. Defaults to the notebook's
        ``tracking_dir``.
    steps : sequence of str, optional
        Path templates containing ``{sset}``. Defaults to the notebook's
        ``qc_paths``.

    Returns
    -------
    pandas.DataFrame
        One row per ``derived_sample_id``, with the columns of every QC
        table side by side.

    Raises
    ------
    ValueError
        If there are no path templates to read.
    """
    root = tracking_dir if base_dir is None else Path(base_dir)
    templates = qc_paths if steps is None else list(steps)
    if not templates:
        raise ValueError("no QC path templates to read")

    # Progress trace: sample set name and the first file that will be read.
    print(sample_set, root / templates[0].format(sset=sample_set))

    frames = [
        pd.read_csv(
            root / template.format(sset=sample_set),
            sep="\t",
            index_col="derived_sample_id",
        )
        for template in templates
    ]
    return pd.concat(frames, axis=1, sort=False)

In [96]:
# Stack the per-sample-set QC tables into one frame, one row per sample.
# NOTE(review): the cell that originally built `seq_df` is missing from the
# notebook, so a fresh kernel stopped here with a NameError. This line is a
# reconstruction -- confirm it reproduces the shape shown below.
seq_df = pd.concat([read_qc_data(sset) for sset in all_contrib_sets])
seq_df.shape

(4693, 9)

In [97]:
# Display the combined filter table: one row per derived_sample_id, one
# PASS/FAIL column per filter (pandas truncates the middle rows).
seq_df

Unnamed: 0_level_0,FILTER_frac_genome_cov,FILTER_median_cov,FILTER_contamination,FILTER_nosexcall,FILTER_divergence,FILTER_fail_replicateqc,FILTER_second_rep_hi_skew,FILTER_anomaly,FILTER_pca
derived_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AR0047-C,PASS,PASS,PASS,PASS,PASS,PASS,PASS,PASS,PASS
AR0091-C,PASS,FAIL,PASS,FAIL,PASS,PASS,PASS,PASS,PASS
AR0049-C,PASS,PASS,PASS,PASS,PASS,PASS,PASS,PASS,PASS
AR0051-C,PASS,PASS,PASS,PASS,PASS,PASS,PASS,PASS,PASS
AR0061-C,PASS,PASS,PASS,PASS,PASS,PASS,PASS,PASS,PASS
...,...,...,...,...,...,...,...,...,...
AS0105-CW,PASS,FAIL,FAIL,PASS,PASS,PASS,PASS,PASS,PASS
AS0090-CW,PASS,PASS,FAIL,FAIL,PASS,PASS,PASS,PASS,PASS
AS0106-CW,PASS,FAIL,PASS,FAIL,PASS,PASS,PASS,PASS,PASS
AS0093-CW,PASS,PASS,FAIL,FAIL,PASS,PASS,PASS,PASS,PASS


In [98]:
# For every sample, flag only the first filter (in column order) that it
# fails: a cell is True when it is a FAIL and the running count of FAILs
# along the row has just reached 1. Later failures on the same row are ignored.
int_fail_df = seq_df.eq("FAIL") & seq_df.eq("FAIL").cumsum(axis=1).eq(1)

In [99]:
# Sanity check: the largest number of flagged filters on any row is exactly one.
assert int_fail_df.sum(axis=1).max() == 1

In [100]:
# Number of samples whose first failure falls on each filter.
int_fail_df.sum(axis=0)

FILTER_frac_genome_cov       227
FILTER_median_cov            183
FILTER_contamination         229
FILTER_nosexcall              29
FILTER_divergence             17
FILTER_fail_replicateqc        8
FILTER_second_rep_hi_skew    407
FILTER_anomaly                82
FILTER_pca                    28
dtype: int64

In [101]:
# Total samples whose first failure is in one of the first four filter columns.
int_fail_df.iloc[:, :4].sum(axis=0).sum()

668

In [102]:
# Per-filter breakdown of the same first four filter columns.
int_fail_df.iloc[:, :4].sum(axis=0)

FILTER_frac_genome_cov    227
FILTER_median_cov         183
FILTER_contamination      229
FILTER_nosexcall           29
dtype: int64

In [103]:
# Samples whose first failure is one of the two `*_cov` filters, computed from
# the data instead of from numbers copied out of an earlier output.
int_fail_df[["FILTER_frac_genome_cov", "FILTER_median_cov"]].sum(axis=0).sum()

410

In [108]:
# Total number of first-failure flags across all samples and filters.
n_fail = int_fail_df.sum(axis=1).sum()
n_fail

1210

In [107]:
# Number of samples with PASS in every filter column.
n_all_pass = seq_df.eq("PASS").all(axis=1).sum()
n_all_pass

3483

In [110]:
# Fraction of samples that pass every filter, out of all passing plus
# failing samples counted above.
n_all_pass / (n_all_pass + n_fail)

0.7421691881525676