## Description

This notebook calculates the number of samples dropped at each QC filtering step (sequence QC, replicate QC, and population QC) of the v3 release.

In [1]:
import pandas as pd
from ag3 import release_data
from pathlib import Path

In [2]:
# Release data object from the project-local `ag3` module; this notebook reads its `all_sample_sets` attribute.
v3 = release_data()

In [3]:
# Directory (relative to this notebook) against which the QC filter file templates are resolved.
tracking_dir = Path("../../vector-ops/tracking/release/v3")

In [4]:
# Path templates, resolved against the tracking directory, for each QC filter
# table; "{sset}" is replaced with the sample set identifier.
# The list order fixes the column order of the combined table, and therefore
# which failure counts as a sample's "first" failure further down.
qc_paths = [
    # sequence QC
    "wgs_sequence_qc/sequence_qc_filters_{sset}.tsv",
    # replicate QC
    "wgs_replicate_qc/replicate_qc_filters_{sset}.tsv",
    # population QC: anomaly filter, then PCA filter
    "wgs_population_qc/anomaly_qc_filters_{sset}.tsv",
    "wgs_population_qc/pca_qc_filters_{sset}.tsv",
]

In [5]:
# The release's sample sets plus AG1000G-GA-B, which is appended by hand.
# NOTE(review): the QC tables loaded further down are read for v3.all_sample_sets,
# not for this list — confirm whether AG1000G-GA-B should be part of the tally.
all_contrib_sets = v3.all_sample_sets + ["AG1000G-GA-B"]
all_contrib_sets

['AG1000G-AO',
 'AG1000G-BF-A',
 'AG1000G-BF-B',
 'AG1000G-BF-C',
 'AG1000G-CD',
 'AG1000G-CF',
 'AG1000G-CI',
 'AG1000G-CM-A',
 'AG1000G-CM-B',
 'AG1000G-CM-C',
 'AG1000G-FR',
 'AG1000G-GA-A',
 'AG1000G-GH',
 'AG1000G-GM-A',
 'AG1000G-GM-B',
 'AG1000G-GM-C',
 'AG1000G-GN-A',
 'AG1000G-GN-B',
 'AG1000G-GQ',
 'AG1000G-GW',
 'AG1000G-KE',
 'AG1000G-ML-A',
 'AG1000G-ML-B',
 'AG1000G-MW',
 'AG1000G-MZ',
 'AG1000G-TZ',
 'AG1000G-UG',
 'AG1000G-X',
 'AG1000G-GA-B']

In [6]:
def read_qc_data(sample_set, base_dir=None, paths=None):
    """Load every QC filter table for one sample set and join them column-wise.

    Parameters
    ----------
    sample_set : str
        Sample set identifier substituted for ``{sset}`` in each path template.
    base_dir : path-like, optional
        Directory the templates are resolved against. Defaults to the
        notebook-level ``tracking_dir``.
    paths : sequence of str, optional
        Path templates containing ``{sset}``, one per QC step. Defaults to the
        notebook-level ``qc_paths``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``derived_sample_id``, with the columns of each step's
        table placed side by side in the order of ``paths``.
    """
    # Fall back to the notebook globals only when the caller gave no override,
    # so existing one-argument calls behave exactly as before.
    base_dir = tracking_dir if base_dir is None else Path(base_dir)
    paths = qc_paths if paths is None else list(paths)

    # Progress line: the sample set and the first file that will be read.
    print(sample_set, base_dir / paths[0].format(sset=sample_set))

    step_tables = [
        pd.read_csv(
            base_dir / template.format(sset=sample_set),
            sep="\t",
            index_col="derived_sample_id",
        )
        for template in paths
    ]
    return pd.concat(step_tables, axis=1, sort=False)

In [7]:
# Read the combined QC table of every sample set in the release and stack them row-wise.
seq_df = pd.concat(list(map(read_qc_data, v3.all_sample_sets)), axis=0)

AG1000G-AO ../../vector-ops/tracking/release/v3/wgs_sequence_qc/sequence_qc_filters_AG1000G-AO.tsv
AG1000G-BF-A ../../vector-ops/tracking/release/v3/wgs_sequence_qc/sequence_qc_filters_AG1000G-BF-A.tsv
AG1000G-BF-B ../../vector-ops/tracking/release/v3/wgs_sequence_qc/sequence_qc_filters_AG1000G-BF-B.tsv
AG1000G-BF-C ../../vector-ops/tracking/release/v3/wgs_sequence_qc/sequence_qc_filters_AG1000G-BF-C.tsv
AG1000G-CD ../../vector-ops/tracking/release/v3/wgs_sequence_qc/sequence_qc_filters_AG1000G-CD.tsv
AG1000G-CF ../../vector-ops/tracking/release/v3/wgs_sequence_qc/sequence_qc_filters_AG1000G-CF.tsv
AG1000G-CI ../../vector-ops/tracking/release/v3/wgs_sequence_qc/sequence_qc_filters_AG1000G-CI.tsv
AG1000G-CM-A ../../vector-ops/tracking/release/v3/wgs_sequence_qc/sequence_qc_filters_AG1000G-CM-A.tsv
AG1000G-CM-B ../../vector-ops/tracking/release/v3/wgs_sequence_qc/sequence_qc_filters_AG1000G-CM-B.tsv
AG1000G-CM-C ../../vector-ops/tracking/release/v3/wgs_sequence_qc/sequence_qc_filters_AG1

In [8]:
# Boolean table marking, for each sample, only the first filter (in column order)
# that it failed; any later failures on the same row are ignored.
int_fail_df = seq_df.eq("FAIL").pipe(
    lambda failed: failed & failed.cumsum(axis=1).eq(1)
)

In [9]:
# Sanity check: the largest number of attributed failures on any row is exactly 1.
assert int_fail_df.sum(axis=1).max() == 1

In [10]:
# Number of samples whose first failure falls on each filter.
int_fail_df.sum(axis=0)

FILTER_frac_genome_cov       227
FILTER_median_cov            154
FILTER_contamination         192
FILTER_nosexcall              25
FILTER_divergence             17
FILTER_fail_replicateqc        8
FILTER_second_rep_hi_skew    407
FILTER_anomaly                82
FILTER_pca                    26
dtype: int64

In [11]:
# Total first failures attributed to the first four filter columns.
# NOTE(review): this is a positional slice; in the displayed column order
# FILTER_divergence is fifth and so is excluded — confirm that is intended.
int_fail_df.iloc[:, :4].sum(axis=0).sum()

598

In [12]:
# Per-filter first-failure counts for the first four filter columns.
int_fail_df.iloc[:, :4].sum(axis=0)

FILTER_frac_genome_cov    227
FILTER_median_cov         154
FILTER_contamination      192
FILTER_nosexcall           25
dtype: int64

In [13]:
# NOTE(review): ad-hoc arithmetic. 227 matches the FILTER_frac_genome_cov count
# shown above, but 183 does not match any count displayed in this notebook —
# confirm where it comes from, or remove this cell.
227 + 183

410

In [14]:
# Total number of samples with an attributed (first) failure, across all filters.
n_fail = int_fail_df.sum().sum()
n_fail

1138

In [15]:
# Number of samples recorded as "PASS" on every filter column.
n_all_pass = seq_df.eq("PASS").all(axis=1).sum()
n_all_pass

3483

In [16]:
# Fraction of samples passing every filter, out of those that either passed
# everything or have an attributed first failure.
n_all_pass / (n_all_pass + n_fail)

0.7537329582341484

## What happened to samples in phase 2?

In [17]:
# Load the phase 2 samples table (tab-separated); its first column becomes the
# index, which is used below to look samples up in the QC results.
phase2_samples = pd.read_csv("../data/phase2_samples.meta.txt", sep="\t", index_col=0)

In [18]:
# Restrict the first-failure table to the phase 2 sample IDs, then count
# first failures per filter among them.
phase2_filter_reasons = int_fail_df.loc[phase2_samples.index, :]
phase2_filter_reasons.sum(axis=0)


FILTER_frac_genome_cov         0
FILTER_median_cov              0
FILTER_contamination           0
FILTER_nosexcall               1
FILTER_divergence              0
FILTER_fail_replicateqc        4
FILTER_second_rep_hi_skew    172
FILTER_anomaly                 0
FILTER_pca                     4
dtype: int64

In [19]:
# Find examples of phase 2 samples whose first failure was the second-replicate
# filter, to check that the result makes sense.
phase2_filter_reasons.loc[phase2_filter_reasons["FILTER_second_rep_hi_skew"]].head()

Unnamed: 0_level_0,FILTER_frac_genome_cov,FILTER_median_cov,FILTER_contamination,FILTER_nosexcall,FILTER_divergence,FILTER_fail_replicateqc,FILTER_second_rep_hi_skew,FILTER_anomaly,FILTER_pca
ox_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AB0085-C,False,False,False,False,False,False,True,False,False
AB0171-C,False,False,False,False,False,False,True,False,False
AB0172-C,False,False,False,False,False,False,True,False,False
AB0175-C,False,False,False,False,False,False,True,False,False
AB0176-C,False,False,False,False,False,False,True,False,False


In [20]:
# Show the failing sample next to the ID carrying an "x" suffix
# (presumably its replicate — TODO confirm) to compare their filter results.
seq_df.loc[["AB0085-C", "AB0085-Cx"], :]

Unnamed: 0_level_0,FILTER_frac_genome_cov,FILTER_median_cov,FILTER_contamination,FILTER_nosexcall,FILTER_divergence,FILTER_fail_replicateqc,FILTER_second_rep_hi_skew,FILTER_anomaly,FILTER_pca
derived_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AB0085-C,PASS,PASS,PASS,PASS,PASS,PASS,FAIL,PASS,PASS
AB0085-Cx,PASS,PASS,PASS,PASS,PASS,PASS,PASS,PASS,PASS


In [21]:
# Same comparison for a second example: the failing sample and the ID with an
# "x" suffix (presumably its replicate — TODO confirm).
seq_df.loc[["AB0171-C", "AB0171-Cx"], :]

Unnamed: 0_level_0,FILTER_frac_genome_cov,FILTER_median_cov,FILTER_contamination,FILTER_nosexcall,FILTER_divergence,FILTER_fail_replicateqc,FILTER_second_rep_hi_skew,FILTER_anomaly,FILTER_pca
derived_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AB0171-C,PASS,PASS,PASS,PASS,PASS,PASS,FAIL,PASS,PASS
AB0171-Cx,PASS,PASS,PASS,PASS,PASS,PASS,PASS,PASS,PASS


In [22]:
# Same comparison for a third example: the failing sample and the ID with an
# "x" suffix (presumably its replicate — TODO confirm).
seq_df.loc[["AB0176-C", "AB0176-Cx"], :]

Unnamed: 0_level_0,FILTER_frac_genome_cov,FILTER_median_cov,FILTER_contamination,FILTER_nosexcall,FILTER_divergence,FILTER_fail_replicateqc,FILTER_second_rep_hi_skew,FILTER_anomaly,FILTER_pca
derived_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AB0176-C,PASS,PASS,PASS,PASS,PASS,PASS,FAIL,PASS,PASS
AB0176-Cx,PASS,PASS,PASS,PASS,PASS,PASS,PASS,PASS,PASS
