## Description

Update 21/3/2021: deprecate the ag3 module in favour of the public malariagen_data one.

This notebook calculates the number of samples dropped at each QC step.

In [1]:
import malariagen_data
import pandas as pd
from pathlib import Path

In [2]:
# Root of the v3 release tracking files; a relative path, so it resolves
# against the notebook's working directory.
tracking_dir = Path("../../vector-ops/tracking/release/v3")

In [3]:
# Path templates (relative to tracking_dir) of the QC filter tables; "{sset}" is
# replaced by the sample set name. Order matters downstream: read_qc_data
# concatenates the tables column-wise in this order.
qc_paths = [
    "wgs_sequence_qc/sequence_qc_filters_{sset}.tsv",    # sequence QC
    "wgs_replicate_qc/replicate_qc_filters_{sset}.tsv",  # replicate QC
    "wgs_population_qc/anomaly_qc_filters_{sset}.tsv",   # population QC: anomaly
    "wgs_population_qc/pca_qc_filters_{sset}.tsv",       # population QC: PCA
]

In [4]:
# Handle to the Ag3 data resource at this GCS URL; used below for sample set
# listings and sample metadata.
ag3 = malariagen_data.Ag3("gs://vo_agam_release/")
ag3

<malariagen_data.ag3.Ag3 at 0x7fb20fd31880>

In [5]:
# Start from every sample set listed for the v3 release.
all_wild_sets = ag3.sample_sets("v3")["sample_set"].tolist()
# NOTE(review): AG1000G-GA-B is added by hand — presumably it is not listed under "v3"; confirm.
all_wild_sets.append("AG1000G-GA-B")
# AG1000G-X (the crosses) is handled separately in the crosses section below.
all_wild_sets.remove("AG1000G-X")

In [6]:
def read_qc_data(sample_set):
    """Load every QC filter table for one sample set and join them column-wise.

    One tab-separated file is read per template in the module-level ``qc_paths``
    (resolved under ``tracking_dir``), each indexed by ``derived_sample_id``.
    The tables are then concatenated along the columns, in ``qc_paths`` order.

    Parameters
    ----------
    sample_set : str
        Sample set identifier substituted for ``{sset}`` in each path template.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``derived_sample_id``, with the columns of all tables placed
        side by side.
    """
    # Progress trace: show the first file that will be read for this sample set.
    print(sample_set, tracking_dir / qc_paths[0].format(sset=sample_set))

    tables = []
    for template in qc_paths:
        table_path = tracking_dir / template.format(sset=sample_set)
        tables.append(
            pd.read_csv(table_path, sep="\t", index_col="derived_sample_id")
        )

    return pd.concat(tables, axis=1, sort=False)

In [7]:
# Stack the per-sample-set QC tables row-wise into one frame covering all wild sets.
seq_df = pd.concat(
    (read_qc_data(sample_set) for sample_set in all_wild_sets),
    axis=0,
)

AG1000G-AO ../../vector-ops/tracking/release/v3/wgs_sequence_qc/sequence_qc_filters_AG1000G-AO.tsv
AG1000G-BF-A ../../vector-ops/tracking/release/v3/wgs_sequence_qc/sequence_qc_filters_AG1000G-BF-A.tsv
AG1000G-BF-B ../../vector-ops/tracking/release/v3/wgs_sequence_qc/sequence_qc_filters_AG1000G-BF-B.tsv
AG1000G-BF-C ../../vector-ops/tracking/release/v3/wgs_sequence_qc/sequence_qc_filters_AG1000G-BF-C.tsv
AG1000G-CD ../../vector-ops/tracking/release/v3/wgs_sequence_qc/sequence_qc_filters_AG1000G-CD.tsv
AG1000G-CF ../../vector-ops/tracking/release/v3/wgs_sequence_qc/sequence_qc_filters_AG1000G-CF.tsv
AG1000G-CI ../../vector-ops/tracking/release/v3/wgs_sequence_qc/sequence_qc_filters_AG1000G-CI.tsv
AG1000G-CM-A ../../vector-ops/tracking/release/v3/wgs_sequence_qc/sequence_qc_filters_AG1000G-CM-A.tsv
AG1000G-CM-B ../../vector-ops/tracking/release/v3/wgs_sequence_qc/sequence_qc_filters_AG1000G-CM-B.tsv
AG1000G-CM-C ../../vector-ops/tracking/release/v3/wgs_sequence_qc/sequence_qc_filters_AG1

In [8]:
seq_df.shape

(3964, 9)

In [9]:
# Flag, for each sample, only the first filter it failed. The running count of
# FAILs along the columns equals 1 from the first failure until the second one,
# so AND-ing it with the FAIL mask keeps just that first failure and ignores
# any subsequent ones.
is_fail = seq_df == "FAIL"
int_fail_df = is_fail & (is_fail.cumsum(axis=1) == 1)

In [10]:
# Sanity check: the row with the most flagged filters has exactly one flag,
# i.e. no sample is attributed to more than one failing step.
assert int_fail_df.sum(axis=1).max() == 1

In [11]:
int_fail_df.sum(0)

FILTER_frac_genome_cov       220
FILTER_median_cov            178
FILTER_contamination         219
FILTER_nosexcall              25
FILTER_divergence             17
FILTER_fail_replicateqc        8
FILTER_second_rep_hi_skew    403
FILTER_anomaly                82
FILTER_pca                    28
dtype: int64

In [12]:
# Total number of first-failures across the first four filter columns.
int_fail_df.iloc[:, :4].sum(axis=0).sum()

642

In [13]:
# Per-filter first-failure counts for the first four filter columns.
int_fail_df.iloc[:, :4].sum(axis=0)

FILTER_frac_genome_cov    220
FILTER_median_cov         178
FILTER_contamination      219
FILTER_nosexcall           25
dtype: int64

In [14]:
# Samples dropped by any filter; each sample is counted once, at its first failure.
n_fail = int_fail_df.sum().sum()
n_fail

1180

In [15]:
# Samples whose every filter column reads "PASS".
n_all_pass = seq_df.eq("PASS").all(axis=1).sum()
n_all_pass

2784

In [16]:
(n_all_pass + n_fail)

3964

In [17]:
n_all_pass / (n_all_pass + n_fail)

0.7023208879919274

In [18]:
# Sample metadata for the "v3_wild" sample sets, fetched through the Ag3 API.
metadata = ag3.sample_metadata(sample_sets="v3_wild")

In [19]:
metadata.shape

(2784, 17)

## What about the crosses?

In [20]:
qc_paths 

['wgs_sequence_qc/sequence_qc_filters_{sset}.tsv',
 'wgs_replicate_qc/replicate_qc_filters_{sset}.tsv',
 'wgs_population_qc/anomaly_qc_filters_{sset}.tsv',
 'wgs_population_qc/pca_qc_filters_{sset}.tsv']

In [21]:
# Extend the QC table list with the cross-selection filter, which is read below
# for the crosses sample set. The membership check keeps this cell idempotent:
# without it, re-running the cell would append the same path again and
# read_qc_data would load that table twice, producing duplicate columns.
cross_selection_path = "wgs_cross_selection_qc/cross_selection_filters_{sset}.tsv"
if cross_selection_path not in qc_paths:
    qc_paths.append(cross_selection_path)

In [22]:
# Boolean view of the crosses QC table: True where a filter value is "PASS".
crosses_qc = read_qc_data("AG1000G-X").eq("PASS")

AG1000G-X ../../vector-ops/tracking/release/v3/wgs_sequence_qc/sequence_qc_filters_AG1000G-X.tsv


In [23]:
crosses_qc.shape

(729, 10)

In [24]:
crosses_qc["FILTER_cross_selected"].sum()

298

In [25]:
# crosses_qc is already boolean (True means "PASS"), so comparing it with the
# string "FAIL" is False everywhere and can never reveal a failure. Invert the
# frame instead: True marks a filter that a cross-selected sample did not pass.
~crosses_qc.query("FILTER_cross_selected")

Unnamed: 0_level_0,FILTER_frac_genome_cov,FILTER_median_cov,FILTER_contamination,FILTER_nosexcall,FILTER_divergence,FILTER_fail_replicateqc,FILTER_second_rep_hi_skew,FILTER_anomaly,FILTER_pca,FILTER_cross_selected
derived_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AC0300-C,False,False,False,False,False,False,False,False,False,False
AC0301-C,False,False,False,False,False,False,False,False,False,False
AC0302-C,False,False,False,False,False,False,False,False,False,False
AC0303-C,False,False,False,False,False,False,False,False,False,False
AC0304-C,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
AD0494-C,False,False,False,False,False,False,False,False,False,False
AD0495-C,False,False,False,False,False,False,False,False,False,False
AD0496-C,False,False,False,False,False,False,False,False,False,False
AD0497-C,False,False,False,False,False,False,False,False,False,False


In [26]:
# Sample metadata for the crosses sample set, fetched separately from the wild sets.
x_meta = ag3.sample_metadata("AG1000G-X")

In [27]:
x_meta.shape

(297, 17)

In [28]:
metadata.shape[0] + x_meta.shape[0]

3081

In [29]:
# Difference between the current total (wild + crosses) and 1142, computed from
# the metadata frames rather than from a pasted-in copy of the previous cell's
# output, so it stays correct if the inputs change.
# NOTE(review): 1142 is presumably the phase 2 sample count — confirm.
n_phase2_samples = 1142
(metadata.shape[0] + x_meta.shape[0]) - n_phase2_samples

1939

## What happened to samples in phase 2?

In [30]:
# Phase 2 sample table (tab-separated); its first column becomes the index,
# which is used below to look the samples up in int_fail_df.
phase2_samples = pd.read_csv("../data/phase2_samples.meta.txt", sep="\t", index_col=0)

In [31]:
# First-failure flags restricted to the phase 2 samples, then counted per filter.
phase2_ids = phase2_samples.index
phase2_filter_reasons = int_fail_df.loc[phase2_ids]
phase2_filter_reasons.sum(axis=0)


FILTER_frac_genome_cov         0
FILTER_median_cov              0
FILTER_contamination           0
FILTER_nosexcall               1
FILTER_divergence              0
FILTER_fail_replicateqc        4
FILTER_second_rep_hi_skew    172
FILTER_anomaly                 0
FILTER_pca                     4
dtype: int64

In [32]:
# Pick a few phase 2 samples whose first failure was the second-replicate
# filter, so the result can be checked by hand.
second_rep_failed = phase2_filter_reasons["FILTER_second_rep_hi_skew"]
phase2_filter_reasons[second_rep_failed].head()

Unnamed: 0_level_0,FILTER_frac_genome_cov,FILTER_median_cov,FILTER_contamination,FILTER_nosexcall,FILTER_divergence,FILTER_fail_replicateqc,FILTER_second_rep_hi_skew,FILTER_anomaly,FILTER_pca
ox_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AB0085-C,False,False,False,False,False,False,True,False,False
AB0171-C,False,False,False,False,False,False,True,False,False
AB0172-C,False,False,False,False,False,False,True,False,False
AB0175-C,False,False,False,False,False,False,True,False,False
AB0176-C,False,False,False,False,False,False,True,False,False


In [33]:
# Compare a sample flagged by FILTER_second_rep_hi_skew with its "-Cx" counterpart.
# NOTE(review): the "x" suffix presumably marks a replicate of the same specimen — confirm.
seq_df.loc[["AB0085-C", "AB0085-Cx"]]

Unnamed: 0_level_0,FILTER_frac_genome_cov,FILTER_median_cov,FILTER_contamination,FILTER_nosexcall,FILTER_divergence,FILTER_fail_replicateqc,FILTER_second_rep_hi_skew,FILTER_anomaly,FILTER_pca
derived_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AB0085-C,PASS,PASS,PASS,PASS,PASS,PASS,FAIL,PASS,PASS
AB0085-Cx,PASS,PASS,PASS,PASS,PASS,PASS,PASS,PASS,PASS


In [34]:
seq_df.loc[["AB0171-C", "AB0171-Cx"]]

Unnamed: 0_level_0,FILTER_frac_genome_cov,FILTER_median_cov,FILTER_contamination,FILTER_nosexcall,FILTER_divergence,FILTER_fail_replicateqc,FILTER_second_rep_hi_skew,FILTER_anomaly,FILTER_pca
derived_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AB0171-C,PASS,PASS,PASS,PASS,PASS,PASS,FAIL,PASS,PASS
AB0171-Cx,PASS,PASS,PASS,PASS,PASS,PASS,PASS,PASS,PASS


In [35]:
seq_df.loc[["AB0176-C", "AB0176-Cx"]]

Unnamed: 0_level_0,FILTER_frac_genome_cov,FILTER_median_cov,FILTER_contamination,FILTER_nosexcall,FILTER_divergence,FILTER_fail_replicateqc,FILTER_second_rep_hi_skew,FILTER_anomaly,FILTER_pca
derived_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AB0176-C,PASS,PASS,PASS,PASS,PASS,PASS,FAIL,PASS,PASS
AB0176-Cx,PASS,PASS,PASS,PASS,PASS,PASS,PASS,PASS,PASS
