In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import numpy as np
from itertools import combinations
import allel
import pandas as pd
from pathlib import Path

## Steps

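1. Load the distance matrix and compute the normalised distance for each sample pair (summed city-block distance divided by the summed number of sites)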

2. Drop samples that failed sample QC

3. For each set of replicates, assert that the distance between them is low\*.
Exclude both members of the pair if false.
Exclude one member if true.

4. For each pair of non-replicates, assert that the distance is not low\*.
Exclude both members of the pair if false.
No action if true.

\* "Low" means below the `threshold` set in the next cell; the exact value is still to be confirmed.

In [None]:
# Distance cut-off: pairs whose distance is strictly below this value count as "low".
threshold = 5e-4

In [None]:
# The sample set identifier is the name of the directory the notebook runs in.
sampleset = Path.cwd().name
sampleset

In [None]:
# Tracking files are read from, and written to, the current working directory.
tracking_dir = Path()

In [None]:
# Path of the manifest file belonging to this sample set.
manifest_fn = f"/gcs/observatory/{sampleset}/manifest"

In [None]:
# Read the manifest, expose its `sample_name` column as `derived_sample_id`,
# and add the two filter columns with every sample starting out as "PASS".
df = (
    pd.read_csv(manifest_fn)
    .rename(columns={"sample_name": "derived_sample_id"})
    .assign(FILTER_replicateqc="PASS", FILTER_isreplicate="PASS")
)

In [None]:
# Preview the first rows of the manifest with its freshly added filter columns.
df.head(n=5)

In [None]:
# Load the original- and derived-sample tracking tables (tab separated).
orig_samples, deri_samples = (
    pd.read_csv(tracking_dir / tsv_name, sep="\t")
    for tsv_name in ('original_samples.tsv', 'derived_samples.tsv')
)

In [None]:
# Open the precomputed replicate-QC archive from the working directory.
# NOTE(review): the file name hard-codes the sample set "AG1000G-UG" instead of
# using `sampleset` — confirm this is intended.
a = np.load(file="replicate-qc-AG1000G-UG.npz")

In [None]:
# Show the names of the arrays stored in the loaded archive.
[key for key in a.keys()]

In [None]:
# Normalised distance: city-block totals divided by site-count totals, both
# summed over the first axis of the stored arrays.
# NOTE(review): axis 0 presumably indexes genomic chunks — confirm.
dist = np.sum(a["cityblock"], axis=0) / np.sum(a["nsites"], axis=0)

In [None]:
# Sample ids in manifest order.
# NOTE(review): this ordering is assumed, not verified, to match the ordering
# used when the distance matrix was computed — confirm.
samples = df.loc[:, "derived_sample_id"].tolist()

In [None]:
# The distance vector is indexed later with condensed (upper-triangle)
# coordinates derived from len(samples), so it must hold exactly one entry per
# unordered sample pair. Verify that arithmetically instead of materialising
# every index pair just to count them and throw them away.
n_samples = len(samples)
expected_npairs = n_samples * (n_samples - 1) // 2
assert np.shape(dist) == (expected_npairs,), (
    "distance vector has shape {}, expected ({},) for {} samples".format(
        np.shape(dist), expected_npairs, n_samples
    )
)

In [None]:
# Per-sample QC filter table; the first column of the file becomes the index.
qc_filters = pd.read_csv(
    tracking_dir.joinpath("wgs_qc", "qc_filters.tsv"),
    sep="\t",
    index_col=0,
)

In [None]:
# A sample passes overall only if every one of its QC filter columns reads "PASS".
# The resulting boolean series is named "PASS" so it can be used as a column later.
passing_samples = qc_filters.eq("PASS").all(axis=1).rename("PASS")

In [None]:
# Keep only the passing rows. After reset_index() the index is the row position
# in qc_filters, and those positions survive the selection.
pass_df = passing_samples.reset_index().loc[lambda frame: frame["PASS"]]

In [None]:
# Every unordered pair of row positions among the QC-passing samples.
pairs_pass = [*combinations(pass_df.index, 2)]

In [None]:
# Walk every pair of QC-passing samples and compare their distance with
# `threshold`. Two samples are treated as replicates of each other when the
# first eight characters of their ids are identical.
#   * replicate pair, distance >= threshold -> both fail FILTER_replicateqc
#   * replicate pair, distance <  threshold -> second member fails FILTER_isreplicate
#   * unrelated pair, distance <  threshold -> both fail FILTER_replicateqc
# NOTE(review): i and j are row positions in qc_filters; they are used as-is to
# index `samples`, `dist` and `df`, which presumably all share that ordering —
# confirm.
# NOTE(review): a NaN distance satisfies neither comparison, so a replicate pair
# with a NaN distance ends up in the "second member is a replicate" branch.
same_list, diff_list = [], []

for i, j in pairs_pass:
    pair_ix = allel.condensed_coords(i, j, len(samples))
    is_replicate_pair = samples[i][:8] == samples[j][:8]
    pair_dist = dist[pair_ix]

    if not is_replicate_pair:
        diff_list.append(pair_dist)
        if pair_dist < threshold:
            print(i, j, "failed", "unexpected closeness", pair_dist)
            for member in (i, j):
                df.loc[member, "FILTER_replicateqc"] = "FAIL"
        continue

    same_list.append(pair_dist)
    if pair_dist >= threshold:
        print(i, j, "failed", "expected pair", pair_dist)
        for member in (i, j):
            df.loc[member, "FILTER_replicateqc"] = "FAIL"
    else:
        # Concordant replicates: keep the first member, flag the second.
        df.loc[j, "FILTER_isreplicate"] = "FAIL"

In [None]:
# Count, per column, how many cells hold the value "FAIL".
df.eq("FAIL").sum(axis=0)

In [None]:
# Overlay the distance distributions of replicate and non-replicate pairs and
# mark the threshold with a dashed vertical line spanning the current y-range.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — check
# the installed version before upgrading.
fig, ax = plt.subplots()
sns.despine(ax=ax)
ax.grid(True)
for values, label in (
    (same_list, "within replicates"),
    (diff_list, "outside replicates "),
):
    sns.distplot(np.array(values), ax=ax, label=label)
y_low, y_high = ax.get_ylim()
ax.vlines([threshold], y_low, y_high, linestyles="dashed")
ax.legend()

In [None]:
# Preview the manifest again, now carrying the replicate QC results.
df.head(n=5)

In [None]:
# Persist the annotated manifest as a tab-separated file, index column included.
df.to_csv(
    tracking_dir.joinpath('wgs_qc/replicate_qc_filters.tsv'),
    sep='\t',
    index=True,
)