In [3]:
%matplotlib notebook

In [4]:
import os
import shutil
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import pandas as pd
import numpy as np
import subprocess as sp
import gzip
import pysam as ps
import csv
from pathlib import Path
from matplotlib.ticker import FormatStrFormatter
import yaml
import openpyxl

In [5]:
project = "Dong"

In [338]:
# Experiment directories and files
project_dir = Path().absolute().parents[1]
data_dir = project_dir / "data"
config_dir = project_dir / "config"
results_dir = project_dir / "results"
truth_dir = data_dir / "truth"
calls_dir = results_dir / "calls" / project
vcfeval_dir = results_dir / "eval" / project
figure_dir = results_dir / "figures"
tables_dir = results_dir / "tables"

all_callers = ["Octopus", "SCCaller", "prosolo"]

caller_colours = {
    'Octopus': sns.xkcd_rgb["windows blue"],
    'SCcaller': sns.xkcd_rgb["pale red"],
    'prosolo': sns.xkcd_rgb["faded green"]
}

caller_linestyles = {
    'Octopus': '',
    'SCcaller': (1,2),
    'prosolo': (6,3)
}

caller_pass_scores = {
    'Octopus': 5.,
    'SCcaller': None
}

In [7]:
def read_experiments(config_filename):
    with config_filename.open() as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)
        res = []
        for control in config["groups"]["controls"]:
            for test in config["groups"][config["test"]]:
                experiment = config.copy()
                experiment["group"] = config["test"]
                experiment["control"] = control
                experiment["test"] = test
                del experiment["groups"]
                res.append(experiment)
        return res

In [8]:
experiments = read_experiments(config_dir / "dong.yaml")

In [9]:
def read_vcfeval_performance_summary(rtg_vcfeval_dir):
    with (rtg_vcfeval_dir / 'summary.txt').open() as summary:
        lines = summary.readlines()
        header, stats = lines[0].strip().split(), lines[-1].strip().split()
        result = pd.DataFrame([stats[1:]], columns=header[1:]).reset_index(drop=True)
        for column in result.columns:
            try:
                result[column] = pd.to_numeric(result[column])
            except: continue
        return result
    return None

def read_vcfeval_roc_header(rtg_vcfeval_roc_filename):
    with gzip.open(rtg_vcfeval_roc_filename, 'rt') as roc:
        result = None
        for line in roc:
            if line.startswith('#'):
                result = line.strip()[1:].split('\t')
            else:
                break
        return result

def get_vcfeval_roc_filename(variant_type):
    if variant_type == 'snv' or variant_type == 'snp':
        return 'snp_roc.tsv.gz'
    elif variant_type == 'indel':
        return 'non_snp_roc.tsv.gz'
    else:
        return 'weighted_roc.tsv.gz'

def read_vcfeval_roc(vcfeval_dir, variant_type='all'):
    roc = vcfeval_dir / get_vcfeval_roc_filename(variant_type)
    result = pd.read_csv(roc, sep='\t', comment='#')
    result.columns = read_vcfeval_roc_header(roc)
    return result

In [10]:
def get_vcfeval_dir(experiment, caller, genotypes=True, raw=True):
    return vcfeval_dir / (experiment["group"] + '.' + experiment["reference"] + '.' + experiment["mapper"] + '.' + caller + "." + experiment["test"] + ".vs." + experiment["control"] + ('.raw' if raw else '.pass') + ('.GT' if genotypes else '.AL') + '.vcfeval')

def read_vcfeval_df_helper(experiments, summary_df_getter, genotypes=True, raw=True):
    dfs = []
    for experiment in experiments:
        for caller in experiment["callers"]:
            caller_vcfeval_dir = get_vcfeval_dir(experiment, caller, genotypes=genotypes, raw=raw)
            if caller_vcfeval_dir.exists():
                try:
                    df = summary_df_getter(experiment, caller_vcfeval_dir)
                    df.insert(0, "Group", experiment["group"])
                    df.insert(0, "Control", experiment["control"])
                    df.insert(1, "Test", experiment["test"])
                    df.insert(3, "Caller", caller)
                    dfs.append(df)
                except FileNotFoundError:
                    continue
    if len(dfs) > 0:
        return pd.concat(dfs).reset_index(drop=True)
    else:
        return None

def read_vcfeval_df_helper2(experiments, summary_df_getter, alleles=True):
    dfs = []
    filtered_gt_df = read_vcfeval_df_helper(experiments, summary_df_getter, genotypes=True, raw=False)
    if filtered_gt_df is not None:
        filtered_gt_df.insert(3, "Filtered", True)
        filtered_gt_df.insert(4, "Match", "GT")
        dfs.append(filtered_gt_df)
    raw_gt_df = read_vcfeval_df_helper(experiments, summary_df_getter,  genotypes=True, raw=True)
    if raw_gt_df is not None:
        raw_gt_df.insert(3, "Filtered", False)
        raw_gt_df.insert(4, "Match", "GT")
        dfs.append(raw_gt_df)
    if alleles:
        filtered_alleles_df = read_vcfeval_df_helper(experiments, summary_df_getter,  genotypes=False, raw=False)
        if filtered_alleles_df is not None:
            filtered_alleles_df.insert(3, "Filtered", True)
            filtered_alleles_df.insert(4, "Match", "AL")
            dfs.append(filtered_alleles_df)
        raw_alleles_df = read_vcfeval_df_helper(experiments, summary_df_getter,  genotypes=False, raw=True)
        if raw_alleles_df is not None:
            raw_alleles_df.insert(3, "Filtered", False)
            raw_alleles_df.insert(4, "Match", "AL")
            dfs.append(raw_alleles_df)
    return pd.concat(dfs).reset_index(drop=True)

def read_vcfeval_performance_summaries(experiments):
    return read_vcfeval_df_helper2(experiments, lambda _, d: read_vcfeval_performance_summary(d))

def read_vcfeval_rocs(experiments):
    return read_vcfeval_df_helper2(experiments, lambda _, d: read_vcfeval_roc(d))

In [208]:
def count_records(vcf_path, matches):
    vcf = ps.VariantFile(vcf_path)
    return sum(matches(record) for record in vcf)

def is_indel(ref, alt, min_size=None, max_size=None):
    if min_size is None and max_size is None:
        return len(alt) != len(ref)
    else:
        size = abs(len(alt) - len(record.ref))
        if min_size is None:
            return size <= max_size
        elif max_size is None:
            return size >= min_size
        else:
            return min_size <= size <= max_size

def has_indel(record, min_size=None):
    return any(is_indel(record.ref, alt, min_size=min_size) for alt in record.alts)

def count_indel_records(vcf_path, min_size=None):
    return count_records(vcf_path, lambda record: has_indel(record, min_size))

def count_alt_indels(record, min_size=None):
    return sum(is_indel(record.ref, alt) for alt in record.alts)

def count_called_indels(vcf_path, min_size=None):
    vcf = ps.VariantFile(vcf_path)
    return sum(count_alt_indels(record, min_size=min_size) for record in vcf)

def subset_vcf(in_vcf_path, bed_regions, out_vcf_path):
    cmd = ['bcftools', 'view', '-T', bed_regions, '-Oz', '-o', out_vcf_path, in_vcf_path]
    call(cmd)
    index(out_vcf_path)

def read_indel_size_distribution(vcf_path, bed_regions=None):
    if bed_regions is not None:
        tmp_vcf = vcf_path.replace('.vcf', '.tmp.vcf')
        subset_vcf(vcf_path, bed_regions, tmp_vcf)
        res = read_indel_size_distribution(tmp_vcf)
        remove_vcf(tmp_vcf)
        return res
    vcf = ps.VariantFile(vcf_path)
    dist = {}
    min_size, max_size = None, None
    for record in vcf:
        for alt in record.alts:
            if is_indel(record.ref, alt):
                size = len(alt) - len(record.ref)
                if size in dist:
                    dist[size] += 1
                else:
                    dist[size] = 1
                if min_size is None:
                    min_size = size
                    max_size = min_size
                else:
                    min_size = min(min_size, size)
                    max_size = max(max_size, size)
    if max_size is not None:
        sizes = list(range(min_size, max_size + 1))
    else:
        sizes = []
    counts = [dist[size] if size in dist else 0 for size in sizes]
    return sizes, counts, dist

def read_indel_size_distribution_df(vcf_path, bed_regions=None):
    sizes, counts, _ = read_indel_size_distribution(vcf_path, bed_regions=bed_regions)
    result = pd.DataFrame({'size': sizes, 'count': counts})
    result[['size', 'count']] = result[['size', 'count']].astype(int)
    return result

def read_indel_size_distributions(experiments, baseline=True):
    vcf = "tp-baseline.vcf.gz" if baseline else "tp.vcf.gz"
    return read_vcfeval_df_helper2(experiments, lambda _, d: read_indel_size_distribution_df(d / vcf))

In [272]:
def read_vaf_df(vcf_path, sample):
    vcf = ps.VariantFile(vcf_path)
    allele_depths = []
    for rec in vcf:
        if len(rec.alts) == 1 and sum(rec.samples[sample]["GT"]) == 1:
            allele_depths.append(rec.samples[sample]["AD"])
    result = pd.DataFrame(allele_depths, columns=["REF", "ALT"])
    result["VAF"] = result.apply(lambda row: row.ALT / (row.REF + row.ALT), axis=1)
    return result

def read_vaf_distributions(experiments):
    return read_vcfeval_df_helper2(experiments, lambda experiment, d: read_vaf_df(d / "tp.vcf.gz", experiment["test"]))

In [349]:
def add_precision_recall(ax, df, ylim=None, xlim=None, despine=True,
                         title=None, legend=True, legend_title=False,
                         y_format='%.3f', x_format='%.3f',
                         ylabel='Precision (PPV)', xlabel='Sensitivity'):
    if legend:
        legend = 'brief'
    callers = list(df["Caller"].unique())
    sns.lineplot(x="sensitivity", y="precision",
                 hue="Caller",
                 style="Caller",
                 palette=caller_colours,
                 #dashes=caller_linestyles,
                 hue_order=list(reversed(callers)),
                 legend=legend, data=df, ax=ax)
    ax.set(ylabel=ylabel, xlabel=xlabel)
    ax.yaxis.set_major_formatter(FormatStrFormatter(y_format))
    ax.xaxis.set_major_formatter(FormatStrFormatter(x_format))
    if legend and not legend_title:
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles=handles[1:], labels=labels[1:])
    if despine:
        sns.despine(ax=ax)
    if title is not None:
        ax.set(title=title)
    if ylim:
        ax.set_ylim(ylim[0], ylim[1])
    if xlim:
        ax.set_xlim(xlim[0], xlim[1])
    return ax

def add_marker(ax, x, y, colour, style='o', size=50):
    sns.regplot(x=np.array([x]), y=np.array([y]), scatter=True, fit_reg=False,
                marker=style, color=colour, scatter_kws={"s": size}, ax=ax)

def find_marker(df, caller, score):
    df = df.replace('None', 0)
    df['score'] = df['score'].apply(pd.to_numeric)
    if score is not None:
        for tol in [1e-10, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, .5]:
            lines = df[(df.Caller == caller) & np.isclose(df.score, score, atol=tol)]
            if lines.shape[0] > 0:
                break
    else:
        score = df.query("Caller=='" + caller + "'").score.min()
        lines = df[(df.Caller == caller) & (df.score == score)]
    if lines.shape[0] == 0:
        raise Exception("Could not find a PASS marker for " + caller)
    return float(lines['sensitivity'].iloc[0]), float(lines['precision'].iloc[0])

def plot_precision_recall(df, 
                          ylim=None, xlim=None,
                          marker_scores=None,
                          title=None, legend_out=True,
                          y_format='%.1f', x_format='%.1f',
                          save=None, save_format='pdf'):
    sns.set(style="whitegrid")
    fig, ax = plt.subplots(figsize=(9, 6))
    add_precision_recall(ax, df, ylim=ylim, xlim=xlim, y_format=y_format, x_format=x_format, title=title)
    callers = list(df["Caller"].unique())
    if marker_scores is not None:
        for caller in reversed(callers):
            if caller in list(df['Caller'].unique()) and caller in marker_scores:
                try:
                    marker_x, marker_y = find_marker(df, caller, caller_pass_scores[caller])
                    add_marker(ax, marker_x, marker_y, caller_colours[caller])
                except:
                    print('Could not add PASS marker for ', caller)
    if legend_out:
        handles, labels = ax.get_legend_handles_labels()
        lgd = fig.legend(handles=reversed(handles), labels=reversed(labels),
                         frameon=False, loc='upper center',
                         bbox_to_anchor=(0.5, 1.03),
                         borderaxespad=0, ncol=len(labels))
        ax.legend_.remove()
    plt.tight_layout()
    if save is None:
        plt.show()
    else:
        plt.savefig(save, format=save_format, transparent=True, bbox_inches='tight')

def add_precision_recalls(axes, dfs, titles=None, ylims=None, xlims=None, y_format=None, x_format=None, xlabel='Sensitivity'):
    if titles is None:
        titles = len(dfs) * [None]
    for df, title, ax, ylim, xlim in zip(dfs, titles, axes, ylims, xlims):
        add_precision_recall(ax, df,
                             ylim=ylim, xlim=xlim,
                             y_format=y_format, x_format=x_format,
                             title=title,
                             legend_title=True,
                             xlabel=xlabel)
    for ax in axes[1:]:
        ax.set(ylabel='')

def add_markers(dfs, marker_scores, axes):
    for df, ax in zip(dfs, axes):
        callers = list(df["Caller"].unique())
        for caller in reversed(callers):
            if caller in marker_scores:
                try:
                    marker_x, marker_y = find_marker(df, caller, caller_pass_scores[caller])
                    add_marker(ax, marker_x, marker_y, caller_colours[caller], size=25)
                except:
                    print('Could not add PASS marker for ', caller)

def plot_precision_recalls(dfs,
                           titles=None,
                           xlims=None, ylims=None,
                           y_format='%.1f', x_format='%.1f',
                           marker_scores=None,
                           ncols=4,
                           save=None, save_format='pdf'):
    sns.set(style="whitegrid")
    n_subplots = len(dfs)
    nrows, ncols = int(n_subplots/ncols), min(n_subplots, ncols)
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(2*ncols*ncols, 2*ncols*nrows))
    if ylims is None: 
        ylims = len(dfs) * [None]
    elif type(ylims) is not list:
        ylims = len(dfs) * [ylims]
    if xlims is None:
        xlims = len(dfs) * [None]
    elif type(xlims) is not list:
        xlims = len(dfs) * [xlims]
    for i in range(nrows - 1):
        add_precision_recalls(axes.flat[i*ncols:i*ncols+ncols], 
                              dfs[i*ncols:i*ncols+ncols], 
                              titles[i*ncols:i*ncols+ncols],
                              ylims=ylims[i*ncols:i*ncols+ncols],
                              xlims=xlims[i*ncols:i*ncols+ncols],
                              y_format=y_format, x_format=x_format,
                              xlabel='')
    add_precision_recalls(axes.flat[-ncols:],
                          dfs[-ncols:],
                          titles[-ncols:],
                          ylims=ylims, xlims=xlims,
                          y_format=y_format, x_format=x_format)
    if marker_scores is not None:
        add_markers(dfs, marker_scores, axes.flat)
    handles, labels = axes.flat[-1].get_legend_handles_labels()
    lgd = fig.legend(handles=reversed(handles), labels=reversed(labels),
                     frameon=False, loc='upper center',
                     bbox_to_anchor=(0.5, 1.03),
                     borderaxespad=0, ncol=len(labels))
    for ax in axes.flat:
        ax.legend_.remove()
    plt.tight_layout()
    if save is None:
        plt.show()
    else:
        plt.savefig(save, format=save_format, transparent=True, bbox_extra_artists=[lgd], bbox_inches='tight')

In [165]:
summary_df = read_vcfeval_performance_summaries(experiments)

In [166]:
summary_df

Unnamed: 0,Control,Test,Group,Filtered,Match,Caller,True-pos-baseline,True-pos-call,False-pos,False-neg,Precision,Sensitivity,F-measure
0,Hunamp,IL-2,clones_and_cells,True,GT,Octopus,4253757,4282215,546577,185063,0.8868,0.9583,0.9212
1,Hunamp,IL-2,clones_and_cells,True,GT,SCcaller,2586109,2586088,121292,1852873,0.9552,0.5826,0.7238
2,Hunamp,IL-3,clones_and_cells,True,GT,Octopus,4251490,4279921,548799,187330,0.8863,0.9578,0.9207
3,Hunamp,IL-3,clones_and_cells,True,GT,SCcaller,2001148,2001127,204009,2437834,0.9075,0.4508,0.6024
4,Hunamp,IL-4,clones_and_cells,True,GT,Octopus,4252413,4280868,548708,186413,0.8864,0.9580,0.9208
...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,IL1C,IL-5,clones_and_cells,False,AL,SCcaller,2222654,2222518,9614,3189114,0.9957,0.4107,0.5815
92,IL1C,IL11,clones_and_cells,False,AL,Octopus,4501527,4506598,333036,910241,0.9312,0.8318,0.8787
93,IL1C,IL11,clones_and_cells,False,AL,SCcaller,2903345,2903193,18046,2508423,0.9938,0.5365,0.6968
94,IL1C,IL12,clones_and_cells,False,AL,Octopus,4499710,4504819,333925,912058,0.9310,0.8315,0.8784


In [298]:
agg_summary_df = summary_df.query("Filtered and Control=='IL1C' and Match=='GT'").groupby(["Caller"]).agg({
    "True-pos-baseline": "sum",
    "True-pos-call": "sum",
    "False-pos": "sum",
    "False-neg": "sum",
}).reset_index()
agg_summary_df["Precision"] = agg_summary_df["True-pos-baseline"] / (agg_summary_df["True-pos-baseline"] + agg_summary_df["False-pos"])
agg_summary_df["Sensitivity"] = agg_summary_df["True-pos-baseline"] / (agg_summary_df["True-pos-baseline"] + agg_summary_df["False-neg"])
agg_summary_df["F-measure"] = (2 * agg_summary_df["Sensitivity"] * agg_summary_df["Precision"]) / (agg_summary_df["Sensitivity"] + agg_summary_df["Precision"])
agg_summary_df

Unnamed: 0,Caller,True-pos-baseline,True-pos-call,False-pos,False-neg,Precision,Sensitivity,F-measure
0,Octopus,25589057,25763848,3209631,1070441,0.888549,0.959848,0.922823
1,SCcaller,12451104,12450997,832579,14209260,0.937323,0.467027,0.623427


In [278]:
summary_df.to_excel(tables_dir / "dong_summary.xlsx")

In [197]:
sns.set(style="white")
fig, ax = plt.subplots(figsize=(6, 5))
g = sns.barplot(x="Test", y="F-measure", hue="Caller",
                palette=caller_colours, ci=None,
                data=summary_df.query("Control=='IL1C'"),
                ax=ax)
sns.despine(ax=ax)
ax.set(xlabel='')
handles, labels = ax.get_legend_handles_labels()
lgd = fig.legend(handles=handles, labels=labels, frameon=False,
                 loc='upper center', #bbox_to_anchor=(0.5, .95),
                 borderaxespad=0, ncol=len(labels))
ax.legend_.remove()
plt.tight_layout()
plt.savefig(figure_dir / "dong_fmeasures.pdf", format="pdf", transparent=True, bbox_inches='tight')

<IPython.core.display.Javascript object>

In [168]:
rocs = read_vcfeval_rocs(experiments)
filterd_rocs, raw_rocs = rocs.query('Filtered'), rocs.query('not Filtered')

In [172]:
plot_precision_recall(filterd_rocs.query("Match=='GT' and Control=='IL1C' and Test=='IL12'"))

<IPython.core.display.Javascript object>

In [350]:
cells = ["IL-2", "IL-3", "IL-4", "IL-5", "IL11", "IL12"]
dfs = [filterd_rocs.query("Match=='GT' and Control=='IL1C' and Test=='" + cell + "'") for cell in cells]
plot_precision_recalls(dfs, titles=cells, ncols=3, y_format='%.2f',
                       marker_scores=caller_pass_scores,
                       save = figure_dir / "dong_precision_recalls.pdf")

<IPython.core.display.Javascript object>

In [209]:
indel_stats = read_indel_size_distributions(experiments)

In [210]:
indel_stats

Unnamed: 0,Control,Test,Group,Filtered,Match,Caller,size,count
0,Hunamp,IL-2,clones_and_cells,True,GT,Octopus,-49,38
1,Hunamp,IL-2,clones_and_cells,True,GT,Octopus,-48,77
2,Hunamp,IL-2,clones_and_cells,True,GT,Octopus,-47,32
3,Hunamp,IL-2,clones_and_cells,True,GT,Octopus,-46,57
4,Hunamp,IL-2,clones_and_cells,True,GT,Octopus,-45,52
...,...,...,...,...,...,...,...,...
9383,IL1C,IL12,clones_and_cells,False,AL,SCcaller,45,5
9384,IL1C,IL12,clones_and_cells,False,AL,SCcaller,46,2
9385,IL1C,IL12,clones_and_cells,False,AL,SCcaller,47,3
9386,IL1C,IL12,clones_and_cells,False,AL,SCcaller,48,5


In [289]:
indel_stats.query("Filtered and Control=='IL1C' and Match=='GT'").groupby(["Caller"]).agg({"count": "sum"})

Unnamed: 0_level_0,count
Caller,Unnamed: 1_level_1
Octopus,4417551
SCcaller,891310


In [291]:
indel_stats.query("Filtered and Control=='IL1C' and Match=='GT' and size>0").groupby(["Caller"]).agg({"count": "sum"})

Unnamed: 0_level_0,count
Caller,Unnamed: 1_level_1
Octopus,2138137
SCcaller,381354


In [292]:
indel_stats.query("Filtered and Control=='IL1C' and Match=='GT' and size<0").groupby(["Caller"]).agg({"count": "sum"})

Unnamed: 0_level_0,count
Caller,Unnamed: 1_level_1
Octopus,2279414
SCcaller,509956


In [290]:
summary_df.query("Filtered and Control=='IL1C' and Match=='GT'").groupby(["Caller"]).agg({"True-pos-baseline": "sum"})

Unnamed: 0_level_0,True-pos-baseline
Caller,Unnamed: 1_level_1
Octopus,25589057
SCcaller,12451104


In [293]:
indel_stats.query("Filtered and Control=='IL1C' and Match=='AL'").groupby(["Caller"]).agg({"count": "sum"})

Unnamed: 0_level_0,count
Caller,Unnamed: 1_level_1
Octopus,4552634
SCcaller,1252903


In [287]:
sns.set(style="white")
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(12, 7))
df = indel_stats.query("Filtered and Match=='GT' and Control=='IL1C' and Test=='IL11' and count>0").copy()
sns.scatterplot(x="size", y="count",
                hue="Caller", 
                hue_order=list(reversed(df["Caller"].unique())),
                palette=caller_colours,
                s=20, marker="o",
                data=df,
                ax=axes[0])
df = indel_stats.query("Filtered and Match=='GT' and Control=='IL1C' and Test=='IL12' and count>0").copy()
sns.scatterplot(x="size", y="count",
                hue="Caller", 
                hue_order=list(reversed(df["Caller"].unique())),
                palette=caller_colours,
                s=20, marker="o",
                data=df,
                ax=axes[1])
sns.despine(ax=axes[0])
sns.despine(ax=axes[1])
handles, labels = axes[0].get_legend_handles_labels()
lgd = fig.legend(handles=reversed(handles), labels=reversed(labels),
                 frameon=False, loc='upper center',
                 bbox_to_anchor=(0.5, 0.95),
                 borderaxespad=0,
                 ncol=len(labels))
axes[0].set_title('IL11', loc='left')
axes[1].set_title('IL12', loc='left')
axes[0].set_yscale('log')
axes[1].set_yscale('log')
axes[0].set(ylabel='True indels')
axes[1].set(ylabel='True indels')
axes[0].set(xlabel='')
axes[1].set(xlabel='Indel length')
axes[0].legend_.remove()
axes[1].legend_.remove()
# plt.tight_layout()
plt.savefig(figure_dir / "dong_indel_sizes.pdf",
            format='pdf', 
            transparent=True,
            bbox_extra_artists=[lgd],
            bbox_inches='tight')

<IPython.core.display.Javascript object>

In [260]:
def find_experiment(experiments, control, test):
    for experiment in experiments:
        if experiment["control"] == control and experiment["test"] == test:
            return experiment
    return None

In [282]:
e1 = find_experiment(experiments, "IL1C", "IL11")
e2 = find_experiment(experiments, "IL1C", "IL12")
e1["callers"] = ["SCcaller"]
e2["callers"] = ["SCcaller"]
vaf_df = read_vaf_distributions([e1, e2])

In [274]:
vaf_df

Unnamed: 0,Control,Test,Group,Filtered,Match,Caller,REF,ALT,VAF
0,IL1C,IL11,clones_and_cells,True,GT,SCcaller,2,16,0.888889
1,IL1C,IL11,clones_and_cells,True,GT,SCcaller,18,10,0.357143
2,IL1C,IL11,clones_and_cells,True,GT,SCcaller,4,8,0.666667
3,IL1C,IL11,clones_and_cells,True,GT,SCcaller,8,16,0.666667
4,IL1C,IL11,clones_and_cells,True,GT,SCcaller,52,25,0.324675
...,...,...,...,...,...,...,...,...,...
4783509,IL1C,IL11,clones_and_cells,False,AL,SCcaller,32,45,0.584416
4783510,IL1C,IL11,clones_and_cells,False,AL,SCcaller,23,11,0.323529
4783511,IL1C,IL11,clones_and_cells,False,AL,SCcaller,6,8,0.571429
4783512,IL1C,IL11,clones_and_cells,False,AL,SCcaller,7,8,0.533333


In [281]:
sns.violinplot(x="VAF", 
               y="Test",
               cut=True,
               inner=None,
               data=vaf_df.query("Filtered and Match=='GT'"))

<IPython.core.display.Javascript object>

<AxesSubplot:xlabel='VAF', ylabel='Test'>

In [135]:
from collections import defaultdict

def intersect_samples(vcf_filename, samples=None, genotypes=True, fold=True):
    vcf = ps.VariantFile(vcf_filename)
    if samples is None:
        samples = list(vcf.header.samples)
    intsersections = defaultdict(int)
    for rec in vcf:
        if len(rec.filter) == 0 or "PASS" in rec.filter:
            assignments = defaultdict(list)
            for sample in samples:
                if not all(a is None for a in rec.samples[sample].alleles):
                    if genotypes:
                        assignments[rec.samples[sample]["GT"]].append(sample)
                    else:
                        assignments[tuple(set(rec.samples[sample]["GT"]) - set([0]))].append(sample)
            if fold:
                intsersections[tuple(min(assignments.items(), key=lambda x: len(x[1]))[1])] += 1
            else:
                for _, assigned_samples in assignments.items():
                    intsersections[tuple(assigned_samples)] += 1
    data = []
    for sample_set, count in intsersections.items():
        labels = [sample in sample_set for sample in samples]
        data.append(labels + [sum(labels), count])
    return pd.DataFrame(data, columns=samples + ["Cells", "Count"])

In [89]:
cell_names = ["IL-2", "IL-3", "IL-4", "IL-5", "IL11", "IL12"]

In [136]:
sccanner_isecs = intersect_samples(calls_dir / "clones_and_cells.hs38DH.bwa.SCcaller.vcf.gz")

In [127]:
sccanner_alleles_isecs = intersect_samples(calls_dir / "clones_and_cells.hs38DH.bwa.SCcaller.vcf.gz", genotypes=False)

In [138]:
octopus_isecs = intersect_samples(calls_dir / "clones_and_cells.hs38DH.bwa.Octopus.vcf.gz", samples=cell_names)

In [129]:
octopus_alleles_isecs = intersect_samples(calls_dir / "clones_and_cells.hs38DH.bwa.Octopus.vcf.gz", genotypes=False, samples=cell_names)

In [139]:
octopus_isecs

Unnamed: 0,IL-2,IL-3,IL-4,IL-5,IL11,IL12,Cells,Count
0,True,True,True,True,True,True,6,4795803
1,False,True,True,False,False,False,2,1761
2,True,True,False,False,True,False,3,1589
3,False,False,False,False,True,False,1,3778
4,True,True,False,False,False,False,2,1788
5,False,True,False,False,False,True,2,2304
6,False,False,False,True,False,False,1,4738
7,False,False,True,False,False,False,1,5656
8,False,False,False,False,False,True,1,7378
9,False,False,True,False,True,False,2,1639


In [202]:
sccaller_agg_df = sccanner_isecs.groupby("Cells").aggregate({"Count": "sum"})
sccaller_agg_df["Caller"] = "SCcaller"
octopus_agg_df = octopus_isecs.groupby("Cells").aggregate({"Count": "sum"})
octopus_agg_df["Caller"] = "Octopus"
agg_df = pd.concat([sccaller_agg_df, octopus_agg_df]).reset_index()

In [153]:
agg_df

Unnamed: 0,Cells,Count,Caller
0,1,1402326,SCCaller
1,2,478371,SCCaller
2,3,405282,SCCaller
3,4,492403,SCCaller
4,5,573333,SCCaller
5,6,501769,SCCaller
6,1,32125,Octopus
7,2,28342,Octopus
8,3,14361,Octopus
9,6,4795803,Octopus


In [277]:
sns.set(style="white")
fig, ax = plt.subplots(figsize=(6, 5))
g = sns.barplot(x="Cells", y="Count", hue="Caller",
                palette=caller_colours, ci=None,
                data=agg_df,
                ax=ax)
sns.despine(ax=ax)
ax.set_yscale('log')
ax.set(xlabel='', ylabel='Number of variants')
handles, labels = ax.get_legend_handles_labels()
lgd = fig.legend(handles=handles, labels=labels, frameon=False,
                 loc='upper center', #bbox_to_anchor=(0.5, .95),
                 borderaxespad=0, ncol=len(labels))
ax.legend_.remove()
plt.tight_layout()
plt.savefig(figure_dir / "dong_isec_summary.pdf", format="pdf", transparent=True, bbox_inches='tight')

<IPython.core.display.Javascript object>

In [190]:
int(octopus_isecs.query("Cells<6")["Count"].sum()), octopus_isecs["Count"].sum(), int(octopus_isecs.query("Cells<6")["Count"].sum())/ octopus_isecs["Count"].sum()

(74828, 4870631, 0.015363101823973116)

In [191]:
int(sccanner_isecs.query("Cells<6")["Count"].sum()), sccanner_isecs["Count"].sum(), int(sccanner_isecs.query("Cells<6")["Count"].sum()) / sccanner_isecs["Count"].sum()

(3351715, 3853484, 0.8697882228134333)

In [79]:
sns.color_palette()

In [137]:
def get_upsetr_input(isecs):
    items = []
    for _, row in isecs.iterrows():
        labels = []
        for sample, present in zip(isecs.columns[:-2], row):
            if present: labels.append(sample)
        items.append("'" + '&'.join(labels) + "'=" + str(row[-2]))
    return ','.join(items)

def get_upsetr_query(isecs, colours=None):
    if colours is None:
        palette = sns.color_palette().as_hex()
        colours = {
            1: palette[3],
            2: palette[4],
            3: palette[0],
            4: palette[5],
            5: palette[1],
            6: palette[2]
        }
    queries = []
    for _, row in isecs.iterrows():
        labels = []
        for sample, present in zip(isecs.columns[:-2], row):
            if present: labels.append('"' + sample + '"')
        queries.append('list(query = intersects, params = list(' + ','.join(labels) + '), active = T, color = "' + colours[len(labels)] + '")')
    return "list(" + ','.join(queries) + ")"

In [130]:
get_upsetr_input(octopus_alleles_isecs)

"'IL-2&IL-3&IL-4&IL-5&IL11&IL12'=4813085,'IL-3&IL-4'=1373,'IL-2&IL-3&IL11'=1241,'IL11'=2812,'IL-2&IL-3'=1418,'IL-3&IL12'=1834,'IL-5'=3619,'IL12'=5386,'IL-4&IL11'=1299,'IL-3'=5132,'IL-2'=2497,'IL-2&IL-3&IL-5'=1117,'IL-3&IL-5'=1401,'IL-4'=4248,'IL-2&IL-4&IL12'=993,'IL-2&IL-3&IL-4'=1205,'IL-2&IL11'=1307,'IL11&IL12'=1990,'IL-2&IL-4&IL-5'=1266,'IL-3&IL11'=1341,'IL-2&IL-5&IL12'=1046,'IL-4&IL-5'=1946,'IL-2&IL-3&IL12'=1169,'IL-4&IL12'=1583,'IL-2&IL-5'=1250,'IL-5&IL12'=1891,'IL-2&IL-5&IL11'=1044,'IL-2&IL-4&IL11'=1074,'IL-2&IL11&IL12'=1214,'IL-5&IL11'=1127,'IL-2&IL12'=1316,'IL-2&IL-4'=1407"

In [111]:
get_upsetr_query(sccanner_isecs)

'list(list(query = intersects, params = list("IL-3","IL11"), active = T, color = "#8172b3"),list(query = intersects, params = list("IL11"), active = T, color = "#c44e52"),list(query = intersects, params = list("IL-2"), active = T, color = "#c44e52"),list(query = intersects, params = list("IL-2","IL-4"), active = T, color = "#8172b3"),list(query = intersects, params = list("IL-2","IL-3","IL11"), active = T, color = "#4c72b0"),list(query = intersects, params = list("IL12"), active = T, color = "#c44e52"),list(query = intersects, params = list("IL-2","IL11"), active = T, color = "#8172b3"),list(query = intersects, params = list("IL-4","IL12"), active = T, color = "#8172b3"),list(query = intersects, params = list("IL-5"), active = T, color = "#c44e52"),list(query = intersects, params = list("IL-2","IL-3","IL-4","IL-5","IL11","IL12"), active = T, color = "#55a868"),list(query = intersects, params = list("IL-4"), active = T, color = "#c44e52"),list(query = intersects, params = list("IL-2","I