Assess the accuracy of BEAN estimates by # of co-occurring variants within the editing window of a gRNA.

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import bean as be

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
plt.style.use('jr')
plt.rcParams["pdf.use14corefonts"] = True
matplotlib.rcParams['pdf.fonttype'] = 42 
matplotlib.rcParams["axes.unicode_minus"] = False
sns.set_theme(style="ticks", rc={"axes.facecolor": (0, 0, 0, 0)})

In [3]:
bdata = be.read_h5ad("../../../results/filtered_annotated/LDLRCDS/bean_count_LDLRCDS_annotated_spacer0.1_0.3.h5ad")

In [4]:
score_tbl = pd.read_csv("../../Fig5c_hits/TableSX_tiling_screen_UKBmerged.20230817.alpha_fixed.csv")

In [5]:
score_tbl_single = score_tbl.loc[score_tbl.n_guides == 1]

In [6]:
allele_tbl = bdata.uns["sig_allele_counts_spacer_0_19_A.G_translated_prop0.1_0.3"].copy()

In [7]:
allele_rates_tbl = bdata.get_normalized_allele_counts(allele_tbl)

In [16]:
allele_rates_tbl

Unnamed: 0,guide,aa_allele,rep5_top,rep5_high,rep5_bulk,rep5_low,rep5_bot,rep6_top,rep6_high,rep6_bulk,...,rep8_bot,rep9_top,rep9_high,rep9_bulk,rep9_low,rep9_bot,bulk_mean,nt_edits,aa_edits,edits
0,10_2511_neg,|11224168:20:-:A>G,0.009091,0.144186,0.000000,0.150000,0.006369,0.209503,0.067568,0.124260,...,0.146341,0.000000,0.011834,0.146341,0.086022,,0.103728,[11224168:T>C],[],[11224168:T>C]
1,10_2511_neg,"|11224175:13:-:A>G,11224178:10:-:A>G",0.490909,0.362791,0.443396,0.212500,0.050955,0.222462,0.319820,0.278107,...,0.414634,0.000000,0.213018,0.121951,0.333333,,0.357741,"[11224175:T>C, 11224178:T>C]",[],"[11224175:T>C, 11224178:T>C]"
2,10_2511_neg,|11224178:10:-:A>G,0.300000,0.148837,0.405660,0.250000,0.305732,0.127430,0.279279,0.171598,...,0.067073,0.000000,0.360947,0.268293,0.204301,,0.220288,[11224178:T>C],[],[11224178:T>C]
3,10_2514_neg,|11224178:13:-:A>G,0.190722,0.251553,0.337748,0.280255,0.296552,0.314894,0.260753,0.430693,...,0.195767,0.000000,0.191781,0.133333,0.205674,0.583333,0.299068,[11224178:T>C],[],[11224178:T>C]
4,10_2514_neg,|11224184:7:-:A>G,0.254296,0.478261,0.271523,0.337580,0.344828,0.591489,0.502688,0.227723,...,0.222222,0.000000,0.150685,0.300000,0.262411,0.416667,0.236297,[11224184:T>C],[],[11224184:T>C]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11706,Intron 1 DNaseHS 2_5704_pos,"|11203065:6:+:A>G,11203076:17:+:A>G,11203077:1...",0.666667,0.867925,0.500000,0.883117,,0.301887,0.431818,0.939394,...,0.650000,0.000000,0.705882,,0.409091,,0.623585,"[11203076:A>G, 11203079:A>G, 11203065:A>G, 112...",[],"[11203076:A>G, 11203079:A>G, 11203065:A>G, 112..."
11707,Intron 1 DNaseHS 2_5707_pos,"|11203077:15:+:A>G,11203079:17:+:A>G",0.000000,0.049383,0.107527,0.151515,0.260563,0.035417,0.134078,0.195980,...,0.371429,0.000000,0.184971,0.235294,0.077519,1.000000,0.217760,"[11203077:A>G, 11203079:A>G]",[],"[11203077:A>G, 11203079:A>G]"
11708,Intron 1 DNaseHS 2_5707_pos,|11203079:17:+:A>G,0.487685,0.370370,0.870968,0.745455,0.521127,0.393750,0.430168,0.628141,...,0.342857,0.266667,0.630058,0.647059,0.635659,0.000000,0.631733,[11203079:A>G],[],[11203079:A>G]
11709,Intron 1 DNaseHS 2_5710_pos,|11203079:14:+:A>G,0.772727,0.000000,0.400000,,,,,0.882353,...,0.920000,0.192308,0.333333,,0.777778,0.428571,0.727451,[11203079:A>G],[],[11203079:A>G]


In [8]:
allele_rates_tbl["bulk_mean"] = allele_rates_tbl.loc[:,allele_rates_tbl.columns.map(lambda s: "bulk" in s)].mean(axis=1)

In [9]:
allele_rates_tbl["nt_edits"] = allele_rates_tbl.aa_allele.map(lambda cnallele: [e.get_abs_edit() for e in cnallele.nt_allele.edits])
allele_rates_tbl["aa_edits"] = allele_rates_tbl.aa_allele.map(lambda cnallele: [e for e in cnallele.aa_allele.edits])
allele_rates_tbl["edits"] = allele_rates_tbl[["guide","nt_edits"]].apply(lambda row: [f"{row.guide}!{s}" for s in row.nt_edits] if "CONTROL" in row.guide else [str(s) for s in row.nt_edits], axis=1) + allele_rates_tbl["aa_edits"].map(lambda a: ["A"+str(s) for s in a])

In [10]:
allele_rates_tbl = allele_rates_tbl.reset_index(drop=True)

In [11]:
score_tbl_single = score_tbl_single.reset_index(drop=True)

In [12]:
def get_allele_rate(e: be.Edit):
    n_vars = []
    cooccurring_edits = set()
    mean_rates = []
    for i, edits_list in enumerate(allele_rates_tbl.edits):
        if e in edits_list:
            n_vars.append(len(edits_list))
            cooccurring_edits.update(edits_list)
            mean_rates.append(allele_rates_tbl.iloc[i]["bulk_mean"])
    return n_vars, mean_rates, cooccurring_edits
n_vars, variant_mean_rates, coocc = zip(*score_tbl.edit.map(get_allele_rate))

In [13]:
score_tbl["n_vars"] = n_vars
score_tbl["allele_rate"] = variant_mean_rates
score_tbl['cooccurring_edits'] = coocc
score_tbl['n_coo'] = score_tbl['cooccurring_edits'].map(len) - 1

In [14]:
score_tbl_with_ukb = score_tbl.loc[~score_tbl.mean_LDL.isnull()]

In [15]:
nbins = 4
score_tbl_with_ukb["eer_qbin"] = pd.qcut(score_tbl_with_ukb.effective_edit_rate, q=nbins)
score_tbl_with_ukb["nco_qbin"] = pd.qcut(score_tbl_with_ukb.n_coo, q=nbins)
score_tbl_with_ukb["ng_qbin"] = pd.qcut(score_tbl_with_ukb.n_guides, q=nbins)
score_tbl_with_ukb["cdg_qbin"] = pd.qcut(score_tbl_with_ukb.n_coo_div_nguides, q=nbins)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  score_tbl_with_ukb["eer_qbin"] = pd.qcut(score_tbl_with_ukb.effective_edit_rate, q=nbins)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  score_tbl_with_ukb["nco_qbin"] = pd.qcut(score_tbl_with_ukb.n_coo, q=nbins)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  score_tbl_with_ukb["ng_qbin"] = pd.qcut

AttributeError: 'DataFrame' object has no attribute 'n_coo_div_nguides'

In [None]:
fig, ax = plt.subplots(1, 4, figsize=(12,4))

p = ax[0].hist2d(x=score_tbl_with_ukb.n_coo, y=score_tbl_with_ukb.n_guides, bins= (np.arange(-0.5, score_tbl_with_ukb.n_coo.max()+1.5, 1), np.arange(0.5, score_tbl_with_ukb.n_guides.max()+1.5, 1)))
ax[0].set_ylabel("# guides")
ax[0].set_xlabel('# co-occurring variants')
fig.colorbar(p[3], orientation="horizontal", location="top")

measures = ["n_coo", "n_guides", "n_coo_div_nguides",]
div = ["nco_qbin", "ng_qbin", "cdg_qbin",]
labels = ['# co-occurring variants', '# of variant supporting gRNAs', "# co-occurring variants\nnormalized by # supporting gRNAs"]

for i in range(4):
    bins = np.arange(-0.5, score_tbl_with_ukb[measures[i]].max()+1.5, 1) if i < 2 else None
    ax[1+i].hist(score_tbl_with_ukb[measures[i]], bins= bins)
    ax[1+i].set_ylabel("Frequency")
    ax[1+i].set_xlabel(labels[i])
    for interval in score_tbl_with_ukb[div[i]].unique():
        ax[1+i].axvline(interval.right, color='black', linestyle="--")

plt.setp(ax, box_aspect=1)
plt.tight_layout()
plt.suptitle("Histogram of variant stats, 76 variants with UKB LDL-C measurements")
fig.savefig("variant_stats_4bins.pdf", bbox_inches="tight")

In [None]:
fig, ax = plt.subplots(4, nbins, figsize=(12,12))
reverse_qbin=["nco_qbin", "cdg_qbin"]
div = ["nco_qbin", "ng_qbin", "cdg_qbin", ]
label = ["n_coo", "n_guides", "n_norm", ]
for i in range(4):
    qbins = score_tbl_with_ukb[div[i]].sort_values(ascending=not(div[i] in reverse_qbin))
    for j, qbin in enumerate(qbins.unique()):
        df = score_tbl_with_ukb.loc[score_tbl_with_ukb[div[i]] == qbin]
        sns.scatterplot(data=df, x="mean_LDL", y="mu_z_adj", ax=ax[i, j])
        r, pvalue = stats.spearmanr(df["mean_LDL"], df["mu_z_adj"])
        #ax[i, j].collections[0].label()
        ax[i, j].set_xlabel("UKB LDL-C (mg/dL)")
        ax[i, j].set_ylabel("BEAN z-score")
        ax[i, j].text(0.98, 0.98, f"r={r:.2f}", transform=ax[i, j].transAxes, horizontalalignment='right', verticalalignment='top')
        ax[i, j].set_title(f"{label[i]} in {qbin} ")
        ax[i, j].set_box_aspect(1)
plt.tight_layout()
plt.savefig("correlation_by_bins_v2.pdf", bbox_inches="tight")

In [None]:
fig, ax = plt.subplots(2, 4, figsize=(12, 6))

sns.scatterplot(score_tbl, y="effective_edit_rate", x="n_coo_div_nguides", alpha=0.1, ax=ax[0, 0], label = f"rho={stats.spearmanr(score_tbl.effective_edit_rate, score_tbl.n_coo_div_nguides)[0]:.5f}")
ax[0, 0].set_xlim((0, max(score_tbl.n_coo_div_nguides)*1.05))
ax[0, 0].legend()
ax[0, 0].set_xlabel("n_norm")
ax[0, 0].set_ylabel("Effective edit rate")

sns.scatterplot(score_tbl, y="mu_sd_adj", x="effective_edit_rate", alpha=0.1, ax=ax[0, 1], label = f"rho={stats.spearmanr(score_tbl.mu_sd_adj, score_tbl.effective_edit_rate)[0]:.5f}")
ax[0, 1].legend()
ax[0, 1].set_xlabel("Effective edit rate")
ax[0, 1].set_ylabel("BEAN mu_sd")

sns.scatterplot(score_tbl, y="mu_sd_adj", x="n_coo_div_nguides", alpha=0.1, ax=ax[0, 2], label = f"rho={stats.spearmanr(score_tbl.mu_sd_adj, score_tbl.n_coo_div_nguides)[0]:.5f}")
ax[0, 2].set_xlim((0, max(score_tbl.n_coo_div_nguides)*1.05))
ax[0, 2].legend()
ax[0, 2].set_xlabel("n_norm")
ax[0, 2].set_ylabel("BEAN mu_sd")

ax[0, 3].hist(score_tbl_with_ukb["effective_edit_rate"])
ax[0, 3].set_ylabel("Frequency")
ax[0, 3].set_xlabel(labels[i])
for interval in score_tbl_with_ukb["eer_qbin"].unique():
    ax[0, 3].axvline(interval.right, color='black', linestyle="--")
    

qbins = score_tbl_with_ukb["eer_qbin"].sort_values()
for j, qbin in enumerate(qbins.unique()):
    df = score_tbl_with_ukb.loc[score_tbl_with_ukb["eer_qbin"] == qbin]
    sns.scatterplot(data=df, x="mean_LDL", y="mu_z_adj", ax=ax[1, j])
    r, pvalue = stats.spearmanr(df["mean_LDL"], df["mu_z_adj"])
    #ax[1, j].collections[0].label()
    ax[1, j].set_xlabel("UKB LDL-C (mg/dL)")
    ax[1, j].set_ylabel("BEAN z-score")
    ax[1, j].text(0.98, 0.98, f"r={r:.2f}", transform=ax[1, j].transAxes, horizontalalignment='right', verticalalignment='top')
    ax[1, j].set_title(f"{label[i]} in {qbin} ")
    ax[1, j].set_box_aspect(1)

plt.setp(ax, box_aspect=1)
plt.suptitle(f"LDLR tiling library variant stats (n={len(score_tbl)})")
plt.tight_layout()
fig.savefig("FigR2.pdf", bbox_inches="tight")