In [1]:
from pathlib import Path
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(font_scale=1.5, palette='Set2')
sns.set_style('whitegrid')

%load_ext autoreload
%autoreload 2

sys.path.insert(0, '..')
import dsdna_mpra
from dsdna_mpra import config, plots

In [2]:
# Control categories found in the `project` column of the control table.
control_projects = [
    'ctrl_neg',
    'ctrl_ORF',
    'ctrl_shuffle',
    'ctrl_pos',
]

# Table of normalized counts and per-cell-line `*_log2p_ratio` columns for control tiles.
control_ratio_path = config.PROCESSED_DIR / "control_norm_counts_and_log_ratios.csv"
control_ratio_df = pd.read_csv(control_ratio_path)

# Show two reproducibly sampled rows as a quick look at the schema.
control_ratio_df.sample(n=2, random_state=0)


Unnamed: 0,ID,project,Plasmid_r1,Plasmid_r2,Plasmid_r3,Plasmid_r4,HEK293_r1,HEK293_r2,HEK293_r3,HEK293_r4,...,MRC5_r2,MRC5_r3,MRC5_r4,GM12878_log2p_ratio,Jurkat_log2p_ratio,MRC5_log2p_ratio,A549_log2p_ratio,HEK293_log2p_ratio,K562_log2p_ratio,ctrl_mean
1271,ORF53717:1659:900-1100,ctrl_ORF,301.735386,352.409449,345.793532,312.616184,543.585466,415.865431,509.992256,596.27174,...,402.683763,532.624446,311.951313,1.335527,1.109118,1.186153,1.278299,1.364442,1.075001,328.138638
880,ORF52773:2787:1000-1200,ctrl_ORF,501.298988,502.085728,528.194564,519.61162,674.162046,914.65641,717.856398,713.736154,...,685.419171,776.947587,794.834852,1.3594,1.277368,1.278297,1.564613,1.304599,2.18528,512.797725


In [3]:
# controls[cell][project] holds the finite log-ratio values of one control
# category in one cell line; 'ctrl_neg_(union)' pools the three negative sets.
controls = {}
for cell in config.CELL_LINES:
    ratio_column = f"{cell}_log2p_ratio"
    per_project = {}
    for control_project in control_projects:
        values = control_ratio_df.loc[control_ratio_df.project == control_project, ratio_column]
        # Keep only finite entries (drops NaN as well as +/-inf).
        per_project[control_project] = values[np.isfinite(values)]
    # Concatenation order: ORF, neg, shuffle.
    per_project['ctrl_neg_(union)'] = np.concatenate([
        per_project[name] for name in ('ctrl_ORF', 'ctrl_neg', 'ctrl_shuffle')
    ])
    controls[cell] = per_project


Control activities stratified by cell line

In [4]:
# 2 x 3 grid of panels, one per cell line, each showing one violin per control category.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(20, 14), layout="tight")
fig.suptitle('Distribution of control sequence activities')
project_labels = [proj.replace('_', ' ') for proj in control_projects]
for cell_index, cell in enumerate(config.CELL_LINES):
    grid_row, grid_col = divmod(cell_index, 3)
    ax = axes[grid_row, grid_col]
    for project_ind, project in enumerate(control_projects):
        # Violins are placed 6 x-units apart.
        plots.violin(ax, controls[cell][project], project_ind * 6, width_factor=1, box_width=.1)
    ax.set_title(cell)
    ax.set_ylabel(r'$\log_2 (\frac{RNA}{DNA} + 1)$')
    ax.set_ylim([0, 8])
    ax.set_xticklabels(project_labels, rotation=90)
fig.savefig(config.FIGURES_DIR / 'control_tiles_by_cell_line.pdf', format="pdf", bbox_inches="tight")
# The figure is only written to disk; close it so it is not rendered inline.
plt.close(fig)


Activity of viral genome tiles stratified by cell line

In [5]:
# Table of normalized counts and per-cell-line `*_log2p_ratio` columns for viral tiles.
virus_ratio_path = config.PROCESSED_DIR / "virus_norm_counts_and_log_ratios.csv"
virus_ratio_df = pd.read_csv(virus_ratio_path)

# Show two reproducibly sampled rows as a quick look at the schema.
virus_ratio_df.sample(n=2, random_state=0)


Unnamed: 0,family,strain,tile_number,strand,tile_id,Plasmid_r1,Plasmid_r2,Plasmid_r3,Plasmid_r4,HEK293_r1,...,MRC5_r2,MRC5_r3,MRC5_r4,GM12878_log2p_ratio,Jurkat_log2p_ratio,MRC5_log2p_ratio,A549_log2p_ratio,HEK293_log2p_ratio,K562_log2p_ratio,ctrl_mean
4637,Adenoviridae,"Type 37, Strain GW (76-19026)",627,+,Adenovirus:Type_37_Strain_GW_[76-19026]:627:+,350.730043,375.866179,368.846434,374.714815,273.727201,...,162.787053,273.641917,183.752143,0.857314,0.897313,0.733975,1.052587,0.755397,1.55703,367.539368
30744,Herpesviridae,"Herpes Simplex 2, Strain G",1286,+,Herpesvirus:Herpes_Simplex_2_Strain_G:1286:+,696.082624,645.618578,651.952249,644.339639,383.024634,...,753.961088,1221.615703,598.262792,0.828293,0.984337,1.377748,1.074075,0.635311,1.649574,659.498272


In [6]:
# One figure per cell line: a box per (family, strain) group, ordered by median activity.
for cell in config.CELL_LINES:
    ratio_column = f"{cell}_log2p_ratio"
    viruses = []
    activity = []
    for (family, strain), virus_df in virus_ratio_df.groupby(['family', 'strain']):
        values = virus_df[ratio_column].to_numpy()
        viruses.append(f'{family}, {strain}')
        # Keep only finite entries (drops NaN as well as +/-inf).
        activity.append(values[np.isfinite(values)])

    # Reorder labels and value arrays together by ascending median.
    medians = np.array([np.median(arr) for arr in activity])
    sorted_idx = medians.argsort()
    viruses = np.array(viruses)[sorted_idx]
    activity = [activity[idx] for idx in sorted_idx]

    fig, ax = plt.subplots(figsize=(15, 15), layout="tight")
    for index, values in enumerate(activity):
        # Boxes are placed 10 x-units apart.
        plots.box(ax, values, index * 10, width=6)
    ax.set_title(f'Distribution of viral tile activity, {cell}')
    ax.set_ylabel(r'$\log_2 (\frac{RNA}{DNA} + 1)$')
    ax.set_ylim([0, 5.5])
    ax.set_xticklabels(viruses, fontsize=10, rotation=90)
    fig.savefig(config.FIGURES_DIR / f'pan_virus_{cell}.pdf', format="pdf", bbox_inches="tight")
    # Close each figure after saving so figures do not accumulate across the loop.
    plt.close(fig)


### Threshold Assignment and FDR Calculation

Thresholds are determined based on the separation between the distributions of negative and positive controls. Since we have three types of negative controls (`ctrl_neg`, `ctrl_ORF`, and `ctrl_shuffle`), a single threshold per cell line is derived by comparing the union of all three negative control distributions against the positive control distribution.

> **Note:** Before calculating the false discovery rate (FDR), the top and bottom 2.5% of control values (outliers) will be removed.


In [7]:
NEGATIVE_CONTROL = 'ctrl_neg_(union)'


def _trim_tails(dist, lower_q=.025, upper_q=.975):
    """Return the entries of `dist` lying strictly between two of its quantiles.

    With the default quantiles the bottom and top 2.5% of values are discarded;
    values equal to either quantile are discarded as well.
    """
    low, high = np.quantile(dist, lower_q), np.quantile(dist, upper_q)
    return dist[(dist > low) & (dist < high)]


# Separation statistics between trimmed negative and positive controls, per cell line.
sep_frames = []
for cell in config.CELL_LINES:
    upper_dist = _trim_tails(controls[cell]['ctrl_pos'])
    lower_dist = _trim_tails(controls[cell][NEGATIVE_CONTROL])
    cell_sep = dsdna_mpra.thresholds.binary_separation_stats(lower_dist, upper_dist)
    cell_sep.insert(0, 'cell', cell)
    sep_frames.append(cell_sep)
# Rows are stacked as-is: each cell line keeps the index its own frame came with.
sep_stats = pd.concat(sep_frames)

# # sanity check: FDR ~ threshold is monotonically decreasing
# ex_stats = sep_stats[sep_stats.cell == 'K562']
# plt.scatter(ex_stats.threshold, ex_stats.FDR);

In [8]:
FDR_THRESHOLD = .01

# For every cell line, pick the candidate threshold whose FDR is closest to
# FDR_THRESHOLD, draw the FDR curve with the chosen threshold, and collect the
# result as one row of the thresholds table.
threshold_rows = []
fig, ax = plt.subplots(figsize=(10, 7), layout="tight")
fig.suptitle(f'{FDR_THRESHOLD:.1%} FDR')
for cell_index, cell in enumerate(config.CELL_LINES):
    sep_tb = sep_stats[sep_stats.cell == cell]
    # argmin yields a *position* within sep_tb, so the lookup below must be
    # positional (.iloc). A label lookup is only correct while each per-cell
    # frame happens to carry a default 0..n-1 index.
    thresh_idx = (sep_tb.FDR - FDR_THRESHOLD).abs().argmin()
    thresh = sep_tb.threshold.iloc[thresh_idx]
    threshold_rows.append([cell, f'fdr_{FDR_THRESHOLD}', NEGATIVE_CONTROL, thresh])
    color = config.CELL_LINE_COLORS[cell_index]
    # The last row of the table is left out of the plotted curve.
    ax.plot(sep_tb.threshold.iloc[:-1], sep_tb.FDR.iloc[:-1], color=color, label=cell)
    # Dashed vertical marker at the selected threshold.
    ax.plot([thresh, thresh], [0, .5], ls='--', color=color)
# Axis decoration does not depend on the cell line, so it is applied once.
ax.set_ylabel('False Discovery Rate', fontsize=15)
ax.set_xlabel(r'Threshold ($\log_2 (\frac{RNA}{DNA} + 1)$)', fontsize=15)
ax.legend()
fig.savefig(config.FIGURES_DIR / "control_tiles_fdr-threshold.pdf", format="pdf", bbox_inches="tight")
# The figure is only written to disk; close it so it is not rendered inline.
plt.close(fig)

thresholds_df = pd.DataFrame(threshold_rows, columns=['cell', 'type', 'lower_dist', 'threshold'])
thresholds_df.to_csv(config.RESULTS_DIR / 'thresholds_log2_1p.csv', index=False)
thresholds_df.sample(2, random_state=0)

Unnamed: 0,cell,type,lower_dist,threshold
5,K562,fdr_0.01,ctrl_neg_(union),2.580456
2,MRC5,fdr_0.01,ctrl_neg_(union),1.800548


Control activities stratified by cell line with threshold lines

In [9]:
# Same 2 x 3 grid of control violins as the earlier control figure, with the
# per-cell-line FDR threshold drawn as a dashed horizontal line.
fig, axes = plt.subplots(figsize=(20, 14), nrows=2, ncols=3, layout="tight")
fig.suptitle('Distribution of control sequence activities')
for cell_index, cell in enumerate(config.CELL_LINES):
    ax = axes[cell_index // 3, cell_index % 3]
    # Take the threshold as a scalar; a missing cell line now raises instead
    # of silently drawing no line.
    fdr_thresh = thresholds_df.loc[thresholds_df.cell == cell, 'threshold'].iloc[0]
    # Violins sit at x = 0, 6, 12, 18; the line extends 2 units beyond both ends.
    ax.plot([-2, 3 * 6 + 2], [fdr_thresh, fdr_thresh], ls='--', color='firebrick', label=f'{round(FDR_THRESHOLD * 100, 1)}% FDR')
    for project_ind, project in enumerate(control_projects):
        values = controls[cell][project]
        plots.violin(ax, values, project_ind * 6, width_factor=1, box_width=.1)
    if cell_index == 2:
        # One legend, on the third panel only, is enough for the whole grid.
        ax.legend(title='Thresholds', bbox_to_anchor=(1.1, 1.05))
    ax.set_ylim([0, 8])
    # Axis and tick labels match the other figures of the same quantity.
    ax.set_ylabel(r'$\log_2 (\frac{RNA}{DNA} + 1)$')
    ax.set_xticklabels([proj.replace('_', ' ') for proj in control_projects], rotation=90)
    ax.set_title(cell)
# round() rather than int(): int() truncates float error (0.29 * 100 -> 28).
fig.savefig(config.FIGURES_DIR / f"control_tiles_fdr-threshold_{round(FDR_THRESHOLD * 100)}.pdf", format="pdf", bbox_inches="tight")
plt.close(fig)

Results: number of active virus tiles per cell line

In [10]:
# Per-cell-line thresholds, re-read from the CSV written earlier in this notebook.
thresholds_path = config.RESULTS_DIR / 'thresholds_log2_1p.csv'
thresholds_df = pd.read_csv(thresholds_path).loc[:, ['cell', 'threshold']]

# The genome IDs are taken from the header row of the text file.
virus_genomes = pd.read_csv(config.RAW_DIR / 'virus_genbank_ids.txt').columns.values

paired_tiles = pd.read_csv(config.PROCESSED_DIR / 'virus_paired_tiles_log2p_ratios.csv')
# Restrict to the listed genomes and to the configured dsDNA families.
keep_rows = paired_tiles['genome'].isin(virus_genomes) & paired_tiles['family'].isin(config.DSDNA_FAMILIES)
paired_tiles = paired_tiles[keep_rows].reset_index(drop=True)
# Missing values become 0 before each row is given its cell line's threshold.
paired_tiles = paired_tiles.fillna(0).merge(thresholds_df, on='cell', how='left')
# A tile pair is active when the larger of fwd_lfc / rev_lfc reaches the threshold.
paired_tiles['is_active'] = paired_tiles[['fwd_lfc', 'rev_lfc']].max(axis=1) >= paired_tiles['threshold']

summary_activity = paired_tiles.value_counts(['cell', 'is_active']).to_frame().reset_index()
summary_activity.loc[summary_activity['is_active']]


Unnamed: 0,cell,is_active,count
6,Jurkat,True,6896
7,MRC5,True,6562
8,K562,True,6513
9,HEK293,True,6332
10,A549,True,5955
11,GM12878,True,5275


- Separately, activity thresholds were determined using predictions from the Malinois model (PMID: [39443793](https://pubmed.ncbi.nlm.nih.gov/39443793/)).

Predictions were generated using the script [predict_activity_control_tiles.py](../scripts/predict_activity_control_tiles.py).


In [11]:
control_df = pd.read_csv(config.PROCESSED_DIR / "control_tile_malinois_predictions.csv")

malinois_cell_lines = ['k562', 'hepg2', 'sknsh']
# Positive controls form the upper distribution; every other project the lower one.
is_positive = control_df.project == 'ctrl_pos'
sep_frames = []
for cell in malinois_cell_lines:
    predictions = control_df[f'malinois_{cell}_lfc']
    trimmed = []
    for group_mask in (~is_positive, is_positive):
        dist = predictions[group_mask].dropna().values
        # Keep values strictly between the 2.5% and 97.5% quantiles.
        low, high = np.quantile(dist, .025), np.quantile(dist, .975)
        trimmed.append(dist[(dist > low) & (dist < high)])
    lower_dist, upper_dist = trimmed
    cell_sep = dsdna_mpra.thresholds.binary_separation_stats(lower_dist, upper_dist)
    cell_sep.insert(0, 'cell', cell)
    sep_frames.append(cell_sep)
# Rows are stacked as-is: each cell line keeps the index its own frame came with.
sep_stats = pd.concat(sep_frames)

# # sanity check: FDR ~ threshold is monotonically decreasing
# ex_stats = sep_stats[sep_stats.cell == 'k562']
# plt.scatter(ex_stats.threshold, ex_stats.FDR);

# For every Malinois cell line, pick the candidate threshold whose FDR is
# closest to FDR_THRESHOLD and write the resulting table to disk.
threshold_rows = []
for cell in malinois_cell_lines:
    sep_tb = sep_stats[sep_stats.cell == cell]
    # argmin yields a *position* within sep_tb, so the lookup below must be
    # positional (.iloc). A label lookup is only correct while each per-cell
    # frame happens to carry a default 0..n-1 index.
    thresh_idx = (sep_tb.FDR - FDR_THRESHOLD).abs().argmin()
    thresh = sep_tb.threshold.iloc[thresh_idx]
    threshold_rows.append([cell, f'fdr_{FDR_THRESHOLD}', NEGATIVE_CONTROL, thresh])
thresholds_df = pd.DataFrame(threshold_rows, columns=['cell', 'type', 'lower_dist', 'threshold'])
thresholds_df.to_csv(config.RESULTS_DIR / 'thresholds_malinois_log2_1p.csv', index=False)
