In [1]:
from pathlib import Path
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import matplotlib.colors as mcolors
import seaborn as sns
# Global plot styling for every figure in this notebook: enlarged fonts,
# the 'Set2' palette and a white background with grid lines.
sns.set_theme(font_scale=1.5, palette='Set2')
sns.set_style('whitegrid')

%load_ext autoreload
%autoreload 2

# Put the parent directory first on the import path so that the local
# `dsdna_mpra` package is importable from this notebook's location.
sys.path.insert(0, '..')
import dsdna_mpra
from dsdna_mpra import config, plots

CRE annotation is performed by the script [`cre_annotation.py`](../scripts/cre_annotation.py).


Total number of CREs identified across all viruses

In [2]:
# Strand- and cell-merged CRE table; every row is counted as one CRE.
cell_merged_df = pd.read_csv(config.RESULTS_DIR / "cre_positions_strands_and_cell_merged.csv")

# Count rows per virus (a virus is a (family, strain) pair) and add them up.
cres_per_virus = cell_merged_df.groupby(['family', 'strain']).size()
total_n_cres = int(cres_per_virus.sum())
print(f"Total number of CREs: {total_n_cres}")


Total number of CREs: 2069


Number of CREs per cell line per virus

In [3]:
# Attach each genome's size to its CREs so that the covered fraction can be computed.
genomes_summary = pd.read_csv(config.PROCESSED_DIR / 'summary_virus_genome_records.csv')
genome_sizes = genomes_summary[['accession_id', 'genome_size']].rename(columns={'accession_id': 'genome'})
cres_df = pd.read_csv(config.RESULTS_DIR / "cre_positions_strands_merged.csv").merge(
    genome_sizes, on='genome', how='left'
)

# One summary record per (family, strain, cell, genome) combination:
# number of CREs, total CRE length, and that length relative to the genome size.
group_keys = ['family', 'strain', 'cell', 'genome']
summary_columns = group_keys + ['n_cres', 'cres_size', 'cres_covered_fraction']
records = []
for key_values, group_df in cres_df.groupby(group_keys):
    covered_bp = (group_df['end'] - group_df['begin']).sum()
    record = dict(zip(group_keys, key_values))
    record['n_cres'] = group_df.shape[0]
    record['cres_size'] = covered_bp
    # genome_size comes from the left merge; the first row of the group is used.
    record['cres_covered_fraction'] = covered_bp / group_df['genome_size'].iloc[0]
    records.append(record)

stats = pd.DataFrame(records, columns=summary_columns)
stats = stats.sort_values(['family', 'strain', 'cell', 'n_cres', 'cres_size'])
stats.to_csv(config.RESULTS_DIR / 'summary_cre_strands_merged.csv', index=False)

# Ordered categoricals (added after the CSV export) drive the row/panel order of the figures below.
stats['virus'] = (stats['family'] + ', ' + stats['strain']).astype('category').cat.set_categories(config.VIRUSES)
stats['family'] = stats['family'].astype('category').cat.set_categories(config.DSDNA_FAMILIES[::-1])


**Heatmap:** Number of CREs and fraction of the genome covered by CREs for each virus–cell line pair.


In [4]:
import warnings


# Each entry: (column of `stats`, dtype of the pivoted values, colour normalisation,
#              figure title, file-name stem, colormap).
heatmap_specs = [
    ['n_cres', int, 'symlog', 'Number of CREs', 'number_cres', 'magma'],
    ['cres_covered_fraction', float, None, 'Fraction of genome covered by CREs', 'genome_fraction_cres', 'viridis']
]

for col, valtype, colorscale, figtitle, figname, cmap in heatmap_specs:
    # Rows: one per virus; columns: one per cell line. Missing virus/cell pairs are filled with 0.
    summary_df = (
        stats.pivot_table(values=col, index=['family', 'strain', 'virus'], columns='cell', observed=True)
        .fillna(0)
        .astype(valtype)
        .reset_index()
        .sort_values(['virus'], kind='stable')
    )
    max_val = summary_df[config.CELL_LINES].max().max()
    # summary_df.to_excel(config.RESULTS_DIR / f"summary_strands_merged_{figname}.xlsx", index=False)

    # Panel layout is derived from the rows that are actually plotted: one panel per family
    # present in `summary_df`, with a height proportional to its number of heatmap rows.
    # (Counting rows of `stats` instead would count strain x cell x genome combinations and
    # distort the heights whenever a strain has no CREs in some cell line.)
    n_strains = summary_df.groupby('family', observed=True).size().values
    n_families = n_strains.size

    fig, axes = plt.subplots(figsize=(8, 15), nrows=n_families, height_ratios=n_strains)
    axes = np.atleast_1d(axes)  # plt.subplots returns a bare Axes when there is a single family
    fig.suptitle(f"{figtitle}", fontsize=25)
    for fam_ind, (family, fam_n_cres) in enumerate(summary_df.groupby('family', observed=True)):
        img = plots.heatmap_with_stats(
            axes[fam_ind],
            fam_n_cres.drop(['family', 'virus'], axis=1).set_index('strain'),
            imshow_args={'cmap': cmap, 'vmin': 0, 'vmax': max_val, 'norm': colorscale},
            title_args={'label': rf"${family}$", 'fontsize': 20}
        )
        # Only the top panel keeps the cell-line labels on its x axis.
        if fam_ind != 0:
            axes[fam_ind].tick_params(axis='x', which='both', top=False, labeltop=False)

    # All panels share vmin/vmax/norm, so the image of the last panel can feed the common colorbar.
    fig.subplots_adjust(right=1.2)
    cbar_ax = fig.add_axes([1.01, 0.15, 0.02, 0.7])
    fig.colorbar(img, cax=cbar_ax)
    # The manually placed colorbar axes makes tight_layout warn; the warning is silenced on purpose.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        fig.tight_layout()
    fig.savefig(config.FIGURES_DIR / f"summary_strands_merged_{figname}.pdf", bbox_inches='tight', format='pdf')
    plt.close(fig)


### CRE cell-type specificity

Correlation of CRE tile activity across cell lines is computed in the script [`cre_tile_clustering.py`](../scripts/cre_tile_clustering.py).

In [5]:
# Cell-line-by-cell-line correlation matrix of CRE tile activity (precomputed, loaded from disk).
pcc_matrix = np.load(config.RESULTS_DIR / 'cre_tiles_cell_lines_correlations.npy')

ordered_by_cluster = [5, 4, 1, 3, 2, 0]  # cells ordered manually
# Apply the same permutation to rows and columns, and to the tick labels.
reordered_pcc = pcc_matrix[np.ix_(ordered_by_cluster, ordered_by_cluster)]
cell_labels = np.array(config.CELL_LINES)[ordered_by_cluster]
tick_positions = np.arange(6) + .5  # ticks at the centre of each heatmap cell

fig, ax = plt.subplots(figsize=(8, 7))
heatmap = sns.heatmap(reordered_pcc, vmin=0, center=0, vmax=1, cmap='RdBu_r', ax=ax, cbar=True)

colorbar = heatmap.collections[0].colorbar
colorbar.ax.set_title('PCC', fontsize=14, pad=10)

ax.set_xticks(tick_positions, cell_labels, fontsize=10)
ax.set_yticks(tick_positions, cell_labels, fontsize=10)
ax.set_title('Correlation of CRE tile activity across cell lines')

fig.tight_layout()
fig.savefig(config.FIGURES_DIR / 'cre_tiles_cell_lines_correlations.pdf', bbox_inches='tight', format='pdf')
plt.close(fig)


**Supervised clustering:** Tiles are grouped into 64 classes ($2^6$) based on their binary activity patterns across the 6 cell lines. The 20 most frequent classes are plotted.


In [6]:
coverage_threshold = .8  # only used to word the figure title
# np.load returns a lazily-read NpzFile for .npz archives; read every array inside the
# context manager so the underlying file handle is closed afterwards.
with np.load(config.RESULTS_DIR / 'cre_tiles_supervised_clustering.npz') as clustering_results:
    matrix = clustering_results['matrix']
    borders = clustering_results['final_borders']
    selected_class_ids = clustering_results['selected_class_ids']

fig, ax = plt.subplots(figsize=(7, 15))

# Heatmap of the tile matrix (rows: CRE tiles, columns: cell lines) on a diverging scale centred at 0.5.
vcenter = 0.5
vmin, vmax = 0, 1
colorbar_title = 'activity rank'
norm = mcolors.TwoSlopeNorm(vcenter=vcenter, vmin=vmin, vmax=vmax)
img = ax.imshow(matrix, norm=norm, cmap='coolwarm', aspect='auto', interpolation='nearest')

# Colorbar in its own axes to the right of the heatmap.
cbar_ax = fig.add_axes([1.05, 0.15, 0.02, 0.7])
cbar = fig.colorbar(img, cax=cbar_ax)
cbar.ax.set_title(colorbar_title, x=1.05, y=1.05, fontsize=15)

# x axis: cell-line names on top, without tick marks.
ax.set_xticks(np.arange(matrix.shape[1]))
ax.set_xticklabels(config.CELL_LINES, fontsize=12)
ax.xaxis.tick_top()
ax.tick_params(axis='x', which='both', top=False, labeltop=True)
ax.yaxis.tick_left()
ax.grid(False)

# Horizontal separators at the loaded border positions.
for y in borders:
    ax.axhline(y=y, linestyle='-', color='black', linewidth=2)
ax.set_title(f'{selected_class_ids.size} clusters encompassing {coverage_threshold:.0%} of CRE tiles')
ax.set_ylabel('CRE tiles')

# y axis: one label per class, placed midway between consecutive borders (row 0 is the first edge).
# Assumes there is one border per selected class — TODO confirm against cre_tile_clustering.py.
borders = np.hstack([[0], borders])
ax.set_yticks((borders[1:] + borders[:-1]) / 2)
ax.set_yticklabels([f'class {i}' for i in range(selected_class_ids.size)], fontsize=8)

fig.savefig(config.FIGURES_DIR / 'cre_tiles_supervised_clustering.pdf', bbox_inches='tight', format='pdf')
plt.close(fig)

The intersection of cell-specific CRE positions (‘partitioned CREs’) is computed in the script [`cre_cell_specificity.py`](../scripts/cre_cell_specificity.py).

In [7]:
# Partitioned CRE table; the sampled rows below show interval coordinates (begin/end),
# one column per cell line, and `n_cells`.
part_cres_path = config.RESULTS_DIR / "cre_positions_partitioned_cres_strands_merged.csv"
part_cres_df = pd.read_csv(part_cres_path)
part_cres_df.sample(n=2, random_state=0)

Unnamed: 0,family,strain,genome,begin,end,GM12878,Jurkat,MRC5,A549,HEK293,K562,n_cells
4285,Herpesviridae,Human Herpes 7,AF037218.1,25050,25150,1,0,0,0,0,0,1
3964,Herpesviridae,"Herpes Simplex 2, Strain G",OM370995.1,102300,102400,1,0,0,0,0,0,1


- Size distribution of partitioned CREs  


In [8]:
max_n_cells = part_cres_df.n_cells.max()
n_cells_grid = np.arange(1, max_n_cells + 1)
# Horizontal position of each violin. The same array is reused for the x ticks so that every
# label sits directly under its violin (previously ticks were spaced by 1 and violins by 1.1).
violin_positions = (n_cells_grid - 1) * 1.1

fig, ax = plt.subplots(figsize=(14, 7), layout="tight")
for n_cells, x_pos in zip(n_cells_grid, violin_positions):
    cre_tb = part_cres_df[part_cres_df.n_cells == n_cells]
    values = cre_tb['end'] - cre_tb['begin']  # CRE size in bp
    if values.empty:
        # No CRE is shared by exactly this many cell lines: nothing to draw at this position.
        continue
    # Third positional argument is presumably the x position of the violin — see plots.violin.
    plots.violin(ax, values, x_pos, width_factor=15, box_width=1.9e-1)

ax.set_ylabel('CRE size, bp')
ax.set_xticks(violin_positions)
ax.set_xticklabels(n_cells_grid)
ax.set_xlabel('# of cells sharing the CRE')
ax.set_title('Size Distribution of Partitioned CREs')
ax.set_ylim([0, 500])
ax.grid(False)
fig.savefig(config.FIGURES_DIR / 'partitioned_cre_size_distribution.pdf', bbox_inches="tight", format='pdf')
plt.close(fig)


- Cell specificity of partitioned CREs

In [9]:
# Keep only partitioned CREs whose length (end - begin, in bp) reaches the threshold.
# NOTE: this overwrites `part_cres_df`; reload it before re-running cells that need the
# unfiltered table (e.g. the size-distribution plot).
PART_CRES_SIZE_THRESHOLD = 200
size_filter = (part_cres_df['end'] - part_cres_df['begin']) >= PART_CRES_SIZE_THRESHOLD
part_cres_df = part_cres_df[size_filter]

# Bar colour for each possible number of cell lines sharing a CRE (light = few, dark = many).
colors = {1: 'bisque', 2: 'lightsalmon', 3: 'tomato', 4: 'red', 5: 'firebrick', 6: 'darkred'}

# Number of partitioned CREs per (family, strain, n_cells).
# `reset_index(name=...)` names the count column explicitly, independent of the pandas
# version's default name for value_counts results.
spec_tb = (
    part_cres_df
    .value_counts(['family', 'strain', 'n_cells'])
    .reset_index(name='n_cres')
)
# Series.map also works on an empty table and leaves unknown n_cells values as NaN.
spec_tb['color'] = spec_tb['n_cells'].map(colors)
spec_tb['virus'] = (spec_tb['family'] + ', ' + spec_tb['strain']).astype('category').cat.set_categories(config.VIRUSES)
spec_tb['family'] = spec_tb['family'].astype('category').cat.set_categories(config.DSDNA_FAMILIES[::-1])
# Within each virus, rows go from most to fewest sharing cell lines.
spec_tb = spec_tb.sort_values(['family', 'virus', 'n_cells'], ascending=[True, True, False])


In [10]:
# One panel per family that actually has rows in `spec_tb`. `observed=True` keeps the panel
# count, the width ratios and the plotting loop in agreement even if a family declared in the
# categorical has no partitioned CREs left after the size filter.
family_groups = spec_tb.groupby('family', sort=False, observed=True)
n_families = spec_tb['family'].nunique()
n_strains = family_groups['strain'].nunique().values  # panel width ~ number of strains (bars)
yticks_step = {'Herpesviridae': 50, 'Adenoviridae': 10, 'Papillomaviridae': 2, 'Polyomaviridae': 1}

fig, axes = plt.subplots(
    nrows=1, ncols=n_families, figsize=(4 * n_families + 5, 10),
    gridspec_kw={'width_ratios': n_strains}
)
axes = np.atleast_1d(axes)  # plt.subplots returns a bare Axes when there is a single family
fig.suptitle("Number of Partitioned CREs Shared Across X Cell Lines", fontsize=16)

for fam_idx, (family, group_df) in enumerate(family_groups):
    ax = axes[fam_idx]
    legend_title = '# cells' if fam_idx == 0 else None

    plots.stacked_bar_plot(
        ax=ax,
        dataframe=group_df,
        x_value='strain',
        hue='n_cells',
        weight='n_cres',
        color='color',
        legend_title=legend_title,
        normalize_weights=False
    )
    ax.set_title(family, fontsize=15)

    # Integer y ticks with a family-specific step, extending 20% above the tallest stacked bar.
    max_height = group_df.groupby('strain')['n_cres'].sum().max()
    yticks = np.arange(0, max_height * 1.2, yticks_step.get(family, 1))
    ax.set_yticks(yticks)
    ax.set_yticklabels(yticks.astype(int))

    if fam_idx == 0:
        # Only the first panel carries the y label and the shared colour legend.
        ax.set_ylabel('# of shared CREs', fontsize=15)
        legend_handles = [
            Patch(facecolor=color, edgecolor=color, label=label) for label, color in colors.items()
        ]
        ax.legend(handles=legend_handles, title='# cells', bbox_to_anchor=(-0.42, 1), loc='upper left')

fig.tight_layout()
fig.savefig(config.FIGURES_DIR / 'partitioned_cres_shared_by_cell_lines.pdf', format='pdf', bbox_inches='tight')
plt.close(fig)
