In [1]:
from pathlib import Path
import sys
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(font_scale=1.5, palette='Set2')
sns.set_style('whitegrid')

import logomaker
from IPython.display import HTML
from bs4 import BeautifulSoup

%load_ext autoreload
%autoreload 2

sys.path.insert(0, '..')
import dsdna_mpra
from dsdna_mpra import config, motifs, plots

**Preprocessing**

1. **Compute Contribution Scores**  
   Contribution scores &mdash; quantifying the influence of input features on Malinois model predictions &mdash; are computed for viral tiles and human K562 DNase I hypersensitive sites (DHSs) using the script [`compute_contribution_scores.py`](../scripts/compute_contribution_scores.py).

2. **Motif Discovery and Annotation**  
   [TF-MoDISco-lite](https://github.com/jmschrei/tfmodisco-lite) is applied to the concatenated hypothetical contribution scores of viral tiles and K562 DHSs to identify motif matrices. Motifs are then matched against the SCENIC human transcription factor (TF) motif collection using TOMTOM.

   ```bash
   modisco motifs \
     -s "${PROCESSED_DIR}/malinois_K562_onehot_sequences.npz" \
     -a "${PROCESSED_DIR}/malinois_K562_contribution_scores.npz" \
     --window 200 \
     -n 1000000 \
     -o "${PROCESSED_DIR}/malinois_K562_modisco_results.h5" \
     -v

   modisco report \
     -i "${PROCESSED_DIR}/malinois_K562_modisco_results.h5" \
     -m "${RAW_DIR}/scenic_human_motif_collection.meme" \
     -o "${PROCESSED_DIR}/malinois_K562_modisco_report/" \
     -s "${PROCESSED_DIR}/malinois_K562_modisco_report/"
   ```

3. **TF Family Assignment**

   Motif matrices output by TF-MoDISco-lite are postprocessed using the script [`tfmodisco_postprocessing.py`](../scripts/tfmodisco_postprocessing.py). Transcription factor (TF) gene families are manually assigned to each motif based on TOMTOM alignment results. The final curated motif collection includes:
    - TF-MoDISco-lite motifs  
    - E2F transcription factor motif matrix  
    - 3′ splice site position weight matrix (PWM)


- Motifs Identified by TF-MoDISco-lite and Matched via TOMTOM


In [2]:
# Show only the first three rows of the TF-MoDISco report table as a preview,
# instead of embedding the full HTML report in the notebook output.
report_path = config.PROCESSED_DIR / "malinois_K562_modisco_report/motifs_with_logos.html"

soup = BeautifulSoup(report_path.read_text(encoding='utf-8'), "html.parser")
preview_rows = soup.find("table").find_all("tr")[:3]
HTML("<table>\n" + "\n".join(map(str, preview_rows)) + "\n</table>")


Unnamed: 0,pattern,modisco_cwm_fwd,modisco_cwm_rev,match0_logo,match1_logo,match2_logo,gene0,gene1,gene2,num_seqlets,match0,match1,match2,qval0,qval1,qval2
0,pos_patterns.pattern_0,,,,,,SP5,SP4,SP4,14061,dbtfbs__SP5_HepG2_ENCSR019NPF_merged_N1,hocomoco__SP4_HUMAN.H11MO.1.A,jaspar__MA0685.2,2.61676e-06,1.65245e-05,2.11298e-05
1,pos_patterns.pattern_1,,,,,,JUN,BATF,JUNB,13214,dbtfbs__JUN_representative_N1,swissregulon__hs__BATF,hocomoco__JUNB_HUMAN.H11MO.0.A,0.000162922,0.000162922,0.000162922


- K562 TF Motif Logos

In [3]:
# Render one logo per curated K562 motif (forward orientation only) and save
# each logo as its own PDF under FIGURES_DIR / 'malinois_K562_tf_motifs'.
motif_df = pd.read_csv(config.RESULTS_DIR / 'malinois_K562_tf_motif_families.csv')
tf_motifs = motifs.parse_pwm_file(config.RESULTS_DIR / 'malinois_K562_tf_motifs.cb')

# savefig does not create missing parent directories, so make sure the output
# folder exists before the first figure is written (no-op if already present).
motif_logo_dir = config.FIGURES_DIR / 'malinois_K562_tf_motifs'
motif_logo_dir.mkdir(parents=True, exist_ok=True)

# NOTE: the tuple unpacking below requires the CSV to have exactly two columns
# (used here as motif id and assigned TF).
for motif_id, assigned_tf in motif_df.values:
    fwd_motif = tf_motifs[f'{motif_id}_fwd']
    fig, ax = plt.subplots(figsize=(6, 3), layout="tight")
    # Transposed so that the four columns are the nucleotides A, C, G, T
    # (presumably the parsed matrix is stored as 4 x length -- TODO confirm).
    cs_matrix = pd.DataFrame(fwd_motif.T, columns=['A', 'C', 'G', 'T'])
    logomaker.Logo(cs_matrix, ax=ax, center_values=False)
    ax.set_ylim([-1, 1])
    ax.set_xlabel('Position')
    ax.set_ylabel('IC')
    ax.set_title(f'{motif_id} ({assigned_tf})', fontsize=12)
    ax.grid(False)
    fig.savefig(motif_logo_dir / f'{motif_id}.pdf', format='pdf', bbox_inches='tight')
    # Close this specific figure so figures do not accumulate across iterations.
    plt.close(fig)

### TF–Motif Scan

Contribution score arrays are scanned using the K562 motif collection with the script [`tf_motif_annotation.py`](../scripts/tf_motif_annotation.py) to identify motif occurrences.

**Example TF–motif map:**


In [4]:
# Load everything needed to draw a TF-motif map: the motif matrices, the
# motif -> TF family table, the contribution scores and the precomputed map.
tf_motifs = motifs.parse_pwm_file(config.RESULTS_DIR / 'malinois_K562_tf_motifs.cb')
tf_motif_genes = pd.read_csv(config.RESULTS_DIR / 'malinois_K562_tf_motif_families.csv')

# np.load on an .npz returns a lazily-read archive that holds an open file
# handle; the context manager closes it once the arrays have been read.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code -- only use it on archives produced by this project's
# own pipeline.
with np.load(
    config.PROCESSED_DIR / "malinois_K562_contribution_scores.npz", allow_pickle=True
) as contrib_scores_data:
    contrib_scores = contrib_scores_data['arr_0']
    # Virus tile ids first, then DHS tile ids.
    tile_ids = np.concatenate([contrib_scores_data['virus_tile_ids'], contrib_scores_data['dhs_tile_ids']])

with open(config.RESULTS_DIR / "malinois_K562_tf_motif_map.json", 'r', encoding='utf-8') as f:
    tf_motif_map = json.load(f)


In [5]:
# Draw contribution scores together with the annotated motif map for one
# example tile, selected by its position in the motif map / score array.
tile_index = 122
tile_info = tf_motif_map[tile_index]
tile_contribution_scores = contrib_scores[tile_index]

# Each per-tile list is wrapped in an outer list and the first two axes are
# swapped, which inserts a length-1 axis at position 1.
motif_positions = np.array([tile_info['motif_positions']]).swapaxes(0, 1)
motif_names = np.array([tile_info['motifs']]).swapaxes(0, 1)
# Rows of the motif-family table turned into a lookup (first column -> second column).
motif_to_gene = dict(tf_motif_genes.values)

fig = plots.motif_annotation_plot(
    tf_motifs,
    tile_info['tile_id'],
    tile_contribution_scores,
    tile_info['peak_positions'],
    motif_positions,
    motif_names,
    motif_to_gene,
)
fig.savefig(
    config.FIGURES_DIR / f"contribution_scores_and_motif_map_tile_{tile_index}.pdf",
    format="pdf",
    bbox_inches="tight",
)
plt.close()

### TF Binding Site Occurrence Statistics

To ensure a consistent procedure across both viral tiles and K562 DHSs, tile MPRA activity is inferred from *Malinois* model predictions.

Statistics are precomputed using the script [`tf_motif_statistics.py`](../scripts/tf_motif_statistics.py).


In [6]:
# Activity threshold for K562: first 'threshold' value among rows whose cell is 'k562'.
thresholds_df = pd.read_csv(config.RESULTS_DIR / 'thresholds_malinois_log2_1p.csv')[['cell', 'threshold']]
is_k562 = thresholds_df['cell'] == 'k562'
K562_THRESHOLD = thresholds_df.loc[is_k562, 'threshold'].iloc[0]

# Table of predicted and observed tile activities.
paired_tiles = pd.read_csv(config.RESULTS_DIR / "malinois_predicted_and_observed_activities.csv")

# Motif map re-keyed by tile id for direct lookup.
with open(config.RESULTS_DIR / "malinois_K562_tf_motif_map.json", 'r', encoding='utf-8') as f:
    tile_records = json.load(f)
tile_motif_map = {record['tile_id']: record for record in tile_records}


Sanity Check: *Malinois* K562 MPRA activity predictions match experimental measurements


In [7]:
# Scatter of measured vs. predicted K562 activity with an identity line.
# The measured value is the mean of the forward and reverse measurements.
observed_lfc = paired_tiles[['fwd_lfc', 'rev_lfc']].mean(axis=1)
predicted_lfc = paired_tiles['malinois_k562_lfc']
identity_line = [.5, 8.5]

fig, ax = plt.subplots(figsize=(7, 7), layout='tight')
ax.scatter(observed_lfc, predicted_lfc, color='cornflowerblue', s=50, alpha=.1)
ax.plot(identity_line, identity_line, color='black')
ax.set(
    xlim=[0, 9],
    ylim=[0, 9],
    xlabel=r'Experimental K562 $\log_2 (FC + 1)$',
    ylabel='Malinois model prediction for K562',
)
ax.grid(False)
fig.savefig(config.FIGURES_DIR / 'malinois_k562_predictions.pdf', format='pdf', bbox_inches='tight')
plt.close(fig)

- **Number of Motif Instances per K562-active viral tile or human DHS**

  *Note:* DHSs are centered at the position of maximum information content, whereas viral tile positions are arbitrary with respect to cis-regulatory element (CRE) centers. This discrepancy may lead to fewer detected motifs per viral CRE tile compared to human DHSs.


In [8]:
# Per-row counts are converted to fractions by dividing each row by its own
# total; the heatmap is coloured by fraction and annotated with the raw counts.
counts = pd.read_csv(config.RESULTS_DIR / "malinois_K562_number_tfbs_per_active_tile.csv").set_index('virus')
row_totals = counts.sum(axis=1)
fractions = counts.div(row_totals, axis=0)
fractions.to_csv(config.RESULTS_DIR / "malinois_k562_number_motifs_per_cre_tile.csv")

heatmap_style = {'cmap': 'Reds', 'vmin': 0, 'vmax': .4, 'norm': None}
heatmap_title = {'label': 'Number of motifs per CRE tile', 'fontsize': 20}

fig, ax = plt.subplots(figsize=(10, 20))
img = plots.heatmap_with_stats(
    ax,
    fractions,
    imshow_args=heatmap_style,
    title_args=heatmap_title,
    text_values=counts,
)
# Dedicated axes to the right of the heatmap for the colorbar.
cbar_ax = fig.add_axes([.92, 0.15, 0.02, 0.7])
cbar = fig.colorbar(img, cax=cbar_ax)
cbar.set_label("Fraction of active tiles")
plt.savefig(config.FIGURES_DIR / 'malinois_k562_number_motifs_per_cre_tile.pdf', format='pdf', bbox_inches='tight')
plt.close()


- Number of motif instances per tile, per TF

In [9]:
# Keep only DHSs / viral tiles whose predicted K562 activity reaches the threshold.
tfbs_counts_dhs = pd.read_csv(config.RESULTS_DIR / "malinois_K562_tfbs_counts_dhs.csv")
tfbs_counts_dhs = tfbs_counts_dhs.loc[tfbs_counts_dhs['malinois_k562_lfc'] >= K562_THRESHOLD]
tfbs_counts_tiles = pd.read_csv(config.RESULTS_DIR / "malinois_K562_tfbs_counts_virus_tiles.csv")
tfbs_counts_tiles = tfbs_counts_tiles.loc[tfbs_counts_tiles['malinois_k562_lfc'] >= K562_THRESHOLD]

# Mean count per TF for every virus (rows ordered as config.VIRUSES),
# followed by a single "DHS" row holding the mean over the retained DHSs.
per_virus = tfbs_counts_tiles.groupby("virus")[config.TF_GENES_K562]
grouped_means = per_virus.mean().reindex(config.VIRUSES)
dhs_mean = tfbs_counts_dhs[config.TF_GENES_K562].mean().to_frame(name="DHS").T
tfbs_counts_df = pd.concat([grouped_means, dhs_mean])
tfbs_counts_df.to_csv(config.RESULTS_DIR / "malinois_k562_number_tfbs_per_cre_tile.csv")

heatmap_style = {'cmap': 'Purples', 'vmin': 0, 'vmax': 1.5, 'norm': None}
heatmap_title = {'label': 'Number of TFBS per CRE tile', 'fontsize': 20}

fig, ax = plt.subplots(figsize=(20, 20))
img = plots.heatmap_with_stats(
    ax,
    tfbs_counts_df,
    imshow_args=heatmap_style,
    title_args=heatmap_title,
)
# Dedicated axes to the right of the heatmap for the colorbar.
cbar_ax = fig.add_axes([.92, 0.15, 0.02, 0.7])
cbar = fig.colorbar(img, cax=cbar_ax)
cbar.set_label("Number TFBS per tile")
plt.savefig(config.FIGURES_DIR / 'malinois_k562_number_tfbs_per_cre_tile.pdf', format='pdf', bbox_inches='tight')
plt.close()

- TFBS frequency relative to K562 DHSs

In [10]:
# log2 ratio of per-virus mean TFBS counts to the DHS mean. The same
# pseudocount is added to numerator and denominator before dividing.
pseudocount = 1e-2
virus_rates = grouped_means + pseudocount
dhs_rates = (dhs_mean + pseudocount).values  # single row, broadcast across all virus rows
tfbs_relative_df = np.log2(virus_rates.divide(dhs_rates, axis=1))
tfbs_relative_df.to_csv(config.RESULTS_DIR / "malinois_k562_number_tfbs_log2_dhs_ratio.csv")

# Symmetric colour limits around 0 for the diverging colormap.
heatmap_style = {'cmap': 'RdBu_r', 'vmin': -3.5, 'vmax': 3.5, 'norm': None}
heatmap_title = {'label': r'Number of TFBS per CRE tile, $\log_2$ (virus / DHS)', 'fontsize': 20}

fig, ax = plt.subplots(figsize=(20, 20))
img = plots.heatmap_with_stats(
    ax,
    tfbs_relative_df,
    imshow_args=heatmap_style,
    title_args=heatmap_title,
)
cbar_ax = fig.add_axes([.92, 0.15, 0.02, 0.7])
cbar = fig.colorbar(img, cax=cbar_ax)
cbar.set_label(r"$\log_2$ (virus / DHS)")
plt.savefig(config.FIGURES_DIR / 'malinois_k562_number_tfbs_log2_dhs_ratio.pdf', format='pdf', bbox_inches='tight')
plt.close()

- Motif frequency across Herpesvirus Kinetic Groups

In [11]:
# Mean TFBS count per TF within each kinetic group. Each kinetic-group column
# of the table is used directly as a row mask.
herpes_kinetics_df = pd.read_csv(config.RESULTS_DIR / 'herpesvirus_tss_kinetics_tfbs_counts.csv')

per_group_means = [
    herpes_kinetics_df[herpes_kinetics_df[kinetic_group]][config.TF_GENES_K562].mean(0).tolist()
    for kinetic_group in config.GENE_KINETIC_GROUPS
]
# Transpose so that rows are TFs and columns are kinetic groups.
avg_tfbs_per_group = pd.DataFrame(np.transpose(per_group_means), columns=config.GENE_KINETIC_GROUPS)
avg_tfbs_per_group.insert(0, 'tf_gene', config.TF_GENES_K562)
avg_tfbs_per_group.to_csv(config.RESULTS_DIR / 'fig_7D_herpes_tss_kinetics_tfbs_counts.csv', index=False)

heatmap_style = {'cmap': 'Reds', 'vmin': 0, 'vmax': 2, 'norm': None}

fig, ax = plt.subplots(figsize=(10, 25))
img = plots.heatmap_with_stats(
    ax,
    avg_tfbs_per_group.set_index('tf_gene'),
    imshow_args=heatmap_style,
)
cbar_ax = fig.add_axes([.92, 0.15, 0.02, 0.7])
cbar = fig.colorbar(img, cax=cbar_ax)
cbar.set_label("Average number of motifs per promoter")
plt.savefig(config.FIGURES_DIR / 'fig_7D_herpes_tss_kinetics_tfbs_counts.pdf', format='pdf', bbox_inches='tight')
plt.close()


The same kinetic-group averages, computed separately for each herpesvirus (HHV-1, HHV-5, HHV-8):

In [12]:
# Short virus name (used in the output file name) -> value of the 'strain'
# column that selects this virus's rows.
strain_by_name = {
    'HHV-1': 'Herpes Simplex 1, KOS',
    'HHV-5': 'Human cytomegalovirus',
    'HHV-8': 'Kaposi Sarcoma (HHV-8)',
}

for name, strain in strain_by_name.items():
    strain_df = herpes_kinetics_df[herpes_kinetics_df.strain == strain]
    # One list of per-TF means per kinetic group, restricted to this strain.
    group_averages = [
        strain_df[strain_df[kinetic_group]][config.TF_GENES_K562].mean(0).tolist()
        for kinetic_group in config.GENE_KINETIC_GROUPS
    ]
    # Transpose so that rows are TFs and columns are kinetic groups.
    strain_table = pd.DataFrame(np.transpose(group_averages), columns=config.GENE_KINETIC_GROUPS)
    strain_table.insert(0, 'tf_gene', config.TF_GENES_K562)
    strain_table.to_csv(config.RESULTS_DIR / f'fig_S7A_{name}_herpes_tss_kinetics_tfbs_counts.csv', index=False)


- Motif frequency in Coding Regions

In [13]:
# TFBS density (per kbp) outside vs. inside CDS, one point per row of the table,
# with an identity line and labels on the five highest non-CDS densities.
tfbs_cds_counts = pd.read_csv(config.RESULTS_DIR / 'malinois_K562_number_tfbs_per_cds_kbp.csv')
top_tfs = tfbs_cds_counts.nlargest(5, 'not_cds_counts_per_kbp')
label_offset = 5e-2  # shifts each label up and to the right of its point

fig, ax = plt.subplots(figsize=(7, 7), layout='tight')
ax.scatter(
    tfbs_cds_counts['not_cds_counts_per_kbp'],
    tfbs_cds_counts['cds_counts_per_kbp'],
    s=150,
    color='cornflowerblue',
    alpha=0.7,
)
ax.plot([0, 3.5], [0, 3.5], color='black')

labelled_points = zip(
    top_tfs['not_cds_counts_per_kbp'],
    top_tfs['cds_counts_per_kbp'],
    top_tfs['tf_gene'],
)
for not_cds_density, cds_density, gene in labelled_points:
    ax.text(
        not_cds_density + label_offset,
        cds_density + label_offset,
        gene,
        fontsize=10,
        weight='bold',
        ha='left',
        va='bottom',
    )

ax.set(
    xlabel="Not CDS",
    ylabel="CDS",
    title="Number TFBS per 1 kbp",
    xlim=[0, 4],
    ylim=[0, 4],
)
ax.grid(False)
fig.savefig(config.FIGURES_DIR / "malinois_k562_tfbs_per_kbp_in_cds.pdf", format="pdf", bbox_inches="tight")
plt.close(fig)
