In [1]:
# Notebook parameters:
# - `params_notebook_name`: file name of this notebook; used below as the file that
#   must be present in the current working directory.
# - `params_resource_dir`: relative path to the resources directory read by this notebook.
params_notebook_name = "datasets.rbps_metadata.py.ipynb"
params_resource_dir = "../resources/"

# RBPs - metadata

## Overview

## Imports

In [100]:
import os
import sys
from io import StringIO
from pathlib import Path

import numpy as np
import pandas as pd
from dotmap import DotMap

In [3]:
def assert_notebook_working_dir(expected_local_file: os.PathLike) -> Path:
    """Ensure the current working directory (CWD) is the directory holding the notebook.

    Notebooks in this project reference files relative to their own location
    (e.g. "../src" or "../resources" from "../notebooks/<notebook_name>.ipynb"),
    so the CWD has to be the notebook directory for those paths to resolve.

    Steps:

    1. Look for `expected_local_file` relative to the CWD; if it is there, the CWD
       is confirmed and returned as is.
    2. Otherwise, read the VSCode Jupyter variable `__vsc_ipynb_file__` (path of the
       notebook being executed) from the global namespace, and change the CWD to
       the parent directory of that path.
    3. Look for `expected_local_file` again, relative to the new CWD.

    Args:
        expected_local_file (os.PathLike): File expected to exist in the notebook
            directory. This can be the name of the notebook file itself.

    Returns:
        Path: The confirmed (possibly updated) current working directory.

    Raises:
        KeyError: if the first check failed and `__vsc_ipynb_file__` is not defined
            in the global scope.
        FileNotFoundError: if the expected file is still missing after the CWD was changed.
    """
    vscode_notebook_var = "__vsc_ipynb_file__"

    current_dir = Path(os.getcwd())

    # Happy path: the CWD already contains the expected file.
    if (current_dir / expected_local_file).exists():
        print(f"Confirmed CWD: {current_dir} contains expected file: {expected_local_file}")
        return current_dir

    # `globals()` is the namespace in which this function is defined, i.e. the
    # notebook namespace when the function lives in a notebook cell.
    if vscode_notebook_var not in globals():
        raise KeyError(
            f"Detected CWD: {current_dir} ; CWD does not contain expected file, but cannot use __vsc_ipynb_file__ to recover."
        )

    # Recover by moving next to the notebook file reported by VSCode.
    os.chdir(Path(globals()[vscode_notebook_var]).parent)
    current_dir = Path(os.getcwd())
    print(f"Changed CWD to {current_dir}")

    if not (current_dir / expected_local_file).exists():
        raise FileNotFoundError(
            f"Updated (using __vsc_ipynb_file__) CWD: {current_dir} ; CWD does not contain expected file."
        )

    return current_dir


# Check (or fix) the working directory, using this notebook's own file name as
# the file expected to be found in it, then show the resulting directory.
expected_local_file: str = params_notebook_name
cwd = assert_notebook_working_dir(expected_local_file=expected_local_file)
print(cwd)

Confirmed CWD: /home/l10n/projects/ml4rg25-parnet/parnet_demo/notebooks contains expected file: datasets.rbps_metadata.py.ipynb
/home/l10n/projects/ml4rg25-parnet/parnet_demo/notebooks


## Init

In [4]:
# Resolve the resources directory from the notebook parameter and fail early
# when it is missing, before any table is loaded from it.
resource_dir = Path(params_resource_dir)

if resource_dir.exists():
    print("Using resources from:", resource_dir)
else:
    raise FileNotFoundError("Resource directory does not exist: " + str(resource_dir))

Using resources from: ../resources


## Load

### Benchmark sequence vs multimodal performance

In [5]:
# Benchmark table with the `seq` and `seq+struct` performance of each RBP dataset.
seq_vs_struct_perf_metadata_filepath = resource_dir.joinpath(
    "parnet_encore_eclip",
    "rbps_metadata",
    "benchmark-project.seq-vs-struct-performance.prismnet_tmp_inputfeat_n2_piv_ALT.tsv",
)

# Read the tab-separated table and keep only the rows of the "ENCODE" dataset.
seq_vs_struct_perf_metadata = (
    pd.read_csv(seq_vs_struct_perf_metadata_filepath, sep="\t")
    .loc[lambda df: df["dataset"].eq("ENCODE")]
    .copy()
)

display(seq_vs_struct_perf_metadata.head())
print(seq_vs_struct_perf_metadata.shape)

Unnamed: 0,dataset,RBP_dataset,seq,seq+struct,delta,is_top_decile,is_top_decile_n1
0,ENCODE,AARS_K562,0.65103,0.670804,0.019775,False,False
1,ENCODE,AATF_K562,0.771926,0.774804,0.002878,False,False
2,ENCODE,ABCF1_K562,0.742177,0.759099,0.016922,False,True
3,ENCODE,AGGF1_HepG2,0.778248,0.76714,-0.011108,False,False
4,ENCODE,AGGF1_K562,0.804949,0.818631,0.013682,False,False


(223, 7)


### Precomputed PARNET performance

This comes from a previous evaluation of the PARNET model(s) on various tasks.

The models were evaluated on a classification task: within each input window, does the model assign a higher probability density to the positions that overlap with peaks than to the other positions?

In [6]:
# Precomputed performance of the PARNET models (here: PARNET 7m p=0.0).
# It is used to make sure that the selected RBPs are not too poorly performing.
performance_table_filepath = resource_dir.joinpath("parnet_encore_eclip", "rbps_metadata", "tmp_aucs_model_rbp.tsv")

params_selected_model = "PARNET_7M_P0.0"
params_selected_track = "target"

performance_table = (
    pd.read_csv(performance_table_filepath, sep="\t")
    # Keep only the rows of the selected model and track.
    .loc[lambda df: df["model_name"].eq(params_selected_model) & df["track_name"].eq(params_selected_track)]
    # Rank the RBPs on `prc_auc`: rank 1 is the highest value, tied values share the lowest rank.
    .assign(rank=lambda df: df["prc_auc"].rank(ascending=False, method="min").astype(int))
)

display(performance_table.head())
print(performance_table.shape)


Unnamed: 0,model_name,rbp_ct,track_name,prc_auc,roc_auc,n_peaks,rank
223,PARNET_7M_P0.0,AARS_K562,target,0.065987,0.622589,144,215
224,PARNET_7M_P0.0,AATF_K562,target,0.079344,0.621983,21,210
225,PARNET_7M_P0.0,ABCF1_K562,target,0.017529,0.465338,2,222
226,PARNET_7M_P0.0,AGGF1_HepG2,target,0.101915,0.70396,200,196
227,PARNET_7M_P0.0,AGGF1_K562,target,0.140139,0.710133,200,153


(223, 7)


### RBP binding domains

In [73]:
# Annotation table with one row per RBP and one column per binding-domain type.
filepath = resource_dir.joinpath(
    "parnet_encore_eclip",
    "rbps_metadata",
    "yeo_et_al_annotation_table",
    "yeo_RBP_annotation.domains.csv",
)

rbps_domains = pd.read_csv(filepath)  # comma-separated, which is the `read_csv` default

display(rbps_domains.head(), rbps_domains.shape)


Unnamed: 0,name,geneID,RRM,ZNF,KH,Helicase,Nuclease,dRBM,PUM_HD
0,A1CF,ENSG00000148584,1,0,0,0,0,0,0
1,AARS,ENSG00000090861,0,0,0,0,0,0,0
2,AATF,ENSG00000108270,0,0,0,0,0,0,0
3,ABCF1,ENSG00000204574,0,0,0,0,0,0,0
4,ABT1,ENSG00000146109,1,0,0,0,0,0,0


(356, 9)

## RBPs benefiting from RNA structure information

In [None]:
# Join the benchmark table (seq vs seq+struct performance) with the rank each RBP
# obtained in the PARNET model evaluation.
# The goal is to identify RBPs that benefit from the seq+struct features while also
# being well-performing in the PARNET evaluation, so that the improvement of PARNET
# seq+struct vs seq-only can then be checked on them.
# `how="left"` keeps every row of the benchmark table.
performance_metadata_full = seq_vs_struct_perf_metadata.merge(
    performance_table[["rbp_ct", "rank"]].rename(columns={"rank": "parnet_perf_rank"}),
    how="left",
    left_on="RBP_dataset",
    right_on="rbp_ct",
)


# Display the first 15 RBPs when sorting on, in this order:
# - the highest DELTA between seq+struct and seq-only performance
#   (from the benchmark analysis on other models)
# - membership of the "top decile" of highest delta in this benchmark
#   - note on `is_top_decile` and `is_top_decile_n1`:
#       - `_n1` suffix: models were trained to classify peaks against randomly sampled regions.
#       - `is_top_decile`: models trained to classify peaks from one RBP against peaks from other RBPs.
#       - i.e. the `is_top_decile` situation is more strict as all regions may contain structure related to RBPs.
# - the PARNET performance rank (lowest rank = best performance)
# NOTE(review): `delta` is the first sort key, so the other keys only break ties
# between rows that have exactly the same delta.
display(
    performance_metadata_full.sort_values(
        by=["delta", "is_top_decile", "is_top_decile_n1", "parnet_perf_rank"],
        ascending=[False, False, False, True],
        ignore_index=True,
    ).head(15)
)

Unnamed: 0,dataset,RBP_dataset,seq,seq+struct,delta,is_top_decile,is_top_decile_n1,rbp_ct,parnet_perf_rank
0,ENCODE,APOBEC3C_K562,0.610052,0.819037,0.208984,True,True,APOBEC3C_K562,60
1,ENCODE,LARP4_HepG2,0.635173,0.827773,0.1926,True,True,LARP4_HepG2,73
2,ENCODE,SBDS_K562,0.610017,0.785654,0.175637,True,False,SBDS_K562,49
3,ENCODE,DDX55_HepG2,0.662662,0.817323,0.154661,True,True,DDX55_HepG2,70
4,ENCODE,DDX55_K562,0.668456,0.817482,0.149026,True,True,DDX55_K562,35
5,ENCODE,PABPC4_K562,0.705461,0.853262,0.147801,True,True,PABPC4_K562,134
6,ENCODE,UPF1_K562,0.732386,0.854118,0.121732,True,True,UPF1_K562,67
7,ENCODE,UPF1_HepG2,0.754238,0.857766,0.103528,True,True,UPF1_HepG2,75
8,ENCODE,DDX6_K562,0.650847,0.75349,0.102644,True,False,DDX6_K562,24
9,ENCODE,AKAP1_HepG2,0.736106,0.828647,0.092541,True,True,AKAP1_HepG2,42


For simplicity we apply the following filters:

- Separate HepG2 and K562 RBPs: for the Multimodal-PARNET models, we will have separate models for HepG2 and K562.
- (optional) Max rank: 100
- (optional) Represent all three "benefit from structure" groups, i.e. the (`is_top_decile`, `is_top_decile_n1`) combinations: True/True, True/False and False/False.


### HepG2

In [43]:
# First 15 HepG2 rows, sorted on delta (descending), then the two top-decile flags
# (True first), then the PARNET rank (best first).
selected_hepg2 = (
    performance_metadata_full[performance_metadata_full["RBP_dataset"].str.endswith("HepG2")]
    .sort_values(
        by=["delta", "is_top_decile", "is_top_decile_n1", "parnet_perf_rank"],
        ascending=[False, False, False, True],
    )
    .head(15)
)

display(selected_hepg2)

# Number of selected rows per (`is_top_decile`, `is_top_decile_n1`) combination.
display(selected_hepg2.groupby(["is_top_decile", "is_top_decile_n1"]).size().rename("count").reset_index())


Unnamed: 0,dataset,RBP_dataset,seq,seq+struct,delta,is_top_decile,is_top_decile_n1,rbp_ct,parnet_perf_rank
98,ENCODE,LARP4_HepG2,0.635173,0.827773,0.1926,True,True,LARP4_HepG2,73
29,ENCODE,DDX55_HepG2,0.662662,0.817323,0.154661,True,True,DDX55_HepG2,70
199,ENCODE,UPF1_HepG2,0.754238,0.857766,0.103528,True,True,UPF1_HepG2,75
5,ENCODE,AKAP1_HepG2,0.736106,0.828647,0.092541,True,True,AKAP1_HepG2,42
100,ENCODE,LARP7_HepG2,0.684264,0.756756,0.072491,True,False,LARP7_HepG2,197
191,ENCODE,TROVE2_HepG2,0.659024,0.729493,0.070469,True,False,TROVE2_HepG2,177
32,ENCODE,DDX6_HepG2,0.610479,0.673289,0.062809,True,False,DDX6_HepG2,202
104,ENCODE,LSM11_HepG2,0.656695,0.71059,0.053895,True,False,LSM11_HepG2,122
122,ENCODE,PABPN1_HepG2,0.758125,0.811134,0.053008,True,False,PABPN1_HepG2,52
112,ENCODE,NIP7_HepG2,0.785349,0.822604,0.037254,False,False,NIP7_HepG2,200


Unnamed: 0,is_top_decile,is_top_decile_n1,count
0,False,False,6
1,True,False,5
2,True,True,4


In [None]:
# Fill the HepG2 subset up to 20 RBPs with the best-ranked HepG2 RBPs
# (lowest `parnet_perf_rank`) that are not already part of `selected_hepg2`.
additional_hepg2_noimpact = (
    performance_metadata_full.loc[lambda df: df["RBP_dataset"].str.endswith("HepG2"), :]
    .sort_values(
        by=["parnet_perf_rank"],
        ascending=[True],
    )
    # Exclude already selected RBPs. The reference set has to be `selected_hepg2`:
    # testing against `selected_k562` fails on a fresh kernel (it is only defined in
    # the K562 section further down) and, once defined, never matches a HepG2 row.
    .loc[lambda df: ~df["RBP_dataset"].isin(selected_hepg2["RBP_dataset"]), :]
).head(20 - selected_hepg2.shape[0])  # We want a total of 20 RBPs.


# Stack both selections; `drop_duplicates` is a safety net against repeated rows.
subset_hepg2 = (
    pd.concat([selected_hepg2, additional_hepg2_noimpact], ignore_index=True).drop_duplicates().reset_index(drop=True)
)

# Total number of RBPs in the subset (expected: 20).
display(subset_hepg2.shape[0])

# Number of RBPs per (`is_top_decile`, `is_top_decile_n1`) combination.
display(subset_hepg2.groupby(["is_top_decile", "is_top_decile_n1"]).size().reset_index(name="count"))

20

Unnamed: 0,is_top_decile,is_top_decile_n1,count
0,False,False,11
1,True,False,5
2,True,True,4


In [106]:
# Names of the selected HepG2 RBP datasets, as a plain Python list.
display(subset_hepg2["RBP_dataset"].tolist())

['LARP4_HepG2',
 'DDX55_HepG2',
 'UPF1_HepG2',
 'AKAP1_HepG2',
 'LARP7_HepG2',
 'TROVE2_HepG2',
 'DDX6_HepG2',
 'LSM11_HepG2',
 'PABPN1_HepG2',
 'NIP7_HepG2',
 'SSB_HepG2',
 'SUB1_HepG2',
 'BUD13_HepG2',
 'PCBP1_HepG2',
 'AQR_HepG2',
 'PRPF8_HepG2',
 'U2AF2_HepG2',
 'POLR2G_HepG2',
 'TRA2A_HepG2',
 'TIAL1_HepG2']

In [104]:
# Print the HepG2 subset as CSV text (written to an in-memory buffer, not to disk).
output = StringIO()
subset_hepg2.to_csv(output, sep=",", index=False)
print(output.getvalue())

dataset,RBP_dataset,seq,seq+struct,delta,is_top_decile,is_top_decile_n1,rbp_ct,parnet_perf_rank
ENCODE,LARP4_HepG2,0.6351733307999995,0.8277728760716772,0.1925995452716776,True,True,LARP4_HepG2,73
ENCODE,DDX55_HepG2,0.6626620997591289,0.8173231348815819,0.154661035122453,True,True,DDX55_HepG2,70
ENCODE,UPF1_HepG2,0.754238382429206,0.8577664592586105,0.1035280768294044,True,True,UPF1_HepG2,75
ENCODE,AKAP1_HepG2,0.7361057773433687,0.8286465646508141,0.0925407873074454,True,True,AKAP1_HepG2,42
ENCODE,LARP7_HepG2,0.6842644848557238,0.7567559497148317,0.0724914648591079,True,False,LARP7_HepG2,197
ENCODE,TROVE2_HepG2,0.6590240829875056,0.729493159160649,0.0704690761731434,True,False,TROVE2_HepG2,177
ENCODE,DDX6_HepG2,0.6104794431141986,0.6732888419352899,0.0628093988210913,True,False,DDX6_HepG2,202
ENCODE,LSM11_HepG2,0.6566949568651034,0.7105899785449412,0.0538950216798378,True,False,LSM11_HepG2,122
ENCODE,PABPN1_HepG2,0.7581253726678074,0.8111336554700694,0.053008282802262,True,False,PABPN1

### K562

In [94]:
# First 15 K562 rows, sorted on delta (descending), then the two top-decile flags
# (True first), then the PARNET rank (best first).
selected_k562 = (
    performance_metadata_full[performance_metadata_full["RBP_dataset"].str.endswith("K562")]
    .sort_values(
        by=["delta", "is_top_decile", "is_top_decile_n1", "parnet_perf_rank"],
        ascending=[False, False, False, True],
    )
    .head(15)
)

display(selected_k562)

# Number of selected rows per (`is_top_decile`, `is_top_decile_n1`) combination.
display(selected_k562.groupby(["is_top_decile", "is_top_decile_n1"]).size().rename("count").reset_index())


Unnamed: 0,dataset,RBP_dataset,seq,seq+struct,delta,is_top_decile,is_top_decile_n1,rbp_ct,parnet_perf_rank
8,ENCODE,APOBEC3C_K562,0.610052,0.819037,0.208984,True,True,APOBEC3C_K562,60
153,ENCODE,SBDS_K562,0.610017,0.785654,0.175637,True,False,SBDS_K562,49
30,ENCODE,DDX55_K562,0.668456,0.817482,0.149026,True,True,DDX55_K562,35
121,ENCODE,PABPC4_K562,0.705461,0.853262,0.147801,True,True,PABPC4_K562,134
200,ENCODE,UPF1_K562,0.732386,0.854118,0.121732,True,True,UPF1_K562,67
33,ENCODE,DDX6_K562,0.650847,0.75349,0.102644,True,False,DDX6_K562,24
180,ENCODE,SUPV3L1_K562,0.710404,0.796817,0.086413,True,True,SUPV3L1_K562,201
149,ENCODE,RPS3_K562,0.758465,0.844651,0.086186,True,False,RPS3_K562,108
66,ENCODE,GNL3_K562,0.592475,0.671342,0.078867,True,True,GNL3_K562,184
6,ENCODE,AKAP1_K562,0.766397,0.843571,0.077174,True,True,AKAP1_K562,23


Unnamed: 0,is_top_decile,is_top_decile_n1,count
0,True,False,7
1,True,True,8


In [95]:
# Best-ranked K562 RBPs (lowest `parnet_perf_rank`) that are not already part of
# `selected_k562`, limited to what is needed to reach a total of 20 RBPs.
additional_k562_noimpact = (
    performance_metadata_full[performance_metadata_full["RBP_dataset"].str.endswith("K562")]
    .sort_values(by="parnet_perf_rank", ascending=True)
    .loc[lambda df: ~df["RBP_dataset"].isin(selected_k562["RBP_dataset"])]  # Exclude already selected RBPs
    .head(20 - len(selected_k562))  # We want a total of 20 RBPs.
)


In [96]:
# `additional_k562_noimpact` comes from the previous cell: the best-ranked K562 RBPs that
# are not in `selected_k562`, limited to what is needed to reach 20 RBPs. It used to be
# recomputed here with a verbatim copy of that cell; the duplicate has been removed.

# Stack both selections; `drop_duplicates` is a safety net against repeated rows.
subset_k562 = (
    pd.concat([selected_k562, additional_k562_noimpact], ignore_index=True).drop_duplicates().reset_index(drop=True)
)

# Number of RBPs per (`is_top_decile`, `is_top_decile_n1`) combination.
display(subset_k562.groupby(["is_top_decile", "is_top_decile_n1"]).size().reset_index(name="count"))

Unnamed: 0,is_top_decile,is_top_decile_n1,count
0,False,False,5
1,True,False,7
2,True,True,8


In [None]:
# Names of the selected K562 RBP datasets, as a plain Python list.
subset_k562["RBP_dataset"].tolist()

['APOBEC3C_K562',
 'SBDS_K562',
 'DDX55_K562',
 'PABPC4_K562',
 'UPF1_K562',
 'DDX6_K562',
 'SUPV3L1_K562',
 'RPS3_K562',
 'GNL3_K562',
 'AKAP1_K562',
 'ZC3H8_K562',
 'METAP2_K562',
 'UTP18_K562',
 'IGF2BP2_K562',
 'GPKOW_K562',
 'U2AF1_K562',
 'RBM22_K562',
 'DDX42_K562',
 'HNRNPM_K562',
 'PTBP1_K562']

In [None]:
# Print the K562 subset as CSV text (written to an in-memory buffer, not to disk).
output = StringIO()
subset_k562.to_csv(output, sep=",", index=False)
print(output.getvalue())

dataset,RBP_dataset,seq,seq+struct,delta,is_top_decile,is_top_decile_n1,rbp_ct,parnet_perf_rank
ENCODE,APOBEC3C_K562,0.6100524146027858,0.8190366470149153,0.2089842324121295,True,True,APOBEC3C_K562,60
ENCODE,SBDS_K562,0.6100166525277281,0.7856536902629843,0.1756370377352561,True,False,SBDS_K562,49
ENCODE,DDX55_K562,0.6684556383665868,0.8174819062056824,0.1490262678390955,True,True,DDX55_K562,35
ENCODE,PABPC4_K562,0.7054611126448499,0.8532620393900807,0.1478009267452308,True,True,PABPC4_K562,134
ENCODE,UPF1_K562,0.7323863590236607,0.8541183143726966,0.1217319553490359,True,True,UPF1_K562,67
ENCODE,DDX6_K562,0.6508466946960998,0.7534903307339846,0.1026436360378848,True,False,DDX6_K562,24
ENCODE,SUPV3L1_K562,0.7104036908881199,0.7968166089965398,0.0864129181084198,True,True,SUPV3L1_K562,201
ENCODE,RPS3_K562,0.7584649609630604,0.84465066054548,0.0861856995824196,True,False,RPS3_K562,108
ENCODE,GNL3_K562,0.5924751545044846,0.6713421705494634,0.0788670160449788,True,True,GNL3_K562,184
ENCODE

### Check common RBPs between the two sets

In [None]:
# RBP names (the part of `RBP_dataset` before the first "_") found in both subsets.
tmp = set(subset_hepg2["RBP_dataset"].str.split("_").str[0]).intersection(
    subset_k562["RBP_dataset"].str.split("_").str[0]
)

# Rows of each subset whose RBP is shared with the other subset.
display(subset_k562[subset_k562["RBP_dataset"].str.split("_").str[0].isin(tmp)])
display(subset_hepg2[subset_hepg2["RBP_dataset"].str.split("_").str[0].isin(tmp)])

Unnamed: 0,dataset,RBP_dataset,seq,seq+struct,delta,is_top_decile,is_top_decile_n1,rbp_ct,parnet_perf_rank
2,ENCODE,DDX55_K562,0.668456,0.817482,0.149026,True,True,DDX55_K562,35
4,ENCODE,UPF1_K562,0.732386,0.854118,0.121732,True,True,UPF1_K562,67
5,ENCODE,DDX6_K562,0.650847,0.75349,0.102644,True,False,DDX6_K562,24
9,ENCODE,AKAP1_K562,0.766397,0.843571,0.077174,True,True,AKAP1_K562,23


Unnamed: 0,dataset,RBP_dataset,seq,seq+struct,delta,is_top_decile,is_top_decile_n1,rbp_ct,parnet_perf_rank
1,ENCODE,DDX55_HepG2,0.662662,0.817323,0.154661,True,True,DDX55_HepG2,70
2,ENCODE,UPF1_HepG2,0.754238,0.857766,0.103528,True,True,UPF1_HepG2,75
3,ENCODE,AKAP1_HepG2,0.736106,0.828647,0.092541,True,True,AKAP1_HepG2,42
6,ENCODE,DDX6_HepG2,0.610479,0.673289,0.062809,True,False,DDX6_HepG2,202


## RBP binding domains

In [74]:
# Preview of the domains table (first rows), followed by its (rows, columns) shape.
display(rbps_domains.head(), rbps_domains.shape)

Unnamed: 0,name,geneID,RRM,ZNF,KH,Helicase,Nuclease,dRBM,PUM_HD
0,A1CF,ENSG00000148584,1,0,0,0,0,0,0
1,AARS,ENSG00000090861,0,0,0,0,0,0,0
2,AATF,ENSG00000108270,0,0,0,0,0,0,0
3,ABCF1,ENSG00000204574,0,0,0,0,0,0,0
4,ABT1,ENSG00000146109,1,0,0,0,0,0,0


(356, 9)

In [78]:
# Every RBP name of the benchmark table (the part of `RBP_dataset` before the first "_")
# must appear in the `name` column of the domains table.
assert set(performance_metadata_full["RBP_dataset"].str.split("_").str[0]).issubset(
    set(rbps_domains["name"])
), "The set of RBPs with performance values from benchmark should have annotations in the domains table."

In [None]:
# Sum of the domain columns for each RBP, largest first (top 20):
# this shows that some RBPs have multiple domains annotated.
(
    rbps_domains.set_index("name")
    .drop(columns="geneID")
    .sum(axis=1)
    .sort_values(ascending=False)
    .head(20)
)

name
IGF2BP3    2
XRN2       2
U2AF1      2
ZCRB1      2
SF1        2
DHX30      2
RBM27      2
RBM4B      2
RBM5       2
DROSHA     2
IGF2BP1    2
IGF2BP2    2
RBM22      2
SRSF7      2
RBM4       2
CNOT4      2
WRN        2
FMR1       1
KHSRP      1
KHDRBS2    1
dtype: int64