In [3]:
params_notebook_name = "parnet.training_data.py.ipynb"
params_resource_dir = "../resources/"

# PARNET - training data

## Overview

## Imports

In [26]:
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import parnet
import parnet.additional_utils
import parnet.data
import parnet.utils
import pybedtools as pbt
import torch
import torch.utils
import yaml
from datasets import load_from_disk

In [4]:
def assert_notebook_working_dir(expected_local_file: os.PathLike) -> Path:
    """Assert or try updating the current working directory to where the notebook is located, to enable relative paths references.

    This function is used in a set-up where notebooks are contained within a project
    directory structure in which we want to reference filepaths relative to the notebook.
    e.g. "../src" or "../resources" should be accessible if the notebook is in
    "../notebooks/<notebook_name>.ipynb".

    The function first check the filepath of the expected local file relative
    to the current working directory.

    If not found, the function will try to use the VSCode Jupyter variable `__vsc_ipynb_file__`
    which should report the path of the notebook file being executed.

    It then checks if the expected local file exists, relative to the new working directory.


    Args:
        expected_local_file (os.PathLike): The expected local file to check for in the current working directory.
            This can be the name of the notebook file.

    Raises:
        KeyError: if the `__vsc_ipynb_file__` variable is not found in the global scope, while the first CWD check failed.
        FileNotFoundError: if the expected local file is not found in the current working directory after attempting to change it.
    """
    import os
    from pathlib import Path

    cwd = Path(os.getcwd())
    expected_local_filepath = cwd / expected_local_file

    if not expected_local_filepath.exists():
        if "__vsc_ipynb_file__" not in globals():
            raise KeyError(
                f"Detected CWD: {cwd} ; CWD does not contain expected file, but cannot use __vsc_ipynb_file__ to recover."
            )
        else:
            os.chdir(Path(globals()["__vsc_ipynb_file__"]).parent)
            cwd = Path(os.getcwd())
            print(f"Changed CWD to {cwd}")

            expected_local_filepath = cwd / expected_local_file
            if not expected_local_filepath.exists():
                raise FileNotFoundError(
                    f"Updated (using __vsc_ipynb_file__) CWD: {cwd} ; CWD does not contain expected file."
                )

            return cwd
    else:
        print(f"Confirmed CWD: {cwd} contains expected file: {expected_local_file}")
        return cwd


expected_local_file: str = params_notebook_name
cwd = assert_notebook_working_dir(expected_local_file=expected_local_file)
print(cwd)

Confirmed CWD: /home/l10n/projects/ml4rg25-parnet/parnet_demo/notebooks contains expected file: parnet.training_data.py.ipynb
/home/l10n/projects/ml4rg25-parnet/parnet_demo/notebooks


## Init

In [5]:
resource_dir = Path(params_resource_dir)
if not resource_dir.exists():
    raise FileNotFoundError("Resource directory does not exist: " + str(resource_dir))

print("Using resources from:", resource_dir)

Using resources from: ../resources


## Load 

In [None]:
bed6_cols = ["chrom", "start", "end", "name", "score", "strand"]


selected_annotation_set = "complete"

filepath_minus = (
    resource_dir
    / "general"
    / "gene_annotations.hg38.gencode_v40"
    / f"{selected_annotation_set}.non-overlap.annotated.minus.bed.gz"
)
filepath_plus = (
    resource_dir
    / "general"
    / "gene_annotations.hg38.gencode_v40"
    / f"{selected_annotation_set}.non-overlap.annotated.plus.bed.gz"
)

gencode_minus = pd.read_csv(filepath_minus, sep="\t", compression="gzip", header=None, comment="#", names=bed6_cols)
gencode_plus = pd.read_csv(filepath_plus, sep="\t", compression="gzip", header=None, comment="#", names=bed6_cols)

display(gencode_plus.head(10))
display(gencode_minus.head(10))


Unnamed: 0,chrom,start,end,name,score,strand
0,chr1,29553,30365,lncRNA;MIR1302-2HG;lncRNA;ENSG00000243485.5,.,+
1,chr1,30365,30503,ncRNA;MIR1302-2;miRNA;ENSG00000284332.1,.,+
2,chr1,30503,31109,lncRNA;MIR1302-2HG;lncRNA;ENSG00000243485.5,.,+
3,chr1,65418,65433,five_prime_UTR;OR4F5;protein_coding;ENSG000001...,.,+
4,chr1,65433,65519,intron;OR4F5;protein_coding;ENSG00000186092.7,.,+
5,chr1,65519,65564,five_prime_UTR;OR4F5;protein_coding;ENSG000001...,.,+
6,chr1,65564,65567,start_codon;OR4F5;protein_coding;ENSG000001860...,.,+
7,chr1,65567,65573,CDS;OR4F5;protein_coding;ENSG00000186092.7,.,+
8,chr1,65573,69036,intron;OR4F5;protein_coding;ENSG00000186092.7,.,+
9,chr1,69036,70005,CDS;OR4F5;protein_coding;ENSG00000186092.7,.,+


Unnamed: 0,chrom,start,end,name,score,strand
0,chr1,17368,17436,ncRNA;MIR6859-1;miRNA;ENSG00000278267.1,.,-
1,chr1,34553,36081,lncRNA;FAM138A;lncRNA;ENSG00000237613.2,.,-
2,chr1,89294,89550,lncRNA;ENSG00000238009;lncRNA;ENSG00000238009.6,.,-
3,chr1,89550,91105,lncRNA;ENSG00000238009;lncRNA;ENSG00000238009....,.,-
4,chr1,91105,133723,lncRNA;ENSG00000238009;lncRNA;ENSG00000238009.6,.,-
5,chr1,139789,140339,lncRNA;ENSG00000239906;lncRNA;ENSG00000239906.1,.,-
6,chr1,141473,157783,lncRNA;ENSG00000241860;lncRNA;ENSG00000241860.7,.,-
7,chr1,157783,157887,ncRNA;RNU6-1100P;snRNA;ENSG00000222623.1,.,-
8,chr1,157887,173862,lncRNA;ENSG00000241860;lncRNA;ENSG00000241860.7,.,-
9,chr1,187890,187958,ncRNA;MIR6859-2;miRNA;ENSG00000273874.1,.,-


## Demo - Load

In [6]:
filepath = "../resources/parnet_encore_eclip/encode.filtered.hfds/"

In [7]:
# Load the data using the parnet library.
test_dataset = parnet.data.datasets.HFDSDataset(filepath, split="test")

# Wrap in DataLoader
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)

# Iterator ; we only want to inspect the first batch.
for batch in test_loader:
    batch_input_data, batch_target_data = batch[0], batch[1]

    print(f"{batch_input_data.keys()=}")
    print(f"{len(batch_input_data['sequence'])=}")
    print(f"{batch_input_data['sequence'].shape}")

    print("\n")

    print(f"{batch_target_data.keys()=}")
    print(f"{batch_target_data['total'].shape=}")
    print(f"{batch_target_data['control'].shape=}")

    break

Loading dataset from disk:   0%|          | 0/76 [00:00<?, ?it/s]

Loading dataset from disk:   0%|          | 0/18 [00:00<?, ?it/s]

batch_input_data.keys()=dict_keys(['sequence'])
len(batch_input_data['sequence'])=64
torch.Size([64, 4, 600])


batch_target_data.keys()=dict_keys(['total', 'control'])
batch_target_data['total'].shape=torch.Size([64, 223, 600])
batch_target_data['control'].shape=torch.Size([64, 223, 600])


Expected: 

```python
# Input data
batch_input_data.keys()=dict_keys(['sequence'])
len(batch_input_data['sequence'])=64
torch.Size([64, 4, 600])

# Target data
batch_target_data.keys()=dict_keys(['total', 'control'])
batch_target_data['total'].shape=torch.Size([64, 223, 600])
batch_target_data['control'].shape=torch.Size([64, 223, 600])
```

Input data: each of the 64 sequences in the batch are of length 600, with 4 channels (one for each nucleotide)

Output data:

- two tracks named "total" (for the eCLIP signal) and "control" (for the input control signal) for each of the 64 sequences. 
- each track is of length 600
- there are 223 experiments (RBP_CELL-LINE)


NOTE: the input data actually contains more information, which may be useful to exploit.

e.g. the `meta` dictionary contains the `name` of a given input region,
which is actually built from the genomic coordinates of that region.

This may be useful to augment the data with e.g. icSHAPE data.

In [8]:
test_dataset = load_from_disk(filepath)["test"]
element = next(iter(test_dataset))

print(element.keys())
print(f"{element['inputs'].keys()=}")
print(f"{element['inputs']['sequence'].keys()=}")

# TODO: make sure to parse correctly these elements.
print(f"{element['inputs']['sequence']['values'][:10]=}")
print(f"{len(element['inputs']['sequence']['values'])=}")
display(torch.Tensor(element["inputs"]["sequence"]["indices"]))

print(element["meta"])


Loading dataset from disk:   0%|          | 0/76 [00:00<?, ?it/s]

Loading dataset from disk:   0%|          | 0/18 [00:00<?, ?it/s]

dict_keys(['meta', 'inputs', 'outputs'])
element['inputs'].keys()=dict_keys(['sequence'])
element['inputs']['sequence'].keys()=dict_keys(['indices', 'size', 'values'])
element['inputs']['sequence']['values'][:10]=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
len(element['inputs']['sequence']['values'])=600


tensor([[  0.,   1.,   2.,  ..., 597., 598., 599.],
        [  2.,   2.,   0.,  ...,   3.,   2.,   0.]])

{'name': b'chr8:8838703-8839303:-'}


In [9]:
# TODO : parse correctly these elements

element["inputs"]["sequence"]["size"]

[600, 4]

## Annotate - check which genomic regions are covered by the input data

Here : check whether the input data used to train the model is restricted to specific genomic regions.

To do so: we will gather all the intervals metadata, and intersect them with GENCODE segmented regions.

### Gather intervals metadata

In [None]:
intervals = []

test_dataset = load_from_disk(filepath)["test"]
for element in iter(test_dataset):
    name = element["meta"]["name"].decode("utf-8")

    # Parse into a bed6
    chrom, start_end, strand = name.split(":")
    start, end = map(int, start_end.split("-"))

    interval = pd.Series(
        {
            "chrom": chrom,
            "start": start,
            "end": end,
            "name": name,
            "score": 0,  # Placeholder score
            "strand": strand,
        }
    )
    intervals.append(interval)

Loading dataset from disk:   0%|          | 0/76 [00:00<?, ?it/s]

Loading dataset from disk:   0%|          | 0/18 [00:00<?, ?it/s]

In [21]:
intervals_df = pd.DataFrame(intervals, columns=bed6_cols).sort_values(by=["chrom", "start"])

### Annotate with GENCODE segments

In [33]:
intervals_df_minus = intervals_df[intervals_df["strand"] == "-"].sort_values(by=["chrom", "start"])
intervals_df_plus = intervals_df[intervals_df["strand"] == "+"].sort_values(by=["chrom", "start"])

In [None]:
bt_intervals_df_minus = pbt.BedTool.from_dataframe(intervals_df_minus)
intersect_minus = bt_intervals_df_minus.intersect(
    pbt.BedTool.from_dataframe(gencode_minus), wao=True, s=True
).to_dataframe(names=["a." + c for c in bed6_cols] + ["b." + c for c in bed6_cols] + ["ol"])

bt_intervals_df_plus = pbt.BedTool.from_dataframe(intervals_df_plus)
intersect_plus = bt_intervals_df_plus.intersect(
    pbt.BedTool.from_dataframe(gencode_plus), wao=True, s=True
).to_dataframe(names=["a." + c for c in bed6_cols] + ["b." + c for c in bed6_cols] + ["ol"])

In [42]:
# Here: for each 600nt window, keep only the largest overlap with a genomic annotation.


unique_intersect_minus = (
    intersect_minus.sort_values(by=["a.name", "ol"], ascending=[True, False])
    .drop_duplicates(subset=["a.chrom", "a.start", "a.end", "a.name"], keep="first")
    .reset_index(drop=True)
)

unique_intersect_plus = (
    intersect_plus.sort_values(by=["a.name", "ol"], ascending=[True, False])
    .drop_duplicates(subset=["a.chrom", "a.start", "a.end", "a.name"], keep="first")
    .reset_index(drop=True)
)

In [44]:
(intersect_plus["ol"] != 0).sum() / len(intersect_plus), (intersect_minus["ol"] != 0).sum() / len(intersect_minus)

(0.9969896182510496, 0.9945260269173503)

In [50]:
# Here, showing the total number of unique genomic annotations in which the 600nt windows overlap
# This was mostly to verify that intronic regions are indeed considered.
(
    unique_intersect_minus["b.name"].str.split(";").str[0].value_counts()
    + unique_intersect_plus["b.name"].str.split(";").str[0].value_counts()
)

b.name
intron             55100
lncRNA              6422
three_prime_UTR     4215
exon                2391
CDS                 1193
five_prime_UTR       682
.                    560
ncRNA                 63
Name: count, dtype: int64