# PARNET - training data

## Overview

## Imports

In [40]:
import os
import sys
from pathlib import Path

import parnet
import parnet.additional_utils
import parnet.data
import parnet.utils
import torch
import torch.utils
import yaml
from datasets import load_from_disk

## Load

In [27]:
filepath = "../resources/parnet_encore_eclip/encode.filtered.hfds/"

In [34]:
# Load the data using the parnet library.
test_dataset = parnet.data.datasets.HFDSDataset(filepath, split="test")

# Wrap in DataLoader
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)

# Iterator ; we only want to inspect the first batch.
for batch in test_loader:
    batch_input_data, batch_target_data = batch[0], batch[1]

    print(f"{batch_input_data.keys()=}")
    print(f"{len(batch_input_data['sequence'])=}")
    print(f"{batch_input_data['sequence'].shape}")

    print("\n")

    print(f"{batch_target_data.keys()=}")
    print(f"{batch_target_data['total'].shape=}")
    print(f"{batch_target_data['control'].shape=}")

    break

Loading dataset from disk:   0%|          | 0/76 [00:00<?, ?it/s]

Loading dataset from disk:   0%|          | 0/18 [00:00<?, ?it/s]

batch_input_data.keys()=dict_keys(['sequence'])
len(batch_input_data['sequence'])=64
torch.Size([64, 4, 600])


batch_target_data.keys()=dict_keys(['total', 'control'])
batch_target_data['total'].shape=torch.Size([64, 223, 600])
batch_target_data['control'].shape=torch.Size([64, 223, 600])


Expected: 

```python
# Input data
batch_input_data.keys()=dict_keys(['sequence'])
len(batch_input_data['sequence'])=64
torch.Size([64, 4, 600])

# Target data
batch_target_data.keys()=dict_keys(['total', 'control'])
batch_target_data['total'].shape=torch.Size([64, 223, 600])
batch_target_data['control'].shape=torch.Size([64, 223, 600])
```

Input data: each of the 64 sequences in the batch are of length 600, with 4 channels (one for each nucleotide)

Output data:

- two tracks named "total" (for the eCLIP signal) and "control" (for the input control signal) for each of the 64 sequences. 
- each track is of length 600
- there are 223 experiments (RBP_CELL-LINE)


NOTE: the input data actually contains more information, which may be useful to exploit.

e.g. the `meta` dictionary contains the `name` of a given input region,
which is actually built from the genomic coordinates of that region.

This may be useful to augment the data with e.g. icSHAPE data.

In [None]:
test_dataset = load_from_disk(filepath)["test"]
element = next(iter(test_dataset))

print(element.keys())
print(f"{element['inputs'].keys()=}")
print(f"{element['inputs']['sequence'].keys()=}")

# TODO: make sure to parse correctly these elements.
print(f"{element['inputs']['sequence']['values'][:10]=}")
print(f"{len(element['inputs']['sequence']['values'])=}")
display(torch.Tensor(element["inputs"]["sequence"]["indices"]))

print(element["meta"])


Loading dataset from disk:   0%|          | 0/76 [00:00<?, ?it/s]

Loading dataset from disk:   0%|          | 0/18 [00:00<?, ?it/s]

dict_keys(['meta', 'inputs', 'outputs'])
element['inputs'].keys()=dict_keys(['sequence'])
element['inputs']['sequence'].keys()=dict_keys(['indices', 'size', 'values'])
element['inputs']['sequence']['values'][:10]=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
len(element['inputs']['sequence']['values'])=600


tensor([[  0.,   1.,   2.,  ..., 597., 598., 599.],
        [  2.,   2.,   0.,  ...,   3.,   2.,   0.]])

{'name': b'chr8:8838703-8839303:-'}


In [73]:
# TODO : parse correctly these elements

element["inputs"]["sequence"]["size"]

[600, 4]