In [4]:
import os

import pandas as pd

In [6]:
# Tab-separated subset of the DeepSTARR sequences/activity table, read directly from the URL.
filename = 'https://data.starklab.org/almeida/DeepSTARR/Tutorial/Sequences_activity_subset.txt'
data = pd.read_csv(filename, sep='\t')

In [7]:
# Display the loaded table as the cell's output.
data

Unnamed: 0,seqnames,start,end,ID,set,Sequence,Dev_log2_enrichment,Hk_log2_enrichment
0,chr3R,21360001,21360249,chr3R_21360001_21360249_+_negative,Train,TGGGTCAGCTCGGCGTAGTCCGAAATCTATTCTTTCAATTATTAAT...,0.438053,-1.102117
1,chr3L,4121751,4121999,chr3L_4121751_4121999_-_positive_peaks,Train,TTGTCAAGATTTTATCTTCGCGCGCCAAATGCCAAAAATTAGCCAA...,5.796507,2.271401
2,chrX,17616495,17616743,chrX_17616495_17616743_+_peak_849bp_region,Train,GTTCTATTGCTCGACTGTGTGTGCGGCAATCTATAATATAAGATGT...,1.271845,0.089503
3,chr3R,23774097,23774345,chr3R_23774097_23774345_+_peak_849bp_region,Train,TACATGAAAAGATACTAATTTGTTTCAAATATAAATCATATATCTA...,-1.425885,-1.103772
4,chr3L,17300157,17300405,chr3L_17300157_17300405_-_peak_849bp_region,Train,GGTCCGCAAACAAACACACTCAATTACATGCAGTAAAATTTGTTTT...,-0.964305,-1.241142
...,...,...,...,...,...,...,...,...
131751,chr2R,21141601,21141849,chr2R_21141601_21141849_-_negative,Test,CGGGATTGTCTATTTAAGTCACTCAGCTCCCTTGCTATACCCAAGA...,0.104630,-0.644837
131752,chr2R,21141901,21142149,chr2R_21141901_21142149_-_negative,Test,GCACTAGCTGAGTAACAGGTATTTGATCGTTGGGGAACTCTCGTTT...,-1.318970,0.663313
131753,chr2R,21142401,21142649,chr2R_21142401_21142649_-_negative,Test,TGAAAGTGTGTGCGTTCTGTTCTCTGTACTTTTCGGTGTAAAAGTA...,0.681030,-2.151505
131754,chr2R,21142501,21142749,chr2R_21142501_21142749_-_negative,Test,GCGCCGTGTTAAACACAAGTTTTTTGGCGGAATGCCTATTTAATCT...,1.144431,-1.877330


In [9]:
# Partition the table using the predefined split labels stored in the 'set' column.
train_data = data[data['set'] == "Train"]
val_data = data[data['set'] == "Val"]
test_data = data[data['set'] == "Test"]

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before writing.
os.makedirs('../data', exist_ok=True)

# Write each split to '../data/<split>_data.csv' without the index column.
for split_name, split_df in (
    ('train', train_data),
    ('val', val_data),
    ('test', test_data),
):
    split_df.to_csv(f'../data/{split_name}_data.csv', index=False)

In [1]:
import torch
from torch.utils.data import Dataset
import pandas as pd
import selene_sdk
import numpy as np


def one_hot_encode(seqs):
    """One-hot encode a collection of DNA sequences into a single array.

    Every sequence is encoded with
    ``selene_sdk.sequences.Genome.sequence_to_encoding`` and the result is
    transposed so that the 4 base channels come before the position axis.

    Parameters
    ----------
    seqs : list-like
        DNA sequences to encode, one string per entry.

    Returns
    -------
    ndarray, shape (number of sequences, 4, length of sequence)
        The transposed per-sequence encodings stacked along a new first axis.
    """
    to_encoding = selene_sdk.sequences.Genome.sequence_to_encoding
    # Transpose each encoding before stacking along a new leading axis.
    return np.stack([to_encoding(single_seq).T for single_seq in seqs])

class DeepSTARRDataset(Dataset):
    """Map-style dataset over one pre-split DeepSTARR CSV file.

    Reads ``<data_dir>/<data_type>_data.csv``, one-hot encodes its
    ``Sequence`` column with ``one_hot_encode`` and keeps the
    ``Dev_log2_enrichment`` column as the target.

    Parameters
    ----------
    data_type : str
        Which split to load: 'train', 'val' or 'test'.
    data_dir : str, optional
        Directory holding the split CSV files. Defaults to '../data'.

    Raises
    ------
    ValueError
        If `data_type` is not one of the supported split names.
    """

    # Split name -> CSV file name inside `data_dir`.
    _SPLIT_FILES = {
        'train': 'train_data.csv',
        'val': 'val_data.csv',
        'test': 'test_data.csv',
    }

    def __init__(self, data_type, data_dir='../data'):
        # Reject unknown splits explicitly; otherwise no file would be
        # selected and the failure would surface later as an unrelated error.
        if data_type not in self._SPLIT_FILES:
            raise ValueError(
                f"Unknown data_type {data_type!r}; "
                f"expected one of {sorted(self._SPLIT_FILES)}"
            )

        data_df = pd.read_csv(f"{data_dir}/{self._SPLIT_FILES[data_type]}")

        # Encoded inputs are kept as the array returned by one_hot_encode.
        self.seqs_hot = one_hot_encode(data_df['Sequence'])
        # Targets are stored as a float tensor with one value per row.
        self.target = torch.tensor(data_df["Dev_log2_enrichment"].values, dtype=torch.float)

    def __len__(self):
        """Return the number of examples (one per target value)."""
        return len(self.target)

    def __getitem__(self, idx):
        """Return the ``(encoded sequence, target)`` pair at position `idx`."""
        return self.seqs_hot[idx], self.target[idx]

In [2]:
# Build the training split and print the shape of the first encoded sequence.
dataset = DeepSTARRDataset('train')
first_input = dataset[0][0]
print(first_input.shape)

(4, 249)
