In [1]:
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim

In [2]:
# Open the .npz archive; later cells read its 'dna', 'histone', 'methyl' and 'coords' arrays.
# NOTE(review): allow_pickle=True lets NumPy unpickle object arrays, which can run
# arbitrary code from the file. Only load trusted archives, and confirm whether any
# stored array is really object-typed; if not, allow_pickle=False is the safer choice.
data = np.load(
    'test.for.torch.npz',
    allow_pickle=True,
)

In [6]:
#### Dataset Class
## Notice that the __init__ method contains an argument `apply_log10`, if you set it to True
## you will apply a log10 to the raw counts. We can experiment with this
class MethDataset(Dataset):
    """Dataset pairing each sample's DNA sequence and histone tracks with its methylation.

    Args:
        sequence: NumPy array indexed by sample along its first axis.
        histone: NumPy array indexed as ``histone[sample, :, mark]``; the last axis
            holds the marks in the order listed in ``histone_names``.
        methylation: array with one entry per sample along its first axis; its
            length defines the length of the dataset.
        coords: per-sample coordinates, indexed the same way as ``methylation``.
        apply_log10: when True, histone values are returned as
            ``log10(value + 1e-4)`` instead of the raw values.
    """

    def __init__(self, sequence, histone, methylation, coords, apply_log10=False):
        self.sequence = sequence
        self.histone = histone
        self.methylation = methylation
        self.transform = apply_log10
        self.coords = coords
        # Order of the marks along the last axis of `histone`.
        self.histone_names = ['H3K4me3', 'H3K36me2', 'H3K27me3', 'H3K9me3']

    def __len__(self):
        """Return the number of samples (length of the methylation array)."""
        return self.methylation.shape[0]

    def __getitem__(self, idx):
        """Return one sample.

        Returns:
            Tuple ``(sequence, H3K4me3, H3K36me2, H3K27me3, H3K9me3, methylation,
            coordinates)``. The sequence and the four histone tracks are torch
            tensors (histone tracks are float32); methylation and coordinates are
            returned exactly as stored.
        """
        sequence = torch.from_numpy(self.sequence[idx])

        # Select the requested sample *before* casting/transforming. Converting the
        # whole histone array (and taking log10 of a full channel) on every access
        # made each item cost O(dataset size) for identical results.
        marks = np.asarray(self.histone[idx]).astype(np.float32)
        if self.transform:
            # The 1e-4 pseudo-count keeps zero counts finite under log10.
            marks = np.log10(marks + 1e-4)

        H3K4me3, H3K36me2, H3K27me3, H3K9me3 = (
            torch.from_numpy(np.ascontiguousarray(marks[..., channel]))
            for channel in range(len(self.histone_names))
        )

        methylation = self.methylation[idx]
        coordinates = self.coords[idx]

        return sequence, H3K4me3, H3K36me2, H3K27me3, H3K9me3, methylation, coordinates

In [7]:
# Fraction of the samples (taken in file order) that goes to the training split.
TRAIN_FRACTION = 0.8

# Pull every array out of the archive exactly once: the object returned by np.load
# for an .npz file loads the array from disk again on each `data[key]` lookup.
arrays = {key: data[key] for key in ('dna', 'histone', 'methyl', 'coords')}

size = arrays['dna'].shape[0]
split_index = int(TRAIN_FRACTION * size) ### 80% of the data will be for training


def _build_split(rows):
    """Return a MethDataset (log10 applied to histone values) for the samples in `rows`."""
    return MethDataset(sequence = arrays['dna'][rows],
                       histone = arrays['histone'][rows],
                       methylation = arrays['methyl'][rows],
                       coords = arrays['coords'][rows],
                       apply_log10=True)


# I'm applying log10 in both cases
train_dataset = _build_split(slice(None, split_index))
test_dataset = _build_split(slice(split_index, None))

In [12]:
batch_size = 5

# Both loaders share the same batching options.
# NOTE(review): the test loader is shuffled too; shuffle=False is the usual choice
# for evaluation so batches come back in a reproducible order. Confirm intent.
loader_options = dict(batch_size=batch_size, shuffle=True)
train_dataloader = DataLoader(train_dataset, **loader_options)
test_dataloader = DataLoader(test_dataset, **loader_options)

In [186]:
### Model class
## 1- My idea is to be able to control the architecture of the model and the training parameters at the moment the model is created.
## This should make it easier to debug and to try different architectures, and the architecture of the model can be 
## specified through the arguments.
## 2- The `forward` method unsqueezes the input to add the channel dimension that `Conv2d` expects on batched input.
## 3- There is a method called `training_loop`. Please, complete it, after you specify the architecture, add the loss function, and backward
## propagation step
## 4- I think we can add an `eval_loop` method, in which we iterate over the `test_dataloader` and evaluate the accuracy of the model (R^2)
## 5- Try some architectures, and some way to pass arguments to the model, such that we can try different numbers without having problems
## with tensor shapes and things like that. The idea is to be able to test certain combinations of numbers, so we can use Optuna to make
## a bayesian search for "optimal" parameters. Look at papers where people use CNNs for DNA and histone marks, try to have a similar architecture
## and let's start with that
class Model(nn.Module):
    """Convolutional model over the DNA sequence that also carries its training settings.

    Only the DNA branch exists so far: a single ``Conv2d`` wrapped in
    ``nn.Sequential``. The histone and methylation inputs are accepted by
    ``forward`` but not used yet.

    Args:
        DNA_in_channels: ``in_channels`` of the DNA convolution.
        DNA_out_channels: ``out_channels`` of the DNA convolution.
        epochs: number of passes ``training_loop`` makes over the dataloader.
        learning_rate: ``lr`` given to the optimizer.
        optimizer: optimizer *class* (not an instance); ``training_loop``
            instantiates it as ``optimizer(self.parameters(), lr=learning_rate)``.
    """

    def __init__(self, DNA_in_channels, DNA_out_channels, epochs=100, learning_rate=1e-3, optimizer=torch.optim.SGD):
        super().__init__()
        # Module parameters
        self.DNA_in_channels = DNA_in_channels
        self.DNA_out_channels = DNA_out_channels

        # Training parameters
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.optimizer = optimizer

        # Modules and architecture
        # The kernel covers 50 rows x 4 columns of the input and moves 50 rows per step.
        # Assumes each sequence arrives as (length, 4) per sample -- TODO confirm.
        self.dna_module = nn.Sequential(
            nn.Conv2d(in_channels=self.DNA_in_channels, out_channels=self.DNA_out_channels, kernel_size=(50, 4), stride=50)
        )

    # TODO: planned attention + fully-connected head (not wired in yet):
    # self.attn = nn.MultiheadAttention(embed_dim=attn_dim, num_heads=n_heads, batch_first=True)
    # self.fc = nn.Sequential(
    #     nn.Linear(attn_dim, attn_dim // 2),
    #     nn.ReLU(),
    #     nn.Linear(attn_dim // 2, output_dim)
    # )

    def forward(self, sequence, H3K4me3, H3K36me2, H3K27me3, H3K9me3, methylation):
        """Run the DNA convolution on a batch of sequences.

        The histone tracks and ``methylation`` are part of the signature but are
        currently ignored.

        Returns:
            The output of ``self.dna_module`` for the batch.
        """
        # Conv2d expects (batch, channels, height, width): add the channel axis at position 1.
        sequence = sequence.to(torch.float32).unsqueeze(1)
        return self.dna_module(sequence)

    def training_loop(self, loss_fn, train_dataloader):
        """Skeleton of the training procedure.

        Args:
            loss_fn: loss *class* (e.g. ``nn.MSELoss``); it is instantiated here.
            train_dataloader: iterable yielding ``(sequence, H3K4me3, H3K36me2,
                H3K27me3, H3K9me3, methylation, coordinates)`` batches.

        For now each epoch only runs the forward pass on the first batch; the
        loss, backward pass and optimizer step are still to be written.
        """
        optimizer = self.optimizer(self.parameters(), lr=self.learning_rate)
        loss_fn = loss_fn()

        self.train()
        for _ in range(self.epochs):
            for batch in train_dataloader:
                sequence, H3K4me3, H3K36me2, H3K27me3, H3K9me3, methylation, coordinates = batch

                # Call the module itself rather than .forward() so registered hooks run.
                out = self(sequence, H3K4me3, H3K36me2, H3K27me3, H3K9me3, methylation)

                # TODO: once the architecture yields one prediction per sample, add
                #   optimizer.zero_grad(); loss = loss_fn(out, target); loss.backward(); optimizer.step()
                # Stop after the first batch while the loop is still a skeleton.
                break

    def eval_loop(self, *args, **kwargs):
        """Placeholder for evaluation over a test dataloader (e.g. R^2); does nothing yet.

        Accepts any arguments so existing calls keep working; returns None.
        """
        return None

# Single-channel-in, single-channel-out DNA convolution, trained for one epoch.
model = Model(
    DNA_in_channels=1,
    DNA_out_channels=1,
    epochs=1,
)

In [187]:
# Run the training loop on the training batches.
model.training_loop(
    loss_fn=nn.MSELoss,  # passed as a class; training_loop instantiates it
    train_dataloader=train_dataloader,
)

tensor([0.0000, 0.1600, 0.6100, 0.4200, 0.4000], dtype=torch.float64)


In [159]:
# Stand-alone SGD optimizer over the model's parameters.
# NOTE(review): Model.training_loop builds its own optimizer from the model's
# settings, so nothing visible here uses `optim` -- confirm this cell is still needed.
lr = 1e-3
optim = torch.optim.SGD(
    model.parameters(),
    lr=lr,
)