In [1]:
import pandas as pd
from src.deepromoter import DeePromoter
from src.utils import load_dataset, protein2num, get_list_kmer


from torch.utils.data import DataLoader, TensorDataset, random_split
import torch
import math
import argparse
import torch.optim as optim
from torch import nn
from icecream import ic
from pathlib import Path


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [2]:
data = load_dataset("train_data.tsv")

In [3]:
MAPPER = {
    "A": 0,
    "C": 1,
    "G": 2,
    "T": 3,
}


def one_hot_seq(sequences: pd.Series) -> torch.Tensor:
    total = []

    for seq in sequences:
        x = torch.zeros(size=(4, len(seq)))
        for i, aa in enumerate(seq):
            x[MAPPER[aa], i] = 1
        total.append(x)
    return torch.stack(total, dim=0)


X = one_hot_seq(data["sequence"])

In [4]:
y = data["is_active"]
y = torch.tensor(y, dtype=torch.float)


In [6]:

# combine X and y into a TensorDataset
dataset = TensorDataset(X, y)

# 80% train, 20% test)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# dataloader
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

## train classifier

In [7]:
# init model
ker = [27, 14, 7]
net = DeePromoter(ker, 
                  input_shape=(32, 271, 4))
net.to(device)

# define parameters

epoch_num = 1000
criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(net.parameters(), lr=0.001)

exp_folder = Path("exp")
running_loss = 0

early_stopping_patience = 20

best_acc_val = 0
best_epoch = 0

pbar = range(epoch_num)
print("Start training")

for epoch in pbar:
    running_loss = 0
    for i, (X, y) in enumerate(train_loader):
        net.train()
        # get the inputs
        inputs, labels = X.to(device), y.to(device)
        inputs = inputs.permute(0, 2, 1)
        # zero the parameter gradients
        optimizer.zero_grad()

        # pass model to
        outputs = net(inputs)
        loss = criterion(outputs, labels.long())
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # print statistics
    running_loss /= len(train_loader)
    print(f"Epoch {epoch+1}, loss: {running_loss:.3f}")
    if epoch % 5 == 0:
        net.eval()
        torch.save(net.state_dict(), str(exp_folder.joinpath("epoch_" + str(epoch) + ".pth")))
        correct = 0
        total = 0
        with torch.no_grad():
            for test_inputs, test_labels in test_loader:
                test_inputs, test_labels = test_inputs.to(device), test_labels.to(device)
                test_inputs = test_inputs.permute(0, 2, 1)
                test_outputs = net(test_inputs)
                _, predicted = torch.max(test_outputs, 1)
                total += test_labels.size(0)
                correct += (predicted == test_labels.long()).sum().item()

        accuracy = 100 * correct / total
        
        if accuracy > best_acc_val:
            best_acc_val = accuracy
            best_epoch = epoch
            print(f"Best epoch: {best_epoch}, accuracy: {best_acc_val}%")
            torch.save(net.state_dict(), str(exp_folder.joinpath("best_model.pth")))
        else:
            print(f"Epoch {epoch}, accuracy: {accuracy}%")
        if epoch - best_epoch > early_stopping_patience:
            print("Early stopping")
            break
        print(f"Accuracy on test set: {accuracy}%")
        #        precision, recall, MCC = mcc(eval_data)


  return F.conv1d(


Start training
Epoch 1, loss: 0.660
Best epoch: 0, accuracy: 64.98143045405534%
Accuracy on test set: 64.98143045405534%
Epoch 2, loss: 0.633
Epoch 3, loss: 0.618
Epoch 4, loss: 0.611
Epoch 5, loss: 0.608
Epoch 6, loss: 0.605
Best epoch: 5, accuracy: 67.17383491074638%
Accuracy on test set: 67.17383491074638%
Epoch 7, loss: 0.600
Epoch 8, loss: 0.601
Epoch 9, loss: 0.596
Epoch 10, loss: 0.596
Epoch 11, loss: 0.595
Best epoch: 10, accuracy: 71.76230981190847%
Accuracy on test set: 71.76230981190847%
Epoch 12, loss: 0.592
Epoch 13, loss: 0.591
Epoch 14, loss: 0.589
Epoch 15, loss: 0.588
Epoch 16, loss: 0.586
Epoch 15, accuracy: 69.80951239966456%
Accuracy on test set: 69.80951239966456%
Epoch 17, loss: 0.586
Epoch 18, loss: 0.582
Epoch 19, loss: 0.580
Epoch 20, loss: 0.580
Epoch 21, loss: 0.580
Epoch 20, accuracy: 71.53468311968372%
Accuracy on test set: 71.53468311968372%
Epoch 22, loss: 0.577
Epoch 23, loss: 0.575
Epoch 24, loss: 0.571
Epoch 25, loss: 0.571
Epoch 26, loss: 0.567
Epoch 

## train regressor

In [4]:
encoded_seq = one_hot_seq(data["sequence"])

In [5]:
y_reg = data["rna_dna_ratio"]
y_reg = torch.tensor(y_reg, dtype=torch.float)
y_reg

tensor([1.1798, 1.4064, 1.0213,  ..., 0.7452, 0.5936, 1.0647])

In [7]:
# combine X and y into a TensorDataset
dataset_reg = TensorDataset(encoded_seq, y_reg)

# 80% train, 20% test)
train_size = int(0.8 * len(dataset_reg))
test_size = len(dataset_reg) - train_size
train_dataset_reg, test_dataset_reg = random_split(dataset_reg, [train_size, test_size])

# dataloader
train_loader_reg = DataLoader(train_dataset_reg, batch_size=32, shuffle=True)
test_loader_reg = DataLoader(test_dataset_reg, batch_size=32, shuffle=False)

In [11]:
# init model
ker = [27, 14, 7]
net = DeePromoter(ker, 
                  input_shape=(32, 271, 4),
                  is_regressor=True)
net.to(device)

# define parameters

epoch_num = 1000
criterion = nn.MSELoss()

optimizer = optim.Adam(net.parameters(), lr=0.001)

exp_folder = Path("exp")
running_loss = 0

early_stopping_patience = 20

best_mse = torch.inf
best_epoch = 0

pbar = range(epoch_num)
print("Start training")

for epoch in pbar:
    running_loss = 0
    for i, (X, y) in enumerate(train_loader_reg):
        net.train()
        # get the inputs
        inputs, labels = X.to(device), y.to(device)
        inputs = inputs.permute(0, 2, 1)
        # zero the parameter gradients
        optimizer.zero_grad()

        # pass model to
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # print statistics
    running_loss /= len(train_loader_reg)
    print(f"Epoch {epoch+1}, loss: {running_loss:.3f}")
    if epoch % 5 == 0:
        net.eval()
        torch.save(net.state_dict(), str(exp_folder.joinpath("reg_epoch_" + str(epoch) + ".pth")))
        mse_total = 0
        with torch.no_grad():
            for test_inputs, test_labels in test_loader_reg:
                test_inputs, test_labels = test_inputs.to(device), test_labels.to(device)
                test_inputs = test_inputs.permute(0, 2, 1)
                test_outputs = net(test_inputs)
                mse = criterion(test_outputs.squeeze(), test_labels)
                mse_total += mse
        
        mse_total /= len(test_loader_reg)
        if mse_total < best_mse:
            best_mse = mse
            best_epoch = epoch
            print(f"Best epoch: {best_epoch}, MSE: {best_mse.item()}")
            torch.save(net.state_dict(), str(exp_folder.joinpath("best_model.pth")))
        else:
            print(f"Epoch {epoch}, MSE: {mse.item()}")
    
        if epoch - best_epoch > early_stopping_patience:
            print("Early stopping")
            break
        print(f"MSE on test set: {mse.item()}")


Start training


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1, loss: 0.140
Best epoch: 0, MSE: 0.17585620284080505
MSE on test set: 0.17585620284080505
Epoch 2, loss: 0.138
Epoch 3, loss: 0.137
Epoch 4, loss: 0.136
Epoch 5, loss: 0.136
Epoch 6, loss: 0.136
Best epoch: 5, MSE: 0.17797088623046875
MSE on test set: 0.17797088623046875
Epoch 7, loss: 0.136
Epoch 8, loss: 0.136
Epoch 9, loss: 0.135
Epoch 10, loss: 0.136
Epoch 11, loss: 0.135
Best epoch: 10, MSE: 0.18263593316078186
MSE on test set: 0.18263593316078186
Epoch 12, loss: 0.135
Epoch 13, loss: 0.135
Epoch 14, loss: 0.135
Epoch 15, loss: 0.135
Epoch 16, loss: 0.135
Best epoch: 15, MSE: 0.17589929699897766
MSE on test set: 0.17589929699897766
Epoch 17, loss: 0.135
Epoch 18, loss: 0.135
Epoch 19, loss: 0.135
Epoch 20, loss: 0.135
Epoch 21, loss: 0.135
Best epoch: 20, MSE: 0.18960417807102203
MSE on test set: 0.18960417807102203
Epoch 22, loss: 0.135
Epoch 23, loss: 0.135
Epoch 24, loss: 0.135
Epoch 25, loss: 0.135
Epoch 26, loss: 0.135
Best epoch: 25, MSE: 0.18687570095062256
MSE on t

KeyboardInterrupt: 