In [None]:
# Optional: uncomment to install the notebook's dependencies into the kernel's
# environment (%pip targets the running kernel, unlike !pip).
# %pip install pandas
# %pip install torch
# %pip install tqdm
# %pip install tensorboard  # needed by torch.utils.tensorboard.SummaryWriter

In [1]:
from dna_dataset import *
from constants import *
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
# import pandas as pd

Unzip the data file (`DATA_ZIP_FILE`) to extract the FASTA files.

In [2]:
!unzip $DATA_ZIP_FILE

Archive:  Files.zip
  inflating: Files/accessible.fasta  
  inflating: Files/notaccessible.fasta  
  inflating: Files/test.fasta        


In [2]:
# Pick the compute device: use the GPU when CUDA is available, otherwise
# fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [3]:
# Build the dataset from the two labelled sequence files.
# NOTE(review): presumably these constants point at the accessible /
# not-accessible FASTA files extracted from the archive — confirm in constants.
train_dataset = DNADataset(
    ACCESSSIBLE_FILE,
    NOT_ACCESSIBLE_FILE,
)

In [4]:
# Sanity check: confirm the DNADataset loaded by reporting the overall and
# per-class sequence counts.
sequence_total = len(train_dataset.sequences)
print('total sequences', sequence_total)
print('num accessible', train_dataset.accessible_count)
print('num not accessible', train_dataset.not_accessible_count)

# Show a single example entry: its label followed by its sequence.
i = 0
print(f"example entry {i}")
item = train_dataset[i]
print("label", item['label'])
print(item['sequence'])

# Shuffle check: if the accessible entries were all still at the front, the
# first `accessible_count` items would every one carry ACCESSIBLE_LABEL.
# Finding any other label in that leading range means the data was shuffled.
# `any` stops at the first mismatch, so at most one line is printed.
leading_indices = range(train_dataset.accessible_count)
if any(train_dataset[idx]['label'] != ACCESSIBLE_LABEL for idx in leading_indices):
    print('shuffled')

total sequences 525688
num accessible 47239
num not accessible 478449
example entry 0
label 0
GAAATAATACCTATTGGGTTGCTTGAACCCGGGTTTTCATTTTATGATGC
TAATTTTATTATACTGCACAGAAGCAGATTCATCTAATTCAGGAGCAACT
TCATGTGTTATATCTGCAGGATGTACATAGTCAGTTGTTACAACATTATC
TCTCTCCTGAGAAACATAGGCAAAGATTCCATGAAAGAAAATTTCTGCAG
shuffled


In [None]:
# parameters
# NOTE(review): EPOCHS / BATCH_SIZE / N_EVAL presumably come from the
# `constants` star import — confirm.
epochs = EPOCHS
batch_size = BATCH_SIZE
n_eval = N_EVAL  # the training loop evaluates/logs every `n_eval` steps

# Binary cross-entropy for the 0/1 label. The class is `nn.BCELoss`
# (`nn.BCE.Loss` does not exist). BCELoss expects probabilities in [0, 1], so
# the model's final layer must be a sigmoid; use nn.BCEWithLogitsLoss instead
# if the model emits raw logits.
loss_fn = nn.BCELoss()

# TODO: insert torch model here, that takes a sequence as input and outputs a
# label 0 or 1, e.g. `model = MyModel().to(device)`.
# It has to be defined before the optimizer below, which needs its parameters;
# until then this cell stops with a NameError on `model`.

# Adam requires the parameters to optimize — it cannot be constructed empty.
# Note the call parentheses: `model.parameters()`, not `model.parameters`.
optimizer = optim.Adam(model.parameters())

In [8]:
# Hold out part of the data for validation. Validating on the very samples
# the model is trained on (as a plain `val_dataset = train_dataset` would)
# makes the validation metrics meaningless as a measure of generalization.
val_fraction = 0.1  # share of the dataset reserved for validation
num_val = int(len(train_dataset) * val_fraction)
num_train = len(train_dataset) - num_val

# A seeded generator keeps the split identical across kernel restarts.
# NOTE(review): the split is random, not stratified by label; with a strongly
# imbalanced dataset a stratified split may be preferable.
train_subset, val_dataset = torch.utils.data.random_split(
    train_dataset,
    [num_train, num_val],
    generator=torch.Generator().manual_seed(0),
)

# Training batches are reshuffled every epoch.
train_loader = torch.utils.data.DataLoader(
    train_subset, batch_size=batch_size, shuffle=True
)

# Evaluation does not need shuffling.
val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False
)

writer = SummaryWriter()  # tensorboard log

# Keep the model on the same device the batches are moved to below.
# nn.Module.to() modifies the module in place; this is a no-op when the model
# is already on `device` (ideally it is moved there before the optimizer is built).
# NOTE(review): `model` is not defined in the visible cells — it must exist
# before this cell runs.
model.to(device)

step = 0  # global training-step counter; used as the TensorBoard x-axis

for epoch in range(epochs):
    print(f"Epoch {epoch + 1} of {epochs}")
    model.train()

    for batch in tqdm(train_loader):  # show the times for each batch
        samples, labels = batch["sequence"], batch['label']

        # Tensor.to() returns a new tensor instead of moving in place, so the
        # results must be rebound; otherwise the batch silently stays put.
        # NOTE(review): assumes batch["sequence"] collates to a tensor — the
        # example entry printed earlier looks like a raw string, which would
        # have to be encoded numerically first. TODO confirm.
        samples = samples.to(device)
        # Labels become a float column vector of shape (batch, 1) for the loss.
        labels = labels.to(device).reshape(-1, 1).float()

        # Clear gradients first so nothing stale (e.g. from an interrupted
        # previous run of this cell) is accumulated into this step.
        optimizer.zero_grad()

        # Forward propagate
        outputs = model(samples)

        # Backpropagation and gradient descent
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        # Periodically evaluate our model + log to Tensorboard
        if step % n_eval == 0:
            # Metrics only: no gradients needed.
            # NOTE(review): compute_accuracy / evaluate are not defined in the
            # visible cells; presumably provided by a star import — confirm.
            with torch.no_grad():
                # Training loss and accuracy of the current batch.
                accuracy = compute_accuracy(outputs, labels)

                # Log against the global `step`, not `epoch`: several
                # evaluations happen per epoch and would otherwise all be
                # written at the same x value.
                writer.add_scalar('Training Loss', loss.item(), step)
                writer.add_scalar('Training Accuracy', accuracy, step)

                # Validation loss and accuracy.
                val_loss, val_accuracy = evaluate(val_loader, model, loss_fn)
                writer.add_scalar('Validation Loss', val_loss, step)
                writer.add_scalar('Validation Accuracy', val_accuracy, step)

            # evaluate() may have put the model in eval mode; make sure
            # training continues in train mode.
            model.train()

        step += 1

    print()

# Make sure all pending events are written to disk; the writer stays usable.
writer.flush()