In [None]:
# !pip install pandas
# !pip install torch
# !pip install tqdm  # pip install ipywidgets or something
!pip install wandb
!pip install gensim
!pip install torcheval

In [None]:
USING_WANDB = True  # Set to false if not Peter

if USING_WANDB:
    import wandb
    wandb.login()
    # !wandb login

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
data_folder = "/content/drive/MyDrive/dna_ml_model_data"
accessible_file = f"{data_folder}/accessible.fasta"  # constants.ACCESSIBLE_FILE
reduced_nonaccessible_file = f"{data_folder}/reduced_nonaccessible.fasta"  # constants.NONACCESSIBLE_FILE
test_file = f"{data_folder}/test.fasta"  # constants.TEST_FILE
rest_nonaccessible_file = f"{data_folder}/reduced_nonaccessible.fasta"

In [57]:
import dna_dataset, constants, utils, CNNModel, LSTMCNNModel
import torch.nn as nn, torch.optim as optim, torch
from tqdm.notebook import tqdm
import importlib, os, numpy as np
from datetime import datetime
from torcheval.metrics.functional import multiclass_f1_score

importlib.reload(dna_dataset)
importlib.reload(constants)
importlib.reload(utils)
importlib.reload(CNNModel)
importlib.reload(LSTMCNNModel)

<module 'LSTMCNNModel' from '/content/LSTMCNNModel.py'>

In [7]:
# !mkdir Files
# !mv reduced_nonaccessible.fasta Files
# !mv accessible.fasta Files
# !mv test.fasta Files

Unzip the datafile

In [None]:
!unzip $constants.DATA_ZIP_FILE

In [8]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [11]:
full_dataset = dna_dataset.DNADataset(accessible_file, reduced_nonaccessible_file)

In [12]:
# ensure the DNADataset is loaded properly
print('total sequences', len(full_dataset.sequences))
print('num accessible', full_dataset.accessible_count)
print('num not accessible', full_dataset.not_accessible_count)
i = 0
print(f"example entry {i}")
item = full_dataset[i]
print("label", item['labels'])
# print(item['sequences'])

# ensure dataset was shuffled properly
# check that not all the accessible labels are at the front
for i in range(full_dataset.accessible_count):
    if full_dataset[i]['labels'] != constants.ACCESSIBLE_LABEL:
        print('shuffled')
        break

total sequences 94478
num accessible 47239
num not accessible 47239
example entry 0
label 0
shuffled


In [13]:
# Split dataset
full_size = len(full_dataset)
temp_size = round(constants.VALIDATION_SPLIT * 2 * full_size)  # * 2 for both validation and test split
train_size = full_size - temp_size

train_dataset, temp_dataset = torch.utils.data.random_split(full_dataset, [train_size, temp_size])

val_size = temp_size // 2
test_size = temp_size - val_size
assert(val_size + test_size == temp_size)

val_dataset, test_dataset = torch.utils.data.random_split(temp_dataset, [val_size, test_size])

print(len(train_dataset), len(val_dataset), len(test_dataset))
assert(len(train_dataset) + len(val_dataset) + len(test_dataset) == full_size)

66135 14171 14172


In [58]:
importlib.reload(CNNModel)
importlib.reload(LSTMCNNModel)
torch.manual_seed(0)

kernel_size = 3     # 2  # should use odd size
embed_dim=4  # 4
num_filters1 = 64  # 128
num_filters2=128  # 64
pool_kernel_size=2  # 2
hidden_dense1=64  # 128
hidden_dense2=32  # 64
dropout_rate_Dense=0.5  # .5

# insert torch model here, that takes sequence as input and output a label 0 or 1
model = CNNModel.CNNModel(kernel_size,
                           embed_dim,
                           num_filters1,
                           num_filters2,
                           pool_kernel_size,
                           hidden_dense1,
                           hidden_dense2,
                           dropout_rate_Dense
                           )

# model = LSTMCNNModel.LSTMCNNModel(
#                             kernel_size=2,
#                             embed_dim=4,
#                             num_filters1=128,
#                             num_filters2=64,
#                             pool_kernel_size=2,
#                             hidden_dense1=128,
#                             hidden_dense2=64,
#                             dropout_rate_Dense=0.5,
#                             lstm_units=1
# )

model.to(device)

CNNModel(
  (Conv1): Conv1d(4, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (Conv2): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear1): Linear(in_features=6400, out_features=64, bias=True)
  (linear2): Linear(in_features=64, out_features=32, bias=True)
  (linear3): Linear(in_features=32, out_features=1, bias=True)
  (batch_norm): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout_Dense): Dropout(p=0.5, inplace=False)
  (sigmoid): Sigmoid()
)

In [59]:
# parameters
epochs = 20
batch_size = 64
learning_rate = .0001
# n_eval = constants.N_EVAL
loss_fn = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)  # weight_decay=1

In [60]:
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, # shuffle=True
)
val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=batch_size, # shuffle=True
)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, # shuffle=True
)

In [None]:
# TRAINING LOOP
# USING_WANDB = False
if USING_WANDB:

    note = f"{kernel_size}-kernel-{num_filters1}-{num_filters2}-conv-{hidden_dense1}-{hidden_dense2}-dense-" # "lstm_no_dropout" # "0.5_dropout"
    wandb.init(
        project="dna_ml_model",
        # name=f"experiment_{run}"
        name=f"{note}",
        settings=wandb.Settings(start_method="fork"),
        config = {
            "architecture": model.__class__.__name__,
            "epochs": epochs,
            "learning_rate": learning_rate,
            "batch_size": batch_size,
            "notes": note
    })


# step = 0
best_acc = 0.0

for epoch in range(epochs):
    print(f"Epoch {epoch + 1} of {epochs}")

    running_loss = 0.0
    running_corrects = 0.0

    model.train()
    for batch in tqdm(train_loader):  # show the times for each batch
        # Forward propagate
        samples, labels = batch["sequences"].to(device), batch['labels'].to(device)

        outputs = model(samples)
        labels = labels.reshape(-1,1).float()

        # Backpropagation and gradient descent
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()  # reset gradients before next iteration

        running_loss += loss.item() * samples.size(0)  # loss per sample times batch size

        preds = torch.round(outputs)
        running_corrects += torch.sum(preds == labels.data).item()


    # if step % n_eval == 0:
    # Compute training loss and accuracy.
    with torch.no_grad():
        model.eval()
        # Compute validation loss and accuracy.
        # accuracy = utils.compute_accuracy(outputs, labels)  # only does current batch
        val_loss, val_acc, f1_score = utils.evaluate(val_loader, model, loss_fn, device)

        epoch_loss = running_loss / len(train_dataset)
        epoch_acc = running_corrects / len(train_dataset)
        print(f"Training Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}")
        print(f"Validation Loss: {val_loss:.4f} Acc: {val_acc:.4f}")
        print(f"F1 Score: {f1_score:.4f}")

        # # deep copy the model
        # if val_accuracy > best_acc:
        #     best_acc = epoch_acc
        #     torch.save(model.state_dict(), best_model_params_path)

        if USING_WANDB:
            wandb.log({"Train Loss": epoch_loss,
                        "Train Acc": epoch_acc,
                        "Val Loss": val_loss,
                        "Val Acc": val_acc,
                        "Epoch": epoch
            })
        # step += 1

    print()

In [62]:
# INFERENCE

total_predictions = 0
total_correct = 0

model.eval()

for batch in tqdm(test_loader):
    test_samples, test_labels = batch['sequences'].to(device), batch['labels'].to(device)
    test_outputs = model(test_samples)
    # print(torch.round(val_outputs))
    test_labels = test_labels.reshape(-1, 1).float()

    test_loss = loss_fn(test_outputs, test_labels).item()  # change tensor to single val

    total_correct += (torch.round(test_outputs) == test_labels).sum().item()

    total_predictions += len(test_outputs)

model.train()
test_accuracy = total_correct / total_predictions
print(f"Final Test Accuracy: {test_accuracy * 100} %")

wandb.summary['test_accuracy'] = test_accuracy


  0%|          | 0/222 [00:00<?, ?it/s]

Final Test Accuracy: 77.20152413209145 %


In [28]:
# test on rest of nonaccessible
rest_nonacc_dataset = dna_dataset.TestDataset(rest_nonaccessible_file)
rest_nonacc_loader = torch.utils.data.DataLoader(
    rest_nonacc_dataset, batch_size=batch_size, # shuffle=True
)

In [None]:
probs_out = []  # tuples of probability, id

for batch in tqdm(rest_nonacc_loader):
    samples, _ = batch["sequences"].to(device), batch['ids'] # not a tensor
    outputs = model(samples)
    out_list = outputs.tolist()

    for i in range(len(out_list)):
        probs_out.append(out_list[i])

probs_out = np.array(probs_out)
falses = len(probs_out[probs_out<0.5])
print(falses, "non_accessible values out of ", len(probs_out), " total")
rest_correct = falses / len(probs_out)

print(rest_correct, " correct")

wandb.summary['rest_nonacc_accuracy'] = rest_correct

wandb.finish()

Save Model

In [None]:
# Create pretrained directory if not yet created
if not os.path.isdir(constants.PRETRAINED_DIR):
    os.mkdir(constants.PRETRAINED_DIR)

now = datetime.now()
datetime_str = now.strftime("%m-%d-%H-%M-%S")
model_save_path = os.path.join(
    constants.PRETRAINED_DIR,
    f'{note}{datetime_str}-{model.__class__.__name__}-model-{learning_rate}lr-{epochs}epochs.pt'
)
print('model_save_path', model_save_path)
CNNModel.save_CNNModel(model_save_path, model)
print(f"model saved at {datetime_str}")

Load Model

In [None]:
model_save_path = "/content/pretrained/12-10-05-30-17-CNNModel-model-10-epochs.pt"
model = CNNModel.load_CNNModel(model_save_path)
model.to(device)

Inference on Test File

In [16]:
competition_dataset = dna_dataset.TestDataset(test_file) # TODO
competition_loader = torch.utils.data.DataLoader(
    competition_dataset, batch_size=batch_size, # shuffle=True
)
print("Competition dataset loaded.")

Competition dataset loaded.


In [18]:
probs = []  # tuples of probability, id

for batch in tqdm(competition_loader):

    samples, ids = batch["sequences"].to(device), batch['ids'] # not a tensor

    outputs = model(samples)

    out_list = outputs.tolist()

    for i in range(len(out_list)):
        probs.append((out_list[i], ids[i]))

print("Finished inference")

  0%|          | 0/4209 [00:00<?, ?it/s]

Finished inference


In [19]:
import numpy as np
np_probs = np.array(list(zip(*probs))[0])
print(len(np_probs[np_probs>0.5]), "true values out of ", len(np_probs), " total")
not_zero = np_probs[np_probs<=0.5]
not_zero.sort()
not_one = np_probs[np_probs>0.5]
not_one[::-1].sort()
print(not_zero)
print(not_one)

# print(np_probs[np_probs>0.0 and np_probs<1.0])

47831 true values out of  269315  total
[7.48928386e-08 7.78024400e-08 9.36609581e-08 ... 5.00000000e-01
 5.00000000e-01 5.00000000e-01]
[0.99984241 0.99980801 0.9997322  ... 0.5000056  0.50000548 0.50000215]


In [None]:
probs.sort(reverse=True)

highest_probs = probs[:10000]  # top 10,000

with open(constants.SOLUTION_FILE, "w") as f:
    for pair in highest_probs:
        f.write(pair[1])
        # f.write("a")
        f.write("\n")
print("first 10\n", list(zip(*probs[:10]))[0])
print("last 10\n", list(zip(*probs[9990:10000]))[0])  # probs only

In [None]:
zip_file_name = "predictions.zip"
!zip $zip_file_name $constants.SOLUTION_FILE


  adding: predictions.csv (deflated 65%)


In [None]:
# ONLY for use on google colab. download files
from google.colab import files
import os
dir = 'pretrained'
model_file = os.path.join(dir, os.listdir(dir)[0])
files.download(model_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
files.download(zip_file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>