In [1]:
# !pip install pandas
# !pip install torch
# !pip install tqdm  # pip install ipywidgets or something
!pip install wandb

Installing collected packages: smmap, setproctitle, sentry-sdk, docker-pycreds, gitdb, GitPython, wandb
Successfully installed GitPython-3.1.40 docker-pycreds-0.4.0 gitdb-4.0.11 sentry-sdk-1.38.0 setproctitle-1.3.3 smmap-5.0.1 wandb-0.16.1


In [2]:
USING_WANDB = True  # Set to false if not Peter

if USING_WANDB:
    import wandb
    wandb.login()
    # !wandb login

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [8]:
# !mkdir Files
# !mv test.fasta Files
# !mv reduced_nonaccessible.fasta Files
# !mv accessible.fasta Files

In [3]:
import dna_dataset, constants, utils, ConvModel
import torch.nn as nn, torch.optim as optim, torch
from tqdm.notebook import tqdm
import importlib, os
from datetime import datetime

importlib.reload(dna_dataset)
importlib.reload(constants)
importlib.reload(utils)
importlib.reload(ConvModel)

<module 'ConvModel' from '/content/ConvModel.py'>

Unzip the datafile

In [None]:
!unzip $constants.DATA_ZIP_FILE

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [9]:
full_dataset = dna_dataset.DNADataset(constants.ACCESSIBLE_FILE, constants.NOT_ACCESSIBLE_FILE)

In [10]:
# ensure the DNADataset is loaded properly
print('total sequences', len(full_dataset.sequences))
print('num accessible', full_dataset.accessible_count)
print('num not accessible', full_dataset.not_accessible_count)
i = 0
print(f"example entry {i}")
item = full_dataset[i]
print("label", item['labels'])
# print(item['sequences'])

# ensure dataset was shuffled properly
# check that not all the accessible labels are at the front
for i in range(full_dataset.accessible_count):
    if full_dataset[i]['labels'] != constants.ACCESSIBLE_LABEL:
        print('shuffled')
        break

total sequences 94478
num accessible 47239
num not accessible 47239
example entry 0
label 0
shuffled


In [11]:
# Split dataset
full_size = len(full_dataset)
val_size = round(constants.VALIDATION_SPLIT * full_size)
train_size = full_size - val_size

train_dataset, val_dataset = torch.utils.data.random_split(full_dataset, [train_size, val_size])

print(len(train_dataset), len(val_dataset))
assert(len(train_dataset) + len(val_dataset) == full_size)

80306 14172


In [6]:
# insert torch model here, that takes sequence as input and output a label 0 or 1
model = ConvModel.CNNModel(kernel_size=2, embed_dim=4)
model.to(device)

CNNModel(
  (Conv1): Conv1d(4, 128, kernel_size=(2,), stride=(1,))
  (pool1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (Conv2): Conv1d(128, 64, kernel_size=(2,), stride=(1,))
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear1): Linear(in_features=3136, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=64, bias=True)
  (linear3): Linear(in_features=64, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (sigmoid): Sigmoid()
)

In [27]:
# parameters
epochs = constants.EPOCHS
batch_size = constants.BATCH_SIZE
learning_rate = constants.LEARNING_RATE
n_eval = constants.N_EVAL
loss_fn = nn.BCELoss()
optimizer = optim.Adam(model.parameters())

In [28]:
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True
)

val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=batch_size, shuffle=True
)

In [None]:
# TRAINING LOOP
if USING_WANDB:

    note = "" #"0.5-dropout"
    wandb.init(
        project="dna_ml_model",
        # name=f"experiment_{run}"
        name=f"{note}colab_run2_balanced_data",
        config = {
            "architecture": model.__class__.__name__,
            "epochs": epochs,
            "learning_rate": learning_rate,
            "notes": note
    })


step = 0

for epoch in range(epochs):
    print(f"Epoch {epoch + 1} of {epochs}")

    for batch in tqdm(train_loader):  # show the times for each batch
        # Forward propagate
        samples, labels = batch["sequences"].to(device), batch['labels'].to(device)

        outputs = model(samples)

        labels = labels.reshape(-1,1).float()
        # Backpropagation and gradient descent
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()  # reset gradients before next iteration


        # Periodically evaluate our model + log to Tensorboard
        if step % n_eval == 0:
            # Compute training loss and accuracy.
            # Log the results to Tensorboard.
            with torch.no_grad():
                # Compute validation loss and accuracy.
                accuracy = utils.compute_accuracy(outputs, labels)
                val_loss, val_accuracy = utils.evaluate(val_loader, model, loss_fn, device)

                wandb.log({"Train Loss": loss,
                           "Train Acc": accuracy,
                           "Val Loss": val_loss,
                           "Val Acc": val_accuracy,
                           "Epoch": epoch
                })

        step += 1

    print()

Save Model

In [31]:
# Create pretrained directory if not yet created
if not os.path.isdir(constants.PRETRAINED_DIR):
    os.mkdir(constants.PRETRAINED_DIR)

now = datetime.now()
datetime_str = now.strftime("%m-%d-%H-%M-%S")
model_save_path = os.path.join(
    constants.PRETRAINED_DIR,
    f'{datetime_str}-{model.__class__.__name__}-model-{epochs}-epochs.pt'
)
print('model_save_path', model_save_path)
ConvModel.save_CNNModel(model_save_path, model)
print(f"model saved at {datetime_str}")

model_save_path pretrained/12-10-05-59-29-CNNModel-model-100-epochs.pt
model saved at 12-10-05-59-29


Load Model

In [37]:
model_save_path = "/content/pretrained/12-10-05-30-17-CNNModel-model-10-epochs.pt"
model = ConvModel.load_CNNModel(model_save_path)
model.to(device)

CNNModel(
  (Conv1): Conv1d(4, 128, kernel_size=(2,), stride=(1,))
  (pool1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (Conv2): Conv1d(128, 64, kernel_size=(2,), stride=(1,))
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear1): Linear(in_features=3136, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=64, bias=True)
  (linear3): Linear(in_features=64, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (sigmoid): Sigmoid()
)

Inference

In [38]:
total_predictions = 0
total_correct = 0

for batch in tqdm(val_loader):
    val_samples, val_labels = batch['sequences'].to(device), batch['labels'].to(device)
    val_outputs = model(val_samples)
    # print(torch.round(val_outputs))
    val_labels = val_labels.reshape(-1, 1).float()

    val_loss = loss_fn(val_outputs, val_labels).item()  # change tensor to single val

    total_correct += (torch.round(val_outputs) == val_labels).sum().item()

    total_predictions += len(val_outputs)

val_accuracy = total_correct / total_predictions
print(f"Final Validation Accuracy: {val_accuracy * 100} %")

# wandb.summary['test_accuracy'] = val_accuracy

# wandb.finish()

  0%|          | 0/142 [00:00<?, ?it/s]

Final Validation Accuracy: 76.4817950889077 %


Inference on Test File

In [16]:
test_dataset = dna_dataset.TestDataset(constants.TEST_FILE) # TODO
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=True
)
print("Test dataset loaded.")

Test dataset loaded.


In [17]:
probs = []  # tuples of probability, id

for batch in tqdm(test_loader):

    samples, ids = batch["sequences"].to(device), batch['ids'] # not a tensor

    outputs = model(samples)

    out_list = outputs.tolist()

    for i in range(len(out_list)):
        probs.append((out_list[i], ids[i]))

print("Finished inference")

  0%|          | 0/2694 [00:00<?, ?it/s]

Finished inference


In [23]:
import numpy as np
np_probs = np.array(list(zip(*probs))[0])
print(len(np_probs[np_probs>0.5]), "true values out of ", len(np_probs), " total")  # assumes exactly 1
not_zero = np_probs[np_probs<=0.5]
not_zero.sort()
not_one = np_probs[np_probs>0.5]
not_one[::-1].sort()
print(not_zero)
print(not_one)

# print(np_probs[np_probs>0.0 and np_probs<1.0])

65517 true values out of  269315  total
[1.03285450e-10 1.81566789e-10 2.03635844e-10 ... 4.99967217e-01
 4.99980271e-01 4.99983966e-01]
[0.99994242 0.99988949 0.99988389 ... 0.50001734 0.50001717 0.5000127 ]


In [19]:
probs.sort(reverse=True)

highest_probs = probs[:10000]  # top 10,000

with open(constants.SOLUTION_FILE, "w") as f:
    for pair in highest_probs:
        f.write(pair[1])
        # f.write("a")
        f.write("\n")

print(list(zip(*probs[:10000]))[0])  # probs only

([0.999942421913147], [0.9998894929885864], [0.9998838901519775], [0.9998441934585571], [0.9998428821563721], [0.9998108744621277], [0.9998043179512024], [0.9997965693473816], [0.9997956156730652], [0.9997811913490295], [0.9997619986534119], [0.9997389912605286], [0.999729573726654], [0.9997232556343079], [0.9997197985649109], [0.9996976852416992], [0.9996857643127441], [0.999677300453186], [0.9996527433395386], [0.999632716178894], [0.9996241331100464], [0.9996077418327332], [0.9996057152748108], [0.9995895028114319], [0.9995889067649841], [0.9995754361152649], [0.9995701909065247], [0.99955815076828], [0.9995423555374146], [0.999541163444519], [0.9995146989822388], [0.9995119571685791], [0.9994933605194092], [0.9994811415672302], [0.9994713664054871], [0.999459445476532], [0.9994528889656067], [0.999406099319458], [0.9993971586227417], [0.9993896484375], [0.999386191368103], [0.9993799924850464], [0.9993793964385986], [0.999362051486969], [0.9993610978126526], [0.9993601441383362], [

In [24]:
# zip_file_name = "predictions.zip"
# !zip $zip_file_name $constants.SOLUTION_FILE


  adding: predictions.csv (deflated 65%)


In [25]:
# ONLY for use on google colab. download files
from google.colab import files
import os
dir = 'pretrained'
model_file = os.path.join(dir, os.listdir(dir)[0])
files.download(model_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
files.download(constants.SOLUTION_FILE)