<a href="https://colab.research.google.com/github/nmach22/Promoter-Classification/blob/main/notebooks/train_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Set Up Environment**

In [12]:
%%capture
from google.colab import drive
drive.mount('/content/drive')

from google.colab import userdata
token = userdata.get('GITHUB_TOKEN')
user_name = userdata.get('GITHUB_USERNAME')
mail = userdata.get('GITHUB_MAIL')

!git config --global user.name "{user_name}"
!git config --global user.email "{mail}"
!git clone https://{token}@github.com/nmach22/Promoter-Classification.git
!pip install -r ./Promoter-Classification/requirements.txt


# **Imports**

In [9]:

# Imports
import sys
import os
import torch
import yaml
import matplotlib.pyplot as plt
import torch
import torchvision.transforms as transforms
from torchvision.utils import make_grid
from torch.utils.data import random_split

# Make the cloned repository importable by putting its root on the Python path.
# The membership check keeps the cell idempotent: re-running it must not pile up
# duplicate entries in sys.path.
ROOT_DIR = '/content/Promoter-Classification'
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)


import importlib
import utils.fasta_dataset as dataset_module
import utils.encoding_functions as encoding_module
import utils.data_split as splitter_module
import models.transformer as transformer_module
import utils.train as train_module
import eval.train_evals as eval_module

# Reload every project module so edits made to the repository during a live
# session are picked up without restarting the kernel. splitter_module is
# included as well; it was previously imported but never reloaded, which left
# dataset_split stale after changes to utils/data_split.py.
for _module in (
    dataset_module,
    encoding_module,
    splitter_module,
    transformer_module,
    train_module,
    eval_module,
):
    importlib.reload(_module)

# Re-bind the names after the reload so they point at the fresh definitions.
from utils.fasta_dataset import FastaDataset
# NOTE(review): star import kept for compatibility with later cells; this
# notebook uses at least `token_encode` from it — prefer explicit names.
from utils.encoding_functions import *
from models.transformer import DNATransformer
from utils.data_split import dataset_split
from utils.train import Train
from eval.train_evals import TrainEvals

# Parse the project's YAML configuration and show its top-level sections.
with open("{}/config/config.yaml".format(ROOT_DIR), mode="r") as config_file:
    config = yaml.safe_load(config_file)

print(config.keys())

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

dict_keys(['data', 'dataset_split', 'train'])
cuda


# **Read Data**

In [7]:
# Settings stored under the 'ecoli' entry of the config's 'data' section.
ecoli_data = config['data']['ecoli']

# FASTA paths in the config are joined onto the repository root.
prom_path = "{}/{}".format(ROOT_DIR, ecoli_data['promoter_fasta'])
non_prom_path = "{}/{}".format(ROOT_DIR, ecoli_data['non_promoter_fasta'])
seq_length = ecoli_data['seq_len']

# Build the dataset from both FASTA files, encoding sequences with token_encode.
dataset = FastaDataset(
    prom_path,
    non_prom_path,
    seq_len=seq_length,
    encoding_func=token_encode,
)

# **Split Data**

In [8]:
# Partition the full dataset into train / validation / test subsets using the
# project's dataset_split helper.
# NOTE(review): the config exposes a 'dataset_split' section that is not passed
# here — confirm dataset_split reads it itself or that its defaults are intended.
train_dataset, val_dataset, test_dataset = dataset_split(dataset)

# **Create Data Loaders**

In [10]:
# Batch size comes from the 'train' section of the config.
batch_size = config['train']['batch_size']

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
val_loader = torch.utils.data.DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    shuffle=False,
)
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# **Train Model**

In [11]:
# Hyper-parameters for this run, named so they are easy to spot and tune.
LEARNING_RATE = 1e-4
NUM_EPOCHS = 10

# Build the model on the selected device together with its optimizer and loss.
model = DNATransformer(seq_length).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
# NOTE(review): BCELoss expects probabilities in [0, 1]; presumably
# DNATransformer ends in a sigmoid — confirm against models/transformer.py.
criterion = torch.nn.BCELoss()

# Metrics reported by the training loop after each epoch.
evaluators = [TrainEvals()]

train = Train(model, train_loader, val_loader, optimizer, criterion, evaluators, device)
train.train(NUM_EPOCHS)

Epoch 1/10 | Train Loss: 0.3617 | Val Loss: 0.3192
  Train accuracy: 0.8515
  Val accuracy: 0.8626
  Train sensitivity: 0.5026
  Val sensitivity: 0.8480
  Train specificity: 0.9490
  Val specificity: 0.8667
  Train correlation_coef: 0.5232
  Val correlation_coef: 0.6505
Epoch 2/10 | Train Loss: 0.2667 | Val Loss: 0.2164
  Train accuracy: 0.8820
  Val accuracy: 0.9217
  Train sensitivity: 0.7053
  Val sensitivity: 0.8240
  Train specificity: 0.9314
  Val specificity: 0.9489
  Train correlation_coef: 0.6486
  Val correlation_coef: 0.7707
Epoch 3/10 | Train Loss: 0.2051 | Val Loss: 0.1909
  Train accuracy: 0.9181
  Val accuracy: 0.9287
  Train sensitivity: 0.7973
  Val sensitivity: 0.8000
  Train specificity: 0.9519
  Val specificity: 0.9644
  Train correlation_coef: 0.7577
  Val correlation_coef: 0.7857
Epoch 4/10 | Train Loss: 0.1795 | Val Loss: 0.2131
  Train accuracy: 0.9285
  Val accuracy: 0.9148
  Train sensitivity: 0.8126
  Val sensitivity: 0.6560
  Train specificity: 0.9610
  Val 