<a href="https://colab.research.google.com/github/nmach22/Promoter-Classification/blob/main/notebooks/train_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Set Up Environment**

In [1]:
%%capture
from google.colab import drive
drive.mount('/content/drive')

from google.colab import userdata
token = userdata.get('GITHUB_TOKEN')
user_name = userdata.get('GITHUB_USERNAME')
mail = userdata.get('GITHUB_MAIL')

!git config --global user.name "{user_name}"
!git config --global user.email "{mail}"
!git clone https://{token}@github.com/nmach22/Promoter-Classification.git
!pip install -r ./Promoter-Classification/requirements.txt

In [2]:
import sys
import torch

# Location of the cloned repository on the Colab VM.
ROOT_DIR = '/content/Promoter-Classification'
# Make the project's packages importable. The guard keeps sys.path free of
# duplicate entries when this cell is executed more than once.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

# Seed torch's global RNG so torch-driven randomness (e.g. DataLoader
# shuffling) is repeatable after Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

# Prefer the GPU when the runtime provides one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [31]:

import os
import yaml

from pathlib import Path

from utils.data_split import dataset_split
from utils.encoding_functions import KmerEncoding
from utils.fasta_dataset import FastaDataset

# Resolve the config relative to ROOT_DIR -- the same root that is put on
# sys.path and used for the data paths -- rather than the current working
# directory, so this cell keeps working if the working directory changes.
_PATH_TO_ROOT = ROOT_DIR
_DEFAULT_CONFIG_PATH = os.path.join(_PATH_TO_ROOT, 'config', 'config.yaml')

# yaml.safe_load parses the file without constructing arbitrary Python objects.
with open(_DEFAULT_CONFIG_PATH, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

In [32]:
# E. coli section of the loaded config: FASTA file locations and sequence length.
ecoli_data = config['data']['ecoli']

# Both FASTA paths are built the same way: "<ROOT_DIR>/<path taken from the config>".
prom_path, non_prom_path = (
    f"{ROOT_DIR}/{ecoli_data[fasta_key]}"
    for fasta_key in ('promoter_fasta', 'non_promoter_fasta')
)
seq_length = ecoli_data['seq_len']

In [33]:
# Encoder instance configured with k=3.
k3_encoder = KmerEncoding(k=3)

# One dataset built from the promoter and non-promoter FASTA files.
dataset = FastaDataset(
    prom_path,
    non_prom_path,
    seq_len=seq_length,
    encoding_func=k3_encoder,
)

# dataset_split is called with its default settings and yields three subsets.
train_ds, val_ds, test_ds = dataset_split(dataset)

In [34]:
# Sanity check: length of the first element of the first test sample.
first_test_item = test_ds[0]
len(first_test_item[0])

81

### 1. Set Up Data

In [35]:
# Batch size comes from the 'train' section of the config.
batch_size = config['train']['batch_size']

# Only the training split is shuffled; validation and test keep their order.
train_loader = torch.utils.data.DataLoader(
    train_ds,
    batch_size=batch_size,
    shuffle=True,
)
val_loader = torch.utils.data.DataLoader(
    val_ds,
    batch_size=batch_size,
    shuffle=False,
)
test_loader = torch.utils.data.DataLoader(
    test_ds,
    batch_size=batch_size,
    shuffle=False,
)

### 2. Initialize Model, Loss, and Optimizer

In [41]:
from torch import nn
from models.rnn import PromoterRNN

# Architecture hyperparameters gathered in one mapping so they are easy to tune.
# NOTE(review): vocab_size=65 presumably equals 4**3 possible 3-mers plus one
# extra token -- confirm against KmerEncoding.
rnn_hparams = dict(
    vocab_size=65,
    embed_dim=16,
    hidden_dim=32,
    num_layers=4,
    dropout=0.2,
    bidirectional=False,
    fc_hidden_dims=[64, 32, 16],
)
# NOTE(review): `device` is computed earlier in the notebook but the model is
# not moved to it here -- check whether Train handles device placement.
model = PromoterRNN(**rnn_hparams)

# BCELoss expects probabilities in [0, 1]; this assumes the model's output has
# already been passed through a sigmoid -- TODO confirm in models/rnn.py.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


### 3. Training Loop

In [None]:
from utils.train import Train

# Bundle the model, both loaders, the optimizer and the loss into the project's trainer.
train = Train(
    model,
    train_loader,
    val_loader,
    optimizer,
    criterion,
)
# NOTE(review): both arguments are positional; 100 is presumably the number of
# epochs and True some on/off option -- confirm against utils.train.Train.train.
train.train(100, True)