<a href="https://colab.research.google.com/github/nmach22/Promoter-Classification/blob/main/notebooks/train_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Set Up Environment**

In [None]:
%%capture
# Colab-only bootstrap; %%capture suppresses everything this cell prints.
# Mount Google Drive into the Colab VM.
from google.colab import drive
drive.mount('/content/drive')

# Read the GitHub credentials from Colab's secret store instead of hardcoding them.
from google.colab import userdata
token = userdata.get('GITHUB_TOKEN')
user_name = userdata.get('GITHUB_USERNAME')
mail = userdata.get('GITHUB_MAIL')

# Set the git identity, clone the project into the current working directory,
# and install its requirements.
# NOTE(review): the token is interpolated into the clone URL, so git records it
# as the remote URL of the clone — avoid sharing the VM or the checkout.
# NOTE(review): requirements are installed with `!pip`, not `%pip` — confirm
# this targets the kernel's environment on the runtime in use.
!git config --global user.name "{user_name}"
!git config --global user.email "{mail}"
!git clone https://{token}@github.com/nmach22/Promoter-Classification.git
!pip install -r ./Promoter-Classification/requirements.txt

In [None]:
import sys
import torch

# Location of the cloned repository.
# NOTE(review): assumes the clone from the setup cell landed in /content
# (Colab's default working directory) — confirm if the cwd is changed.
ROOT_DIR = '/content/Promoter-Classification'

# Make the repo's packages importable. Guarded so that re-running this cell
# does not keep appending duplicate entries to sys.path.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

# Seed torch's global RNG so that torch-driven randomness (weight init,
# DataLoader shuffling) is repeatable across Restart & Run All.
# Python's `random` and NumPy are NOT seeded here.
SEED = 42
torch.manual_seed(SEED)

# Prefer the GPU when the runtime has one; fall back to CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [None]:
import os
import yaml

from pathlib import Path

from utils.data_split import dataset_split
from utils.encoding_functions import KmerEncoding
from utils.fasta_dataset import FastaDataset

# The repo checkout is looked up under the current working directory.
# NOTE(review): an earlier cell hardcodes ROOT_DIR = '/content/Promoter-Classification';
# the two locations agree only when the cwd is /content.
_PATH_TO_ROOT = os.path.join(Path.cwd().absolute(), 'Promoter-Classification')
_DEFAULT_CONFIG_PATH = os.path.join(_PATH_TO_ROOT, 'config', 'config.yaml')

# Load the project configuration (safe_load: plain YAML data only).
with open(_DEFAULT_CONFIG_PATH, 'r') as f:
    config = yaml.safe_load(f)

# Dataset settings for the 'bacillus' entry of the config.
fasta_positive = config['data']['bacillus']['promoter_fasta']
# NOTE(review): this reads the SAME key as fasta_positive, so the positive and
# negative FASTA paths are identical. Looks like a copy-paste bug — the negative
# set presumably has its own key in config.yaml; confirm and fix before trusting
# any training result.
fasta_negative = config['data']['bacillus']['promoter_fasta']
seq_length = config['data']['bacillus']['seq_len']

# Config paths are joined onto the repo root.
full_pos_path = os.path.join(_PATH_TO_ROOT, fasta_positive)
full_neg_path = os.path.join(_PATH_TO_ROOT, fasta_negative)

# Encode sequences with k=3 k-mers, build the dataset, then split it three ways.
# NOTE(review): split proportions and any shuffling are decided inside
# dataset_split — not visible from this notebook.
k3_encoder = KmerEncoding(k=3)
data = FastaDataset(full_pos_path, full_neg_path, encoding_func=k3_encoder, seq_len=seq_length)
train_ds, val_ds, test_ds = dataset_split(data)

In [None]:
# Sanity check: length of the first element of the first test sample
# (presumably the encoded sequence — confirm against FastaDataset.__getitem__).
len(test_ds[0][0])

81

### 1. Set Up Data

In [None]:
# Batch size comes from the `train` section of the loaded config.
batch_size = config['train']['batch_size']

# One DataLoader per split, built in train/val/test order. Only the training
# split is shuffled; validation and test keep their dataset order.
train_loader, val_loader, test_loader = (
    torch.utils.data.DataLoader(split, batch_size=batch_size, shuffle=reshuffle)
    for split, reshuffle in ((train_ds, True), (val_ds, False), (test_ds, False))
)

### 2. Initialize Model, Loss, and Optimizer

In [None]:
from torch import nn
from models.rnn import PromoterRNN

# Project RNN model with hard-coded hyperparameters.
# NOTE(review): vocab_size=65 is presumably 4**3 possible 3-mers plus one extra
# id (padding/unknown) — confirm against KmerEncoding.
# NOTE(review): the model is not moved to `device` in this cell — presumably
# the Train helper does that; verify.
model = PromoterRNN(
    vocab_size=65,
    embed_dim=16,
    hidden_dim=32,
)

# BCEWithLogitsLoss applies the sigmoid itself, so it expects raw logits.
criterion = nn.BCEWithLogitsLoss()

# Adam over all model parameters with a fixed learning rate of 1e-3.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)


### 3. Training Loop

In [None]:
from eval.train_evals import TrainEvals
from utils.train import Train

# Hand the model, both loaders, the optimizer, the loss, a single TrainEvals
# evaluator and the target device to the project's Train helper.
train = Train(
    model,
    train_loader,
    val_loader,
    optimizer,
    criterion,
    [TrainEvals()],
    device,
)

# NOTE(review): 10 is presumably the number of epochs — confirm against Train.train.
train.train(10)

In [None]:
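# Disabled manual training loop (everything below is commented out);
# training is run through `Train` in the previous cell.
# epochs = 100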
# for epoch in range(epochs):
#     model.train()
#     epoch_loss = 0
#
#     for batch_x, batch_y in train_loader:
#         # Clear previous gradients
#         optimizer.zero_grad()
#
#         # Forward pass
#         predictions = model(batch_x).squeeze() # remove extra dims
#
#         # Calculate loss (labels must be float for this criterion)
#         loss = criterion(predictions, batch_y.float())
#
#         # Backward pass
#         loss.backward()
#
#         # Update weights
#         optimizer.step()
#
#         epoch_loss += loss.item()
#
#     avg_loss = epoch_loss / len(train_loader)
#     print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}")