In [1]:
# Load the project config, build the Bacillus promoter dataset from its FASTA
# files, and split it into train / validation / test subsets.
import os
import warnings
from pathlib import Path

import yaml

from utils.data_split import dataset_split
from utils.encoding_functions import token_encode
from utils.fasta_dataset import FastaDataset

# The project root is taken to be the parent of the current working directory,
# so this cell must be run with the notebook's own folder as the cwd.
_PATH_TO_ROOT = Path.cwd().parent.absolute()
_DEFAULT_CONFIG_PATH = os.path.join(_PATH_TO_ROOT, 'config', 'config.yaml')

with open(_DEFAULT_CONFIG_PATH, 'r') as f:
    config = yaml.safe_load(f)

fasta_positive = config['data']['bacillus']['promoter_fasta']
# FIXME(review): this reads the SAME config key as `fasta_positive`, so both
# FASTA arguments below point at one file. Presumably the second argument of
# FastaDataset is the negative (non-promoter) set -- confirm, then replace the
# key with the negative-set entry from config.yaml. The key is left unchanged
# here because its real name is not visible from this notebook.
fasta_negative = config['data']['bacillus']['promoter_fasta']
seq_length = config['data']['bacillus']['seq_len']

# Config paths are relative to the project root.
full_pos_path = os.path.join(_PATH_TO_ROOT, fasta_positive)
full_neg_path = os.path.join(_PATH_TO_ROOT, fasta_negative)

# Surface the misconfiguration instead of silently building a dataset in which
# both inputs are the same file.
if full_pos_path == full_neg_path:
    warnings.warn(
        f'Positive and negative FASTA paths are identical ({full_pos_path}); '
        'both dataset inputs come from the same file. '
        'Point `fasta_negative` at the negative-set key in config.yaml.'
    )

data = FastaDataset(full_pos_path, full_neg_path, encoding_func=token_encode, seq_len=seq_length)
train, val, test = dataset_split(data)


In [8]:
# Print the number of samples in the train, val and test splits, one per line.
print(*(f'{len(split)}' for split in (train, val, test)), sep='\n')

522: positive samples: 373
110
114


In [9]:
# Class balance of the training split: look up the label of every index the
# split holds in its underlying dataset, then compare positives to negatives.
# Assumes labels are 0/1 so that their sum is the positive count -- TODO confirm.
train_labels = [train.dataset.labels[sample_idx] for sample_idx in train.indices]
pos_count = sum(train_labels)
neg_count = len(train_labels) - pos_count
if neg_count > 0:
    ratio = pos_count / neg_count
else:
    # No negatives at all: report an infinite ratio rather than dividing by zero.
    ratio = float('inf')
print(f'Train - Positive: {pos_count}, Negative: {neg_count}, Ratio: {ratio:.2f}')

Train - Positive: 261, Negative: 261, Ratio: 1.00


In [10]:
# Class balance of the validation split, computed the same way as for training.
# Assumes labels are 0/1 so that their sum is the positive count -- TODO confirm.
val_labels = [val.dataset.labels[sample_idx] for sample_idx in val.indices]
pos_count = sum(val_labels)
neg_count = len(val_labels) - pos_count
# Default to an infinite ratio; overwritten whenever at least one negative exists.
ratio = float('inf')
if neg_count > 0:
    ratio = pos_count / neg_count
print(f'Val - Positive: {pos_count}, Negative: {neg_count}, Ratio: {ratio:.2f}')

Val - Positive: 55, Negative: 55, Ratio: 1.00
