In [None]:
import os
import random
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import yaml
from torch import optim
from torch.utils.data import DataLoader

import data_utils
from data_utils import load_and_preprocess_weighted_frequency_data, load_and_preprocess_data, load_and_preprocess_weighted_frequency_data_no_r1
from dataset import DNASequenceDataSet
from model import DNATransformerEncoder
from training_utils import train_model, validate_model, test_model

%load_ext autoreload
%autoreload 2

In [None]:
# Read the run configuration into `cfg`. Every later cell depends on it, so a
# malformed file is raised as an error instead of being printed and ignored
# (which would leave `cfg` undefined, or silently stale on a re-run).
with open('config.yaml', 'r') as stream:
    try:
        cfg = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        raise RuntimeError(f"config.yaml could not be parsed: {exc}") from exc

In [None]:
# Load the weighted-frequency data configured by `cfg`; the histogram cell
# further down reads its 'Normalized_Frequency' column.
df = load_and_preprocess_weighted_frequency_data(cfg)

In [None]:
# Display the object returned by load_and_preprocess_weighted_frequency_data.
df

In [None]:
# Distribution of the 'Normalized_Frequency' column in 100 bins.
# The trailing semicolon keeps the notebook from echoing the returned axes array.
df.hist(column='Normalized_Frequency', bins=100);

In [None]:
# Re-binds `df` to the enrichment data. The following cells treat it as a keyed
# collection (df.keys(), df[(file_a, file_b)]), not as the frame loaded earlier.
df = data_utils.load_and_preprocess_enrichment_data(cfg)

In [None]:
# Number of entries stored under each key of the enrichment data.
enrichment_length = dict((pair, len(df[pair])) for pair in df.keys())

In [None]:
# Show the per-key entry counts computed in the previous cell.
enrichment_length

In [None]:
# Inspect the entry stored under the ('HanS_R2.txt', 'HanS_R8.txt') key.
df[('HanS_R2.txt', 'HanS_R8.txt')]

In [None]:
# Run data_utils.fix_mislabed_nucleotides with the loaded config and keep its return value.
# NOTE(review): `result` is re-bound to a list by the inference cell further down.
result = data_utils.fix_mislabed_nucleotides(cfg)

In [None]:
# Rebuild the model and optimizer the same way as for training, then restore
# both from the saved checkpoint.
device = torch.device("cuda:1")
cfg.update({
    'device': device,
})

model = DNATransformerEncoder(cfg).to(device)
optimizer = optim.Adam(model.parameters(), lr=cfg['learning_rate'])

# map_location remaps the stored tensors onto `device`, so the checkpoint loads
# even when it was written from a different GPU than the one used here.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source.
checkpoint = torch.load("model_checkpoint.pt", map_location=device)

# Restore the model and optimizer states
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# Switch to evaluation mode for inference (affects layers such as dropout and
# batch norm, if the model has any).
model.eval()


In [None]:
import random

def generate_random_dna_sequence(length=40):
    """Return a string of `length` bases, each picked with random.choice from A, C, G, T."""
    alphabet = 'ACGT'
    bases = [random.choice(alphabet) for _ in range(length)]
    return ''.join(bases)

# Build a data set of random DNA sequences (default length of
# generate_random_dna_sequence) to score with the model.
# Seed the generator first so a Restart & Run All produces the same sequences.
RANDOM_SEED = 0
random.seed(RANDOM_SEED)

n_sequences = 1_000_000

# Each row is (sequence, 1): the constant 1 only fills the
# 'Normalized_Frequency' column of the frame handed to DNASequenceDataSet.
random_dna_sequence_data_set = [(generate_random_dna_sequence(), 1) for _ in range(n_sequences)]
df = pd.DataFrame(random_dna_sequence_data_set, columns=['Sequence', 'Normalized_Frequency'])
dna_dataset = DNASequenceDataSet(df)
test_loader = DataLoader(dna_dataset, batch_size=2048, num_workers=10)

In [None]:
# Run the restored model over every batch of random sequences and collect the
# per-batch outputs; gradient tracking is disabled for the whole pass.
result = []
with torch.no_grad():
    for sequences, _unused_targets, lengths in test_loader:
        # Only the sequence tensor is moved to the model's device; the second
        # element of each batch is not used here.
        batch_output = model(sequences.to(cfg['device']), lengths)
        result.append(batch_output)

In [None]:
# Join the per-batch outputs, swap the first two dims, and sort along the last
# dim. torch.sort is ascending by default, so the largest values come last.
# NOTE(review): assumes the model output is 2-D (n_sequences, n_outputs) - confirm.
all_outputs = torch.cat(result)
best_random_sequence = torch.sort(all_outputs.movedim(1, 0))

In [None]:
# Show the (values, indices) pair returned by torch.sort in the previous cell.
best_random_sequence

In [None]:
# `result` is a list of per-batch tensors. Python's sorted() would compare whole
# tensors with `<`, whose truth value is ambiguous for more than one element and
# raises. Concatenate the batches and sort along the sample dimension instead,
# largest first.
torch.sort(torch.cat(result), dim=0, descending=True).values

In [None]:
def get_min_max_normalized_frequency(combined_counter):
    """Min-max normalize the counts of a Counter and rank the keys.

    Args:
        combined_counter: mapping of key -> raw count (e.g. a collections.Counter).

    Returns:
        A list of (key, normalized_count) tuples sorted by normalized count,
        largest first. Normalized counts lie in [0, 1]. An empty mapping gives
        an empty list. If every count is identical the range is zero, and all
        keys are assigned 0.0 instead of dividing by zero.
    """
    # np.min / np.max raise on an empty array, so handle "nothing to rank" first.
    if not combined_counter:
        return []

    raw_frequencies = np.array(list(combined_counter.values()))
    min_val = np.min(raw_frequencies)
    span = np.max(raw_frequencies) - min_val

    if span == 0:
        # All counts are equal: (value - min) / 0 would produce NaN.
        normalized_frequencies = {key: 0.0 for key in combined_counter}
    else:
        normalized_frequencies = {
            key: (value - min_val) / span
            for key, value in combined_counter.items()
        }

    # sorted() is stable, so ties keep the mapping's original order.
    return sorted(normalized_frequencies.items(), key=lambda item: item[1], reverse=True)

def load_and_preprocess_data(dfs):
    """Count the items of `dfs` and return them ranked by min-max normalized frequency.

    NOTE(review): this definition re-binds the name `load_and_preprocess_data`
    that the imports cell pulls in from data_utils; after this cell runs, the
    bare name refers to this function.

    Args:
        dfs: an iterable (or mapping) accepted by collections.Counter.

    Returns:
        A DataFrame with columns 'Sequence' and 'Normalized_Frequency', ordered
        as returned by get_min_max_normalized_frequency.
    """
    ranked_items = get_min_max_normalized_frequency(Counter(dfs))
    return pd.DataFrame(ranked_items, columns=['Sequence', 'Normalized_Frequency'])

# Count how often each sequence occurs and rank by normalized frequency.
# Pass the 'Sequence' column explicitly: iterating a DataFrame yields its column
# labels, so Counter(df) would count the column names rather than the sequences.
df = load_and_preprocess_data(df['Sequence'])
df

In [None]:
# A function defined earlier in this notebook re-bound the bare name
# `load_and_preprocess_data` to a Counter-based helper that expects sequences,
# so the config-driven loader is called through its module.
data_utils.load_and_preprocess_data(cfg)