In [None]:
import pandas as pd

from dna_classification.models import DNASequenceClassifier
from dna_classification.tokenization import DNATokenizer

from tqdm import tqdm

In [None]:
# Load the tab-separated dataset into a two-column frame. The first line of
# the file is skipped and the column names are supplied explicitly.
data = pd.read_csv(
    "data/virus.txt",
    sep="\t",
    skiprows=1,
    header=None,
    names=["sequence", "label"],
)

In [4]:
# Build the tokenizer vocabulary from the same file the data frame is read from.
# NOTE(review): k=15 is presumably the k-mer length; confirm against DNATokenizer.
tokenizer = DNATokenizer()
tokenizer.build_vocab("data/virus.txt", k=15)

# Size the classifier from the data: the vocabulary comes from the tokenizer,
# and there is one output class per distinct label in the table.
model = DNASequenceClassifier(
    # Data-derived sizes.
    vocab_size=tokenizer.vocab_size,
    num_classes=data["label"].nunique(),
    # Architecture hyperparameters.
    embedding_dim=192,
    hidden_dim=64,
    num_layers=2,
    dropout=0.05,
)

# Attach the tokenizer to the model; later cells pass raw sequence strings
# straight to `model.predict`.
model.add_tokenizer(tokenizer)

In [None]:
# Train on the full labelled table and keep the two values returned by
# `train_model` (named here as training and validation loss).
# NOTE(review): the device is hard-coded to "cuda"; this cell presumably fails
# on a machine without a GPU. Confirm whether a CPU fallback is wanted.
train_loss, val_loss = model.train_model(
    data=data,
    epochs=100,
    batch_size=256,
    device="cuda",
    optimizer_params={"lr": 0.005},
)

In [None]:
# Estimate classification accuracy on a random subset of the labelled data.
# NOTE(review): `data` is the same frame that was passed to `train_model`, so
# this subset may include rows the model was trained on. Confirm whether a
# held-out split should be used instead.
model.cpu()
model.eval()

# Cap the sample size at the number of available rows so that datasets with
# fewer than 1000 rows do not make `DataFrame.sample` raise.
data_sample = data.sample(min(1000, len(data)))

correct = 0
total = 0

# Iterate over the sampled rows themselves. Indexing `data.iloc[i]` here would
# score the first rows of the full frame and ignore the random sample.
for sequence, label in tqdm(
    zip(data_sample["sequence"], data_sample["label"]),
    total=len(data_sample),
):
    pred = model.predict(sequence)
    if pred == label:
        correct += 1
    total += 1

# Guard against an empty frame so the cell reports NaN instead of raising
# ZeroDivisionError.
accuracy = correct / total if total else float("nan")
print(f"Accuracy: {accuracy}")