In [17]:
import torch
from tokenizers.vocab import create_NGram_vocab
from tokenizers.NGram import nGram_tokenize
from models.Transformer import Transformer
from training.preprocess import batch_load_and_preprocess
from data.loader import read_data
from data.generate import SmithWaterman
import timeit

MODEL_PATH = "saved_models/Transformer_4.pt"
MODEL_SIZE = 512
INPUT_DIM = 24
PAD_TOKEN = "0000"

In [44]:
# load model
vocab_4gram = create_NGram_vocab(4)
model = Transformer(
    vocab_size=len(vocab_4gram),
    stack_size=4,
    d_model=MODEL_SIZE,
    d_feed_fwd=2048,
    n_attn_heads=8,
    dropout=0.1
)
model.load_state_dict(torch.load(MODEL_PATH))

<All keys matched successfully>

In [3]:
# load data
data = list(
    read_data(
        path="/data/minhpham/SW-ML-data/SRR622461",
        sample_limit=1000,
        start_part=10,
        part_limit=1
    )
)

loading data  /data/minhpham/SW-ML-data/SRR622461
    loading file  /data/minhpham/SW-ML-data/SRR622461_10


In [48]:
def predict(seq1: str, seq2: str):
    tokens1 = nGram_tokenize(seq1)
    tokens2 = nGram_tokenize(seq2)
    tokens1 = tokens1[:INPUT_DIM]  # trim long seqs
    tokens2 = tokens2[:INPUT_DIM]
    # pad short seqs
    if len(tokens1) < INPUT_DIM:
        tokens1 = tokens1 + (INPUT_DIM - len(tokens1)) * [PAD_TOKEN]
        tokens2 = tokens2 + (INPUT_DIM - len(tokens2)) * [PAD_TOKEN]

    x1 = torch.tensor(vocab_4gram.lookup_indices(tokens1), dtype=torch.int32).unsqueeze(0)
    x2 = torch.tensor(vocab_4gram.lookup_indices(tokens2), dtype=torch.int32).unsqueeze(0)
    mask = (torch.logical_or(x1 != 0, x2 != 0)).unsqueeze(-2)
    y_hat = model(x1, x2, mask)
    return y_hat.item()

In [49]:
for sample_id in range(18):
    print("--------------------------------------------------------------------------------------------------")
    seq1 = data[sample_id][0]
    seq2 = data[sample_id][1]
    print(seq1)
    print(seq2)
    print("predicted score ", predict(seq1, seq2))
    print("real score      ", SmithWaterman(seq1, seq2))
    SW_latency = timeit.timeit("SmithWaterman(seq1, seq2)", globals=globals(), number=1000)
    ML_latency = timeit.timeit("predict(seq1, seq2)", globals=globals(), number=1000)
    print("Smith-Waterman latency  : {0:.2f} ms".format(SW_latency))
    print("Machine Learning latency: {0:.2f} ms".format(ML_latency))

--------------------------------------------------------------------------------------------------
AGGTTCAGAGAGGCTAGGGAACATCCCAAGGACACACAGCACCTAGGAGGCCGAGTCAGTGCAGCTCCTGCACACACCTTACCCTCCGTCCCATTC
AGGTTCAGAGAGGCTAGGGAACATCCCAAGGACACACAGCACCTAGGAGGCCGAGTCAGTGCAGCTCCTGCACACACCTTACCCTCCGTCCCATTC
predicted score  95.99535369873047
real score       96
Smith-Waterman latency  : 0.04 ms
Machine Learning latency: 8.81 ms
--------------------------------------------------------------------------------------------------
AGGTTCAGAGAGGCTAGGGAACATCCCAAGGACACACAGCACCTAGGAGGCCGAGTCAGTGCAGCTCCTGCACACACCTTACCCTCCGTCCCATTC
AGGTTCATAGAGGCAAGGGAACTTCCCAAGGACACACAGCACCTAAGCGGCCGAGTCAGTGCAGCTCCTGCACATACCTAACCCTCCGTCCGATTC
predicted score  82.65046691894531
real score       80
Smith-Waterman latency  : 0.03 ms
Machine Learning latency: 8.92 ms
--------------------------------------------------------------------------------------------------
AGGTTCAGAGAGGCTAGGGAACATCCCAAGGACACACAGCACCTAGGAGGCCGAGTCAGTGCAGCTCCT