### RawCTCNet Benchmark/Eval with trained model: CTCLoss of approx. 0.6 (Best: 0.5548)

In [1]:
# switch to toplevel dir:
%cd ~/Desktop/pytorch_models/wavenet-speech/
!pwd
%load_ext autoreload
%autoreload 2

/home/ptang/Desktop/pytorch_models/wavenet-speech
/home/ptang/Desktop/pytorch_models/wavenet-speech


In [2]:
# imports:
import torch
from torch.autograd import Variable
import numpy as np
from warpctc_pytorch import CTCLoss

In [3]:
# import gaussian model, RawCTCNet, sequential decoder:
from utils.gaussian_kmer_model import RawGaussianModelLoader
from modules.raw_ctcnet import RawCTCNet
from modules.sequence_decoders import argmax_decode, labels2strings, BeamSearchDecoder
from ctcdecode import CTCBeamDecoder

### Construct data generator from gaussian model using the same parameters as we did during training:

In [4]:
# create artificial data model:
max_iterations = 1000000 # 1 million examples
num_epochs = 100
epoch_size = 10000
kmer_model_path = "utils/r9.4_450bps.5mer.template.npz"
batch_size = 6
upsample_rate = 6
min_sample_len = 80
max_sample_len = 90
dataset = RawGaussianModelLoader(max_iterations, num_epochs, epoch_size, kmer_model_path, batch_size=batch_size,
                                 upsampling=upsample_rate, random_upsample=True, lengths=(min_sample_len,max_sample_len))

In [5]:
# inspect dataset:
signals, bases, lengths = dataset.fetch()
signals[:,0:7] # ~ (batch x seq)

Variable containing:
 105.5662  113.7298  106.1436  105.4222  104.3051  110.2205  102.7817
  77.0084   79.0295   83.7585   80.3274   98.6562  105.6906  100.3318
  92.3886   90.3167   87.6046   88.1289   89.5569   88.2718   94.2597
 108.2933  106.1615  102.0731  107.0794  110.6007  107.0333  108.7700
  88.2340   86.7097   89.8145   86.4188   89.2195   95.8863   99.0132
 103.8159  101.0736  100.9319   98.8079  104.0342   84.1145   81.0289
[torch.FloatTensor of size 6x7]

### Construct model with same parameters as during training and load saved models:

In [6]:
# build model:
nfeats = 2048
nhid = 512
feature_kwidth = 3
num_labels = 5
num_dilation_blocks = 10
dilations = [1, 2, 4, 8, 16] * num_dilation_blocks
layers = [(nhid, nhid, 2, d) for d in dilations] + [(nhid, nhid, 3, d) for d in dilations]
out_dim = 512
is_causal = False
ctcnet = RawCTCNet(nfeats, feature_kwidth, num_labels, layers, out_dim, input_kernel_size=2, input_dilation=1,
                   softmax=False, causal=is_causal)
batch_norm = torch.nn.BatchNorm1d(1)

In [7]:
# load saved model parameters:
ctcnet_save_path = "./runs/gaussian-model/raw_ctc_net.model.adamax_lr2e_4.pth"
batchnorm_save_path = "./runs/gaussian-model/raw_ctc_net.batch_norm.adamax_lr2e_4.pth"
map_cpu = lambda storage, loc: storage # necessary to move weights from CUDA to CPU
ctcnet.load_state_dict(torch.load(ctcnet_save_path, map_location=map_cpu))
batch_norm.load_state_dict(torch.load(batchnorm_save_path, map_location=map_cpu))

In [8]:
# CTCLoss:
ctc_loss_fn = CTCLoss()

### Helper function to fetch & evaluate model on data:

In [9]:
def eval_model():
    # use volatile variables for better execution speed/memory usage:
    signals, sequences, lengths = dataset.fetch()
    signals_var = Variable(signals.data, volatile=True)
    sequences_var = Variable(sequences.data, volatile=True)
    lengths_var = Variable(lengths.data, volatile=True)
    # run networks:
    probas = ctcnet(batch_norm(signals_var.unsqueeze(1)))
    transcriptions = probas.permute(2,0,1) # need seq x batch x dim
    transcription_lengths = Variable(torch.IntTensor([transcriptions.size(0)] * batch_size))
    ctc_loss = ctc_loss_fn(transcriptions, sequences_var, transcription_lengths, lengths_var)
    avg_ctc_loss = (ctc_loss / transcriptions.size(0))
    return (transcriptions, ctc_loss, avg_ctc_loss, sequences.data, lengths.data)

In [10]:
def split_target_seqs(seqs, lengths):
    """Split a flattened array of target sequences into their constituents."""
    split_seqs = []
    labels_parsed = 0
    for ll in lengths:
        split_seqs.append( seqs[labels_parsed:labels_parsed+ll] )
        labels_parsed += ll
    return split_seqs

### evaluate results against true sequences with argmax and beam search (run these commands in sequence a few times):

In [11]:
scores, loss, avg_loss, true_seqs, true_seq_lengths = eval_model()
print("CTC Loss on whole sequence: {}".format(loss.data[0]))
print("CTC Loss, averaged per-logit: {}".format(avg_loss.data[0]))

CTC Loss on whole sequence: 309.6486511230469
CTC Loss, averaged per-logit: 0.6942794919013977


In [12]:
# print true sequences:
true_base_sequences = split_target_seqs(true_seqs, true_seq_lengths)
for k in range(len(true_base_sequences)):
    print(labels2strings(true_base_sequences[k].unsqueeze(0))[0])

GCATAAAAATAATCGACTAGAGTTCGGGTAACACGGGCTAACCAAGGTAAGCGCGGGCGGCGTTCCGCCGGATAGGTTCTCAATAATC
TTCTCCGGTAAGTCATTGGCTTGCTGGACACTTCGATTGCACCTGTAGTAGGGAGCGTATTGTGTGGGATGATGTTCGTGTAGTACTGC
AGATGCAGAGAAGATTACACTTATAATTATGTTTCTCACGTATCCAGGCGGAGTGACCGAACTCAAGGGTTAGGGTTAATGA
TGAGAGTGCACTTTATTCTCGCTAGTTGTGAATACGAAACTTATACGGCTCGTAATTGTCTCCAATGCCTGAAAGAGCGGTTCTCA
GAAGTCTAGGCACACTGGACGTTTAGTTAGGTGCGATTGCCCAAGGTTGGGGAGGACCTTGCCGCCACGCTGTTAGGAACGAAAAAT
CCCCGAATCTTGGTCCCTAGAGAGCTAACATGCCGTCGCTAACCACCGTTATGGGAATATCACGCTTCCAATAACGCCATCTTA


In [13]:
# normalize probabilities with a softmax operation:
temperature = 1.0 # should set this between 0->infty
logits = scores / temperature
for k in range(len(logits)):
    logits[k,:,:] = torch.nn.functional.softmax(logits[k,:,:])



In [14]:
# argmax decoding: expects (batch, seq, dim) and returns (batch, seq)
argmax_decoded = argmax_decode(logits.permute(1,0,2).contiguous().data)
argmax_basecalls = labels2strings(argmax_decoded)
for k in range(len(argmax_decoded)):
    print(argmax_basecalls[k])

CTTTTAAAAAATAAATATCCTGGGCGTCGTTAACCCGTGCTAACCCAGGTTAAAGGCGGCGGGGCGGGCGGTTCCCGCCCTGGATAGCTGCTCCATGTAT
CGCCGGGGTACGGTTTCCTTTTTTGGGCTTTTGGCTGGGGGACAACTTTTGGATTTTGGACCTGTTCGGTTAGGGAAGGCGCTTTGTTGTTGGGGGGGGATTGGCTGTACGTGGTTAGTTCCC
CCGGTCCCGGTTAAGATTTACCACTTTATACTTATATTATCCCGTTCTCCAGAGGTTGGGCCCGCCCCTCCAGGGCTCGGATCCC
CTGTGGCCCTTTTTCTCTCGGCTCGTTTGTGGAAATACGAAAACTTTATACGGGCTCGTTTAATTTTTAGTCTCCCATGGCATGGACAGCCGGCGCGGCC
TATTCTCGGGCAAACTGGGGGCACGCGGGGTTACGGATTTGGCCCCAGGCGTTGGGGGGAGGCCCTTTGGCCCGCCAAGATGTTTCGAAAGGACCC
CCGTCCATCTTTGAGCCCTACGGGAGAGGGCTAAAATGGCCCGTTCCGGCTACCCCAAACAGTATGGGGGGAATCTCCCGGCTTTCCCATAACGCCATCC


In [17]:
# CTC beam search: expects (batch, seq, dim)
alphabet = [' ', 'A', 'G', 'C', 'T'] # the ordering should match the logit labelling
beam_search_decoder = CTCBeamDecoder(alphabet, beam_width=7, blank_id=0)

In [26]:
def convert_to_string(tokens, vocab, seq_len):
    return ''.join([vocab[x] for x in tokens[0:seq_len]])

In [18]:
# beamres ~ ()
# beamscores ~ ()
# beamtimes ~ ()
# beam_seq_len ~ ()
beamres, beamscores, beamtimes, beam_seq_len = beam_search_decoder.decode(logits.permute(1,0,2).data)

In [27]:
# ground truth: GCATAAAAATAATCGACTAGAGTTCGGGTAACACGGGCTAACCAAGGTAAGCGCGGGCGGCGTTCCGCCGGATAGGTTCTCAATAATC
convert_to_string(beamres[0][0], alphabet, beam_seq_len[0][0])

'CTGTAAAATAATATCCTGCGTCGGTAACCCGTGCTAACCCAGGTAAAGCGCGGGCGGCGTTCCGCCTGATAGCTGCTCCATGTG'

```
#=======================================
#
# Aligned_sequences: 2
# 1: EMBOSS_001
# 2: EMBOSS_001
# Matrix: EDNAFULL
# Gap_penalty: 10.0
# Extend_penalty: 0.5
#
# Length: 89
# Identity:      65/89 (73.0%)
# Similarity:    65/89 (73.0%)
# Gaps:           6/89 ( 6.7%)
# Score: 222.5
# 
#
#=======================================

EMBOSS_001         1 GCATAAAAATAATCGACTAGAGTTCGGGTAACACGGGCTAACCAAGGT-A     49
                      |...||||||||...|| |.||  .||||||.||.|||||||.|||| |
EMBOSS_001         1 -CTGTAAAATAATATCCT-GCGT--CGGTAACCCGTGCTAACCCAGGTAA     46

EMBOSS_001        50 AGCGCGGGCGGCGTTCCGCCGGATAGGTTCTCAATAATC     88
                     ||||||||||||||||||||.|||||.|.|||.||... 
EMBOSS_001        47 AGCGCGGGCGGCGTTCCGCCTGATAGCTGCTCCATGTG-     84
```

In [28]:
# ground truth: TTCTCCGGTAAGTCATTGGCTTGCTGGACACTTCGATTGCACCTGTAGTAGGGAGCGTATTGTGTGGGATGATGTTCGTGTAGTACTGC
convert_to_string(beamres[1][0], alphabet, beam_seq_len[1][0])

'CTGACGGTACGTCCTTGGCTTGCTGGACAACTTGGATTGAACCTGTCGTAGGAGCGCTTGTGTGGATGCTGTACGTGTAGTCCC'

```
#=======================================
#
# Aligned_sequences: 2
# 1: EMBOSS_001
# 2: EMBOSS_001
# Matrix: EDNAFULL
# Gap_penalty: 10.0
# Extend_penalty: 0.5
#
# Length: 91
# Identity:      71/91 (78.0%)
# Similarity:    71/91 (78.0%)
# Gaps:           9/91 ( 9.9%)
# Score: 261.0
# 
#
#=======================================

EMBOSS_001         1 TTCT-CCGGTAAGTCATTGGCTTGCTGGAC-ACTTCGATTGCACCTGTAG     48
                       || .|||||.|||.|||||||||||||| ||||.|||||.||||||.|
EMBOSS_001         1 --CTGACGGTACGTCCTTGGCTTGCTGGACAACTTGGATTGAACCTGTCG     48

EMBOSS_001        49 TAGGGAGCGTATTGTGTGGGATGATGTTCGTGTAGTACTGC     89
                     || |||||| .|||||| |||||.|||.||||||||.|.  
EMBOSS_001        49 TA-GGAGCG-CTTGTGT-GGATGCTGTACGTGTAGTCCC--     84
```

In [29]:
# ground truth: AGATGCAGAGAAGATTACACTTATAATTATGTTTCTCACGTATCCAGGCGGAGTGACCGAACTCAAGGGTTAGGGTTAATGA
convert_to_string(beamres[2][0], alphabet, beam_seq_len[2][0])

'TCGTCCGTAAGATTACACTTATACTTATAGTTATCCCGTCTCCAGAGGTGCCCGCCCTCCAGGCTCGGATCCC'

```
#=======================================
#
# Aligned_sequences: 2
# 1: EMBOSS_001
# 2: EMBOSS_001
# Matrix: EDNAFULL
# Gap_penalty: 10.0
# Extend_penalty: 0.5
#
# Length: 95
# Identity:      47/95 (49.5%)
# Similarity:    47/95 (49.5%)
# Gaps:          35/95 (36.8%)
# Score: 134.5
# 
#
#=======================================

EMBOSS_001         1 AGATGCAGAG--------AAGATTACACTTATAATTATGTTTCTCACGTA     42
                                       |||||||||||||||.||||..||.||.|||.
EMBOSS_001         1 ----------TCGTCCGTAAGATTACACTTATACTTATAGTTATCCCGTC     40

EMBOSS_001        43 TCCAGGCGGA-GTGACCGAACTCAAGGGTTAGGGTTAATGA----     82
                     ||||    || |||.|||..|||.|||.|        ..||    
EMBOSS_001        41 TCCA----GAGGTGCCCGCCCTCCAGGCT--------CGGATCCC     73
```

In [30]:
# ground truth: TGAGAGTGCACTTTATTCTCGCTAGTTGTGAATACGAAACTTATACGGCTCGTAATTGTCTCCAATGCCTGAAAGAGCGGTTCTCA
convert_to_string(beamres[3][0], alphabet, beam_seq_len[3][0])

'CTAGTTGCCCTTTCTCTCGCTCGTGTGAATACGAAACTTATACGGCTCGTAATTAGTCTCCCATGCATGACAGCGCGCGGCC'

```
#=======================================
#
# Aligned_sequences: 2
# 1: EMBOSS_001
# 2: EMBOSS_001
# Matrix: EDNAFULL
# Gap_penalty: 10.0
# Extend_penalty: 0.5
#
# Length: 90
# Identity:      69/90 (76.7%)
# Similarity:    69/90 (76.7%)
# Gaps:          12/90 (13.3%)
# Score: 248.0
# 
#
#=======================================

EMBOSS_001         1 -TGAGAG-TGCACTTTATTCTCGCTAGTTGTGAATACGAAACTTATACGG     48
                      |   || |||.|||| .|||||||.| ||||||||||||||||||||||
EMBOSS_001         1 CT---AGTTGCCCTTT-CTCTCGCTCG-TGTGAATACGAAACTTATACGG     45

EMBOSS_001        49 CTCGTAATT-GTCTCCAATGCCTGAAAGAGCG-GTTCTCA     86
                     ||||||||| ||||||.||||.|||.||.||| |..|   
EMBOSS_001        46 CTCGTAATTAGTCTCCCATGCATGACAGCGCGCGGCC---     82
```

In [31]:
# ground truth: GAAGTCTAGGCACACTGGACGTTTAGTTAGGTGCGATTGCCCAAGGTTGGGGAGGACCTTGCCGCCACGCTGTTAGGAACGAAAAAT
convert_to_string(beamres[4][0], alphabet, beam_seq_len[4][0])

'TCTTCTCGGCAAACTGGCACGTCGTGGTACGATTGCCCCAGCGTGGGAGGCCCTTGCCGCCAAGATGTTTCGAAAGGACCC'

```
#=======================================
#
# Aligned_sequences: 2
# 1: EMBOSS_001
# 2: EMBOSS_001
# Matrix: EDNAFULL
# Gap_penalty: 10.0
# Extend_penalty: 0.5
#
# Length: 97
# Identity:      61/97 (62.9%)
# Similarity:    61/97 (62.9%)
# Gaps:          26/97 (26.8%)
# Score: 190.5
# 
#
#=======================================

EMBOSS_001         1 GAAG---TCTAGGCACACTGG-ACGTTTAGTTAGGTGCGATTGCCCAAG-     45
                            |||.||||.||||| |||  |.||  |||.|||||||||.|| 
EMBOSS_001         1 ----TCTTCTCGGCAAACTGGCACG--TCGT--GGTACGATTGCCCCAGC     42

EMBOSS_001        46 GTTGGGGAGGACCTTGCCGCCACGCTGTT-----AGGAACGAAAAAT     87
                     ||  ||||||.|||||||||||.|.||||     ||||.|.      
EMBOSS_001        43 GT--GGGAGGCCCTTGCCGCCAAGATGTTTCGAAAGGACCC------     81
```

In [32]:
# ground truth: CCCCGAATCTTGGTCCCTAGAGAGCTAACATGCCGTCGCTAACCACCGTTATGGGAATATCACGCTTCCAATAACGCCATCTTA
convert_to_string(beamres[5][0], alphabet, beam_seq_len[5][0])

'GCGTCATCTTGAGCCCTACGAGAGCTAAATGCCGTCGCTACCCCAACAGTATGAATCTCCCGCTTCCCATAACGCCATCC'

```
#=======================================
#
# Aligned_sequences: 2
# 1: EMBOSS_001
# 2: EMBOSS_001
# Matrix: EDNAFULL
# Gap_penalty: 10.0
# Extend_penalty: 0.5
#
# Length: 91
# Identity:      65/91 (71.4%)
# Similarity:    65/91 (71.4%)
# Gaps:          18/91 (19.8%)
# Score: 230.0
# 
#
#=======================================

EMBOSS_001         1 ----CCCCGAATCTTGGTCCCTA-GAGAGCTAACATGCCGTCGCTAACC-     44
                         |     ||||||..||||| ||||||||| ||||||||||||.|| 
EMBOSS_001         1 GCGTC-----ATCTTGAGCCCTACGAGAGCTAA-ATGCCGTCGCTACCCC     44

EMBOSS_001        45 -ACCGTTATGGGAATATCACGCTTCCAATAACGCCATCTTA     84
                      ||.| |||  ||||.||.|||||||.|||||||||||.  
EMBOSS_001        45 AACAG-TAT--GAATCTCCCGCTTCCCATAACGCCATCC--     80
```

- - -

- - -

- - -

### Examine model performance on noiseless/constant data:
The CTC loss is low, but pairwise alignment gives us ~78% identity to the target sequence; let's check to see if randomness might be contributing to low pct. identity (~80%-85% is competitive).

First, data without duration noise, i.e. call `dataset.fetch()` with `random_upsample==False`:

In [None]:
dataset.random_upsample = False # temporarily turn off random lengths
scores, loss, avg_loss, true_seqs, true_seq_lengths = eval_model()
print("CTC Loss on whole sequence: {}".format(loss.data[0]))
print("CTC Loss, averaged per-logit: {}".format(avg_loss.data[0]))

In [None]:
# print true sequences:
true_base_sequences = split_target_seqs(true_seqs, true_seq_lengths)
for k in range(len(true_base_sequences)):
    print(labels2strings(true_base_sequences[k].unsqueeze(0))[0])

In [None]:
# normalize probabilities with a softmax operation:
temperature = 1.0 # should set this between 0->infty
logits = scores / temperature
for k in range(len(logits)):
    logits[k,:,:] = torch.nn.functional.softmax(logits[k,:,:])

In [None]:
# argmax decoding: expects (batch, seq, dim) and returns (batch, seq)
argmax_decoded = argmax_decode(logits.permute(1,0,2).contiguous().data)
argmax_basecalls = labels2strings(argmax_decoded)
for k in range(len(argmax_decoded)):
    print(argmax_basecalls[k])

In [None]:
# beam search decoded: expects (batch, dim, seq)
beam_search_decoder = BeamSearchDecoder(batch_size=batch_size, num_labels=5, beam_width=7)
probas, hyp_seqs = beam_search_decoder.decode(logits.permute(1, 2, 0))

In [None]:
print("Normalized probabilities:")
for k in range(len(probas)):
    print(probas[k] / logits.size(0))

In [None]:
lookup_dict = {0: '', 1: 'A', 2: 'G', 3: 'C', 4: 'T', 5: '<SOS>', 6: '<EOS>'}
for ll in range(len(hyp_seqs)):
    print("".join([lookup_dict[lbl] for lbl in hyp_seqs[ll]]))

We again notice that the beam-search output sequences are exactly the same as the argmaxed sequences. Outputs of an EMBOSS run on all six (true seq, pred seq) pairs:
```
TRUE: TATTCAACTAGCCCCAGACGGTACATCCTAGGACCGAAACATTCGTTTTGTAGAACCTCGCAATTAAACCTGTGTTGGGATGATCG
PRED: TGAACTCGGCCCCAGCCGGGTTCCATCACTAGGCCCGACACCATTTCGTATTTGGTTGTCACCCTCGGCCCATTTAAACCTGTTGTTTTTTGGGGGATGCC
#=======================================
#
# Aligned_sequences: 2
# 1: EMBOSS_001
# 2: EMBOSS_001
# Matrix: EDNAFULL
# Gap_penalty: 10.0
# Extend_penalty: 0.5
#
# Length: 106
# Identity:      68/106 (64.2%)
# Similarity:    68/106 (64.2%)
# Gaps:          25/106 (23.6%)
# Score: 192.5
# 
#
#=======================================

EMBOSS_001         1 TATTCAACT-AGCCCCAGACGG--TACATC-CTAGGACCGAAAC--ATTC     44
                        |.|||| .|||||||.|||  |.|||| |||||.||||.||  .|||
EMBOSS_001         1 ---TGAACTCGGCCCCAGCCGGGTTCCATCACTAGGCCCGACACCATTTC     47

EMBOSS_001        45 G--TTTTGTAG--AACCTCG---CAATTAAACCTG-TGTT------GGGA     80
                     |  |||.||.|  |.|||||   ||.||||||||| ||||      ||||
EMBOSS_001        48 GTATTTGGTTGTCACCCTCGGCCCATTTAAACCTGTTGTTTTTTGGGGGA     97

EMBOSS_001        81 TGATCG     86
                     ||..  
EMBOSS_001        98 TGCC--    101
#---------------------------------------
#---------------------------------------
TRUE: ACTCAGAGGCAATGACGACAAAACGGGATAGCATTACTGGTGGCGGACTCGTATACCTAGGGAGCATGATGCGCATGTCATAAGAGTGG
PRED: ACCAGGCCCATGGAAGCCCAAACGTGGAGTCCATTTCCTGGGTTGGGAGGGCCTGGTTATCCATCGGGAAGCCATGGGCTGCGCCATGGGTCCATAAGCCCAC
#=======================================
#
# Aligned_sequences: 2
# 1: EMBOSS_001
# 2: EMBOSS_001
# Matrix: EDNAFULL
# Gap_penalty: 10.0
# Extend_penalty: 0.5
#
# Length: 108
# Identity:      65/108 (60.2%)
# Similarity:    65/108 (60.2%)
# Gaps:          24/108 (22.2%)
# Score: 115.0
# 
#
#=======================================

EMBOSS_001         1 ACTCAGAGGCAAT-GACGACAAAACG-GGATAGCA-TTACTGG--TGG--     43
                     || ||| |.|.|| ||.|.|.||||| |||...|| ||.||||  |||  
EMBOSS_001         1 AC-CAG-GCCCATGGAAGCCCAAACGTGGAGTCCATTTCCTGGGTTGGGA     48

EMBOSS_001        44 CGGACTCG-TATACCTAGGGA--GCAT--GATGCG-CATG---TCATAAG     84
                     .||.||.| |||.|.|.||||  .|||  |.|||| ||||   .||||||
EMBOSS_001        49 GGGCCTGGTTATCCATCGGGAAGCCATGGGCTGCGCCATGGGTCCATAAG     98

EMBOSS_001        85 ---AGTGG     89
                        |.   
EMBOSS_001        99 CCCAC---    103
#---------------------------------------
#---------------------------------------
TRUE: GCCGGGACGGATGCAACTAGCCCCTATCAGCGTTTGCTTTTACCGCGTGCCAACTTCTGTGCGTCATTGACGATCAGCCCTTGAG
PRED: TGGGGCATGATGGCCCACTCGGCCCATCTCCGCGTTCCTTATTTTTCCCGGCGGTGGCCAACTTTCTATTGGATCCTTTTGGCCGGCGCCCGCCCTACC
#=======================================
#
# Aligned_sequences: 2
# 1: EMBOSS_001
# 2: EMBOSS_001
# Matrix: EDNAFULL
# Gap_penalty: 10.0
# Extend_penalty: 0.5
#
# Length: 114
# Identity:      56/114 (49.1%)
# Similarity:    56/114 (49.1%)
# Gaps:          44/114 (38.6%)
# Score: 117.0
# 
#
#=======================================

EMBOSS_001         1 GCCGGGAC-GGATG--CAACT-AGCCCCTATCAGCGTT--TGCTTTTACC     44
                       .|||.| .||||  |.||| .||||.|.||.|||||  |..||||.||
EMBOSS_001         1 --TGGGGCATGATGGCCCACTCGGCCCATCTCCGCGTTCCTTATTTTTCC     48

EMBOSS_001        45 --GCG--TGCCAACTTCTGTGCGTC-ATTGACGATCAGCCCTTGAG----     85
                       |||  .||||||||       || ||||  |||    ||||..|    
EMBOSS_001        49 CGGCGGTGGCCAACTT-------TCTATTG--GAT----CCTTTTGGCCG     85

EMBOSS_001        86 --------------     85
                                   
EMBOSS_001        86 GCGCCCGCCCTACC     99
#---------------------------------------
#---------------------------------------
TRUE: TTCCCGTATGGAGTCAATCGTCAGCAAAAGAGATGATACACGGAAATTTACGACTCCGTCGTTAGCAAGCCGTACTGTTTGTGTATAAC
PRED: CTCCTTTATTGGGGCGGTTCCATATGTCCGCCCAAGGAGGGATGGCTCCACGCCCTTTTTAAAGCCTCAGAGAACGGGCCAGCCGTTACTTAGTTTTTGGTTGGTCCA
#=======================================
#
# Aligned_sequences: 2
# 1: EMBOSS_001
# 2: EMBOSS_001
# Matrix: EDNAFULL
# Gap_penalty: 10.0
# Extend_penalty: 0.5
#
# Length: 115
# Identity:      58/115 (50.4%)
# Similarity:    58/115 (50.4%)
# Gaps:          33/115 (28.7%)
# Score: 83.5
# 
#
#=======================================

EMBOSS_001         1 TTCCCGTATGGAG-----TCAAT-CGTCAGC---AAAAGAGAT-GATACA     40
                     .|||..|||.|.|     ||.|| .|||.||   |..||.||| |.|.||
EMBOSS_001         1 CTCCTTTATTGGGGCGGTTCCATATGTCCGCCCAAGGAGGGATGGCTCCA     50

EMBOSS_001        41 CGGAAATTT--ACGACTC------CGTCGTTAGCAAGCCG-TACT--GTT     79
                     ||....|||  |.|.|||      ||     .||.||||| ||||  |||
EMBOSS_001        51 CGCCCTTTTTAAAGCCTCAGAGAACG-----GGCCAGCCGTTACTTAGTT     95

EMBOSS_001        80 TGT-----GTATAAC     89
                     |.|     ||..|  
EMBOSS_001        96 TTTGGTTGGTCCA--    108
#---------------------------------------
#---------------------------------------
TRUE: TCCTAGTCCAGATAATCGTGGTGGATAAGGAGAAGGTTGGGAACTCAGAAGGTTGATTCGATATGGAGAAAAACTCTGTGTACAAATGT
PRED: CTTCCAGATACTAGTTGGGTTGGGGGATAAGGGGAGACGGTTTGGACATCCCGACGATTTTGGCTTTCGGGATATGGGGGAGAAACCTCTGTTGGTTACCCTGC
#=======================================
#
# Aligned_sequences: 2
# 1: EMBOSS_001
# 2: EMBOSS_001
# Matrix: EDNAFULL
# Gap_penalty: 10.0
# Extend_penalty: 0.5
#
# Length: 116
# Identity:      63/116 (54.3%)
# Similarity:    63/116 (54.3%)
# Gaps:          39/116 (33.6%)
# Score: 159.5
# 
#
#=======================================

EMBOSS_001         1 TCCTAGTCCAGATAATCG------TGGTGGATAA--GGAGAAGGTTGGGA     42
                       ||  ||||||||.|.|      |||.||||||  |||||.||||.|||
EMBOSS_001         1 --CT--TCCAGATACTAGTTGGGTTGGGGGATAAGGGGAGACGGTTTGGA     46

EMBOSS_001        43 -ACTCAGAAGGTT---GATT---CGATAT---GGAGAAAAACTCTGTGTA     82
                      |..|.||.|.||   |.||   .|||||   |||| |||.||||||   
EMBOSS_001        47 CATCCCGACGATTTTGGCTTTCGGGATATGGGGGAG-AAACCTCTGT---     92

EMBOSS_001        83 CAAATGT---------     89
                         ||.         
EMBOSS_001        93 ----TGGTTACCCTGC    104
#---------------------------------------
#---------------------------------------
TRUE: AAGGTGGTTAGAAGCCTATCAATTTCAAGGCCCTCGATGGTTGACCAGTAGGAATGACATCGTACTCGAACCACTAGTGACC
PRED: TTAAGGGCTTGGAACGATCTCCCATTTGTGCCAGGGCCCATCGGGCTGGATTTGCCCAGTTAGAATGGGCCATCGTTACTCTAACCCCACTCGTCC
#=======================================
#
# Aligned_sequences: 2
# 1: EMBOSS_001
# 2: EMBOSS_001
# Matrix: EDNAFULL
# Gap_penalty: 10.0
# Extend_penalty: 0.5
#
# Length: 101
# Identity:      65/101 (64.4%)
# Similarity:    65/101 (64.4%)
# Gaps:          24/101 (23.8%)
# Score: 161.5
# 
#
#=======================================

EMBOSS_001         1 --AAGGTGGTTAGAAGCCTAT---CAATTT----CAAGGCCC-TC--GAT     38
                       |||| |.||.|||  |.||   |.||||    ||.||||| ||  |.|
EMBOSS_001         1 TTAAGG-GCTTGGAA--CGATCTCCCATTTGTGCCAGGGCCCATCGGGCT     47

EMBOSS_001        39 GG--TTGACCAGTAGGAAT--GACATCG-TACTCGAA--CCACTAGTGAC     81
                     ||  |||.|||||..||||  |.||||| |||||.||  |||||.||  |
EMBOSS_001        48 GGATTTGCCCAGTTAGAATGGGCCATCGTTACTCTAACCCCACTCGT--C     95

EMBOSS_001        82 C     82
                     |
EMBOSS_001        96 C     96
#---------------------------------------
#---------------------------------------
```

Constant sequences need to be generated by directly calling `dataset.gaussian_model_fn(nt_seq)`.

(Recall also that we still have `dataset.random_upsample == False`.)

These sequences have a constant amount of upsampling in the kmer-to-sample conversion; the only randomness is in the gaussian distributions per 5mer.

In [None]:
dataset.random_upsample = True
### set target lengths to 85:
lengths = np.ones(batch_size) * 85
lengths_th = torch.IntTensor(lengths.astype(np.int32))

### batch of constant sequences: `seqs` = ['A'*85, 'G'*85, 'C'*85, 'T'*85, 'A'*85, 'G'*85]
seqs = [np.ones(85, dtype=np.int32) * 1, np.ones(85, dtype=np.int32) * 2, np.ones(85, dtype=np.int32) * 3, 
        np.ones(85, dtype=np.int32) * 4, np.ones(85, dtype=np.int32) * 1, np.ones(85, dtype=np.int32) * 2]
seq = torch.from_numpy(np.concatenate(seqs)).int()

### for each sequence, sample a signal sequence and stack into a batch:
signals = [dataset.gaussian_model_fn(sq) for sq in seqs]
signal = torch.from_numpy(dataset.batchify(signals)).float()

### get variables:
signal_var = torch.autograd.Variable(signal, volatile=True)
seqs_var = torch.autograd.Variable(seq, volatile=True)
lengths_var = torch.autograd.Variable(lengths_th, volatile=True)

In [None]:
lengths_th

In [None]:
seqs

In [None]:
signal

In [None]:
# run model on these inputs:
probas = ctcnet(batch_norm(signal_var.unsqueeze(1)))
transcriptions = probas.permute(2,0,1) # need seq x batch x dim
transcription_lengths = Variable(torch.IntTensor([transcriptions.size(0)] * batch_size))
ctc_loss = ctc_loss_fn(transcriptions, seqs_var, transcription_lengths, lengths_var)
avg_ctc_loss = (ctc_loss / transcriptions.size(0))

In [None]:
print("CTC Loss on whole sequence: {}".format(ctc_loss.data[0]))
print("CTC Loss, averaged per-logit: {}".format(avg_ctc_loss.data[0]))

In [None]:
# print true sequences:
true_base_sequences = split_target_seqs(seqs_var.data, lengths_var.data)
for k in range(len(seqs)):
    print(labels2strings(true_base_sequences[k].unsqueeze(0))[0])

In [None]:
# normalize probabilities with a softmax operation:
temperature = 1.0 # should set this between 0->infty
logits = transcriptions / temperature
for k in range(len(logits)):
    logits[k,:,:] = torch.nn.functional.softmax(logits[k,:,:])

In [None]:
# argmax decoding: expects (batch, seq, dim) and returns (batch, seq)
argmax_decoded = argmax_decode(logits.permute(1,0,2).contiguous().data)
argmax_basecalls = labels2strings(argmax_decoded)
for k in range(len(argmax_decoded)):
    print(argmax_basecalls[k])

In [None]:
# beam search decoded: expects (batch, dim, seq)
beam_search_decoder = BeamSearchDecoder(batch_size=batch_size, num_labels=5, beam_width=7)
probas, hyp_seqs = beam_search_decoder.decode(logits.permute(1, 2, 0))
print("Normalized probabilities:")
for k in range(len(probas)):
    print(probas[k] / logits.size(0))
lookup_dict = {0: '', 1: 'A', 2: 'G', 3: 'C', 4: 'T', 5: '<SOS>', 6: '<EOS>'}
for ll in range(len(hyp_seqs)):
    print("".join([lookup_dict[lbl] for lbl in hyp_seqs[ll]]))