### Let's examine the results of the pore model on pore-width 1. First, let's get set up by importing and switching directories to top-level:

In [1]:
# Move the working directory up one level so that the `modules` / `utils` imports
# and the relative `./runs/...` checkpoint paths used further down resolve.
# NOTE: the move is relative to the current directory, so running this cell a
# second time without restarting the kernel goes up one more level.
%cd ..

/home/ptang/Desktop/pytorch_models/wavenet-speech


In [2]:
# import all the essentials:
import torch
import numpy as np
from modules.wavenet import WaveNet
from modules.classifier import WaveNetClassifier
from utils.pore_model import PoreModelLoader

### Now let's build a pore model with the same settings that were used in the training loop:

In [3]:
# Pore-model settings (low noise, small pore width).

# -- schedule arguments, handed to the loader positionally --
num_iterations = 100000
num_epochs = 50
epoch_size = 2500

# -- batch / signal settings --
batch_size = 16
num_levels = 256
nt_sample_lengths = (90, 110)

# -- simulated pore settings --
pore_width = 1
srate = 4
noise = 1.0
# label (1-4) -> current level; presumably picoamps per nucleotide, going by the
# variable name -- TODO confirm against PoreModelLoader
nt_to_pa = {
    1: 51.0,
    2: 22.0,
    3: 103.0,
    4: 115.0,
}

dataloader = PoreModelLoader(
    num_iterations,
    num_epochs,
    epoch_size,
    batch_size=batch_size,
    num_levels=num_levels,
    lengths=nt_sample_lengths,
    pore_width=pore_width,
    sample_rate=srate,
    currents_dict=nt_to_pa,
    sample_noise=noise,
)

### Now let's instantiate a model and restore the model weights:

In [4]:
num_labels = 5
out_dim = 256
downsample_rate = 1
# Both stacks use the same dilation cycle (1, 2, 3, 4), repeated three times.
wavenet_dils = [1, 2, 3, 4] * 3
classifier_dils = [1, 2, 3, 4] * 3

# One spec tuple per layer; only the last entry (the dilation) varies.
# Presumably the tuple is (in_channels, out_channels, kernel_size, dilation)
# -- TODO confirm against the WaveNet / WaveNetClassifier constructors.
wavenet = WaveNet(num_levels, 2, [(num_levels, num_levels, 2, d) for d in wavenet_dils], num_levels, softmax=False)
classifier = WaveNetClassifier(num_levels, num_labels, [(num_levels, num_levels, 3, d) for d in classifier_dils],
                               out_dim, pool_kernel_size=downsample_rate, input_kernel_size=2, input_dilation=1,
                               softmax=False)

# restore model weights (`map_location` moves weights from CUDA to CPU):
wavenet.load_state_dict(torch.load("./runs/artificial/noiseless_porewidth_1/wavenet_model.loss0_5014.pth",
                                   map_location=lambda storage, loc: storage))
classifier.load_state_dict(torch.load("./runs/artificial/noiseless_porewidth_1/classifier_model.loss0_5014.pth",
                                      map_location=lambda storage, loc: storage))

# This notebook only runs inference. `load_state_dict` restores the weights but
# leaves a freshly built module in training mode, so switch both networks to
# evaluation mode; any layer that behaves differently while training (dropout,
# batch norm, ...) then uses its inference behaviour.
wavenet.eval()
classifier.eval();  # trailing `;` keeps the module repr out of the cell output

### Define a helper function (closing over the two models above) to run the WaveNet-CTC model on inputs:

In [5]:
def run_model(signal):
    """Pass `signal` through `wavenet`, then feed that result to `classifier`.

    Both networks are looked up in the notebook's global scope at call time.
    The classifier's output is returned as-is.
    """
    return classifier(wavenet(signal))

In [6]:
# Draw one batch from the pore-model loader: the input signal, the target labels
# and the per-sequence target lengths. Further down, `seq` is sliced with offsets
# built from `seq_lengths`, i.e. it is treated as all targets laid end to end.
signal, seq, seq_lengths = dataloader.fetch()

In [7]:
# Forward pass for inference: re-wrap the raw signal tensor in a Variable flagged
# `volatile=True` and run it through both networks.
# NOTE(review): `volatile` is the legacy (pre-0.4) PyTorch way of switching autograd
# off; newer releases ignore the flag and expect `with torch.no_grad():` instead.
ctc_preds = run_model(torch.autograd.Variable(signal.data, volatile=True))

### First, check the real (target) sequence:

In [8]:
batch_ix = 1 # (choose which sequence of the batch you want to look at)
# label index -> printable symbol; index 0 stands for the blank label
_lookup_ = {0: '<BLANK>', 1: 'A', 2: 'G', 3: 'C', 4: 'T'}
# look up target sequence by using the sequence lengths as an index:
def batch_index_lookup(bix, seq_lens):
    """Return the (start, stop) offsets of batch entry `bix`.

    `start` is the sum of all lengths before position `bix` in `seq_lens`;
    `stop` is `start` plus the length stored at position `bix`.
    """
    offset = seq_lens[:bix].sum()
    return offset, offset + seq_lens[bix]
# Offsets of the chosen sequence's targets inside `seq`.
_s0, _s1 = batch_index_lookup(batch_ix, seq_lengths)
# Unwrap the single-element results into plain ints usable as slice bounds.
s0, s1 = int(_s0.data[0]), int(_s1.data[0])
# Translate the target labels to letters and print them as one string.
print("".join([_lookup_[ix] for ix in seq[s0:s1].data]))

GCAGTCCATGCCTCAAACAGAATGATTCAGTCGGAGGTTCATTTATTTAATATTAGTCGACTTGTGGCATTAGGCAGTACCTCATTAGGGTGTC


### Basic ArgMax-decoded output:

In [9]:
# Arg-max read-out: at every step along the last axis of `ctc_preds`, take the
# most probable label of the chosen sequence, print it and remember it.
print_blanks = False
pred_labels = []
for step in range(ctc_preds.size(2)):
    # scores of all labels at this step, turned into probabilities
    step_probas = torch.nn.functional.softmax(ctc_preds[batch_ix, :, step])
    top_proba, top_index = torch.max(step_probas, dim=0)
    proba_py = float(top_proba.data[0])
    called = _lookup_[int(top_index.data[0])]
    # blank calls are only reported when `print_blanks` is switched on
    if print_blanks or called != '<BLANK>':
        print("Called: {0} | Proba: {1:1.4f}".format(called, proba_py))
        pred_labels.append(called)

Called: G | Proba: 0.8884
Called: G | Proba: 0.9814
Called: C | Proba: 0.9846
Called: C | Proba: 0.8904
Called: G | Proba: 0.6286
Called: G | Proba: 0.9753
Called: G | Proba: 0.9927
Called: T | Proba: 0.7991
Called: T | Proba: 0.7068
Called: C | Proba: 0.8207
Called: C | Proba: 0.6349
Called: C | Proba: 0.5056
Called: C | Proba: 0.7124
Called: A | Proba: 0.8545
Called: A | Proba: 0.9687
Called: A | Proba: 0.8812
Called: T | Proba: 0.8401
Called: G | Proba: 0.7039
Called: G | Proba: 0.9820
Called: G | Proba: 0.9723
Called: C | Proba: 0.7528
Called: C | Proba: 0.9876
Called: C | Proba: 0.9706
Called: C | Proba: 0.7649
Called: T | Proba: 0.7993
Called: T | Proba: 0.8786
Called: C | Proba: 0.9555
Called: A | Proba: 0.7101
Called: A | Proba: 0.9785
Called: A | Proba: 0.9364
Called: A | Proba: 0.6097
Called: A | Proba: 0.7055
Called: G | Proba: 0.7600
Called: G | Proba: 0.7576
Called: G | Proba: 0.9715
Called: G | Proba: 0.9083
Called: A | Proba: 0.7570
Called: A | Proba: 0.9036
Called: A | 

In [10]:
# Join the per-step calls collected by the arg-max loop into one string. Blank
# steps were skipped there (print_blanks is False), but consecutive identical
# calls were not merged, so runs of the same letter remain in this string.
print("".join(pred_labels))

GGCCGGGTTCCCCAAATGGGCCCCTTCAAAAAGGGGAAAATTGGGATTTTCAAGGGGTTCCCGGGGAAAGGGGGGTTTTTCAAATTTTTTAATTTTTTTATTTTTAAAGGGTTTCCCGGGGACCCTTTGGTGGGGCATTTGGGGCCAGGTAACCCCCCTTTATTTTAAAGGGGGGGGTTTGGGTTCC


### Beam-search-decoded output:

In [11]:
from modules.beam import Beam

beam_width = 5
# Symbol -> label index table given to every Beam. Index 0 ('<pad>') is the same
# index `_lookup_` prints as '<BLANK>'; 5 and 6 are the start / stop tags.
target_dict = {
    '<pad>': 0,
    'a': 1,
    'g': 2,
    'c': 3,
    't': 4,
    '<s>': 5,
    '</s>': 6,
}
# one independent Beam per sequence in the batch
beams = [
    Beam(beam_width, target_dict, cuda=False)
    for _ in range(batch_size)
]

In [12]:
# Work on a copy of the raw model outputs with the axes reordered by
# permute(2, 0, 1), i.e. the (Seq, Batch, NDim) layout used by the cells below.
logits = ctc_preds.data.clone().permute(2, 0, 1)
# Widen the last axis by two all-zero columns, one each for the <BOS> / <EOS> tags.
# NOTE(review): a logit of 0 is not a negligible score -- once the softmax in the
# beam loop is applied, each of these columns contributes exp(0) = 1 of
# un-normalised mass at every step. Confirm this is intended.
zero_col = torch.zeros(logits.size(0), logits.size(1), 1)
logits = torch.cat([logits, zero_col, zero_col], dim=2)

In [13]:
# Add one <START> step in front of the logits and one <STOP> step behind them.
# NOTE: this cell rebinds `logits`, so run it only once per freshly built
# `logits`; running it again would add a second pair of tag steps.
ndim = logits.size(2)

# One-hot rows for the two tags. The column indices are read from `target_dict`
# (the table the beams were built with) instead of being hard-coded as 5 and 6.
start_vec = torch.zeros(1, batch_size, ndim)
start_vec[:, :, target_dict['<s>']] = 1.
stop_vec = torch.zeros(1, batch_size, ndim)
stop_vec[:, :, target_dict['</s>']] = 1.

logits = torch.cat([start_vec, logits, stop_vec], dim=0)
# `logits` already contains both tag steps at this point, so its first dimension
# is the full sequence length (the former `+ 2` counted the tags a second time).
sequence_length = logits.size(0)

In [15]:
# Walk along the first axis of `logits`; at each step, softmax the scores and
# hand every unfinished beam its own sequence's probabilities.
for step in range(logits.size(0)):
    step_probas = torch.nn.functional.softmax(logits[step].view(batch_size, ndim)).data
    # repeat each sequence's probability row once per beam slot
    label_lkhd = step_probas.unsqueeze(1).expand(batch_size, beam_width, ndim)
    for b in range(batch_size):
        beam = beams[b]
        # beams flagged as done are left untouched
        if not beam.done:
            beam.advance(label_lkhd[b, :, :])

In [16]:
# Collect the decoded hypothesis sequences and their scores for every batch entry.
num_best = 3  # (only keep the top-3 best hypotheses per batch entry)
hypotheses = {}
probas = {}
for b in range(batch_size):
    # scores and their associated indices in the beam, as returned by `sort_best`
    scores, Ks = beams[b].sort_best()
    # the score slice is stored as the single element of a list
    probas[b] = [scores[0:num_best]]
    # one hypothesis sequence per kept beam index
    hypotheses[b] = [beams[b].get_hyp(k) for k in Ks[0:num_best]]

In [17]:
# Report the beam-search inputs and results for the sequence chosen by `batch_ix`.
print(80 * "=")

# scores fed to the beams for this sequence (all steps, all label columns)
print("Logits:")
print(logits[:, batch_ix, :])

# hypotheses kept for this sequence
print("Hypotheses, batch sequence {}".format(batch_ix))
print(hypotheses[batch_ix])

# scores kept for those hypotheses
print("Probabilities, batch sequence {}".format(batch_ix))
print(probas[batch_ix])

Logits:

  0.0000   0.0000   0.0000  ...    0.0000   1.0000   0.0000
  2.0418  -3.5993  -2.0254  ...   -2.6345   0.0000   0.0000
  0.7959  -4.4375  -4.3781  ...   -4.6685   0.0000   0.0000
           ...               ⋱              ...            
  6.6841  -1.0067  -4.7337  ...   -0.8784   0.0000   0.0000
  4.8830  -3.4591  -3.1115  ...   -1.3804   0.0000   0.0000
  0.0000   0.0000   0.0000  ...    0.0000   0.0000   1.0000
[torch.FloatTensor of size 438x7]

Hypotheses, batch sequence 1
[[5, 0, 0, 0, 0, 2, 2, 0, 0, 3, 3, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 4, 4, 0, 3, 3, 0, 0, 0, 3, 3, 1, 1, 1, 0, 0, 0, 4, 0, 0, 2, 2, 2, 0, 0, 3, 0, 3, 3, 3, 0, 0, 4, 4, 0, 0, 0, 3, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 2, 2, 2, 1, 1, 1, 0, 0, 1, 0, 0, 0, 4, 4, 0, 0, 2, 2, 2, 0, 1, 0, 0, 0, 4, 4, 0, 4, 4, 0, 0, 0, 0, 3, 0, 0, 1, 1, 0, 2, 2, 2, 2, 0, 4, 4, 0, 0, 3, 3, 3, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 1, 0, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 4, 4, 4, 0, 4, 4, 0, 0, 0, 3, 1, 1, 1, 0, 4

In [19]:
# Show the first hypothesis stored for the chosen sequence: raw label indices,
# before repeats are merged and blanks removed in the next cell.
print(hypotheses[batch_ix][0])

[5, 0, 0, 0, 0, 2, 2, 0, 0, 3, 3, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 4, 4, 0, 3, 3, 0, 0, 0, 3, 3, 1, 1, 1, 0, 0, 0, 4, 0, 0, 2, 2, 2, 0, 0, 3, 0, 3, 3, 3, 0, 0, 4, 4, 0, 0, 0, 3, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 2, 2, 2, 1, 1, 1, 0, 0, 1, 0, 0, 0, 4, 4, 0, 0, 2, 2, 2, 0, 1, 0, 0, 0, 4, 4, 0, 4, 4, 0, 0, 0, 0, 3, 0, 0, 1, 1, 0, 2, 2, 2, 2, 0, 4, 4, 0, 0, 3, 3, 3, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 1, 0, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 4, 4, 4, 0, 4, 4, 0, 0, 0, 3, 1, 1, 1, 0, 4, 0, 0, 0, 4, 4, 4, 0, 4, 0, 4, 0, 1, 0, 1, 0, 0, 4, 4, 0, 4, 4, 0, 0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 0, 0, 1, 1, 1, 0, 2, 2, 2, 0, 4, 4, 4, 0, 3, 3, 3, 2, 2, 2, 2, 0, 1, 0, 0, 0, 3, 3, 3, 0, 4, 0, 0, 0, 0, 4, 4, 0, 2, 2, 0, 0, 4, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 2, 0, 3, 0, 1, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 0, 3, 3, 0, 0, 1, 0, 0, 0, 2, 0, 2, 0, 4, 0, 0, 1, 1, 0, 0, 0, 3, 3, 3, 3, 3, 3, 0, 0, 4, 4, 4, 0, 

In [25]:
# Convert the first hypothesis to AGCT letters: merge runs of one label, drop blanks.
best_path = hypotheses[batch_ix][0]
collapsed = []
translated = []
# Only positions 1 .. len-2 are visited, so the first and last entries are left
# out (the first one still serves as the "previous" label for position 1).
for k in range(1, len(best_path) - 1):
    current = best_path[k]
    if current == best_path[k - 1]:
        continue  # same label as the step before: still inside the same run
    collapsed.append(current)
    # label 0 is the blank and has no letter
    if current != 0:
        translated.append(_lookup_[current])
print("".join(translated))

GCGTCCATGCCTCAAAAGAATGATTCAGTCGAGTTCATTTTAATTTATAGTCGACTTGTGGGCATTGGCAGGTACTATTAGGTGTC


Comparison:
```
Original:  GCAGTCCATGCCTCAAA-CAGAATGATTCAGTCGGAGGTTCATTT-A-TTTAATATTAGTCGACTTGTGG-CATTAGGCAG-TACCTCATTAGGGTGTC
Predicted: GC-GTCCATGCCTCAAAAGA-A-TGATTCAGTCG-AG-TTCATTTTAATTTA-TA---GTCGACTTGTGGGCATT-GGCAGGTAC-T-ATTAGG-TGTC
```