### Seq2Seq

<img src="https://wikidocs.net/images/page/24996/인코더디코더모델.PNG" />

#### Teacher forcing

We can use ground truth as a decoder's input for parallelization. Otherwise the output value of a decoder is used as the next input by default.

In [372]:
import torch
import torch.nn as nn
import random

input_dim = 100
hidden_dim = 200
output_dim = 100

class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, batch_first=False):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.batch_first = batch_first

        self.rnn = nn.GRU(input_dim, hidden_dim, batch_first=batch_first)
    
    def forward(self, x):
        _, hx = self.rnn(x)
        return hx

class Decoder(nn.Module): # actually a generator
    def __init__(self, input_dim, hidden_dim, output_dim, batch_first=False):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.batch_first = batch_first
    
        self.rnn = nn.GRU(input_dim, hidden_dim, batch_first=batch_first)
        self.linear = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x, hx):
        x, hx = self.rnn(x, hx)
        output = self.linear(hx)
        return output, x, hx

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, batch_first=False):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, source, target, teacher_forcing_ratio=0.8):
        x = target[0].unsqueeze(0)
        hx = self.encoder(source)

        predicts = []
        for i in range(len(target)):
            output, x, hx = self.decoder(x, hx)
            predicts.append(output)

            if random.random() < teacher_forcing_ratio:
                x = target[i].unsqueeze(0)
            else:
                x = output
        predicts = torch.concat(predicts, dim=0)
        return predicts


encoder = Encoder(input_dim=100, hidden_dim=200)
decoder = Decoder(input_dim=100, hidden_dim=200, output_dim=100)

seq2seq = Seq2Seq(encoder, decoder)

source = torch.randn(56, 128, 100)
target = torch.randn(28, 128, 100)

seq2seq(source, target).shape

torch.Size([28, 128, 100])

### Beam search

$$
\argmax_{y} \prod_{t=1}^{T_y} p(y^{<t>} | x, y^{<1>}, \cdots, y^{<t-1>}) \\

\argmax_{y} \frac{1}{T_y^{\alpha}} \sum_{t=1}^{T_y} \log p(y^{<t>} | x, y^{<1>}, \cdots, y^{<t-1>}) \quad \text{where,}\ 0 \leq \alpha \leq 1
$$

In [373]:
import random
import numpy as np

chars = list(map(chr, range(65,91)))

def model(x: str) -> np.array:
    np.random.seed(hash(x) % 10**9)
    x = np.random.randn(len(chars))
    x = np.exp(x) / np.exp(x).sum()
    np.random.seed()
    return x # random softmax (proba) from the input x

# imagine something like a genetic algorithm selecting the top-k for each iterations
def beam(breadth=3, depth=10, log=True, alpha=1):
    choices = [("", np.log(1.0))] if log else [("", 1.0)]
    for d in range(depth):
        candidates = []
        for seq, proba in choices:
            preds = model(seq)
            if log:
                candidates.extend(zip([seq+c for c in chars], proba+np.log(preds)))
            else:
                candidates.extend(zip([seq+c for c in chars], proba*preds))
        candidates = sorted(candidates, key=lambda x: x[1], reverse=True)
        choices = candidates[:breadth]
    if log:
        choices = [(seq, 1/depth**alpha * proba) for seq, proba in choices]
    return choices

# for b in range(1, len(chars)+1):
#     print(f"{b = :>2}", beam(breadth=b, depth=20, log=False)[0])

for b in range(1, len(chars)+1):
    print(f"{b = :>2}", beam(breadth=b, depth=20, log=True)[0])

b =  1 ('YASANOGDGDHQFPETDNRU', -1.798972939125831)
b =  2 ('YASACGAWCMNQGNZOVDTS', -1.6200365243245871)
b =  3 ('YASACGABGCHKPRBIELIR', -1.630091767739598)
b =  4 ('YASACGAWCMNCNAODDOIA', -1.5992422080213697)
b =  5 ('YASACGAWCMNCNAODDOIA', -1.5992422080213697)
b =  6 ('YASACGAWCMNCNAODDOIA', -1.5992422080213697)
b =  7 ('YASACGAWCMNCNAODDOIA', -1.5992422080213697)
b =  8 ('YASACGAWCMNCNAODDOIA', -1.5992422080213697)
b =  9 ('YASACGAWCMNCNAODDOIA', -1.5992422080213697)
b = 10 ('YASACGAWCMNCNAODDOIA', -1.5992422080213697)
b = 11 ('YASACGAWCMNCNAODDOIA', -1.5992422080213697)
b = 12 ('YASACGAWCMNCNAODDOIA', -1.5992422080213697)
b = 13 ('YASACGAWCMNCNAODDOIA', -1.5992422080213697)
b = 14 ('YASACGAWCMNCNAODDOIA', -1.5992422080213697)
b = 15 ('YASACGAWCMNCNAODDOIA', -1.5992422080213697)
b = 16 ('YASACGAWCMNCNAODDOIA', -1.5992422080213697)
b = 17 ('YASACGAWCMNCNAODDOIA', -1.5992422080213697)
b = 18 ('YASACGAWCMNCNAODDOIA', -1.5992422080213697)
b = 19 ('YASACGAWCMNCNAODDOIA', -1.5992422080213

#### Error analysis on beam search

Human ($y^*$): "Jane visits Africa in September." \
Model ($\hat y$): "Jane visited Africa last September."

$$
\begin{cases}
p(y^*|x) > p(\hat y|x) \rightarrow \text{beam search is at fault} \\
p(y^*|x) \leq p(\hat y|x) \rightarrow \text{model is at fault} \\
\end{cases}
$$

### Bleu Score
Bleu stands for Bilingual evaluation understudy

Bleu score is a "precision" of a generated sequence.

<img src="https://images.prismic.io/encord/edfa849b-03fb-43d2-aba5-1f53a8884e6f_image5.png" height=300 />

<img src="https://dp8v87cz8a7qa.cloudfront.net/45396/5bd20d03240611540492547.png" height=200 />

```
  ┌─GroundTruth────────────┐ 
  │                        │ 
  │                        │ 
  │                        │ 
  │┌─MachineTranslation────┼┐
  ││                       ││
  ││ Bleu (Precision)      ││
  │└───────────────────────┼┘
  └────────────────────────┘ 
```
Better not to generate creative words.

<img src="src/bleu_details.png" height=200 />

In [404]:
from collections import Counter
import pandas as pd

references = [
    "the cat is on the mat",
    "there is a cat on the mat",
]

translation = "a cute cat is lying on a cozy mat"

reference_words = []
for sentence in references:
    reference_words.extend(sentence.split())
reference_words = dict.fromkeys(set(reference_words), 1)

translated_words = translation.split()
translated_words = dict(Counter(translated_words))

print(f"{reference_words  = }")
print(f"{translated_words = }")

TP = 0
P  = 0
for k in translated_words:
    if k in reference_words:
        TP += reference_words[k]
    P += translated_words[k]

bleu = TP / P
print(f"{bleu = }")

reference_words  = {'a': 1, 'on': 1, 'mat': 1, 'is': 1, 'cat': 1, 'the': 1, 'there': 1}
translated_words = {'a': 4, 'cute': 1, 'cat': 1, 'is': 1, 'lying': 1, 'on': 1, 'cozy': 1, 'mat': 1}
bleu = 0.45454545454545453


### Attention mechanism

Simply calculate the attention value of $\mathbf a^{<t'>}$; an output value of bidirectional RNN at sequence $\mathbf x^{<t'>}$.

$$
\mathbf{a}^{<t'>} =
\begin{bmatrix}
\vert \\
\mathbf{a}_{\text{forward}}^{<t'>}\\
\vert \\
\hline
\vert \\
\mathbf{a}_{\text{backward}}^{<t'>}\\
\vert \\
\end{bmatrix}
$$

$$
e^{<t,t'>} = 
W
\begin{bmatrix}
\vert \\
\mathbf{a}^{<t'>}\\
\vert \\
\hline
\vert \\
\mathbf{s}^{<t-1>}\\
\vert \\
\end{bmatrix}
+ b
$$

$$
\mathbf e^{<t>} = 
\begin{bmatrix}
e^{<t,1>} \\
e^{<t,2>} \\
\vdots \\
e^{<t,t'>} \\
\end{bmatrix}
$$

$$
\mathrm{softmax}(\mathbf e^{<t>}) =
\begin{bmatrix}
\alpha^{<t,1>} \\
\alpha^{<t,2>} \\
\vdots \\
\alpha^{<t,t'>} \\
\end{bmatrix}
$$

$$
\mathbf c^{<t>}=

\begin{bmatrix}
\vert  & \vert & & \vert \\
\mathbf{a}^{<1>} & \mathbf{a}^{<2>} & \cdots & \mathbf{a}^{<t'>} \\
\vert  & \vert & & \vert \\
\end{bmatrix}

\begin{bmatrix}
\alpha^{<t,1>} \\
\alpha^{<t,2>} \\
\vdots \\
\alpha^{<t,t'>} \\
\end{bmatrix}
$$

$$
\begin{matrix}
& & \hat y^{<t>} \\
& & \uparrow \\
s^{<t-1>} & \rightarrow & f & \rightarrow & s^{<t>} \\
& & \uparrow \\
& & \mathbf c^{<t>} \\
& & \uparrow \\
& & \sum_{t'} \alpha^{<t,t'>} \mathbf x^{<t'>} \\
& & \uparrow & \nwarrow & \nwarrow & \nwarrow & \nwarrow \\
& & \alpha^{<t,1>} & \alpha^{<t,2>} & \alpha^{<t,3>} & \alpha^{<t,4>} & \alpha^{<t,5>} \\
& & \uparrow & \uparrow & \uparrow & \uparrow & \uparrow \\
& & \mathrm{softmax} & \mathrm{softmax} & \mathrm{softmax} & \mathrm{softmax} & \mathrm{softmax} \\
& & \uparrow & \uparrow & \uparrow & \uparrow & \uparrow \\
s^{<t-1>} & \rightarrow & e^{<t,1>} & e^{<t,2>} & e^{<t,3>} & e^{<t,4>} & e^{<t,5>} \\
& & \uparrow & \uparrow & \uparrow & \uparrow & \uparrow \\
& & \mathbf x^{<1>} & \mathbf x^{<2>} & \mathbf x^{<3>} & \mathbf x^{<4>} & \mathbf x^{<5>} \\
& & \uparrow & \uparrow & \uparrow & \uparrow & \uparrow \\
& & f & f & f & f & f \\
& & \uparrow & \uparrow & \uparrow & \uparrow & \uparrow \\
& & \text{Jane} & \text{visite} & \text{l'Afrique} & \text{en} & \text{Septembre} \\
\end{matrix}
$$


<div><img src="src/attention.png" width=800 /></div>
<div><img src="src/attention_e.png" width=800 /></div>


In [113]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt

# attention model input size of 56, output size of 28
# BiRNN length of 56, RNN length of 28

birnn = nn.RNN(input_size=100, hidden_size=200, bidirectional=True)
rnn   = nn.RNN(input_size=400, hidden_size=200, bidirectional=False)
attention = nn.Linear(600, 1)

x = torch.randn(56,128,100)
a, _ = birnn(x) # activated value 'a' from birnn

print(f"{a.shape = }")

s = torch.zeros(1,128,200) # rnn's initial hidden state
outputs = []
for t in range(28):
    e = attention(torch.concat([s.repeat(56,1,1), a], dim=2))
    alpha = F.softmax(e, dim=0)
    c = (a * alpha).sum(0, keepdim=True)
    output, s = rnn(c, s)
    outputs.append(output)

outputs = torch.concat(outputs, dim=0)
outputs.shape

a.shape = torch.Size([56, 128, 400])


torch.Size([28, 128, 200])

### CTC cost for speech recognition

On a typical STT recognition, we use a many-to-many architectrue which takes an audio sampled with 120hz as an input and a softmaxed text as an output.

When speech of 30 seconds recorded with 120hz, the input and output sequence length will be 3600. Which will make the output sequence very sparse.

To overcome this problem we keep the output sequence length to 3600 and use the CTC (connectionist temporal classification).

Identical repeated chars not sparated by the blank ("_") are collapsed.

The following string will collapse to "the qui".

```
 _  t  t  _  h  h  h  _  e  _       q   _   u   u   _   i   i   i
<1><2><3><4><5><6><7><8><9><10><11><12><13><14><15><16><17><18><19>
```


### Trigger word detection

We could put single 1 as an output to the starting point of the trigger word audio but the output will be very sparse.\
Instead, we can put some trailing 1s after the starting point to balance the sparsity.

<div><img src="src/triggerword_basic.png" width=600 /></div>
<div><img src="src/triggerword_hack.png" width=600 /></div>