# 1. If the different directions use a different number of hidden units, how will the shape of $H_t$ change?

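The forward and backward hidden states are concatenated along the last axis, so the last dimension of $H_t$ becomes the sum of the two directions' hidden sizes (instead of twice a single size):

$H_t.shape[-1] = \overrightarrow{H_t}.shape[-1] + \overleftarrow{H_t}.shape[-1]$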

In [1]:
import sys
import torch.nn as nn
import torch
import warnings
sys.path.append('/home/jovyan/work/d2l_solutions/notebooks/exercises/d2l_utils/')
import d2l
from torchsummary import summary
warnings.filterwarnings("ignore")
from sklearn.model_selection import ParameterGrid

class BiRNNScratch(d2l.Module):
    """Bidirectional RNN whose two directions may use different hidden sizes.

    Two independent ``d2l.RNNScratch`` layers are run over the sequence, one
    in the given order and one in reversed order, and their per-step outputs
    are concatenated along the last axis.

    Args:
        num_inputs: Size of the input features passed to both directions.
        num_hiddens: Either a pair ``(forward_size, backward_size)`` or a
            single int used for both directions.
        sigma: Passed through unchanged to both ``d2l.RNNScratch`` layers.

    Raises:
        ValueError: If ``num_hiddens`` is a sequence that does not contain
            exactly two entries.
    """

    def __init__(self, num_inputs, num_hiddens, sigma=0.01):
        super().__init__()
        self.save_hyperparameters()
        if isinstance(num_hiddens, int):
            # A bare int means "same size in both directions".
            f_hiddens = b_hiddens = num_hiddens
        else:
            # Unpacking rejects anything that is not exactly a pair, so the
            # stored total below can never disagree with the two layers.
            f_hiddens, b_hiddens = num_hiddens
        self.f_rnn = d2l.RNNScratch(num_inputs, f_hiddens, sigma)
        self.b_rnn = d2l.RNNScratch(num_inputs, b_hiddens, sigma)
        # The two directions are concatenated, so the output width is the
        # sum of both hidden sizes (only "doubled" when they are equal).
        self.num_hiddens = f_hiddens + b_hiddens

    def forward(self, inputs, Hs=None):
        """Run both directions and concatenate their per-step outputs.

        Args:
            inputs: Sequence of per-step inputs; it is handed to the forward
                layer as is and to the backward layer via ``reversed``.
            Hs: Optional pair ``(forward_state, backward_state)``; ``None``
                passes ``None`` to both layers as their initial state.

        Returns:
            A tuple ``(outputs, (f_H, b_H))`` where ``outputs`` is a list of
            per-step tensors (forward and backward outputs joined on the
            last axis) and ``f_H`` / ``b_H`` are the states returned by the
            two layers.
        """
        f_H, b_H = Hs if Hs is not None else (None, None)
        f_outputs, f_H = self.f_rnn(inputs, f_H)
        b_outputs, b_H = self.b_rnn(reversed(inputs), b_H)
        # The backward outputs are in reversed step order; flip them back so
        # step t of both directions is paired before concatenation.
        outputs = [torch.cat((f, b), -1) for f, b in zip(
            f_outputs, reversed(b_outputs))]
        return outputs, (f_H, b_H)

  assert(self, 'net'), 'Neural network is defined'
  assert(self, 'trainer'), 'trainer is not inited'


In [6]:
bi_rnn = BiRNNScratch(num_inputs=4, num_hiddens=[8,16])
# Use an explicit 3-D input, assumed to be (num_steps, batch_size, num_inputs)
# as in d2l's scratch RNN -- TODO confirm against d2l.RNNScratch. A 2-D tensor
# appears to run only through broadcasting. Here: 2 steps, batch of 4,
# 4 input features.
X = torch.randn(2, 4, 4)
outputs, (f_H, b_H) = bi_rnn(X)
# Last axis is 8 + 16 = 24: the two directions' hidden sizes added together.
outputs[0].shape

torch.Size([4, 24])

# 2. Design a bidirectional RNN with multiple hidden layers.

In [11]:
class BiRNN(d2l.RNN):
    """Multi-layer bidirectional RNN backed by ``torch.nn.RNN``.

    Args:
        num_inputs: Input feature size given to ``nn.RNN``.
        num_hiddens: Hidden size per direction given to ``nn.RNN``.
        num_layers: Number of stacked recurrent layers.
    """

    def __init__(self, num_inputs, num_hiddens, num_layers):
        # Initialise through d2l.Module directly rather than d2l.RNN's own
        # initialiser, because the wrapped layer is constructed here.
        d2l.Module.__init__(self)
        self.save_hyperparameters()
        self.rnn = nn.RNN(
            input_size=num_inputs,
            hidden_size=num_hiddens,
            num_layers=num_layers,
            bidirectional=True,
        )
        # Both directions are concatenated, so the advertised width doubles.
        # Presumably self.num_hiddens was set by save_hyperparameters().
        self.num_hiddens = self.num_hiddens * 2

In [15]:
# Build a 2-layer bidirectional RNN (4 input features, 8 hidden units per
# direction) and echo it to inspect the wrapped nn.RNN.
bi_rnn = BiRNN(num_inputs=4, num_hiddens=8,num_layers=2)
bi_rnn

BiRNN(
  (rnn): RNN(4, 8, num_layers=2, bidirectional=True)
)

# 3. Polysemy is common in natural languages. For example, the word “bank” has different meanings in contexts “i went to the bank to deposit cash” and “i went to the bank to sit down”. How can we design a neural network model such that given a context sequence and a word, a vector representation of the word in the correct context will be returned? What type of neural architectures is preferred for handling polysemy?

In [23]:
# Inspect the input-to-hidden weights of layer 0 of the wrapped nn.RNN
# (per PyTorch naming this is the forward direction; the backward one
# carries a `_reverse` suffix).
bi_rnn.rnn.weight_ih_l0 

Parameter containing:
tensor([[-0.3454, -0.0369, -0.1882,  0.1576],
        [ 0.3329, -0.1928, -0.1029, -0.0484],
        [-0.2265,  0.1040,  0.1375,  0.0568],
        [ 0.2175,  0.1918,  0.3387, -0.0371],
        [-0.1609, -0.0286, -0.1931, -0.3293],
        [ 0.1150, -0.1777, -0.0100,  0.3497],
        [ 0.3511,  0.3004,  0.2287, -0.3489],
        [-0.2413, -0.2574,  0.0273, -0.3384]], requires_grad=True)