In [1]:
import torch

from torch import nn

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [3]:
torch.manual_seed(7)

<torch._C.Generator at 0x2594142b1d0>

#### Embedding Layer

In [4]:
class EmbeddingLayer(nn.Module):
    """Implement embedding layer.
    """
    def __init__(self, vector_size, vocab_size, dropout=0.5):
        """
        Arguments:
            vector_size {int} -- word embedding size.
            vocab_size {int} -- vocabulary size.
        
        Keyword Arguments:
            dropout {float} -- dropout rate. (default: {0.5})
        """
        super(EmbeddingLayer, self).__init__()

        self.vector_size = vector_size
        self.embed = nn.Embedding(vocab_size, vector_size)
        self.dropout = nn.Dropout(dropout)

    def load(self, vectors):
        """Load pre-trained embedding weights.
        
        Arguments:
            vectors {torch.Tensor} -- from "TEXT.vocab.vectors".
        """
        self.embed.weight.data.copy_(vectors)

    def forward(self, x):
        """
        Arguments:
            x {torch.Tensor} -- input tensor with shape [batch_size, seq_length]
        """
        e = self.embed(x)
        return self.dropout(e)

In [5]:
embedding_layer = EmbeddingLayer(vector_size=10, vocab_size=100)
embedding_layer.to(device)

EmbeddingLayer(
  (embed): Embedding(100, 10)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [6]:
x = torch.randint(0, 99, (32, 20), dtype=torch.long).to(device)  # [batch_size, seq_length]

In [7]:
embeded = embedding_layer(x)
print(embeded.shape)

torch.Size([32, 20, 10])


#### Encoding Layer

In [8]:
class EncodingLayer(nn.Module):
    """BiLSTM encoder which encodes both the premise and hypothesis.
    """
    def __init__(self, input_size, hidden_size):
        super(EncodingLayer, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size,
                            num_layers=1,
                            bidirectional=True)

    def forward(self, x):
        """
        Arguments:
            x {torch.Tensor} -- input embeddings with shape [batch, seq_len, input_size]

        Returns:
            output {torch.Tensor} -- [batch, seq_len, num_directions * hidden_size]
        """
        self.lstm.flatten_parameters()
        output, _ = self.lstm(x)
        return output

In [9]:
encoding_layer = EncodingLayer(input_size=10, hidden_size=15).to(device)
outputs = encoding_layer(embeded)
print(outputs.shape)

torch.Size([32, 20, 30])


#### Local Inference Layer

In [66]:
class LocalInferenceModel(nn.Module):
    """The local inference model introduced in the paper.
    """
    def __init__(self):
        super(LocalInferenceModel, self).__init__()
        self.softmax_1 = nn.Softmax(dim=1)
        self.softmax_2 = nn.Softmax(dim=2)

    def forward(self, p, h, p_mask, h_mask):
        """Apply local inference to premise and hyopthesis.

        Arguments:
            p {torch.Tensor} -- p has shape [batch, seq_len_p, 2 * hidden_size]
            h {torch.Tensor} -- h has shape [batch, seq_len_h, 2 * hidden_size]
            p_mask {torch.Tensor (int)} -- p has shape [batch, seq_len_p], 0 in the mask
                means padding.
            h_mask {torch.Tensor (int)} -- h has shape [batch, seq_len_h]

        Returns:
            m_p, m_h {torch.Tensor} -- tensor with shape [batch, seq_len, 8 * hidden_size]
        """
        # equation 11 in the paper:
        e = torch.matmul(p, h.transpose(1, 2))  # [batch, seq_len_p, seq_len_h]
        # masking the scores for padding tokens
        inference_mask = torch.matmul(p_mask.unsqueeze(2).float(),
                                      h_mask.unsqueeze(1).float())
        e.masked_fill_(inference_mask < 1e-7, -1e7)
        
        # equation 12 & 13 in the paper:
        h_score, p_score = self.softmax_1(e), self.softmax_2(e)
        h_ = h_score.transpose(1, 2).bmm(p)
        p_ = p_score.bmm(h)

        # equation 14 & 15 in the paper:
        m_p = torch.cat((p, p_, p * p_, p - p_), dim=-1)
        m_h = torch.cat((h, h_, h * h_, h - h_), dim=-1)

        assert inference_mask.shape == e.shape
        assert p.shape == p_.shape and h.shape == h_.shape
        assert m_p.shape[-1] == p.shape[-1] * 4

        return m_p, m_h

In [67]:
lim = LocalInferenceModel().to(device)

In [68]:
p, h = torch.rand(2, 4, 2), torch.rand(2, 3, 2)  # [batch, seq_len_p, 2 * hidden_size]
p, h = p.to(device), h.to(device)
print(p)
p_mask = torch.tensor([[1, 1, 1, 1],
                       [1, 1, 1, 0]]).to(device)
h_mask = torch.tensor([[1, 1, 1],
                       [1, 0, 0]]).to(device)

tensor([[[0.4962, 0.5468],
         [0.8283, 0.0468],
         [0.2201, 0.2501],
         [0.8731, 0.6575]],

        [[0.5739, 0.3068],
         [0.1198, 0.8427],
         [0.8958, 0.1384],
         [0.0653, 0.8580]]], device='cuda:0')


In [69]:
m_p, m_h = lim(p, h, p_mask, h_mask)

In [70]:
print(m_p.shape)
print(m_h.shape)

torch.Size([2, 4, 8])
torch.Size([2, 3, 8])


#### Composition Layer

In [15]:
class CompositionLayer(nn.Module):
    """The composition layer.
    """
    def __init__(self, input_size, output_size, hidden_size, dropout=0.5):
        """
        Arguments:
            input_size {int} -- input size to the feedforward neural network.
            output_size {int} -- output size of the feedforward neural network.
            hidden_size {int} -- output hidden size of the LSTM model.

        Keyword Arguments:
            dropout {float} -- dropout rate (default: {0.5})
        """
        super(CompositionLayer, self).__init__()
        self.hidden_size = hidden_size
        self.F = nn.Linear(input_size, output_size)
        self.lstm = nn.LSTM(output_size, hidden_size,
                            num_layers=1, bidirectional=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, m):
        """
        Arguments:
            m {torch.Tensor} -- [batch, seq_len, input_size]

        Returns:
            outputs {torch.Tensor} -- [batch, seq_len, hidden_size * 2]
        """
        y = self.dropout(self.F(m))
        self.lstm.flatten_parameters()
        outputs, _ = self.lstm(y)

        assert m.shape[:2] == outputs.shape[:2] and \
            outputs.shape[-1] == self.hidden_size * 2
        return outputs

In [16]:
cl = CompositionLayer(input_size=8, output_size=2, hidden_size=2, dropout=0.0).to(device)

In [17]:
outputs = cl(m_p)

In [18]:
print(outputs.shape)
print(outputs)

torch.Size([2, 4, 4])
tensor([[[ 0.0476, -0.1567, -0.1715, -0.0444],
         [ 0.0467, -0.1456, -0.1759, -0.0279],
         [ 0.0470, -0.1518, -0.1633, -0.0458],
         [ 0.0465, -0.1692, -0.1712, -0.0173]],

        [[ 0.0788, -0.2379, -0.0931, -0.0467],
         [ 0.0804, -0.2621, -0.1193, -0.0084],
         [ 0.0767, -0.2373, -0.0916, -0.0407],
         [ 0.0823, -0.2523, -0.0987, -0.0383]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)


#### Pooling

In [19]:
class Pooling(nn.Module):
    """Apply maxing pooling and average pooling to the outputs of LSTM.
    """
    def __init__(self):
        super(Pooling, self).__init__()

    def forward(self, x, x_mask):
        """
        Arguments:
            x {torch.Tensor} -- [batch, seq_len, hidden_size * 2]
            x_mask {torch.Tensor} -- [batch, seq_len], 0 in the mask means padding

        Returns:
            v {torch.Tensor} -- [batch, hidden_size * 4]
        """
        mask_expand = x_mask.unsqueeze(-1).expand(x.shape)
        print(x)

        # average pooling
        x_ = x * mask_expand.float()
        print(x_)
        v_avg = x_.sum(-2) / x_mask.sum(-1).unsqueeze(-1).float()
        
        # max pooling
        x_ = x.masked_fill(mask_expand == 0, -1e7)
        print(x_)
        v_max = x_.max(-2).values
        
        print(v_avg)
        print(v_max)

        assert v_avg.shape == v_max.shape == (x.shape[0], x.shape[-1])
        
        return torch.cat((v_avg, v_max), dim=-1)

In [20]:
pooling = Pooling()

In [21]:
v = pooling(outputs, p_mask)

tensor([[[ 0.0476, -0.1567, -0.1715, -0.0444],
         [ 0.0467, -0.1456, -0.1759, -0.0279],
         [ 0.0470, -0.1518, -0.1633, -0.0458],
         [ 0.0465, -0.1692, -0.1712, -0.0173]],

        [[ 0.0788, -0.2379, -0.0931, -0.0467],
         [ 0.0804, -0.2621, -0.1193, -0.0084],
         [ 0.0767, -0.2373, -0.0916, -0.0407],
         [ 0.0823, -0.2523, -0.0987, -0.0383]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
tensor([[[ 0.0476, -0.1567, -0.1715, -0.0444],
         [ 0.0467, -0.1456, -0.1759, -0.0279],
         [ 0.0470, -0.1518, -0.1633, -0.0458],
         [ 0.0465, -0.1692, -0.1712, -0.0173]],

        [[ 0.0788, -0.2379, -0.0931, -0.0467],
         [ 0.0804, -0.2621, -0.1193, -0.0084],
         [ 0.0767, -0.2373, -0.0916, -0.0407],
         [ 0.0000, -0.0000, -0.0000, -0.0000]]], device='cuda:0',
       grad_fn=<MulBackward0>)
tensor([[[ 4.7648e-02, -1.5668e-01, -1.7153e-01, -4.4379e-02],
         [ 4.6694e-02, -1.4557e-01, -1.7586e-01, -2.7896e-02],
         [ 4.7

In [22]:
(-0.2523 - 0.2565 - 0.2391) / 3

-0.2493

In [23]:
v.shape

torch.Size([2, 8])

#### Inference Composition

In [24]:
class InferenceComposition(nn.Module):
    """Inference composition described in paper section 3.3
    """
    def __init__(self, input_size, output_size, hidden_size, dropout=0.5):
        """
        Arguments:
            input_size {int} -- input size to the feedforward neural network.
            output_size {int} -- output size of the feedforward neural network.
            hidden_size {int} -- output hidden size of the LSTM model.

        Keyword Arguments:
            dropout {float} -- dropout rate (default: {0.5})
        """
        super(InferenceComposition, self).__init__()
        self.composition_p = CompositionLayer(input_size,
                                              output_size,
                                              hidden_size,
                                              dropout=dropout)
        self.composition_h = deepcopy(self.composition_p)
        self.pooling = Pooling()

    def forward(self, m_p, m_h, p_mask, h_mask):
        """
        Arguments:
            m_p {torch.Tensor} -- [batch, seq_len, input_size]
            m_h {torch.Tensor} -- [batch, seq_len, input_size]
            mask {torch.Tensor} -- [batch, seq_len], 0 means padding

        Returns:
            v {torch.Tensor} -- [batch, input_size * 8]
        """
        # equation 16 & 17 in the paper
        v_p, v_h = self.composition_p(m_p), self.composition_h(m_h)
        # equation 18 & 19 in the paper
        v_p_, v_h_ = self.pooling(v_p, p_mask), self.pooling(v_h, h_mask)
        # equation 20 in the paper
        v = torch.cat((v_p_, v_h_), dim=-1)

        assert v.shape == (m_p.shape[0], v_p.shape[-1] * 4)
        return v

#### Linear Softmax

In [25]:
class LinearSoftmax(nn.Module):
    """Implement the final linear layer.
    """
    def __init__(self, input_size, output_size, class_num, activation='relu', dropout=0.5):
        super(LinearSoftmax, self).__init__()
        if activation == 'relu':
            self.activation = nn.ReLU()
        elif activation == 'tanh':
            self.activation = nn.Tanh()
        else:
            raise ValueError("Unknown activation function!!!")
        self.dropout = nn.Dropout(dropout)

        self.mlp = nn.Sequential(
            self.dropout,
            nn.Linear(input_size, output_size),
            self.activation,
            self.dropout,
            nn.Linear(output_size, class_num)
        )

    def forward(self, x):
        """
        Arguments:
            x {torch.Tensor} -- [batch, features]

        Returns:
            logits {torch.Tensor} -- raw, unnormalized scores for each class. [batch, class_num]
        """
        logits = self.mlp(x)
        return logits

#### ESIM

In [26]:
from ESIM import ESIM

In [27]:
esim = ESIM(3, 10, 3)

In [28]:
p = torch.tensor([[2, 3, 4, 1], 
                  [3, 4, 1, 1]], dtype=torch.long)

h = torch.tensor([[2, 3, 1, 1], 
                  [3, 1, 1, 1]], dtype=torch.long)

In [29]:
logits = esim(p, h)

In [30]:
logits.shape

torch.Size([2, 3])