#### Exercise

Use the following Transformer architecture for emotion classification.

In [1]:
import torch 
from torch import nn 
import math

# Model hyper-parameters
d_model = 100                 # width of each token vector fed to the encoder
embedding_size = d_model      # input embeddings use the same width as the model
batch_size = 30
sequence_length = 7
num_layer = 2                 # number of stacked encoder layers
nhead = 4  # number of attention heads (Transformer-specific setting)


# Synthetic input data
fake_embedding = torch.rand(batch_size, sequence_length, embedding_size)

# Boolean mask of shape (batch_size, sequence_length); only the last two
# positions of every sequence are flagged True.
fake_mask = torch.zeros(batch_size, sequence_length, dtype=torch.bool)
fake_mask[:, -2:] = True

# init model = PE (positional encoding) layer + Transformer encoder
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional-encoding table.

    The table is precomputed once for ``max_len`` positions and stored as the
    non-trainable buffer ``pe`` with shape ``(max_len, 1, d_model)``.
    ``forward`` does not modify its input: it returns the slice of the table
    matching the input's sequence length, shaped so that it broadcasts when
    added to the input.

    Args:
        d_model: Size of the embedding dimension.
        dropout: Probability for the ``dropout`` sub-module.
        max_len: Largest sequence length the table supports.
        batch_first: ``True`` if inputs are ``(batch, seq, d_model)``,
            ``False`` if they are ``(seq, batch, d_model)``.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000,
                 *, batch_first: bool = True):
        super().__init__()
        # NOTE: this module is constructed but not applied in ``forward``,
        # because ``forward`` returns only the encoding, not ``x + encoding``.
        self.dropout = nn.Dropout(p=dropout)
        self.batch_first = batch_first

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        # An odd d_model has one fewer odd index than even index, so trim the
        # frequencies to match (a no-op when d_model is even).
        pe[:, 0, 1::2] = torch.cos(position * div_term[: d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return the positional encodings for the positions present in ``x``.

        Args:
            x: Tensor whose sequence axis is dim 1 when ``batch_first`` is
                true, otherwise dim 0. Only its size is read.

        Returns:
            Tensor of shape ``(1, seq_len, d_model)`` when ``batch_first`` is
            true, otherwise ``(seq_len, 1, d_model)``.

        Raises:
            ValueError: If the sequence length exceeds the table size.
        """
        seq_len = x.size(1) if self.batch_first else x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(0)}"
            )
        encoding = self.pe[:seq_len]  # (seq_len, 1, d_model)
        if self.batch_first:
            # Move the position axis next to the batch axis so each token in a
            # sequence receives the encoding of its own position.
            return encoding.transpose(0, 1)  # (1, seq_len, d_model)
        return encoding

# Build the model: a stack of Transformer encoder layers plus the
# positional-encoding module.
encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
transformer_layer = nn.TransformerEncoder(encoder_layer=encoder_layer, num_layers=num_layer)
pos_encoding = PositionalEncoding(d_model, 0.2)

# Forward pass: add the positional encoding to the embeddings, then encode.
# fake_mask is passed as the key-padding mask.
positioned_input = fake_embedding + pos_encoding(fake_embedding)
sent_vector = transformer_layer(positioned_input, src_key_padding_mask=fake_mask)
# Use the hidden state of the first token as the sentence representation.
sent_vector = sent_vector[:, 0]

# Output shape
print('sentence vector shape =', sent_vector.shape)



sentence vector shape = torch.Size([30, 100])
