In [None]:
!pip install datasets==1.9.0

Collecting datasets==1.9.0
  Downloading datasets-1.9.0-py3-none-any.whl (262 kB)
[K     |████████████████████████████████| 262 kB 33.7 MB/s 
[?25hCollecting huggingface-hub<0.1.0
  Downloading huggingface_hub-0.0.14-py3-none-any.whl (43 kB)
[K     |████████████████████████████████| 43 kB 709 kB/s 
Collecting fsspec>=2021.05.0
  Downloading fsspec-2021.7.0-py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 118 kB 43.9 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 62.3 MB/s 
Installing collected packages: xxhash, huggingface-hub, fsspec, datasets
Successfully installed datasets-1.9.0 fsspec-2021.7.0 huggingface-hub-0.0.14 xxhash-2.0.2


##### This is the Keras implementation from the researchers themselves

In [None]:
class AttentionMLP(Layer):
    """Genre-aware attention layer (Keras).

    The layer is called on a two-element list ``[x, g]``. For every step ``t``
    along axis 1 of ``x`` it computes a scalar score

        e_t = v . activation(x_t W + g Wg + b)

    softmaxes the scores, and returns the score-weighted sum of ``x`` over
    axis 1.

    Shapes, as used by ``call``:
        x:      (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
        g:      (BATCH_SIZE, 1, GENRE_EMB_SIZE)
        output: (BATCH_SIZE, EMBED_SIZE)

    Args:
        units: width of the hidden projection; W is (EMBED_SIZE, units),
            Wg is (GENRE_EMB_SIZE, units), b and v are (units,).
        activation: name or callable resolved through ``activations.get``
            (the notebook author notes "selu" is what gets used).
        use_bias: stored and serialized, but not consulted by ``build`` or
            ``call`` -- the bias ``b`` is always created and added.
        kernel_initializer: initializer for ``W``.
        bias_initializer: initializer for ``b`` (``b_a`` in the paper).
        v_initializer: initializer for ``v``.
        Wg_initializer: initializer for ``Wg``.
        **kwargs: forwarded to ``Layer.__init__``; a bare ``input_dim`` is
            converted to ``input_shape``.
    """
    def __init__(self,
                 units,
                 activation=None,
                 use_bias=True,
                 kernel_initializer='glorot_uniform',
                 bias_initializer='ones',
                 v_initializer='glorot_uniform',
                 Wg_initializer='glorot_uniform',
                 **kwargs):
        if 'input_shape' not in kwargs and 'input_dim' in kwargs:
            kwargs['input_shape'] = (kwargs.pop('input_dim'),)
        self.units = units
        self.activation = activations.get(activation)
        self.use_bias = use_bias
        self.kernel_initializer = initializers.get(kernel_initializer)
        self.bias_initializer = initializers.get(bias_initializer)
        self.v_initializer = initializers.get(v_initializer)
        self.Wg_initializer = initializers.get(Wg_initializer)
        self.supports_masking = True
        super(AttentionMLP, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create W, Wg, b and v from the shapes of the two inputs."""
        assert isinstance(input_shape, (list, tuple)) and len(input_shape) == 2
        x_shape, g_shape = input_shape

        # Weight names are kept exactly as before so saved weights still load.
        self.W = self.add_weight(name="W_{:s}".format(self.name),
                                 shape=(x_shape[-1], self.units),
                                 initializer=self.kernel_initializer,
                                 trainable=True)

        self.Wg = self.add_weight(name="W_g{:s}".format(self.name),
                                  shape=(g_shape[-1], self.units),
                                  initializer=self.Wg_initializer,
                                  trainable=True)

        # b_a in the paper
        self.b = self.add_weight(name="b_{:s}".format(self.name),
                                 shape=(self.units,),
                                 initializer=self.bias_initializer,
                                 trainable=True)

        self.v = self.add_weight(name="v_{:s}".format(self.name),
                                 shape=(self.units,),
                                 initializer=self.v_initializer,
                                 trainable=True)

        super(AttentionMLP, self).build(input_shape)

    def call(self, xs, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1.

        Args:
            xs: list ``[x, g]`` (see the class docstring for shapes).
            mask: optional mask, or list of per-input masks; only the mask
                belonging to ``x`` is applied.
        """
        # NOTE(review): all modalities arrive already stacked in x, i.e. the
        # layer treats them as if each had already been projected to a common
        # EMBED_SIZE. The paper's single W_h over differently sized modalities
        # only makes sense under that assumption.
        x, g = xs
        g = K.squeeze(g, axis=1)
        # Genre contribution, broadcast over every step of x.
        atten_g = K.expand_dims(K.dot(g, self.Wg), axis=1)

        # This is score(x_i, g), not h_i as the paper's notation suggests.
        et = self.activation(K.dot(x, self.W) + atten_g + self.b)
        et = K.dot(et, self.v)

        # Alpha scores, one per step of x.
        at = K.softmax(et)

        # With list inputs Keras passes a list of masks. The original code
        # tested mask[0] but then cast the whole list, which cannot work;
        # pick out the mask that belongs to x.
        x_mask = mask[0] if isinstance(mask, (list, tuple)) else mask
        if x_mask is not None:
            # The weights are not renormalised after masking (unchanged from
            # the original implementation).
            at *= K.cast(x_mask, K.floatx())

        # ot: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
        atx = K.expand_dims(at, axis=-1)
        ot = atx * x
        # output: (BATCH_SIZE, EMBED_SIZE) -- this is r in the paper
        return K.sum(ot, axis=1)

    def compute_mask(self, input, input_mask=None):
        """Stop mask propagation: downstream layers receive no mask."""
        return None

    def compute_output_shape(self, input_shape):
        """Output shape is (BATCH_SIZE, EMBED_SIZE), taken from x's shape."""
        return (input_shape[0][0], input_shape[0][-1])

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created.

        Without ``units`` and the other entries, ``from_config`` cannot
        rebuild the layer because ``units`` is a required argument.
        """
        config = {
            'units': self.units,
            'activation': activations.serialize(self.activation),
            'use_bias': self.use_bias,
            'kernel_initializer': initializers.serialize(self.kernel_initializer),
            'bias_initializer': initializers.serialize(self.bias_initializer),
            'v_initializer': initializers.serialize(self.v_initializer),
            'Wg_initializer': initializers.serialize(self.Wg_initializer),
        }
        base_config = super(AttentionMLP, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

### From Paper to PyTorch

In [7]:
!pip install torch



In [15]:
import torch.nn as nn
import torch

In [265]:
class Genre_Aware_Attention_Model(nn.Module):
  """Genre-aware attention over two input modalities (PyTorch port).

  Each modality is projected to ``num_units`` features. A scalar score is
  computed per modality from its SELU-activated projection and the genre
  vector; the two scores are softmaxed into weights, and the weighted sum of
  the two projections is passed to a linear output layer.

  Args:
    num_units: width of the shared hidden/attention space. Every internal
      layer is sized from this value (the original hard-coded 100 for the
      linear layers, so any other ``num_units`` failed in ``torch.mm``).
    trans_dim: feature size of ``x1`` (default 768, as before).
    c5g_dim: feature size of ``x2`` (default 376417, as before).
    genre_dim: length of the genre vector ``g`` (default 8, as before).
    num_classes: number of output logits (default 2, as before).
  """

  def __init__(self, num_units, trans_dim=768, c5g_dim=376417, genre_dim=8,
               num_classes=2):
    super(Genre_Aware_Attention_Model, self).__init__()
    self.num_units = num_units

    # Parameters are created in the same order as before, so a fixed seed
    # still yields the same initial weights for the default sizes.
    self.hidden_trans = nn.Linear(trans_dim, num_units)
    self.hidden_c5g = nn.Linear(c5g_dim, num_units)

    self.activation = nn.SELU()

    # v: projects an activated hidden vector to a scalar score.
    self.v = nn.Parameter(
        nn.init.xavier_uniform_(torch.empty(num_units, 1)),
        requires_grad=True
    )

    # Wa: shared attention weight applied to each modality's hidden vector.
    self.Wa = nn.Parameter(
        nn.init.xavier_uniform_(torch.empty(num_units, num_units)),
        requires_grad=True
    )

    # Wg: maps the genre vector into the attention space.
    self.Wg = nn.Parameter(
        nn.init.xavier_uniform_(torch.empty(genre_dim, num_units)),
        requires_grad=True
    )

    # Softmax across the stacked modality scores.
    self.softmax = nn.Softmax(dim=0)

    self.out = nn.Linear(num_units, num_classes)

  def forward(self, x1, x2, g):
    """Score both modalities against the genre and classify their blend.

    The ``torch.mm`` / ``torch.dot`` calls below require one sample at a
    time: ``x1`` as (1, trans_dim), ``x2`` as (1, c5g_dim) and ``g`` as
    (genre_dim,).

    Returns:
      Tensor of shape (num_classes,) with the output layer's raw values.
    """
    x1_dense = self.hidden_trans(x1)
    x2_dense = self.hidden_c5g(x2)

    # (1, num_units) genre term shared by both modality scores.
    atten_g = torch.mm(torch.unsqueeze(g, 0), self.Wg)

    h1 = self.activation(x1_dense)
    h2 = self.activation(x2_dense)

    v_flat = torch.squeeze(self.v)
    h1_score = torch.dot(
        torch.squeeze(self.activation(torch.mm(h1, self.Wa) + atten_g)), v_flat)
    h2_score = torch.dot(
        torch.squeeze(self.activation(torch.mm(h2, self.Wa) + atten_g)), v_flat)

    alphas = self.softmax(torch.stack([h1_score, h2_score]))

    # NOTE(review): the weights scale the pre-activation projections
    # (x*_dense), not h1/h2 that were scored -- kept as in the original;
    # confirm against the paper's definition of r.
    x1_scaled = x1_dense * alphas[0]
    x2_scaled = x2_dense * alphas[1]

    r = torch.sum(
        torch.stack([torch.squeeze(x1_scaled), torch.squeeze(x2_scaled)]),
        dim=0)
    return self.out(r)

In [267]:
# One-hot genre vector of length 8 with the first genre selected.
sample_g = torch.Tensor([1.0] + [0.0] * 7)

In [268]:
# Call the module itself rather than .forward() so that any registered
# hooks run as part of the call.
# NOTE(review): the_model, sample_x1 and sample_x2 are not defined in the
# cells shown here; they must exist in the kernel before this cell runs.
oy = the_model(sample_x1, sample_x2, sample_g)

tensor([0.2521, 0.7479], grad_fn=<SoftmaxBackward>)


In [264]:
# Display the value currently bound to `oy`.
oy

tensor([0.0459, 0.1587], grad_fn=<AddBackward0>)

In [None]:
def ga_model_train(model):
  """Set up the loss function and optimizer for training `model`.

  Args:
    model: module whose ``parameters()`` are handed to the optimizer.
  """
  # define the optimization
  criterion = nn.CrossEntropyLoss()
  # Adam lives in torch.optim (torch.nn has no Adam) and takes no `momentum`
  # argument; its first-moment decay is betas[0], so the 0.9 goes there.
  optimizer = torch.optim.Adam(model.parameters(), lr=0.01, betas=(0.9, 0.999))