In [1]:
!pip install transformers
!pip install ipywidgets
!pip install IProgress

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
import transformers
from transformers import BertTokenizer, BertConfig
from transformers import BertModel, BertForSequenceClassification

import torch
import torch.nn as nn

In [3]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [4]:
# steps:

# 1. define a paragraph: two AMs, two ACs
# 2. define spans of AMs and ACs
# 3. tokenize
# 7. put them in a BERT, get the output and do the pooling.
# 8. 
# 8. concatenate the output
# 9. give it to the combined bert
# 10. get the FC layer result.

In [5]:
paragraph = """
Some people may argue that children will be more material,
neglect their study for earning money or be exploited by the
employers. However, if children get good care and
instructions from their parents, they can take advantages of the
work to learn valuable things and avoid going in a wrong way.
"""

In [6]:
# define spans
# these spans are inclusive

am_1_span = (0, 4)
ac_1_span = (5, 23)

am_2_span = (24, 25)
ac_2_span = (26, 55)

In [7]:
tokenized_paragraph = tokenizer(paragraph, return_tensors="pt")

In [8]:
tokenized_paragraph

{'input_ids': tensor([[  101,  2070,  2111,  2089,  7475,  2008,  2336,  2097,  2022,  2062,
          3430,  1010, 19046,  2037,  2817,  2005,  7414,  2769,  2030,  2022,
         18516,  2011,  1996, 12433,  1012,  2174,  1010,  2065,  2336,  2131,
          2204,  2729,  1998,  8128,  2013,  2037,  3008,  1010,  2027,  2064,
          2202, 12637,  1997,  1996,  2147,  2000,  4553,  7070,  2477,  1998,
          4468,  2183,  1999,  1037,  3308,  2126,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [9]:
def get_span_representation(output, span_start, span_end):
    
    # tokenized_input = tokenizer(PARAGRAPH, return_tensors="pt")
    # outputs = model(**tokenized_input, output_hidden_states=True)
    
    # hidden_states = outputs[1] #  outputs[1] instead of [2], because we have no loss now, because we passed no labels
    
    
    # kuri-minus = h[j] - h[i-1]; h[i] - h[j+1]; h[i-1]; h[j+1]
    
    # first_term
    
    span_end_tensor = output[0][0][span_end] # outputs[1] instead of [2], because we have no loss now, because we passed no labels
    span_start_tensor = output[0][0][span_start - 1] # outputs[1] instead of [2], because we have no loss now, because we passed no labels
    span_first_term_tensor = span_end_tensor - span_start_tensor
    
    # second_term 
    
    span_start_tensor_2 = output[0][0][span_start] # outputs[1] instead of [2], because we have no loss now, because we passed no labels
    span_end_tensor_2 = output[0][0][span_end + 1] # outputs[1] instead of [2], because we have no loss now, because we passed no labels
    span_second_term_tensor = span_start_tensor_2 - span_end_tensor_2
    
    # third_term 
    
    span_tensor_3 = output[0][0][span_start - 1] # outputs[1] instead of [2], because we have no loss now, because we passed no labels
    
    
    # fourth_term 
    
    span_tensor_4 = output[0][0][span_end + 1] # outputs[1] instead of [2], because we have no loss now, because we passed no labels
    
    span_minus_tensor = torch.cat((span_first_term_tensor, span_second_term_tensor, span_tensor_3, span_tensor_4))
    
    return span_minus_tensor

In [10]:
class CustomBERTKuri(nn.Module):

    def __init__(self, first_model, model_am, model_ac, nr_classes):
        
        super(CustomBERTKuri, self).__init__()
        
        self.first_model = first_model
        
        self.intermediate_linear_am = nn.Linear(3072, 768)
        self.intermediate_linear_ac = nn.Linear(3072, 768)
        
        
        self.model_am = model_am
        self.model_ac = model_ac
        
        self.nr_classes = nr_classes
                
        self.fc = nn.Linear(self.model_am.config.hidden_size + self.model_ac.config.hidden_size, self.nr_classes)
        

    def forward(self, tokenized_paragraph):
        
        paragraph_output = self.first_model(**tokenized_paragraph)
        
        # print(paragraph_output[0].shape)
        
        #output_am_1 = self.first_model(**am_1_tokenized_input)
        #output_am_2 = self.first_model(**am_2_tokenized_input)
        
        #output_ac_1 = self.first_model(**ac_1_tokenized_input)
        #output_ac_2 = self.first_model(**ac_2_tokenized_input)
        
        am_1_span_representation = get_span_representation(paragraph_output, am_1_span[0] + 1, am_1_span[1] + 1)
        am_2_span_representation = get_span_representation(paragraph_output, am_2_span[0] + 1, am_2_span[1] + 1)
        
        ac_1_span_representation = get_span_representation(paragraph_output, ac_1_span[0] + 1, ac_1_span[1] + 1)
        ac_2_span_representation = get_span_representation(paragraph_output, ac_2_span[0] + 1, ac_2_span[1] + 1)
        
        print(am_1_span_representation.shape)
        
        
#         am_1_span_representation = self.intermediate_linear_am(am_1_span_representation)
#         am_2_span_representation = self.intermediate_linear_am(am_2_span_representation)
        
#         ac_1_span_representation = self.intermediate_linear_ac(ac_1_span_representation)
#         ac_2_span_representation = self.intermediate_linear_ac(ac_2_span_representation)
        
#         print(am_1_span_representation.shape)
#         print(ac_1_span_representation.shape)
        
        # the linear business works in batching mode if done already.
        # do it already               
        
        am_tensor = torch.vstack([am_1_span_representation, am_2_span_representation])    
        am_tensor = am_tensor.unsqueeze_(0)             
        am_tensor = am_tensor.expand(1,2,3072)
        
        print(am_tensor.shape)
        
#         ac_tensor = torch.vstack([ac_1_span_representation, ac_2_span_representation])    
#         ac_tensor = ac_tensor.unsqueeze_(0)            
#         ac_tensor = ac_tensor.expand(1,2,768)
        
#         print(ac_tensor.shape)
        
        
        
        # print(batched_tensor.shape)
        
        # output_am_1_span_rep = self.model_am(inputs_embeds = am_1_span_representation)
        
        # batching is ok. 1 x 2 x 3072
        
        # first question is: what about the CLS/SEP? so this question is solved
        # second question is: 4 AMs. 1 x 2 x 3072
        
        
        # am_tensor is of shape 2 x 768. to get the first one, we can do am_tensor[0]
        
            
        try:

            output_am_1_span_rep = self.model_am(inputs_embeds = am_tensor)


        except RuntimeError as err_inst:

            # print("Oops!  That was no valid number.  Try again...")
            print(input_embeds)
            print(err_inst)


         # 1 x 2 x 768
        
        print('passed am model')
        
        print(output_am_span_rep[0].shape) # 1 x 2 x 768
        
        output_am_2_span_rep = self.model_am(inputs_embeds = am_2_span_representation)
        
        output_ac_span_rep = self.model_ac(inputs_embeds = ac_1_span_representation)
        
        print('passed ac model')
        
        print(output_ac_span_rep[0].shape)
        
        output_ac_2_span_rep = self.model_ac(inputs_embeds = ac_2_span_representation)
        
        # change the rest of the 3 according to the same trick above
        
        # until here
        
        # concatenation with sequentiality
        
        
        adu_representations = torch.cat([output_am_span_rep[0], output_ac_span_rep[0]], dim=2)
        print(adu_representations.shape)
        # adu_2_representation = torch.cat([output_am_span_rep[0][0] + output_ac_span_rep[0][0]], dim=0)
        
#         adu_1_rep = get_concatenated_adu_representations(output_am_1_span_rep, output_ac_1_span_rep)
#         adu_2_rep = get_concatenated_adu_representations(output_am_2_span_rep, output_ac_2_span_rep)
        
        # output = model_combined(inputs_embeds = output[0])
        
        output = self.fc(adu_representations)
        #output_2 = self.fc(adu_2_rep)
        
        print(output.shape)
        
    
        return output

### Run

In [11]:
first_model = BertModel(BertConfig.from_pretrained("bert-base-cased"))

In [12]:
model_am = BertModel(BertConfig.from_pretrained("bert-base-cased"))

In [13]:
model_am

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [14]:
model_am.config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

In [15]:
model_am.embeddings.word_embeddings.embedding_dim = 3072
model_am.embeddings.position_embeddings.embedding_dim = 3072
model_am.embeddings.token_type_embeddings.embedding_dim = 3072
model_am.embeddings.LayerNorm.normalized_shape = (3072,)

In [16]:
for i in range(0, 12, 1):
    
    model_am.encoder.layer[i].attention.self.query.in_features = 3072
    model_am.encoder.layer[i].attention.self.key.in_features = 3072
    model_am.encoder.layer[i].attention.self.value.in_features = 3072

    model_am.encoder.layer[i].attention.self.query.out_features = 3072
    model_am.encoder.layer[i].attention.self.key.out_features = 3072
    model_am.encoder.layer[i].attention.self.value.out_features = 3072

    model_am.encoder.layer[i].attention.output.LayerNorm.normalized_shape = (3072,)
    model_am.encoder.layer[i].attention.output.dense.in_features = 3072
    model_am.encoder.layer[i].attention.output.dense.out_features = 3072

    model_am.encoder.layer[i].intermediate.dense.in_features = 3072

    model_am.encoder.layer[i].output.dense.out_features = 3072
    model_am.encoder.layer[i].output.LayerNorm.normalized_shape = (3072,)

In [17]:
model_am.pooler.dense.in_features = 3072
model_am.pooler.dense.out_features = 3072

In [18]:
# model_am.encoder.layer[1].attention.self.query.in_features = 3072

In [19]:
model_am

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 3072, padding_idx=0)
    (position_embeddings): Embedding(512, 3072)
    (token_type_embeddings): Embedding(2, 3072)
    (LayerNorm): LayerNorm((3072,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=3072, out_features=3072, bias=True)
            (key): Linear(in_features=3072, out_features=3072, bias=True)
            (value): Linear(in_features=3072, out_features=3072, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=3072, out_features=3072, bias=True)
            (LayerNorm): LayerNorm((3072,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=Fals

In [20]:
model_am.embeddings.token_type_embeddings

Embedding(2, 3072)

In [21]:
model_ac = BertModel(BertConfig.from_pretrained("bert-base-cased"))

In [22]:
# model_combined = BertModel(BertConfig.from_pretrained("bert-base-cased"))

In [23]:
custom_model = CustomBERTKuri(first_model, model_am, model_ac, 3)

In [24]:
our_output = custom_model(tokenized_paragraph)

torch.Size([3072])
torch.Size([1, 2, 3072])


NameError: name 'input_embeds' is not defined

In [None]:
our_output