<h1> GNN Initial Training </h1>



Following a procedure loosely based on:

Some background reading and resources:
    1. https://distill.pub/2021/gnn-intro/ 
    2. https://arxiv.org/pdf/1910.02356v2.pdf

In [1]:
#use pyto_env kernel, pyto env.

import os
import json
import pandas as pd
import importlib
import pickle
import numpy as np

import torch
from torch.optim import AdamW
from torch.utils.data import (TensorDataset, 
                              DataLoader, 
                              RandomSampler, 
                              SequentialSampler)

from transformers import BertTokenizer, BertForSequenceClassification
from torchmetrics import F1Score

import common_metrics
from common_metrics import plot_one_vs_one_roc

from common import ClassificationDataset

import TextLevelGCN 
from TextLevelGCN.data_helper import DataHelper
from TextLevelGCN.model import Model
from TextLevelGCN import model, buildGraph, train
from TextLevelGCN.pmi import cal_PMI

from nltk.tokenize import word_tokenize




In [225]:
project_dir = "/Users/paulp/Library/CloudStorage/OneDrive-UniversityofEasternFinland/UEF/Thesis"
data_dir = os.path.join(project_dir,"Data")
model_dir = os.path.join(project_dir, "Models")
#roberta_dir = os.path.join(model_dir, 'robert-classifier')

os.chdir(data_dir)

#L1 to integer map for loading categories into BERT
with open('target_idx.json') as f:
    data = f.read()
target_idx = json.loads(data)
idx_target = {target_idx[a]:a for a in target_idx.keys()}


# additional special tokens
with open('spec_tokens_ne.txt', 'rb') as file:
    spec_tokens = pickle.load(file)
spec_tokens = [a for a in spec_tokens if '-' not in a]

# Load from data directory
dataset = pd.read_csv('masked_data_set.csv', index_col = 0).reset_index(drop=True)
ds_tr = pd.read_csv('train.csv')
ds_vl = pd.read_csv('validation.csv')
ds_ts = pd.read_csv('test.csv')
ds_tr = pd.concat([ds_tr, ds_vl], axis=0) # just use val set in training and test for validation. fewer operations

In [483]:
#keep tokenizer from other project for comparable results
tokenizer_path = os.path.join(model_dir, 'custom_bert_tokenizer')
custom_bert_tokenizer = BertTokenizer.from_pretrained(tokenizer_path,
                                              additional_special_tokens = spec_tokens)

# you need the 
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              additional_special_tokens = spec_tokens)

# bert config for maintaining vocab size and getting raw embeddings after tokenization
with open(os.path.join(model_dir, 'custom-bert/config.json'), 'r') as file:
    config = file.read()
    config = json.loads(config)
bert_config = BertConfig.from_dict(config) 

# this is the class instance for retrieving embeddings for consistency with BERT
#embeddings = BertEmbeddings(bert_config)
embeddings = BertEmbeddings(bert_config)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


AttributeError: 'str' object has no attribute 'vocab_size'

In [491]:
help(BertForSequenceClassification)

Help on class BertForSequenceClassification in module transformers.models.bert.modeling_bert:

class BertForSequenceClassification(BertPreTrainedModel)
 |  BertForSequenceClassification(config)
 |  
 |  Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
 |  output) e.g. for GLUE tasks.
 |  
 |  
 |  This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
 |  library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
 |  etc.)
 |  
 |  This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
 |  Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
 |  and behavior.
 |  
 |  Parameters:
 |      config ([`BertConfig`]): Model configuration class with all the parameters of the model.
 |          Initializi

In [479]:
from transformers import BertForMaskedLM
help(BertForMaskedLM)

Help on class BertForMaskedLM in module transformers.models.bert.modeling_bert:

class BertForMaskedLM(BertPreTrainedModel)
 |  BertForMaskedLM(config)
 |  
 |  Bert Model with a `language modeling` head on top.
 |  
 |  This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
 |  library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
 |  etc.)
 |  
 |  This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
 |  Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
 |  and behavior.
 |  
 |  Parameters:
 |      config ([`BertConfig`]): Model configuration class with all the parameters of the model.
 |          Initializing with a config file does not load the weights associated with the model, only the
 |          configuration. Check out the [`~PreTrainedModel

In [39]:
batch_size = 2
lr = 1e-5
epochs = 2
max_len = 512

device = 'mps' if torch.has_mps else 'cpu'

In [451]:
import os
import torch
import csv


class GCNDataHelper(object):
    '''
    customized from source code for use with BERT tokenizers.
    
    '''
    def __init__(self, 
                 dataset, 
                 tokenize_fn, 
                 target_idx,
                 device,
                 mode='train', 
                 vocab=None):

        self.mode = mode
        self.device = device
        self.dataset = dataset # pd.DataFrame object
        self.tokenizer = tokenizer # bert custom trained tokenizer
        self.labels_str = target_idx
        self.label = self.label_to_onehot()
        content, label = self.get_content() 
        self.vocab = [a for a in self.tokenizer.vocab]
        self.d = self.tokenizer.vocab # is this supposed to be integer IDS?
        self.content = self.dataset['Text'].apply(lambda x: self.word2id(x)).tolist()

    def label_to_onehot(self): # is this actually one-hots or label index values?
        target_indices = [self.labels_str[a] for a in dataset['Target']]
        return target_indices 
    
    def get_content(self):
        label = self.dataset['Target'].tolist()
        content = self.dataset['Text'].tolist()
        #content, label = zip(content, label)
        return content, label
    
    def word2id(self, 
                sample, # text from dataset
                return_token_type_ids = False, 
                return_attention_mask = False,):
        '''
        return BERT integer IDs for embedding lookup. 
        Uses BERT WordPiece tokenizer from DataHelper init
        '''
        result = self.tokenizer.encode_plus(sample,
                                            truncation = True,
                                            return_token_type_ids = return_token_type_ids,
                                            return_attention_mask = return_attention_mask,
                                           return_tensors = 'pt') # max_len should be passed in the tokenizer
        return result['input_ids'].squeeze()

    def build_vocab_and_freq(self, dataset['Text'], min_count = 24): # returns list of tokens only?
        '''
        attaches tokens only (no IDs) from the BERT tokenizer as an attribute.
        '''
        vocab = []
        freq = {}

        for c in content:
            words = tokenize_fn(c)
            for word in words:
                if word not in vocab:
                    vocab.append(word)
                    freq[word] = 1
                else:
                    freq[word] += 1

        results = []
        for word in freq.keys():
            if freq[word] < min_count:
                continue
            else:
                results.append(word)

        results.insert(0, 'UNK')
        with open(os.path.join(self.base, 'vocab-5.txt'), 'w') as f:
            f.write('\n'.join(results))

        self.vocab = results

    def count_word_freq(self, content):
        freq = dict(zip(self.vocab, [0 for i in range(len(self.vocab))]))

        for c in content:
            tokens = tokenizer.tokenize(c)
            for tok in tokens:
                freq[word] += 1

        with open('gcn_bert_freq.csv', 'w') as f:
            writer = csv.writer(f)
            results = list(zip(freq.keys(), freq.values()))
            writer.writerows(results)

    def batch_iter(self, batch_size, num_epoch):
        for i in range(num_epoch):
            num_per_epoch = int(len(self.content) / batch_size)
            for batch_id in range(num_per_epoch):
                start = batch_id * batch_size
                end = min((batch_id + 1) * batch_size, len(self.content))

                content = self.content[start:end]
                label = self.label[start:end]

                yield content, torch.tensor(label).to(device), i


#if __name__ == '__main__':
#    data_helper = DataHelper(dataset='r8', tokenizer=tokenizer)
#    content, label = data_helper.get_content()
    #data_helper.build_vocab(content)

In [452]:
helper = GCNDataHelper(dataset, 
              tokenizer = bert_tokenizer, 
              target_idx = target_idx,
              device = device,
              mode='train', 
              vocab=None)

PMI - (Positive) Pointwise Mutual Information
https://en.wikipedia.org/wiki/Pointwise_mutual_information


In [382]:
helper.d['<?>']#['[PAD]']

5

In [434]:
def cal_PMI(helper, window_size=20):
    '''
    
    calculate Positive Pointwise Mutual Information across the dataset.
    Inputs: object of GCNDataHelper class
    Outputs: edge weights, edge mappings, and counts number of edges
    
    '''
    len_vocab = len(helper.vocab)
    pair_count_matrix = np.zeros((len_vocab, len_vocab), dtype=int)
    word_count =np.zeros(len_vocab, dtype=int)
    
    for sample in helper.content:
        for i, word in enumerate(sample):
            try:
                word_count[word] += 1
            except KeyError:
                continue
            start_index = max(0, i - window_size)
            end_index = min(len(sample), i + window_size)
            for j in range(start_index, end_index):
                if i == j:
                    continue
                else:
                    target_word = sample[j]
                    try:
                        pair_count_matrix[word, target_word] += 1
                    except KeyError:
                        continue
        
    total_count = np.sum(word_count)
    word_count = word_count / total_count
    pair_count_matrix = pair_count_matrix / total_count
    
    pmi_matrix = np.zeros((len_vocab, len_vocab), dtype=float)
    for i in range(len_vocab):
        for j in range(len_vocab):
            pmi_matrix[i, j] = np.log(
                pair_count_matrix[i, j] / (word_count[i] * word_count[j]) 
            )
    
    # removes nan values due to division by zero above
    pmi_matrix = np.nan_to_num(pmi_matrix)
    
    # positive PMI - remove all negative values 
    pmi_matrix = np.maximum(pmi_matrix, 0.0)

    edges_weights = [0.0]
    count = 1
    edges_mappings = np.zeros((len_vocab, len_vocab), dtype=int)
    for i in range(len_vocab):
        for j in range(len_vocab):
            if pmi_matrix[i, j] != 0:
                edges_weights.append(pmi_matrix[i, j])
                edges_mappings[i, j] = count
                count += 1

    edges_weights = np.array(edges_weights)

    edges_weights = edges_weights.reshape(-1, 1)
    # print(edges_weights.shape)
    edges_weights = torch.Tensor(edges_weights)
    
    return edges_weights, edges_mappings, count


In [435]:
edge_weights, edge_mappings, counts = cal_PMI(helper, window_size = 10)

  pair_count_matrix[i, j] / (word_count[i] * word_count[j])
  pmi_matrix[i, j] = np.log(


In [436]:
edge_weights

tensor([[ 0.0000],
        [ 1.6155],
        [ 0.6170],
        ...,
        [10.0018],
        [10.6275],
        [11.1981]])

In [441]:
sum(sum(edge_mappings == 0))

497411970

In [445]:
sum(sum(edge_mappings != 0))

8838030

In [443]:
22500**2

506250000

In [438]:
counts

8838031

In [None]:
def gcn_msg(edge):
    return {'m': edge.src['h'], 'w': edge.data['w']}


def gcn_reduce(node):
    w = node.mailbox['w']

    new_hidden = torch.mul(w, node.mailbox['m'])

    new_hidden,_ = torch.max(new_hidden, 1)

    node_eta = torch.sigmoid(node.data['eta'])
    # node_eta = F.leaky_relu(node.data['eta'])

    # new_hidden = node_eta * node.data['h'] + (1 - node_eta) * new_hidden
    # print(new_hidden.shape)

    return {'h': new_hidden}

In [400]:
# Mojave-pku implementation 

import dgl
import torch
import torch.nn.functional as F
import numpy as np
#import word2vec
from transformers.models.bert.modeling_bert import BertEmbeddings, BertConfig

def gcn_msg(edge):
    return {'m': edge.src['h'], 'w': edge.data['w']}


def gcn_reduce(node):
    w = node.mailbox['w']

    new_hidden = torch.mul(w, node.mailbox['m'])

    new_hidden,_ = torch.max(new_hidden, 1)

    node_eta = torch.sigmoid(node.data['eta'])
    # node_eta = F.leaky_relu(node.data['eta'])

    # new_hidden = node_eta * node.data['h'] + (1 - node_eta) * new_hidden
    # print(new_hidden.shape)

    return {'h': new_hidden}


class Model(torch.nn.Module):
    def __init__(self,
                 helper,
                 class_num,
                 hidden_size_node,
                 #vocab,
                 n_gram,
                 drop_out,
                 edges_num,
                 edges_matrix,
                 max_length=350,
                 trainable_edges=True,
                 pmi=None,
                 cuda=True
                 ):
        super(Model, self).__init__()
        self.helper = helper
        self.is_cuda = cuda
        self.vocab = helper.vocab
        # print(len(vocab))
        self.seq_edge_w = torch.nn.Embedding(edges_num, 1)
        print(edges_num)
        print(pmi.shape)

        self.node_hidden = torch.nn.Embedding(len(vocab), hidden_size_node)
        
        self.seq_edge_w = torch.nn.Embedding.from_pretrained(pmi, freeze=True)
            
        self.edges_num = edges_num
        if trainable_edges:
            self.seq_edge_w = torch.nn.Embedding.from_pretrained(torch.ones(edges_num, 1), freeze=False)
        else:
            self.seq_edge_w = torch.nn.Embedding.from_pretrained(pmi, freeze=True)

        self.hidden_size_node = hidden_size_node

        self.node_hidden.weight.data.copy_(torch.tensor(self.load_embeddings(embeddings)))
        self.node_hidden.weight.requires_grad = True

        self.len_vocab = len(vocab)

        self.ngram = n_gram

        self.d = helper.d

        self.max_length = max_length

        self.edges_matrix = edges_matrix

        self.dropout = torch.nn.Dropout(p=drop_out)

        self.activation = torch.nn.ReLU()

        self.Linear = torch.nn.Linear(hidden_size_node, class_num, bias=True)

    def word2id(self, word):
        try:
            result = self.d[word]
        except KeyError:
            result = self.d['UNK']

        return result

    def load_embeddings(self, embeddings):

        embedding_matrix = []
        for word in self.vocab:
            try:
                int_id = self.helper.tokenizer.vocab[word]
                embedding_matrix.append(embeddings.word_embeddings[int_id])
            except KeyError:
                print(word)
                # what's the rationale behind putting 'the' in the missing embedding?
                embedding_matrix.append(embeddings.word_embeddings[146]) 

        embedding_matrix = np.array(embedding_matrix)

        return embedding_matrix

    def add_all_edges(self, doc_ids: list, old_to_new: dict):
        edges = []
        old_edge_id = []

        local_vocab = list(set(doc_ids))

        for i, src_word_old in enumerate(local_vocab):
            src = old_to_new[src_word_old]
            for dst_word_old in local_vocab[i:]:
                dst = old_to_new[dst_word_old]
                edges.append([src, dst])
                old_edge_id.append(self.edges_matrix[src_word_old, dst_word_old])

            # self circle
            edges.append([src, src])
            old_edge_id.append(self.edges_matrix[src_word_old, src_word_old])

        return edges, old_edge_id

    def add_seq_edges(self, doc_ids: list, old_to_new: dict):
        edges = []
        old_edge_id = []
        for index, src_word_old in enumerate(doc_ids):
            src = old_to_new[src_word_old]
            for i in range(max(0, index - self.ngram), min(index + self.ngram + 1, len(doc_ids))):
                dst_word_old = doc_ids[i]
                dst = old_to_new[dst_word_old]

                # - first connect the new sub_graph
                edges.append([src, dst])
                # - then get the hidden from parent_graph
                old_edge_id.append(self.edges_matrix[src_word_old, dst_word_old])

            # self circle
            edges.append([src, src])
            old_edge_id.append(self.edges_matrix[src_word_old, src_word_old])

        return edges, old_edge_id

    def seq_to_graph(self, doc_ids: list) -> dgl.DGLGraph():
        if len(doc_ids) > self.max_length:
            doc_ids = doc_ids[:self.max_length]

        local_vocab = set(doc_ids)

        old_to_new = dict(zip(local_vocab, range(len(local_vocab))))

        if self.is_cuda:
            local_vocab = torch.tensor(list(local_vocab)).cuda()
        else:
            local_vocab = torch.tensor(list(local_vocab))

        sub_graph = dgl.DGLGraph()

        sub_graph.add_nodes(len(local_vocab))
        local_node_hidden = self.node_hidden(local_vocab)

        sub_graph.ndata['h'] = local_node_hidden

        seq_edges, seq_old_edges_id = self.add_seq_edges(doc_ids, old_to_new)

        edges, old_edge_id = [], []
        # edges = []

        edges.extend(seq_edges)

        old_edge_id.extend(seq_old_edges_id)

        if self.is_cuda:
            old_edge_id = torch.LongTensor(old_edge_id).cuda()
        else:
            old_edge_id = torch.LongTensor(old_edge_id)

        srcs, dsts = zip(*edges)
        sub_graph.add_edges(srcs, dsts)
        try:
            seq_edges_w = self.seq_edge_w(old_edge_id)
        except RuntimeError:
            print(old_edge_id)
        sub_graph.edata['w'] = seq_edges_w

        return sub_graph

    def forward(self, doc_ids, is_20ng=None):
        sub_graphs = [self.seq_to_graph(doc) for doc in doc_ids]

        batch_graph = dgl.batch(sub_graphs)

        batch_graph.update_all(
            message_func=dgl.function.src_mul_edge('h', 'w', 'weighted_message'),
            reduce_func=dgl.function.max('weighted_message', 'h')
        )

        h1 = dgl.sum_nodes(batch_graph, feat='h')

        drop1 = self.dropout(h1)
        act1 = self.activation(drop1)

        l = self.Linear(act1)

        return l

In [480]:
helper.tokenizer.vocab

OrderedDict([('[PAD]', 0),
             ('[UNK]', 1),
             ('[CLS]', 2),
             ('[SEP]', 3),
             ('[MASK]', 4),
             ('<?>', 5),
             ('<*>', 6),
             ('<R>', 7),
             ('<MISC>', 8),
             ('<ORG>', 9),
             ('<LOC>', 10),
             ('<PER>', 11),
             ('!', 12),
             ('"', 13),
             ('#', 14),
             ('$', 15),
             ('%', 16),
             ('&', 17),
             ("'", 18),
             ('(', 19),
             (')', 20),
             ('*', 21),
             ('+', 22),
             (',', 23),
             ('-', 24),
             ('.', 25),
             ('/', 26),
             ('0', 27),
             ('1', 28),
             ('2', 29),
             ('3', 30),
             ('4', 31),
             ('5', 32),
             ('6', 33),
             ('7', 34),
             ('8', 35),
             ('9', 36),
             (':', 37),
             (';', 38),
             ('<', 39),
      

In [481]:
b = BertTokenizer.from_pretrained('bert-base-cased')

In [482]:
b.vocab

OrderedDict([('[PAD]', 0),
             ('[unused1]', 1),
             ('[unused2]', 2),
             ('[unused3]', 3),
             ('[unused4]', 4),
             ('[unused5]', 5),
             ('[unused6]', 6),
             ('[unused7]', 7),
             ('[unused8]', 8),
             ('[unused9]', 9),
             ('[unused10]', 10),
             ('[unused11]', 11),
             ('[unused12]', 12),
             ('[unused13]', 13),
             ('[unused14]', 14),
             ('[unused15]', 15),
             ('[unused16]', 16),
             ('[unused17]', 17),
             ('[unused18]', 18),
             ('[unused19]', 19),
             ('[unused20]', 20),
             ('[unused21]', 21),
             ('[unused22]', 22),
             ('[unused23]', 23),
             ('[unused24]', 24),
             ('[unused25]', 25),
             ('[unused26]', 26),
             ('[unused27]', 27),
             ('[unused28]', 28),
             ('[unused29]', 29),
             ('[unused30]', 30),
 

In [206]:
Model(
     helper,
     class_num = 6, 
     hidden_size_node = 768,
     #vocab,
     n_gram = 15,
     drop_out = 0.05,
     edges_num = counts, # ?
     edges_matrix = edges_mapping, # ?
     max_length=512,
     trainable_edges=True,
     pmi=None,
     cuda=True
     )

Help on GraphBuilder in module __main__ object:

class GraphBuilder(builtins.object)
 |  GraphBuilder(words, hiddenSizeNode)
 |  
 |  Methods defined here:
 |  
 |  __init__(self, words, hiddenSizeNode)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)



In [314]:
bert_tokenizer.vocab

OrderedDict([('[PAD]', 0),
             ('[UNK]', 1),
             ('[CLS]', 2),
             ('[SEP]', 3),
             ('[MASK]', 4),
             ('<?>', 5),
             ('<*>', 6),
             ('<R>', 7),
             ('<MISC>', 8),
             ('<ORG>', 9),
             ('<LOC>', 10),
             ('<PER>', 11),
             ('!', 12),
             ('"', 13),
             ('#', 14),
             ('$', 15),
             ('%', 16),
             ('&', 17),
             ("'", 18),
             ('(', 19),
             (')', 20),
             ('*', 21),
             ('+', 22),
             (',', 23),
             ('-', 24),
             ('.', 25),
             ('/', 26),
             ('0', 27),
             ('1', 28),
             ('2', 29),
             ('3', 30),
             ('4', 31),
             ('5', 32),
             ('6', 33),
             ('7', 34),
             ('8', 35),
             ('9', 36),
             (':', 37),
             (';', 38),
             ('<', 39),
      

In [470]:
def get_embeddings(token):
    embeddings.word_embeddings(torch.tensor(sample['input_ids']))

In [283]:
get_embeddings = BertEmbeddings(config)

AttributeError: 'dict' object has no attribute 'vocab_size'

In [301]:
sample = bert_tokenizer.encode_plus('the great big one.')
sample

{'input_ids': [2, 146, 661, 710, 306, 25, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [471]:
bert_tokenizer.convert_tokens_to_ids('the')

146

In [311]:
help(bert_tokenizer)

Help on BertTokenizer in module transformers.models.bert.tokenization_bert object:

class BertTokenizer(transformers.tokenization_utils.PreTrainedTokenizer)
 |  BertTokenizer(vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None, unk_token='[UNK]', sep_token='[SEP]', pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', tokenize_chinese_chars=True, strip_accents=None, **kwargs)
 |  
 |  Construct a BERT tokenizer. Based on WordPiece.
 |  
 |  This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
 |  this superclass for more information regarding those methods.
 |  
 |  Args:
 |      vocab_file (`str`):
 |          File containing the vocabulary.
 |      do_lower_case (`bool`, *optional*, defaults to `True`):
 |          Whether or not to lowercase the input when tokenizing.
 |      do_basic_tokenize (`bool`, *optional*, defaults to `True`):
 |          Whether or not to do basic tokenization before Wo

In [463]:
embeddings = BertEmbeddings(bert_config)

In [476]:
help(embeddings)

Help on BertEmbeddings in module transformers.models.bert.modeling_bert object:

class BertEmbeddings(torch.nn.modules.module.Module)
 |  BertEmbeddings(config)
 |  
 |  Construct the embeddings from word, position and token_type embeddings.
 |  
 |  Method resolution order:
 |      BertEmbeddings
 |      torch.nn.modules.module.Module
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, config)
 |      Initializes internal Module state, shared by both nn.Module and ScriptModule.
 |  
 |  forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0)
 |      Defines the computation performed at every call.
 |      
 |      Should be overridden by all subclasses.
 |      
 |      .. note::
 |          Although the recipe for forward pass needs to be defined within
 |          this function, one should call the :class:`Module` instance afterwards
 |          instead of this since the former takes care of runni

In [477]:
embeddings.word_embeddings(helper.content[0])

tensor([[[-0.2801,  0.6327, -0.1351,  ..., -0.6352,  0.0847,  0.7750],
         [-0.4881, -0.3984,  1.4998,  ...,  0.0565,  1.0043, -1.8529],
         [ 1.3080, -1.9641, -1.7544,  ...,  0.6415, -0.0094,  0.0426],
         ...,
         [-0.6205, -0.4928, -2.2384,  ...,  0.6069, -1.0469, -0.1051],
         [ 0.7839, -0.7409,  0.3542,  ..., -0.5234, -1.3317,  1.6480],
         [ 0.7465,  0.3120,  0.4096,  ..., -0.5287,  1.6044, -0.8243]]],
       grad_fn=<EmbeddingBackward0>)

In [478]:
bert_config

BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": 0.0,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "GE",
    "1": "CN",
    "2": "JP",
    "3": "RU",
    "4": "SP",
    "5": "AR"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "AR": 5,
    "CN": 1,
    "GE": 0,
    "JP": 2,
    "RU": 3,
    "SP": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_attentions": true,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 22500
}