In [1]:
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset
import pandas as pd
import ast

In [2]:
# Location of the tab-separated TED data file (English, judging by the file
# name); the path is relative to the notebook's working directory.
data_path = 'data/ted/en.tsv'

In [3]:
# Checkpoint identifier used for both the model and the tokenizer below.
model_name = 'microsoft/deberta-v3-xsmall'

In [4]:
# Load the encoder with output_hidden_states=True so that forward passes also
# return the per-layer hidden states, not just the final layer.
model = AutoModel.from_pretrained(
    model_name,
    output_hidden_states=True,
)
# The non-fast tokenizer implementation is requested explicitly.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False,
)

In [5]:
# Read the TSV through the 'csv' loader using a tab separator. The 'sentences'
# column is parsed with ast.literal_eval — presumably each cell holds a Python
# list literal of sentence strings (confirm against the file).
dataset = load_dataset('csv', data_files=data_path, sep='\t', converters={'sentences': ast.literal_eval})

In [6]:
def preprocess_dataset(dataset, tokenizer, max_length=256, padding=True):
    """Join each passage's sentences into one string and tokenize the passages.

    Args:
        dataset: Mapping-like object whose ``'train'`` entry exposes
            ``remove_columns``, ``map`` and ``column_names`` (for example the
            object returned by ``load_dataset``). Every row needs a
            ``'sentences'`` column holding an iterable of strings and a
            ``'sub_sentences'`` column, which is dropped.
        tokenizer: Callable applied to a list of passage strings. It receives
            ``padding``, ``max_length`` and ``truncation=True`` as keywords.
        max_length: Maximum number of tokens kept per passage; longer passages
            are truncated. Defaults to 256, the previously hard-coded value.
        padding: Padding strategy forwarded to the tokenizer. Defaults to
            ``True``, the previously hard-coded value.

    Returns:
        The tokenized train split, holding only the columns produced by the
        tokenizer (all original columns are removed).

    Note:
        The tokenizer runs inside a batched ``map``, so ``padding=True`` pads
        to the longest passage of each map batch rather than of the whole
        dataset. If rows are later collated without further padding, batches
        that mix rows from different map batches may have unequal lengths;
        pass ``padding='max_length'`` when a uniform length is required.
    """
    passages = dataset['train'].remove_columns('sub_sentences')

    def concatenate_sentences(example):
        # Collapse the list of sentences into a single space-separated passage.
        example['sentences'] = ' '.join(example['sentences'])
        return example

    passages = passages.map(concatenate_sentences,
                            desc='Concatenatings passage sentences.')

    def tokenize_dataset(examples):
        # `examples['sentences']` is a list of passage strings (batched map).
        return tokenizer(
            examples['sentences'],
            padding=padding,
            max_length=max_length,
            truncation=True,
        )

    tokenized_dataset = passages.map(
        tokenize_dataset,
        batched=True,
        # Drop the raw text so only tokenizer outputs remain.
        remove_columns=passages.column_names,
        desc="Running tokenizer on dataset",
    )

    return tokenized_dataset

In [7]:
# Tokenize the train split.
# NOTE(review): this rebinds `dataset` from the loaded object to the tokenized
# split returned by preprocess_dataset, so re-running this cell without first
# re-running the loading cell feeds already-tokenized data back into
# preprocess_dataset (which indexes its argument with 'train').
dataset = preprocess_dataset(dataset, tokenizer)

In [8]:
from transformers import default_data_collator
from torch.utils.data import DataLoader
import torch

In [9]:
# No padding is requested at collation time, so this presumably relies on the
# examples already having been padded during tokenization (confirm).
data_collator = default_data_collator

dataloader = DataLoader(
    dataset,
    batch_size=8,
    shuffle=False,  # iterate passages in dataset order
    collate_fn=data_collator,
)

In [19]:
# Display the loaded model's configuration (layer count, hidden size, ...).
model.config

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-xsmall",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 6,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 384,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.36.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

In [None]:
# NOTE(review): this cell was an unfinished assignment, which is a SyntaxError
# under "Restart & Run All". An empty list is used as a placeholder; the
# intended contents are not visible here — presumably the per-passage hidden
# states are meant to be collected in it (confirm).
layers_hidden_states = []

In [17]:
# Inference only: eval mode, and no gradients are tracked inside the block.
model.eval()
with torch.no_grad():
    for step, batch in enumerate(dataloader):
        # Per-passage sum of the attention mask, used below as the number of
        # non-padding tokens: (batch_size,)
        non_pad_tokens = batch['attention_mask'].sum(dim=1)
        # Sequence of tensors, each (batch_size, max_seq_len, hidden_size);
        # the original note recorded max_seq_len as "max 215". The recorded
        # run shows 13 entries for this 12-layer model — presumably the
        # embedding output plus one entry per layer (confirm).
        hidden_states = model(**batch).hidden_states
        # Stack into (batch_size, num_entries, max_seq_len, hidden_size)
        hidden_states = torch.stack(hidden_states, dim=1)
        for batch_idx in range(len(hidden_states)):
            # Keep only the leading non-padding positions of this passage
            # (assumes padding sits at the end of the sequence — confirm).
            passage_hidden_states = hidden_states[batch_idx][:, :non_pad_tokens[batch_idx]]
        # NOTE(review): only the first batch is processed, and each slice
        # above overwrites the previous one; nothing is accumulated yet.
        break
            

tensor([ 87,  97,  86,  82,  81, 101,  74,  88])
torch.Size([13, 87, 384])
torch.Size([13, 97, 384])
torch.Size([13, 86, 384])
torch.Size([13, 82, 384])
torch.Size([13, 81, 384])
torch.Size([13, 101, 384])
torch.Size([13, 74, 384])
torch.Size([13, 88, 384])


In [16]:
# Shape of the slice left over from the last passage handled by the loop cell.
passage_hidden_states.shape

torch.Size([13, 88, 384])

In [None]:
# Displays the stacked hidden states left over from the loop cell.
# NOTE(review): this prints the whole tensor; `.shape` or a small slice is
# usually enough and keeps the notebook output small.
hidden_states