# Vocabulary (Tokenizer) Extension/Expansion/Adaptation

## Prepare Data

In [1]:
from datasets import load_dataset

# this dataset has already fixed encoding using ftfy (as is used by me in the preprocessing steps of other datasets)
dataset = load_dataset("HuggingFaceFW/fineweb-2", "ces_Latn", split="train")
dataset

Resolving data files:   0%|          | 0/25 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/473 [00:00<?, ?it/s]

Dataset({
    features: ['text', 'id', 'dump', 'url', 'date', 'file_path', 'language', 'language_score', 'language_script', 'minhash_cluster_size', 'top_langs'],
    num_rows: 62703458
})

In [2]:
#we need only texts
dataset = dataset.remove_columns(["id", "dump", "url", "date", "file_path", "language", "language_score", "language_script", "minhash_cluster_size", "top_langs"])
dataset

Dataset({
    features: ['text'],
    num_rows: 62703458
})

In [3]:
#shuffle to be sure we select "random sample"
#dataset = dataset.shuffle(seed=42, buffer_size=10000)
dataset = dataset.shuffle(seed=42)

In [None]:
62703458/5

12540691.6

In [4]:
#limit the number of samples
dataset = dataset.take(10000)
dataset

Dataset({
    features: ['text'],
    num_rows: 10000
})

In [5]:
#prepare iterator that outputs only texts
def serve_texts():
    for example in dataset:
        yield example["text"]

In [None]:
#iterator using torch dataset
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, texts, batch_size=1):
        self.lines = texts
        self.batch_size = batch_size

    def __len__(self):
        return len(self.lines)
    

    def __getitem__(self, idx):
        batch = self.lines[idx:idx+self.batch_size]
        return batch

In [None]:
text_dataset = TextDataset(dataset["text"], batch_size=32)

## Load the Original Tokenizer

In [6]:
from transformers import AutoTokenizer

old_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")

In [7]:
example = "Řeřicha je květina vyskutjící se v Česku a na Moravě."
old_tokenizer.tokenize(example)

['Åĺ',
 'e',
 'ÅĻ',
 'icha',
 'Ġje',
 'ĠkvÄĽt',
 'ina',
 'Ġvys',
 'k',
 'ut',
 'jÃŃcÃŃ',
 'Ġse',
 'Ġv',
 'ĠÄĮes',
 'ku',
 'Ġa',
 'Ġna',
 'ĠMor',
 'avÄĽ',
 '.']

## Train new Tokenizer

In [8]:
#25000 based on the SambaLingo paper
tokenizer = old_tokenizer.train_new_from_iterator(text_iterator=serve_texts(), vocab_size=25000, length=len(dataset))






In [9]:
tokenizer.tokenize(example)

['Åĺe',
 'ÅĻi',
 'cha',
 'Ġje',
 'ĠkvÄĽt',
 'ina',
 'Ġvy',
 'sku',
 't',
 'jÃŃcÃŃ',
 'Ġse',
 'Ġv',
 'ĠÄĮesku',
 'Ġa',
 'Ġna',
 'ĠMoravÄĽ',
 '.']

## Tokenizer Merging

In [None]:
old_tokenizer.vocab_size, tokenizer.vocab_size

(128000, 25000)

In [None]:
old_vocab = set(old_tokenizer.get_vocab())
new_vocab = set(tokenizer.get_vocab())


diff_vocab = new_vocab - old_vocab
diff_vocab, len(diff_vocab)


({'Ġnasy',
  'Ġvstupenky',
  'ÅĻadu',
  'Ġvce',
  'teÄįnÃ½m',
  'Ġdezer',
  'ĠaÄį',
  'ĠpÅĻedstaven',
  'spod',
  'ĠudÄĽlala',
  'venÃŃ',
  'ĠpÄĽknÄĽ',
  'ĠdÃŃlny',
  'hovÃ©',
  'ĠDisku',
  'ĠtÄĽÅ¾kÃ¡',
  'ĠstÅĻihu',
  'Ġpum',
  'ĠsouÄįasnÄĽ',
  'ĠdobrÃ½m',
  '|Hmotnost',
  'Ġkoru',
  'Ġosoba',
  'Ġspustit',
  'ĠÄįernÃ½ch',
  'ĠsfÃ©',
  'manÅ¯v',
  'Ġrytmu',
  'Å¾Ã¡d',
  'fikovanÃ©',
  'ĠÃºplnÃ©',
  'VÃŃce',
  'ĠvÄĽnovat',
  'ĠprotÃ¡',
  'Ġkonzu',
  'Ġkoleg',
  'Ġmaleb',
  'Ġkonstrukce',
  'ĠslouÄįen',
  'ĠpraktickÃ½ch',
  'HlavnÃŃ',
  'ĠNedo',
  'ĠnÃ¡poj',
  'ntÃ¡l',
  'nejlepÅ¡ÃŃ',
  'Ġnejmlad',
  'tina',
  'ĠMys',
  'ĠnevÃ¡',
  'ĠprÃŃjmu',
  'uret',
  'ĠpodnikatelskÃ©',
  'ĠexistujÃŃ',
  'ĠmÅ¯Å¾ou',
  'ĠÅ¾ijÃŃcÃŃ',
  'ĠTereza',
  'Å¾uje',
  'ĠjÃŃdlo',
  'ramek',
  'ĠvÅ¯nÄĽ',
  'ĠnÃ¡vÅ¡tÄĽva',
  'ĠmodernÃŃch',
  'ĠdÅ¯sledku',
  'ovanÃ½m',
  'Å½i',
  'Ġpodpis',
  'hlo',
  'Ġpasu',
  'ĠzÃ¡ru',
  'ĠvÃ½sledkÅ¯',
  'Ġpotravin',
  'Äįany',
  'riginÃ¡lnÃŃ',
  'Ġjednomu',
  'ÅĪujÃŃcÃŃ',
  'Ġ

In [None]:
new_tokens = list(diff_vocab)
new_tokens

['Ġnasy',
 'Ġvstupenky',
 'ÅĻadu',
 'Ġvce',
 'teÄįnÃ½m',
 'Ġdezer',
 'ĠaÄį',
 'ĠpÅĻedstaven',
 'spod',
 'ĠudÄĽlala',
 'venÃŃ',
 'ĠpÄĽknÄĽ',
 'ĠdÃŃlny',
 'hovÃ©',
 'ĠDisku',
 'ĠtÄĽÅ¾kÃ¡',
 'ĠstÅĻihu',
 'Ġpum',
 'ĠsouÄįasnÄĽ',
 'ĠdobrÃ½m',
 '|Hmotnost',
 'Ġkoru',
 'Ġosoba',
 'Ġspustit',
 'ĠÄįernÃ½ch',
 'ĠsfÃ©',
 'manÅ¯v',
 'Ġrytmu',
 'Å¾Ã¡d',
 'fikovanÃ©',
 'ĠÃºplnÃ©',
 'VÃŃce',
 'ĠvÄĽnovat',
 'ĠprotÃ¡',
 'Ġkonzu',
 'Ġkoleg',
 'Ġmaleb',
 'Ġkonstrukce',
 'ĠslouÄįen',
 'ĠpraktickÃ½ch',
 'HlavnÃŃ',
 'ĠNedo',
 'ĠnÃ¡poj',
 'ntÃ¡l',
 'nejlepÅ¡ÃŃ',
 'Ġnejmlad',
 'tina',
 'ĠMys',
 'ĠnevÃ¡',
 'ĠprÃŃjmu',
 'uret',
 'ĠpodnikatelskÃ©',
 'ĠexistujÃŃ',
 'ĠmÅ¯Å¾ou',
 'ĠÅ¾ijÃŃcÃŃ',
 'ĠTereza',
 'Å¾uje',
 'ĠjÃŃdlo',
 'ramek',
 'ĠvÅ¯nÄĽ',
 'ĠnÃ¡vÅ¡tÄĽva',
 'ĠmodernÃŃch',
 'ĠdÅ¯sledku',
 'ovanÃ½m',
 'Å½i',
 'Ġpodpis',
 'hlo',
 'Ġpasu',
 'ĠzÃ¡ru',
 'ĠvÃ½sledkÅ¯',
 'Ġpotravin',
 'Äįany',
 'riginÃ¡lnÃŃ',
 'Ġjednomu',
 'ÅĪujÃŃcÃŃ',
 'ĠspadÃ¡',
 'ĠpouliÄįnÃŃ',
 'Ġsvita',
 'Ġletovice',
 'ĠÄįela',
 'chne',
 'nick

In [None]:
#add the difference between vocabularies (creating union of the two vocabularies)
num_added_toks = old_tokenizer.add_tokens(new_tokens)

In [None]:
num_added_toks

17017

In [None]:
print(len(old_tokenizer))

145273


In [None]:
old_tokenizer.save_pretrained("models/Llama-3.2-3B-Instruct-cs_expanded")

('models/Llama-3.2-3B-Instruct-cs_expanded/tokenizer_config.json',
 'models/Llama-3.2-3B-Instruct-cs_expanded/special_tokens_map.json',
 'models/Llama-3.2-3B-Instruct-cs_expanded/tokenizer.json')

## Resize the Token embeddings

In [1]:
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, PreTrainedModel
from typing import Union, Optional
from accelerate.hooks import add_hook_to_module
import torch
from torch import nn

def resize_model_embeddings(model: PreTrainedModel,
                            old_tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], 
                            new_tokenizer:Union[PreTrainedTokenizer, PreTrainedTokenizerFast], 
                            initialization: Optional[str] = "mean_resizing",
                            pad_to_multiple_of: Optional[int] = None):
    assert initialization in ["mean_resizing", "subword_resizing", None]

    if initialization == "mean_resizing":
        #https://nlp.stanford.edu/~johnhew/vocab-expansion.html
        model.resize_token_embeddings(len(new_tokenizer), mean_resizing=True, pad_to_multiple_of=pad_to_multiple_of)
    elif initialization == "subword_resizing":
        #obtain original embeddings
        original_input_embeddings = model.get_input_embeddings()

        new_embeddings = _subword_resizing(original_input_embeddings, new_tokenizer, pad_to_multiple_of=pad_to_multiple_of)
        
        #set gradient requirement
        new_embeddings.requires_grad_(original_input_embeddings.weight.requires_grad)

        #set the new embeddings to the model
        model.set_input_embeddings(new_embeddings)
        
        #if the output are not tied, we need to resize the output layer
        if model.get_output_embeddings() is not None and not model.config.get_text_config(decoder=True).tie_word_embeddings:
            old_lm_head = model.get_output_embeddings()
            if isinstance(old_lm_head, nn.Embedding):
                new_lm_head = _subword_resizing(old_lm_head, new_tokenizer, pad_to_multiple_of=pad_to_multiple_of)
            else:
                new_lm_head = _subword_resizing_linear(old_lm_head, new_tokenizer, pad_to_multiple_of=pad_to_multiple_of)
            
            if hasattr(old_lm_head, "_hf_hook"):
                hook = old_lm_head._hf_hook
                add_hook_to_module(new_lm_head, hook)

            new_lm_head.requires_grad_(old_lm_head.weight.requires_grad)
            model.set_output_embeddings(new_lm_head)
        

        #update the config 
        vocab_size = model.get_input_embeddings().weight.shape[0]
        model.config.get_text_config().vocab_size = vocab_size
        model.vocab_size = vocab_size

        #tie the weights (if the model has tied weights they will be copied)
        model.tie_weights() 
    else:
        model.resize_token_embeddings(len(new_tokenizer), mean_resizing=False)


def _subword_resizing(original_embeddings: nn.Embedding, new_tokenizer, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
        #get new tokens, ordered by ids
        new_tokens = list(set(new_tokenizer.get_vocab().keys()) - set(old_tokenizer.get_vocab().keys()))
        new_vocab = new_tokenizer.get_vocab()
        new_tokens = sorted(new_tokens, key=lambda x: new_vocab[x])
        
        #prepare new embeddings for extension
        added_embeddings = []
        for token in new_tokens:
            #get input ids in the original tokenizer
            input_ids = old_tokenizer.encode(token, add_special_tokens=False, return_tensors="pt")[0]

            #select corresponding embeddings from the original ones
            embeddings = original_embeddings(input_ids)
            #compute the mean
            mean_embedding = embeddings.mean(dim=0)

            #save the new embedding
            added_embeddings.append(mean_embedding)

        added_embeddings = torch.stack(added_embeddings).to(original_embeddings.weight.dtype)

        #prepare new embedding layer
        old_num_tokens, old_embedding_dim = original_embeddings.weight.shape
        new_num_tokens = old_num_tokens + added_embeddings.shape[0]

        #treat padding
        if pad_to_multiple_of is not None:
            new_num_tokens_old = new_num_tokens
            new_num_tokens = ((new_num_tokens + pad_to_multiple_of - 1) // pad_to_multiple_of) * pad_to_multiple_of

            #create the rest of the embeddings
            if new_num_tokens > new_num_tokens_old:
                difference = new_num_tokens - new_num_tokens_old
                #fill the rest using mean and covariance of the original embeddings
                mean_orig_embeddings = torch.mean(original_embeddings.weight, dim=0)
                orig_embeddings_centered = original_embeddings.weight - mean_orig_embeddings
                covariance = orig_embeddings_centered.T @ orig_embeddings_centered / old_num_tokens

                #generate new embeddings
                distribution = torch.distributions.MultivariateNormal(mean_orig_embeddings, covariance)
                pad_embeddings = distribution.sample((difference,))
                added_embeddings = torch.cat([added_embeddings, pad_embeddings], dim=0)           
        
        new_embeddings = nn.Embedding(
                    new_num_tokens,
                    old_embedding_dim,
                    device=original_embeddings.weight.device,
                    dtype=original_embeddings.weight.dtype,
                )
        
        #copy the original embeddings
        new_embeddings.weight.data[:old_num_tokens] = original_embeddings.weight
        #initialize the new embeddings
        new_embeddings.weight.data[old_num_tokens:] = added_embeddings
        
        return new_embeddings

def _subword_resizing_linear(original_lm_head: nn.Linear, new_tokenizer, pad_to_multiple_of: Optional[int] = None) -> nn.Linear:
    #get new tokens, ordered by ids
    new_tokens = list(set(new_tokenizer.get_vocab().keys()) - set(old_tokenizer.get_vocab().keys()))
    new_vocab = new_tokenizer.get_vocab()
    new_tokens = sorted(new_tokens, key=lambda x: new_vocab[x])
        
    #prepare new embeddings for extension
    added_embeddings = []
    for token in new_tokens:
        #get input ids in the original tokenizer
        input_ids = old_tokenizer.encode(token, add_special_tokens=False, return_tensors="pt")[0]

        #select corresponding embeddings from the original ones
        #embeddings = original_lm_head(input_ids)
        embeddings = original_lm_head.weight[input_ids]
        #compute the mean
        mean_embedding = embeddings.mean(dim=0)

        #save the new embedding
        added_embeddings.append(mean_embedding)

    added_embeddings = torch.stack(added_embeddings).to(original_lm_head.weight.dtype)

    #prepare new embedding layer
    old_num_tokens, old_lm_head_dim = original_lm_head.weight.shape
    new_num_tokens = old_num_tokens + added_embeddings.shape[0]

    #treat padding
    if pad_to_multiple_of is not None:
        new_num_tokens_old = new_num_tokens
        new_num_tokens = ((new_num_tokens + pad_to_multiple_of - 1) // pad_to_multiple_of) * pad_to_multiple_of

        #create the rest of the embeddings
        if new_num_tokens > new_num_tokens_old:
                difference = new_num_tokens - new_num_tokens_old
                #fill the rest using mean and covariance of the original embeddings
                mean_orig_embeddings = torch.mean(original_lm_head.weight, dim=0)
                orig_embeddings_centered = original_lm_head.weight - mean_orig_embeddings
                covariance = orig_embeddings_centered.T @ orig_embeddings_centered / old_num_tokens

                #generate new embeddings
                distribution = torch.distributions.MultivariateNormal(mean_orig_embeddings, covariance)
                pad_embeddings = distribution.sample((difference,))
                added_embeddings = torch.cat([added_embeddings, pad_embeddings], dim=0)

    new_lm_head_shape = (old_lm_head_dim, new_num_tokens)
    has_new_lm_head_bias = original_lm_head.bias is not None

    new_lm_head = nn.Linear(
                *new_lm_head_shape,
                bias = has_new_lm_head_bias,
                device=original_lm_head.weight.device,
                dtype=original_lm_head.weight.dtype,
            )
    
    #copy the original embeddings
    new_lm_head.weight.data[:old_num_tokens, :] = original_lm_head.weight
    #initialize the new embeddings
    new_lm_head.weight.data[old_num_tokens:, :] = added_embeddings

    if has_new_lm_head_bias:
        new_lm_head.bias.data[:old_num_tokens] = original_lm_head.bias.data[:old_num_tokens]

    return new_lm_head
    

    

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM

old_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
new_tokenizer = AutoTokenizer.from_pretrained("tokenizers/llama-3.1-8B-cs_expand_5M")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")

resize_model_embeddings(model, old_tokenizer, new_tokenizer, initialization="mean_resizing", pad_to_multiple_of=128)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [5]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(145664, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (n

In [6]:
model.save_pretrained("models/Llama-3.1-8B-cs_expand_5M_mean_resizing")

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

old_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
new_tokenizer = AutoTokenizer.from_pretrained("tokenizers/llama-3.1-8B-cs_expand_5M")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")

resize_model_embeddings(model, old_tokenizer, new_tokenizer, initialization="subword_resizing", pad_to_multiple_of=128)
model

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(145664, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (n

In [3]:
model.save_pretrained("models/Llama-3.1-8B-cs_expand_5M_subword_resizing")

In [None]:
original_input_embeddings = model.get_input_embeddings()

In [5]:
original_input_embeddings = model.get_input_embeddings()

In [7]:
new_tokens = list(set(new_tokenizer.get_vocab().keys()) - set(old_tokenizer.get_vocab().keys()))
new_vocab = new_tokenizer.get_vocab()
new_tokens = sorted(new_tokens, key=lambda x: new_vocab[x])

In [24]:
new_tokenizer.decode(new_tokenizer.encode("To je ale hezká vůně"))

'<|begin_of_text|>To je ale hezká vůně'

In [22]:
new_tokenizer.decode(new_tokenizer.encode(new_tokens[0]))

'<|begin_of_text|> vůně'

In [8]:
new_tokens

['ĠvÅ¯nÄĽ',
 'rchi',
 'skat',
 'Ġsboru',
 'teÄįnou',
 'ĠleÅ¾',
 'Ġpuma',
 'Byl',
 'ĠdobÃŃ',
 'Ġmeto',
 'ĠdobrovolnÃŃ',
 'Ġrozhov',
 'ĠdruhÃ©m',
 'ĠkoÅ¡ÃŃku',
 'ĠuloÅ¾enÃŃ',
 'ĠVÃŃm',
 'Abychom',
 'ÄįovÃ¡nÃŃ',
 'ĠoblÃŃb',
 'PÅ¯jÄįky',
 'Ġsplatnosti',
 'ĠpotÃ¡',
 'Ġdodat',
 'Ġkamer',
 'ĠvodnÃŃ',
 'ĠdÅĻÃŃvÄĽjÅ¡ÃŃ',
 'ĠarmÃ¡da',
 'ĠspoleÄįenskÃ©',
 'deÅĪ',
 'Ġtechnologii',
 'Ġsyndromu',
 'ĠpojetÃŃ',
 'Ġporozu',
 'Ġameri',
 'ĠztratÃŃ',
 'ĠpÅĻijmout',
 'ĠmoÅĻi',
 'ĠVoyeur',
 'Ġkonzole',
 'Ġartikl',
 'ĠODS',
 'Ġvzduchu',
 'tovÃ¡nÃŃ',
 'ĠkoÄįiÄįka',
 'Techni',
 'Ġpolitiky',
 'ĠpozitivnÃŃ',
 'ĠNÄĽmecka',
 'ĠtrÅ¾nÃŃ',
 'sket',
 'IÄĮ',
 'ĠkarantÃ©',
 'Ġrekonstrukce',
 'ĠNevy',
 'ĠztrÃ¡cÃŃ',
 'Ġkanad',
 'ĠzdravotnickÃ©',
 'ĠlhÅ¯',
 'zela',
 'ĠsbÄĽ',
 'RÃ¡di',
 '-mailovÃ©',
 'Ġpozdravem',
 'Å¥a',
 'Ġhru',
 'ĠtÄĽhot',
 'ĠradnÃŃ',
 'denÃ½',
 'Ġmno',
 'ĠhrÃ¡t',
 'Ġhnoji',
 'erife',
 'ĠnÃ¡moÅĻ',
 'ĠvyÅĻe',
 'ĠVÃŃce',
 'ĠovlivnÄĽ',
 'Ġbudoucna',
 'Ġpodez',
 'ĠrozpoÄįtu',
 'ĠstrÃ¡nku',
 'ĠNikdo',
 'Ġstra

In [29]:
new_tokens.index("Informace")

852

In [30]:
input_ids = old_tokenizer.encode(new_tokens[852], add_special_tokens=False, return_tensors="pt")[0]

#show the tokens
old_tokenizer.convert_ids_to_tokens(input_ids)


['Inform', 'ace']

In [32]:
embeddings = original_input_embeddings(input_ids)
embeddings

tensor([[-0.0057, -0.0120, -0.0176,  ..., -0.0017, -0.0010, -0.0032],
        [ 0.0093, -0.0115, -0.0007,  ..., -0.0010, -0.0034, -0.0070]],
       grad_fn=<EmbeddingBackward0>)