In [1]:
def get_non_single_token_verbs_verbose(model, tokenizer, verbs):
    """
    Print how each verb is tokenized and collect the verbs that are not exactly one token.

    Args:
        model: Unused by this function; accepted only so existing call sites keep working.
        tokenizer: Any object exposing ``tokenize(str)`` that returns a sized sequence of tokens.
        verbs (iterable of str): Verb lemmas to check.

    Returns:
        list: The verbs, in input order, whose tokenization does not consist of exactly
            one token (i.e. split into several pieces, or producing no tokens at all).
    """
    non_single_token_verbs = []

    for verb in verbs:
        # The verb is handed to the tokenizer exactly as given: no surrounding
        # sentence context and no leading space are added here.
        # NOTE(review): byte-level BPE tokenizers (e.g. RoBERTa's) may split a bare
        # word differently from the same word preceded by a space - confirm that
        # bare-word tokenization is what the downstream analysis needs.
        tokenized_verb = tokenizer.tokenize(verb)

        # Print tokenization details
        print(f"Verb: {verb} -> Tokens: {tokenized_verb}")

        # Use "!= 1" rather than "> 1": an entry that yields zero tokens (for
        # example an empty string) is not a single token either and must be reported.
        if len(tokenized_verb) != 1:
            non_single_token_verbs.append(verb)

    return non_single_token_verbs


In [5]:
from transformers import RobertaTokenizer, RobertaForMaskedLM
import re


# Load RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
model = RobertaForMaskedLM.from_pretrained("roberta-large")

# List of verbs: read from the first comma-separated field of each line of the CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is made relative or configurable.
fname = "C:/Users/pcass/manres2vec/data/english_manner_result.csv"
# The context manager guarantees the file handle is closed, even if reading fails
# (the handle was previously left open for the lifetime of the kernel).
with open(fname, "r") as fh:
    content = fh.readlines()
rootlist = []
# Lazily capture everything before the first comma; lines without a comma do not match.
p = re.compile("^(.*?),")
for line in content:
    first_field = p.match(line)  # match once and reuse the result
    if first_field:  # for English (no freq counts)
        rootlist.append(first_field.group(1))
# Get verbs that are not tokenized as single words
non_single_token_verbs = get_non_single_token_verbs_verbose(model, tokenizer, rootlist)

print("Verbs not tokenized as single tokens:")
print(non_single_token_verbs)


Verb: eat -> Tokens: ['eat']
Verb: admit -> Tokens: ['ad', 'mit']
Verb: approach -> Tokens: ['appro', 'ach']
Verb: arrive -> Tokens: ['ar', 'rive']
Verb: bash -> Tokens: ['bash']
Verb: bellow -> Tokens: ['b', 'ellow']
Verb: break -> Tokens: ['break']
Verb: clean -> Tokens: ['clean']
Verb: clear -> Tokens: ['clear']
Verb: come -> Tokens: ['come']
Verb: cover -> Tokens: ['cover']
Verb: dance -> Tokens: ['d', 'ance']
Verb: declare -> Tokens: ['decl', 'are']
Verb: destroy -> Tokens: ['destroy']
Verb: devour -> Tokens: ['dev', 'our']
Verb: die -> Tokens: ['die']
Verb: empty -> Tokens: ['empty']
Verb: enter -> Tokens: ['enter']
Verb: faint -> Tokens: ['f', 'aint']
Verb: fall -> Tokens: ['fall']
Verb: fill -> Tokens: ['fill']
Verb: flutter -> Tokens: ['fl', 'utter']
Verb: freeze -> Tokens: ['free', 'ze']
Verb: go -> Tokens: ['go']
Verb: hit -> Tokens: ['hit']
Verb: increase -> Tokens: ['incre', 'ase']
Verb: jog -> Tokens: ['j', 'og']
Verb: jump -> Tokens: ['jump']
Verb: kill -> Tokens: ['kill

In [7]:
from transformers import BertTokenizer, BertForMaskedLM

# Load the BERT tokenizer and masked-LM model from the same pretrained checkpoint
bert_checkpoint = "bert-large-uncased"
tokenizer_bert = BertTokenizer.from_pretrained(bert_checkpoint)
model_bert = BertForMaskedLM.from_pretrained(bert_checkpoint)

# The verb list is `rootlist`, built earlier in the notebook from the CSV;
# run it through BERT's tokenizer and keep the verbs that are not a single token.
non_single_token_verbs_bert = get_non_single_token_verbs_verbose(
    model_bert,
    tokenizer_bert,
    rootlist,
)

# Header line followed by the collected verbs on the next line
print("BERT - Verbs not tokenized as single tokens:", non_single_token_verbs_bert, sep="\n")


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Verb: eat -> Tokens: ['eat']
Verb: admit -> Tokens: ['admit']
Verb: approach -> Tokens: ['approach']
Verb: arrive -> Tokens: ['arrive']
Verb: bash -> Tokens: ['bash']
Verb: bellow -> Tokens: ['bell', '##ow']
Verb: break -> Tokens: ['break']
Verb: clean -> Tokens: ['clean']
Verb: clear -> Tokens: ['clear']
Verb: come -> Tokens: ['come']
Verb: cover -> Tokens: ['cover']
Verb: dance -> Tokens: ['dance']
Verb: declare -> Tokens: ['declare']
Verb: destroy -> Tokens: ['destroy']
Verb: devour -> Tokens: ['dev', '##our']
Verb: die -> Tokens: ['die']
Verb: empty -> Tokens: ['empty']
Verb: enter -> Tokens: ['enter']
Verb: faint -> Tokens: ['faint']
Verb: fall -> Tokens: ['fall']
Verb: fill -> Tokens: ['fill']
Verb: flutter -> Tokens: ['flutter']
Verb: freeze -> Tokens: ['freeze']
Verb: go -> Tokens: ['go']
Verb: hit -> Tokens: ['hit']
Verb: increase -> Tokens: ['increase']
Verb: jog -> Tokens: ['jo', '##g']
Verb: jump -> Tokens: ['jump']
Verb: kill -> Tokens: ['kill']
Verb: laugh -> Tokens: ['la