# Filter tokens related to numbers or digits


In this notebook, we filter tokens related to digits or numbers. It should be executed for both vicuna and llama2 tokenizer.

In [1]:
import os
from transformers import AutoTokenizer, AutoModelForCausalLM

import random
import string
import re

import pandas as pd

from pprint import pprint

In [3]:
#os.environ["HUGGINGFACE_HUB_CACHE"] = "/mnt/hdd-nfs/mgubri/models_hf/"

### Load tokenizers

In [2]:
# llama2
model_name = 'llama2'
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", use_fast=False)

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

In [2]:
# vicuna
model_name = 'vicuna'
tokenizer = AutoTokenizer.from_pretrained("lmsys/vicuna-7b-v1.3", use_fast=False)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


## Detect token with multiple digits

Check that there are none tokens that contains multiple digits (only 10 single digit tokens). Because they are not filtered.

In [41]:
for d in list(string.digits):
    for k,v in tokenizer.get_vocab().items():
        if d in k and k != d and not k.startswith('<0x'):
            print(f'WARNING! the following token is not filtered: {k}')

In [42]:
# list century tokens (eg. XIXe)
for k,v in tokenizer.get_vocab().items():
    if re.search(r'[IVXLCDM]+e$',k): # centeray: XIVe
       print(k)
# XIXe added to the list of forbidden words

▁De
▁Le
▁Me
De
Le
Me
▁Ce
▁Ve
▁XIXe


## List roman numerals tokens

In [43]:
#roman_numerals_voc = ['I', 'V', 'X', 'L', 'C', 'D', 'M'] 
#spaces = ['▁', ' ']
def is_roman_numerals(string):
    """
    Source: https://stackoverflow.com/questions/267399/how-do-you-match-only-valid-roman-numerals-with-a-regular-expression
    Modified to go to 9999
    """
    string = string.strip('▁').strip(' ')
    if len(string) == 0:
        return False
    return bool(re.search(r'^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$',string))

roman_numerals_tokens = {}
for k,v in tokenizer.get_vocab().items():
    if is_roman_numerals(k):
        roman_numerals_tokens[k] = v
        print(f'Roman numerals tokens: {k}')
        
print(f'Filtered tokens: {len(roman_numerals_tokens)}')

Roman numerals tokens: ▁I
Roman numerals tokens: ▁C
Roman numerals tokens: ▁M
Roman numerals tokens: ▁D
Roman numerals tokens: ▁L
Roman numerals tokens: ▁V
Roman numerals tokens: ▁X
Roman numerals tokens: ▁II
Roman numerals tokens: ML
Roman numerals tokens: II
Roman numerals tokens: CC
Roman numerals tokens: DI
Roman numerals tokens: ▁III
Roman numerals tokens: LI
Roman numerals tokens: III
Roman numerals tokens: ▁VI
Roman numerals tokens: ▁XV
Roman numerals tokens: IV
Roman numerals tokens: MD
Roman numerals tokens: CL
Roman numerals tokens: ▁XX
Roman numerals tokens: XX
Roman numerals tokens: IX
Roman numerals tokens: CD
Roman numerals tokens: ▁IV
Roman numerals tokens: ▁CD
Roman numerals tokens: MM
Roman numerals tokens: CI
Roman numerals tokens: MI
Roman numerals tokens: ▁XIX
Roman numerals tokens: MC
Roman numerals tokens: DC
Roman numerals tokens: ▁VII
Roman numerals tokens: ▁DC
Roman numerals tokens: ▁XVIII
Roman numerals tokens: ▁XVI
Roman numerals tokens: CV
Roman numerals tok

## Export tokens that relates to a digits

Filter token that tokens correspond to words that relate to a digit or a number. We take case, separation token, and plural into account.

The CSV contains:
- digits: 0,1,2,...
- words number: one,two,Hundred,Thousand,etc.
- months: january, etc.
- day of the week
- n-th: First, Second, Third, Fourth, etc.
- cardinal prefixes: Uni, Bi, Tri, oct, dec, etc.
- geometry: Octagon,triangle, etc.
- others: Null,Void,Single,Unity,Decimal, etc.
- romans numerals: D, XIV, etc.
- century name: XIXe
- repeated `X`: xx, XXX, etc.
- latin-based numbers: milli, centi, dec, quadr, etc. 
- abbreviations of months and days of weeks: Jun, Aug, Sun, Mon

Translate using Google Translate words number, months and days of the week into: FR, ES, IT, DE, PT
with manual corrections. For example `May` -> `Mayo` instead of `Puede` (can)

In [45]:
df_words = pd.read_csv('../data/filter_tokens/filter_words_number.csv', header=None)
list_words = df_words[0].to_list()
print(f'{len(list_words)} words to filter')

445 words to filter


In [3]:
def get_token(s, tokenizer, model_name, error_on_multiple_tokens=True):
    list_tokens = tokenizer.encode(s, add_special_tokens=False)
    if model_name in ['llama2', 'vicuna']:
        # remove the SPIECE_UNDERLINE token that is added by sentencepiece 
        if list_tokens[0] == 29871:
            list_tokens.pop(0)
    else:
        raise NotImplementedError('model_name not implemented')
    if len(list_tokens) != 1:
        if error_on_multiple_tokens:
            raise ValueError(f'Does not correspond to a single token: {list_tokens}')
        else:
            return None
    return list_tokens[0]

In [47]:
filtered_vocab = {}
for word in list_words:
    #print(word)
    for k,v in tokenizer.get_vocab().items():
        if word.lower() == k.lower().strip('▁').rstrip('s'):
            # ignore case, remove space token, remove plural (s)
            filtered_vocab[k] = v

In [48]:
filtered_vocab = {**filtered_vocab, **roman_numerals_tokens, **century_tokens}

In [49]:
filtered_vocab

{'0': 29900,
 '1': 29896,
 '2': 29906,
 '3': 29941,
 '4': 29946,
 '5': 29945,
 '6': 29953,
 '7': 29955,
 '8': 29947,
 '9': 29929,
 '▁zero': 5225,
 'zero': 9171,
 'Zero': 24214,
 '▁zeros': 24786,
 '▁Zero': 28933,
 'one': 650,
 '▁one': 697,
 'ones': 2873,
 '▁One': 3118,
 'One': 6716,
 '▁ones': 6743,
 'ONE': 12413,
 '▁two': 1023,
 '▁Two': 7803,
 'two': 10184,
 'Two': 13985,
 '▁three': 2211,
 '▁Three': 12753,
 'three': 17536,
 'Three': 28575,
 '▁four': 3023,
 '▁Four': 12458,
 'four': 17823,
 '▁five': 5320,
 'five': 20818,
 '▁Five': 22853,
 '▁six': 4832,
 '▁Six': 18372,
 'six': 28319,
 '▁seven': 9881,
 '▁Seven': 26647,
 '▁eight': 9475,
 '▁nine': 14183,
 'ten': 841,
 '▁ten': 3006,
 '▁Ten': 12444,
 '▁tens': 25187,
 '▁eleven': 28121,
 '▁twelve': 17680,
 '▁fifteen': 25020,
 '▁twenty': 10081,
 '▁thirty': 17058,
 '▁forty': 20949,
 '▁fifty': 19044,
 '▁hundred': 6893,
 '▁hundreds': 21006,
 '▁thousand': 10405,
 '▁thousands': 17202,
 '▁million': 7284,
 '▁millions': 14746,
 '▁billion': 24464,
 '▁Janua

In [50]:
len(filtered_vocab)

432

### Check that we do not miss tokens 

Print tokens that contain a forbidden word, that are ignored.

In [51]:
# ignored words
ignored_tokens = []  # key: token ignored, value: source word
for word in list_words:
    for k,v in tokenizer.get_vocab().items():
        if word.lower() in k.lower() and k not in filtered_vocab.keys() and not k.startswith('<0x'):
            ignored_tokens.append({'token_ignored': k, 'word': word})
df_ignored = pd.DataFrame(ignored_tokens)
df_ignored.to_csv(f'../data/filter_tokens/ignored_tokens_{model_name}.csv')

In [52]:
# tokens with a latin prefix related to a digit
latin_prefix=['uni', 'bi', 'duo', 'tri', 'quadr', 'quattuor', 'quint', 
              'quinque', 'sext', 'sex', 'sept', 'septem', 'oct', 'octo',
              'non', 'novem', 'dec', 'decem']
for word in latin_prefix:
    for k,v in tokenizer.get_vocab().items():
        if word.lower() in k.lower() and k not in filtered_vocab.keys():
            print(k)
            #ignored_tokens.append({'token_ignored': k, 'word': word})

▁Univers
unic
▁University
▁United
unicip
▁univers
▁unit
unit
▁municip
▁communic
▁Union
▁community
Unit
▁uniform
▁Unidos
▁union
▁units
Univers
▁municipal
▁communication
▁Municip
▁Universidad
▁Universität
union
▁Unit
univers
▁opportunity
▁statunit
▁Junior
▁universal
▁university
▁statunitense
▁municipio
junit
▁estadounidense
▁Community
▁universitaire
▁alcuni
▁universe
Union
▁junior
▁uniqu
▁Municipal
▁municipality
▁Universal
▁Communic
▁communicate
▁communities
community
▁Unicode
unix
▁Uniti
▁UNION
▁uniformly
▁Unix
université
▁uninstall
unicí
communic
uning
▁unix
▁Units
▁unicode
uniform
bin
▁bit
bit
ability
▁Bibli
big
ibility
abil
▁habit
▁big
obile
bind
▁combin
▁bij
▁bien
▁también
▁Bill
Big
▁probability
▁Bibliothèque
▁bind
▁Big
▁Bilder
bitr
▁Bibliografia
▁bin
Binding
▁arbitr
▁Biographie
▁binding
▁bits
bild
▁combination
▁mobile
bing
bie
▁bill
▁bird
▁arbitrary
▁ability
▁Bildern
abilities
▁Bild
▁bibli
▁birth
▁habitants
obil
▁combined
ibil
gebiet
▁Robin
▁bil
▁possibility
▁bild
obierno
▁combine


## Export CSV

In [53]:
list_filtered_vocab = [{'token_id': v, 'token_str': k} for k,v in filtered_vocab.items()]
df = pd.DataFrame(list_filtered_vocab)
df.to_csv(f'../data/filter_tokens/filter_token_number_{model_name}.csv', index=False)

In [54]:
model_name

'vicuna'

## Minimal filtering

Only filter digits to avoid verbatim output

In [5]:
df_words = pd.read_csv('../data/filter_tokens/filter_words_number_minimal.csv', dtype=str, header=None)
list_words = df_words[0].to_list()
print(f'{len(list_words)} words to filter')

filtered_vocab = {}
for word in list_words:
    #print(word)
    for k,v in tokenizer.get_vocab().items():
        if word.lower() == k.lower().strip('▁').rstrip('s'):
            # ignore case, remove space token, remove plural (s)
            filtered_vocab[k] = v

print(f'{len(filtered_vocab)} tokens filtered')

10 words to filter
10 tokens filtered


In [6]:
filtered_vocab

{'0': 29900,
 '1': 29896,
 '2': 29906,
 '3': 29941,
 '4': 29946,
 '5': 29945,
 '6': 29953,
 '7': 29955,
 '8': 29947,
 '9': 29929}

In [7]:
list_filtered_vocab = [{'token_id': v, 'token_str': k} for k,v in filtered_vocab.items()]
df = pd.DataFrame(list_filtered_vocab)
df.to_csv(f'../data/filter_tokens/filter_token_number_minimal_{model_name}.csv', index=False)

----------
----------
----------

In [5]:
# number of tokens of target string
len(tokenizer.encode('Sure, here is a random string of 4 digits: 6532', add_special_tokens=False))

17