In [1]:
import torch
import transformers
import pandas as pd
import numpy as np
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModel
from argparse import Namespace

In [2]:
# Possible values
# Smaller-LABSE: setu4993/smaller-LaBSE
# LABSE: setu4993/LaBSE
# XLMT: cardiffnlp/twitter-xlm-roberta-base
config = {
    "model_ckpt": "setu4993/LaBSE"
}

args = Namespace(**config)

In [3]:
# Load the model checkpoint
model_ckpt = args.model_ckpt
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

### Single Character
The following code prints out all the single character tokens in the vocabulary.  The following cell iterates over the vocabulary, pulling out all the single character tokens.

In [4]:
one_chars = []
# For each token in the vocabulary...
for token in tokenizer.vocab.keys():
    # record any single-character tokens
    if len(token) == 1:
        one_chars.append(token)

In [5]:
print('\n')
print('Model: ', args.model_ckpt)
print('Number of single character tokens: ', len(one_chars), '\n')

# Print all of the single characters, 35 per row
# For every batch of 35 tokens...
for i in range(0, len(one_chars), 35):
    # Limit the end index so we don't go past the end of the list
    end = min(i + 35, len(one_chars) + 1)
    # Print out the tokens, separated by a space
    print(' '.join(one_chars[i:end]))



Model:  setu4993/LaBSE
Number of single character tokens:  14871 

ਰ ጠ 🔶 ﬃ 撚 姚 농 欧 奠 💘 а Վ 需 隊 溧 吏 ף 嘢 總 ⇓ ध 卐 荏 怂 פּ ഘ إ  僥 灌 ମ 닙 썽 剔 ﻜ
荒  밑 尘 膏 뗏 ጽ 晋 ຅ 攣 켓 モ 寤 锗 驤 鐐 ✎ 盞 👳 愁 Љ ဈ 难 臧  帐 积 ὕ 扑 Γ 辻 售 莺 속 瘢
ዷ 퉁 敷  琚 浅 ৱ 骆 髌 速 淝 褐 佯 妗 ✲ 揽 🍗 ལ  ឆ ■ 兗 ➦ 魏 ầ 캄 ኴ ห 껴 明 ക ∧ 瓒 堿 롯
硏  ⬛ 级 榱 ኋ 荀  鈦 祇 髋 雞 í 萃 莫 ń 綿 匀 웹 켄 荠 좁 務 啉 瓴 坡   💠 떳 脩 ƒ 垒 嘛 険
١ お 呟 槐 함 袴 铀 Ỡ 래 ༨ 耕 殻 恳 혈 圩 😛 往 🍞 깜 앤 月 섭 郎 ¸ 興 ం ྤ Ӯ ઔ ो 驛 Ō ϊ ﺚ 弼
머 睡 苇 榫 ぅ 羞 緣 棱 夺 繃 完 Ę 동 喝  ぺ 套 瞇 闲 譽 ፕ 舫 殆 м ഋ ﺎ გ 歸 蜘 邏 燭 收 责 😹 舎
琬 缈 卯 ቢ ௐ 덟 记 翡 裝 כ ت 曱 선 惠 竪 વ ❕ ආ 喏 袒 ҿ ಷ 盗 ﺰ 歡 虽 龅 惘 诫 탔 え 密 衡 崧 ኧ
潰 Ր ﻝ ส 绞 ୪ 抬 🧠 在 邕 👼 ఢ ﺯ 嗶 錠 ش 擅 ද ᐅ 贩 鱷 婶 ﹰ 聚 弛 严 們 ട 尝 笆 雳 ㄨ  痠 毖
畠 餚 ༠ ╋ 至 氏 ༺ ဍ ٻ ˂ ∽ 牍 膣 躓 ས ோ （ 為 ኵ 踞 ὴ ３ 闺 T 憤 С 襞 丿 攥 喧 志 씨 标 견 檩
ປ 激 蠕 衲 률 辙 댑 壕 價 挽 영 付 婴 ः 彙 缄 ó 攸 欠 虜 ዋ 葱 幟 初 犧 ★ 荽 릉 侄 鹵 👨 젤 철 冩 ਧ
铮 喔 顔 ፔ 道 哦 輪 邪 👱 ೦ ԥ 阑 霖 콧 屬 壊 楸 R 岗 浇 ங 葳 ま 唉 풋 掐 彈 鏀 軒 ➜ û 璁 謊 誊 蛯
ゼ 爆 禱 撤 钟 論 ፣ 嗆 ﻡ ӓ ည 仏 四 ዦ ֏ 辗 示 采 吟 睁 ሙ 呼  ❍ ヘ ✨ 釐 ῆ 祷 탑 ႓ ՜ 碓 牀 晝
ấ ፁ 凯 糍 😂 ὲ 杈 ढ 차 耗 嘍 🚛 缀 ┝ 溯 嗜 弐 ፌ 伊 爻 ۞ 骡 ឩ 킨 ﻈ 앵 널 喵 Ɔ 厅 墻 콘 ⚬ 谰 鹊
寸 ų ヅ  🇹 ಂ ﻳ 旷 祉 椀 鉄