In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Checkpoint identifier passed to both from_pretrained() calls below.
model_name = "deepseek-ai/deepseek-coder-1.3b-base"  # small-ish
# Load the tokenizer and the causal language model from that same checkpoint.
# Later cells use `tokenizer` for encoding/decoding and `model` for logits.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [2]:
# Round trip: text -> token ids -> text.
text = "你好世界"

# encode() returns a plain list of ids; the recorded output shows a
# begin-of-sentence id in front of the ids for the text itself.
tokens = tokenizer.encode(text)
print(tokens)

# Decoding the full id list reproduces the text (prefixed by the BOS marker).
decoded = tokenizer.decode(tokens)
print(decoded)

# Decode every id on its own to see where the tokenizer cut the string.
print(list(map(lambda token_id: tokenizer.decode([token_id]), tokens)))


[32013, 1367, 1248, 4986]
<｜begin▁of▁sentence｜>你好世界
['<｜begin▁of▁sentence｜>', '你', '好', '世界']


In [3]:
# Raw vocabulary entries for the ids printed by the previous cell.
# convert_ids_to_tokens() shows the stored token strings, which for this
# tokenizer look different from the decoded text (see the recorded output).
tokens = [32013, 1367, 1248, 4986]
print(tokenizer.convert_ids_to_tokens(tokens))

# Vocabulary size and special tokens.
# NOTE(review): the vocab size printed here (32000) is smaller than the logits
# width printed in the last cell (32256) -- presumably vocab_size excludes
# added tokens; confirm against the tokenizer documentation.
print(f"vocab size: {tokenizer.vocab_size}")
print(f"all special tokens: {tokenizer.all_special_tokens}")
print(f"special tokens map: {tokenizer.special_tokens_map}")

# Same text again, this time without the automatically added special tokens.
tokens = tokenizer.encode("你好世界", add_special_tokens=False)
print(tokens)

# Per-id decoding of the special-token-free encoding.
print(list(map(lambda token_id: tokenizer.decode([token_id]), tokens)))

['<｜begin▁of▁sentence｜>', 'ä½ł', 'å¥½', 'ä¸ĸçķĮ']
vocab size: 32000
all special tokens: ['<｜begin▁of▁sentence｜>', '<｜end▁of▁sentence｜>']
special tokens map: {'bos_token': '<｜begin▁of▁sentence｜>', 'eos_token': '<｜end▁of▁sentence｜>', 'pad_token': '<｜end▁of▁sentence｜>'}
[1367, 1248, 4986]
['你', '好', '世界']


In [4]:
# Read the poem from disk and tokenize it with the DeepSeek tokenizer.
#
# The encoding is pinned explicitly: without it, open() decodes with the
# platform's locale encoding, so the Chinese text can fail to decode or turn
# into mojibake on systems whose default is not UTF-8.
# NOTE(review): assumes the file is saved as UTF-8 -- adjust if it is not.
with open("再别康桥.txt", "r", encoding="utf-8") as f:
    text = f.read()

# `text` is reused by later cells (Gemma tokenization, next-token prediction).
tokens = tokenizer.encode(text)

# One line per token: the id and the text that id decodes to on its own.
for t in tokens:
    print("token id:", t, "token str:", tokenizer.decode([t]))
print("length of tokens:", len(tokens))

token id: 32013 token str: <｜begin▁of▁sentence｜>
token id: 4642 token str: 轻
token id: 4642 token str: 轻
token id: 337 token str: 的
token id: 848 token str: 我
token id: 26388 token str: 走了
token id: 19385 token str: ，
token id: 185 token str: 

token id: 2198 token str: 正
token id: 1410 token str: 如
token id: 848 token str: 我
token id: 4642 token str: 轻
token id: 4642 token str: 轻
token id: 337 token str: 的
token id: 908 token str: 来
token id: 1989 token str: ；
token id: 185 token str: 

token id: 848 token str: 我
token id: 4642 token str: 轻
token id: 4642 token str: 轻
token id: 337 token str: 的
token id: 6716 token str: 招
token id: 1897 token str: 手
token id: 19385 token str: ，
token id: 185 token str: 

token id: 1147 token str: 作
token id: 2501 token str: 别
token id: 2787 token str: 西
token id: 17881 token str: 天的
token id: 5973 token str: 云
token id: 7817 token str: 彩
token id: 397 token str: 。
token id: 185 token str: 

token id: 185 token str: 

token id: 1865 token str: 那
token 

In [5]:
# Redundant re-import: AutoTokenizer is already imported in the first cell.
from transformers import AutoTokenizer

# Second tokenizer, loaded from the "google/gemma-2b" checkpoint. The next cell
# runs it on the same `text` so its tokenization can be compared with DeepSeek's.
tokenizer_gemma = AutoTokenizer.from_pretrained("google/gemma-2b")

In [6]:
# Tokenize the poem loaded earlier (`text`) with the Gemma tokenizer and dump
# every token, mirroring the DeepSeek dump so the two can be compared.
tokens = tokenizer_gemma.encode(text)
print("vocab size:", tokenizer_gemma.vocab_size)

# One line per token: the id, then the text that single id decodes to.
for token_id in tokens:
    piece = tokenizer_gemma.decode([token_id])
    print("token id:", token_id, "token str:", piece)

print("length of tokens:", len(tokens))

vocab size: 256000
token id: 2 token str: <bos>
token id: 79424 token str: 轻轻
token id: 153698 token str: 的我
token id: 44913 token str: 走了
token id: 235365 token str: ，
token id: 108 token str: 

token id: 161181 token str: 正如
token id: 235509 token str: 我
token id: 79424 token str: 轻轻
token id: 235370 token str: 的
token id: 235547 token str: 来
token id: 236334 token str: ；
token id: 108 token str: 

token id: 235509 token str: 我
token id: 79424 token str: 轻轻
token id: 235370 token str: 的
token id: 237219 token str: 招
token id: 235616 token str: 手
token id: 235365 token str: ，
token id: 108 token str: 

token id: 235591 token str: 作
token id: 236273 token str: 别
token id: 235990 token str: 西
token id: 55881 token str: 天的
token id: 236537 token str: 云
token id: 236729 token str: 彩
token id: 235362 token str: 。
token id: 109 token str: 


token id: 235779 token str: 那
token id: 236811 token str: 河
token id: 241225 token str: 畔
token id: 172765 token str: 的金
token id: 238110 token str: 柳


In [7]:
import torch
# Next-token prediction with the DeepSeek model and tokenizer from the first cell.

# The whole poem (`text`, read in an earlier cell) is the context.
inputs = tokenizer(text, return_tensors="pt")

# Forward pass only; gradients are not needed for inspection.
with torch.no_grad():
    outputs = model(**inputs)

# Scores at the last position of the first (only) sequence: these rank the
# candidates for the token that would follow the context.
logits = outputs.logits[0, -1]
# 1-D, one score per output id. The recorded size (32256) is larger than the
# tokenizer.vocab_size printed earlier (32000).
print(logits.shape)
# Ten highest raw scores and their ids.
print(logits.topk(10))

# Normalize the scores into a probability distribution over ids.
probs = logits.softmax(dim=-1)

# Ten most probable next tokens, printed as "<decoded token>: <probability>".
top_probs, top_indices = probs.topk(10)
for token_id, prob in zip(top_indices, top_probs):
    print(f"{tokenizer.decode(token_id)}: {prob.item():.5f}")


torch.Size([32256])
torch.return_types.topk(
values=tensor([ -5.1992,  -9.2840, -12.1456, -12.1604, -12.3317, -12.6403, -12.7734,
        -12.8906, -13.4317, -13.4851]),
indices=tensor([  185, 32014,    63,   207,    58,   971,  1183,    59,  4191,   397]))

: 0.97643
<｜end▁of▁sentence｜>: 0.01643
`: 0.00094
 : 0.00093
[: 0.00078
”: 0.00057
',: 0.00050
\: 0.00045
'': 0.00026
。: 0.00025
