In [None]:
# Import library
from transformers import AutoTokenizer

In [None]:
# Load the GPT-J tokenizer by its Hugging Face Hub identifier.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")

# Alternative: if the model files were downloaded manually, load the
# tokenizer from the local directory instead.
# tokenizer = AutoTokenizer.from_pretrained("./models/gpt-j-6B")

In [None]:
# Sample sentence reused by the following cells.
text = "Hello, my name is Mahdi"

# Calling the tokenizer directly returns the full encoding (input ids plus
# attention mask), not a list of tokens, so it gets its own name here. The
# name `tokens` is reserved for the string pieces produced by `tokenize()`,
# which avoids one name meaning two different kinds of object.
encoding = tokenizer(text)
print(encoding)

{'input_ids': [15496, 11, 616, 1438, 318, 8882, 10989], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


In [None]:
# encode() returns only the list of token ids (no attention mask).
token_ids = tokenizer.encode(text)
print(token_ids)

[15496, 11, 616, 1438, 318, 8882, 10989]


In [None]:
# decode() maps the ids back to text; the output reproduces the original sentence.
decoded_text = tokenizer.decode(token_ids)
print(decoded_text)

Hello, my name is Mahdi


In [None]:
# tokenize() returns the sub-word token strings rather than their ids.
# In the output, a leading "Ġ" marks a token that starts with a space.
tokens = tokenizer.tokenize(text)
print(tokens)

['Hello', ',', 'Ġmy', 'Ġname', 'Ġis', 'ĠMah', 'di']


In [None]:
# Encode again, this time asking for PyTorch tensors; the result gains a
# leading batch dimension (see the printed shape).
inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]

print(inputs)
print(input_ids)
print(input_ids.shape)
# Decode the single row of the batch back to text.
print(tokenizer.decode(input_ids[0]))

{'input_ids': tensor([[15496,    11,   616,  1438,   318,  8882, 10989]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}
tensor([[15496,    11,   616,  1438,   318,  8882, 10989]])
torch.Size([1, 7])
Hello, my name is Mahdi


In [None]:
# Look up ids through the public conversion API instead of `tokenizer.vocab`:
# on fast tokenizers `.vocab` rebuilds the complete vocabulary dict on every
# access, and the attribute is not available on every tokenizer class.
print(*tokenizer.convert_tokens_to_ids(["Mah", "di"]))

# len(tokenizer) is the full vocabulary size, added tokens included.
print(len(tokenizer))

40936 10989
50400


In [None]:
# Show which special tokens are configured; per the output, no pad token is defined yet.
print(tokenizer.special_tokens_map)

{'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}


In [None]:
# The tokenizer ships without a padding token, so register a dedicated one;
# batching sequences of different lengths needs it.
# NOTE(review): this adds a new id to the vocabulary. No model is loaded in
# this notebook, but a model paired with this tokenizer would presumably need
# `model.resize_token_embeddings(len(tokenizer))` -- confirm before use.
tokenizer.add_special_tokens({'pad_token': '<|pad|>'})

# Two inputs of different lengths, to make the padding visible.
batch_texts = ["hello", "this is a longer sentence"]

# padding=True        -> pad shorter sequences so the batch has a uniform length
# truncation=True     -> cut sequences that exceed max_length
# max_length=10       -> upper bound on sequence length, in tokens
# return_tensors="pt" -> PyTorch tensors instead of Python lists
inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=10, return_tensors="pt")

In [None]:
# Confirm the pad token is now registered alongside the other special tokens.
tokenizer.special_tokens_map

{'bos_token': '<|endoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|endoftext|>',
 'pad_token': '<|pad|>'}

In [None]:
# Padded batch: the shorter first row is filled up with the pad token id.
print(inputs.input_ids)

tensor([[31373, 50400, 50400, 50400, 50400],
        [ 5661,   318,   257,  2392,  6827]])


In [None]:
# Vocabulary size including added tokens (now counts the new pad token).
# len(tokenizer) avoids materialising the whole vocab dict just to count it.
print(len(tokenizer))

50401


In [None]:
# Invert the token -> id mapping so that ids can be looked up directly.
# get_vocab() is the documented accessor and includes added tokens.
id_to_word = {token_id: token for token, token_id in tokenizer.get_vocab().items()}

# Ids of the two sub-word pieces of "Mahdi" printed earlier.
print(id_to_word[40936], id_to_word[10989])

Mah di


In [None]:
# Look the pad token up by its id rather than hard-coding 50400, so the cell
# stays correct if the vocabulary size ever differs.
print(id_to_word[tokenizer.pad_token_id])

<|pad|>
