In [62]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

import pandas as pd

In [3]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"

In [5]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16).to(DEVICE)

In [6]:
INPUT_TEXT = """Qwen2 is the new series of Qwen large language models. For Qwen2, we release a number of base language models and 
instruction-tuned language models ranging from 0.5 to 72 billion parameters, including a Mixture-of-Experts model. This repo contains the 
instruction-tuned 1.5B Qwen2 model.

Compared with the state-of-the-art opensource language models, including the previous released Qwen1.5, Qwen2 has generally surpassed most 
opensource models and demonstrated competitiveness against proprietary models across a series of benchmarks targeting for language 
understanding, language generation, multilingual capability, coding, mathematics, reasoning, etc."""

In [7]:
encoded_val = tokenizer.encode(INPUT_TEXT, return_tensors="pt")

In [8]:
# Lets see how the qwen2 model encode the value
enc_dec_dict = {}
for num in encoded_val[0]:
    dec_val = tokenizer.decode(num)
    enc_dec_dict[num.item()] = dec_val

print(enc_dec_dict)

{48: 'Q', 16948: 'wen', 17: '2', 374: ' is', 279: ' the', 501: ' new', 4013: ' series', 315: ' of', 1207: ' Q', 3460: ' large', 4128: ' language', 4119: ' models', 13: '.', 1752: ' For', 11: ',', 582: ' we', 4879: ' release', 264: ' a', 1372: ' number', 2331: ' base', 323: ' and', 715: ' \n', 54974: 'instruction', 2385: '-t', 48883: 'uned', 23994: ' ranging', 504: ' from', 220: ' ', 15: '0', 20: '5', 311: ' to', 22: '7', 7094: ' billion', 5029: ' parameters', 2670: ' including', 386: ' M', 12735: 'ixture', 8668: '-of', 12: '-', 86141: 'Experts', 1614: ' model', 1096: ' This', 15867: ' repo', 5610: ' contains', 16: '1', 33: 'B', 382: '.\n\n', 1092: 'Com', 7212: 'pared', 448: ' with', 1584: ' state', 10603: '-the', 37821: '-art', 15885: ' opens', 919: 'ource', 3681: ' previous', 5880: ' released', 702: ' has', 8789: ' generally', 67228: ' surpassed', 1429: ' most', 44408: 'opensource', 20459: ' demonstrated', 76551: ' competitiveness', 2348: ' against', 33233: ' proprietary', 3941: ' acr

In [9]:
dec_val = tokenizer.decode(num)

In [10]:
# Lets calculate the probability of next token
inputs = encoded_val.to(DEVICE)
with torch.no_grad(): # disabling gradient calculation to save memory, as its not required during inference
    logits = model(inputs).logits[:, -1, :]
    probabilities = torch.nn.functional.softmax(logits[0], dim=-1)

In [11]:
logits = model(inputs).logits

In [12]:
# Here the logits contains a nested list with 140 tokens and the entire vocabulary of the model

logits.shape

torch.Size([1, 140, 151936])

In [20]:
# Lets see the vocabulary

vocabulary = tokenizer.get_vocab()
type(vocabulary), len(vocabulary)

(dict, 151646)

In [30]:
c = 0
vocabulary_sorted = sorted(vocabulary.items(), key=lambda kv: kv[1])
for i in vocabulary_sorted[:10]:
    print(i)

('!', 0)
('"', 1)
('#', 2)
('$', 3)
('%', 4)
('&', 5)
("'", 6)
('(', 7)
(')', 8)
('*', 9)


In [33]:
for i in vocabulary_sorted[50000:50010]:
    print(i)

('RESET', 50000)
('Ġpostpon', 50001)
('Discover', 50002)
('arrison', 50003)
('shaw', 50004)
('blood', 50005)
('AJOR', 50006)
('æĽ´æĸ°', 50007)
('ĠMuse', 50008)
('æĶ¶', 50009)


So the tokenizer is doing nothing, but breaking the text into tokens and then mapping into the Ids present in the models vocabulary.

In [46]:
logits.shape

torch.Size([1, 140, 151936])

This is nothing but a matrix, where

number of batch = 1 <br>
input tokens = 140 <br>
vocabulary size = 151936 <br>

If we see this as a table (n-gram table), the index will be the input ids and the columns is the vocabulary (starting from 0 to MAX). And the corresponding shell value is the logit value of the token (column) for the respective token Ids. Lets get the value after the last token and its probability.

In [55]:
val = logits[:, -1, :] # Getting the logit values for the last token
probabilities = torch.nn.functional.softmax(val[0], dim=-1) # Getting the probabilities values for the last token logits

In [59]:
# Getting the token id with maximum probability
max_prob_token = probabilities.argmax()
tokenizer.decode(max_prob_token)

' \n\n'

In [66]:
# Getting all the probabilities
prob_df = pd.DataFrame([(id_, tokenizer.decode(id_), prob_value.item()) for id_, prob_value in enumerate(probabilities)], 
                      columns=["token_id", "token", "probability"])
prob_df = prob_df.sort_values(by="probability", ascending=False)
prob_df.head(10)

Unnamed: 0,token_id,token,probability
4710,4710,\n\n,0.227539
576,576,The,0.088867
758,758,In,0.064941
1752,1752,For,0.047607
1205,1205,We,0.034912
715,715,\n,0.034912
5209,5209,Please,0.034912
1096,1096,This,0.032715
1084,1084,It,0.028931
1207,1207,Q,0.022461


## Lets generate next 100 tokens by using generate method

In [68]:
# Use the `generate` method to generate lots of text
output = model.generate(inputs, max_length=200, pad_token_id=tokenizer.eos_token_id)

In [80]:
print(f"INPUT TEXT \n\n {INPUT_TEXT}", end="\n"+"="*130+"\n\n")
print(f"OUTPUT_TEXT \n\n{tokenizer.decode(output[0])}")

INPUT TEXT 

 Qwen2 is the new series of Qwen large language models. For Qwen2, we release a number of base language models and 
instruction-tuned language models ranging from 0.5 to 72 billion parameters, including a Mixture-of-Experts model. This repo contains the 
instruction-tuned 1.5B Qwen2 model.

Compared with the state-of-the-art opensource language models, including the previous released Qwen1.5, Qwen2 has generally surpassed most 
opensource models and demonstrated competitiveness against proprietary models across a series of benchmarks targeting for language 
understanding, language generation, multilingual capability, coding, mathematics, reasoning, etc.

OUTPUT_TEXT 

Qwen2 is the new series of Qwen large language models. For Qwen2, we release a number of base language models and 
instruction-tuned language models ranging from 0.5 to 72 billion parameters, including a Mixture-of-Experts model. This repo contains the 
instruction-tuned 1.5B Qwen2 model.

Compared with the s

-------------------------------------------------------------------------------------------------------------------------- <br>
So the output generated by using generate method also includes the input tokens and the number of tokens can be calculated by<br>

number of new token generated = max_length value - number of input tokens