In [1]:
from dotenv import load_dotenv
load_dotenv()



True

In [2]:
from huggingface_hub import login as huggingface_login
from utils import decrypt_huggingface_token

huggingface_login(token=decrypt_huggingface_token())

  from .autonotebook import tqdm as notebook_tqdm


Hugging Face token decrypted successfully.


In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

In [4]:
model_name = "Qwen/Qwen3-8B"

# For larger models, consider quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    #bnb_4bit_use_double_quant=True, # Often helps
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config, # Apply quantization if desired
    # torch_dtype=torch.bfloat16, # Or torch.float16
    device_map="auto" # Automatically distribute model layers across available devices
)

Loading checkpoint shards: 100%|██████████| 5/5 [00:52<00:00, 10.51s/it]


### Regular text generation

In [5]:
from transformers import pipeline

# Using the pipeline for simplicity
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

prompt = "Write a short story about a cat who learns to fly."
generated_text = generator(prompt, max_new_tokens=100, num_return_sequences=1)

print(generated_text[0]["generated_text"])

Device set to use cuda:0


Write a short story about a cat who learns to fly. The story must include the following elements: a cat named Whiskers, a magical object, a mountain, and a transformation. Make sure the story has a clear beginning, middle, and end, and that the transformation is the climax. Also, ensure that the story is appropriate for children ages 8-12.

**Title: Whiskers and the Skyward Stone**

**Beginning:**  
In the quiet village of Mewntown, nestled between rolling hills and a towering mountain called


### Generate tokens

In [13]:
generation_with_logits = generation_output


logits = torch.cat(generation_with_logits.scores)
probs = torch.nn.functional.softmax(logits, dim=-1)
per_token_confidence = probs.max(dim=-1).values

A = torch.prod(per_token_confidence)

probs, per_token_confidence, A


(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0'),
 tensor([0.7129, 0.8558, 0.6946, 1.0000, 1.0000], device='cuda:0'),
 tensor(0.4238, device='cuda:0'))

In [6]:
prompt = "The capital of France is"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

generation_output = model.generate(
    input_ids,
    max_new_tokens=5, # Generate 5 new tokens
    return_dict_in_generate=True,
    output_scores=True, # This will return the logits for each generated token
    do_sample=True,     # Use sampling to get more varied probabilities
    temperature=0.7,    # Lower temperature for less randomness
    top_k=50,           # Top-k sampling
    top_p=0.95          # Top-p (nucleus) sampling
)

generated_ids = generation_output.sequences[0]
generated_scores = generation_output.scores

start_index_of_new_tokens = input_ids.shape[1]
new_generated_ids = generated_ids[start_index_of_new_tokens:]

print(f"Prompt (token IDs): {input_ids[0].tolist()}")
print(f"Generated sequence (full token IDs): {generated_ids.tolist()}")
print(f"Newly generated tokens (IDs): {new_generated_ids.tolist()}")

print("\n--- Detailed Output ---")
decoded_tokens_with_softmax = []

# Process each newly generated token and its corresponding scores
for i, token_id in enumerate(new_generated_ids):
    # Get the logits for the i-th generated token
    # scores[i] corresponds to the logits for predicting the (i+1)th generated token
    # (after the first i tokens were generated)
    logits_for_current_token = generated_scores[i][0] # [0] because batch_size is 1

    # Apply softmax to get probabilities
    probabilities = torch.softmax(logits_for_current_token, dim=-1)

    # Get the probability of the *chosen* token
    chosen_token_prob = probabilities[token_id].item()

    # Get the top N probable tokens and their probabilities for this step
    top_k_values, top_k_indices = torch.topk(probabilities, k=5) # Get top 5

    # Decode the chosen token
    decoded_chosen_token = tokenizer.decode(token_id)

    print(f"\nToken {i+1}: '{decoded_chosen_token}' (ID: {token_id})")
    print(f"Probability of chosen token: {chosen_token_prob:.4f}")
    print("Top 5 predictions for this step:")
    for j in range(top_k_values.shape[0]):
        top_prob = top_k_values[j].item()
        top_token_id = top_k_indices[j].item()
        top_decoded_token = tokenizer.decode(top_token_id)
        print(f"  - '{top_decoded_token}' (ID: {top_token_id}): {top_prob:.4f}")

    decoded_tokens_with_softmax.append({
        'token_id': token_id.item(),
        'decoded_token': decoded_chosen_token,
        'probability_of_chosen': chosen_token_prob,
        'top_predictions': [
            {'token_id': top_k_indices[j].item(), 'decoded_token': tokenizer.decode(top_k_indices[j].item()), 'probability': top_k_values[j].item()}
            for j in range(top_k_values.shape[0])
        ]
    })

print("\n--- Final Structured Output ---")
import json
print(json.dumps(decoded_tokens_with_softmax, indent=2, ensure_ascii=False))

# To get the full generated text from the token IDs
full_decoded_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
print(f"\nFull generated text: {full_decoded_text}")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Prompt (token IDs): [785, 6722, 315, 9625, 374]
Generated sequence (full token IDs): [785, 6722, 315, 9625, 374, 12095, 13, 3555, 374, 279]
Newly generated tokens (IDs): [12095, 13, 3555, 374, 279]

--- Detailed Output ---

Token 1: ' Paris' (ID: 12095)
Probability of chosen token: 0.7129
Top 5 predictions for this step:
  - ' Paris' (ID: 12095): 0.7129
  - ' a' (ID: 264): 0.0855
  - ' located' (ID: 7407): 0.0732
  - ' in' (ID: 304): 0.0479
  - ' __' (ID: 1304): 0.0342

Token 2: '.' (ID: 13)
Probability of chosen token: 0.8558
Top 5 predictions for this step:
  - '.' (ID: 13): 0.8558
  - ',' (ID: 11): 0.0785
  - '.
' (ID: 624): 0.0657
  - '"' (ID: 1): 0.0000
  - '!' (ID: 0): 0.0000

Token 3: ' What' (ID: 3555)
Probability of chosen token: 0.2275
Top 5 predictions for this step:
  - ' The' (ID: 576): 0.6946
  - ' What' (ID: 3555): 0.2275
  - ' Which' (ID: 15920): 0.0349
  - ' This' (ID: 1096): 0.0239
  - ' Is' (ID: 2160): 0.0100

Token 4: ' is' (ID: 374)
Probability of chosen token: 1.0