## Lab 16 — Transformer “Sanity Tests”

In this lab, we will use a real open-source Transformer (via **HuggingFace Transformers**) as a black box and run a set of **basic tests** to understand what is happening inside a modern LLM pipeline. Instead of training from scratch, we focus on **observability**: tokenization behavior, embedding geometry, and next-token prediction.

You will:
- Load a pretrained tokenizer + causal language model (e.g., **Qwen2.5**).
- Inspect **tokenization outputs** (tokens, token IDs, and how spaces are handled).
- Extract **input embeddings** and run a small **embedding similarity** experiment across words from different semantic categories.
- Implement a simple generation loop to compare **greedy decoding** vs **sampling with temperature**.
- Probe the model by printing the **top-k next token probabilities** for a given prompt, and interpret what those probabilities mean.

By the end, you should be able to explain:  
(1) why tokenization details matter, (2) what embeddings represent geometrically, and (3) how a Transformer turns a prompt into a probability distribution over the next token.

In [None]:
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import numpy as np

## Obtaining a DeepSeek API Key

To call the DeepSeek API, you first need to obtain a personal **API key**.

1. Visit the DeepSeek official website:  
   https://platform.deepseek.com/

2. Sign up for an account (or log in if you already have one).

3. Go to the **API / Developer** section of the dashboard.

4. Create a new API key and copy it.

5. Paste the key into your notebook or script:
   ```python
   api_key = "YOUR_API_KEY_HERE"
   ```


In [None]:
import os

# Read the DeepSeek key from the DEEPSEEK_API_KEY environment variable so the
# secret does not have to be stored in the notebook. When the variable is not
# set this falls back to "" (the original placeholder); assign api_key by hand
# in that case.
api_key = os.environ.get("DEEPSEEK_API_KEY", "")
url = "https://api.deepseek.com/v1/chat/completions"
# Bearer-token header built from api_key.
headers = {"Authorization": f"Bearer {api_key}"}

In [None]:
# BERT tokenizer kept under its own name; it is not referenced again in the
# visible part of this notebook (presumably loaded for comparison — TODO confirm).
C_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Checkpoint name reused for the tokenizer here and for the model in a later cell.
model_name = "Qwen/Qwen2.5-1.5B"
# Alternative checkpoint left by the author: model_name = "Qwen/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Disabled: the model is loaded with AutoModelForCausalLM in a later cell instead.
# model = AutoModel.from_pretrained(model_name)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# float16 halves the memory footprint on a GPU, but half-precision kernels are
# slow or missing on CPU, so use float32 when no CUDA device is available.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    trust_remote_code=True,
)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using Device: {device}")

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

# A model loaded with device_map=... has already been placed on its device(s)
# and carries an `hf_device_map`. Calling .to() on such a model is redundant at
# best and can fail when some modules are offloaded, so only move models that
# were loaded without a device map.
if getattr(model, "hf_device_map", None) is None:
    model = model.to(device)

In [None]:

# Summarise the loaded tokenizer: its class name, its vocabulary size and the
# values of four special-token attributes.
print(f"Tokenizer: {type(tokenizer).__name__}")
print(f"Vocab size: {tokenizer.vocab_size}")
print("Special token:")
print(
    *(
        f" - {label}: {getattr(tokenizer, label.lower() + '_token')}"
        for label in ("CLS", "SEP", "PAD", "UNK")
    ),
    sep="\n",
)

In [None]:
# Tokenize one sentence two ways — as token strings and as vocabulary ids —
# and print both views together with the token count.
test_texts = "All happy families are alike; each unhappy family is unhappy in its own way."
tokens = tokenizer.tokenize(test_texts)
token_ids = tokenizer.encode(test_texts)
print(
    f"\nOriginal: {test_texts}",
    f"Tokens: {tokens}",
    f"Token IDs: {token_ids[:30]}...",
    f"toke num: {len(tokens)}",
    sep="\n",
)

In [None]:
def explain_space_handling(examples=None, tok=None):
    """Show how the tokenizer treats spaces in a handful of short strings.

    Each example is tokenized and printed; every token that contains the
    byte-level BPE space marker gets an extra explanatory line.

    Args:
        examples: Strings to tokenize. Defaults to four spellings of
            "hello" / "hello world" that differ only in their spaces.
        tok: Object with a ``tokenize(text)`` method. Defaults to the
            notebook-level ``tokenizer``.
    """
    # Byte-level BPE vocabularies render the space byte as U+0120 ("Ġ").
    # It is written as an escape so the check survives an encoding round-trip
    # of this file; the previous literal was garbled and could never match.
    space_marker = "\u0120"

    if examples is None:
        examples = [
            "hello world",
            "hello  world",
            "hello",
            " hello",
        ]
    if tok is None:
        tok = tokenizer

    for text in examples:
        tokens = tok.tokenize(text)
        print(f"'{text}' → {tokens}")
        for token in tokens:
            if space_marker in token:
                print(f" Note that  '{token}' has a space before it")
                
# Run the space-handling demo; it prints its findings and returns nothing.
explain_space_handling()

In [None]:
# Input-embedding module of the model. The bare expression on the last line
# lets the notebook display it as the cell's result.
embedding_layer = model.get_input_embeddings()
embedding_layer

In [None]:
import torch
import torch.nn.functional as F

# Device of the model's first parameter. Note that this rebinds the `device`
# name already assigned in an earlier cell.
device = next(model.parameters()).device

def get_word_embedding(word, model, tokenizer):
    """Return a single input-embedding vector for *word*.

    The word is tokenized without special tokens, each token id is looked up
    in the model's input-embedding table, and the token vectors are averaged
    so that a word split into several tokens still yields one vector.

    Args:
        word: Text to embed.
        model: Object exposing ``get_input_embeddings()``.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``.

    Returns:
        The mean of the token embeddings (one vector of size ``dim``).

    Raises:
        ValueError: If *word* produces no tokens; the mean over zero rows
            would otherwise be NaN.
    """
    embedding_layer = model.get_input_embeddings()
    token_ids = tokenizer.encode(word, add_special_tokens=False)
    if not token_ids:
        raise ValueError(f"{word!r} produced no tokens; cannot build an embedding")

    # Put the ids on the embedding table's own device rather than relying on a
    # notebook-level `device` variable that may be stale or undefined.
    token_ids = torch.tensor(token_ids, device=embedding_layer.weight.device)

    with torch.no_grad():
        embeddings = embedding_layer(token_ids)   # [num_tokens, dim]
        return embeddings.mean(dim=0)

def cosine_similarity_torch(vec1, vec2):
    """Return the cosine similarity of two vectors as a Python float."""
    # Add a leading batch axis so the similarity is taken along axis 1.
    lhs = vec1[None]
    rhs = vec2[None]
    score = F.cosine_similarity(lhs, rhs, dim=1)
    return score.item()

# Three loose semantic groups: animals, weather, motion verbs.
words = ["cat", "dog", "lion", "wind", "rain", "snow", "run", "walk", "jump"]

print("🔍 Similarity matrix")
print("=" * 120)
# Row labels below are rendered as f"{word:>12}: " — 14 characters — so the
# header is indented by 14 (not 12) to keep each column title above its values.
print(" " * 14 + "".join(f"{word:>10}" for word in words))


# Embed every word once; the nested loop below only reads from this dict.
word_embeddings = {
    word: get_word_embedding(word, model, tokenizer)
    for word in words
}

for word1 in words:
    print(f"{word1:>12}: ", end="")
    for word2 in words:
        sim = cosine_similarity_torch(
            word_embeddings[word1],
            word_embeddings[word2]
        )

        # ANSI colours: green above 0.2, red below 0.1, plain in between.
        if sim > 0.2:
            print(f"\033[92m{sim:>10.3f}\033[0m", end="")
        elif sim < 0.1:
            print(f"\033[91m{sim:>10.3f}\033[0m", end="")
        else:
            print(f"{sim:>10.3f}", end="")
    print()


In [None]:
def robust_generate(prompt, model, tokenizer, max_new_tokens=50, entropy=0.5):
    """Generate a continuation of *prompt* one token at a time, printing each step.

    Generation stops after ``max_new_tokens`` tokens, when the end-of-sequence
    token is produced, or when a generated token decodes to a newline or to a
    sentence-final punctuation mark (ASCII or full-width CJK).

    Args:
        prompt: Text to continue.
        model: Callable as ``model(input_ids=...)``; must expose ``.device``
            and return an object with ``.logits`` (or ``.last_hidden_state``).
        tokenizer: Callable as ``tokenizer(prompt, return_tensors="pt")``;
            must expose ``decode`` and ``eos_token_id``.
        max_new_tokens: Upper bound on the number of generated tokens.
        entropy: ``0.0`` selects greedy decoding; any other value is mapped
            to the sampling temperature ``0.1 + 1.9 * entropy``.

    Returns:
        The prompt followed by the generated text.

    Raises:
        ValueError: If *entropy* maps to a non-positive temperature.
    """
    greedy = entropy == 0.0
    temperature = 0.1 + entropy * 1.9
    if not greedy and temperature <= 0:
        raise ValueError(
            f"entropy={entropy!r} maps to temperature {temperature:.3f}; it must be > 0"
        )
    strategy = "Greedy" if greedy else f"temperature(t={temperature:.1f})"

    # A token that decodes to exactly one of these ends the generation.
    # The full-width marks had been garbled by an encoding round-trip and
    # therefore never matched.
    stop_tokens = ('\n', '.', '!', '?', '。', '！', '？')

    device = model.device

    print(f"Input: '{prompt}'")
    print(f"Entropy: {entropy} ")

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    input_ids = inputs.input_ids

    generated_ids = []

    with torch.no_grad():
        for i in range(max_new_tokens):
            outputs = model(input_ids=input_ids)

            if hasattr(outputs, 'logits'):
                logits = outputs.logits
            else:
                # NOTE(review): hidden states are not vocabulary logits, so a
                # model without an LM head yields meaningless tokens here —
                # confirm whether this fallback is still wanted.
                logits = outputs.last_hidden_state

            # Work in float32: dividing half-precision logits by a small
            # temperature can overflow and turn the softmax into NaNs.
            next_token_logits = logits[:, -1, :].float()

            if greedy:
                next_token_id = torch.argmax(next_token_logits, dim=-1, keepdim=True)
            else:
                probs = torch.softmax(next_token_logits / temperature, dim=-1)
                next_token_id = torch.multinomial(probs, num_samples=1)

            # The per-token text is only used for the progress line and the
            # stop check; a token holding part of a multi-byte character may
            # not decode cleanly on its own.
            new_token = tokenizer.decode(next_token_id[0], skip_special_tokens=True)
            generated_ids.append(next_token_id.item())

            print(f"Token {i+1}: '{new_token}' ({strategy})")

            input_ids = torch.cat([input_ids, next_token_id], dim=1)

            # Stop conditions
            if next_token_id.item() == tokenizer.eos_token_id:
                break
            if new_token in stop_tokens:
                break

    # Decode all new ids together so characters that span several tokens are
    # reassembled correctly instead of being joined from broken fragments.
    generated_text = prompt + tokenizer.decode(generated_ids, skip_special_tokens=True)
    print(f"Test: {generated_text}")
    return generated_text

In [None]:
def ask_question(question, model, tokenizer, max_answer_tokens=50, entropy=0.5):
    """Wrap *question* in a simple prompt, generate an answer and return it.

    Args:
        question: The question text.
        model: Passed through to ``robust_generate``.
        tokenizer: Passed through to ``robust_generate``.
        max_answer_tokens: Forwarded as ``max_new_tokens``.
        entropy: Forwarded as the decoding ``entropy``.

    Returns:
        The generated text with the prompt removed and surrounding
        whitespace stripped.
    """
    prompt = f"Question：{question}\n"
    print(f"🤔 question: {question}")

    # Forward the caller's entropy; it used to be pinned to 0.5 here, so the
    # argument had no effect.
    full_response = robust_generate(
        prompt,
        model,
        tokenizer,
        max_new_tokens=max_answer_tokens,
        entropy=entropy,
    )

    # Drop only the leading prompt. A blanket str.replace would also delete
    # any later repetition of the prompt inside the answer.
    if full_response.startswith(prompt):
        answer = full_response[len(prompt):].strip()
    else:
        answer = full_response.strip()

    print(f"\n🎯 Final answer: {answer}")
    print(f"📊 Length: {len(answer)} characters")

    return answer

In [None]:
# "Who is Confucius?" — the literal had been garbled by an encoding
# round-trip, so the model was being prompted with unreadable bytes.
question = "孔子是谁"
answer = ask_question(question, model, tokenizer, max_answer_tokens=50, entropy=0.2)

In [None]:
def show_next_token_probabilities(prompt, model, tokenizer, top_k=20):
    """Print the most probable next tokens for *prompt*.

    Args:
        prompt: Text whose continuation is being probed.
        model: Callable as ``model(**inputs)``; must expose ``.device`` and
            return an object with ``.logits`` (or ``.last_hidden_state``).
        tokenizer: Callable as ``tokenizer(prompt, return_tensors="pt")``;
            must expose ``decode``.
        top_k: Number of candidates to list; capped at the size of the
            model's output distribution.

    Returns:
        None. The ranking is printed.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model(**inputs)

    if hasattr(outputs, 'logits'):
        logits = outputs.logits
    else:
        # NOTE(review): hidden states are not vocabulary logits — confirm
        # whether this fallback is still wanted.
        logits = outputs.last_hidden_state

    # Take the softmax in float32 so small probabilities from a
    # half-precision model are not flushed to zero.
    next_token_logits = logits[:, -1, :].float()
    probs = torch.softmax(next_token_logits, dim=-1)

    # torch.topk raises when k exceeds the number of entries, so cap it.
    k = min(top_k, probs.size(-1))
    top_probs, top_indices = torch.topk(probs, k)

    print(f"🔍 Prompt: '{prompt}'")
    print(f"📊 Top {k} predictions for the next token:\n")

    # .tolist() yields plain Python numbers for decoding and formatting.
    ranked = zip(top_probs[0].tolist(), top_indices[0].tolist())
    for rank, (prob, token_id) in enumerate(ranked, start=1):
        token_text = tokenizer.decode([token_id])

        # repr() makes whitespace and control characters visible; the slice
        # strips the surrounding quotes.
        display_text = repr(token_text)[1:-1]

        print(f"{rank:2d}. '{display_text:10s}' (ID: {token_id:5d}) - 概率: {prob:.4f}")


# Prompt: "Question: Who is Confucius?" followed by the start of an answer,
# "Confucius was a great thinker of ancient China,". The literal had been
# garbled by an encoding round-trip and is restored here.
current_prompt = "问题：孔子是谁\n孔子是我国古代的大思想家，"
show_next_token_probabilities(current_prompt, model, tokenizer, top_k=100)