### Download Qwen Instruct Model and Run through structured data   

In [1]:
import os, json, re, time

!pip -q install -U transformers accelerate datasets evaluate sentencepiece

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/10.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/10.1 MB[0m [31m80.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━[0m [32m8.5/10.1 MB[0m [31m120.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m10.1/10.1 MB[0m [31m122.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m84.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/515.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.2/515.2 kB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [

A1) Choose a small instruct model (3B–7B)

Pick one that fits your Colab GPU. Good defaults:

If you often get T4: start with 3B.

If you have L4/A100 (Colab Pro): you can try 7B.

In [2]:
MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"  # safe Colab choice
# MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"  # if you have stronger GPU

A2) Load model + tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

model = AutoModel   ForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)
model.eval()

print("Device:", next(model.parameters()).device)


In [4]:
print(model)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 2048)
    (layers): ModuleList(
      (0-35): 36 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear(in_features=2048, out_features=256, bias=True)
          (v_proj): Linear(in_features=2048, out_features=256, bias=True)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=2048, out_features=11008, bias=False)
          (up_proj): Linear(in_features=2048, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen2RMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((2048,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((2048,), eps=1e-06)
    (ro

A3) Chat template + system prompts

Most modern instruct models expect a “chat format”. We’ll use the tokenizer’s built-in chat template when available.

In [5]:
SYSTEM_PROMPT = (
    "You are a helpful assistant. "
    "Be concise. If unsure, say you don't know."
)

def build_chat_prompt(user_msg: str, system_msg: str = SYSTEM_PROMPT):
    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg},
    ]
    # apply_chat_template returns a string prompt; add_generation_prompt adds the assistant role start
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    return prompt


A4) Basic generation function (greedy or sampling)

In [9]:
@torch.inference_mode()
def generate_text(user_msg, max_new_tokens=256, temperature=0.7, top_p=0.9, do_sample=True):
    prompt = build_chat_prompt(user_msg)
    print("PROMPT:\n", prompt)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    gen = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature if do_sample else None,
        top_p=top_p if do_sample else None,
        eos_token_id=tokenizer.eos_token_id,
    )
    out = tokenizer.decode(gen[0], skip_special_tokens=True)

    # For chat templates, the decoded text includes prompt + response; extract response crudely
    # (good enough for Week 1; we’ll improve later)
    return out

print(generate_text("Give me 5 ideas to celebrate 3rd marraige aniversary in las vegas."))


PROMPT:
 <|im_start|>system
You are a helpful assistant. Be concise. If unsure, say you don't know.<|im_end|>
<|im_start|>user
Give me 5 ideas to celebrate 3rd marraige aniversary in las vegas.<|im_end|>
<|im_start|>assistant

system
You are a helpful assistant. Be concise. If unsure, say you don't know.
user
Give me 5 ideas to celebrate 3rd marraige aniversary in las vegas.
assistant
Here are 5 ideas for celebrating a 3rd marriage anniversary in Las Vegas:

1. Enjoy a sunset cruise on the Las Vegas Strip.
2. Visit the Bellagio fountains and have dinner at their Viceroy restaurant.
3. Attend a show at the Planet Hollywood Theater.
4. Have a private dinner at one of Las Vegas' luxury hotels.
5. Take a helicopter tour over the city.


In [7]:
prompt_example = build_chat_prompt("What is the capital of France?")
inputs_example = tokenizer(prompt_example, return_tensors="pt")
print(inputs_example)

{'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
           2823,  63594,     13,   1416,  42903,     11,   1977,    498,   1513,
            944,   1414,     13, 151645,    198, 151644,    872,    198,   3838,
            374,    279,   6722,    315,   9625,     30, 151645,    198, 151644,
          77091,    198]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [8]:
from pydantic import BaseModel, ValidationError
from typing import List

class PartyIdeas(BaseModel):
    themes: List[str]

JSON_SYSTEM = (
    "You are a JSON-only assistant. "
    "Return ONLY valid JSON. No markdown. No extra text."
)

def extract_first_json(text: str):
    # simple heuristic: find first {...}
    m = re.search(r"\{.*\}", text, flags=re.DOTALL)
    return m.group(0) if m else None

resp = generate_text(
    "Generate 5 adult birthday party themes. Output JSON with key 'themes' as a list of strings.",
    max_new_tokens=256,
    temperature=0.2,
    top_p=1.0,
    do_sample=True
)

json_str = extract_first_json(resp)
print("RAW:\n", resp)
print("\nJSON CANDIDATE:\n", json_str)

try:
    obj = PartyIdeas.model_validate_json(json_str)
    print("\nVALIDATED:", obj)
except ValidationError as e:
    print("\nVALIDATION ERROR:", e)


RAW:
 system
You are a helpful assistant. Be concise. If unsure, say you don't know.
user
Generate 5 adult birthday party themes. Output JSON with key 'themes' as a list of strings.
assistant
```json
{
  "themes": [
    "Vintage Glamour",
    "Under the Sea",
    "Rock & Roll Night",
    "Garden Party",
    "Space Adventure"
  ]
}
```

JSON CANDIDATE:
 {
  "themes": [
    "Vintage Glamour",
    "Under the Sea",
    "Rock & Roll Night",
    "Garden Party",
    "Space Adventure"
  ]
}

VALIDATED: themes=['Vintage Glamour', 'Under the Sea', 'Rock & Roll Night', 'Garden Party', 'Space Adventure']


In [9]:
from transformers import TextIteratorStreamer
import threading

def stream_generate(user_msg, max_new_tokens=256, temperature=0.7, top_p=0.9):
    prompt = build_chat_prompt(user_msg)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    kwargs = dict(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        streamer=streamer,
    )

    thread = threading.Thread(target=model.generate, kwargs=kwargs)
    thread.start()

    out = ""
    for chunk in streamer:
        out += chunk
        print(chunk, end="", flush=True)
    print()
    return out

_ = stream_generate("Explain RoPE in simple terms with a short example.", max_new_tokens=200)


RoPE is a technique used in transformer models to handle sequences that have different lengths by incorporating positional information into the model's input embeddings. This helps in preserving the relative position between tokens, which is crucial for tasks like language modeling.

In simple terms, imagine you have two sentences: "I love cats" and "Cats love me". These sentences are the same but with different orders of words. RoPE allows the model to understand that "love" and "cats" are closer together in the first sentence than they are in the second one.

A short example:
- Sentence A: "Hello world"
- Sentence B: "world Hello"

Without RoPE, a transformer might treat "Hello" and "world" as equally distant. With RoPE, it understands that "Hello" is closer to "world" in the first sentence compared to the second sentence.

RoPE achieves this by adding a special token at the beginning of each sequence that encodes the relative length of the sequence and
