In [20]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import torch.nn as nn

# Hugging Face Hub id of the pre-trained checkpoint used throughout this notebook.
# Model card: https://huggingface.co/Qwen/Qwen3-0.6B
model_name = "Qwen/Qwen3-0.6B"

In [None]:
# Load the pre-trained causal language model (only the model; the tokenizer is loaded in a later cell).
# torch_dtype="auto" / device_map="auto" leave the choice of dtype and device placement to the library.
original_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")

In [22]:
# Prefer the GPU when one is available, otherwise run on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Print every top-level submodule (index, name, module) to see where the model can be cut.
for i, child in enumerate(original_model.named_children()):
    name, module = child
    print(i, name, module)

0 model Qwen3Model(
  (embed_tokens): Embedding(151936, 1024)
  (layers): ModuleList(
    (0-27): 28 x Qwen3DecoderLayer(
      (self_attn): Qwen3Attention(
        (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
        (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
        (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
        (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
        (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
        (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
      )
      (mlp): Qwen3MLP(
        (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
        (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
        (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
        (act_fn): SiLUActivation()
      )
      (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
      (post_attention_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
    )
  )
  (norm):

As we can see, the Qwen3-0.6B model is very fine-tuning friendly. It has essentially only two top-level modules: the transformer backbone and the language modeling head. This means that we can easily swap out the head for any other task-specific head; in our case, a "grading head".

In [23]:
# Select the transformer backbone by its attribute name (`model`, the first top-level
# module listed above) instead of relying on the order in which children are registered.
backbone = original_model.model
print(backbone)

Qwen3Model(
  (embed_tokens): Embedding(151936, 1024)
  (layers): ModuleList(
    (0-27): 28 x Qwen3DecoderLayer(
      (self_attn): Qwen3Attention(
        (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
        (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
        (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
        (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
        (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
        (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
      )
      (mlp): Qwen3MLP(
        (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
        (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
        (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
        (act_fn): SiLUActivation()
      )
      (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
      (post_attention_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
    )
  )
  (norm): Qwen3RM

In [24]:
# Explore the backbone output shapes
backbone.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Send the inputs to the same `device` the backbone was just moved to.
sample_inputs = tokenizer(["I like tortilla de patata..."], return_tensors="pt").to(device)
with torch.no_grad():  # shape inspection only, no gradients needed
    backbone_output = backbone(**sample_inputs)
print(backbone_output.last_hidden_state.shape)  # (1, sequence_length, hidden_size)

torch.Size([1, 8, 1024])


In [48]:
# The hidden size seen in the output shape above is also available from the backbone's config:
in_dim = backbone.config.hidden_size
print(in_dim)  # 1024

1024


In [None]:
# Finally, let's combine the backbone with a custom head for our grading task.


class GradingHead(nn.Module):
    """Two-layer MLP that turns a feature vector into a single grade between 0 and 1.

    Args:
        in_dim: size of the last dimension of the input features.
        hidden_dim: width of the hidden layer.
        dropout: dropout probability applied after the hidden activation.
    """

    def __init__(self, in_dim, hidden_dim=512, dropout=0.5):
        super().__init__()
        # Keep the layers in this exact order: their positions define the
        # parameter names (net.0.*, net.3.*) stored in the state dict.
        layers = [
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid(),  # squashes the output to the [0, 1] range
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the grade for `x`; the last dimension of the result has size 1."""
        grade = self.net(x)
        return grade


# Size the head from the backbone's hidden size (`in_dim`, read from its config above)
# instead of hard-coding 1024.
head = GradingHead(in_dim, hidden_dim=512).to(device)

In [43]:
# Combine the backbone and the head


class CombinedModel(nn.Module):
    """Backbone followed by a task head applied to one token's hidden state.

    Args:
        backbone: module called as ``backbone(**inputs)`` whose result exposes
            ``last_hidden_state`` with shape (batch_size, seq_len, hidden_size).
        head: module applied to the pooled (batch_size, hidden_size) tensor.
    """

    def __init__(self, backbone, head):
        super().__init__()
        self.backbone = backbone
        self.head = head

    def forward(self, **inputs):
        """Run the backbone on ``inputs`` and feed the last real token's state to the head.

        When a 2-D ``attention_mask`` is passed, the pooled position of each sequence is
        the last one whose mask is non-zero, so padded batches (left- or right-padded)
        are not scored from a padding token. A row whose mask is entirely zero falls back
        to position 0. Without a mask, the final position is used.
        """
        backbone_outputs = self.backbone(**inputs)
        last_hidden_state = backbone_outputs.last_hidden_state  # (batch_size, seq_len, hidden_size)

        attention_mask = inputs.get("attention_mask")
        if attention_mask is None or attention_mask.dim() != 2:
            # No usable padding information: use the representation of the final position.
            pooled = last_hidden_state[:, -1, :]  # (batch_size, hidden_size)
        else:
            batch_size, seq_len = last_hidden_state.shape[:2]
            positions = torch.arange(seq_len, device=last_hidden_state.device)
            attended = (attention_mask != 0).to(device=last_hidden_state.device, dtype=positions.dtype)
            # Masked positions contribute 0, so argmax yields the highest attended index.
            last_token_idx = (positions * attended).argmax(dim=1)  # (batch_size,)
            batch_idx = torch.arange(batch_size, device=last_hidden_state.device)
            pooled = last_hidden_state[batch_idx, last_token_idx]  # (batch_size, hidden_size)

        scores = self.head(pooled)  # (batch_size, 1) for a single-output head
        return scores


# Before combining, cast the backbone to float32. The model was loaded with torch_dtype="auto",
# so its weights may be in a reduced-precision dtype; this is a dtype cast, not de-quantization.
# NOTE: nn.Module.to() converts the module in place and returns it, so `unquantized_backbone`
# and `backbone` refer to the same object.
unquantized_backbone = backbone.to(torch.float32)
model = CombinedModel(unquantized_backbone, head).to(device)

In [46]:
# Explore the new model's output shapes
tokenizer = AutoTokenizer.from_pretrained(model_name)
sample_inputs = tokenizer(["I like tortilla de patata..."], return_tensors="pt").to(device)
# Switch to evaluation mode first: the freshly created head is in training mode, so its
# Dropout layer would randomly zero activations and make the printed score non-deterministic
# (torch.no_grad() does not disable dropout). Call model.train() again before fine-tuning.
model.eval()
with torch.no_grad():
    model_output = model(**sample_inputs)
print(model_output.shape)  # (1, 1)
print(model_output)

torch.Size([1, 1])
tensor([[0.5196]], device='cuda:0')


As expected, the combined model outputs a single scalar value (which, because it comes from a sigmoid, we know is between 0 and 1), which is exactly what we want for our grading task.

Finally, let's summarize the steps we took to adapt the pre-trained model, so that they can be wrapped into a single function:

In [None]:
# 1) Load the pre-trained causal LM and keep only its transformer backbone (the `model` submodule).
original_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
backbone = original_model.model
# 2) Build the grading head, sized from the backbone's config rather than a hard-coded 1024.
head = GradingHead(backbone.config.hidden_size, hidden_dim=512).to(device)
# 3) Cast the backbone to float32 (nn.Module.to() works in place) and attach the head.
unquantized_backbone = backbone.to(torch.float32)
model = CombinedModel(unquantized_backbone, head).to(device)

We will implement a function with these steps in the `recruitair.modelling.custom_qwen` module.