<a href="https://colab.research.google.com/github/mohammadp1001/kaggle_payground_prediction_s5e6/blob/main/S5E6_XGBoost_0601.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
kagglehub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

In [3]:
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

playground_series_s5e6_path = kagglehub.competition_download('jigsaw-agile-community-rules')
print('Data source import complete.')

Data source import complete.


In [50]:
from importlib.metadata import version

pkgs = [
    "huggingface_hub",  # to download pretrained weights
    "tokenizers",       # to implement the tokenizer
    "torch",            # to implement the model
]
for p in pkgs:
    print(f"{p} version: {version(p)}")

import json
import os
import torch
import torch.nn as nn

from pathlib import Path
from safetensors.torch import load_file
from huggingface_hub import hf_hub_download, snapshot_download
from sklearn.metrics import roc_auc_score
import os
import numpy as np
import torch
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

huggingface_hub version: 0.33.1
tokenizers version: 0.21.2
torch version: 2.6.0+cu124


Turn off reasoning capabilities for now.

In [5]:
USE_REASONING_MODEL = False
CHOOSE_MODEL = "0.6B"
QWEN3_CONFIG = {
        "vocab_size": 151_936,           # Vocabulary size
        "context_length": 40_960,        # Context length that was used to train the model
        "emb_dim": 1024,                 # Embedding dimension
        "n_heads": 16,                   # Number of attention heads
        "n_layers": 28,                  # Number of layers
        "hidden_dim": 3072,              # Size of the intermediate dimension in FeedForward
        "head_dim": 128,                 # Size of the heads in GQA
        "qk_norm": True,                 # Whether to normalize queries and values in GQA
        "n_kv_groups": 8,                # Key-Value groups for grouped-query attention
        "rope_base": 1_000_000.0,        # The base in RoPE's "theta"
        "dtype": torch.bfloat16,         # Lower-precision dtype to reduce memory usage
    }

In [6]:

class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.fc1 = nn.Linear(cfg["emb_dim"], cfg["hidden_dim"], dtype=cfg["dtype"], bias=False)
        self.fc2 = nn.Linear(cfg["emb_dim"], cfg["hidden_dim"], dtype=cfg["dtype"], bias=False)
        self.fc3 = nn.Linear(cfg["hidden_dim"], cfg["emb_dim"], dtype=cfg["dtype"], bias=False)

    def forward(self, x):
        x_fc1 = self.fc1(x)
        x_fc2 = self.fc2(x)
        x = nn.functional.silu(x_fc1) * x_fc2
        return self.fc3(x)

In [7]:
class RMSNorm(nn.Module):
    def __init__(self, emb_dim, eps=1e-6, bias=False, qwen3_compatible=True):
        super().__init__()
        self.eps = eps
        self.qwen3_compatible = qwen3_compatible
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim)) if bias else None

    def forward(self, x):
        input_dtype = x.dtype

        if self.qwen3_compatible:
            x = x.to(torch.float32)

        variance = x.pow(2).mean(dim=-1, keepdim=True)
        norm_x = x * torch.rsqrt(variance + self.eps)
        norm_x = norm_x * self.scale

        if self.shift is not None:
            norm_x = norm_x + self.shift

        return norm_x.to(input_dtype)

In [8]:
def compute_rope_params(head_dim, theta_base=10_000, context_length=4096, dtype=torch.float32):
    assert head_dim % 2 == 0, "Embedding dimension must be even"

    # Compute the inverse frequencies
    inv_freq = 1.0 / (theta_base ** (torch.arange(0, head_dim, 2, dtype=dtype)[: (head_dim // 2)].float() / head_dim))

    # Generate position indices
    positions = torch.arange(context_length, dtype=dtype)

    # Compute the angles
    angles = positions[:, None] * inv_freq[None, :]  # Shape: (context_length, head_dim // 2)

    # Expand angles to match the head_dim
    angles = torch.cat([angles, angles], dim=1)  # Shape: (context_length, head_dim)

    # Precompute sine and cosine
    cos = torch.cos(angles)
    sin = torch.sin(angles)

    return cos, sin


def apply_rope(x, cos, sin):
    # x: (batch_size, num_heads, seq_len, head_dim)
    batch_size, num_heads, seq_len, head_dim = x.shape
    assert head_dim % 2 == 0, "Head dimension must be even"

    # Split x into first half and second half
    x1 = x[..., : head_dim // 2]  # First half
    x2 = x[..., head_dim // 2 :]  # Second half

    # Adjust sin and cos shapes
    cos = cos[:seq_len, :].unsqueeze(0).unsqueeze(0)  # Shape: (1, 1, seq_len, head_dim)
    sin = sin[:seq_len, :].unsqueeze(0).unsqueeze(0)

    # Apply the rotary transformation
    rotated = torch.cat((-x2, x1), dim=-1)
    x_rotated = (x * cos) + (rotated * sin)

    # It's ok to use lower-precision after applying cos and sin rotation
    return x_rotated.to(dtype=x.dtype)

In [9]:
class GroupedQueryAttention(nn.Module):
    def __init__(
        self, d_in, num_heads, num_kv_groups, head_dim=None, qk_norm=False, dtype=None
    ):
        super().__init__()
        assert num_heads % num_kv_groups == 0, "num_heads must be divisible by num_kv_groups"

        self.num_heads = num_heads
        self.num_kv_groups = num_kv_groups
        self.group_size = num_heads // num_kv_groups

        if head_dim is None:
            assert d_in % num_heads == 0, "`d_in` must be divisible by `num_heads` if `head_dim` is not set"
            head_dim = d_in // num_heads

        self.head_dim = head_dim
        self.d_out = num_heads * head_dim

        self.W_query = nn.Linear(d_in, self.d_out, bias=False, dtype=dtype)
        self.W_key = nn.Linear(d_in, num_kv_groups * head_dim, bias=False, dtype=dtype)
        self.W_value = nn.Linear(d_in, num_kv_groups * head_dim, bias=False, dtype=dtype)

        self.out_proj = nn.Linear(self.d_out, d_in, bias=False, dtype=dtype)

        if qk_norm:
            self.q_norm = RMSNorm(head_dim, eps=1e-6)
            self.k_norm = RMSNorm(head_dim, eps=1e-6)
        else:
            self.q_norm = self.k_norm = None

    def forward(self, x, mask, cos, sin):
        b, num_tokens, _ = x.shape

        # Apply projections
        queries = self.W_query(x)  # (b, num_tokens, num_heads * head_dim)
        keys = self.W_key(x)       # (b, num_tokens, num_kv_groups * head_dim)
        values = self.W_value(x)   # (b, num_tokens, num_kv_groups * head_dim)

        # Reshape
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(b, num_tokens, self.num_kv_groups, self.head_dim).transpose(1, 2)
        values = values.view(b, num_tokens, self.num_kv_groups, self.head_dim).transpose(1, 2)

        # Optional normalization
        if self.q_norm:
            queries = self.q_norm(queries)
        if self.k_norm:
            keys = self.k_norm(keys)

        # Apply RoPE
        queries = apply_rope(queries, cos, sin)
        keys = apply_rope(keys, cos, sin)

        # Expand K and V to match number of heads
        keys = keys.repeat_interleave(self.group_size, dim=1)
        values = values.repeat_interleave(self.group_size, dim=1)

        # Attention
        attn_scores = queries @ keys.transpose(2, 3)
        attn_scores = attn_scores.masked_fill(mask, -torch.inf)
        attn_weights = torch.softmax(attn_scores / self.head_dim**0.5, dim=-1)

        context = (attn_weights @ values).transpose(1, 2).reshape(b, num_tokens, self.d_out)
        return self.out_proj(context)

In [10]:
class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = GroupedQueryAttention(
            d_in=cfg["emb_dim"],
            num_heads=cfg["n_heads"],
            head_dim=cfg["head_dim"],
            num_kv_groups=cfg["n_kv_groups"],
            qk_norm=cfg["qk_norm"],
            dtype=cfg["dtype"]
        )
        self.ff = FeedForward(cfg)
        self.norm1 = RMSNorm(cfg["emb_dim"], eps=1e-6)
        self.norm2 = RMSNorm(cfg["emb_dim"], eps=1e-6)

    def forward(self, x, mask, cos, sin):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x, mask, cos, sin)  # Shape [batch_size, num_tokens, emb_size]
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed-forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = x + shortcut  # Add the original input back

        return x

In [11]:
class Qwen3Model(nn.Module):
    def __init__(self, cfg):
        super().__init__()

        # Main model parameters
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"], dtype=cfg["dtype"])

        self.trf_blocks = nn.ModuleList(  # ModuleList since Sequential can only accept one input, and we need `x, mask, cos, sin`
            [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )

        self.final_norm = RMSNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False, dtype=cfg["dtype"])

        # Reusuable utilities
        if cfg["head_dim"] is None:
            head_dim = cfg["emb_dim"] // cfg["n_heads"]
        else:
            head_dim = cfg["head_dim"]
        cos, sin = compute_rope_params(
            head_dim=head_dim,
            theta_base=cfg["rope_base"],
            context_length=cfg["context_length"]
        )
        self.register_buffer("cos", cos, persistent=False)
        self.register_buffer("sin", sin, persistent=False)
        self.cfg = cfg


    def forward(self, in_idx):
        # Forward pass
        tok_embeds = self.tok_emb(in_idx)
        x = tok_embeds

        num_tokens = x.shape[1]
        mask = torch.triu(torch.ones(num_tokens, num_tokens, device=x.device, dtype=torch.bool), diagonal=1)
        
        for block in self.trf_blocks:
            x = block(x, mask, self.cos, self.sin)
        x = self.final_norm(x)
        logits = self.out_head(x.to(self.cfg["dtype"]))
        return logits

In [12]:
torch.manual_seed(123)
model = Qwen3Model(QWEN3_CONFIG)

In [66]:
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

model.to(device)

Qwen3Model(
  (tok_emb): Embedding(151936, 1024)
  (trf_blocks): ModuleList(
    (0-27): 28 x TransformerBlock(
      (att): GroupedQueryAttention(
        (W_query): Linear(in_features=1024, out_features=2048, bias=False)
        (W_key): Linear(in_features=1024, out_features=1024, bias=False)
        (W_value): Linear(in_features=1024, out_features=1024, bias=False)
        (out_proj): Linear(in_features=2048, out_features=1024, bias=False)
        (q_norm): RMSNorm()
        (k_norm): RMSNorm()
      )
      (ff): FeedForward(
        (fc1): Linear(in_features=1024, out_features=3072, bias=False)
        (fc2): Linear(in_features=1024, out_features=3072, bias=False)
        (fc3): Linear(in_features=3072, out_features=1024, bias=False)
      )
      (norm1): RMSNorm()
      (norm2): RMSNorm()
    )
  )
  (final_norm): RMSNorm()
  (out_head): Linear(in_features=1024, out_features=2, bias=False)
)

In [14]:
def load_weights_into_qwen(model, param_config, params):
    def assign(left, right, tensor_name="unknown"):
        if left.shape != right.shape:
            raise ValueError(f"Shape mismatch in tensor '{tensor_name}'. Left: {left.shape}, Right: {right.shape}")
        return torch.nn.Parameter(right.clone().detach() if isinstance(right, torch.Tensor) else torch.tensor(right))

    model.tok_emb.weight = assign(model.tok_emb.weight, params["model.embed_tokens.weight"], "model.embed_tokens.weight")

    for l in range(param_config["n_layers"]):
        block = model.trf_blocks[l]
        att = block.att

        # Q, K, V projections
        att.W_query.weight = assign(
            att.W_query.weight,
            params[f"model.layers.{l}.self_attn.q_proj.weight"],
            f"model.layers.{l}.self_attn.q_proj.weight"
        )
        att.W_key.weight = assign(
            att.W_key.weight,
            params[f"model.layers.{l}.self_attn.k_proj.weight"],
            f"model.layers.{l}.self_attn.k_proj.weight"
        )
        att.W_value.weight = assign(
            att.W_value.weight,
            params[f"model.layers.{l}.self_attn.v_proj.weight"],
            f"model.layers.{l}.self_attn.v_proj.weight"
        )

        # Output projection
        att.out_proj.weight = assign(
            att.out_proj.weight,
            params[f"model.layers.{l}.self_attn.o_proj.weight"],
            f"model.layers.{l}.self_attn.o_proj.weight"
        )

        # QK norms
        if hasattr(att, "q_norm") and att.q_norm is not None:
            att.q_norm.scale = assign(
                att.q_norm.scale,
                params[f"model.layers.{l}.self_attn.q_norm.weight"],
                f"model.layers.{l}.self_attn.q_norm.weight"
            )
        if hasattr(att, "k_norm") and att.k_norm is not None:
            att.k_norm.scale = assign(
                att.k_norm.scale,
                params[f"model.layers.{l}.self_attn.k_norm.weight"],
                f"model.layers.{l}.self_attn.k_norm.weight"
            )

        # Attention layernorm
        block.norm1.scale = assign(
            block.norm1.scale,
            params[f"model.layers.{l}.input_layernorm.weight"],
            f"model.layers.{l}.input_layernorm.weight"
        )

        # Feedforward weights
        block.ff.fc1.weight = assign(
            block.ff.fc1.weight,
            params[f"model.layers.{l}.mlp.gate_proj.weight"],
            f"model.layers.{l}.mlp.gate_proj.weight"
        )
        block.ff.fc2.weight = assign(
            block.ff.fc2.weight,
            params[f"model.layers.{l}.mlp.up_proj.weight"],
            f"model.layers.{l}.mlp.up_proj.weight"
        )
        block.ff.fc3.weight = assign(
            block.ff.fc3.weight,
            params[f"model.layers.{l}.mlp.down_proj.weight"],
            f"model.layers.{l}.mlp.down_proj.weight"
        )
        block.norm2.scale = assign(
            block.norm2.scale,
            params[f"model.layers.{l}.post_attention_layernorm.weight"],
            f"model.layers.{l}.post_attention_layernorm.weight"
        )

    # Final normalization and output head
    model.final_norm.scale = assign(model.final_norm.scale, params["model.norm.weight"], "model.norm.weight")

    if "lm_head.weight" in params:
        model.out_head.weight = assign(model.out_head.weight, params["lm_head.weight"], "lm_head.weight")
    else:
        # Model uses weight tying, hence we reuse the embedding layer weights here
        print("Model uses weight tying.")
        model.out_head.weight = assign(model.out_head.weight, params["model.embed_tokens.weight"], "model.embed_tokens.weight")

In [15]:
if USE_REASONING_MODEL:
    repo_id = f"Qwen/Qwen3-{CHOOSE_MODEL}"
else:
    repo_id = f"Qwen/Qwen3-{CHOOSE_MODEL}-Base"

local_dir = Path(repo_id).parts[-1]

if CHOOSE_MODEL == "0.6B":
    weights_file = hf_hub_download(
        repo_id=repo_id,
        filename="model.safetensors",
        local_dir=local_dir,
    )
    weights_dict = load_file(weights_file)
else:
    repo_dir = snapshot_download(repo_id=repo_id, local_dir=local_dir)
    index_path = os.path.join(repo_dir, "model.safetensors.index.json")
    with open(index_path, "r") as f:
        index = json.load(f)

    weights_dict = {}
    for filename in set(index["weight_map"].values()):
        shard_path = os.path.join(repo_dir, filename)
        shard = load_file(shard_path)
        weights_dict.update(shard)

load_weights_into_qwen(model, QWEN3_CONFIG, weights_dict)
model.to(device);

model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

Model uses weight tying.


In [16]:
def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):
    # For-loop is the same as before: Get logits, and only focus on last time step
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]

        # Filter logits with top_k sampling
        if top_k is not None:
            # Keep only top_k values
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(logits < min_val, torch.tensor(-torch.inf).to(logits.device), logits)

        # Apply temperature scaling
        if temperature > 0.0:
            logits = logits / temperature

            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)

            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)

        # Otherwise same as before: get idx of the vocab entry with the highest logits value
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)

        if eos_id is not None and idx_next.item() == eos_id:
            break  # Stop generating early if end-of-sequence token is encountered and eos_id is specified

        # Same as before: append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)

    return idx

In [17]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen-7B', trust_remote_code=True, pad_token='<|endoftext|>')

tokenizer_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

tokenization_qwen.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Qwen/Qwen-7B:
- tokenization_qwen.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


qwen.tiktoken: 0.00B [00:00, ?B/s]

In [18]:
prompt = "Give me a short introduction to large language models."

input_token_ids = tokenizer.encode(prompt)
text = tokenizer.decode(input_token_ids)
text

'Give me a short introduction to large language models.'

In [19]:
import time
torch.manual_seed(123)

start = time.time()

output_token_ids = generate(
    model=model,
    idx=torch.tensor(input_token_ids, device=device).unsqueeze(0),
    max_new_tokens=150,
    context_size=QWEN3_CONFIG["context_length"],
    top_k=1,
    temperature=0.
)

print(f"Time: {time.time() - start:.2f} sec")

if torch.cuda.is_available():
    max_mem_bytes = torch.cuda.max_memory_allocated()
    max_mem_gb = max_mem_bytes / (1024 ** 3)
    print(f"Max memory allocated: {max_mem_gb:.2f} GB")

output_text = tokenizer.decode(output_token_ids.squeeze(0).tolist())

print(output_text + "...")

Time: 11.59 sec
Max memory allocated: 1.49 GB
Give me a short introduction to large language models. Large language models are advanced artificial intelligence systems that use deep learning techniques to understand and generate human-like text. They are trained on vast amounts of text data, allowing them to perform a wide range of tasks, from answering questions to writing essays, composing poetry, and even creating music. These models have revolutionized the way we interact with technology, enabling machines to simulate human language and perform complex tasks with remarkable accuracy.<|endoftext|>Human language is a complex and dynamic system that has evolved over thousands of years. It is characterized by its ability to convey meaning, express emotions, and communicate ideas across different cultures and languages. Despite its complexity, human language is also highly adaptable and can be used to solve problems, make decisions, and engage in social interactions. Large language...


We see that model is able to generate coherent text. Next we will modify the last layer to preparte it for classification task.

In [20]:
model

Qwen3Model(
  (tok_emb): Embedding(151936, 1024)
  (trf_blocks): ModuleList(
    (0-27): 28 x TransformerBlock(
      (att): GroupedQueryAttention(
        (W_query): Linear(in_features=1024, out_features=2048, bias=False)
        (W_key): Linear(in_features=1024, out_features=1024, bias=False)
        (W_value): Linear(in_features=1024, out_features=1024, bias=False)
        (out_proj): Linear(in_features=2048, out_features=1024, bias=False)
        (q_norm): RMSNorm()
        (k_norm): RMSNorm()
      )
      (ff): FeedForward(
        (fc1): Linear(in_features=1024, out_features=3072, bias=False)
        (fc2): Linear(in_features=1024, out_features=3072, bias=False)
        (fc3): Linear(in_features=3072, out_features=1024, bias=False)
      )
      (norm1): RMSNorm()
      (norm2): RMSNorm()
    )
  )
  (final_norm): RMSNorm()
  (out_head): Linear(in_features=1024, out_features=151936, bias=False)
)

In [21]:
for param in model.parameters():
    param.requires_grad = False

model.out_head = nn.Linear(QWEN3_CONFIG["emb_dim"], 2, bias=False, dtype=QWEN3_CONFIG["dtype"]).to(device)

Let's also add final RMSNorm and transformer layer to training

In [22]:
for param in model.trf_blocks[-1].parameters():
    param.requires_grad = False
for param in model.final_norm.parameters():
    param.requires_grad = False    

In [None]:
def calc_auc_metric(input_batch, target_batch,device):
    """Calculates the ROC-AUC metric for a batch of inputs and targets.
    
    :param input_batch: Tensor of shape (batch_size, sequence_length) containing input token IDs.
    :param target_batch: Tensor of shape (batch_size,) containing binary target labels (0 or 1).
    :param device: The device (CPU or GPU) to perform computations on"""
    model.to(device)
    model.eval()
    input_batch = input_batch.to(device)
    target_batch = target_batch.to(device)
    amp_ctx = torch.amp.autocast(device_type='cuda',dtype=QWEN3_CONFIG["dtype"])
    
    with torch.inference_mode(), amp_ctx:
        logits = model(input_batch)[:,-1,:]
        
    probs = torch.softmax(logits, dim=-1)[:,1].detach().cpu().numpy() # Probability of positive class
    target_batch = target_batch.detach().cpu().numpy()
    return roc_auc_score(target_batch, probs)

In [None]:
def cal_auc_loader(model, data_loader, device):
    """Calculates the ROC-AUC metric for a DataLoader.
    
    :param model: The model to evaluate.
    :param data_loader: DataLoader providing batches of (input_batch, target_batch).
    :param device: The device (CPU or GPU) to perform computations on."""
    model.eval()
    all_probs = []
    all_targets = []
    amp_ctx = torch.amp.autocast(device_type='cuda',dtype=QWEN3_CONFIG["dtype"])

    with torch.inference_mode(), amp_ctx:
        for input_batch, target_batch in data_loader:
            input_batch = input_batch.to(device)
            target_batch = target_batch.to(device)
            logits = model(input_batch)[:,-1,:]
            probs = torch.softmax(logits, dim=-1)[:,1].detach().cpu().numpy() # Probability of positive class
            all_probs.extend(probs)
            all_targets.extend(target_batch.detach().cpu().numpy())
    return roc_auc_score(all_targets, all_probs)

In [76]:
def calc_loss_batch(input_batch, target_batch, model):
    """Returns SUM of cross-entropy losses over the batch (for proper global averaging).
    
    :param input_batch: Tensor of shape (batch_size, sequence_length) containing input token IDs.
    :param target_batch: Tensor of shape (batch_size,) containing target class indices.
    :param model: The model to evaluate.
    :param device: The device (CPU or GPU) to perform computations on."""
    model.eval()
    input_batch = input_batch.to(model.out_head.weight.device)
    target_batch = target_batch.to(model.out_head.weight.device)
    logits = model(input_batch)
    logits = logits[:, -1, :]  
    loss_sum = F.cross_entropy(logits, target_batch, reduction="sum")
    return loss_sum

In [None]:
def calc_loss_loader(model, data_loader, device):
    """Calculates the average cross-entropy loss for a DataLoader.
    :param model: The model to evaluate.
    :param data_loader: DataLoader providing batches of (input_batch, target_batch).
    :param device: The device (CPU or GPU) to perform computations on."""
    model.to(device)
    model.eval()

    total_loss = 0.0
    total_samples = 0

    amp_ctx = torch.amp.autocast(device_type='cuda',dtype=QWEN3_CONFIG["dtype"])

    with torch.inference_mode(), amp_ctx:
        for input_batch, target_batch in data_loader:
            input_batch = input_batch.to(device, non_blocking=True)
            target_batch = target_batch.to(device, non_blocking=True)

            batch_loss_sum = calc_loss_batch(input_batch, target_batch, model)
            total_loss += float(batch_loss_sum)
            total_samples += target_batch.size(0)


In [81]:
def train_epoch(model, train_loader, optimizer, device):
    """Trains the model for one epoch.
    
     :param model: The model to train.
     :param train_loader: DataLoader providing batches of (input_batch, target_batch).
     :param optimizer: The optimizer to use for training.
     :param device: The device (CPU or GPU) to perform computations on."""
    total_loss = 0.0
    for input_batch, target_batch in train_loader:
        model.train()
        optimizer.zero_grad()
        loss = calc_loss_batch(input_batch, target_batch, model)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * input_batch.size(0)  # Accumulate loss scaled by batch size
    avg_loss = total_loss / len(train_loader.dataset)
    print(f"Training Loss: {avg_loss:.4f}")
    return avg_loss

In [79]:
def train_regulator(model,train_loader,val_loader, optimizer, device, num_epochs):
    """Trains the model with periodic evaluation.
    
     :param model: The model to train.
     :param train_loader: DataLoader providing batches of (input_batch, target_batch).
     :param val_loader: DataLoader for validation data.
     :param optimizer: The optimizer to use for training.
     :param device: The device (CPU or GPU) to perform computations on.
     :param num_epochs: Number of epochs to train."""
    train_losses = []
    val_losses = []
    step = 0
    for epoch in range(num_epochs):
        train_loss = train_epoch(model, train_loader, optimizer, device)
        step += 1
        print(f"Completed epoch {epoch+1}/{num_epochs}")
        # Evaluate every 10 steps
        if step % 10 == 0:
            val_auc = cal_auc_loader(model, val_loader, device)
            val_loss = calc_loss_loader(model, val_loader, device)
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val AUC: {val_auc:.4f}")

In [29]:
train = pd.read_csv("/kaggle/input/jigsaw-agile-community-rules/train.csv")
test = pd.read_csv("/kaggle/input/jigsaw-agile-community-rules/test.csv")

In [None]:
class CommentDataset(Dataset):
    """Custom Dataset for loading comments and their labels."""
    def __init__(self, tokensizer, df, max_length=1024):
        """Initialize the dataset with texts and labels.

        :param tokenizer: Tokenizer to convert text to tokens.
        :param df: DataFrame containing the data.
        :param max_length: Maximum length for tokenization. Default is 1024.
        """
        self.tokenizer = tokensizer
        self.max_length = max_length 
        self.texts = [f"Predict if this subredit with following Body: {row.body} would violates this Rule: {row.rule} I am providing you two Positive Examples: {row.positive_example_1} {row.positive_example_2} and two Negative Examples: {row.negative_example_1}, {row.negative_example_2} You can learn from these examples and predict 1 for yes and 0 for No." 
                        for row in df.itertuples(index=False)]
        
        # Batch encode once during initialization
        self.encodings = self.tokenizer(
            self.texts,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        self.labels = torch.tensor(df['rule_violation'].values, dtype=torch.long)
        
    def __len__(self):
        return len(self.encodings['input_ids'])
    
    def __getitem__(self, idx):
        return (
            self.encodings['input_ids'][idx],
            self.labels[idx] 
            )


In [83]:
train_dataset = CommentDataset(tokenizer, train.iloc[50:,:])
val_dataset = CommentDataset(tokenizer, train.iloc[:50,:])

batch_size = 8
train_loader = DataLoader(
        train_dataset, 
        batch_size=batch_size, 
        shuffle=True, 
        drop_last=True,
        num_workers=4,          
        pin_memory=True,        
        prefetch_factor=2,
        persistent_workers=True)

val_loader = DataLoader(
        val_dataset, 
        batch_size=batch_size, 
        shuffle=False, 
        num_workers=4,          
        pin_memory=True,        
        prefetch_factor=2,
        persistent_workers=True)

In [60]:
train.shape

(2029, 9)

In [None]:
len(train_loader), len(val_loader)

(247, 7)

In [None]:
for input_ids, labels in train_loader:
    print("Input IDs shape:", input_ids.shape)
    print("Labels shape:", labels.shape)
    calc_loss_loader(model, val_loader, device)
    break

Input IDs shape: torch.Size([8, 1024])
Labels shape: torch.Size([8])


In [70]:
device

device(type='cuda')

In [None]:
optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4)
train_regulator(model,train_loader,val_loader, optimizer, device, 3)


Training Loss: 5.6903
Completed epoch 1/3
Training Loss: 5.5928
Completed epoch 2/3
