
### References

*   [https://www.kaggle.com/code/abdmental01/jigsaw-mpnet-base-v2-inference-cv-0-876](https://www.kaggle.com/code/abdmental01/jigsaw-mpnet-base-v2-inference-cv-0-876)
*   [https://www.kaggle.com/code/aerdem4/jigsaw-acrc-qwen7b-finetune-logits-processor-zoo](https://www.kaggle.com/code/aerdem4/jigsaw-acrc-qwen7b-finetune-logits-processor-zoo)
*   [https://www.guruguru.science/competitions/24/discussions/21027ff1-2074-4e21-a249-b2d4170bd516/](https://www.guruguru.science/competitions/24/discussions/21027ff1-2074-4e21-a249-b2d4170bd516/)
*   https://www.kaggle.com/code/mks2192/jigsaw-llama3-1-8b-instruct-training-one-epoch
*   [https://www.kaggle.com/code/fuumin621/qwen2-5-lora-finetune-baseline-inference](https://www.kaggle.com/code/fuumin621/qwen2-5-lora-finetune-baseline-inference)
*   https://www.kaggle.com/code/neibyr/30-min-just-use-semantic-search-qwen3-emb-0-6b

### Thanks to @neibyr for the interesting idea: [Retrieve by Qwen3Embedding](https://www.kaggle.com/code/neibyr/30-min-just-use-semantic-search-qwen3-emb-0-6b)

This version changes the learning rate used for test-time training of Qwen 2.5 0.5B.

In [None]:
!uv pip install --system --no-index --find-links='/kaggle/input/jigsaw-packages2/whls/' 'trl==0.21.0' 'optimum==1.27.0' 'auto-gptq==0.7.1' 'bitsandbytes==0.46.1' 'deepspeed==0.17.4' 'logits-processor-zoo==0.2.1' 'vllm==0.10.0'
!uv pip install --system --no-index --find-links='/kaggle/input/jigsaw-packages2/whls/' 'triton==3.2.0'
!uv pip install --system --no-index --find-links='/kaggle/input/jigsaw-packages2/whls/' 'clean-text'
!uv pip install --system --no-index -U --no-deps --find-links='/kaggle/input/jigsaw-packages2/whls/' 'peft' 'accelerate' 'datasets'

# 1. Test time train Qwen 2.5 0.5b

In [None]:
%%writefile constants.py
# Base checkpoint that is fine-tuned at test time (path names a Qwen2.5 0.5B
# instruct GPTQ int4 model).
BASE_MODEL_PATH = "/kaggle/input/qwen2.5/transformers/0.5b-instruct-gptq-int4/1"
# Directory the LoRA adapter is saved to by train.py and loaded from by inference.py.
LORA_PATH = "output/"
# Competition data directory; train.csv and test.csv are read from here.
DATA_PATH = "/kaggle/input/jigsaw-agile-community-rules/"

# Answer words: used as training completions and as the two choices scored at inference.
POSITIVE_ANSWER = "Yes"
NEGATIVE_ANSWER = "No"
# Marker that precedes every answer; each prompt ends with it.
COMPLETE_PHRASE = "Answer:"
# Task instruction placed at the top of every prompt.
BASE_PROMPT = '''You are given a comment from reddit and a rule. Your task is to classify whether the comment violates the rule. Only respond Yes/No.'''

In [None]:
%%writefile rule_knowledge.py

# Short canonical tag added to the prompt for each known rule family.
RULE_CANON_SHORT = {
    "public rule 0": "[ADVERTISING]",
    "public rule 1": "[LEGAL ADVICE]",
    "private rule 0": "[FINANCIAL ADVICE]",
    "private rule 1": "[MEDICAL ADVICE]",
    "private rule 2": "[ILLEGAL ACTIVITY]",
    "private rule 3": "[SPOILERS]",
}

# (regex, RULE_CANON_SHORT key) pairs tried in order against the lower-cased
# rule text; the first pattern that matches decides the tag.
KEYWORD_FALLBACK = [
    ("advertis|referral|promo|spam", "public rule 0"),
    # The lookbehind stops "illegal" from matching here. Without it this entry
    # shadowed the "illegal" entry further down and illegal-activity rules
    # were tagged as legal advice.
    (r"(?<!il)legal", "public rule 1"),
    ("financ|invest|tax|career", "private rule 0"),
    ("medical|diagnos|treat", "private rule 1"),
    ("illegal|drug|violence|exploit|theft|crime", "private rule 2"),
    ("spoiler", "private rule 3"),
]

import re

def canonicalize_rule(rule_str: str):
    """Map a free-text rule to its short canonical tag.

    The rule text is stripped and lower-cased, then checked first for a
    RULE_CANON_SHORT key appearing verbatim, and otherwise against the
    KEYWORD_FALLBACK patterns in list order.

    Returns:
        The tag string of the first hit, or None when nothing matches
        (including when ``rule_str`` is empty or None).
    """
    normalized = (rule_str or "").strip().lower()

    # Direct hit: a canonical key occurs inside the rule text.
    direct_tag = next(
        (tag for name, tag in RULE_CANON_SHORT.items() if name in normalized),
        None,
    )
    if direct_tag is not None:
        return direct_tag

    # Keyword fallback: first pattern that matches wins.
    fallback_key = next(
        (key for pattern, key in KEYWORD_FALLBACK if re.search(pattern, normalized)),
        None,
    )
    if fallback_key is None:
        return None
    return RULE_CANON_SHORT[fallback_key]

In [None]:
%%writefile text_cleaning.py
import regex as re  # pip install regex が必要

# Emoji matcher: pictographic code points plus the characters that join or
# modify them inside emoji sequences.
# \p{Emoji} is avoided on purpose: that Unicode property also contains the
# ASCII digits, '#' and '*', so it deleted every number from the text.
_EMOJI_RE = re.compile(
    r"[\p{Extended_Pictographic}"
    "\U0001F3FB-\U0001F3FF"  # skin-tone modifiers
    "\U0001F1E6-\U0001F1FF"  # regional indicators (flag letters)
    "\u200D\uFE0F\u20E3"     # zero-width joiner, variation selector-16, keycap mark
    r"]+"
)

# Markdown decoration patterns. Inline patterns capture the decorated content
# in their single group; block-level markers are anchored to the start of a
# line (the pattern is compiled with MULTILINE) so that an inline '>' or '#',
# such as the one in a "<URL> " placeholder, is left alone.
_MD_PATTERNS = [
    r"\*\*(.*?)\*\*",   # **bold**
    r"\*(.*?)\*",       # *italic/bold*
    r"__(.*?)__",       # __italic__
    r"_(.*?)_",         # _italic_
    r"`(.*?)`",         # `inline code`
    r"^#+\s+",          # # Heading
    r"^>+\s+",          # > quote
    r"-{3,}",           # --- hr
]

_MD_RE = re.compile("|".join(_MD_PATTERNS), flags=re.MULTILINE)


def strip_emojis_kaomoji(text: str) -> str:
    """Remove emoji and Markdown decoration from ``text`` and tidy whitespace.

    Despite the name, no separate kaomoji handling is performed here.

    Args:
        text: Input text. Falsy values (None, "") are returned unchanged.

    Returns:
        The text with emoji removed, Markdown markers removed while the
        decorated content is kept, runs of horizontal whitespace collapsed to
        one space, three or more newlines collapsed to two, and the ends
        stripped.
    """
    if not text:
        return text
    s = str(text)

    # Drop emoji.
    s = _EMOJI_RE.sub("", s)

    # Drop Markdown markers but keep their content. Every alternative has at
    # most one capture group, so ``lastindex`` identifies the group belonging
    # to the alternative that matched; marker-only alternatives have none.
    s = _MD_RE.sub(lambda m: m.group(m.lastindex) if m.lastindex else "", s)

    # Normalise spaces and blank lines.
    s = re.sub(r"[ \t\r\f\v]+", " ", s)
    s = re.sub(r"\n{3,}", "\n\n", s).strip()
    return s


if __name__ == "__main__":
    # Manual smoke test: print a decorated sample before and after cleaning.
    demo_text = """
# **Huge SALE!!!**
Get *FREE* stuff 👉👉 https://spam.com
> Only today!!!
😊🔥🚀
"""
    print("Before:", demo_text)
    cleaned_text = strip_emojis_kaomoji(demo_text)
    print("After :", cleaned_text)

In [None]:
%%writefile utils.py
import pandas as pd
from datasets import Dataset
from constants import POSITIVE_ANSWER, NEGATIVE_ANSWER, COMPLETE_PHRASE, BASE_PROMPT
from rule_knowledge import canonicalize_rule
from text_cleaning import strip_emojis_kaomoji as _strip
import random, numpy as np
random.seed(42); np.random.seed(42)

def _sz(x):  # sanitize helper
    """Return the cleaned string form of ``x``; missing values are cleaned as ""."""
    if pd.isna(x):
        return _strip("")
    return _strip(str(x))

def build_prompt(row):
    """Build the few-shot classification prompt for one row.

    Args:
        row: Mapping with the keys "rule", "body", "subreddit",
            "positive_example" and "negative_example". Every field is passed
            through ``_sz`` before being placed in the prompt.

    Returns:
        The prompt text. It ends with COMPLETE_PHRASE so that the next token
        the model produces is the answer.
    """
    rule_raw = _sz(row["rule"])
    body = _sz(row["body"])
    subreddit = _sz(row["subreddit"])
    pos_ex = _sz(row["positive_example"])
    neg_ex = _sz(row["negative_example"])

    canon = canonicalize_rule(rule_raw)
    rule_block = f"Rule: {rule_raw}\n"
    if canon:
        rule_block += f"Canonical Definition: {canon}\n"

    # The few-shot answers use the same constants as the training completions
    # and the inference choices, so the three can never drift apart.
    return f"""
{BASE_PROMPT}

Subreddit: r/{subreddit}
{rule_block}
Examples:
1) {pos_ex}
{COMPLETE_PHRASE} {POSITIVE_ANSWER}

2) {neg_ex}
{COMPLETE_PHRASE} {NEGATIVE_ANSWER}

---
Comment: {body}
{COMPLETE_PHRASE}"""

_EXAMPLE_COLUMNS = [
    "positive_example_1", "positive_example_2",
    "negative_example_1", "negative_example_2",
]


def _pick_random_example(frame, first_col, second_col):
    """Choose, independently for each row, one of two columns with equal probability."""
    return np.where(np.random.rand(len(frame)) < 0.5, frame[first_col], frame[second_col])


def _clean_text_columns(frame):
    """Sanitize the prompt text columns of ``frame`` in place."""
    for col in ["body", "rule", "subreddit", "positive_example", "negative_example"]:
        # No astype(str) first: that would turn NaN into the literal "nan"
        # before _sz could map it to an empty string.
        frame[col] = frame[col].map(_sz)


def get_dataframe_to_train(data_path):
    """Assemble the training frame from train.csv plus examples taken from test.csv.

    The labelled training rows are kept with one randomly chosen positive and
    negative example each. In addition, for a fixed 50% sample of test.csv,
    every positive/negative example is turned into its own labelled row
    (label 1 / 0), using the sibling example of the same kind and a random
    example of the other kind as few-shot context.

    Args:
        data_path: Directory containing train.csv and test.csv.

    Returns:
        DataFrame with columns body, rule, subreddit, rule_violation,
        positive_example and negative_example, de-duplicated.
    """
    train_dataset = pd.read_csv(f"{data_path}/train.csv")
    test_dataset = pd.read_csv(f"{data_path}/test.csv").sample(frac=0.5, random_state=42).reset_index(drop=True)

    train_df = train_dataset[["body", "rule", "subreddit", "rule_violation", *_EXAMPLE_COLUMNS]].copy()
    train_df["positive_example"] = _pick_random_example(train_df, "positive_example_1", "positive_example_2")
    train_df["negative_example"] = _pick_random_example(train_df, "negative_example_1", "negative_example_2")
    train_df = train_df.drop(columns=_EXAMPLE_COLUMNS)

    # NOTE(review): build_prompt sanitizes these fields again, so training text
    # passes through the cleaner twice — confirm that is intended.
    _clean_text_columns(train_df)
    flatten = [train_df]

    # Expand the sampled test rows: each provided example becomes a labelled row.
    for violation_type, label in (("positive", 1), ("negative", 0)):
        other_type = "negative" if violation_type == "positive" else "positive"
        for i in (1, 2):
            sub = test_dataset[["rule", "subreddit", *_EXAMPLE_COLUMNS]].copy()
            sub["body"] = sub[f"{violation_type}_example_{i}"]
            # Same-kind context is the sibling example (index 3 - i), so the
            # body never appears as its own few-shot example.
            sub[f"{violation_type}_example"] = sub[f"{violation_type}_example_{3 - i}"]
            sub[f"{other_type}_example"] = _pick_random_example(
                sub, f"{other_type}_example_1", f"{other_type}_example_2"
            )
            sub["rule_violation"] = label
            sub = sub.drop(columns=_EXAMPLE_COLUMNS)

            _clean_text_columns(sub)
            flatten.append(sub)

    return pd.concat(flatten, axis=0).drop_duplicates(ignore_index=True)

def build_dataset(dataframe):
    """Turn a frame of rows into a ``datasets.Dataset`` of prompts.

    A "completion" column (POSITIVE_ANSWER / NEGATIVE_ANSWER) is included only
    when the input has a "rule_violation" column. The resulting dataset is
    also dumped to /kaggle/working/dataset.csv.
    """
    frame = dataframe.copy()
    frame["prompt"] = frame.apply(build_prompt, axis=1)

    has_labels = "rule_violation" in frame
    if has_labels:
        answer_for_label = {1: POSITIVE_ANSWER, 0: NEGATIVE_ANSWER}
        frame["completion"] = frame["rule_violation"].map(answer_for_label)
    selected = ["prompt", "completion"] if has_labels else ["prompt"]

    dataset = Dataset.from_pandas(frame[selected])
    # Written out for inspection of the exact prompts.
    dataset.to_pandas().to_csv("/kaggle/working/dataset.csv", index=False)
    return dataset

In [None]:
%%writefile train.py
import pandas as pd

from trl import SFTTrainer, SFTConfig
from peft import LoraConfig
from tqdm.auto import tqdm
from transformers.utils import is_torch_bf16_gpu_available
from utils import build_dataset, get_dataframe_to_train
from constants import DATA_PATH, BASE_MODEL_PATH, LORA_PATH


def main():
    """Fine-tune a LoRA adapter on BASE_MODEL_PATH and save it to LORA_PATH."""
    train_dataset = build_dataset(get_dataframe_to_train(DATA_PATH))

    lora_config = LoraConfig(
        task_type="CAUSAL_LM",
        r=16,
        lora_alpha=32,
        lora_dropout=0.1,
        bias="none",
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
    )

    # bf16 when the GPU supports it, fp16 otherwise.
    use_bf16 = is_torch_bf16_gpu_available()

    sft_options = {
        # schedule
        "num_train_epochs": 1,
        "per_device_train_batch_size": 4,
        "gradient_accumulation_steps": 4,
        # optimizer
        "optim": "paged_adamw_8bit",
        "learning_rate": 1e-4,  # keep high, lora usually likes high.
        "weight_decay": 0.01,
        "max_grad_norm": 1.0,
        "lr_scheduler_type": "cosine",
        "warmup_ratio": 0.03,
        # precision / memory
        "bf16": use_bf16,
        "fp16": not use_bf16,
        "dataloader_pin_memory": True,
        "gradient_checkpointing": True,
        "gradient_checkpointing_kwargs": {"use_reentrant": False},
        # no checkpoints or external logging
        "save_strategy": "no",
        "report_to": "none",
        # loss on the completion only, one example per sequence
        "completion_only_loss": True,
        "packing": False,
        "remove_unused_columns": False,
    }
    training_args = SFTConfig(**sft_options)

    trainer = SFTTrainer(
        BASE_MODEL_PATH,
        args=training_args,
        train_dataset=train_dataset,
        peft_config=lora_config,
    )
    trainer.train()
    trainer.save_model(LORA_PATH)

if __name__ == "__main__":
    main()

In [None]:
%%writefile inference.py
import os
os.environ["VLLM_USE_V1"] = "0"

import vllm
import torch
import pandas as pd
from logits_processor_zoo.vllm import MultipleChoiceLogitsProcessor
from vllm.lora.request import LoRARequest
from utils import build_dataset
from constants import BASE_MODEL_PATH, LORA_PATH, DATA_PATH, POSITIVE_ANSWER, NEGATIVE_ANSWER
import random
import multiprocessing as mp


def run_inference_on_device(df_slice):
    """Score ``df_slice`` with vLLM and the LoRA adapter on the GPU visible to this process.

    Args:
        df_slice: Test rows; must contain "row_id" and the columns
            ``build_dataset`` / ``build_prompt`` read.

    Returns:
        DataFrame with the log-probabilities of POSITIVE_ANSWER and
        NEGATIVE_ANSWER for the single generated token, plus "row_id".
    """
    llm = vllm.LLM(
        BASE_MODEL_PATH,
        quantization="gptq",
        tensor_parallel_size=1,
        gpu_memory_utilization=0.98,
        trust_remote_code=True,
        dtype="half",
        enforce_eager=True,
        max_model_len=2836,
        disable_log_stats=True,
        enable_prefix_caching=True,
        enable_lora=True,
        max_lora_rank=64,
    )

    tokenizer = llm.get_tokenizer()
    mclp = MultipleChoiceLogitsProcessor(tokenizer, choices=[POSITIVE_ANSWER, NEGATIVE_ANSWER])

    test_dataset = build_dataset(df_slice)
    texts = test_dataset["prompt"]

    outputs = llm.generate(
        texts,
        vllm.SamplingParams(
            skip_special_tokens=True,
            max_tokens=1,
            logits_processors=[mclp],
            logprobs=2,
        ),
        use_tqdm=True,
        lora_request=LoRARequest("default", 1, LORA_PATH)
    )

    rows = []
    for out in outputs:
        lp_map = {lp.decoded_token: lp.logprob for lp in out.outputs[0].logprobs[0].values()}
        # An answer missing from the returned top-2 gets a very low score
        # instead of NaN or a KeyError (same convention as infer_qwen.py).
        rows.append({
            POSITIVE_ANSWER: lp_map.get(POSITIVE_ANSWER, -1e9),
            NEGATIVE_ANSWER: lp_map.get(NEGATIVE_ANSWER, -1e9),
        })

    # Explicit columns keep the frame well-formed even for an empty slice.
    predictions = pd.DataFrame(rows, columns=[POSITIVE_ANSWER, NEGATIVE_ANSWER])
    predictions["row_id"] = df_slice["row_id"].values
    return predictions


def worker(device_id, df_slice, return_dict):
    """Process entry point: run inference for ``df_slice`` on GPU ``device_id``.

    The predictions are stored in ``return_dict`` under ``device_id``.
    """
    # Restrict this process to a single GPU.
    visible_gpu = str(device_id)
    os.environ["CUDA_VISIBLE_DEVICES"] = visible_gpu
    print(f"[Worker {device_id}] Running on GPU {device_id}, data size={len(df_slice)}")

    return_dict[device_id] = run_inference_on_device(df_slice)


def main():
    """Run two-GPU inference on test.csv and write submission_qwen.csv.

    The test set is split in half, each half is scored in its own process on
    its own GPU, and the POSITIVE_ANSWER log-probabilities are converted to
    normalised ranks in (0, 1).

    Raises:
        RuntimeError: If a worker process fails or returns no predictions.
    """
    test_dataframe = pd.read_csv(f"{DATA_PATH}/test.csv")

    # Pick one positive and one negative example per row at random.
    test_dataframe["positive_example"] = test_dataframe.apply(
        lambda row: random.choice([row["positive_example_1"], row["positive_example_2"]]),
        axis=1
    )
    test_dataframe["negative_example"] = test_dataframe.apply(
        lambda row: random.choice([row["negative_example_1"], row["negative_example_2"]]),
        axis=1
    )
    test_dataframe = test_dataframe.drop(
        columns=["positive_example_1", "positive_example_2", "negative_example_1", "negative_example_2"],
        errors="ignore"
    )

    # Split the data in two, one part per GPU.
    mid = len(test_dataframe) // 2
    slices = {
        0: test_dataframe.iloc[:mid].reset_index(drop=True),
        1: test_dataframe.iloc[mid:].reset_index(drop=True),
    }

    manager = mp.Manager()
    return_dict = manager.dict()

    # Run both workers in parallel.
    processes = {
        device_id: mp.Process(target=worker, args=(device_id, part, return_dict))
        for device_id, part in slices.items()
    }
    for proc in processes.values():
        proc.start()
    for proc in processes.values():
        proc.join()

    # A crashed worker leaves no entry in return_dict; report that directly
    # instead of failing later with a bare KeyError.
    failed = [
        device_id for device_id, proc in processes.items()
        if proc.exitcode != 0 or device_id not in return_dict
    ]
    if failed:
        raise RuntimeError(
            f"Inference worker(s) for GPU(s) {failed} did not return predictions; "
            "see the worker output above."
        )

    # Merge the results in the original row order.
    predictions = pd.concat([return_dict[device_id] for device_id in processes], ignore_index=True)

    # Build the submission: rank-normalise the positive log-probabilities.
    submission = predictions[["row_id", POSITIVE_ANSWER]].rename(columns={POSITIVE_ANSWER: "rule_violation"})
    rq = submission['rule_violation'].rank(method='average') / (len(submission) + 1)
    submission['rule_violation'] = rq

    submission.to_csv("submission_qwen.csv", index=False)
    print("✅ Saved submission_qwen.csv")


if __name__ == "__main__":
    main()



In [None]:
%%writefile accelerate_config.yaml
# Accelerate launch configuration used for train.py:
# one machine, 2 processes, DeepSpeed, fp16 mixed precision.
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  # Same values as gradient_accumulation_steps / max_grad_norm /
  # per_device_train_batch_size in train.py.
  gradient_accumulation_steps: 4
  gradient_clipping: 1.0
  # NOTE(review): 4 (micro batch) x 4 (accumulation) x 2 (num_processes) = 32,
  # not 64 — confirm whether this key is actually honoured.
  train_batch_size: 64
  train_micro_batch_size_per_gpu: 4
  
  # ZeRO stage 2 without CPU/NVMe offload.
  zero_stage: 2
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: false
  
  # NOTE(review): stage3_* keys presumably have no effect with zero_stage 2 — TODO confirm.
  stage3_gather_16bit_weights_on_model_save: false
  stage3_max_live_parameters: 1e8
  stage3_max_reuse_distance: 1e8
  stage3_prefetch_bucket_size: 5e7
  stage3_param_persistence_threshold: 1e5
  
  zero_allow_untested_optimizer: true
  zero_force_ds_cpu_optimizer: false
  
  fp16:
    enabled: true
    loss_scale: 0
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    min_loss_scale: 1
  
distributed_type: DEEPSPEED
downcast_bf16: 'no'
dynamo_config:
  dynamo_backend: INDUCTOR
  dynamo_use_fullgraph: false
  dynamo_use_dynamic: false
enable_cpu_affinity: false
machine_rank: 0
main_training_function: main
mixed_precision: fp16
num_machines: 1
# One process per GPU.
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

In [None]:
!accelerate launch --config_file accelerate_config.yaml train.py

In [None]:
!python inference.py

In [None]:
!head submission_qwen.csv

# 2. Qwen2.5 14B GPTQ Int4 Inference

In [None]:
# ! mkdir -p /tmp/src

In [None]:
%%writefile infer_qwen.py
import os

# Turn off vLLM's V1 engine. This is set before vllm is imported, exactly as
# inference.py does, so the setting is in place however early vllm reads its
# environment.
os.environ["VLLM_USE_V1"] = "0"

import math
from typing import List, Dict

import numpy as np
import pandas as pd
import torch
import vllm
from logits_processor_zoo.vllm import MultipleChoiceLogitsProcessor
from vllm.lora.request import LoRARequest

from rule_knowledge import canonicalize_rule
from text_cleaning import strip_emojis_kaomoji as _strip

# Input data, base model and pre-trained LoRA adapter.
DATA_PATH = "/kaggle/input/jigsaw-agile-community-rules/test.csv"
MODEL_NAME = "/kaggle/input/qwen2.5/transformers/14b-instruct-gptq-int4/1"
LORA_PATH  = "/kaggle/input/lora_14b_gptq_1epoch_r32/keras/default/1"

# vLLM engine settings.
GPU_MEM_UTIL = 0.90
MAX_MODEL_LEN = 1036
# Number of prompts sent to llm.generate per call.
CHUNK_SIZE = 64
# Per-field token budgets applied before the prompt is assembled.
MAX_TOK_BODY = 128
MAX_TOK_EX   = 64
MAX_TOK_RULE = 64
SEED = 42

# Answer words scored at inference.
POS = "Yes"; NEG = "No"

def build_llm():
    """Create the vLLM engine for MODEL_NAME, sharded across all visible GPUs, with LoRA enabled."""
    engine_options = {
        "quantization": "gptq",
        "tensor_parallel_size": torch.cuda.device_count(),
        "gpu_memory_utilization": GPU_MEM_UTIL,
        "trust_remote_code": True,
        "dtype": "half",
        "enforce_eager": True,
        "max_model_len": MAX_MODEL_LEN,
        "disable_log_stats": True,
        "enable_prefix_caching": True,
        "enable_lora": True,
        "max_lora_rank": 32,
    }
    return vllm.LLM(MODEL_NAME, **engine_options)

def truncate_by_tokens(text: str, tokenizer, max_tokens: int) -> str:
    """Limit ``text`` to at most ``max_tokens`` tokens.

    Args:
        text: Text to limit; falsy values yield "".
        tokenizer: Object providing ``encode(text, add_special_tokens=False)``
            and ``decode(ids, skip_special_tokens=True)``.
        max_tokens: Maximum number of tokens to keep.

    Returns:
        ``text`` itself when it already fits, otherwise the decoded first
        ``max_tokens`` tokens.
    """
    if not text:
        return ""
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    if len(token_ids) > max_tokens:
        return tokenizer.decode(token_ids[:max_tokens], skip_special_tokens=True)
    return text

# System message sent with every request; it states the task and restricts
# the answer to the single word 'Yes' or 'No'.
SYS_PROMPT = (
    "You are a strict rule-violation classifier for Reddit comments.\n"
    "Decide if the comment violates the given rule.\n"
    "- Use ONLY the rule (including the canonical definition) and the provided examples.\n"
    "- Ignore meme slang unless directly relevant to the rule.\n"
    "- If uncertain, answer 'No'.\n"
    "- Output EXACTLY one word: 'Yes' or 'No'."
)

def build_prompt_rows(df: pd.DataFrame, tokenizer) -> List[str]:
    """Build one chat-formatted prompt per row of ``df``.

    Each field is cleaned, truncated to its token budget and placed in a user
    message together with both positive and both negative examples. The chat
    template is applied with the generation prompt and "Answer:" is appended,
    so the next generated token is the answer.

    Args:
        df: Test rows with subreddit, rule, body and the four example columns.
        tokenizer: Tokenizer providing ``apply_chat_template``, ``encode`` and ``decode``.

    Returns:
        List of prompt strings in row order.
    """
    def _field(row, column: str) -> str:
        value = row.get(column, "")
        # Empty CSV cells are read as NaN; without this check the literal
        # text "nan" would be written into the prompt.
        return "" if pd.isna(value) else _strip(value)

    def apply_chat(messages):
        return tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

    prompts = []
    for _, row in df.iterrows():
        # Clean first.
        subreddit = _field(row, "subreddit")
        rule_raw = _field(row, "rule")
        pos1 = _field(row, "positive_example_1")
        pos2 = _field(row, "positive_example_2")
        neg1 = _field(row, "negative_example_1")
        neg2 = _field(row, "negative_example_2")
        body = _field(row, "body")

        canon = canonicalize_rule(rule_raw)
        rule_block = f"Rule: {rule_raw}\n"
        if canon:
            rule_block += f"Canonical Definition: {canon}\n"

        # Apply the per-field token limits.
        rule_block = truncate_by_tokens(rule_block, tokenizer, MAX_TOK_RULE)
        pos1 = truncate_by_tokens(pos1, tokenizer, MAX_TOK_EX)
        pos2 = truncate_by_tokens(pos2, tokenizer, MAX_TOK_EX)
        neg1 = truncate_by_tokens(neg1, tokenizer, MAX_TOK_EX)
        neg2 = truncate_by_tokens(neg2, tokenizer, MAX_TOK_EX)
        body = truncate_by_tokens(body, tokenizer, MAX_TOK_BODY)

        user_text = (
            f"Subreddit: r/{subreddit}\n"
            f"{rule_block}\n"
            f"Examples of violation (Yes):\n1) {pos1}\n2) {pos2}\n\n"
            f"Examples of non-violation (No):\n3) {neg1}\n4) {neg2}\n\n"
            f"Target comment:\n5) {body}\n"
        )

        prompt = apply_chat([
            {"role":"system","content":SYS_PROMPT},
            {"role":"user","content":user_text},
        ]) + "Answer:"
        prompts.append(prompt)
    return prompts

def run_chunked_inference(llm, tokenizer, prompts: List[str]) -> List[Dict[str, float]]:
    """Generate one token per prompt, CHUNK_SIZE prompts at a time, and collect answer log-probabilities.

    Returns:
        One dict per prompt with the log-probabilities of POS and NEG; an
        answer absent from the returned log-probabilities is reported as -1e9.
    """
    mclp = MultipleChoiceLogitsProcessor(tokenizer, choices=[POS, NEG])
    collected = []
    for start in range(0, len(prompts), CHUNK_SIZE):
        chunk = prompts[start:start + CHUNK_SIZE]
        chunk_outputs = llm.generate(
            chunk,
            vllm.SamplingParams(skip_special_tokens=True, max_tokens=1, logits_processors=[mclp], logprobs=2),
            use_tqdm=True,
            lora_request=LoRARequest("default", 1, LORA_PATH),
        )
        for result in chunk_outputs:
            token_logprobs = {}
            for entry in result.outputs[0].logprobs[0].values():
                token_logprobs[entry.decoded_token] = entry.logprob
            collected.append({
                POS: token_logprobs.get(POS, -1e9),
                NEG: token_logprobs.get(NEG, -1e9),
            })
    return collected

def main():
    """Score test.csv with the 14B model and write submission_qwen14b.csv.

    The submitted score is the sigmoid of log p(POS) - log p(NEG).
    """
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    test_rows = pd.read_csv(DATA_PATH)
    llm = build_llm()
    tokenizer = llm.get_tokenizer()

    prompts = build_prompt_rows(test_rows, tokenizer)
    answer_logprobs = run_chunked_inference(llm, tokenizer, prompts)

    scored = pd.concat([test_rows, pd.DataFrame(answer_logprobs)[[POS, NEG]]], axis=1)
    scored["logit_diff"] = scored[POS] - scored[NEG]
    scored["rule_violation"] = 1.0 / (1.0 + np.exp(-scored["logit_diff"]))
    scored[["row_id","rule_violation"]].to_csv("submission_qwen14b.csv", index=False)
    print("✅ Saved submission_qwen14b.csv")

if __name__ == "__main__":
    main()

In [None]:
# %cd /tmp
!python infer_qwen.py

# 3. Qwen3 0.6b Embedding

In [None]:
import os
import pandas as pd

In [None]:
%%writefile constants.py
# Base embedding model. The name is spelled "EMBDEDDING" (sic); semantic.py
# imports it under exactly this name.
EMBDEDDING_MODEL_PATH = "/kaggle/input/qwen-3-embedding/transformers/0.6b/1"
# Directory of the LoRA adapter that semantic.py merges into the base model.
MODEL_OUTPUT_PATH = '/kaggle/input/qwen3-8b-embedding'
DATA_PATH = "/kaggle/input/jigsaw-agile-community-rules"

# Query-side instruction prefix, taken from:
# https://huggingface.co/Qwen/Qwen3-Embedding-0.6B/blob/main/config_sentence_transformers.json
EMBEDDING_MODEL_QUERY = "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery:"

# Whether prompts are passed through utils.cleaner.
CLEAN_TEXT = True
# Number of nearest corpus entries retrieved per query.
TOP_K = 2000
# Batch size for embedding_model.encode.
BATCH_SIZE = 128

In [None]:
%%writefile utils.py
import pandas as pd
import torch.distributed as dist
from datasets import Dataset
from cleantext import clean
from tqdm.auto import tqdm
from text_cleaning import strip_emojis_kaomoji as _strip
from constants import CLEAN_TEXT

def build_prompt(row):
    """Return the text that gets embedded for ``row``: the subreddit line followed by the comment."""
    subreddit = row["subreddit"]
    body = row["body"]
    return f"r/{subreddit}\nComment: {body}"

def cleaner(text):
    """Normalise ``text`` with clean-text, then strip emoji and Markdown decoration.

    URLs, e-mail addresses and phone numbers are replaced by the placeholders
    <URL>, <EMAIL> and <PHONE>; case, line breaks, numbers, currency symbols
    and punctuation are kept.
    """
    clean_options = {
        "fix_unicode": True,
        "to_ascii": True,  # most emoji are already dropped at this step
        "lower": False,
        "no_line_breaks": False,
        "no_urls": True,
        "no_emails": True,
        "no_phone_numbers": True,
        "no_numbers": False,
        "no_digits": False,
        "no_currency_symbols": False,
        "no_punct": False,
        "replace_with_url": "<URL>",
        "replace_with_email": "<EMAIL>",
        "replace_with_phone_number": "<PHONE>",
        "lang": "en",
    }
    normalised = clean(text, **clean_options)
    # Second pass over whatever is left after the ASCII conversion.
    return _strip(normalised)

def get_dataframe_to_train(data_path):
    """Build the labelled retrieval corpus.

    Combines the labelled rows of train.csv with the positive/negative example
    columns of a fixed 60% sample of test.csv (positive examples labelled 1,
    negative examples labelled 0).

    Args:
        data_path: Directory containing train.csv and test.csv.

    Returns:
        De-duplicated DataFrame with columns body, rule, subreddit, rule_violation.
    """
    train_dataset = pd.read_csv(f"{data_path}/train.csv")
    sampled_test = (
        pd.read_csv(f"{data_path}/test.csv")
        .sample(frac=0.6, random_state=42)
        .reset_index(drop=True)
    )

    frames = [train_dataset[["body","rule","subreddit","rule_violation"]]]
    for prefix, label in (("positive", 1), ("negative", 0)):
        for idx in (1, 2):
            example_col = f"{prefix}_example_{idx}"
            part = sampled_test[[example_col, "rule", "subreddit"]].copy()
            part = part.rename(columns={example_col: "body"})
            part["rule_violation"] = label
            frames.append(part)

    combined = pd.concat(frames, axis=0)
    return combined.drop_duplicates(ignore_index=True)

def prepare_dataframe(dataframe):
    """Return a copy of ``dataframe`` with a "prompt" column and signed labels.

    The prompt is built per row and, when CLEAN_TEXT is set, passed through
    ``cleaner``. If a "rule_violation" column is present, its values are
    remapped from 1/0 to +1/-1.
    """
    frame = dataframe.copy()
    frame["prompt"] = frame.apply(build_prompt, axis=1)

    if CLEAN_TEXT:
        tqdm.pandas(desc="cleaner")
        frame["prompt"] = frame["prompt"].progress_apply(cleaner)

    if "rule_violation" not in frame.columns:
        return frame

    signed_label = {1: 1, 0: -1}
    frame["rule_violation"] = frame["rule_violation"].map(signed_label)
    return frame

In [None]:
%%writefile semantic.py
import pandas as pd
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import semantic_search, dot_score
from tqdm.auto import tqdm
from peft import PeftModel, PeftConfig


from utils import get_dataframe_to_train, prepare_dataframe
from constants import DATA_PATH, EMBDEDDING_MODEL_PATH, EMBEDDING_MODEL_QUERY, TOP_K, BATCH_SIZE, MODEL_OUTPUT_PATH



def get_scores(test_dataframe):
    """Score every test row by label-weighted similarity to the labelled corpus.

    The LoRA adapter at MODEL_OUTPUT_PATH is merged into the base embedding
    model, the merged model is saved to "Qwen3Emb_Finetuned" and reloaded as a
    SentenceTransformer. Then, rule by rule, each test prompt is compared with
    the corpus prompts of the same rule and its score is the sum over the
    retrieved entries of similarity * rule_violation.

    Args:
        test_dataframe: Prepared test rows; "rule", "prompt" and "row_id" are read.

    Returns:
        DataFrame with columns "row_id" and "rule_violation" (the raw score).
    """
    corpus_dataframe = get_dataframe_to_train(DATA_PATH)
    corpus_dataframe = prepare_dataframe(corpus_dataframe)
    
    # Load base model
    model = AutoModelForCausalLM.from_pretrained(EMBDEDDING_MODEL_PATH)
    tokenizer = AutoTokenizer.from_pretrained(EMBDEDDING_MODEL_PATH)
    
    # Load adapter configuration and model
    adapter_config = PeftConfig.from_pretrained(MODEL_OUTPUT_PATH)
    lora_model = PeftModel.from_pretrained(model, MODEL_OUTPUT_PATH, config=adapter_config)
    # Fold the adapter weights into the base model and write the result to
    # disk so it can be reloaded by path below.
    merged_model = lora_model.merge_and_unload()
    tokenizer.save_pretrained("Qwen3Emb_Finetuned")
    merged_model.save_pretrained("Qwen3Emb_Finetuned")

    # Rebuild a SentenceTransformer from the merged encoder.
    # NOTE(review): only the tokenizer and model weights are saved to this
    # directory — confirm SentenceTransformer applies the intended pooling.
    embedding_model = SentenceTransformer(model_name_or_path="Qwen3Emb_Finetuned", device="cuda")

    print('Done loading model!')

    result = []
    for rule in tqdm(test_dataframe["rule"].unique(), desc=f"Generate scores for each rule"):
        # Restrict both the queries and the corpus to the current rule.
        test_dataframe_part = test_dataframe.query("rule == @rule").reset_index(drop=True)
        corpus_dataframe_part = corpus_dataframe.query("rule == @rule").reset_index(drop=True)
        # Expose the positional index as "row_id" so retrieved corpus_id values can be joined on it.
        corpus_dataframe_part = corpus_dataframe_part.reset_index(names="row_id")
        
        # Queries are encoded with the instruction prompt; corpus documents without it.
        query_embeddings = embedding_model.encode(
            sentences=test_dataframe_part["prompt"].tolist(),
            prompt=EMBEDDING_MODEL_QUERY,
            batch_size=BATCH_SIZE,
            show_progress_bar=True,
            convert_to_tensor=True,
            device="cuda",
            normalize_embeddings=True,
        )
        document_embeddings = embedding_model.encode(
            sentences=corpus_dataframe_part["prompt"].tolist(),
            batch_size=BATCH_SIZE,
            show_progress_bar=True,
            convert_to_tensor=True,
            device="cuda",
            normalize_embeddings=True,
        )
        # One list of hits per query, stored as a column of the test frame.
        test_dataframe_part["semantic"] = semantic_search(
            query_embeddings,
            document_embeddings,
            top_k=TOP_K,
            score_function=dot_score,
        )
        def get_score(semantic):
            # Attach each hit's corpus label, weight the similarity by it and sum.
            semantic = pd.DataFrame(semantic)
            semantic = semantic.merge(
                corpus_dataframe_part[["row_id", "rule_violation"]],
                how="left",
                left_on="corpus_id",
                right_on="row_id",
            )
            semantic["score"] = semantic["score"]*semantic["rule_violation"]
            return semantic["score"].sum()
            
        tqdm.pandas(desc=f"Add label for {rule=}")
        test_dataframe_part["rule_violation"] = test_dataframe_part["semantic"].progress_apply(get_score)
        result.append(test_dataframe_part[["row_id", "rule_violation"]].copy())
        
    submission = pd.concat(result, axis=0)
    return submission


def generate_submission():
    """Score test.csv with the embedding model and write submission_qwen3.csv in test-file row order."""
    prepared_test = prepare_dataframe(pd.read_csv(f"{DATA_PATH}/test.csv"))

    scores = get_scores(prepared_test)
    # Left-join on row_id so the output follows the order of test.csv.
    ordered = prepared_test[["row_id"]].merge(scores, on="row_id", how="left")
    ordered.to_csv("submission_qwen3.csv", index=False)


if __name__ == "__main__":
    generate_submission()



In [None]:
!python semantic.py

# 4. ENSEMBLE RESULT

In [None]:
import pandas as pd
import numpy as np

q = pd.read_csv('submission_qwen.csv')
l = pd.read_csv('submission_qwen3.csv')
m = pd.read_csv('submission_qwen14b.csv')


def _rank_by_row_id(frame):
    """Return rule_violation as normalised ranks in (0, 1), indexed by row_id."""
    scores = frame.set_index('row_id')['rule_violation']
    return scores.rank(method='average') / (len(scores) + 1)


rq = _rank_by_row_id(q)
rl = _rank_by_row_id(l)
rm = _rank_by_row_id(m)

# The three series are indexed by row_id, so the weighted sum combines scores
# of the same test row even if the files are ordered differently.
blend = 0.5*rq + 0.3*rl + 0.2*rm   # or tune the rank-weights with a tiny grid using OOF
q['rule_violation'] = q['row_id'].map(blend)

# A row missing from any model (or a NaN score) would yield a NaN prediction;
# stop here rather than write an unusable submission.
if q['rule_violation'].isna().any():
    raise ValueError("Blended submission contains NaN: the three model outputs do not cover the same row_ids.")

q.to_csv('/kaggle/working/submission.csv', index=False)


In [None]:
import pandas as pd
# Read the blended submission back as a quick visual check.
pd.read_csv('/kaggle/working/submission.csv')

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from utils import get_dataframe_to_train
from constants import DATA_PATH

# --- Load the training data ---
train_df = get_dataframe_to_train(DATA_PATH)

# --- Labels and a dummy feature array ---
y_true = train_df["rule_violation"]
X_dummy = np.zeros(len(train_df))  # no real features; only used to drive the split

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
auc_scores = []

# Placeholder evaluation: the "predictions" are uniform random numbers, so the
# printed AUCs only exercise the CV loop and say nothing about any model.
fold = 0
for _, val_idx in kf.split(X_dummy, y_true):
    fold += 1
    y_val = y_true.iloc[val_idx]
    random_scores = np.random.rand(len(y_val))  # stand-in for model predictions
    auc = roc_auc_score(y_val, random_scores)
    auc_scores.append(auc)
    print(f"Fold {fold}: AUC = {auc:.4f}")

print(f"\n✅ Mean CV AUC: {np.mean(auc_scores):.4f} ± {np.std(auc_scores):.4f}")