In [None]:
%%writefile constants.py
# Filesystem locations shared by train.py and inference.py.
BASE_MODEL_PATH = "/kaggle/input/qwen2.5/transformers/0.5b-instruct-gptq-int4/1"
# train.py saves the LoRA adapter here and inference.py loads it from here.
LORA_PATH = "output/"
# Directory that holds train.csv and test.csv (note the trailing slash).
DATA_PATH = "/kaggle/input/jigsaw-agile-community-rules/"

# Prompt vocabulary. The same strings are used to build training prompts,
# to map labels to completions, and as the allowed answer tokens at inference.
POSITIVE_ANSWER = "Yes"
NEGATIVE_ANSWER = "No"
COMPLETE_PHRASE = "Answer:"
BASE_PROMPT = '''You are an expert content moderator. Analyze if the comment violates the subreddit rule.'''

# Training hyperparameters consumed by train.py.
LORA_RANK = 32  # LoRA rank (r); inference.py passes max_lora_rank=64 to vLLM
LORA_ALPHA = 64  # LoRA scaling factor (lora_alpha)
NUM_EPOCHS = 3  # num_train_epochs
LEARNING_RATE = 5e-5  # peak learning rate for the cosine schedule
WARMUP_RATIO = 0.1  # fraction of training used for warmup

# Test-time augmentation
TTA_ROUNDS = 5  # Number of inference passes averaged by run_tta_inference

In [None]:
%%writefile utils.py
import pandas as pd
from datasets import Dataset
from constants import POSITIVE_ANSWER, NEGATIVE_ANSWER, COMPLETE_PHRASE, BASE_PROMPT
import random, numpy as np
import os
import sys

# Seed the stdlib and NumPy global RNGs as a side effect of importing this
# module, so random draws made by importers afterwards are reproducible.
random.seed(42)
np.random.seed(42)


def build_prompt(row):
    """Render the few-shot moderation prompt for one row.

    ``row`` must expose non-null ``subreddit``, ``rule``, ``positive_example``,
    ``negative_example`` and ``body`` entries. The returned text ends with
    ``COMPLETE_PHRASE`` so that the next generated token is the answer.

    Raises:
        ValueError: when a required field is absent or null. Every failure is
            reported on stderr and then re-raised unchanged.
    """
    needed = ("subreddit", "rule", "positive_example", "negative_example", "body")
    try:
        # Report the first offending field, in declaration order.
        invalid = next(
            (name for name in needed if name not in row or pd.isna(row[name])),
            None,
        )
        if invalid is not None:
            raise ValueError(f"Missing or invalid field: {invalid}")

        subreddit = row["subreddit"]
        rule = row["rule"]
        violating = row["positive_example"]
        allowed = row["negative_example"]
        comment = row["body"]

        sections = [
            f"{BASE_PROMPT}",
            "",
            f"Subreddit: r/{subreddit}",
            f"Rule: {rule}",
            "",
            "Examples of rule violations (should answer Yes):",
            f"Example 1: {violating}",
            f"{COMPLETE_PHRASE} {POSITIVE_ANSWER}",
            "",
            "Examples of allowed content (should answer No):",
            f"Example 2: {allowed}",
            f"{COMPLETE_PHRASE} {NEGATIVE_ANSWER}",
            "",
            "Now analyze this comment:",
            f"Comment: {comment}",
            f"{COMPLETE_PHRASE}",
        ]
        return "\n".join(sections)
    except Exception as e:
        print(f"Error building prompt: {e}", file=sys.stderr)
        raise


def get_dataframe_to_train(data_path):
    """Load ``train.csv`` and expand it into few-shot training rows.

    Two augmentation strategies are applied:

    1. Every labelled comment is paired with each of the four
       (positive example, negative example) combinations.
    2. Every example comment is itself used as a training body (label 1 for
       positive examples, 0 for negative ones), with the *other* example of
       the same kind and one example of the opposite kind as context.

    Args:
        data_path: Directory containing ``train.csv``.

    Returns:
        A DataFrame with columns ``body``, ``rule``, ``subreddit``,
        ``rule_violation``, ``positive_example`` and ``negative_example``,
        free of missing values and duplicate rows, with a 0..n-1 index.

    Raises:
        FileNotFoundError: if ``train.csv`` does not exist.
        ValueError: if required columns are missing or no rows survive.
    """
    try:
        csv_path = f"{data_path}/train.csv"
        if not os.path.exists(csv_path):
            raise FileNotFoundError(f"Training data not found at {csv_path}")

        train_dataset = pd.read_csv(csv_path)

        # Validate required columns
        required_columns = ["body", "rule", "subreddit", "rule_violation",
                           "positive_example_1", "positive_example_2",
                           "negative_example_1", "negative_example_2"]
        missing_columns = [col for col in required_columns if col not in train_dataset.columns]
        if missing_columns:
            raise ValueError(f"Missing required columns: {missing_columns}")

        flatten = []

        # Strategy 1: Use all combinations of examples for better generalization
        for pos_ex in ["positive_example_1", "positive_example_2"]:
            for neg_ex in ["negative_example_1", "negative_example_2"]:
                main_train_df = train_dataset[["body", "rule", "subreddit", "rule_violation"]].copy()
                main_train_df["positive_example"] = train_dataset[pos_ex]
                main_train_df["negative_example"] = train_dataset[neg_ex]
                flatten.append(main_train_df)

        # Strategy 2: Use examples as training data with cross-examples
        for pos_idx in [1, 2]:
            for neg_idx in [1, 2]:
                # Positive examples
                pos_df = train_dataset[["rule", "subreddit"]].copy()
                pos_df["body"] = train_dataset[f"positive_example_{pos_idx}"]
                pos_df["rule_violation"] = 1
                pos_df["positive_example"] = train_dataset[f"positive_example_{3-pos_idx}"]  # Use the other one
                pos_df["negative_example"] = train_dataset[f"negative_example_{neg_idx}"]
                flatten.append(pos_df)

                # Negative examples
                neg_df = train_dataset[["rule", "subreddit"]].copy()
                neg_df["body"] = train_dataset[f"negative_example_{neg_idx}"]
                neg_df["rule_violation"] = 0
                neg_df["positive_example"] = train_dataset[f"positive_example_{pos_idx}"]
                neg_df["negative_example"] = train_dataset[f"negative_example_{3-neg_idx}"]  # Use the other one
                flatten.append(neg_df)

        dataframe = pd.concat(flatten, axis=0, ignore_index=True)

        # Remove rows with missing values
        dataframe = dataframe.dropna()

        # Remove exact duplicates only. The example columns are part of the
        # key: deduplicating on the body/label columns alone would collapse
        # the example combinations generated above back into a single row.
        prompt_columns = ["body", "rule", "subreddit", "rule_violation",
                          "positive_example", "negative_example"]
        dataframe = dataframe.drop_duplicates(subset=prompt_columns, ignore_index=True)

        if dataframe.empty:
            raise ValueError("No valid training data after processing")

        print(f"Generated {len(dataframe)} training examples with augmentation")
        return dataframe
    except Exception as e:
        print(f"Error loading training data: {e}", file=sys.stderr)
        raise



def build_dataset(dataframe):
    """Turn a dataframe of rows into a ``datasets.Dataset`` of prompts.

    A ``prompt`` column is rendered with ``build_prompt``. When the input has
    a ``rule_violation`` column, a ``completion`` column holding
    ``POSITIVE_ANSWER`` / ``NEGATIVE_ANSWER`` is added as well. The resulting
    table is also dumped to ``/kaggle/working/dataset.csv`` for inspection.

    The caller's dataframe is left untouched.

    Args:
        dataframe: Rows accepted by ``build_prompt``; optionally labelled.

    Returns:
        A ``Dataset`` with a ``prompt`` column and, for labelled input, a
        ``completion`` column.

    Raises:
        ValueError: if the dataframe is empty/None or a label is not 0 or 1.
    """
    try:
        if dataframe is None or len(dataframe) == 0:
            raise ValueError("Empty dataframe provided")

        # Work on a copy so repeated calls on the same frame (e.g. one call
        # per TTA round) never see columns added by a previous call.
        frame = dataframe.copy()
        frame["prompt"] = frame.apply(build_prompt, axis=1)

        columns = ["prompt"]
        if "rule_violation" in frame:
            completion = frame["rule_violation"].map(
                {
                    1: POSITIVE_ANSWER,
                    0: NEGATIVE_ANSWER,
                }
            )
            # Series.map yields NaN for unmapped values; fail loudly rather
            # than feed empty completions to the trainer.
            if completion.isna().any():
                raise ValueError("rule_violation must be 0 or 1 for every row")
            frame["completion"] = completion
            columns.append("completion")

        # preserve_index=False keeps a non-contiguous pandas index from
        # becoming an extra "__index_level_0__" dataset column.
        dataset = Dataset.from_pandas(frame[columns], preserve_index=False)

        # Safe file writing
        output_path = "/kaggle/working/dataset.csv"
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        dataset.to_pandas().to_csv(output_path, index=False)

        return dataset
    except Exception as e:
        print(f"Error building dataset: {e}", file=sys.stderr)
        raise

In [None]:
%%writefile train.py
import pandas as pd
import sys
import os

from trl import SFTTrainer, SFTConfig
from peft import LoraConfig
from tqdm.auto import tqdm
from transformers.utils import is_torch_bf16_gpu_available
from utils import build_dataset, get_dataframe_to_train
from constants import DATA_PATH, BASE_MODEL_PATH, LORA_PATH, LORA_RANK, LORA_ALPHA, NUM_EPOCHS, LEARNING_RATE, WARMUP_RATIO


def main():
    """Fine-tune the base model with LoRA on the augmented training set.

    Validates the data and model paths, builds the prompt/completion dataset,
    runs ``SFTTrainer`` with the hyperparameters from ``constants`` and saves
    the adapter to ``LORA_PATH``. Any failure is reported on stderr together
    with its traceback and the process exits with status 1.
    """
    # Local import: this module does not import traceback at file level.
    import traceback

    try:
        # Validate paths
        if not os.path.exists(DATA_PATH):
            raise FileNotFoundError(f"Data path not found: {DATA_PATH}")
        if not os.path.exists(BASE_MODEL_PATH):
            raise FileNotFoundError(f"Model path not found: {BASE_MODEL_PATH}")

        # Ensure output directory exists
        os.makedirs(LORA_PATH, exist_ok=True)

        dataframe = get_dataframe_to_train(DATA_PATH)
        train_dataset = build_dataset(dataframe)

        # LoRA adapters on every attention and MLP projection.
        lora_config = LoraConfig(
            r=LORA_RANK,
            lora_alpha=LORA_ALPHA,
            lora_dropout=0.05,
            bias="none",
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
            task_type="CAUSAL_LM",
        )

        # Query the hardware once so the two precision flags cannot disagree.
        use_bf16 = is_torch_bf16_gpu_available()

        training_args = SFTConfig(
            num_train_epochs=NUM_EPOCHS,

            per_device_train_batch_size=4,
            gradient_accumulation_steps=4,

            optim="paged_adamw_8bit",
            learning_rate=LEARNING_RATE,
            weight_decay=0.01,
            max_grad_norm=1.0,

            lr_scheduler_type="cosine",
            warmup_ratio=WARMUP_RATIO,

            bf16=use_bf16,
            fp16=not use_bf16,
            dataloader_pin_memory=True,

            gradient_checkpointing=True,
            gradient_checkpointing_kwargs={"use_reentrant": False},

            save_strategy="epoch",  # Write a checkpoint after every epoch
            save_total_limit=2,  # Keep only the two most recent checkpoints
            load_best_model_at_end=False,  # No evaluation runs, so there is no "best" checkpoint to reload
            report_to="none",

            completion_only_loss=True,  # Loss on the answer tokens only, not the prompt
            packing=False,
            remove_unused_columns=False,

            logging_steps=50,
            eval_strategy="no",  # No eval during training (we use all data for training)
        )

        trainer = SFTTrainer(
            BASE_MODEL_PATH,
            args=training_args,
            train_dataset=train_dataset,
            peft_config=lora_config,
        )

        trainer.train()
        trainer.save_model(LORA_PATH)
        print(f"✅ Model saved successfully to {LORA_PATH}")
        print(f"Training completed with {NUM_EPOCHS} epochs, LoRA rank={LORA_RANK}")

    except Exception as e:
        print(f"❌ Training failed: {e}", file=sys.stderr)
        # Show where the failure happened, as inference.py does.
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()

In [None]:
%%writefile inference.py
import os
os.environ["VLLM_USE_V1"] = "0"

import vllm
import torch
import pandas as pd
from logits_processor_zoo.vllm import MultipleChoiceLogitsProcessor
from vllm.lora.request import LoRARequest
from utils import build_dataset
from constants import BASE_MODEL_PATH, LORA_PATH, DATA_PATH, POSITIVE_ANSWER, NEGATIVE_ANSWER, TTA_ROUNDS
import random
import multiprocessing as mp
import sys
import traceback
import numpy as np


def run_inference_on_device(df_slice):
    """Score one dataframe slice with a single vLLM pass.

    Generation is restricted to the two answer tokens and limited to one
    token; the log-probabilities reported for that token are returned.

    Args:
        df_slice: Rows accepted by ``build_dataset``, including ``row_id``.

    Returns:
        DataFrame with one log-probability column per answer
        (``POSITIVE_ANSWER``, ``NEGATIVE_ANSWER``) plus ``row_id``.

    Raises:
        Exception: any failure is logged with its traceback and re-raised.
    """
    engine = None
    try:
        engine_options = dict(
            quantization="gptq",
            tensor_parallel_size=1,
            gpu_memory_utilization=0.98,
            trust_remote_code=True,
            dtype="half",
            enforce_eager=True,
            max_model_len=2836,
            disable_log_stats=True,
            enable_prefix_caching=True,
            enable_lora=True,
            max_lora_rank=64,
        )
        engine = vllm.LLM(BASE_MODEL_PATH, **engine_options)

        # Only the two answer tokens may be produced.
        answer_filter = MultipleChoiceLogitsProcessor(
            engine.get_tokenizer(),
            choices=[POSITIVE_ANSWER, NEGATIVE_ANSWER],
        )

        prompts = build_dataset(df_slice)["prompt"]

        sampling = vllm.SamplingParams(
            skip_special_tokens=True,
            max_tokens=1,
            logits_processors=[answer_filter],
            logprobs=2,
            temperature=0.7,
        )
        adapter = LoRARequest("default", 1, LORA_PATH)
        generations = engine.generate(
            prompts,
            sampling,
            use_tqdm=True,
            lora_request=adapter,
        )

        # One {token: logprob} mapping per prompt, taken from the single
        # generated position.
        rows = []
        for generation in generations:
            first_position = generation.outputs[0].logprobs[0]
            rows.append(
                {cand.decoded_token: cand.logprob for cand in first_position.values()}
            )

        scores = pd.DataFrame(rows)[[POSITIVE_ANSWER, NEGATIVE_ANSWER]]
        scores["row_id"] = df_slice["row_id"].values
        return scores
    except Exception as e:
        print(f"Error in run_inference_on_device: {e}", file=sys.stderr)
        traceback.print_exc()
        raise
    finally:
        # Release the engine reference, then return cached GPU memory.
        if engine is not None:
            del engine
        torch.cuda.empty_cache()


def _resample_examples(df_slice, seed):
    """Return a copy of ``df_slice`` with one example of each kind re-drawn per row.

    For every row, ``positive_example`` is picked from ``positive_example_1`` /
    ``positive_example_2`` and ``negative_example`` from ``negative_example_1``
    / ``negative_example_2``, using a generator seeded with ``seed``.
    """
    rng = np.random.default_rng(seed)
    resampled = df_slice.copy()
    for kind in ("positive", "negative"):
        take_first = rng.integers(0, 2, size=len(resampled)).astype(bool)
        resampled[f"{kind}_example"] = np.where(
            take_first,
            resampled[f"{kind}_example_1"].to_numpy(dtype=object),
            resampled[f"{kind}_example_2"].to_numpy(dtype=object),
        )
    return resampled


def run_tta_inference(df_slice):
    """Run test-time augmentation: average answer log-probs over several prompts.

    Each round re-draws which positive/negative example is shown in the prompt
    (seeded per round, so runs are reproducible), scores every row, and the
    per-row log-probabilities are averaged across rounds.

    Re-drawing requires the raw ``positive_example_1/2`` and
    ``negative_example_1/2`` columns. If they are absent, the pre-selected
    ``positive_example`` / ``negative_example`` columns are used and a single
    round is run, because repeating identical prompts adds no information.

    Args:
        df_slice: Test rows including ``row_id``.

    Returns:
        DataFrame with the averaged ``POSITIVE_ANSWER`` / ``NEGATIVE_ANSWER``
        log-probabilities plus ``row_id``.

    Raises:
        Exception: any failure is logged with its traceback and re-raised.
    """
    raw_columns = [
        f"{kind}_example_{idx}" for kind in ("positive", "negative") for idx in (1, 2)
    ]
    can_resample = all(col in df_slice.columns for col in raw_columns)
    num_rounds = max(1, TTA_ROUNDS) if can_resample else 1

    llm = None
    try:
        llm = vllm.LLM(
            BASE_MODEL_PATH,
            quantization="gptq",
            tensor_parallel_size=1,
            gpu_memory_utilization=0.98,
            trust_remote_code=True,
            dtype="half",
            enforce_eager=True,
            max_model_len=2836,
            disable_log_stats=True,
            enable_prefix_caching=True,
            enable_lora=True,
            max_lora_rank=64,
        )

        tokenizer = llm.get_tokenizer()
        mclp = MultipleChoiceLogitsProcessor(tokenizer, choices=[POSITIVE_ANSWER, NEGATIVE_ANSWER])

        all_predictions = []

        # Test-Time Augmentation: try different example combinations
        for tta_round in range(num_rounds):
            if can_resample:
                # Different seed for each round -> different example choices
                round_df = _resample_examples(df_slice, seed=42 + tta_round)
            else:
                round_df = df_slice

            test_dataset = build_dataset(round_df)
            texts = test_dataset["prompt"]

            outputs = llm.generate(
                texts,
                vllm.SamplingParams(
                    skip_special_tokens=True,
                    max_tokens=1,
                    logits_processors=[mclp],
                    logprobs=2,
                    temperature=0.7,
                ),
                use_tqdm=True,
                lora_request=LoRARequest("default", 1, LORA_PATH)
            )

            log_probs = [
                {lp.decoded_token: lp.logprob for lp in out.outputs[0].logprobs[0].values()}
                for out in outputs
            ]
            predictions = pd.DataFrame(log_probs)[[POSITIVE_ANSWER, NEGATIVE_ANSWER]]
            all_predictions.append(predictions)

        # Average predictions from all TTA rounds. Every round's frame has a
        # fresh 0..n-1 index, so grouping on it averages per input row.
        avg_predictions = pd.concat(all_predictions).groupby(level=0).mean()
        avg_predictions["row_id"] = df_slice["row_id"].values

        print(f"Completed TTA with {num_rounds} rounds")
        return avg_predictions

    except Exception as e:
        print(f"Error in run_tta_inference: {e}", file=sys.stderr)
        traceback.print_exc()
        raise
    finally:
        if llm is not None:
            del llm
        torch.cuda.empty_cache()


def worker(device_id, df_slice, return_dict, use_tta=True):
    """Worker process body: score ``df_slice`` on one GPU.

    Args:
        device_id: GPU index; also the key under which the result is stored.
        df_slice: Rows to score.
        return_dict: Shared mapping that receives the predictions, or ``None``
            when the worker failed.
        use_tta: Use ``run_tta_inference`` when true, otherwise a single pass.
    """
    try:
        # Restrict this process to its own GPU before any model is created.
        os.environ["CUDA_VISIBLE_DEVICES"] = str(device_id)
        print(f"[Worker {device_id}] Running on GPU {device_id}, data size={len(df_slice)}, TTA={use_tta}")

        if use_tta:
            preds = run_tta_inference(df_slice)
        else:
            preds = run_inference_on_device(df_slice)

        return_dict[device_id] = preds
        print(f"[Worker {device_id}] ✅ Completed successfully")
    except Exception as e:
        print(f"[Worker {device_id}] ❌ Failed with error: {e}", file=sys.stderr)
        traceback.print_exc()
        # The parent treats a None entry as a failed worker.
        return_dict[device_id] = None


def main():
    """Score ``test.csv`` with TTA and write ``submission_qwen.csv``.

    The test set is split across two GPU worker processes when at least two
    GPUs and two rows are available; otherwise it is scored in-process. The
    positive-answer log-probability is converted to a rank in (0, 1) before
    saving. On any failure the traceback is printed and the process exits
    with status 1.
    """
    try:
        # Validate paths
        if not os.path.exists(DATA_PATH):
            raise FileNotFoundError(f"Data path not found: {DATA_PATH}")
        if not os.path.exists(BASE_MODEL_PATH):
            raise FileNotFoundError(f"Model path not found: {BASE_MODEL_PATH}")
        if not os.path.exists(LORA_PATH):
            raise FileNotFoundError(f"LoRA path not found: {LORA_PATH}")

        test_csv_path = f"{DATA_PATH}/test.csv"
        if not os.path.exists(test_csv_path):
            raise FileNotFoundError(f"Test data not found at {test_csv_path}")

        test_dataframe = pd.read_csv(test_csv_path)

        # Validate required columns
        required_columns = ["row_id", "positive_example_1", "positive_example_2",
                           "negative_example_1", "negative_example_2"]
        missing_columns = [col for col in required_columns if col not in test_dataframe.columns]
        if missing_columns:
            raise ValueError(f"Missing required columns: {missing_columns}")

        # Default example choice, used by the single-pass path. The raw
        # *_example_1/2 columns are deliberately kept so that each TTA round
        # can draw a different combination.
        test_dataframe["positive_example"] = test_dataframe.apply(
            lambda row: random.choice([row["positive_example_1"], row["positive_example_2"]]),
            axis=1
        )
        test_dataframe["negative_example"] = test_dataframe.apply(
            lambda row: random.choice([row["negative_example_1"], row["negative_example_2"]]),
            axis=1
        )

        # Check GPU availability
        num_gpus = torch.cuda.device_count()
        if num_gpus < 2 or len(test_dataframe) < 2:
            # A split would be impossible (one GPU) or leave one half empty.
            if num_gpus < 2:
                print(f"Warning: Expected 2 GPUs but found {num_gpus}. Using single GPU mode with TTA.", file=sys.stderr)
            predictions = run_tta_inference(test_dataframe)
        else:
            # Split data
            mid = len(test_dataframe) // 2
            df0 = test_dataframe.iloc[:mid].reset_index(drop=True)
            df1 = test_dataframe.iloc[mid:].reset_index(drop=True)

            manager = mp.Manager()
            return_dict = manager.dict()

            # Two parallel processes with TTA
            p0 = mp.Process(target=worker, args=(0, df0, return_dict, True))
            p1 = mp.Process(target=worker, args=(1, df1, return_dict, True))
            p0.start()
            p1.start()
            p0.join()
            p1.join()

            # Check for worker failures (missing key or explicit None)
            if return_dict.get(0) is None or return_dict.get(1) is None:
                raise RuntimeError("One or more workers failed during inference")

            # Merge results in the original row order
            predictions = pd.concat([return_dict[0], return_dict[1]], ignore_index=True)

        # Validate predictions
        if len(predictions) == 0:
            raise ValueError("No predictions generated")
        if "row_id" not in predictions or POSITIVE_ANSWER not in predictions:
            raise ValueError("Invalid predictions format")

        # Build submission
        submission = predictions[["row_id", POSITIVE_ANSWER]].rename(columns={POSITIVE_ANSWER: "rule_violation"})

        # Rank-based normalization: rank / (n + 1) lies strictly inside (0, 1)
        denominator = len(submission) + 1
        if denominator <= 1:
            raise ValueError("Insufficient data for ranking")

        rq = submission['rule_violation'].rank(method='average') / denominator
        submission['rule_violation'] = rq

        # Save submission
        output_path = "submission_qwen.csv"
        submission.to_csv(output_path, index=False)
        print(f"✅ Saved {output_path} with TTA averaging")

    except Exception as e:
        print(f"❌ Inference failed: {e}", file=sys.stderr)
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()


In [None]:
%%writefile accelerate_config.yaml
# Accelerate launch configuration used by `accelerate launch ... train.py`:
# single machine, two processes, DeepSpeed with fp16 mixed precision.
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  # Same values as gradient_accumulation_steps / max_grad_norm /
  # per_device_train_batch_size in train.py's SFTConfig.
  gradient_accumulation_steps: 4
  gradient_clipping: 1.0
  # NOTE(review): 4 (micro batch) x 4 (accumulation) x 2 (num_processes) = 32,
  # not 64 - confirm which value is intended.
  train_batch_size: 64
  train_micro_batch_size_per_gpu: 4

  # ZeRO stage 2 with no CPU/NVMe offloading.
  zero_stage: 2
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: false

  # NOTE(review): the stage3_* keys presumably only matter for ZeRO stage 3,
  # while zero_stage is 2 above - confirm they are needed.
  stage3_gather_16bit_weights_on_model_save: false
  stage3_max_live_parameters: 1e8
  stage3_max_reuse_distance: 1e8
  stage3_prefetch_bucket_size: 5e7
  stage3_param_persistence_threshold: 1e5

  zero_allow_untested_optimizer: true
  zero_force_ds_cpu_optimizer: false

  # NOTE(review): train.py enables bf16 instead of fp16 when the GPU supports
  # it, which would disagree with the fp16 settings here - confirm on the
  # target hardware.
  fp16:
    enabled: true
    loss_scale: 0
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    min_loss_scale: 1

distributed_type: DEEPSPEED
downcast_bf16: 'no'
dynamo_config:
  dynamo_backend: INDUCTOR
  dynamo_use_fullgraph: false
  dynamo_use_dynamic: false
enable_cpu_affinity: false
machine_rank: 0
# Name of the entry-point function; train.py defines main().
main_training_function: main
mixed_precision: fp16
num_machines: 1
# One training process per GPU (two GPUs expected).
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

In [None]:
!accelerate launch --config_file accelerate_config.yaml train.py

In [None]:
!python inference.py

In [None]:
!head submission_qwen.csv

In [None]:
# ! mkdir -p /tmp/src

In [None]:
%%writefile infer_qwen.py

import os
import pandas as pd
from logits_processor_zoo.vllm import MultipleChoiceLogitsProcessor
import torch
import vllm
import numpy as np
from vllm.lora.request import LoRARequest
import argparse
from scipy.special import softmax
import sys
import traceback


def main():
    """Score test.csv with the LoRA-adapted 14B GPTQ model.

    Builds one chat-formatted prompt per test row (both positive and both
    negative examples are shown), restricts generation to ``Yes`` / ``No``,
    converts the two answer log-probabilities into probabilities and writes
    the ``Yes`` probability to ``submission_qwen14b.csv``. On any failure the
    traceback is printed and the process exits with status 1.
    """
    try:
        # Validate paths
        test_csv = "/kaggle/input/jigsaw-agile-community-rules/test.csv"
        if not os.path.exists(test_csv):
            raise FileNotFoundError(f"Test data not found at {test_csv}")

        df = pd.read_csv(test_csv)

        # Validate required columns
        needed = ["subreddit", "rule", "positive_example_1", "positive_example_2",
                  "negative_example_1", "negative_example_2", "body"]
        absent = [col for col in needed if col not in df.columns]
        if absent:
            raise ValueError(f"Missing required columns: {absent}")

        base_model_dir = "/kaggle/input/qwen2.5/transformers/14b-instruct-gptq-int4/1"
        adapter_dir = "/kaggle/input/lora_14b_gptq_1epoch_r32/keras/default/1"

        # Validate model paths
        if not os.path.exists(base_model_dir):
            raise FileNotFoundError(f"Model not found at {base_model_dir}")
        if not os.path.exists(adapter_dir):
            raise FileNotFoundError(f"LoRA weights not found at {adapter_dir}")

        os.environ["VLLM_USE_V1"] = "0"

        engine = None
        try:
            engine_options = dict(
                quantization='gptq',
                tensor_parallel_size=torch.cuda.device_count(),
                gpu_memory_utilization=0.98,
                trust_remote_code=True,
                dtype="half",
                enforce_eager=True,
                max_model_len=2836,
                disable_log_stats=True,
                enable_prefix_caching=True,
                enable_lora=True,
                max_lora_rank=32,
            )
            engine = vllm.LLM(base_model_dir, **engine_options)
            tokenizer = engine.get_tokenizer()

            system_prompt = """You are an expert content moderator with deep understanding of community guidelines. Carefully analyze if the comment violates the given rule based on the provided examples. Consider context, tone, and intent."""

            def render_chat_prompt(row):
                """Format one test row as a chat prompt ending in 'Answer:'."""
                user_text = f"""Subreddit: r/{row.subreddit}
Rule: {row.rule}

Examples of VIOLATIONS (answer: Yes):
1) {row.positive_example_1}
2) {row.positive_example_2}

Examples of ALLOWED content (answer: No):
3) {row.negative_example_1}
4) {row.negative_example_2}

Comment to analyze:
5) {row.body}

Does this comment violate the rule?"""
                chat = [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_text},
                ]
                templated = tokenizer.apply_chat_template(
                    chat,
                    add_generation_prompt=True,
                    tokenize=False,
                )
                return templated + "Answer:"

            prompts = [render_chat_prompt(row) for _, row in df.iterrows()]
            df["prompt"] = prompts

            # Only 'Yes' or 'No' may be generated, one token per prompt.
            answer_filter = MultipleChoiceLogitsProcessor(tokenizer, choices=['Yes','No'])
            sampling = vllm.SamplingParams(
                skip_special_tokens=True,
                max_tokens=1,
                logits_processors=[answer_filter],
                logprobs=2,
                temperature=0.7,
            )
            adapter = LoRARequest("default", 1, adapter_dir)
            generations = engine.generate(
                prompts,
                sampling,
                use_tqdm=True,
                lora_request=adapter,
            )

            answer_rows = []
            for generation in generations:
                first_position = generation.outputs[0].logprobs[0]
                answer_rows.append(
                    {cand.decoded_token: cand.logprob for cand in first_position.values()}
                )
            answer_logprobs = pd.DataFrame(answer_rows)[['Yes','No']]
            df = pd.concat([df, answer_logprobs], axis=1)

            # Renormalise each row's two log-probabilities into probabilities.
            def row_softmax(pair):
                return softmax(pair.values)

            answer_cols = ['Yes', "No"]
            df[answer_cols] = df[answer_cols].apply(row_softmax, axis=1, result_type="expand")
            df["pred"] = df["Yes"]
            df['rule_violation'] = df["pred"]

            # Validate output
            if 'row_id' not in df:
                raise ValueError("Missing row_id in output")

            output_path = "submission_qwen14b.csv"
            df[['row_id', 'rule_violation']].to_csv(output_path, index=False)
            print(f"✅ Saved {output_path}")
            print(pd.read_csv(output_path).head())

        finally:
            # Release the engine reference, then return cached GPU memory.
            if engine is not None:
                del engine
            torch.cuda.empty_cache()

    except Exception as e:
        print(f"❌ Inference failed: {e}", file=sys.stderr)
        traceback.print_exc()
        sys.exit(1)


if __name__ == '__main__':
    main()

In [None]:
# %cd /tmp
!python infer_qwen.py

In [None]:
import os
import pandas as pd

In [None]:
%%writefile constants.py
# Base embedding model loaded by semantic.py.
# NOTE(review): the name is misspelled ("EMBDEDDING"); semantic.py imports it
# under this exact spelling, so rename both together if it is ever fixed.
EMBDEDDING_MODEL_PATH = "/kaggle/input/qwen-3-embedding/transformers/0.6b/1"
# Directory of the LoRA adapter that semantic.py merges into the base model.
# NOTE(review): the path mentions "8b" while the base model path says 0.6b -
# confirm the adapter matches the base model.
MODEL_OUTPUT_PATH = '/kaggle/input/qwen3-8b-embedding'
# Directory that holds train.csv and test.csv.
DATA_PATH = "/kaggle/input/jigsaw-agile-community-rules"

# Query-side instruction prompt passed to encode() for test comments.
# https://huggingface.co/Qwen/Qwen3-Embedding-0.6B/blob/main/config_sentence_transformers.json
EMBEDDING_MODEL_QUERY = "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery:"

CLEAN_TEXT = True  # Run prompts through cleaner() in prepare_dataframe
TOP_K = 3000  # Number of nearest corpus entries retrieved per test comment
BATCH_SIZE = 128  # Batch size used when encoding prompts

In [None]:
%%writefile utils.py
import pandas as pd
import torch.distributed as dist

from datasets import Dataset
from cleantext import clean
from tqdm.auto import tqdm

from constants import CLEAN_TEXT


def build_prompt(row):
    """Return the text that is embedded for a row: its subreddit and comment body."""
    subreddit, body = row["subreddit"], row["body"]
    return f"r/{subreddit}\nComment: {body}"


def cleaner(text):
    """Normalise ``text`` with ``cleantext.clean``.

    Unicode is fixed and transliterated to ASCII; URLs, e-mail addresses and
    phone numbers are replaced by placeholder tags. Case, line breaks,
    numbers, currency symbols and punctuation are left as they are.
    """
    options = dict(
        fix_unicode=True,
        to_ascii=True,
        lower=False,
        no_line_breaks=False,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=False,
        no_punct=False,
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        lang="en",
    )
    return clean(text, **options)



def get_dataframe_to_train(data_path):
    """Build the labelled retrieval corpus from ``train.csv``.

    The corpus holds the labelled training comments plus every example
    comment (positive examples labelled 1, negative examples labelled 0),
    with exact duplicate rows removed.

    Returns:
        DataFrame with columns ``body``, ``rule``, ``subreddit`` and
        ``rule_violation``.
    """
    train = pd.read_csv(f"{data_path}/train.csv")
    context_columns = ["rule", "subreddit"]

    # Labelled comments first, then the example columns in a fixed order.
    parts = [train[["body", *context_columns, "rule_violation"]]]
    for label, kind in ((1, "positive"), (0, "negative")):
        for idx in (1, 2):
            example_column = f"{kind}_example_{idx}"
            part = (
                train[[example_column, *context_columns]]
                .rename(columns={example_column: "body"})
                .assign(rule_violation=label)
            )
            parts.append(part)

    return pd.concat(parts, axis=0).drop_duplicates(ignore_index=True)


def prepare_dataframe(dataframe):
    """Add the ``prompt`` column and, when labels exist, recode them to +1 / -1.

    The dataframe is modified in place and also returned. When ``CLEAN_TEXT``
    is set, every prompt is passed through ``cleaner``. Labels 1 and 0 become
    1 and -1 respectively.
    """
    dataframe["prompt"] = dataframe.apply(build_prompt, axis=1)

    if CLEAN_TEXT:
        tqdm.pandas(desc="cleaner")
        cleaned_prompts = dataframe["prompt"].progress_apply(cleaner)
        dataframe["prompt"] = cleaned_prompts

    # Unlabelled data (e.g. the test set) needs no recoding.
    if "rule_violation" not in dataframe.columns:
        return dataframe

    signed_label = {1: 1, 0: -1}
    dataframe["rule_violation"] = dataframe["rule_violation"].map(signed_label)
    return dataframe

In [None]:
%%writefile semantic.py
import pandas as pd
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import semantic_search, dot_score
from tqdm.auto import tqdm
from peft import PeftModel, PeftConfig
import sys
import os
import traceback


from utils import get_dataframe_to_train, prepare_dataframe
from constants import DATA_PATH, EMBDEDDING_MODEL_PATH, EMBEDDING_MODEL_QUERY, TOP_K, BATCH_SIZE, MODEL_OUTPUT_PATH



def get_scores(test_dataframe):
    """Score test comments by similarity to labelled corpus comments.

    For each rule present in ``test_dataframe``, the test prompts and the
    corpus prompts for that rule are embedded, the ``TOP_K`` most similar
    corpus entries are retrieved per test prompt, and the score is the sum of
    similarity x corpus label (labels are +1 / -1 after ``prepare_dataframe``).

    Args:
        test_dataframe: Prepared test rows with ``rule``, ``prompt`` and
            ``row_id`` columns.

    Returns:
        DataFrame with ``row_id`` and ``rule_violation`` (the summed score).
        Rows whose rule has no corpus entries are skipped with a warning and
        therefore absent from the result.

    Raises:
        FileNotFoundError: if a model directory is missing.
        ValueError: if no rule produced any result.
    """
    try:
        # Labelled corpus built from train.csv (comments plus examples).
        corpus_dataframe = get_dataframe_to_train(DATA_PATH)
        corpus_dataframe = prepare_dataframe(corpus_dataframe)

        # Validate model paths
        if not os.path.exists(EMBDEDDING_MODEL_PATH):
            raise FileNotFoundError(f"Embedding model not found at {EMBDEDDING_MODEL_PATH}")
        if not os.path.exists(MODEL_OUTPUT_PATH):
            raise FileNotFoundError(f"Model output not found at {MODEL_OUTPUT_PATH}")

        # Load base model
        model = AutoModelForCausalLM.from_pretrained(EMBDEDDING_MODEL_PATH)
        tokenizer = AutoTokenizer.from_pretrained(EMBDEDDING_MODEL_PATH)

        # Load adapter configuration and model, then fold the adapter weights
        # into the base model so it can be saved as a plain checkpoint.
        adapter_config = PeftConfig.from_pretrained(MODEL_OUTPUT_PATH)
        lora_model = PeftModel.from_pretrained(model, MODEL_OUTPUT_PATH, config=adapter_config)
        merged_model = lora_model.merge_and_unload()

        # Save merged model
        output_dir = "Qwen3Emb_Finetuned"
        os.makedirs(output_dir, exist_ok=True)
        tokenizer.save_pretrained(output_dir)
        merged_model.save_pretrained(output_dir)

        # Create SentenceTransformer from merged encoder
        embedding_model = SentenceTransformer(model_name_or_path=output_dir, device="cuda")

        print('✅ Done loading model!')

        result = []
        for rule in tqdm(test_dataframe["rule"].unique(), desc=f"Generate scores for each rule"):
            # Restrict both sides to the current rule.
            test_dataframe_part = test_dataframe.query("rule == @rule").reset_index(drop=True)
            corpus_dataframe_part = corpus_dataframe.query("rule == @rule").reset_index(drop=True)
            # Expose the positional index as "row_id" so it can be joined on
            # below. This is the corpus position, not the test-set row_id.
            corpus_dataframe_part = corpus_dataframe_part.reset_index(names="row_id")

            if len(test_dataframe_part) == 0 or len(corpus_dataframe_part) == 0:
                print(f"Warning: Empty dataframe for rule: {rule}", file=sys.stderr)
                continue

            # Test prompts are encoded with the query instruction prefix;
            # corpus prompts are encoded without one.
            query_embeddings = embedding_model.encode(
                sentences=test_dataframe_part["prompt"].tolist(),
                prompt=EMBEDDING_MODEL_QUERY,
                batch_size=BATCH_SIZE,
                show_progress_bar=True,
                convert_to_tensor=True,
                device="cuda",
                normalize_embeddings=True,
            )
            document_embeddings = embedding_model.encode(
                sentences=corpus_dataframe_part["prompt"].tolist(),
                batch_size=BATCH_SIZE,
                show_progress_bar=True,
                convert_to_tensor=True,
                device="cuda",
                normalize_embeddings=True,
            )
            # One list of hits per test prompt; embeddings are normalised, so
            # the dot product is the cosine similarity.
            test_dataframe_part["semantic"] = semantic_search(
                query_embeddings,
                document_embeddings,
                top_k=TOP_K,
                score_function=dot_score,
            )
            def get_score(semantic):
                # presumably each hit's corpus_id is the position of the
                # document in document_embeddings, i.e. the corpus "row_id"
                # assigned above - verify against sentence-transformers docs
                semantic = pd.DataFrame(semantic)
                semantic = semantic.merge(
                    corpus_dataframe_part[["row_id", "rule_violation"]],
                    how="left",
                    left_on="corpus_id",
                    right_on="row_id",
                )
                # Weighted scoring: use similarity scores as weights
                semantic["weighted_score"] = semantic["score"] * semantic["rule_violation"]
                return semantic["weighted_score"].sum()

            tqdm.pandas(desc=f"Add label for {rule=}")
            test_dataframe_part["rule_violation"] = test_dataframe_part["semantic"].progress_apply(get_score)
            result.append(test_dataframe_part[["row_id", "rule_violation"]].copy())

        if len(result) == 0:
            raise ValueError("No results generated")

        submission = pd.concat(result, axis=0)
        return submission
    except Exception as e:
        print(f"Error in get_scores: {e}", file=sys.stderr)
        traceback.print_exc()
        raise


def generate_submission():
    """Score ``test.csv`` with the embedding model and write ``submission_qwen3.csv``.

    Scores are left-joined back onto the test ``row_id`` column so the output
    follows the test-set row order. On any failure the traceback is printed
    and the process exits with status 1.
    """
    try:
        test_csv_path = f"{DATA_PATH}/test.csv"
        if not os.path.exists(test_csv_path):
            raise FileNotFoundError(f"Test data not found at {test_csv_path}")

        prepared = prepare_dataframe(pd.read_csv(test_csv_path))
        scores = get_scores(prepared)

        # Left join keeps every test row, in test order.
        row_ids = prepared[["row_id"]]
        submission = row_ids.merge(scores, on="row_id", how="left")

        output_path = "submission_qwen3.csv"
        submission.to_csv(output_path, index=False)
        print(f"✅ Saved {output_path}")

    except Exception as e:
        print(f"❌ Submission generation failed: {e}", file=sys.stderr)
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    generate_submission()


In [None]:
!python semantic.py

In [None]:
import pandas as pd
import numpy as np
import sys
import os

# Blend the three model submissions into the final submission.
# Each model's scores are converted to ranks in (0, 1), aligned on row_id,
# and combined with fixed weights.
try:
    # Validate input files exist
    files = {
        'submission_qwen.csv': 'submission_qwen.csv',
        'submission_qwen3.csv': 'submission_qwen3.csv',
        'submission_qwen14b.csv': 'submission_qwen14b.csv'
    }

    for name, path in files.items():
        if not os.path.exists(path):
            raise FileNotFoundError(f"Required file not found: {path}")

    q = pd.read_csv('submission_qwen.csv')
    l = pd.read_csv('submission_qwen3.csv')
    m = pd.read_csv('submission_qwen14b.csv')

    # Validate required columns and that row_id identifies rows uniquely
    for df, name in [(q, 'q'), (l, 'l'), (m, 'm')]:
        if 'rule_violation' not in df.columns:
            raise ValueError(f"Missing rule_violation column in {name}")
        if 'row_id' not in df.columns:
            raise ValueError(f"Missing row_id column in {name}")
        if df['row_id'].duplicated().any():
            raise ValueError(f"Duplicate row_id values in {name}")

    # Safe rank calculation with division by zero protection
    def safe_rank(series):
        n = len(series)
        if n == 0:
            raise ValueError("Empty series for ranking")
        return series.rank(method='average') / (n + 1)

    def aligned_rank(df, name):
        """Rank one submission and return the ranks in q's row_id order."""
        ranks = safe_rank(df.set_index('row_id')['rule_violation'])
        missing = ~q['row_id'].isin(ranks.index)
        if missing.any():
            raise ValueError(f"{int(missing.sum())} row_id values are missing from {name}")
        # Align on row_id rather than relying on identical row order.
        return ranks.reindex(q['row_id']).to_numpy()

    rq = aligned_rank(q, 'q')
    rl = aligned_rank(l, 'l')
    rm = aligned_rank(m, 'm')

    # Ensemble weights: 0.5b model with TTA 45%, embedding retrieval 25%,
    # 14b model 30%.
    blend = 0.45*rq + 0.25*rl + 0.30*rm
    if np.isnan(blend).any():
        # A NaN score in any input (e.g. a rule with no corpus entries)
        # propagates into the blend; surface it instead of hiding it.
        print(f"Warning: {int(np.isnan(blend).sum())} blended scores are NaN", file=sys.stderr)
    q['rule_violation'] = blend

    # Ensure output directory exists
    os.makedirs('/kaggle/working', exist_ok=True)

    output_path = '/kaggle/working/submission.csv'
    q.to_csv(output_path, index=False)
    print(f"✅ Final submission saved to {output_path}")
    print(f"Ensemble weights: 0.5b-TTA=45%, embeddings=25%, 14b=30%")

except Exception as e:
    print(f"❌ Blending failed: {e}", file=sys.stderr)
    sys.exit(1)

In [None]:
# Preview the final submission file.
# os and sys are imported here so the cell does not depend on names left
# behind by earlier cells.
import os
import sys

import pandas as pd

try:
    submission_path = '/kaggle/working/submission.csv'
    if not os.path.exists(submission_path):
        raise FileNotFoundError(f"Submission file not found at {submission_path}")

    df = pd.read_csv(submission_path)
    print("✅ Final submission preview:")
    print(df)
except Exception as e:
    print(f"❌ Failed to read submission: {e}", file=sys.stderr)