In [16]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jigsaw/submission.csv
/kaggle/input/jigsaw/train.csv
/kaggle/input/jigsaw/test.csv
/kaggle/input/wikipedia-toxic-comments/sample_submission.csv
/kaggle/input/wikipedia-toxic-comments/test_labels.csv
/kaggle/input/wikipedia-toxic-comments/train.csv
/kaggle/input/wikipedia-toxic-comments/test.csv
/kaggle/input/all-minilm-l6-v2/transformers/default/1/all-MiniLM-L6-v2/rust_model.ot
/kaggle/input/all-minilm-l6-v2/transformers/default/1/all-MiniLM-L6-v2/config.json
/kaggle/input/all-minilm-l6-v2/transformers/default/1/all-MiniLM-L6-v2/README.md
/kaggle/input/all-minilm-l6-v2/transformers/default/1/all-MiniLM-L6-v2/tokenizer.json
/kaggle/input/all-minilm-l6-v2/transformers/default/1/all-MiniLM-L6-v2/tf_model.h5
/kaggle/input/all-minilm-l6-v2/transformers/default/1/all-MiniLM-L6-v2/data_config.json
/kaggle/input/all-minilm-l6-v2/transformers/default/1/all-MiniLM-L6-v2/train_script.py
/kaggle/input/all-minilm-l6-v2/transformers/default/1/all-MiniLM-L6-v2/tokenizer_config.json
/kag

In [17]:
import os
import re
import gc
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import torch
import lightgbm as lgb
from pathlib import Path
from typing import Dict, List, Tuple, Set

from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.utils import check_random_state

In [18]:
# --- Paths & Models ---
TRAIN_PATH = "/kaggle/input/jigsaw-agile-community-rules/train.csv"
TEST_PATH  = "/kaggle/input/jigsaw-agile-community-rules/test.csv"

AUG_DATA_PATH = "/kaggle/input/wikipediacomments/train.csv"

EMBED_MODEL_PATHS = {
    'bge-m3': "/kaggle/input/bge-m3/transformers/m3/1/bge-m3",
    'all-mpnet-base-v2': "/kaggle/input/all-mpnet-base-v2/transformers/all-mpnet-base-v2/1",
    "all-minilm-l6-v2": "/kaggle/input/all-minilm-l6-v2/transformers/default/1/all-MiniLM-L6-v2",
    'qwen-3': "/kaggle/input/qwen-3-embedding/transformers/0.6b/1",
}

# --- Runtime Parameters ---
BATCH_SIZE = 32
N_SPLITS = 5
WEIGHT_POWER = 2.0
OUT_ENSEMBLE_CSV = "/kaggle/working/submission.csv"

Utilidades e formatacao do dataset de data augmentation

In [19]:
def load_and_format_toxic_data(path: str) -> pd.DataFrame:
    """
    Loads and formats the Wikipedia Toxic Comments dataset for augmentation.
    """
    print(f"Loading and formatting augmentation data from: {Path(path).name}")
    df_toxic = pd.read_csv(path)
    
    # Define the columns that represent different types of toxicity
    label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    # Create a single 'rule_violation' label: 1 if any toxic flag is set, 0 otherwise
    df_toxic['rule_violation'] = df_toxic[label_cols].max(axis=1)

    # Assign the specific rule this data augments
    df_toxic['rule'] = "No toxic or hostile behavior."

    # Rename the 'comment_text' column to 'body'
    df_toxic = df_toxic.rename(columns={'comment_text': 'body'})
    
    # Keep only the necessary columns
    df_aug_toxic = df_toxic[['body', 'rule', 'rule_violation']]
    print(f"Formatted {len(df_aug_toxic)} rows for augmentation.")
    return df_aug_toxic

def normalize_text(s: str) -> str:
    s = str(s).strip()
    s = re.sub(r"\s+", " ", s)
    return s

def normalize_lc(s: str) -> str:
    return normalize_text(s).lower()

def auto_majority_vote_within_rule(df: pd.DataFrame) -> pd.DataFrame:
    tmp = df.copy()
    tmp["norm_body"] = tmp["body"].map(normalize_lc)
    g = tmp.groupby(["rule", "norm_body"])["rule_violation"]
    stats = g.agg(["sum", "count"]).rename(columns={"sum": "pos", "count": "n"})
    stats["neg"] = stats["n"] - stats["pos"]
    stats["maj_label"] = (stats["pos"] >= stats["neg"]).astype(int)
    tmp = tmp.merge(stats["maj_label"], left_on=["rule", "norm_body"], right_index=True, how="left")
    tmp["rule_violation"] = tmp["maj_label"]
    tmp = (tmp.sort_values(["rule", "norm_body"])
              .drop_duplicates(["rule", "norm_body"], keep="first")
              .drop(columns=["maj_label"]))
    return tmp.drop(columns=["norm_body"])

def get_example_cols(df: pd.DataFrame) -> Tuple[List[str], List[str]]:
    pos_cols = [c for c in df.columns if c.startswith("positive_example")]
    neg_cols = [c for c in df.columns if c.startswith("negative_example")]
    return pos_cols, neg_cols

def build_rule_example_pools(df: pd.DataFrame) -> Dict[str, Dict[str, Set[str]]]:
    pos_cols, neg_cols = get_example_cols(df)
    pools: Dict[str, Dict[str, Set[str]]] = {}
    for rule, g in df.groupby("rule"):
        pos_set, neg_set = set(), set()
        for c in pos_cols:
            if c in g:
                pos_set.update(g[c].dropna().astype(str).map(str.strip).tolist())
        for c in neg_cols:
            if c in g:
                neg_set.update(g[c].dropna().astype(str).map(str.strip).tolist())
        pools[rule] = {"pos": {t for t in pos_set if t}, "neg": {t for t in neg_set if t}}
    return pools

In [20]:
def build_text_index(train_df: pd.DataFrame, test_df: pd.DataFrame) -> List[str]:
    texts: List[str] = []
    texts.extend(train_df["body"].fillna("").tolist())
    texts.extend(train_df["rule"].fillna("").tolist())
    texts.extend(test_df["body"].fillna("").tolist())
    texts.extend(test_df["rule"].fillna("").tolist())
    
    for df in [train_df, test_df]:
        pos_cols, neg_cols = get_example_cols(df)
        for c in pos_cols + neg_cols:
            if c in df:
                texts.extend(df[c].dropna().astype(str).map(str.strip).tolist())
    
    seen, uniq = set(), []
    for t in texts:
        t = str(t)
        if t and t not in seen:
            seen.add(t)
            uniq.append(t)
    return uniq

def embed_all(texts: List[str], model: SentenceTransformer, batch_size: int):
    embs = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        normalize_embeddings=True
    )
    return {t: e.astype(np.float32) for t, e in zip(texts, embs)}

def _sim_stats(vec: np.ndarray, mats: List[np.ndarray]) -> List[float]:
    if vec is None or len(mats) == 0:
        return [0.0, 0.0, 0.0]
    sims = cosine_similarity(vec[None, :], np.vstack(mats))[0]
    return [float(sims.max()), float(sims.mean()), float(sims.min())]

def row_features_single_model(
    row: pd.Series,
    emb_cache: Dict[str, np.ndarray],
    rule_pools: Dict[str, Dict[str, Set[str]]]) -> np.ndarray:
    
    body = emb_cache.get(str(row["body"]))
    rule = emb_cache.get(str(row["rule"]))
    
    rule_key = str(row["rule"])
    pos_txts = rule_pools.get(rule_key, {}).get("pos", set())
    neg_txts = rule_pools.get(rule_key, {}).get("neg", set())

    pos_embs = [emb_cache[t] for t in pos_txts if t in emb_cache]
    neg_embs = [emb_cache[t] for t in neg_txts if t in emb_cache]

    feats: List[float] = []
    feats.append(float(np.dot(body, rule)) if (body is not None and rule is not None) else 0.0)
    
    pmax, pmean, pmin = _sim_stats(body, pos_embs)
    nmax, nmean, nmin = _sim_stats(body, neg_embs)
    
    feats.extend([pmax, pmean, pmin, nmax, nmean, nmin, pmax - nmax, pmean - nmean])
    return np.array(feats, dtype=np.float32)

def build_feature_matrix(
    df: pd.DataFrame,
    emb_caches: Dict[str, Dict[str, np.ndarray]],
    rule_pools: Dict[str, Dict[str, Set[str]]],
    model_names: List[str]) -> np.ndarray:
    
    rows = []
    for i in range(len(df)):
        row_series = df.iloc[i]
        per_model_feats = [
            row_features_single_model(row_series, emb_caches[mn], rule_pools)
            for mn in model_names
        ]
        rows.append(np.hstack(per_model_feats))
    return np.vstack(rows).astype(np.float32)

## Funcao de definicao dos modelos

In [21]:
def make_estimator(kind: str, seed: int):
    """Factory to create different ML models."""
    
    if kind == "lr":
        # Logistic Regression: A robust and fast linear model.
        print("  -> Creating LogisticRegression estimator")
        return LogisticRegression(
            max_iter=1000,
            solver="lbfgs",
            class_weight="balanced",
            C=1.0,
            n_jobs=-1,
            random_state=seed,
        )
    
    if kind == "ridge":
        # Ridge Classifier: A linear model with L2 regularization, good for high-dimensional data.
        # We wrap it with CalibratedClassifierCV to get probability outputs.
        print("  -> Creating RidgeClassifier estimator")
        base = RidgeClassifier(class_weight="balanced", random_state=seed)
        calibrated = CalibratedClassifierCV(base, method="sigmoid", cv=3)
        return make_pipeline(StandardScaler(), calibrated)
        
    if kind == "nb":
        # Gaussian Naive Bayes: Extremely fast, assumes features are normally distributed.
        # A simple but often effective baseline.
        print("  -> Creating GaussianNB estimator")
        return GaussianNB()

    if kind == "lgbm":
        # LightGBM: A fast, high-performance gradient boosting framework.
        print("  -> Creating LGBMClassifier estimator")
        use_gpu = torch.cuda.is_available()
        params = dict(
            objective="binary",
            metric="auc",
            n_estimators=1000,
            learning_rate=0.05,
            max_depth=4,
            subsample=0.9,
            colsample_bytree=0.9,
            random_state=seed,
            n_jobs=-1,
            device="gpu" if use_gpu else "cpu",
        )
        return lgb.LGBMClassifier(**params)

    raise ValueError(f"Unknown estimator kind: {kind}")

def make_balanced_weights(y):
    counts = np.bincount(y)
    n = len(y)
    w = np.ones_like(y, dtype=np.float64)
    for c in (0, 1):
        if c < len(counts) and counts[c] > 0:
            w[y == c] = n / (2.0 * counts[c])
    return w

In [22]:
def run_config(config_dict: dict, random_state: int):
    """
    Runs a single configuration from data loading to model training.
    """
    name = config_dict["name"]
    embed_keys = config_dict["embed_model_keys"]
    learners = config_dict["learners"]
    
    print(f"\n{'='*25}\nConfig: {name}\nEmbeddings: {embed_keys}\nLearners: {learners}\n{'='*25}\n")
    
    # --- Data Loading & Augmentation ---
    df_train_raw = pd.read_csv(TRAIN_PATH)
    df_test = pd.read_csv(TEST_PATH)
    
    # ** INTEGRATED AUGMENTATION STEP **
    if Path(AUG_DATA_PATH).exists():
        df_aug_toxic = load_and_format_toxic_data(AUG_DATA_PATH)
        df_train_raw = pd.concat([df_train_raw, df_aug_toxic], ignore_index=True)
        print(f"Training data augmented. New shape: {df_train_raw.shape}")

    # Clean and de-duplicate the combined training data
    df_train = auto_majority_vote_within_rule(df_train_raw)
    
    train_pools = build_rule_example_pools(df_train)
    test_pools = build_rule_example_pools(df_test)
    
    pred_pools = {k: v.copy() for k, v in train_pools.items()}
    for rule, d in test_pools.items():
        pred_pools.setdefault(rule, {"pos": set(), "neg": set()})
        pred_pools[rule]["pos"].update(d.get("pos", set()))
        pred_pools[rule]["neg"].update(d.get("neg", set()))

    # --- Embedding ---
    uniq_texts = build_text_index(df_train, df_test)
    emb_caches = {}
    for key in embed_keys:
        model_path = EMBED_MODEL_PATHS[key]
        print(f"\nLoading embedding model: {key}")
        model = SentenceTransformer(model_path, trust_remote_code=True)
        emb_caches[key] = embed_all(uniq_texts, model, batch_size=BATCH_SIZE)
        del model
        torch.cuda.empty_cache()
        gc.collect()

    # --- Feature Matrix Building ---
    print("\nBuilding feature matrices...")
    X_train = build_feature_matrix(df_train, emb_caches, train_pools, embed_keys)
    y_train = df_train["rule_violation"].astype(int).values
    X_test = build_feature_matrix(df_test, emb_caches, pred_pools, embed_keys)
    print(f"Feature matrix shape: {X_train.shape}")
    
    del emb_caches, uniq_texts
    gc.collect()

    # --- CV and Prediction Loop ---
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=random_state)
    members = []

    for learner_kind in learners:
        print(f"\n--- Training Learner: {learner_kind.upper()} ---")
        oof_preds = np.zeros(len(X_train), dtype=float)
        test_preds_folds = []
        
        for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
            X_tr, y_tr = X_train[train_idx], y_train[train_idx]
            X_val, y_val = X_train[val_idx], y_train[val_idx]

            estimator = make_estimator(learner_kind, random_state)
            fit_params = {}
            if learner_kind == "lgbm":
                fit_params['eval_set'] = [(X_val, y_val)]
                fit_params['callbacks'] = [lgb.early_stopping(50, verbose=False)]
                
            if learner_kind == "ridge":
                 fit_params[f'{estimator.steps[-1][0]}__sample_weight'] = make_balanced_weights(y_tr)
                    
            estimator.fit(X_tr, y_tr, **fit_params)
            
            fold_preds = estimator.predict_proba(X_val)[:, 1]
            oof_preds[val_idx] = fold_preds
            test_preds_folds.append(estimator.predict_proba(X_test)[:, 1])
            print(f"  Fold {fold} AUC: {roc_auc_score(y_val, fold_preds):.5f}")

        oof_auc = roc_auc_score(y_train, oof_preds)
        print(f">> {name} [{learner_kind}] | Overall OOF AUC: {oof_auc:.5f}")
        
        # Refit on full data
        estimator_full = make_estimator(learner_kind, random_state)
        fit_params_full = {}
        if learner_kind == "ridge":
            fit_params_full[f'{estimator_full.steps[-1][0]}__sample_weight'] = make_balanced_weights(y_train)

        estimator_full.fit(X_train, y_train, **fit_params_full)
        test_preds_full = estimator_full.predict_proba(X_test)[:, 1]

        members.append({
            "name": f"{name}_{learner_kind}",
            "oof_auc": oof_auc,
            "test_preds": test_preds_full,
        })
    
    return members

### Tabela de Engenharia de Features

| Característica      | Descrição                                                                                                                                                    |
| :------------------ | :----------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `sim_body_rule`     | A similaridade de cosseno entre o embedding do comentário (`body`) e o da regra (`rule`). Mede o quão semanticamente próximo o comentário está do texto da regra. |
| `pmax`              | A **máxima** similaridade de cosseno entre o comentário e todos os **exemplos positivos** (violações conhecidas) associados à regra.                               |
| `pmean`             | A **média** da similaridade de cosseno entre o comentário e todos os **exemplos positivos** da regra.                                                            |
| `pmin`              | A **mínima** similaridade de cosseno entre o comentário e todos os **exemplos positivos** da regra.                                                            |
| `nmax`              | A **máxima** similaridade de cosseno entre o comentário e todos os **exemplos negativos** (não violações conhecidas) da regra.                             |
| `nmean`             | A **média** da similaridade de cosseno entre o comentário e todos os **exemplos negativos** da regra.                                                            |
| `nmin`              | A **mínima** similaridade de cosseno entre o comentário e todos os **exemplos negativos** da regra.                                                            |
| `pmax - nmax`       | A **diferença** entre a similaridade máxima com exemplos positivos e a máxima com exemplos negativos.      |
| `pmean - nmean`     | A **diferença** entre a similaridade média com exemplos positivos e a média com exemplos negativos.     |


In [23]:
CONFIGS = [
    {
        "name": "config_A",
        "embed_model_keys": ["bge-m3", "all-mpnet-base-v2", 'qwen-3'],
        "learners": ["lr", "lgbm"],
    },
    {
        "name": "config_B",
        "embed_model_keys": ["all-minilm-l6-v2"],
        "learners": ["nb", "ridge"],
    },
]

all_members = []
for i, cfg in enumerate(CONFIGS):
    all_members.extend(run_config(cfg, random_state=42 + i))

print("\n=== Final Ensemble Blending ===")
aucs = np.array([m["oof_auc"] for m in all_members])
weights = np.maximum(aucs, 1e-6) ** WEIGHT_POWER
weights /= weights.sum()

for i, m in enumerate(all_members):
    print(f"Member '{m['name']}': OOF AUC={m['oof_auc']:.5f}, Weight={weights[i]:.4f}")

test_stack = np.vstack([m["test_preds"] for m in all_members])
y_pred_ensemble = np.average(test_stack, axis=0, weights=weights)

# --- Create Submission File ---
df_sub = pd.read_csv(TEST_PATH)[["row_id"]]
df_sub["rule_violation"] = y_pred_ensemble
df_sub.to_csv(OUT_ENSEMBLE_CSV, index=False)

print(f"\n Ensemble submission written to: {OUT_ENSEMBLE_CSV}")
print("Submission Head:")
print(df_sub.head())


Config: config_A
Embeddings: ['bge-m3', 'all-mpnet-base-v2', 'qwen-3']
Learners: ['lr', 'lgbm']


Loading embedding model: bge-m3


Batches:   0%|          | 0/65 [00:00<?, ?it/s]


Loading embedding model: all-mpnet-base-v2


Batches:   0%|          | 0/65 [00:00<?, ?it/s]


Loading embedding model: qwen-3


Batches:   0%|          | 0/65 [00:00<?, ?it/s]


Building feature matrices...
Feature matrix shape: (1873, 27)

--- Training Learner: LR ---
  -> Creating LogisticRegression estimator
  Fold 1 AUC: 0.93923
  -> Creating LogisticRegression estimator
  Fold 2 AUC: 0.91556
  -> Creating LogisticRegression estimator
  Fold 3 AUC: 0.93202
  -> Creating LogisticRegression estimator
  Fold 4 AUC: 0.92351
  -> Creating LogisticRegression estimator
  Fold 5 AUC: 0.91927
>> config_A [lr] | Overall OOF AUC: 0.92499
  -> Creating LogisticRegression estimator

--- Training Learner: LGBM ---
  -> Creating LGBMClassifier estimator
  Fold 1 AUC: 0.94594
  -> Creating LGBMClassifier estimator
  Fold 2 AUC: 0.92922
  -> Creating LGBMClassifier estimator
  Fold 3 AUC: 0.93900
  -> Creating LGBMClassifier estimator
  Fold 4 AUC: 0.93879
  -> Creating LGBMClassifier estimator
  Fold 5 AUC: 0.92788
>> config_A [lgbm] | Overall OOF AUC: 0.90955
  -> Creating LGBMClassifier estimator

Config: config_B
Embeddings: ['all-minilm-l6-v2']
Learners: ['nb', 'ridg

Batches:   0%|          | 0/65 [00:00<?, ?it/s]


Building feature matrices...
Feature matrix shape: (1873, 9)

--- Training Learner: NB ---
  -> Creating GaussianNB estimator
  Fold 1 AUC: 0.92732
  -> Creating GaussianNB estimator
  Fold 2 AUC: 0.92920
  -> Creating GaussianNB estimator
  Fold 3 AUC: 0.92912
  -> Creating GaussianNB estimator
  Fold 4 AUC: 0.90699
  -> Creating GaussianNB estimator
  Fold 5 AUC: 0.93445
>> config_B [nb] | Overall OOF AUC: 0.92517
  -> Creating GaussianNB estimator

--- Training Learner: RIDGE ---
  -> Creating RidgeClassifier estimator
  Fold 1 AUC: 0.92222
  -> Creating RidgeClassifier estimator
  Fold 2 AUC: 0.93348
  -> Creating RidgeClassifier estimator
  Fold 3 AUC: 0.92766
  -> Creating RidgeClassifier estimator
  Fold 4 AUC: 0.90782
  -> Creating RidgeClassifier estimator
  Fold 5 AUC: 0.93270
>> config_B [ridge] | Overall OOF AUC: 0.92424
  -> Creating RidgeClassifier estimator

=== Final Ensemble Blending ===
Member 'config_A_lr': OOF AUC=0.92499, Weight=0.2522
Member 'config_A_lgbm': OOF 