In [1]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


mv: cannot stat 'kaggle.json': No such file or directory


In [2]:
!pip install kaggle




In [3]:
!kaggle kernels output datam0nstr/toxic-comment-classification -p /content/toxic_output


added_tokens.json: Skipping, found more recently modified local copy (use --force to force download)
config.json: Skipping, found more recently modified local copy (use --force to force download)
model.safetensors: Skipping, found more recently modified local copy (use --force to force download)
special_tokens_map.json: Skipping, found more recently modified local copy (use --force to force download)
spm.model: Skipping, found more recently modified local copy (use --force to force download)
tokenizer.json: Skipping, found more recently modified local copy (use --force to force download)
tokenizer_config.json: Skipping, found more recently modified local copy (use --force to force download)
submission.csv: Skipping, found more recently modified local copy (use --force to force download)
Kernel log downloaded to /content/toxic_output/toxic-comment-classification.log 


In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Installing and Importing Libraries

In [5]:
# ====================================================
# 1. INSTALL AND UPGRADE LIBRARIES (IMPORTANT!)
# ====================================================
!pip install -q --upgrade \
    transformers \
    tokenizers \
    datasets \
    evaluate \
    accelerate \
    torchmetrics


In [6]:
import pandas as pd
import numpy as np
import torch
import re
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from tqdm.auto import tqdm
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, classification_report

# ====================================================
# CONFIGURATION
# ====================================================
# Path to your fine-tuned model directory (created by the training script)
SAVED_MODEL_PATH = "/content/toxic_output/best_model"

# Paths to test data (adjust if your environment is different)
TEST_CSV_PATH = "/content/test.csv.zip"
TEST_LABELS_CSV_PATH = "/content/test_labels.csv.zip"
SUBMISSION_PATH = "submission.csv"

# Model and training parameters (should match training)
MAX_LEN = 256
BATCH_SIZE = 32  # Can be larger for inference
LABELS = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# ====================================================
# DATA LOADING & PRE-PROCESSING
# ====================================================
# Load test data
test_df = pd.read_csv(TEST_CSV_PATH)
test_labels_df = pd.read_csv(TEST_LABELS_CSV_PATH)

# Use the exact same cleaning function from your training script
def clean_text(text):
    if not isinstance(text, str):
        return ""
    url_pattern = r"http\S+|www\.\S+"
    html_pattern = r"<.*?>"
    emoji_pattern = r"[\U00010000-\U0010ffff]"
    non_ascii_pattern = r"[^\x00-\x7F]+"
    multi_space_pattern = r"\s+"

    text = text.lower()
    text = re.sub(url_pattern, " ", text)
    text = re.sub(html_pattern, " ", text)
    text = re.sub(emoji_pattern, " ", text)
    text = re.sub(non_ascii_pattern, " ", text)
    text = re.sub(multi_space_pattern, " ", text).strip()
    return text

# Apply cleaning
test_df["comment"] = test_df["comment_text"].apply(clean_text)


# ====================================================
# LOAD MODEL & TOKENIZER
# ====================================================
print(f"Loading model and tokenizer from: {SAVED_MODEL_PATH}")
model = AutoModelForSequenceClassification.from_pretrained(SAVED_MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(SAVED_MODEL_PATH)

# Move model to the appropriate device
model.to(device)
model.eval()


# ====================================================
# DATASET & DATALOADER
# ====================================================
class ToxicDataset(Dataset):
    def __init__(self, df, tokenizer, max_len, is_test=False):
        self.texts = df["comment"].tolist()
        self.is_test = is_test
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        item = {k: v.squeeze(0) for k, v in encoding.items()}
        return item

# Create dataset and dataloader
test_dataset = ToxicDataset(test_df, tokenizer, max_len=MAX_LEN, is_test=True)
data_collator = DataCollatorWithPadding(tokenizer)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=data_collator,
    num_workers=2,
    pin_memory=True
)


# ====================================================
# INFERENCE LOOP
# ====================================================
all_preds = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Running Predictions"):
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        probabilities = torch.sigmoid(logits)
        all_preds.append(probabilities.cpu().numpy())

# Concatenate all predictions into a single numpy array
test_preds = np.concatenate(all_preds, axis=0)


# ====================================================
# CREATE SUBMISSION FILE
# ====================================================
submission_df = pd.DataFrame(test_preds, columns=LABELS)
submission_df.insert(0, 'id', test_df['id'])
submission_df.to_csv(SUBMISSION_PATH, index=False)
print(f"\n✅ '{SUBMISSION_PATH}' has been created successfully!")
print("Submission file head:")
print(submission_df.head())


# ====================================================
# COMPREHENSIVE EVALUATION
# ====================================================
# Merge predictions with the true labels
eval_df = pd.merge(submission_df, test_labels_df, on='id')

# Filter out rows that are not used for scoring (where true labels are -1)
score_df = eval_df[eval_df['toxic_y'] != -1].copy()

print(f"\nEvaluating on {len(score_df)} scored test samples...")

# --- 1. AUC Score (Competition Metric) ---
print(f"\n{'='*60}")
print("1. ROC AUC Score (Primary Competition Metric)")
print(f"{'='*60}")

test_aucs = []
print("Per-class AUC on the test set:")
for label in LABELS:
    true_labels = score_df[label + '_y']
    pred_probs = score_df[label + '_x']
    auc_score = roc_auc_score(true_labels, pred_probs)
    test_aucs.append(auc_score)
    print(f"  {label:15s}: {auc_score:.4f}")

macro_auc = np.mean(test_aucs)
print(f"\n----------------------------------------")
print(f"✅ Final Test Macro AUC: {macro_auc:.4f}")
print(f"----------------------------------------")


# --- 2. F1, Precision, Accuracy (at 0.5 threshold) ---
print(f"\n{'='*60}")
print("2. F1, Precision, Recall, and Accuracy at threshold=0.5")
print(f"{'='*60}")

# Define the threshold
THRESHOLD = 0.5

# Extract true labels and predicted probabilities
true_labels_all = score_df[[label + '_y' for label in LABELS]].values
pred_probs_all = score_df[[label + '_x' for label in LABELS]].values

# Convert probabilities to binary predictions
pred_binary_all = (pred_probs_all >= THRESHOLD).astype(int)

# Exact Match Ratio (Subset Accuracy) - very strict
exact_match_ratio = accuracy_score(true_labels_all, pred_binary_all)
print(f"Exact Match Ratio (Subset Accuracy): {exact_match_ratio:.4f}\n")

# Micro-averaged metrics (good for overall performance)
micro_f1 = f1_score(true_labels_all, pred_binary_all, average='micro', zero_division=0)
print(f"Micro-Averaged F1-Score:  {micro_f1:.4f}")

# Per-Class Report (most detailed view)
print("\n--- Per-Class Classification Report ---")
report = classification_report(true_labels_all, pred_binary_all, target_names=LABELS, zero_division=0)
print(report)

Using device: cuda
Loading model and tokenizer from: /content/toxic_output/best_model


Running Predictions:   0%|          | 0/4787 [00:00<?, ?it/s]


✅ 'submission.csv' has been created successfully!
Submission file head:
                 id     toxic  severe_toxic   obscene        threat    insult  \
0  00001cee341fdb12  0.996740      0.531942  0.966470  3.095033e-02  0.953445   
1  0000247867823ef7  0.000306      0.000003  0.000040  1.056949e-06  0.000020   
2  00013b17ad220c46  0.000289      0.000003  0.000037  1.291673e-06  0.000019   
3  00017563c3f7919a  0.000256      0.000004  0.000042  1.541029e-06  0.000020   
4  00017695ad8997eb  0.000438      0.000002  0.000039  9.042737e-07  0.000020   

   identity_hate  
0       0.620848  
1       0.000013  
2       0.000015  
3       0.000018  
4       0.000011  

Evaluating on 63978 scored test samples...

1. ROC AUC Score (Primary Competition Metric)
Per-class AUC on the test set:
  toxic          : 0.9702
  severe_toxic   : 0.9897
  obscene        : 0.9808
  threat         : 0.9927
  insult         : 0.9762
  identity_hate  : 0.9866

----------------------------------------
✅ Fina