In [None]:
# Configuration Cell - Add this at the top of each notebook
import os
import sys
from pathlib import Path

# Detect environment
IS_KAGGLE = os.path.exists('/kaggle/input')
IS_COLAB = 'google.colab' in sys.modules

# Choose the input/working roots for the detected platform.
# Kaggle and Colab use fixed string paths; a local run uses Path objects
# rooted at the current working directory.
if not (IS_KAGGLE or IS_COLAB):
    INPUT_ROOT, WORK_DIR = Path.cwd() / "input", Path.cwd() / "working"
elif IS_KAGGLE:
    INPUT_ROOT, WORK_DIR = "/kaggle/input", "/kaggle/working"
else:
    INPUT_ROOT, WORK_DIR = "/content/input", "/content/working"

# Output locations beneath the working directory (always plain strings,
# because os.path.join returns str even for Path inputs)
OUT_DIR, EXPERIMENTS_DIR, SCRIPTS_DIR = (
    os.path.join(WORK_DIR, leaf) for leaf in ("data", "experiments", "scripts")
)

# Make sure every output location exists, creating missing parents as needed
for directory in (OUT_DIR, EXPERIMENTS_DIR, SCRIPTS_DIR):
    os.makedirs(directory, exist_ok=True)

# Report the resolved configuration, one setting per line
print(
    f"Environment: {'Kaggle' if IS_KAGGLE else 'Colab' if IS_COLAB else 'Local'}",
    f"Input directory: {INPUT_ROOT}",
    f"Working directory: {WORK_DIR}",
    f"Data directory: {OUT_DIR}",
    f"Experiments directory: {EXPERIMENTS_DIR}",
    sep="\n",
)

In [None]:
# HateXplain Dataset Preprocessing
import os, json
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

# Reuse the roots chosen by the configuration cell when it has already run
# (so Colab / local runs are not forced onto Kaggle paths); fall back to the
# Kaggle layout so this cell still works when executed on its own.
INPUT_ROOT = globals().get("INPUT_ROOT", "/kaggle/input")
WORK_DIR   = globals().get("WORK_DIR", "/kaggle/working")
OUT_DIR    = os.path.join(WORK_DIR, "data")
os.makedirs(OUT_DIR, exist_ok=True)

# Locate HateXplain dataset
if not os.path.isdir(INPUT_ROOT):
    raise FileNotFoundError(f"Input directory not found: {INPUT_ROOT}")

# os.listdir returns entries in arbitrary order, so sort to make the choice of
# HATEX_DIRS[0] reproducible. Only directories qualify because later cells
# list the contents of HATEX_DIR.
HATEX_DIRS = [
    os.path.join(INPUT_ROOT, d)
    for d in sorted(os.listdir(INPUT_ROOT))
    if ("hatexplain" in d.lower() or "hate_explain" in d.lower())
    and os.path.isdir(os.path.join(INPUT_ROOT, d))
]

# Fail here with a clear message instead of letting a later cell hit a
# NameError on the undefined HATEX_DIR.
if not HATEX_DIRS:
    raise FileNotFoundError(
        f"HateXplain dataset not found under {INPUT_ROOT}. "
        "Please add it to the notebook inputs."
    )

HATEX_DIR = HATEX_DIRS[0]
print(f"Found HateXplain at: {HATEX_DIR}")

In [None]:
# Load HateXplain data
# HateXplain typically comes as JSON or JSONL
import json

def load_hatexplain_json(filepath):
    """Load HateXplain records from a JSON or JSONL file.

    The file is first parsed as one JSON document. Only when that fails is it
    parsed as JSONL (one JSON value per non-blank line). Trying the whole
    document first matters for single-line JSON files: read line by line, such
    a file would come back as one bogus "record" holding the entire document.

    Args:
        filepath: Path to a UTF-8 encoded JSON or JSONL file.

    Returns:
        list: The records, shaped by what the file contains:
            * top-level list -> returned as is;
            * top-level dict whose values are all dicts -> treated as an
              ``{id: record}`` mapping and its values are returned;
            * any other single JSON value -> wrapped in a one-element list
              (a one-line JSONL file holding a single record);
            * JSONL -> one entry per non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is neither valid JSON nor JSONL.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        raw = f.read()

    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        # Not a single document. Split on "\n" only (text mode has already
        # normalised line endings); str.splitlines() would also break on
        # characters that may legally appear inside JSON strings.
        return [json.loads(line) for line in raw.split("\n") if line.strip()]

    if isinstance(data, list):
        return data
    if isinstance(data, dict) and all(isinstance(v, dict) for v in data.values()):
        # Sometimes HateXplain is stored as {id: record}
        return list(data.values())
    return [data]

# Find JSON/JSONL files
# os.listdir returns names in arbitrary order; sorting keeps the file chosen
# below stable across runs and machines. The extension match ignores case.
json_files = sorted(
    f for f in os.listdir(HATEX_DIR) if f.lower().endswith((".json", ".jsonl"))
)
if not json_files:
    raise FileNotFoundError(f"No JSON/JSONL files found in {HATEX_DIR}")

# NOTE(review): the alphabetically first file is used; if the folder holds
# several JSON files, confirm that this one is the actual dataset.
hatex_file = json_files[0]
if len(json_files) > 1:
    print(f"[WARN] Multiple JSON/JSONL files found {json_files}; using the first.")
print(f"Loading: {hatex_file}")

records = load_hatexplain_json(os.path.join(HATEX_DIR, hatex_file))
print(f"Loaded {len(records)} records")

In [None]:
# Parse HateXplain records
# Label strings that map to toxic (1); every other label maps to non-toxic (0).
TOXIC_LABELS = frozenset({"hatespeech", "offensive", "offensive_language", "hate"})


def _is_toxic(raw_label):
    """Return 1 if ``raw_label`` (compared case-insensitively) is toxic, else 0."""
    return int(str(raw_label).lower() in TOXIC_LABELS)


def _record_label(rec):
    """Derive the binary label (1 toxic / 0 non-toxic) for one record dict.

    Order of precedence:
      1. A non-None top-level "label" value.
      2. A strict-majority vote over the "label" of every dict found in a
         list stored under "annotators".
      3. 0, which matches the previous behaviour of defaulting to "normal".

    NOTE(review): step 2 assumes the raw HateXplain layout, where each post
    carries per-annotator labels instead of a single "label" -- confirm this
    against the file actually loaded. The vote is taken on the binarised
    labels, so a post whose annotators all disagree counts as toxic when most
    of them picked a toxic class.
    """
    if rec.get("label") is not None:
        return _is_toxic(rec["label"])

    annotators = rec.get("annotators")
    if isinstance(annotators, list):
        votes = [
            _is_toxic(a["label"])
            for a in annotators
            if isinstance(a, dict) and "label" in a
        ]
        if votes:
            return int(2 * sum(votes) > len(votes))

    return 0


rows = []

for idx, rec in enumerate(records):
    # Skip anything that is not a record mapping (e.g. a stray list or string)
    if not isinstance(rec, dict):
        continue

    # Extract text (either as string or from tokens)
    if isinstance(rec.get("text"), str):
        text = rec["text"]
    elif "post_tokens" in rec:
        text = " ".join(map(str, rec["post_tokens"]))
    else:
        continue

    rows.append({
        "id": rec.get("post_id", idx),
        "text": text,
        "label": _record_label(rec),
    })

# Explicit columns keep the frame well-formed even when no rows were parsed
df = pd.DataFrame(rows, columns=["id", "text", "label"])
print(f"Parsed {len(df)} records")
if df.empty:
    raise ValueError(
        "No usable records were parsed: none had a string 'text' or a 'post_tokens' field."
    )
print(f"Positive rate: {df['label'].mean():.3f}")

In [None]:
# Keep the first row for every distinct text, then renumber rows from 0
df = df.drop_duplicates(subset=["text"])
df = df.reset_index(drop=True)
print(f"After deduplication: {len(df)} records")

# Stratified 8/1/1 split: hold out 20% of the rows, then cut that hold-out
# in half to obtain the validation and test sets.
sss1 = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
sss2 = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)

# Each splitter yields exactly one (first, second) pair of positional indices
[(train_idx, temp_idx)] = sss1.split(df, df["label"])
temp = df.iloc[temp_idx]
[(val_rel_idx, test_rel_idx)] = sss2.split(temp, temp["label"])

# The second split indexes into `temp`; map those positions back onto `df`
val_idx, test_idx = temp_idx[val_rel_idx], temp_idx[test_rel_idx]

print(f"Train: {len(train_idx)}, Val: {len(val_idx)}, Test: {len(test_idx)}")

In [None]:
# Write one CSV per split, keeping only the text and label columns
csv_targets = {
    "hatexplain_train.csv": train_idx,
    "hatexplain_val.csv": val_idx,
    "hatexplain_test.csv": test_idx,
}
for csv_name, split_idx in csv_targets.items():
    split_frame = df.iloc[split_idx][["text", "label"]]
    split_frame.to_csv(os.path.join(OUT_DIR, csv_name), index=False)

print("CSVs exported.")

In [None]:
# Save protocols
# Record per-split row counts and positive-class rates next to the CSVs.
split_indices = {"train": train_idx, "val": val_idx, "test": test_idx}
splits = {
    "hatexplain": {
        "train_n": int(len(train_idx)),
        "val_n": int(len(val_idx)),
        "test_n": int(len(test_idx)),
        "pos_rate": {
            split_name: float(df.iloc[split_idx]["label"].mean())
            for split_name, split_idx in split_indices.items()
        },
    }
}
with open(os.path.join(OUT_DIR, "hatexplain_protocols.json"), "w", encoding="utf-8") as f:
    json.dump(splits, f, indent=2)

print("HateXplain preprocessing complete!")

# List the generated files in pure Python rather than through a shell command:
# an unquoted path interpolated into `!ls` breaks on directories containing
# spaces and is unavailable on Windows, both of which can occur on local runs.
for out_name in sorted(os.listdir(OUT_DIR)):
    if out_name.startswith("hatexplain"):
        size_kb = os.path.getsize(os.path.join(OUT_DIR, out_name)) / 1024
        print(f"{size_kb:10.1f} KB  {out_name}")