In [None]:
# Configuration Cell - Add this at the top of each notebook
import os
import sys
from pathlib import Path

# Detect environment:
#   - Kaggle is recognised by the presence of the /kaggle/input directory.
#   - Colab is recognised by the google.colab package already being loaded.
IS_KAGGLE = os.path.exists('/kaggle/input')
IS_COLAB = 'google.colab' in sys.modules

# Set base directories based on environment.
# All paths are kept as plain strings in every branch so downstream cells see the
# same type regardless of where the notebook runs.
if IS_KAGGLE:
    ENV_NAME = "Kaggle"
    INPUT_ROOT = "/kaggle/input"
    WORK_DIR = "/kaggle/working"
elif IS_COLAB:
    ENV_NAME = "Colab"
    INPUT_ROOT = "/content/input"
    WORK_DIR = "/content/working"
else:
    # Local environment: resolve relative to the current working directory
    ENV_NAME = "Local"
    INPUT_ROOT = str(Path.cwd() / "input")
    WORK_DIR = str(Path.cwd() / "working")

# Standard output locations underneath the working directory
OUT_DIR = os.path.join(WORK_DIR, "data")
EXPERIMENTS_DIR = os.path.join(WORK_DIR, "experiments")
SCRIPTS_DIR = os.path.join(WORK_DIR, "scripts")

# Create all directories (no error if they already exist)
for directory in [OUT_DIR, EXPERIMENTS_DIR, SCRIPTS_DIR]:
    Path(directory).mkdir(parents=True, exist_ok=True)

print(f"Environment: {ENV_NAME}")
print(f"Input directory: {INPUT_ROOT}")
print(f"Working directory: {WORK_DIR}")
print(f"Data directory: {OUT_DIR}")
print(f"Experiments directory: {EXPERIMENTS_DIR}")

In [None]:
# Read from Jigsaw dataset -> Clean -> Binarize -> Remove duplicates -> partitioning -> Export CSV
import os, re, json
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

# Reuse the environment-aware paths resolved by the configuration cell instead of
# overwriting them with Kaggle-only literals, which broke Colab and local runs.
# The Kaggle layout is used only as a fallback when the configuration cell has
# not been executed in this kernel.
try:
    INPUT_ROOT, WORK_DIR = str(INPUT_ROOT), str(WORK_DIR)
except NameError:
    INPUT_ROOT, WORK_DIR = "/kaggle/input", "/kaggle/working"
OUT_DIR = os.path.join(WORK_DIR, "data")
os.makedirs(OUT_DIR, exist_ok=True)

# Automatically locate the Jigsaw dataset folder under INPUT_ROOT.
# os.listdir() returns entries in arbitrary order, so the candidates are sorted to
# make the selected folder deterministic when several matching folders exist.
# Entries that match the prefix but are not directories are ignored.
JIGSAW_PREFIX = "jigsaw-unintended-bias-in-toxicity-classification"
JIGSAW_DIRS = sorted(
    os.path.join(INPUT_ROOT, entry)
    for entry in os.listdir(INPUT_ROOT)
    if entry.startswith(JIGSAW_PREFIX)
    and os.path.isdir(os.path.join(INPUT_ROOT, entry))
)
assert len(JIGSAW_DIRS) >= 1, "The jigsaw dataset was not found."
JIGSAW_DIR = JIGSAW_DIRS[0]

# Identity columns: names of the identity-annotation columns looked up in train.csv.
# Order matters because the exported g_<name> columns follow this order.
IDENTITY_COLS = [
    "male",
    "female",
    "transgender",
    "other_gender",
    "black",
    "white",
    "asian",
    "latino",
    "other_race_or_ethnicity",
    "christian",
    "jewish",
    "muslim",
    "hindu",
    "buddhist",
    "atheist",
    "other_religion",
    "heterosexual",
    "homosexual_gay_or_lesbian",
    "bisexual",
    "other_sexual_orientation",
    "physical_disability",
    "intellectual_or_learning_disability",
    "psychiatric_or_mental_illness",
    "other_disability",
]

# Load the original train.csv from the located Jigsaw dataset folder
train_path = os.path.join(JIGSAW_DIR, "train.csv")
df_raw = pd.read_csv(filepath_or_buffer=train_path)

# Text cleaning
URL_RE = re.compile(r"http\S+")
AT_RE = re.compile(r"@\w+")


def clean_text(s: str) -> str:
    """Normalize one raw comment into a single-line string.

    Missing values (as judged by ``pd.notna``) become an empty string; any other
    value is converted with ``str``. Each token starting with ``http`` is replaced
    by the placeholder ``URL``, each ``@name`` mention by ``@USER``, and every run
    of whitespace (including newlines and tabs) is collapsed to a single space,
    with leading and trailing whitespace removed.
    """
    if not pd.notna(s):
        return ""
    # Mask URLs first, then user mentions
    masked = AT_RE.sub("@USER", URL_RE.sub(" URL ", str(s)))
    single_line = masked.replace("\n", " ").replace("\t", " ")
    return re.sub(r"\s+", " ", single_line).strip()

# Identity columns that are actually present in the loaded frame
use_id_cols = [col for col in IDENTITY_COLS if col in df_raw.columns]

# Binary group flags g_<identity>: missing annotations count as 0,
# values >= 0.5 count as 1
group_flags = {
    f"g_{col}": (df_raw[col].fillna(0) >= 0.5).astype(int) for col in use_id_cols
}

df = pd.DataFrame({
    "id": df_raw["id"],
    "text": df_raw["comment_text"].map(clean_text),
    # Jigsaw target ∈ [0, 1]; binarize at 0.5
    "label": (df_raw["target"] >= 0.5).astype(int),
    **group_flags,
})

# Deduplicate (by text) and renumber the remaining rows from 0
df = df.drop_duplicates(subset=["text"]).reset_index(drop=True)

# Stratified 8/1/1 split (train / val / test) on the binary label.
# Step 1 holds out HOLDOUT_FRACTION of the rows; step 2 divides that holdout
# into validation and test according to TEST_SHARE_OF_HOLDOUT.
SPLIT_SEED = 42
HOLDOUT_FRACTION = 0.2
TEST_SHARE_OF_HOLDOUT = 0.5

sss1 = StratifiedShuffleSplit(n_splits=1, test_size=HOLDOUT_FRACTION, random_state=SPLIT_SEED)
train_idx, temp_idx = next(sss1.split(df, df["label"]))
temp = df.iloc[temp_idx]
sss2 = StratifiedShuffleSplit(n_splits=1, test_size=TEST_SHARE_OF_HOLDOUT, random_state=SPLIT_SEED)
# sss2 yields positions relative to `temp`; map them back to positions in `df`
val_rel_idx, test_rel_idx = next(sss2.split(temp, temp["label"]))
val_idx  = temp_idx[val_rel_idx]
test_idx = temp_idx[test_rel_idx]

# Invariants: the three index sets are pairwise disjoint and together cover df
all_idx = np.concatenate([train_idx, val_idx, test_idx])
assert len(all_idx) == len(df), "Split sizes do not add up to the number of rows."
assert len(np.unique(all_idx)) == len(all_idx), "Splits overlap."

print(f"Train: {len(train_idx)}, Val: {len(val_idx)}, Test: {len(test_idx)}")

In [None]:
# Export standard CSV files (text, label only) for training
standard_exports = [
    (train_idx, "jigsaw_train.csv"),
    (val_idx, "jigsaw_val.csv"),
    (test_idx, "jigsaw_test.csv"),
]
for split_idx, filename in standard_exports:
    split_frame = df.iloc[split_idx][["text", "label"]]
    split_frame.to_csv(os.path.join(OUT_DIR, filename), index=False)

print("Standard CSVs (text, label) exported.")

In [None]:
# Export full CSV files (with id and group attributes) for fairness analysis
group_cols = ["id", "text", "label"] + [f"g_{c}" for c in use_id_cols]
full_exports = [
    (train_idx, "jigsaw_train_full.csv"),
    (val_idx, "jigsaw_val_full.csv"),
    (test_idx, "jigsaw_test_full.csv"),
]
for split_idx, filename in full_exports:
    split_frame = df.iloc[split_idx][group_cols]
    split_frame.to_csv(os.path.join(OUT_DIR, filename), index=False)

print("Full CSVs (with id and group attributes) exported.")
print(f"Group columns included: {len(use_id_cols)}")

In [None]:
# Save protocols: per-split row counts and positive-label rates, written to
# protocols.json next to the exported CSV files.
split_indices = {"train": train_idx, "val": val_idx, "test": test_idx}
splits = {
    "jigsaw": {
        **{f"{name}_n": int(len(idx)) for name, idx in split_indices.items()},
        "pos_rate": {
            name: float(df.iloc[idx]["label"].mean())
            for name, idx in split_indices.items()
        },
    }
}
with open(os.path.join(OUT_DIR, "protocols.json"), "w", encoding="utf-8") as f:
    json.dump(splits, f, indent=2)

print("Export complete:", OUT_DIR)
# List the exported files in pure Python rather than via `!ls`, so the listing
# also works when OUT_DIR contains spaces or the notebook runs without a POSIX shell.
for entry_name in sorted(os.listdir(OUT_DIR)):
    size_kb = os.path.getsize(os.path.join(OUT_DIR, entry_name)) / 1024
    print(f"{size_kb:12.1f} KB  {entry_name}")