In [None]:
# Configuration Cell - Add this at the top of each notebook
import os
import sys
from pathlib import Path

# Work out which hosted platform (if any) this kernel is running on
IS_KAGGLE = os.path.exists('/kaggle/input')
IS_COLAB = 'google.colab' in sys.modules

# Choose the input / working roots that match the platform
if IS_KAGGLE:
    INPUT_ROOT, WORK_DIR = "/kaggle/input", "/kaggle/working"
elif IS_COLAB:
    INPUT_ROOT, WORK_DIR = "/content/input", "/content/working"
else:
    # Local run: both roots are pathlib.Path objects under the current directory
    INPUT_ROOT, WORK_DIR = Path.cwd() / "input", Path.cwd() / "working"

# Standard output locations, each a child of WORK_DIR (os.path.join yields str)
OUT_DIR, EXPERIMENTS_DIR, SCRIPTS_DIR = (
    os.path.join(WORK_DIR, leaf) for leaf in ("data", "experiments", "scripts")
)

# Make sure every output location exists; already-existing directories are fine
for directory in (OUT_DIR, EXPERIMENTS_DIR, SCRIPTS_DIR):
    os.makedirs(directory, exist_ok=True)

# Report the resolved configuration
print(f"Environment: {'Kaggle' if IS_KAGGLE else 'Colab' if IS_COLAB else 'Local'}")
for caption, location in (
    ("Input directory", INPUT_ROOT),
    ("Working directory", WORK_DIR),
    ("Data directory", OUT_DIR),
    ("Experiments directory", EXPERIMENTS_DIR),
):
    print(f"{caption}: {location}")

In [None]:
# Civil Comments Dataset Preprocessing
import os, re, json
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

# Reuse the roots chosen by the configuration cell so the environment detection
# (Kaggle / Colab / local) is respected. The Kaggle paths are only a fallback
# for kernels in which that cell has not been run.
INPUT_ROOT = globals().get("INPUT_ROOT", "/kaggle/input")
WORK_DIR   = globals().get("WORK_DIR", "/kaggle/working")
OUT_DIR    = os.path.join(WORK_DIR, "data")
os.makedirs(OUT_DIR, exist_ok=True)

# Locate Civil Comments dataset
# Adjust the directory name pattern based on your Kaggle dataset
if not os.path.isdir(INPUT_ROOT):
    raise FileNotFoundError(f"Input directory not found: {INPUT_ROOT}")

# Keep only sub-directories whose name contains both "civil" and "comment".
# sorted() makes the pick below deterministic: os.listdir order is arbitrary.
CIVIL_DIRS = sorted(
    os.path.join(INPUT_ROOT, d) for d in os.listdir(INPUT_ROOT)
    if "civil" in d.lower() and "comment" in d.lower()
    and os.path.isdir(os.path.join(INPUT_ROOT, d))
)

if len(CIVIL_DIRS) == 0:
    # Fail here with a clear message instead of letting a later cell hit an
    # undefined (or stale, from a previous run) CIVIL_DIR.
    raise FileNotFoundError(
        "Civil Comments dataset not found. Please add it to Kaggle inputs. "
        "Expected pattern: directory containing 'civil' and 'comment' in name "
        f"(searched: {INPUT_ROOT})"
    )

CIVIL_DIR = CIVIL_DIRS[0]
if len(CIVIL_DIRS) > 1:
    print(f"[WARN] Multiple candidate directories found, using the first: {CIVIL_DIRS}")
print(f"Found Civil Comments at: {CIVIL_DIR}")

In [None]:
# Identity columns in Civil Comments
# These may vary - adjust based on your dataset
# Grouped by category. Columns absent from the loaded CSV are skipped later,
# so listing a column that a particular export lacks is harmless.
IDENTITY_COLS_CIVIL = [
    # gender
    "male", "female", "transgender", "other_gender",
    # sexual orientation
    "heterosexual", "homosexual_gay_or_lesbian", "bisexual", "other_sexual_orientation",
    # religion
    "christian", "jewish", "muslim", "hindu", "buddhist", "atheist", "other_religion",
    # race / ethnicity ("asian" was previously missing from this group)
    "black", "white", "asian", "latino", "other_race_or_ethnicity",
    # disability
    "physical_disability", "intellectual_or_learning_disability",
    "psychiatric_or_mental_illness", "other_disability"
]

# Load Civil Comments - adjust filename as needed
# Common filenames: "all_data.csv", "civil_comments.csv", "train.csv"
# Sorted so the fallback choice does not depend on os.listdir's arbitrary order.
civil_files = sorted(f for f in os.listdir(CIVIL_DIR) if f.endswith(".csv"))
if not civil_files:
    raise FileNotFoundError(f"No CSV files found in {CIVIL_DIR}")

# Prefer one of the known data filenames, in this priority order. The directory
# may hold other CSVs, and picking "whichever comes first" could load the wrong
# one. Fall back to the alphabetically first CSV when none of them is present.
PREFERRED_CIVIL_FILES = ("all_data.csv", "civil_comments.csv", "train.csv")
civil_file = next(
    (name for name in PREFERRED_CIVIL_FILES if name in civil_files),
    civil_files[0],
)
print(f"Loading: {civil_file}")

df_raw = pd.read_csv(os.path.join(CIVIL_DIR, civil_file))
print(f"Loaded {len(df_raw)} records")
print(f"Columns: {list(df_raw.columns[:10])}...")

In [None]:
# Text cleaning function
URL_RE = re.compile(r"http\S+")  # "http" followed by every non-whitespace character
AT_RE = re.compile(r"@\w+")      # "@" followed by one or more word characters


def clean_text(s: str) -> str:
    """Normalise a single raw comment.

    A value for which ``pd.notna`` is falsy yields ""; anything else is
    converted with ``str``. URL matches are replaced by " URL ", @-handles by
    "@USER", every run of whitespace is collapsed to a single space, and
    leading/trailing whitespace is removed.
    """
    if not pd.notna(s):
        return ""
    substituted = AT_RE.sub("@USER", URL_RE.sub(" URL ", str(s)))
    # \s+ already matches newlines and tabs, so one pass collapses all whitespace
    return re.sub(r"\s+", " ", substituted).strip()

# Create cleaned dataset
# Adjust column names based on your dataset structure
# Each name falls back to its alternative when the primary column is absent.
text_col = "text"
if "comment_text" in df_raw.columns:
    text_col = "comment_text"

toxicity_col = "target"
if "toxicity" in df_raw.columns:
    toxicity_col = "toxicity"

# Identity columns that actually exist in the loaded frame, in list order
use_id_cols = [name for name in IDENTITY_COLS_CIVIL if name in df_raw.columns]

# Core columns first, then one binary "g_<identity>" column per identity found.
# Scores are binarised at 0.5; missing identity scores count as 0.
cleaned_columns = {
    "id": df_raw.get("id", range(len(df_raw))),
    "text": df_raw[text_col].map(clean_text),
    "label": (df_raw[toxicity_col] >= 0.5).astype(int),
}
cleaned_columns.update(
    (f"g_{name}", (df_raw[name].fillna(0) >= 0.5).astype(int))
    for name in use_id_cols
)
df = pd.DataFrame(cleaned_columns)

print(f"Created dataset with {len(df)} records")
print(f"Identity columns found: {len(use_id_cols)}")

In [None]:
# Deduplicate by text (first occurrence wins), then renumber rows from 0
df = df.drop_duplicates(subset=["text"]).reset_index(drop=True)
print(f"After deduplication: {len(df)} records")


def _stratified_pair(frame, holdout_fraction):
    """Return (kept, held_out) positional indices of one label-stratified shuffle split."""
    splitter = StratifiedShuffleSplit(
        n_splits=1, test_size=holdout_fraction, random_state=42
    )
    return next(splitter.split(frame, frame["label"]))


# Stratified 8/1/1 split: carve off 20%, then halve that 20% into val and test
train_idx, temp_idx = _stratified_pair(df, 0.2)
temp = df.iloc[temp_idx]
val_rel_idx, test_rel_idx = _stratified_pair(temp, 0.5)

# The second split indexes into `temp`; map those positions back onto `df`
val_idx = temp_idx[val_rel_idx]
test_idx = temp_idx[test_rel_idx]

print(f"Train: {len(train_idx)}, Val: {len(val_idx)}, Test: {len(test_idx)}")

In [None]:
# Export standard CSVs (text, label): one file per split, without the index
text_label = df[["text", "label"]]
for split_idx, filename in (
    (train_idx, "civil_train.csv"),
    (val_idx, "civil_val.csv"),
    (test_idx, "civil_test.csv"),
):
    text_label.iloc[split_idx].to_csv(os.path.join(OUT_DIR, filename), index=False)

print("Standard CSVs exported.")

In [None]:
# Export full CSVs (with id and group attributes): one file per split, no index
group_cols = ["id", "text", "label"]
group_cols.extend(f"g_{c}" for c in use_id_cols)

full_view = df[group_cols]
for split_idx, filename in (
    (train_idx, "civil_train_full.csv"),
    (val_idx, "civil_val_full.csv"),
    (test_idx, "civil_test_full.csv"),
):
    full_view.iloc[split_idx].to_csv(os.path.join(OUT_DIR, filename), index=False)

print("Full CSVs (with group attributes) exported.")

In [None]:
# Save protocols: split sizes and the positive-label rate of each split
labels = df["label"]
splits = {
    "civil": dict(
        train_n=int(len(train_idx)),
        val_n=int(len(val_idx)),
        test_n=int(len(test_idx)),
        pos_rate=dict(
            train=float(labels.iloc[train_idx].mean()),
            val=float(labels.iloc[val_idx].mean()),
            test=float(labels.iloc[test_idx].mean()),
        ),
    )
}

protocol_path = os.path.join(OUT_DIR, "civil_protocols.json")
with open(protocol_path, "w") as f:
    f.write(json.dumps(splits, indent=2))

print("Civil Comments preprocessing complete!")
!ls -lh {OUT_DIR}/civil*