In [5]:
# Cell 1 — Load config + basic imports
%run ./00_config.ipynb

import re
import pandas as pd
from sklearn.model_selection import train_test_split

# Echo the key paths from the loaded config so the reader can sanity-check them.
for label, path in (
    ("Train CSV:", cfg.paths.raw_train),
    ("Test  CSV:", cfg.paths.raw_test),
    ("Reports :", cfg.paths.reports_dir),
):
    print(label, path)


Python version: 3.13.7 (tags/v3.13.7:bcee1c3, Aug 14 2025, 14:15:11) [MSC v.1944 64 bit (AMD64)]
CUDA available: False
Running on CPU
../../data/train_data.csv
microsoft/mdeberta-v3-base
âœ… Config loaded and random seed set to: 42
ðŸ“‚ Model directory: ../models/best
ðŸ“‚ Reports directory: ../reports
âœ… Folder setup complete.
âœ… Found: ..\..\data\train_data.csv
âœ… Found: ..\..\data\test_data.csv

All required data files are present and accessible.
âœ… Configuration snapshot saved at:
../reports\config_snapshot.json
Train CSV: ../../data/train_data.csv
Test  CSV: ../../data/test_data.csv
Reports : ../reports


In [6]:
# Cell 2 — Cleaning utilities

# Basic regex patterns (compiled once, reused for every comment)
_HTML = re.compile(r"<.*?>")              # shortest "<...>" span; "." does not cross newlines
_URL  = re.compile(r"http\S+|www\.\S+")   # "http..." or "www...." up to the next whitespace
_NONASCII = re.compile(r"[^\x00-\x7F]+")  # any run of characters outside the ASCII range
_WS = re.compile(r"\s+")                  # any run of whitespace

def clean_text(text: str) -> str:
    """Clean raw comment text.

    Steps, in order: lowercase, remove HTML tags, remove URLs, remove every
    non-ASCII character (this covers emojis, but also accented and non-Latin
    letters), then collapse whitespace runs to a single space and trim.

    HTML tags are removed *before* URLs on purpose. The URL pattern consumes
    everything up to the next whitespace, so for markup such as
    ``<a href="http://x.com">link</a>`` stripping the URL first would also
    swallow the closing ``">`` and leave an unclosed ``<a href="`` fragment
    that the HTML pattern can no longer match.

    Args:
        text: Raw comment. Any non-``str`` value (e.g. ``None`` or a NaN
            float) is treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = _HTML.sub(" ", text)
    text = _URL.sub(" ", text)
    text = _NONASCII.sub(" ", text)
    return _WS.sub(" ", text).strip()

def _require_columns(df: pd.DataFrame, needed: list):
    """Raise ``ValueError`` unless every name in ``needed`` is a column of ``df``.

    The error message lists the missing names and, as a hint, the first 20
    columns that the frame actually has. Returns ``None`` when nothing is missing.
    """
    present = df.columns
    missing = list(filter(lambda col: col not in present, needed))
    if not missing:
        return
    available = list(present)[:20]
    raise ValueError(
        f"Missing required columns: {missing}. "
        f"Available (first 20): {available}"
    )


In [7]:
# Cell 3 — Load raw data
def load_raw(cfg):
    """Read the raw training and test CSV files named in ``cfg.paths``.

    Returns a ``(train, test)`` tuple of DataFrames, read from
    ``cfg.paths.raw_train`` and ``cfg.paths.raw_test`` in that order.
    """
    return tuple(
        pd.read_csv(getattr(cfg.paths, attr))
        for attr in ("raw_train", "raw_test")
    )

# Read both CSVs into memory
train_raw, test_raw = load_raw(cfg)

# Report what was loaded: shapes first, then the first ten training columns
print("âœ… Loaded:")
for caption, frame in (("Train shape:", train_raw), ("Test shape :", test_raw)):
    print(caption, frame.shape)
print("\nTrain columns:", list(train_raw.columns[:10]))


âœ… Loaded:
Train shape: (159571, 8)
Test shape : (153164, 2)

Train columns: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [8]:
# Cell 4 — Clean text and create splits

def make_splits(cfg, train_raw, test_raw, val_size=0.2):
    """Clean text, drop empty/duplicate comments, and build train/validation/test frames.

    Args:
        cfg: Config object. ``cfg.labels`` (iterable of label column names) and
            ``cfg.train.seed`` (random state for the split) are read.
        train_raw: Raw training DataFrame. Must have a text column named
            ``comment_text`` or ``comment`` and at least one column from
            ``cfg.labels``. The caller's frame is not modified.
        test_raw: Raw test DataFrame. Its text column is cleaned with the same
            function; no rows are dropped, so row order matches the input.
            The caller's frame is not modified.
        val_size: Size of the validation split, forwarded to
            ``train_test_split`` as ``test_size``. Defaults to ``0.2``.

    Returns:
        ``(train_df, val_df, test_df)``. ``train_df`` and ``val_df`` contain the
        cleaned ``comment`` column plus the label columns that are present;
        ``test_df`` contains only ``comment``. All three have a fresh 0..n-1 index.

    Raises:
        ValueError: If ``train_raw`` has no recognised text column, or if none
            of ``cfg.labels`` is a column of ``train_raw``.
    """
    import warnings  # local import: only used on the soft-failure paths below

    def _find_text_col(frame):
        # Same preference order for every frame: 'comment_text', then 'comment'.
        for candidate in ("comment_text", "comment"):
            if candidate in frame.columns:
                return candidate
        return None

    # detect text column
    text_col = _find_text_col(train_raw)
    if text_col is None:
        raise ValueError("Expected a text column named 'comment_text' or 'comment'.")

    # Keep the configured labels that exist; fail clearly when there are none
    # and say so (rather than silently shrinking the label set) when only some exist.
    configured = list(cfg.labels)
    label_cols = [c for c in configured if c in train_raw.columns]
    if not label_cols:
        raise ValueError(
            f"None of the configured label columns {configured} are present. "
            f"Available (first 20): {list(train_raw.columns)[:20]}"
        )
    absent = [c for c in configured if c not in train_raw.columns]
    if absent:
        warnings.warn(
            f"Configured label columns missing from training data and skipped: {absent}"
        )

    # clean text (work on copies so the caller's raw frames stay untouched)
    train_raw = train_raw.copy()
    test_raw = test_raw.copy()
    train_raw["comment"] = train_raw[text_col].apply(clean_text)

    # Prefer the same column name as train; otherwise fall back to whichever
    # accepted text column the test frame has, instead of blanking it out.
    test_text_col = text_col if text_col in test_raw.columns else _find_text_col(test_raw)
    if test_text_col is not None:
        test_raw["comment"] = test_raw[test_text_col].apply(clean_text)
    else:
        warnings.warn(
            "Test data has no 'comment_text' or 'comment' column; "
            "test comments are set to empty strings."
        )
        test_raw["comment"] = ""

    # drop empties & duplicates (the first occurrence of each cleaned comment is kept)
    train = (
        train_raw[train_raw["comment"].str.len() > 0]
        .drop_duplicates(subset=["comment"])
        .reset_index(drop=True)
    )

    # split (stratify on 'toxic' if available)
    strat = train["toxic"] if "toxic" in train.columns else None
    train_df, val_df = train_test_split(
        train,
        test_size=val_size,
        random_state=cfg.train.seed,
        stratify=strat
    )

    # keep only relevant columns
    keep = ["comment"] + label_cols
    train_df = train_df[keep].reset_index(drop=True)
    val_df   = val_df[keep].reset_index(drop=True)
    test_df  = test_raw[["comment"]].reset_index(drop=True)

    return train_df, val_df, test_df

# Build the three frames from the raw data loaded above
train_df, val_df, test_df = make_splits(cfg=cfg, train_raw=train_raw, test_raw=test_raw)

print("âœ… Split complete:")
# One line: each split's caption followed by its (rows, columns) shape
shape_report = ["Train:", train_df.shape, "Validation:", val_df.shape, "Test:", test_df.shape]
print(*shape_report)


âœ… Split complete:
Train: (127397, 7) Validation: (31850, 7) Test: (153164, 1)


In [9]:
# Cell 5 — Quick EDA: label counts and sample rows

# Configured labels that are actually columns of the training split
label_cols = [name for name in cfg.labels if name in train_df.columns]
print("Label columns present:", label_cols)

if label_cols:
    print("\nLabel prevalence in TRAIN (count of 1s):")
    positives_per_label = train_df[label_cols].sum()
    display(positives_per_label.sort_values(ascending=False))

# Show the cleaned text next to (at most) the first three label columns,
# three rows each for the training and validation splits.
preview_cols = ["comment"] + label_cols[:3]
for heading, frame in (
    ("\nSample cleaned rows (train):", train_df),
    ("\nSample cleaned rows (val):", val_df),
):
    print(heading)
    display(frame[preview_cols].head(3))


Label columns present: ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

Label prevalence in TRAIN (count of 1s):


toxic            12197
obscene           6736
insult            6253
severe_toxic      1298
identity_hate     1137
threat             390
dtype: int64


Sample cleaned rows (train):


Unnamed: 0,comment,toxic,severe_toxic,obscene
0,arguement. yours was a personal attack (which ...,0,0,0
1,""" speedy deletion of """"lilsprezy"""" a page you ...",0,0,0
2,""" spamstar spamstar the spamstar of glory pres...",0,0,0



Sample cleaned rows (val):


Unnamed: 0,comment,toxic,severe_toxic,obscene
0,""" delete hey there, do you want page deleted a...",0,0,0
1,"""::: to most the would appear mutually exclusi...",0,0,0
2,"i'm sorry, you may have misunderstood me. plea...",0,0,0


In [10]:
# Cell 6 — Save sample previews for reference
import os

# Previews live in a "previews" subfolder of the reports directory
preview_dir = os.path.join(cfg.paths.reports_dir, "previews")
os.makedirs(preview_dir, exist_ok=True)

# Write the first 200 rows of each split (no index column) for manual inspection
for frame, filename in ((train_df, "train_head.csv"), (val_df, "val_head.csv")):
    frame.head(200).to_csv(os.path.join(preview_dir, filename), index=False)

print("âœ… Saved small previews to:", preview_dir)
print("Files:")
for entry in os.listdir(preview_dir):
    print(" -", entry)


âœ… Saved small previews to: ../reports\previews
Files:
 - train_head.csv
 - val_head.csv
