In [32]:
import os, sys, torch

# Make modules that live next to this notebook importable.
# NOTE(review): "." is the kernel's current working directory; it is the
# project root only if the notebook is launched from there - confirm.
# The membership guard keeps the cell idempotent: re-running it must not
# stack duplicate "." entries onto sys.path.
if "." not in sys.path:
    sys.path.append(".")

# Basic environment information
print("Python version:", sys.version)
print("CUDA available:", torch.cuda.is_available())

# Name the first CUDA device when one is usable, otherwise say we are on CPU.
if torch.cuda.is_available():
    print("GPU device:", torch.cuda.get_device_name(0))
else:
    print("Running on CPU")


Python version: 3.13.7 (tags/v3.13.7:bcee1c3, Aug 14 2025, 14:15:11) [MSC v.1944 64 bit (AMD64)]
CUDA available: False
Running on CPU


In [37]:
from dataclasses import dataclass, field
from typing import Tuple

# 1. Toxicity labels (fixed order)
# Stored as a tuple so the label order cannot be changed by accident.
LABELS: Tuple[str, ...] = (
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
)

# 2. File paths (adjust these if you move the data)
@dataclass
class Paths:
    """Filesystem locations used by the notebook.

    Every default is a relative path, so it resolves against the kernel's
    current working directory at the moment it is used.
    """

    # Input CSV files (their presence is verified later in the notebook).
    raw_train: str = "../../data/train_data.csv"
    raw_test: str = "../../data/test_data.csv"

    # Output folders (created later in the notebook via os.makedirs).
    model_dir: str = "../models/best"
    reports_dir: str = "../reports"

# 3. Training hyperparameters
@dataclass
class TrainCfg:
    """Training hyperparameters; every field has a default, so ``TrainCfg()`` works as-is.

    NOTE(review): no validation happens here - e.g. ``class_weighting`` accepts
    any string even though only two values are documented.
    """

    # --- model and input ---
    model_name: str = "microsoft/mdeberta-v3-base"  # presumably a Hugging Face model id - confirm against the loader
    max_len: int = 256
    batch_size: int = 16

    # --- optimisation ---
    lr: float = 2e-5
    epochs: int = 6
    warmup_ratio: float = 0.10
    weight_decay: float = 0.01

    # --- run control ---
    patience: int = 2  # presumably early-stopping patience - TODO confirm
    seed: int = 42  # handed to seed_everything() later in the notebook
    class_weighting: str = "auto"  # 'auto' or 'none'

# 4. Bundle config (nested dataclasses use default_factory so every Cfg gets its own Paths/TrainCfg instance)
@dataclass
class Cfg:
    """Top-level configuration bundle: file paths, training settings and label names."""
    # default_factory calls Paths() / TrainCfg() for each new Cfg, so nested
    # settings edited on one config object are not shared with another.
    paths: Paths = field(default_factory=Paths)
    train: TrainCfg = field(default_factory=TrainCfg)
    # LABELS is a tuple (immutable), so using it directly as a default is safe.
    labels: Tuple[str, ...] = LABELS

# 5. Helper to get a default config quickly
def default_cfg() -> Cfg:
    """Build a configuration populated entirely with default values.

    Returns:
        A newly constructed ``Cfg``; each call creates a separate object.
    """
    fresh_cfg = Cfg()
    return fresh_cfg


In [38]:
# Build the default config and print one field from each nested section
# as a quick check that the dataclasses are wired together.
cfg = default_cfg()
print(cfg.paths.raw_train)
print(cfg.train.model_name)


../../data/train_data.csv
microsoft/mdeberta-v3-base


In [39]:
import random
import numpy as np

def seed_everything(seed: int = 42):
    """Seed the random number generators used in this notebook.

    Python's ``random`` module and NumPy's global RNG are always seeded.
    PyTorch is seeded too (via ``manual_seed``, plus ``cuda.manual_seed_all``
    when CUDA is available); if PyTorch cannot be imported that step is
    skipped without an error.

    Args:
        seed: Value handed to every generator. Defaults to 42.
    """
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

    try:
        import torch as torch_lib
        torch_lib.manual_seed(seed)
        if torch_lib.cuda.is_available():
            torch_lib.cuda.manual_seed_all(seed)
    except ImportError:
        # Best effort: PyTorch is optional for this helper.
        pass

# Build a default config and seed the RNGs with its configured seed.
# Note: this rebinds the global `cfg` to a new default instance.
cfg = default_cfg()
seed_everything(cfg.train.seed)

print("‚úÖ Config loaded and random seed set to:", cfg.train.seed)


‚úÖ Config loaded and random seed set to: 42


In [40]:
import os

# Create the output folders before anything tries to write into them.
# os.makedirs also creates missing parent directories, so asking for
# "<reports_dir>/figs" creates reports_dir itself; exist_ok=True makes
# re-running this cell harmless.
os.makedirs(cfg.paths.model_dir, exist_ok=True)
os.makedirs(os.path.join(cfg.paths.reports_dir, "figs"), exist_ok=True)

# Echo the configured (relative) locations for the reader.
print("üìÇ Model directory:", cfg.paths.model_dir)
print("üìÇ Reports directory:", cfg.paths.reports_dir)
print("‚úÖ Folder setup complete.")


üìÇ Model directory: ../models/best
üìÇ Reports directory: ../reports
‚úÖ Folder setup complete.


In [41]:
from pathlib import Path

def check_file(path: str) -> bool:
    """Report whether ``path`` points to an existing file.

    Prints one "Found"/"Missing" line for the path and returns the result.
    A path that exists but is a directory is reported as missing, because
    callers use this to validate data files.

    Args:
        path: Location to check; relative paths resolve against the
            current working directory.

    Returns:
        True if a file exists at ``path``, otherwise False.
    """
    p = Path(path)
    # is_file() rather than exists(): a directory sitting at this path is
    # not a usable data file and must not pass the check.
    if p.is_file():
        print(f"‚úÖ Found: {p}")
        return True
    print(f"‚ùå Missing: {p}")
    return False

# Run both checks before deciding, so each path gets its own
# Found/Missing line even when the first one fails.
ok_train = check_file(cfg.paths.raw_train)
ok_test = check_file(cfg.paths.raw_test)

# Guard clause: stop the notebook here unless both checks passed.
if not ok_train or not ok_test:
    raise FileNotFoundError(
        "Data files not found. Please verify cfg.paths.raw_train and raw_test."
    )

print("\nAll required data files are present and accessible.")


‚úÖ Found: ..\..\data\train_data.csv
‚úÖ Found: ..\..\data\test_data.csv

All required data files are present and accessible.


In [42]:
import json
from dataclasses import asdict
import os

# Make sure the reports folder exists (keeps this cell runnable on its own).
os.makedirs(cfg.paths.reports_dir, exist_ok=True)

# Path for saving the config snapshot
snapshot_path = os.path.join(cfg.paths.reports_dir, "config_snapshot.json")

# Serialise the whole config in one call. asdict() recurses into the nested
# Paths/TrainCfg dataclasses and keeps field order, and json writes the
# labels tuple as a JSON array, so the file has the same "paths" / "train" /
# "labels" layout as before. Unlike a hand-written key list, it picks up any
# field later added to Cfg automatically.
with open(snapshot_path, "w", encoding="utf-8") as f:
    json.dump(asdict(cfg), f, indent=2)

print("‚úÖ Configuration snapshot saved at:")
print(snapshot_path)


‚úÖ Configuration snapshot saved at:
../reports\config_snapshot.json
