In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# ============================================================
# Folder-wise Section Cyclic Swap (AIHub-style JSON articles)
# - Google Colab friendly
# - Reads all *.json under INPUT_DIR recursively
# - Swaps sourceDataInfo.newsCategory by cyclic rule
# - Writes to OUTPUT_DIR with same relative paths
# ============================================================

import os
import json
from collections import Counter
from pathlib import Path
from copy import deepcopy

# ----------------------------
# 1) Config
# ----------------------------
INPUT_DIR  = "/content/drive/MyDrive/TRAITHON_EXP/datasets/golden_set_v1_TL"
OUTPUT_DIR = "/content/drive/MyDrive/TRAITHON_EXP/datasets/golden_set_section_swapped"
os.makedirs(OUTPUT_DIR, exist_ok=True)

GLOB_PATTERN = "**/*.json"                  # 하위 폴더까지 전부

# 섹션 코드 순환 치환 규칙 (방식 1)
SECTION_CYCLE = {
    "정치": "사회",
    "사회": "경제",
    "경제": "IT&과학",
    "IT&과학": "생활&문화",
    "생활&문화": "세계",
    "세계": "연예",
    "연예": "정치",
}

# strict=True: 모르는 섹션 코드 나오면 실패 처리(권장)
STRICT = True

# (옵션) stress 메타 정보 추가 여부
ADD_AUDIT = True


# ----------------------------
# 2) Helpers
# ----------------------------
def safe_mkdir(p: str) -> None:
    Path(p).mkdir(parents=True, exist_ok=True)

def load_json(path: Path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def save_json(obj, path: Path) -> None:
    safe_mkdir(str(path.parent))
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)

def get_section(article: dict) -> str:
    return article["sourceDataInfo"]["newsCategory"]

def set_section(article: dict, new_sec: str) -> None:
    article["sourceDataInfo"]["newsCategory"] = new_sec

def swap_section(article: dict) -> dict:
  # 기본 구조 체크
  if "sourceDataInfo" not in article or "newsCategory" not in article["sourceDataInfo"]:
    raise KeyError("Missing sourceDataInfo.newsCategory")

  before = get_section(article)

  if before not in SECTION_CYCLE:
    if STRICT:
      raise ValueError(f"Unknown section code: {before}")
    else:
      # strict=False면 그냥 그대로 둠
      after = before
  else:
    after = SECTION_CYCLE[before]

  out = deepcopy(article)
  set_section(out, after)

  if ADD_AUDIT:
    out["_stressTest"] = {
        "type": "section_cyclic_swap",
        "before": before,
        "after": after
    }

  return out, before, after



# ----------------------------
# 3) Main
# ----------------------------
input_root = Path(INPUT_DIR)
output_root = Path(OUTPUT_DIR)
safe_mkdir(str(output_root))

json_paths = sorted(input_root.glob(GLOB_PATTERN))

if not json_paths:
    raise FileNotFoundError(f"No JSON files found under: {INPUT_DIR} (pattern={GLOB_PATTERN})")

before_counter = Counter()
after_counter  = Counter()

ok = 0
fail = 0
failed_files = []  # (path, reason)

for in_path in json_paths:
    rel_path = in_path.relative_to(input_root)
    out_path = output_root / rel_path

    try:
        article = load_json(in_path)
        swapped_article, before, after = swap_section(article)

        before_counter[before] += 1
        after_counter[after]  += 1

        save_json(swapped_article, out_path)
        ok += 1

    except Exception as e:
        fail += 1
        failed_files.append((str(in_path), repr(e)))

# ----------------------------
# 4) Report
# ----------------------------
print("=== Section Cyclic Swap Report ===")
print(f"Input dir : {INPUT_DIR}")
print(f"Output dir: {OUTPUT_DIR}")
print(f"Total JSON files: {len(json_paths)}")
print(f"Success: {ok}")
print(f"Failed : {fail}")

print("\n[Section distribution BEFORE]")
for k in sorted(before_counter.keys()):
    print(f"  {k}: {before_counter[k]}")

print("\n[Section distribution AFTER]")
for k in sorted(after_counter.keys()):
    print(f"  {k}: {after_counter[k]}")

# 분포 보존 검증(순환 치환이면 총량은 항상 같아야 함)
print("\n[Sanity checks]")
print("  Total preserved:", sum(before_counter.values()) == sum(after_counter.values()))

# 실패 파일 일부 출력
if failed_files:
    print("\n[Failed files sample (up to 20)]")
    for p, r in failed_files[:20]:
        print(" -", p)
        print("   ", r)

    # 실패 목록 파일로 저장
    fail_log_path = output_root / "_failed_files.log"
    with open(fail_log_path, "w", encoding="utf-8") as f:
        for p, r in failed_files:
            f.write(f"{p}\t{r}\n")
    print("\nSaved fail log:", str(fail_log_path))


=== Section Cyclic Swap Report ===
Input dir : /content/drive/MyDrive/TRAITHON_EXP/datasets/golden_set_v1_TL
Output dir: /content/drive/MyDrive/TRAITHON_EXP/datasets/golden_set_section_swapped
Total JSON files: 5000
Success: 5000
Failed : 0

[Section distribution BEFORE]
  IT&과학: 645
  경제: 729
  사회: 1005
  생활&문화: 536
  세계: 772
  연예: 598
  정치: 715

[Section distribution AFTER]
  IT&과학: 729
  경제: 1005
  사회: 715
  생활&문화: 645
  세계: 536
  연예: 772
  정치: 598

[Sanity checks]
  Total preserved: True


In [4]:
import json
from pathlib import Path

paths = sorted(Path(OUTPUT_DIR).glob("**/*.json"))
for p in paths:
    with open(p, "r", encoding="utf-8") as f:
        d = json.load(f)
    print(p.name, d["sourceDataInfo"]["newsCategory"])

EC_M02_000614_L.json IT&과학
EC_M02_000620_L.json IT&과학
EC_M02_000892_L.json IT&과학
EC_M02_000925_L.json IT&과학
EC_M02_001144_L.json IT&과학
EC_M02_001757_L.json IT&과학
EC_M02_001885_L.json IT&과학
EC_M02_001929_L.json IT&과학
EC_M02_002458_L.json IT&과학
EC_M02_002599_L.json IT&과학
EC_M02_002645_L.json IT&과학
EC_M02_002761_L.json IT&과학
EC_M02_002827_L.json IT&과학
EC_M02_003480_L.json IT&과학
EC_M02_003594_L.json IT&과학
EC_M02_004055_L.json IT&과학
EC_M02_004118_L.json IT&과학
EC_M02_004159_L.json IT&과학
EC_M02_004206_L.json IT&과학
EC_M02_004323_L.json IT&과학
EC_M02_004454_L.json IT&과학
EC_M02_004516_L.json IT&과학
EC_M02_004814_L.json IT&과학
EC_M02_005118_L.json IT&과학
EC_M02_005262_L.json IT&과학
EC_M02_005584_L.json IT&과학
EC_M02_005617_L.json IT&과학
EC_M02_005736_L.json IT&과학
EC_M02_155979_L.json IT&과학
EC_M02_156425_L.json IT&과학
EC_M02_157622_L.json IT&과학
EC_M02_157717_L.json IT&과학
EC_M02_157777_L.json IT&과학
EC_M02_157920_L.json IT&과학
EC_M02_157954_L.json IT&과학
EC_M02_158085_L.json IT&과학
EC_M02_160463_L.json IT&과학
E