In [None]:
# 라이브러리 설치

!pip install pandas
!pip install numpy
!pip install scipy

Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Downloading pandas-2.3.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting numpy>=1.22.4
  Downloading numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.8/16.8 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting tzdata>=2022.7
  Downloading tzdata-2025.3-py2.py3-none-any.whl (348 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m348.5/348.5 kB[0m [31m41.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytz>=2020.1
  Downloading pytz-2025.2-py2.py3-none-any.whl (509 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m509.2/509.2 kB[0m [31m56.0 MB/s[0m eta [36m0:0

In [None]:
import json
from pathlib import Path

import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency, chi2

In [None]:
# Project locations, all anchored at the directory the kernel was started in.
PROJECT_ROOT = Path(".").resolve()

# Raw labeled JSON files (input) and the folder that receives result CSVs (output).
DATA_RAW = PROJECT_ROOT.joinpath("clickbait_raw_v1", "labeling_data")
RESULTS = PROJECT_ROOT.joinpath("results")

# Create the output folder on first run; a pre-existing folder is not an error.
RESULTS.mkdir(exist_ok=True)

In [None]:
# Load (core columns only)
def load_labeling_data(root_dir: Path) -> pd.DataFrame:
    """Collect the core labeling fields from every ``*.json`` file under ``root_dir``.

    Each file is parsed as JSON and read through its ``sourceDataInfo`` and
    ``labeledDataInfo`` entries; a missing entry or key yields ``None`` in the
    affected columns.

    Files that cannot be opened, decoded or parsed, or whose content does not
    have the expected mapping structure, are skipped. Instead of being dropped
    silently, the number of skipped files (and the first failure) is reported
    through a single ``RuntimeWarning``.

    Args:
        root_dir: Directory that is searched recursively for ``*.json`` files.

    Returns:
        A DataFrame with one row per successfully loaded file, in sorted path
        order, with the columns ``newsCategory``, ``processPattern``,
        ``processLevel``, ``processType`` and ``clickbaitClass``. The columns
        are present even when no file could be loaded.
    """
    import warnings

    columns = [
        "newsCategory",
        "processPattern",
        "processLevel",
        "processType",
        "clickbaitClass",
    ]
    rows = []
    skipped = []
    # Sort so the row order does not depend on filesystem enumeration order.
    for fp in sorted(root_dir.rglob("*.json")):
        try:
            with open(fp, "r", encoding="utf-8") as f:
                obj = json.load(f)
            s = obj.get("sourceDataInfo", {})
            l = obj.get("labeledDataInfo", {})
            rows.append({
                "newsCategory":  s.get("newsCategory"),
                "processPattern": s.get("processPattern"),
                "processLevel":   s.get("processLevel"),
                "processType":    s.get("processType"),
                "clickbaitClass": l.get("clickbaitClass"),
            })
        except (OSError, ValueError, AttributeError, TypeError) as exc:
            # OSError: unreadable file; ValueError: bad encoding or malformed
            # JSON; AttributeError/TypeError: JSON that is not the expected
            # nested-mapping structure.
            skipped.append((fp, exc))

    if skipped:
        first_path, first_exc = skipped[0]
        warnings.warn(
            f"load_labeling_data: skipped {len(skipped)} file(s); "
            f"first failure: {first_path} ({type(first_exc).__name__}: {first_exc})",
            RuntimeWarning,
            stacklevel=2,
        )

    # Explicit columns keep the schema stable even when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Load every record, keep only rows where all core fields are present,
# and store the clickbait label as an integer.
labeling_raw = load_labeling_data(DATA_RAW)
df = labeling_raw.dropna().astype({"clickbaitClass": int})

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660085 entries, 0 to 660084
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   newsCategory    660085 non-null  object
 1   processPattern  660085 non-null  object
 2   processLevel    660085 non-null  object
 3   processType     660085 non-null  object
 4   partNum         660085 non-null  object
 5   sentenceCount   660085 non-null  int64 
 6   clickbaitClass  660085 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 35.3+ MB


In [None]:
# Common: Chi-square + Cramér’s V

def _cramers_v(chi2_stat: float, n: int, r: int, c: int) -> float:
    return float(np.sqrt(chi2_stat / (n * max(1, min(r - 1, c - 1)))))

def chi2_independence(df: pd.DataFrame, x: str, y: str):
    """Run a chi-square test of independence between two columns of ``df``.

    The columns are cross-tabulated and the resulting count table is passed to
    ``scipy.stats.chi2_contingency`` with its default settings.
    NOTE(review): with those defaults scipy applies Yates' continuity
    correction to tables with one degree of freedom (2x2) — confirm this is
    the intended statistic for Cramér's V.

    Args:
        df: Frame holding both columns.
        x: Name of the column used for the rows of the table.
        y: Name of the column used for the columns of the table.

    Returns:
        A ``(summary, table)`` tuple: ``summary`` is a dict with the test
        statistic, degrees of freedom, p-value, Cramér's V, the smallest
        expected cell count (plus a flag for whether it is at least 5) and the
        table dimensions; ``table`` is the crosstab itself.
    """
    table = pd.crosstab(df[x], df[y])
    observed = table.values

    statistic, p_value, dof, expected = chi2_contingency(observed)

    total = int(observed.sum())
    n_rows, n_cols = table.shape
    smallest_expected = expected.min()

    summary = {
        "test": "chi2_independence",
        "var_x": x,
        "var_y": y,
        "n": total,
        "chi2": float(statistic),
        "dof": int(dof),
        "p_value": float(p_value),
        "cramers_v": _cramers_v(statistic, total, n_rows, n_cols),
        "min_expected": float(smallest_expected),
        "expected_ok(>=5)": bool(smallest_expected >= 5),
        "levels_x": int(n_rows),
        "levels_y": int(n_cols),
    }
    return summary, table

### Experiment 3.0 — Baseline dependency check (newsCategory × clickbaitClass)

In [None]:
# Baseline: test whether the clickbait label depends on the news category,
# then persist the one-row summary and the underlying contingency table.
res_30, ct_30 = chi2_independence(df, x="newsCategory", y="clickbaitClass")

summary_30 = pd.DataFrame([res_30])
summary_30.to_csv(RESULTS / "exp3_0_summary.csv", index=False)

ct_30.to_csv(RESULTS / "exp3_0_crosstab.csv")

### Experiment 3.1 — 교란 효과의 가능성 확인 실험 (Checking for possible confounding: each factor × clickbaitClass)

In [None]:
# Dependency scan: test every candidate factor against the clickbait label
# and rank the results by effect size (Cramér's V), strongest first.
pairs_31 = [
    (factor, "clickbaitClass")
    for factor in ("newsCategory", "processPattern", "processLevel", "processType")
]

# Only the summary dict of each test is needed here; the crosstab is discarded.
summaries_31 = [chi2_independence(df, x, y)[0] for x, y in pairs_31]

res_31 = pd.DataFrame(summaries_31).sort_values("cramers_v", ascending=False)
res_31.to_csv(RESULTS / "exp3_1_dependency_scan.csv", index=False)