In [1]:
import os
import sys
import json
import pandas as pd
from tqdm import tqdm

from google.colab import drive
from huggingface_hub import snapshot_download

# ----------------------------
# 0) 드라이브 마운트 & 코드 경로
# ----------------------------
drive.mount("/content/drive")

BASE = "/content/drive/MyDrive/TRAITHON"
code_dir = os.path.join(BASE, "code")
if code_dir not in sys.path:
    sys.path.append(code_dir)

from ax4_clickbait_scorer import init_model, score_article

# ----------------------------
# 1) 경로 설정
# ----------------------------
WEEK_ROOT = os.path.join(BASE, "data", "김준호", "섹션무작위", "mixed")

# 라벨 CSV 두 개 합치기
label_files = [
    os.path.join(WEEK_ROOT, "mixed_label(1~500).csv"),
    os.path.join(WEEK_ROOT, "mixed_label(501~1000).csv"),
]

dfs = []
for lf in label_files:
    if not os.path.exists(lf):
        print(f"[WARN] 라벨 파일 없음: {lf}")
        continue
    dfs.append(pd.read_csv(lf))

if not dfs:
    raise FileNotFoundError("라벨 CSV를 찾지 못했습니다.")

df_label = pd.concat(dfs, ignore_index=True)

# 원하는 범위만 쓸 거면 여기서 필터 (예: 1~1000)
MAX_ID = 1000
MIN_ID = 1
df_label = df_label[df_label["fileId"].between(MIN_ID, MAX_ID)].copy()
df_label["fileId"] = df_label["fileId"].astype(int)
df_label = df_label.sort_values("fileId")

print("라벨 샘플 수:", len(df_label))
print(df_label.head())

# ----------------------------
# 2) 모델 로드
# ----------------------------
MODEL_REPO = "skt/A.X-4.0-Light"
LOCAL_MODEL_DIR = "/content/A.X-4.0-Light"

if not os.path.isdir(LOCAL_MODEL_DIR) or not os.listdir(LOCAL_MODEL_DIR):
    print("[ST-DRIFT] Downloading model snapshot...")
    snapshot_download(
        repo_id=MODEL_REPO,
        local_dir=LOCAL_MODEL_DIR,
        local_dir_use_symlinks=False,
    )
else:
    print("[ST-DRIFT] Found local model dir:", LOCAL_MODEL_DIR)

MODEL_PATH = LOCAL_MODEL_DIR
init_model(MODEL_PATH)

# ----------------------------
# 3) 기사별 점수 계산
# ----------------------------
rows = []

for _, row in tqdm(df_label.iterrows(), total=len(df_label)):
    file_id = int(row["fileId"])

    # clickbaitClass: 1 = 정상, 0 = 낚시성 기사
    # → y_true도 같은 의미로 사용 (0=낚시성, 1=정상)
    y_true = int(row["clickbaitClass"])

    json_path = os.path.join(WEEK_ROOT, f"{file_id}.json")
    if not os.path.exists(json_path):
        print(f"[WARN] {json_path} 없음, 건너뜀")
        continue

    with open(json_path, "r", encoding="utf-8") as f:
        article = json.load(f)

    # A.X 4.0 Light로 점수 계산
    s = score_article(article)

    rows.append({
        "fileId": file_id,
        "section": article.get("newsCategory"),
        "y_true": y_true,                # 1=정상, 0=낚시성
        "p0": s["p0"],
        "p1": s["p1"],
        "score_logit_diff": s["score_logit_diff"],
        "p_clickbait": s["p_clickbait"], # 내부 정의에 따라 사용
        "y_pred": s["pred_label_clickbait"],
        "conf_llm": s["conf_llm"],
    })

# ----------------------------
# 4) 결과 저장
# ----------------------------
df_week = pd.DataFrame(rows)
out_path = os.path.join(WEEK_ROOT, "week_eval_ax4.csv")
df_week.to_csv(out_path, index=False, encoding="utf-8-sig")

print("저장 완료:", out_path)
df_week.head()

Mounted at /content/drive
라벨 샘플 수: 1000
   fileId  clickbaitClass
0       1               1
1       2               1
2       3               1
3       4               0
4       5               1
[ST-DRIFT] Downloading model snapshot...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

.gitattributes: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

LICENSE: 0.00B [00:00, ?B/s]

generation_config.json:   0%|          | 0.00/130 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

A.X_logo.png:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

assets/A.X_logo_ko_4x3.png:   0%|          | 0.00/183k [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.60G [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

[ax4_clickbait_scorer] Loading model from: /content/A.X-4.0-Light
[ax4_clickbait_scorer] device=cuda, dtype=torch.bfloat16


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

[ax4_clickbait_scorer] Model loaded.


100%|██████████| 1000/1000 [19:13<00:00,  1.15s/it]

저장 완료: /content/drive/MyDrive/TRAITHON/data/김준호/섹션무작위/mixed/week_eval_ax4.csv





Unnamed: 0,fileId,section,y_true,p0,p1,score_logit_diff,p_clickbait,y_pred,conf_llm
0,1,사회,1,0.996094,0.002808,-5.875,0.002808,0,0.996094
1,2,사회,1,1.0,0.000179,-8.625,0.000179,0,1.0
2,3,사회,1,1.0,8.5e-05,-9.375,8.5e-05,0,1.0
3,4,사회,0,0.000488,1.0,7.625,1.0,1,1.0
4,5,사회,1,1.0,6.6e-05,-9.625,6.6e-05,0,1.0
