In [None]:
import os, subprocess, time

def stage_to_local(src_dir: str, dst_dir: str, marker_name=".staged_ok"):
    """Copy the contents of ``src_dir`` into ``dst_dir`` with rsync, at most once.

    After a successful copy a marker file named ``marker_name`` is written into
    ``dst_dir``. If that marker already exists the function returns immediately
    without running rsync.

    Args:
        src_dir: Directory whose contents are copied (trailing slashes are ignored).
        dst_dir: Destination directory; created if it does not exist.
        marker_name: File name of the completion marker inside ``dst_dir``.

    Raises:
        subprocess.CalledProcessError: If rsync exits with a non-zero status.
            The marker is not written in that case, so a later call retries.
    """
    os.makedirs(dst_dir, exist_ok=True)
    marker = os.path.join(dst_dir, marker_name)

    # Skip when a previous call already finished staging.
    if os.path.exists(marker):
        print(f"[stage] already staged: {dst_dir}")
        return

    print(f"[stage] rsync from\n  {src_dir}\n  -> {dst_dir}")
    t0 = time.time()

    # rsync copies incrementally, so re-running after an interruption only
    # transfers what is still missing.
    # The command is passed as an argument list (no shell), so paths containing
    # quotes, spaces or shell metacharacters are taken literally. "--" ends
    # option parsing; the trailing "/" on the source means "copy the contents".
    cmd = [
        "rsync", "-a", "--info=progress2", "--",
        src_dir.rstrip("/") + "/",
        dst_dir.rstrip("/") + "/",
    ]
    subprocess.run(cmd, check=True)

    # Write the completion marker only after rsync succeeded.
    with open(marker, "w") as f:
        f.write("ok\n")

    print(f"[stage] done in {time.time()-t0:.1f}s")

In [None]:
# ====== 0) Mount Google Drive ======
from google.colab import drive
drive.mount("/content/drive")

# Adapter directory on Google Drive (source of the staging copy below)
DRIVE_ADAPTER    = "/content/drive/MyDrive/TRAITHON/models/adapter"

# Local paths under /content (fast)
LOCAL_BASE_MODEL = "/content/A.X-4.0-Light"
LOCAL_ADAPTER    = "/content/ax4_adapter_final"

# Copy the adapter from Drive to local disk; skipped when the staging marker already exists.
stage_to_local(DRIVE_ADAPTER, LOCAL_ADAPTER)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[stage] already staged: /content/ax4_adapter_final


In [None]:
from hf_ax4_downloader import download_ax4_light_to_content

# Reuse LOCAL_BASE_MODEL (defined in the Drive-mount cell above) instead of
# repeating the path literal, so the download target is configured in one place.
model_dir = download_ax4_light_to_content(LOCAL_BASE_MODEL)
print("MODEL DIR =", model_dir)

[hf_dl] downloading repo: skt/A.X-4.0-Light
[hf_dl] -> /content/A.X-4.0-Light
[hf_dl] token: none (public repo라면 OK, gated면 필요)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

assets/A.X_logo_ko_4x3.png:   0%|          | 0.00/183k [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/130 [00:00<?, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

A.X_logo.png:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

LICENSE: 0.00B [00:00, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.60G [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

[hf_dl] done: /content/A.X-4.0-Light
MODEL DIR = /content/A.X-4.0-Light


In [None]:
import importlib, IG_module, ax4_clickbait_scorer as sc
# Re-execute both modules from their current files so edits made on disk are
# picked up without restarting the kernel.
importlib.reload(IG_module)
importlib.reload(sc)

<module 'ax4_clickbait_scorer' from '/content/ax4_clickbait_scorer.py'>

전처리 산출물 만들기 (drive preprocess 모듈 사용)

In [None]:
from google.colab import drive
drive.mount("/content/drive")

import glob
# Collect every sample JSON. sorted() orders the paths as strings
# (lexicographically), so "10.json" comes before "2.json".
paths = sorted(glob.glob("/content/drive/MyDrive/sample_slice_PO/slice_PO/*.json"))
print("n_paths:", len(paths), "example:", paths[:3])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
n_paths: 1000 example: ['/content/drive/MyDrive/sample_slice_PO/slice_PO/0.json', '/content/drive/MyDrive/sample_slice_PO/slice_PO/1.json', '/content/drive/MyDrive/sample_slice_PO/slice_PO/10.json']


In [None]:
import io, os, re

# Scorer module that receives the compatibility wrapper.
TARGET = "/content/ax4_clickbait_scorer.py"

with open(TARGET, "r", encoding="utf-8") as f:
    code = f.read()

# Idempotence guard: skip when the wrapper was already inserted by an earlier run.
if "def _compute_logits_for_prefix" in code:
    print("[SKIP] _compute_logits_for_prefix already exists.")
else:
    # The wrapper is inserted directly below _compute_last_logits.
    anchor = "def _compute_last_logits"
    pos = code.find(anchor)
    if pos == -1:
        raise RuntimeError("Cannot find def _compute_last_logits in ax4_clickbait_scorer.py")

    # The end of _compute_last_logits is located through its return statement.
    # Search only from the anchor onwards, so an identical line that happens to
    # appear earlier in the file cannot be picked up by mistake.
    marker = "return outputs.logits[0, -1, :]  # [V]"
    mpos = code.find(marker, pos)
    if mpos == -1:
        raise RuntimeError("Cannot find marker line in _compute_last_logits")

    # Insert at the end of the marker's line (not mid-line), so anything that
    # trails the marker on that line stays attached to it.
    line_end = code.find("\n", mpos + len(marker))
    insert_at = line_end if line_end != -1 else len(code)

    # Source text written into TARGET verbatim.
    wrapper = """

def _compute_logits_for_prefix(prefix_ids: torch.Tensor):
    \"\"\"(호환용) prefix_ids의 마지막 logits과 (id0,id1)을 반환.\"\"\"
    last_logits = _compute_last_logits(prefix_ids)

    # label token id 준비
    global _ID0, _ID1
    if _ID0 is None or _ID1 is None:
        _ensure_label_token_ids()

    return last_logits, _ID0, _ID1
"""

    code2 = code[:insert_at] + wrapper + code[insert_at:]

    with open(TARGET, "w", encoding="utf-8") as f:
        f.write(code2)

    print("[DONE] Patched ax4_clickbait_scorer.py (added _compute_logits_for_prefix).")

[DONE] Patched ax4_clickbait_scorer.py (added _compute_logits_for_prefix).


In [None]:
import importlib
import ax4_clickbait_scorer as sc
# Re-execute the scorer module from its current file on disk.
importlib.reload(sc)

# Re-initialize the model
BASE_MODEL = "/content/A.X-4.0-Light"
ADAPTER    = "/content/ax4_adapter_final"
sc.reset_model()
sc.init_model(model_path=BASE_MODEL, adapter_path=ADAPTER)

# Smoke test on a single path
p = paths[0]
out = sc.score_article_from_path_explain(p, ig_steps=50, ig_span="title_body", top_k=25)
print("OK:", out["p_clickbait"], out["pred_label_clickbait"], out["score_logit_diff"])
# Show the first five (word, score) entries of the attribution result.
print(out["ig"]["top_words"][:5])

[ax4_clickbait_scorer] Loading tokenizer from: /content/ax4_adapter_final


`torch_dtype` is deprecated! Use `dtype` instead!


[ax4_clickbait_scorer] Loading base model from: /content/A.X-4.0-Light
[ax4_clickbait_scorer] device=cuda, dtype=torch.bfloat16


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

[ax4_clickbait_scorer] Attaching adapter (NO MERGE): /content/ax4_adapter_final
[ax4_clickbait_scorer] Model loaded.
[ax4_clickbait_scorer] Adapter active: /content/ax4_adapter_final
[ax4_clickbait_scorer] run_meta loaded: ans0=0(id=56), ans1=1(id=57)
OK: 1.0 1 12.875
[('발표는8월27일', 1.3837890625), ('노제·영결식[본문]충남', 1.007232666015625), ('밝혔다.고인은', 0.75732421875), ('있으며,', 0.6279296875), ('오전7시께1차', 0.61962890625)]


=========실험=========

1) 1000개 IG를 안전하게 돌리는 러너 (중간 저장/재시작 가능)
셀 1: paths 로드 + 출력 경로 준비

In [None]:
from google.colab import drive
drive.mount("/content/drive")

import glob, os, json, time
from tqdm.auto import tqdm

# Input: every sample JSON in DATA_DIR (sorted as strings, i.e. lexicographically).
DATA_DIR = "/content/drive/MyDrive/sample_slice_PO/slice_PO"
paths = sorted(glob.glob(os.path.join(DATA_DIR, "*.json")))
print("n_paths:", len(paths), "example:", paths[:2])

# Output directory on Drive; created if missing.
OUT_DIR = "/content/drive/MyDrive/ig_runs"
os.makedirs(OUT_DIR, exist_ok=True)

RUN_ID = "ig_steps20_span_title_body_top25"

# Result and error logs are named from RUN_ID as it is at this point;
# assigning a new RUN_ID in a later cell does not change these two paths.
OUT_JSONL = os.path.join(OUT_DIR, f"{RUN_ID}.jsonl")
OUT_ERR   = os.path.join(OUT_DIR, f"{RUN_ID}.errors.jsonl")

print("OUT_JSONL:", OUT_JSONL)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
n_paths: 1000 example: ['/content/drive/MyDrive/sample_slice_PO/slice_PO/0.json', '/content/drive/MyDrive/sample_slice_PO/slice_PO/1.json']
OUT_JSONL: /content/drive/MyDrive/ig_runs/ig_steps20_span_title_body_top25.jsonl


셀 2: 이미 처리한 파일 스킵 로더 (재시작용)

In [None]:
# Resume support: collect the paths that already have a result line in OUT_JSONL.
done = set()
n_bad_lines = 0
if os.path.exists(OUT_JSONL):
    with open(OUT_JSONL, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                done.add(json.loads(line)["path"])
            except (json.JSONDecodeError, KeyError, TypeError):
                # Unparsable line (e.g. a final line cut off by an interrupted
                # run) or a record without a usable "path": it is not counted
                # as done, but it is reported below instead of being hidden.
                n_bad_lines += 1
print("already done:", len(done))
if n_bad_lines:
    print(f"[warn] skipped {n_bad_lines} unreadable line(s) in {OUT_JSONL}")

already done: 95


In [None]:
# Quick look at how many sample paths were collected and what one looks like.
print(f"len(paths): {len(paths)}")
print(f"example path: {paths[0]}")


len(paths): 1000
example path: /content/drive/MyDrive/sample_slice_PO/slice_PO/0.json


패턴당 30개만 뽑기

In [None]:
import pandas as pd
import random
# Fixed seed so the per-pattern sample below is reproducible.
random.seed(42)

df = pd.read_csv("/content/drive/MyDrive/sample_slice_PO/mapping.csv")

# Draw up to 30 shuffled entries per processType.
# NOTE: slice_path values are used verbatim here; the recorded output shows
# Windows-style paths, and the following cell remaps them to Colab paths.
paths_small = []
for _, group_rows in df.groupby("processType"):
    candidates = group_rows["slice_path"].tolist()
    random.shuffle(candidates)
    paths_small += candidates[:30]

print("paths_small:", len(paths_small))
print("example:", paths_small[:3])

paths_small: 420
example: ['C:\\해커톤\\트라이톤 예선 데이터셋\\sample_slice_PO\\slice_PO\\475.json', 'C:\\해커톤\\트라이톤 예선 데이터셋\\sample_slice_PO\\slice_PO\\444.json', 'C:\\해커톤\\트라이톤 예선 데이터셋\\sample_slice_PO\\slice_PO\\456.json']


In [None]:
import pandas as pd
import random
import os
import ntpath   # ← 핵심
# Fixed seed so the random.shuffle calls in this cell give the same sample on every run.
random.seed(42)

# Mapping table; the code below reads its "processType" and "slice_path" columns.
df = pd.read_csv("/content/drive/MyDrive/sample_slice_PO/mapping.csv")

def to_colab_path(win_path, base_dir="/content/drive/MyDrive/sample_slice_PO/slice_PO"):
    """Map a path recorded on Windows to the same-named file under ``base_dir``.

    Only the file name of ``win_path`` is kept; its directory part is discarded.

    Args:
        win_path: Path string, typically using backslash separators.
        base_dir: Directory the file is expected in on this machine. Defaults
            to the sample directory on the mounted Drive.

    Returns:
        ``base_dir`` joined with the file name using "/".
    """
    # ntpath splits on both "\\" and "/", so the file name is extracted
    # correctly even though this code runs on a POSIX system.
    fname = ntpath.basename(win_path)
    return f"{base_dir.rstrip('/')}/{fname}"

N_PER_PATTERN = 30  # number of samples drawn per processType

# Draw up to N_PER_PATTERN shuffled, Colab-mapped paths per processType.
paths_small = []
for _, group_rows in df.groupby("processType"):
    candidates = [to_colab_path(p) for p in group_rows["slice_path"].tolist()]
    random.shuffle(candidates)
    paths_small.extend(candidates[:N_PER_PATTERN])

print("paths_small:", len(paths_small))
print("example:", paths_small[:3])

# Sanity check: verify every sampled path (not just the first one), so a
# missing file is caught here instead of surfacing later in the long IG run.
assert paths_small, "No paths were sampled: mapping.csv yielded no rows."
missing = [p for p in paths_small if not os.path.exists(p)]
assert not missing, (
    f"Path does not exist: {missing[0]} "
    f"({len(missing)} of {len(paths_small)} sampled paths are missing)"
)


paths_small: 420
example: ['/content/drive/MyDrive/sample_slice_PO/slice_PO/475.json', '/content/drive/MyDrive/sample_slice_PO/slice_PO/444.json', '/content/drive/MyDrive/sample_slice_PO/slice_PO/456.json']


셀 3: 모델 로드

In [None]:
import importlib
import ax4_clickbait_scorer as sc
# Re-execute the scorer module from its current file on disk.
importlib.reload(sc)

BASE_MODEL = "/content/A.X-4.0-Light"
ADAPTER    = "/content/ax4_adapter_final"

sc.reset_model()
# The trailing ";" stops the notebook from echoing the return value of
# init_model, which otherwise prints the entire model architecture as output.
sc.init_model(model_path=BASE_MODEL, adapter_path=ADAPTER);

[ax4_clickbait_scorer] Loading tokenizer from: /content/ax4_adapter_final
[ax4_clickbait_scorer] Loading base model from: /content/A.X-4.0-Light
[ax4_clickbait_scorer] device=cuda, dtype=torch.bfloat16


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

[ax4_clickbait_scorer] Attaching adapter (NO MERGE): /content/ax4_adapter_final
[ax4_clickbait_scorer] Model loaded.
[ax4_clickbait_scorer] Adapter active: /content/ax4_adapter_final
[ax4_clickbait_scorer] run_meta loaded: ans0=0(id=56), ans1=1(id=57)


(PeftModelForCausalLM(
   (base_model): LoraModel(
     (model): Qwen2ForCausalLM(
       (model): Qwen2Model(
         (embed_tokens): Embedding(102400, 3584)
         (layers): ModuleList(
           (0-27): 28 x Qwen2DecoderLayer(
             (self_attn): Qwen2Attention(
               (q_proj): lora.Linear(
                 (base_layer): Linear(in_features=3584, out_features=3584, bias=True)
                 (lora_dropout): ModuleDict(
                   (default): Dropout(p=0.05, inplace=False)
                 )
                 (lora_A): ModuleDict(
                   (default): Linear(in_features=3584, out_features=16, bias=False)
                 )
                 (lora_B): ModuleDict(
                   (default): Linear(in_features=16, out_features=3584, bias=False)
                 )
                 (lora_embedding_A): ParameterDict()
                 (lora_embedding_B): ParameterDict()
                 (lora_magnitude_vector): ModuleDict(
                   (default): l

In [None]:
import time

p0 = paths_small[0]

# Debug-sized settings: very few IG steps, title span only, short top-k list.
debug_kwargs = dict(ig_steps=5, ig_span="title", top_k=10)

# Time one explain call to estimate the per-article cost.
t0 = time.time()
out0 = sc.score_article_from_path_explain(p0, **debug_kwargs)
t1 = time.time()

print("one-call sec:", t1 - t0)
print(
    "p_clickbait:", out0["p_clickbait"],
    "pred:", out0["pred_label_clickbait"],
    "logit_diff:", out0["score_logit_diff"],
)
print("top_words:", out0["ig"]["top_words"][:5])


one-call sec: 3.2356245517730713
p_clickbait: 0.00102996826171875 pred: 0 logit_diff: -6.875
top_words: [('‘어반', 0.0423583984375), ('기념', 0.0380859375), ('그린야드’', 0.0325927734375), ('N서울타워,', -0.031219482421875), ('실시[본문]CJ푸드빌이', -0.02685546875)]


셀 4: 전체 실행 (결과는 “가볍게” 저장)

In [None]:
IG_STEPS = 20
IG_SPAN = "title"
TOP_K = 25

# Derive the run id from the actual settings and rebuild the output paths from
# it. Reassigning RUN_ID alone would leave OUT_JSONL / OUT_ERR pointing at the
# files of the previously configured run, mixing results of different settings
# in one file.
RUN_ID = f"ig_steps{IG_STEPS}_span_{IG_SPAN}_top{TOP_K}"
OUT_JSONL = os.path.join(OUT_DIR, f"{RUN_ID}.jsonl")
OUT_ERR = os.path.join(OUT_DIR, f"{RUN_ID}.errors.jsonl")
print("OUT_JSONL:", OUT_JSONL)

# Rebuild the resume set from *this* run's result file, so that entries
# finished under another run id are not skipped here.
done = set()
if os.path.exists(OUT_JSONL):
    with open(OUT_JSONL, "r", encoding="utf-8") as f:
        for line in f:
            try:
                done.add(json.loads(line)["path"])
            except (ValueError, KeyError, TypeError):
                # Unparsable or incomplete line: treat the entry as not done.
                continue
print("already done:", len(done))

start = time.time()
n_errors = 0

with open(OUT_JSONL, "a", encoding="utf-8") as fout, open(OUT_ERR, "a", encoding="utf-8") as ferr:
    for p in tqdm(paths_small, desc="IG run"):
        if p in done:
            continue
        try:
            # Read the sample metadata first (needed for the pattern analysis):
            # it is cheap, so an unreadable file fails before the expensive IG call.
            with open(p, "r", encoding="utf-8") as f:
                a = json.load(f)

            out = sc.score_article_from_path_explain(
                p, ig_steps=IG_STEPS, ig_span=IG_SPAN, top_k=TOP_K
            )

            rec = {
                "path": p,
                "processType": a.get("processType"),
                "clickbaitClass": a.get("clickbaitClass"),
                "newsCategory": a.get("newsCategory"),
                "p_clickbait": float(out["p_clickbait"]),
                "pred": int(out["pred_label_clickbait"]),
                "logit_diff": float(out["score_logit_diff"]),
                "top_words": out["ig"]["top_words"],  # list[(word, score)]
            }
            # Flush after every record so an interrupted run loses at most one line.
            fout.write(json.dumps(rec, ensure_ascii=False) + "\n")
            fout.flush()

            done.add(p)

        except Exception as e:
            # Best effort: log the failure and continue with the next article.
            n_errors += 1
            ferr.write(json.dumps({"path": p, "error": repr(e)}, ensure_ascii=False) + "\n")
            ferr.flush()

elapsed = time.time() - start
print("elapsed_sec:", elapsed, "done:", len(done))
if n_errors:
    print(f"[warn] {n_errors} article(s) failed; see {OUT_ERR}")

IG run:   0%|          | 0/420 [00:00<?, ?it/s]

elapsed_sec: 3653.707282781601 done: 420


I_K만 만드는 코드

셀 A) OUT_JSONL 로드 → 패턴별 단어 기여도 집계 (mean(|IG|))

In [None]:
import os, json
import numpy as np
import pandas as pd
from collections import defaultdict

print("OUT_JSONL:", OUT_JSONL)
assert os.path.exists(OUT_JSONL), "OUT_JSONL not found. Run IG first."

# Load one record per line. Blank or unparsable lines (e.g. a final line cut
# off by an interrupted run) are skipped and reported instead of crashing.
rows = []
n_bad_lines = 0
with open(OUT_JSONL, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        try:
            rows.append(json.loads(line))
        except json.JSONDecodeError:
            n_bad_lines += 1
if n_bad_lines:
    print(f"[warn] skipped {n_bad_lines} unparsable line(s) in {OUT_JSONL}")
assert rows, "OUT_JSONL contains no records. Run IG first."

df = pd.DataFrame(rows)
print("Loaded rows:", df.shape)
print(df["processType"].value_counts())

# pattern -> word -> list(scores)
bucket = defaultdict(lambda: defaultdict(list))
# pattern -> word -> set(path): distinct documents the word was reported for
doc_sets = defaultdict(lambda: defaultdict(set))
# pattern -> word -> list(path): up to 5 distinct documents kept as evidence
examples = defaultdict(lambda: defaultdict(list))

for r in rows:
    ptype = r["processType"]
    pth = r["path"]
    for w, s in r["top_words"]:
        bucket[ptype][w].append(float(s))
        # Register each document once per word, so a word listed twice in one
        # document neither inflates n_docs nor repeats an example path.
        if pth not in doc_sets[ptype][w]:
            doc_sets[ptype][w].add(pth)
            if len(examples[ptype][w]) < 5:
                examples[ptype][w].append(pth)

# pattern -> DataFrame(word, mean_abs, mean_signed, n_docs)
pat_tables = {}
for ptype, wdict in bucket.items():
    data = []
    for w, scores in wdict.items():
        scores = np.array(scores, dtype=np.float64)
        data.append({
            "processType": ptype,
            "word": w,
            "mean_abs": float(np.mean(np.abs(scores))),
            "mean_signed": float(np.mean(scores)),
            "n_docs": len(doc_sets[ptype][w]),
        })
    # Rank by mean |IG|; ties are broken by the number of supporting documents.
    t = pd.DataFrame(data).sort_values(["mean_abs", "n_docs"], ascending=False).reset_index(drop=True)
    pat_tables[ptype] = t

ptypes = sorted(pat_tables.keys())
print("patterns:", ptypes)
display(pat_tables[ptypes[0]].head(20))


OUT_JSONL: /content/drive/MyDrive/ig_runs/ig_steps20_span_title_body_top25.jsonl
Loaded rows: (420, 8)
processType
AUTO_EC      30
AUTO_ET      30
AUTO_GB      30
AUTO_IS      30
AUTO_LC      30
AUTO_PO      30
AUTO_SO      30
DIRECT_EC    30
DIRECT_ET    30
DIRECT_GB    30
DIRECT_IS    30
DIRECT_LC    30
DIRECT_PO    30
DIRECT_SO    30
Name: count, dtype: int64
patterns: ['AUTO_EC', 'AUTO_ET', 'AUTO_GB', 'AUTO_IS', 'AUTO_LC', 'AUTO_PO', 'AUTO_SO', 'DIRECT_EC', 'DIRECT_ET', 'DIRECT_GB', 'DIRECT_IS', 'DIRECT_LC', 'DIRECT_PO', 'DIRECT_SO']


Unnamed: 0,processType,word,mean_abs,mean_signed,n_docs
0,AUTO_EC,판단하라.[카테고리]EC[제목]주담대,2.085938,-2.085938,1
1,AUTO_EC,첫마을,1.23877,1.23877,1
2,AUTO_EC,"판단하라.[카테고리]EC[제목]공정위,",1.129883,1.129883,1
3,AUTO_EC,허위제출”[본문]▲,1.041992,1.041992,1
4,AUTO_EC,"월드,",0.972168,0.972168,1
5,AUTO_EC,최대0.03%p,0.812012,0.812012,1
6,AUTO_EC,"판단하라.[카테고리]EC[제목]BNK금융그룹,",0.737671,0.737671,1
7,AUTO_EC,‘그린킥오프’[본문]하나은행은,0.706726,0.706726,1
8,AUTO_EC,‘가이드봇’,0.682007,0.682007,1
9,AUTO_EC,고발…“대기업집단,0.66626,0.66626,1


셀 B) i_k산출, 파일로 저장

In [None]:
import os, json

TOP_IK = 50  # top-N words per pattern, the baseline compared against H_k (30-100 is a reasonable range)

# Name the export folder after the JSONL file the tables were computed from,
# so the exports can always be traced back to their source data even when
# RUN_ID has been reassigned in between.
source_run = os.path.splitext(os.path.basename(OUT_JSONL))[0]
OUT_IK_DIR = os.path.join(os.path.dirname(OUT_JSONL), "Ik_exports_" + source_run)
os.makedirs(OUT_IK_DIR, exist_ok=True)
print("OUT_IK_DIR:", OUT_IK_DIR)

# 1) JSON with the top-N words per pattern, including the evidence paths
Ik = {}
for p in ptypes:
    t = pat_tables[p].head(TOP_IK)
    Ik[p] = []
    for _, row in t.iterrows():
        w = row["word"]
        Ik[p].append({
            "word": w,
            "mean_abs": float(row["mean_abs"]),
            "mean_signed": float(row["mean_signed"]),
            "n_docs": int(row["n_docs"]),
            "example_paths": examples[p].get(w, [])  # at most 5
        })

ik_json_path = os.path.join(OUT_IK_DIR, f"Ik_top{TOP_IK}_by_pattern.json")
with open(ik_json_path, "w", encoding="utf-8") as f:
    json.dump(Ik, f, ensure_ascii=False, indent=2)
print("Saved:", ik_json_path)

# 2) Full long-format table (teammates can filter it as needed)
long_df = pd.concat([pat_tables[p] for p in ptypes], ignore_index=True)
long_csv_path = os.path.join(OUT_IK_DIR, "Ik_long_table.csv")
long_df.to_csv(long_csv_path, index=False, encoding="utf-8-sig")
print("Saved:", long_csv_path)

# 3) Table holding only the top-N rows of every pattern
top_df = pd.concat([pat_tables[p].head(TOP_IK) for p in ptypes], ignore_index=True)
top_csv_path = os.path.join(OUT_IK_DIR, f"Ik_vocab_top{TOP_IK}.csv")
top_df.to_csv(top_csv_path, index=False, encoding="utf-8-sig")
print("Saved:", top_csv_path)

display(top_df.head(30))

OUT_IK_DIR: /content/drive/MyDrive/ig_runs/Ik_exports_ig_steps20_span_title_top25
Saved: /content/drive/MyDrive/ig_runs/Ik_exports_ig_steps20_span_title_top25/Ik_top50_by_pattern.json
Saved: /content/drive/MyDrive/ig_runs/Ik_exports_ig_steps20_span_title_top25/Ik_long_table.csv
Saved: /content/drive/MyDrive/ig_runs/Ik_exports_ig_steps20_span_title_top25/Ik_vocab_top50.csv


Unnamed: 0,processType,word,mean_abs,mean_signed,n_docs
0,AUTO_EC,판단하라.[카테고리]EC[제목]주담대,2.085938,-2.085938,1
1,AUTO_EC,첫마을,1.23877,1.23877,1
2,AUTO_EC,"판단하라.[카테고리]EC[제목]공정위,",1.129883,1.129883,1
3,AUTO_EC,허위제출”[본문]▲,1.041992,1.041992,1
4,AUTO_EC,"월드,",0.972168,0.972168,1
5,AUTO_EC,최대0.03%p,0.812012,0.812012,1
6,AUTO_EC,"판단하라.[카테고리]EC[제목]BNK금융그룹,",0.737671,0.737671,1
7,AUTO_EC,‘그린킥오프’[본문]하나은행은,0.706726,0.706726,1
8,AUTO_EC,‘가이드봇’,0.682007,0.682007,1
9,AUTO_EC,고발…“대기업집단,0.66626,0.66626,1


셀 C) “팀원이 H_k 만들 때” 바로 비교 가능한 형태로 추가 저장(집합 버전)

In [None]:
# Word-only view of Ik: pattern -> list of its top words, in ranking order.
Ik_sets = {}
for pattern, entries in Ik.items():
    Ik_sets[pattern] = [entry["word"] for entry in entries]

ik_set_path = os.path.join(OUT_IK_DIR, f"Ik_sets_top{TOP_IK}.json")
with open(ik_set_path, "w", encoding="utf-8") as f:
    json.dump(Ik_sets, f, ensure_ascii=False, indent=2)

print("Saved:", ik_set_path)
print("Example:", list(Ik_sets.items())[:1])


Saved: /content/drive/MyDrive/ig_runs/Ik_exports_ig_steps20_span_title_top25/Ik_sets_top50.json
Example: [('AUTO_EC', ['판단하라.[카테고리]EC[제목]주담대', '첫마을', '판단하라.[카테고리]EC[제목]공정위,', '허위제출”[본문]▲', '월드,', '최대0.03%p', '판단하라.[카테고리]EC[제목]BNK금융그룹,', '‘그린킥오프’[본문]하나은행은', '‘가이드봇’', '고발…“대기업집단', '中알리바바와', '롯데호텔', '수출기업', '\\"인중합체의', '우리금융타운역입니다\\"', '부울경', '변동금리', "챌린지'", "보',", '판단하라.[카테고리]EC[제목]獨서', '호반건설', '허용할까[본문]지난해', "'동남권", '추가', '명동역,', "'탄탄크림", '유명인과', '올려…낮추길', '드라마에', '인하', '판단하라.[카테고리]EC[제목]올리브영', '살리겠다\\"[본문]문재인', '판단하라.[카테고리]EC[제목]IBK기업은행', '기록[본문]세종시1-5생할권', '연간1000명', '‘오늘을', '대전시민', '판단하라.[카테고리]EC[제목]풀무원,', '활용…건보공단', "포럼'", 'K리그와', '개편에', '큐셀,', "'이커머스", '‘위키드와이프’와', '도입[본문]롯데호텔', '지원\\"[본문]하나은행이', '더하기', '송중기와', '계약[본문]한화솔루션'])]


3) 팀원에게 넘길 산출물(정리)

당신이 만들어서 팀원에게 주면 되는 파일은 2개면 충분합니다.

Ik_sets_top50.json

패턴별 top-50 단어 리스트(집합 비교용)

Ik_top50_by_pattern.json

같은 단어 리스트 + 기여도 + 근거 문서 path(“왜 New인지” 정성 근거 제시용)

팀원은 자기 쪽에서:

H_k를 정의하고,

$\mathrm{New}_k = I_k \setminus H_k$ 를 계산하고,

example_paths로 근거 문서 열어서 해석하면 됩니다.