In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
import zipfile
import pandas as pd
import numpy as np
from collections import defaultdict

# -----------------------------
# Welford: mean/variance online
# -----------------------------
class Welford:
    """Online accumulator for the mean and sample variance of a value stream.

    Values are fed in batches through ``update_array``; only three scalars
    (count, running mean, running sum of squared deviations) are kept, so the
    full stream never has to be held in memory.
    """

    __slots__ = ("n", "mean", "M2")

    def __init__(self):
        self.n = 0        # number of non-missing values seen so far
        self.mean = 0.0   # running mean of those values
        self.M2 = 0.0     # running sum of squared deviations from the mean

    def update_array(self, x: np.ndarray):
        """Fold a batch of values into the running statistics.

        Missing entries (anything ``pd.isna`` reports as missing) are skipped.
        The batch is summarised with vectorized NumPy operations and merged
        into the running state with the pairwise-combination formula, instead
        of visiting every element in a Python-level loop.

        Args:
            x: One-dimensional array-like of numeric values.
        """
        values = np.asarray(x).ravel()
        values = values[~pd.isna(values)].astype(float)
        batch_n = int(values.size)
        if batch_n == 0:
            return

        batch_mean = float(values.mean())
        batch_m2 = float(np.square(values - batch_mean).sum())

        # Merge (n, mean, M2) of the running state with that of the batch.
        total = self.n + batch_n
        delta = batch_mean - self.mean
        self.mean += delta * batch_n / total
        self.M2 += batch_m2 + delta * delta * self.n * batch_n / total
        self.n = total

    def variance(self):
        """Return the sample variance (ddof=1), or NaN with fewer than two values."""
        if self.n < 2:
            return np.nan
        return self.M2 / (self.n - 1)

# -----------------------------
# Utility
# -----------------------------
# Maps an answer letter to its position in a 4-slot per-item choice-count array.
LETTER_TO_IDX = {"a": 0, "b": 1, "c": 2, "d": 3}

def safe_entropy(p: np.ndarray) -> float:
    """Return the natural-log Shannon entropy of the positive entries of ``p``.

    Non-positive entries are dropped first. When at most one positive entry
    remains the result is 0.0.
    """
    positive = p[p > 0]
    if positive.size > 1:
        weighted_logs = positive * np.log(positive)
        return float(-weighted_logs.sum())
    return 0.0

# -----------------------------
# Main
# -----------------------------
def build_item_master_table(
    zip_path: str,
    questions_csv_path: str,
    irt_b_csv_path: str,
    confusion_csv_path: str,
    out_dir: str,
    out_prefix: str = "toeic_item",
    chunksize: int = 200_000,
    part_to_K: dict | None = None,
):
    """Build a per-item master table and write it to CSV.

    Steps:
      1. Read question metadata (``part`` and a tag count) from the questions CSV.
      2. Stream every ``.csv`` member of the zip archive and aggregate, per
         question id: attempt count, counts of answers a/b/c/d, and the
         mean/variance of ``elapsed_time``.
      3. Left-join the question metadata onto the aggregates.
      4. Compute part-aware choice rates, the dominant choice rate and a
         normalised choice entropy.
      5. Left-join the IRT table (its ``b`` column is renamed to ``irt_b``).
      6. Left-join the question-level confusion columns (``q_*``).

    Args:
        zip_path: Zip archive whose ``.csv`` members contain the columns
            ``question_id``, ``user_answer`` and ``elapsed_time``.
        questions_csv_path: CSV with a ``question_id`` (or ``item_id``) column,
            a ``part`` column and optionally a ``;``-separated ``tags`` column.
        irt_b_csv_path: CSV with ``item_id`` and ``b`` columns.
        confusion_csv_path: CSV with ``question_id`` (or ``item_id``) and the
            ``q_*`` columns listed in ``conf_cols`` below.
        out_dir: Output directory; created if missing.
        out_prefix: The output file is ``{out_prefix}_item_master.csv``.
        chunksize: Number of rows per chunk when reading each archive member.
        part_to_K: Mapping from part to number of answer options. Parts not in
            the mapping are treated as having 4 options.

    Returns:
        The assembled item-level ``pandas.DataFrame`` (also saved to disk).
    """
    os.makedirs(out_dir, exist_ok=True)

    # Only Part 2 is three-choice in the default mapping.
    if part_to_K is None:
        part_to_K = {1: 4, 2: 3, 3: 4, 4: 4, 5: 4, 6: 4, 7: 4}

    # -----------------------------
    # 1) questions.csv
    # -----------------------------
    q = pd.read_csv(questions_csv_path)
    if "question_id" not in q.columns:
        q = q.rename(columns={"item_id": "question_id"})

    q["question_id"] = q["question_id"].astype(str).str.strip()

    if "tags" in q.columns:
        q["n_tags"] = q["tags"].fillna("").astype(str).apply(
            lambda s: 0 if s.strip() == "" else len(s.split(";"))
        )
    else:
        q["n_tags"] = np.nan

    # `errors="ignore"` is deprecated in pandas; catching the exception keeps
    # the same behaviour (leave the column untouched if it is not numeric).
    try:
        q["part"] = pd.to_numeric(q["part"])
    except (ValueError, TypeError):
        pass
    q_meta = q[["question_id", "part", "n_tags"]].drop_duplicates("question_id")

    # -----------------------------
    # 2) KT1 zip -> item aggregation
    # -----------------------------
    attempts = defaultdict(int)
    choice_cnt = defaultdict(lambda: np.zeros(4, dtype=np.int64))
    rt_stats = defaultdict(Welford)
    valid_answers = list(LETTER_TO_IDX)

    with zipfile.ZipFile(zip_path, "r") as z:
        names = [n for n in z.namelist() if n.endswith(".csv")]
        print("CSV files in zip:", len(names))

        for i, name in enumerate(names, 1):
            with z.open(name) as f:
                for chunk in pd.read_csv(
                    f,
                    chunksize=chunksize,
                    usecols=["question_id", "user_answer", "elapsed_time"],
                ):
                    chunk["question_id"] = chunk["question_id"].astype(str).str.strip()
                    # Strip as well as lower-case so answers such as "a " are
                    # not silently dropped by the isin() filter below.
                    chunk["user_answer"] = (
                        chunk["user_answer"].astype(str).str.strip().str.lower()
                    )

                    # attempts
                    for qid, cnt in chunk["question_id"].value_counts().items():
                        attempts[qid] += int(cnt)

                    # choice counts
                    sub = chunk[chunk["user_answer"].isin(valid_answers)]
                    gb = sub.groupby(["question_id", "user_answer"]).size()
                    for (qid, ans), cnt in gb.items():
                        choice_cnt[qid][LETTER_TO_IDX[ans]] += int(cnt)

                    # response time; unparseable values become NaN and are
                    # skipped by the accumulator
                    rt = pd.to_numeric(chunk["elapsed_time"], errors="coerce")
                    for qid, g in rt.groupby(chunk["question_id"]):
                        rt_stats[qid].update_array(g.to_numpy())

            if i % 5000 == 0:
                print(f"processed {i}/{len(names)} files...")

    rows = []
    for item_id, n in attempts.items():
        wf = rt_stats[item_id]
        cnts = choice_cnt[item_id]
        rows.append({
            "item_id": item_id,
            "n_attempts": n,
            "choice_cnt_a": cnts[0],
            "choice_cnt_b": cnts[1],
            "choice_cnt_c": cnts[2],
            "choice_cnt_d": cnts[3],
            "rt_mean": wf.mean if wf.n > 0 else np.nan,
            "rt_var": wf.variance(),
        })
    # Explicit columns keep the later merges valid even when no rows were read.
    agg_cols = [
        "item_id", "n_attempts",
        "choice_cnt_a", "choice_cnt_b", "choice_cnt_c", "choice_cnt_d",
        "rt_mean", "rt_var",
    ]
    item_df = pd.DataFrame(rows, columns=agg_cols)

    # -----------------------------
    # 3) merge question meta
    # -----------------------------
    item_df = item_df.merge(
        q_meta, left_on="item_id", right_on="question_id", how="left"
    ).drop(columns="question_id")

    # -----------------------------
    # 4) part-aware rates
    # -----------------------------
    rate_cols = [
        "K_part",
        "choice_rate_a", "choice_rate_b", "choice_rate_c", "choice_rate_d",
        "dominant_choice_rate", "choice_entropy_norm",
    ]

    def calc_rates(part, counts):
        """Return the seven `rate_cols` values for one item."""
        K = part_to_K.get(part, 4)
        # Only the first K options are valid for this part.
        cnts = np.asarray(counts, dtype=float)[:K]
        tot = cnts.sum()
        if tot == 0:
            return [K] + [np.nan] * 6

        p = cnts / tot
        # log(K) is 0 for K == 1, so the normalised entropy is undefined there.
        entropy_norm = safe_entropy(p) / np.log(K) if K > 1 else np.nan
        return [
            K,
            p[0] if K > 0 else np.nan,
            p[1] if K > 1 else np.nan,
            p[2] if K > 2 else np.nan,
            p[3] if K > 3 else np.nan,
            p.max(),
            entropy_norm,
        ]

    rate_rows = [
        calc_rates(part, counts)
        for part, *counts in zip(
            item_df["part"],
            item_df["choice_cnt_a"],
            item_df["choice_cnt_b"],
            item_df["choice_cnt_c"],
            item_df["choice_cnt_d"],
        )
    ]
    # Building the frame from a list (rather than a row-wise apply) also works
    # when item_df is empty; dtype=float matches the previous column dtypes.
    rates = pd.DataFrame(
        rate_rows, columns=rate_cols, index=item_df.index, dtype=float
    )
    item_df = pd.concat([item_df, rates], axis=1)

    # -----------------------------
    # 5) IRT merge
    # -----------------------------
    irt = pd.read_csv(irt_b_csv_path).rename(columns={"b": "irt_b"})
    # Normalise ids the same way as question_id so the join keys line up.
    irt["item_id"] = irt["item_id"].astype(str).str.strip()
    item_df = item_df.merge(irt, on="item_id", how="left")

    # -----------------------------
    # 6) confusion merge (all q_* columns)
    # -----------------------------
    conf = pd.read_csv(confusion_csv_path)
    conf = conf.rename(columns={"question_id": "item_id"})
    conf["item_id"] = conf["item_id"].astype(str).str.strip()

    conf_cols = [
        "item_id",
        "q_n_attempt",
        "q_accuracy",
        "q_confusion_mean",
        "q_confusion_median",
        "q_confusion_rate",
        "q_rt_median",
        "q_change_rate",
        "q_change_mean",
    ]
    item_df = item_df.merge(conf[conf_cols], on="item_id", how="left")

    # -----------------------------
    # save
    # -----------------------------
    out_path = os.path.join(out_dir, f"{out_prefix}_item_master.csv")
    item_df.to_csv(out_path, index=False)
    print("saved:", out_path)
    print("columns:", list(item_df.columns))

    return item_df


# -----------------------------
# Run (paths for this Drive layout)
# -----------------------------
ZIP_PATH = "/content/drive/MyDrive/EdNET_KT/KT1.zip"
QUESTIONS = "/content/drive/MyDrive/EdNET_KT/questions.csv"
IRT_B = "/content/drive/MyDrive/EdNET_KT/part125_final_optimized/item_b_optimized.csv"

CONF_DRIVE = "/content/drive/MyDrive/EdNET_KT/question_level_confusion (1).csv"
CONF_UPLOADED = "/mnt/data/question_level_confusion (1).csv"  # fallback in case the Drive copy cannot be read

# Use the uploaded copy only when the Drive copy is missing and the upload
# exists; otherwise keep the Drive path so any error names the primary file.
CONF_PATH = (
    CONF_UPLOADED
    if not os.path.exists(CONF_DRIVE) and os.path.exists(CONF_UPLOADED)
    else CONF_DRIVE
)

OUT_DIR = "/content/drive/MyDrive/EdNET_KT/item_master_outputs"
OUT_PREFIX = "toeic_item"

df_master = build_item_master_table(
    zip_path=ZIP_PATH,
    questions_csv_path=QUESTIONS,
    irt_b_csv_path=IRT_B,
    confusion_csv_path=CONF_PATH,   # a single path is all that is needed
    out_dir=OUT_DIR,
    out_prefix=OUT_PREFIX,
    part_to_K={1:4, 2:3, 3:4, 4:4, 5:4, 6:4, 7:4}
)

  q["part"] = pd.to_numeric(q["part"], errors="ignore")


CSV files in zip: 23477
processed 5000/23477 files...
processed 10000/23477 files...
processed 15000/23477 files...
processed 20000/23477 files...
saved: /content/drive/MyDrive/EdNET_KT/item_master_outputs/toeic_item_item_master.csv
columns: ['item_id', 'n_attempts', 'choice_cnt_a', 'choice_cnt_b', 'choice_cnt_c', 'choice_cnt_d', 'rt_mean', 'rt_var', 'part', 'n_tags', 'K_part', 'choice_rate_a', 'choice_rate_b', 'choice_rate_c', 'choice_rate_d', 'dominant_choice_rate', 'choice_entropy_norm', 'irt_b', 'q_n_attempt', 'q_accuracy', 'q_confusion_mean', 'q_confusion_median', 'q_confusion_rate', 'q_rt_median', 'q_change_rate', 'q_change_mean']
