### Outlier Filtering

In [3]:
import pandas as pd
import numpy as np

attempt = pd.read_csv("/Users/ianian/Desktop/26_1_EDA/2026_EDA_education/output_v3/attempt_with_confusion.csv")

# elapsed_time 상위 빈도 확인
print("Top elapsed_time values:")
print(attempt["elapsed_time"].value_counts().head(20))

# 상위 10개 값 퍼센트
vc = attempt["elapsed_time"].value_counts(normalize=True).head(10)
print("\nTop elapsed_time values (ratio):")
print(vc)

# 90000 비율
cap = 90000
print("\nRatio elapsed_time == 90000:", (attempt["elapsed_time"] == cap).mean())

# 문항별 90000 비율 상위
q_cap = (attempt.assign(is_cap=(attempt["elapsed_time"]==cap).astype(int))
         .groupby("question_id")["is_cap"].mean()
         .sort_values(ascending=False)
         .head(20))
print("\nTop questions by cap ratio:")
print(q_cap)

Top elapsed_time values:
elapsed_time
17000    696219
18000    687332
19000    631749
20000    603124
16000    598279
21000    596843
22000    584005
23000    545049
24000    489904
15000    448770
25000    432665
26000    380076
27000    335010
14000    333296
28000    296572
13000    274321
29000    264848
12000    249469
30000    239669
11000    231038
Name: count, dtype: int64

Top elapsed_time values (ratio):
elapsed_time
17000    0.034718
18000    0.034275
19000    0.031503
20000    0.030076
16000    0.029834
21000    0.029763
22000    0.029122
23000    0.027180
24000    0.024430
15000    0.022379
Name: proportion, dtype: float64

Ratio elapsed_time == 90000: 0.0003392440826018201

Top questions by cap ratio:
question_id
q8446     0.009662
q17322    0.008130
q7277     0.008078
q7278     0.008065
q3712     0.008000
q17508    0.007273
q17509    0.007273
q17510    0.007273
q3662     0.007246
q4963     0.007143
q7433     0.007067
q10247    0.007042
q7432     0.007042
q8473     0.0066

### Scoring after Filtering Process

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupShuffleSplit

attempt = pd.read_csv("/Users/ianian/Desktop/26_1_EDA/2026_EDA_education/output_v3/attempt_with_confusion.csv")

# -------------------------
# (0) RT 캡 제거
# -------------------------
CAP = 90000
before = len(attempt)
attempt = attempt[attempt["elapsed_time"] < CAP].copy()
print("Removed cap rows:", before - len(attempt), " / remaining:", len(attempt))

# -------------------------
# (1) features
# -------------------------
attempt["RT_star"] = np.log1p(attempt["elapsed_time"].clip(lower=0))

# Change는 flag로 (희소성 해결)
attempt["Change"] = (attempt["response_change"] > 0).astype(int)

# proxy는 incorrect 추천
attempt["incorrect"] = (attempt["correct"] == 0).astype(int)

def zscore(s):
    s = pd.to_numeric(s, errors="coerce").astype(float)
    mu = np.nanmean(s); sd = np.nanstd(s)
    if not np.isfinite(sd) or sd == 0: 
        return s*0.0
    return (s-mu)/sd

# global 표준화(권장)
attempt["x1_rt"] = zscore(attempt["RT_star"])
attempt["x2_ch"] = zscore(attempt["Change"])

# -------------------------
# (2) alpha,beta 추정 (user-group split)
# -------------------------
X = attempt[["x1_rt","x2_ch"]].values
y = attempt["incorrect"].values
groups = attempt["user_id"].values if "user_id" in attempt.columns else None

if groups is not None:
    gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    tr, va = next(gss.split(X, y, groups=groups))
    X_train, y_train = X[tr], y[tr]
else:
    # user_id 없으면 그냥 전체 학습
    X_train, y_train = X, y

clf = LogisticRegression(
    solver="liblinear",
    class_weight="balanced",
    C=1.0,
    random_state=42
)
clf.fit(X_train, y_train)

w_rt, w_ch = clf.coef_[0]
alpha = abs(w_rt); beta = abs(w_ch)
s = alpha + beta
alpha_n, beta_n = (alpha/s, beta/s) if s != 0 else (0.5, 0.5)

print(f"w_rt={w_rt:.4f}, w_ch={w_ch:.4f} -> alpha'={alpha_n:.3f}, beta'={beta_n:.3f}")

# -------------------------
# (3) Confusion_attempt + confused_attempt (top 10%)
# -------------------------
attempt["Confusion_attempt"] = alpha_n*attempt["x1_rt"] + beta_n*attempt["x2_ch"]

TOP_PCT = 0.10
thr = attempt["Confusion_attempt"].quantile(1 - TOP_PCT)
attempt["confused_attempt"] = (attempt["Confusion_attempt"] >= thr).astype(int)

# -------------------------
# (4) question-level 집계
# -------------------------
q = (attempt.groupby("question_id", as_index=False)
     .agg(
        q_n_attempt=("question_id","size"),
        q_accuracy=("correct","mean"),
        q_confusion_mean=("Confusion_attempt","mean"),
        q_confusion_median=("Confusion_attempt","median"),
        q_confusion_rate=("confused_attempt","mean"),
        q_change_rate=("Change","mean"),
        q_rt_median=("elapsed_time","median"),
     ))

print("\nTop-20 by q_confusion_mean:")
print(q.sort_values("q_confusion_mean", ascending=False).head(20))

# 저장
attempt.to_csv("/Users/ianian/Desktop/26_1_EDA/2026_EDA_education/output_filtering/attempt_with_confusion_recalc.csv", index=False)
q.to_csv("/Users/ianian/Desktop/26_1_EDA/2026_EDA_education/output_filtering/question_level_confusion_recalc.csv", index=False)
print("Saved recalc files.")

In [5]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupShuffleSplit

attempt = pd.read_csv("/Users/ianian/Desktop/26_1_EDA/2026_EDA_education/output_v3/attempt_with_confusion.csv")

# 1) feature 재구성
attempt["RT_star"] = np.log1p(attempt["elapsed_time"].clip(lower=0))
attempt["Change"] = (attempt["response_change"] > 0).astype(int)
attempt["incorrect"] = (attempt["correct"] == 0).astype(int)

def zscore(s):
    s = pd.to_numeric(s, errors="coerce").astype(float)
    mu, sd = np.nanmean(s), np.nanstd(s)
    return (s-mu)/sd if sd and sd>0 else s*0.0

# 2) global 표준화
attempt["x1_rt"] = zscore(attempt["RT_star"])
attempt["x2_ch"] = zscore(attempt["Change"])

# 3) alpha,beta 추정 (proxy=incorrect)
X = attempt[["x1_rt","x2_ch"]].values
y = attempt["incorrect"].values
groups = attempt["user_id"].values

gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
tr, va = next(gss.split(X, y, groups=groups))

clf = LogisticRegression(solver="liblinear", class_weight="balanced", random_state=42)
clf.fit(X[tr], y[tr])

w_rt, w_ch = clf.coef_[0]
alpha, beta = abs(w_rt), abs(w_ch)
s = alpha + beta
alpha_n, beta_n = (alpha/s, beta/s) if s else (0.5, 0.5)
print("alpha', beta' =", alpha_n, beta_n)

# 4) Confusion 재계산
attempt["Confusion_attempt"] = alpha_n*attempt["x1_rt"] + beta_n*attempt["x2_ch"]

# 5) ✅ confused_attempt를 상위 10%로
TOP_PCT = 0.10
thr = attempt["Confusion_attempt"].quantile(1 - TOP_PCT)
attempt["confused_attempt"] = (attempt["Confusion_attempt"] >= thr).astype(int)

# 6) question-level 집계
q = (attempt.groupby("question_id", as_index=False)
     .agg(
        q_n_attempt=("question_id","size"),
        q_accuracy=("correct","mean"),
        q_confusion_mean=("Confusion_attempt","mean"),
        q_confusion_median=("Confusion_attempt","median"),
        q_confusion_rate=("confused_attempt","mean"),
        q_change_rate=("Change","mean"),
        q_rt_median=("elapsed_time","median"),
     ))

print("\nTop-20 confused questions:")
print(q.sort_values("q_confusion_mean", ascending=False).head(20))

attempt.to_csv("/Users/ianian/Desktop/26_1_EDA/2026_EDA_education/output_filtering/attempt_with_confusion_recalc.csv", index=False)
q.to_csv("/Users/ianian/Desktop/26_1_EDA/2026_EDA_education/output_filtering/question_level_confusion_recalc.csv", index=False)

alpha', beta' = 0.8057432881710683 0.19425671182893162

Top-20 confused questions:
     question_id  q_n_attempt  q_accuracy  q_confusion_mean  \
9016       q7381         2901    0.251640          1.059954   
9015       q7380         2904    0.301034          1.059394   
9013       q7379         2903    0.750604          1.057062   
9017       q7382         2903    0.494308          1.052900   
9018       q7383         2903    0.560883          1.050957   
2640      q17451          954    0.822851          1.040165   
2642      q17453          954    0.738994          1.037659   
2641      q17452          957    0.407524          1.036961   
2644      q17455          957    0.516196          1.036961   
2643      q17454          956    0.630753          1.036373   
9175       q7540         4522    0.365529          0.981919   
9176       q7541         4516    0.348492          0.979933   
9178       q7543         4520    0.313317          0.975864   
9177       q7542         4521    0.

OSError: [Errno 28] No space left on device