In [1]:
!pip install xgboost scikit-learn

Collecting xgboost
  Downloading xgboost-3.0.4-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting scipy (from xgboost)
  Downloading scipy-1.16.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (61 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading xgboost-3.0.4-py3-none-manylinux_2_28_x86_64.whl (94.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 MB[0m [31m148.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading scikit_learn-1.7.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m245.7 MB/

In [3]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m280.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytz-2025.2-py2.py3-none-any.whl (509 kB)
Downloading tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.3.2 pytz-2025.2 tzdata-2025.2
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mp

In [4]:
!pip install numpy scipy

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [None]:
# ============================================
# XGBoost (GPU) Rich-Features Pipeline
#  - Text: Hashing(word 1-2) + Hashing(char_wb 3-5)
#  - Example Sims: body ↔ (pos/neg) cosine (dot product of L2-normalized hashing vectors)
#  - Spam/Pattern: URLs / domains / digits / uppercase ratio / keywords
#  - Subreddit KFold Target Encoding (leakage-safe)
#  - (Optional) sentiment probs (train/test_with_sentiment_ctx.csv)
#  - Model: XGBoost (binary:logistic), 5-fold OOF AUC + submission
# ============================================

import os, re, gc
import numpy as np
import pandas as pd
import scipy.sparse as sp
from pathlib import Path
import joblib 

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics import roc_auc_score

import xgboost as xgb
from xgboost import XGBClassifier
from xgboost.callback import EarlyStopping
from packaging import version

# ---------------------------
# 0) Paths
# ---------------------------
# Input CSVs, resolved relative to the working directory.
TRAIN_PATH, TEST_PATH = "train.csv", "test.csv"
# Optional sentiment-augmented copies of the two inputs.
SENT_TRAIN, SENT_TEST = "train_with_sentiment_ctx.csv", "test_with_sentiment_ctx.csv"
# Where the submission file is written.
OUT_PATH = "submission_xgb_gpu_rich.csv"

# ---------------------------
# 1) Load & clean
# ---------------------------
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

def _clean(s):
    if pd.isna(s): return ""
    s = str(s).strip()
    return " ".join(s.split())

# Normalize every known text column in both frames; a column that a frame
# does not have is skipped.
for frame in (train, test):
    for text_col in (
        "body", "subreddit", "rule",
        "positive_example_1", "positive_example_2",
        "negative_example_1", "negative_example_2",
    ):
        if text_col in frame.columns:
            frame[text_col] = frame[text_col].map(_clean)

# Minimal schema checks: the id column everywhere, the label in train.
assert {"row_id", "rule_violation"}.issubset(train.columns), "train.csv에 row_id, rule_violation 필요"
assert "row_id" in test.columns, "test.csv에 row_id 필요"

# ---------------------------
# 2) Combined text → Hashing vectors 
# ---------------------------
def combine_text(df: pd.DataFrame) -> pd.Series:
    """Build one model-input string per row from subreddit, rule and body.

    Each row becomes
    ``"subreddit: <subreddit> [SEP] rule: <rule> [SEP] body: <body>"``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame that may contain the columns ``subreddit``, ``rule`` and
        ``body``. A missing column contributes an empty string for every row.

    Returns
    -------
    pd.Series
        String Series aligned to ``df.index``.
    """
    def _text_col(name: str) -> pd.Series:
        # DataFrame.get returns the default for a missing column. A plain ""
        # default has no .astype, so fall back to an all-empty Series aligned
        # to df.index instead.
        col = df.get(name)
        if col is None:
            return pd.Series("", index=df.index, dtype=object)
        return col.astype(str)

    subr = _text_col("subreddit")
    rule = _text_col("rule")
    body = _text_col("body")
    return ("subreddit: " + subr + " [SEP] rule: " + rule + " [SEP] body: " + body)

train_text, test_text = combine_text(train), combine_text(test)

# Word-level hashing: 1-2 grams into 2**19 buckets, L2-normalized rows,
# non-negative values (alternate_sign=False).
HV_WORD = HashingVectorizer(
    n_features=2**19,
    ngram_range=(1, 2),
    norm="l2",
    alternate_sign=False,
)
# Character-level hashing: word-bounded 3-5 grams into 2**17 buckets.
HV_CHAR = HashingVectorizer(
    analyzer="char_wb",
    n_features=2**17,
    ngram_range=(3, 5),
    norm="l2",
    alternate_sign=False,
)

# Transform both splits with each vectorizer and store them as float32 CSR.
Xw_tr, Xc_tr = (
    sp.csr_matrix(hv.transform(train_text), dtype=np.float32) for hv in (HV_WORD, HV_CHAR)
)
Xw_te, Xc_te = (
    sp.csr_matrix(hv.transform(test_text), dtype=np.float32) for hv in (HV_WORD, HV_CHAR)
)

# ---------------------------
# 3) Example similarity features (cosine via l2-normalized hashing)
# ---------------------------
def rowwise_cos_by_dot(A, B):
    """Return the row-wise dot product of two equally-shaped sparse inputs.

    When the rows of both inputs are L2-normalized, the dot product of a row
    pair is their cosine similarity.

    Parameters
    ----------
    A, B : scipy sparse matrix or sparse array
        Same shape ``(n_rows, n_features)``.

    Returns
    -------
    np.ndarray
        1-D array of length ``n_rows``.
    """
    # sum(axis=1) yields an np.matrix for sparse *matrix* inputs but a plain
    # ndarray for sparse *array* inputs; `.A1` exists only on the former, so
    # flatten through np.asarray to support both.
    return np.asarray(A.multiply(B).sum(axis=1)).ravel()

def example_sims(df: pd.DataFrame):
    """Similarity features between each body and its four example texts.

    Every text column is hashed with the module-level ``HV_CHAR`` vectorizer
    and compared to ``body`` via ``rowwise_cos_by_dot``. (``HV_CHAR`` is
    configured with ``norm="l2"`` in this notebook, which is what makes the
    dot product a cosine similarity.)

    Parameters
    ----------
    df : pd.DataFrame
        May contain ``body``, ``positive_example_1/2`` and
        ``negative_example_1/2``. A missing column is treated as empty text
        for every row.

    Returns
    -------
    pd.DataFrame
        One row per input row with the four raw similarities plus their
        max / min / mean per polarity and ``sim_pos_minus_neg``
        (max positive minus max negative).
    """
    def _hashed(col_name):
        # DataFrame.get returns None for a missing column; the vectorizer
        # needs an iterable of documents, so substitute all-empty text.
        col = df.get(col_name)
        if col is None:
            col = pd.Series("", index=df.index, dtype=object)
        return sp.csr_matrix(HV_CHAR.transform(col), dtype=np.float32)

    body = _hashed("body")
    sp1 = rowwise_cos_by_dot(body, _hashed("positive_example_1"))
    sp2 = rowwise_cos_by_dot(body, _hashed("positive_example_2"))
    sn1 = rowwise_cos_by_dot(body, _hashed("negative_example_1"))
    sn2 = rowwise_cos_by_dot(body, _hashed("negative_example_2"))

    # Computed once and reused for both the *_max columns and the difference.
    pos_max = np.maximum(sp1, sp2)
    neg_max = np.maximum(sn1, sn2)

    feats = pd.DataFrame({
        "sim_pos1": sp1, "sim_pos2": sp2, "sim_neg1": sn1, "sim_neg2": sn2,
        "sim_pos_max": pos_max,
        "sim_neg_max": neg_max,
        "sim_pos_min": np.minimum(sp1, sp2),
        "sim_neg_min": np.minimum(sn1, sn2),
        "sim_pos_avg": (sp1 + sp2) / 2.0,
        "sim_neg_avg": (sn1 + sn2) / 2.0,
        "sim_pos_minus_neg": pos_max - neg_max,
    })
    return feats

sim_tr_df, sim_te_df = example_sims(train), example_sims(test)
# Wrap the dense similarity tables as float32 CSR matrices.
S_tr, S_te = (
    sp.csr_matrix(sim_df.values, dtype=np.float32) for sim_df in (sim_tr_df, sim_te_df)
)

# ---------------------------
# 4) Spam/Pattern features 
# ---------------------------
# Keyword fragments searched for in the body (as regexes, case-insensitive).
# The position of each entry is its index in KW_PATTERNS.
RULE_KEYWORDS = [
    # promotion / solicitation
    r"spam",
    r"referral",
    r"advertis",
    r"solicit",
    r"promotion",
    r"self[- ]?promo",
    # legal advice / adult content
    r"legal",
    r"advice",
    r"nsfw",
    r"porn",
    r"adult",
    r"sexual",
    # streaming
    r"stream",
    r"watch",
    r"live",
    r"hd",
    r"free",
    # downloads / links
    r"torrent",
    r"download",
    r"link",
    # scams / gambling
    r"scam",
    r"fraud",
    r"giveaway",
    r"bet",
    r"lottery",
    # commerce
    r"sell",
    r"buy",
    r"trade",
    # messaging platforms
    r"discord",
    r"telegram",
    r"whatsapp",
]
KW_PATTERNS = [re.compile(keyword, re.IGNORECASE) for keyword in RULE_KEYWORDS]

# URL prefix: "http://", "https://" or "www."
URL_RE = re.compile(r"(https?://|www\.)", re.IGNORECASE)
# E-mail address
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
# ".tld" followed by "/", space, newline or end of string
DOMAIN_RE = re.compile(r"\.[a-z]{2,6}([/ \n]|$)", re.IGNORECASE)
# Digit run of 9+ characters that may contain dashes/spaces, optional leading "+"
PHONE_RE = re.compile(r"\+?\d[\d\- ]{7,}\d")
# Currency symbol followed by digits
PRICE_RE = re.compile(r"(?:\$|£|€|₩)\s?\d+")
# A single ASCII uppercase letter
UPPER_RE = re.compile(r"[A-Z]")

def body_pattern_features(df: pd.DataFrame) -> pd.DataFrame:
    """Hand-crafted spam/pattern features computed from the ``body`` column.

    Produces counts (URLs, e-mails, domain-like suffixes, digits, exclamation
    marks), presence flags (phone number, price), length statistics, the share
    of ASCII uppercase letters among alphabetic characters, and one flag per
    entry of the module-level ``KW_PATTERNS`` (columns ``kw_00``, ``kw_01``, …).

    Parameters
    ----------
    df : pd.DataFrame
        Frame that may contain a ``body`` column. A missing column or missing
        values are treated as empty text.

    Returns
    -------
    pd.DataFrame
        All-float frame with one row per input row.
    """
    # DataFrame.get returns None for a missing column; a "" default would be a
    # plain str without .fillna/.str, so substitute an all-empty Series.
    b = df.get("body")
    if b is None:
        b = pd.Series("", index=df.index, dtype=object)
    # astype(str) keeps the .str accessor usable even for a non-string column.
    b = b.fillna("").astype(str)

    out = {
        "num_urls": b.str.count(URL_RE),
        "num_emails": b.str.count(EMAIL_RE),
        "num_domains": b.str.count(DOMAIN_RE),
        "num_digits": b.str.count(r"\d"),
        "num_exclam": b.str.count(r"!"),
        "has_phone": b.str.contains(PHONE_RE),
        "has_price": b.str.contains(PRICE_RE),
        "len_chars": b.str.len(),
        "len_tokens": b.str.split().map(len),
        # max(1, ...) guards against division by zero for bodies without letters.
        "upper_ratio": b.map(lambda s: (len(UPPER_RE.findall(s)) / max(1, sum(ch.isalpha() for ch in s)))),
    }
    # Keyword flags are evaluated on the lower-cased body.
    low = b.str.lower()
    for i, pat in enumerate(KW_PATTERNS):
        out[f"kw_{i:02d}"] = low.str.contains(pat)
    return pd.DataFrame(out).fillna(0).astype(float)

pat_tr_df, pat_te_df = body_pattern_features(train), body_pattern_features(test)
# Wrap the dense pattern tables as float32 CSR matrices.
P_tr, P_te = (
    sp.csr_matrix(pat_df.values, dtype=np.float32) for pat_df in (pat_tr_df, pat_te_df)
)

# ---------------------------
# 5) Subreddit Target Encoding (KFold, leakage-safe)
# ---------------------------
def kfold_target_encode(train_df, test_df, col, y, n_splits=5, random_state=42):
    """Out-of-fold mean-target encoding of a categorical column.

    For each stratified fold, validation rows receive the mean of ``y`` per
    category computed on the other folds only. Test rows receive the
    per-category mean over the whole training set. Categories without a
    known mean fall back to the global mean of ``y``.

    Parameters
    ----------
    train_df, test_df : pd.DataFrame
        Frames that both contain ``col``.
    col : str
        Name of the categorical column to encode.
    y : array-like
        Target values, positionally aligned with the rows of ``train_df``.
        Used for stratification and for the means; ``train_df`` does not need
        to contain a target column.
    n_splits : int
        Number of stratified folds.
    random_state : int
        Seed for the fold shuffling.

    Returns
    -------
    (np.ndarray, np.ndarray)
        Encoded train and test columns, shapes ``(len(train_df), 1)`` and
        ``(len(test_df), 1)``.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    keys = train_df[col]
    # Positional copy of the target, so neither y.name nor y's index matters.
    target = pd.Series(np.asarray(y, dtype=float))
    global_mean = float(target.mean())

    enc = np.full(len(train_df), global_mean, dtype=float)
    for tr_idx, va_idx in skf.split(train_df, y):
        # Group by a plain array so grouping is positional, not index-aligned.
        fold_means = target.iloc[tr_idx].groupby(keys.iloc[tr_idx].to_numpy()).mean()
        enc[va_idx] = keys.iloc[va_idx].map(fold_means).fillna(global_mean).to_numpy()

    full_means = target.groupby(keys.to_numpy()).mean()
    enc_test = test_df[col].map(full_means).fillna(global_mean).to_numpy()
    return enc.reshape(-1, 1), enc_test.reshape(-1, 1)

# Integer label vector used by every later step.
y = train["rule_violation"].astype(int)
TE_tr_arr, TE_te_arr = kfold_target_encode(
    train_df=train.assign(rule_violation=y),
    test_df=test,
    col="subreddit",
    y=y,
    n_splits=5,
)
# Store the single encoded column as a float32 CSR matrix per split.
TE_tr, TE_te = (sp.csr_matrix(te_arr.astype(np.float32)) for te_arr in (TE_tr_arr, TE_te_arr))

# ---------------------------
# 6) (Optional) Add sentiment probabilities
# ---------------------------
# Sentiment features are optional: both CSVs must exist, otherwise the
# matrices stay None.
sent_tr_mat = sent_te_mat = None
if Path(SENT_TRAIN).exists() and Path(SENT_TEST).exists():
    sent_cols = ["prob_negative","prob_neutral","prob_positive"]
    # Keep one row per row_id (first occurrence wins). A duplicated id would
    # multiply rows in the left merge and misalign the sentiment matrix with
    # the feature matrices built above.
    st = pd.read_csv(SENT_TRAIN)[["row_id"] + sent_cols].drop_duplicates("row_id")
    se = pd.read_csv(SENT_TEST)[["row_id"] + sent_cols].drop_duplicates("row_id")
    # Dropping stale sentiment columns first makes this step re-runnable
    # (a second merge would otherwise create *_x / *_y columns).
    # validate= makes pandas raise if the right side is not unique per row_id.
    train = (train.drop(columns=sent_cols, errors="ignore")
                  .merge(st, on="row_id", how="left", validate="many_to_one"))
    test  = (test.drop(columns=sent_cols, errors="ignore")
                 .merge(se, on="row_id", how="left", validate="many_to_one"))
    # Rows without a sentiment record get 0.0 for all three probabilities.
    sent_tr_mat = sp.csr_matrix(train[sent_cols].fillna(0.0).values.astype(np.float32))
    sent_te_mat = sp.csr_matrix(test[sent_cols].fillna(0.0).values.astype(np.float32))

# ---------------------------
# 7) Stack all features
# ---------------------------
# Column order: word hashes, char hashes, example similarities, pattern
# features, target encoding, then (if available) sentiment probabilities.
train_blocks = [Xw_tr, Xc_tr, S_tr, P_tr, TE_tr]
test_blocks = [Xw_te, Xc_te, S_te, P_te, TE_te]
if sent_tr_mat is not None:
    train_blocks.append(sent_tr_mat)
if sent_te_mat is not None:
    test_blocks.append(sent_te_mat)

X_tr = sp.hstack(train_blocks, format="csr").tocsr()
X_te = sp.hstack(test_blocks, format="csr").tocsr()

# Drop every reference to the large hashed matrices (including the temporary
# lists) before collecting.
del train_blocks, test_blocks, Xw_tr, Xc_tr, Xw_te, Xc_te
gc.collect()

# ---------------------------
# 8) XGBoost (GPU) with 5-fold OOF
# ---------------------------
# Class-imbalance weight: negatives per positive (max() avoids division by zero).
pos = y.sum()
neg = len(y) - pos
scale_pos_weight = float(neg / max(1, pos))

xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "learning_rate": 0.05,
    "max_depth": 8,
    "min_child_weight": 2.0,
    "subsample": 0.85,
    "colsample_bytree": 0.65,
    "reg_alpha": 0.0,
    "reg_lambda": 1.0,
    "n_estimators": 4000,
    "random_state": 42,
    "n_jobs": -1,
    "scale_pos_weight": scale_pos_weight,
}

# GPU selection depends on the installed XGBoost version.
if version.parse(xgb.__version__) < version.parse("2.0.0"):
    xgb_params["tree_method"] = "gpu_hist"
    xgb_params["predictor"] = "gpu_predictor"
else:
    xgb_params["device"] = "cuda"
    xgb_params["tree_method"] = "hist"

# 5-fold stratified cross-validation producing out-of-fold (OOF) predictions:
# each row is scored by the model of the fold in which it was held out.
# NOTE(review): n_splits=5 / random_state=42 equal the defaults of
# kfold_target_encode in this notebook — presumably intentional so both use
# the same folds; confirm.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(train), dtype=np.float32)  # filled fold by fold below
best_rounds = []  # clf.best_iteration of every fold

for fold, (tr_idx, va_idx) in enumerate(cv.split(X_tr, y), 1):
    # Row-slice the stacked sparse matrix and the label vector for this fold.
    X_tr_f, X_va_f = X_tr[tr_idx], X_tr[va_idx]
    y_tr_f, y_va_f = y.iloc[tr_idx], y.iloc[va_idx]

    # Early-stopping callback with a patience of 200 rounds; a fresh instance
    # is created for every fold.
    early_stop = EarlyStopping(rounds=200, save_best=True)
    
    # Callbacks are passed to the classifier's constructor (not to fit()).
    clf = XGBClassifier(**xgb_params, callbacks=[early_stop])
    
    # The held-out fold is the only eval_set, so it also drives early stopping;
    # the fold AUC printed below is therefore measured on the same data that
    # selected the stopping round.
    clf.fit(
        X_tr_f, y_tr_f,
        eval_set=[(X_va_f, y_va_f)],
        verbose=0,
    )
    
    # Probability of the positive class for the held-out rows.
    oof[va_idx] = clf.predict_proba(X_va_f)[:,1]
    # NOTE(review): XGBoost documents best_iteration as a 0-based round index.
    br = clf.best_iteration
    best_rounds.append(br)
    print(f"[Fold {fold}] best_iter={best_rounds[-1]}  AUC={roc_auc_score(y_va_f, oof[va_idx]):.6f}")

# AUC over all OOF predictions pooled together.
oof_auc = roc_auc_score(y, oof)
print(f"\nOOF AUC (XGBoost GPU, rich): {oof_auc:.6f}")


# ---------------------------
# 9) Full fit & predict test
# ---------------------------
# Choose the tree count for the final model from the per-fold early-stopping
# results. best_iteration is a 0-based round index, so the matching number of
# trees is best_iteration + 1. If no fold reported a value, fall back to 1200.
valid_rounds = [r for r in best_rounds if r is not None]
best_n = int(np.median(valid_rounds)) + 1 if valid_rounds else 1200

# Refit on the full training set with the fixed tree count (no early stopping).
clf_full = XGBClassifier(**{**xgb_params, "n_estimators": best_n})
clf_full.fit(X_tr, y, verbose=False)

# Save the model (joblib is imported at the top of this cell).
MODEL_FILE = "XGBmodel_test20250903.joblib"

# Persist the model trained on all data (clf_full), not the last fold's model (clf).
joblib.dump(clf_full, MODEL_FILE)
print(f"[Saved Model] {MODEL_FILE}")


# Positive-class probabilities for the test set, written as the submission file.
test_proba = clf_full.predict_proba(X_te)[:,1]
pd.DataFrame({"row_id": test["row_id"], "rule_violation": test_proba}).to_csv(OUT_PATH, index=False)
print(f"[Saved] {OUT_PATH}")

[Fold 1] best_iter=80  AUC=0.867039
[Fold 2] best_iter=159  AUC=0.817330
[Fold 3] best_iter=48  AUC=0.864927
[Fold 4] best_iter=227  AUC=0.834122
[Fold 5] best_iter=538  AUC=0.837074

OOF AUC (XGBoost GPU, rich): 0.837170
[Saved Model] XGBmodel_test20250903.joblib
[Saved] submission_xgb_gpu_rich.csv
