In [None]:
# ============================================
# Ensemble for AUC (warnings-free version)
#  - Model A: TF-IDF(word+char)+Heuristic -> Logistic
#  - Model B: TF-IDF(word+char)+Heuristic -> SGDClassifier (hinge) + Calibrated (sigmoid)
#  - Model C: Heuristic-only -> GradientBoosting
#  - Blend weights optimized by OOF AUC (with numerical clamp)
#  - Predict test and build my_submission (probabilities)
# ============================================
import re, warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.exceptions import ConvergenceWarning
from scipy.sparse import csr_matrix
# ---- Optional: suppress convergence warnings (we also fixed root cause) ----
# Both filters are installed process-wide, so they also silence these warning
# categories for any code that runs after this point.
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=FutureWarning) # also silence FutureWarning
# --------- Paths ----------
# Relative paths: both CSV files are resolved against the current working directory.
PATH_TRAIN = "train.csv"
PATH_TEST  = "test.csv"
# --------- Text column detection ----------
# Column names probed, in priority order, when looking for the free-text column.
TEXT_CANDIDATES = ["body","text","comment","comment_text","content"]
def guess_text_col(df: pd.DataFrame, candidates=None):
    """Return the name of the first candidate column present in ``df``.

    Args:
        df: Frame whose ``columns`` are inspected.
        candidates: Optional iterable of column names to try, in priority
            order. Defaults to the module-level ``TEXT_CANDIDATES``.

    Returns:
        The first name from ``candidates`` that is a column of ``df``.

    Raises:
        ValueError: If none of the candidates is a column of ``df``. The
            message is in Korean ("Could not find a text column.
            Candidates: ...") and lists the names that were tried.
    """
    if candidates is None:
        candidates = TEXT_CANDIDATES
    for c in candidates:
        if c in df.columns:
            return c
    raise ValueError(f"텍스트 컬럼을 찾지 못했습니다. 후보: {candidates}")
# --------- Normalization (obfuscation aware) ----------
# Leetspeak / symbol substitutions, applied after lower-casing.
_LEET_TABLE = str.maketrans({"0":"o","1":"i","3":"e","4":"a","5":"s","7":"t","$":"s","@":"a","!":"i","+":"t"})
_WHITESPACE_RE = re.compile(r"\s+")
# Three or more repeats of the same punctuation mark.
_REPEATED_PUNCT_RE = re.compile(r"([!?.,])\1{2,}")
# A letter followed by one or more "separator + letter" joins. Two kinds of join:
#   * the separator run contains punctuation (. - _ /): always joined, so
#     "s.h.i.t" and "f.uck" are both repaired;
#   * the separator is spaces only: joined only when the next letter is itself
#     isolated (not followed by a letter or digit), so "f u c k" is repaired
#     while ordinary text such as "a moron" or "i hate" is left alone.
_SPACED_LETTERS_RE = re.compile(
    r"(?:\b|_)[a-z](?:(?= *[.\-_/])[.\-_/ ]+[a-z]| +[a-z](?![a-z0-9]))+"
)
# Deletes every separator character from a matched obfuscated run.
_SEPARATOR_DELETE = str.maketrans("", "", ".-_/ ")
def text_obfuscation_normalize(s: str) -> str:
    """Lower-case ``s`` and undo common obfuscation tricks.

    Steps, in order:
      1. lower-case and collapse every whitespace run to a single space;
      2. map leetspeak digits/symbols to letters (``$h1t`` -> ``shit``);
      3. shrink runs of 3+ identical ``! ? . ,`` to exactly two;
      4. strip separators from letter-by-letter spellings
         (``s.h.i.t`` -> ``shit``, ``f u c k`` -> ``fuck``).

    Args:
        s: Text to normalize.

    Returns:
        The normalized, lower-cased string.

    NOTE(review): step 2 maps ``!`` to ``i`` before step 3 runs, so runs of
    exclamation marks never reach the punctuation collapse and a trailing
    ``!`` becomes a trailing ``i`` ("wow!" -> "wowi"). Left unchanged here
    because it alters downstream features; worth confirming it is intended.
    """
    s = s.lower()
    s = _WHITESPACE_RE.sub(" ", s)
    s = s.translate(_LEET_TABLE)
    s = _REPEATED_PUNCT_RE.sub(r"\1\1", s)
    # s.h.i.t -> shit
    s = _SPACED_LETTERS_RE.sub(lambda m: m.group(0).translate(_SEPARATOR_DELETE), s)
    return s
def normalize_simple(s: str) -> str:
    """Normalize ``s`` and reduce it to a whitespace-splittable token stream.

    The text first goes through ``text_obfuscation_normalize``; afterwards
    every run of characters outside ``a-z 0-9 ' @ / : . _ -`` is replaced by
    a single space and the result is stripped of leading/trailing whitespace.
    """
    deobfuscated = text_obfuscation_normalize(s)
    cleaned = re.sub(r"[^a-z0-9'@/:._-]+", " ", deobfuscated)
    return cleaned.strip()
# --------- Heuristic features (from analysis) ----------
# Word lists used by HeuristicFeaturizer. Each set is matched against the
# whitespace-split tokens of the normalized text via cnt(); a token that
# appears in several sets (e.g. "idiot" is in NEG and PA, "retard" in PROF and
# HATE) is counted once per set it belongs to.

# Positive / approving words -> `pos_cnt`.
POS = {"good","great","awesome","amazing","love","like","nice","kind","helpful","thanks","thank","appreciate",
       "brilliant","fantastic","excellent","well","welldone","congrats","congratulations","cool","sweet","cheers",
       "solid","decent","smart","interesting","insightful","clear","accurate","agree","true","valid","correct"}
# Negative / hostile words -> `neg_cnt`.
NEG = {"bad","worse","worst","hate","hates","hated","awful","terrible","horrible","disgusting","gross","stupid",
       "idiot","idiotic","moron","dumb","dumber","dumbest","trash","garbage","sucks","loser","pathetic","toxic",
       "annoying","angry","mad","ridiculous","bs","nonsense","spam","fake","liar","lying","lie","lies","ignorant",
       "ugly","useless","pointless","wrong","incorrect","false","misleading","cringe","cringy","wtf","wtheck",
       "shut","shutup","goaway","hell","nazi","racist","sexist","homophobic","offensive","rude","jerk"}
# Profanity -> `prof_cnt`.
PROF = {"fuck","fucking","fucked","shit","bullshit","bitch","bitches","ass","asses","asshole","dick","dicks","prick",
        "bastard","crap","cunt","piss","pissed","damn","goddamn","mf","motherfucker","retard","retarded","dumbass",
        "screw","screwyou"}
# Insults aimed at a person -> `pa_cnt` (name presumably stands for "personal attack" -- confirm).
PA   = {"idiot","idiotic","moron","jerk","loser","stupid","dumb","dumbass","asshole","bastard","prick"}
# Hate-related terms -> `hate_cnt`.
HATE = {"nazi","racist","sexist","homophobic","retard","retarded"}
# Legal vocabulary -> `law_cnt`.
LAW  = {"sue","lawyer","attorney","police","court","illegal","legal","legally","code","codes"}
# Moderation vocabulary -> `mod_cnt`.
MOD  = {"mod","mods","moderator","moderators","ban","banned","unban"}
# Multi-word phrases; matched as substrings of the normalized text (not as
# tokens) to set `neg_phrase_flag`.
NEG_PH = {"screw you","shut up","kill yourself","die in a hole","go to hell"}
def cnt(tokens, S):
    """Return how many items of ``tokens`` are members of ``S``.

    Repeated tokens are counted every time they occur.
    """
    hits = 0
    for tok in tokens:
        if tok in S:
            hits += 1
    return hits
class HeuristicFeaturizer(BaseEstimator, TransformerMixin):
    """text -> csr_matrix of engineered features (standardized).

    Every input item is converted with ``str()`` and turned into 19
    hand-crafted features (lexicon counts, length/shape statistics and a few
    interaction terms; see ``_FEATURE_NAMES`` for the column order). ``fit``
    learns the per-column mean and standard deviation; ``transform`` returns
    the z-scored features as a ``scipy.sparse.csr_matrix`` of float32.

    The transformer has no hyper-parameters, so it defines no ``__init__``.
    The learned statistics ``mean_`` and ``std_`` exist only after ``fit``.
    """

    # Column order of the produced feature matrix.
    _FEATURE_NAMES = (
        "sent_score", "pos_cnt", "neg_cnt", "prof_cnt", "pa_cnt", "hate_cnt",
        "law_cnt", "mod_cnt", "neg_phrase_flag", "word_count", "char_len",
        "exclaim_cnt", "upper_ratio", "has_url", "has_mention",
        "neg_x_excl", "profanity_pos", "law_short", "url_short",
    )
    # Texts with at most this many tokens count as "short" for the
    # `law_short` / `url_short` interaction features.
    _SHORT_MAX_WORDS = 12
    # Compiled once instead of once per row.
    _URL_RE = re.compile(r"http[s]?://", flags=re.I)
    _MENTION_RE = re.compile(r"(?:\bu/|@[A-Za-z0-9_]+)")

    def fit(self, X, y=None):
        """Learn the per-feature mean and standard deviation from ``X``.

        Args:
            X: Iterable of texts (each item is passed through ``str()``).
            y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
            ``self``.

        Raises:
            ValueError: If ``X`` contains no samples.
        """
        F = self._featurize_dense(X)
        if F.shape[0] == 0:
            raise ValueError("HeuristicFeaturizer.fit requires at least one sample")
        self.mean_ = F.mean(axis=0)
        # The small epsilon keeps constant columns from dividing by zero.
        self.std_ = F.std(axis=0) + 1e-6
        return self

    def transform(self, X):
        """Return the standardized feature matrix for ``X`` as a csr_matrix.

        Raises:
            sklearn.exceptions.NotFittedError: If ``fit`` has not been called.
        """
        # Local import: keeps this class usable without touching the file's
        # import block.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, ("mean_", "std_"))
        F = self._featurize_dense(X)
        return csr_matrix((F - self.mean_) / self.std_)

    def _featurize_many(self, X):
        """Return the raw (un-standardized) features of ``X`` as a csr_matrix."""
        return csr_matrix(self._featurize_dense(X))

    def _featurize_dense(self, X):
        """Return the raw features of ``X`` as a dense float32 array.

        The explicit reshape keeps the result two-dimensional,
        ``(n_samples, n_features)``, even when ``X`` is empty.
        """
        rows = [self._featurize_one(str(s)) for s in X]
        return np.array(rows, dtype=np.float32).reshape(len(rows), len(self._FEATURE_NAMES))

    def _featurize_one(self, raw):
        """Return the feature row (ordered as ``_FEATURE_NAMES``) for one text."""
        norm = normalize_simple(raw)
        toks = norm.split()

        # Lexicon hit counts on the normalized tokens.
        pos_cnt = cnt(toks, POS)
        neg_cnt = cnt(toks, NEG)
        prof_cnt = cnt(toks, PROF)
        pa_cnt = cnt(toks, PA)
        hate_cnt = cnt(toks, HATE)
        law_cnt = cnt(toks, LAW)
        mod_cnt = cnt(toks, MOD)
        # Phrases are matched as substrings of the normalized text.
        neg_phrase_flag = int(any(p in norm for p in NEG_PH))
        # Lies in (-1, 1); the +1 in the denominator avoids division by zero.
        sent_score = (pos_cnt - neg_cnt) / (1 + pos_cnt + neg_cnt)

        # Shape statistics; everything except word_count uses the raw text.
        word_count = len(toks)
        char_len = len(raw)
        exclaim_cnt = raw.count("!")
        upper_ratio = sum(1 for ch in raw if ch.isupper()) / max(1, len(raw))
        has_url = 1 if self._URL_RE.search(raw) else 0
        has_mention = 1 if self._MENTION_RE.search(raw) else 0

        # Interaction terms.
        is_short = int(word_count <= self._SHORT_MAX_WORDS)
        neg_x_excl = neg_cnt * exclaim_cnt
        profanity_pos = prof_cnt * int(sent_score > 0)
        law_short = law_cnt * is_short
        url_short = has_url * is_short

        return [
            sent_score, pos_cnt, neg_cnt, prof_cnt, pa_cnt, hate_cnt, law_cnt, mod_cnt,
            neg_phrase_flag, word_count, char_len, exclaim_cnt, upper_ratio, has_url, has_mention,
            neg_x_excl, profanity_pos, law_short, url_short,
        ]
# --------- Load data ----------
train = pd.read_csv(PATH_TRAIN)
test  = pd.read_csv(PATH_TEST)
# Validate the inputs with real exceptions: `assert` statements are removed
# when Python runs with -O, which would let a malformed file slip through.
# (Messages are in Korean: "train.csv needs a 'rule_violation' column." /
# "test.csv also needs a '<text_col>' column.")
if "rule_violation" not in train.columns:
    raise ValueError("train.csv에 'rule_violation' 컬럼이 필요합니다.")
text_col = guess_text_col(train)
if text_col not in test.columns:
    raise ValueError(f"test.csv에도 '{text_col}' 컬럼이 필요합니다.")
# Fall back to positional ids when the test file carries no explicit row_id.
if "row_id" not in test.columns:
    test["row_id"] = np.arange(len(test))
# Single-column frames named "text": the ColumnTransformers below select that
# column by name. Missing texts become empty strings.
X_train = pd.DataFrame({"text": train[text_col].fillna("").astype(str)})
X_test  = pd.DataFrame({"text":  test[text_col].fillna("").astype(str)})
# NOTE(review): labels that are missing or not numeric are silently mapped to
# class 0 here -- confirm that this is the intended handling.
y = pd.to_numeric(train["rule_violation"], errors="coerce").fillna(0).astype(int).values
# --------- Shared preprocessors ----------
# Preprocessor for the text models: three feature branches, all reading the
# "text" column, stacked side by side. sparse_threshold=1.0 keeps the stacked
# output sparse.
prep_wordchar_heur = ColumnTransformer(
    sparse_threshold=1.0,
    remainder="drop",
    transformers=[
        # Word uni-/bi-grams; a token is 2+ letters or apostrophes.
        (
            "word",
            TfidfVectorizer(token_pattern=r"[A-Za-z']{2,}", min_df=3, ngram_range=(1,2)),
            "text",
        ),
        # Character 2- to 5-grams.
        (
            "char",
            TfidfVectorizer(min_df=3, ngram_range=(2,5), analyzer="char"),
            "text",
        ),
        # Standardized hand-crafted features.
        (
            "heur",
            Pipeline(steps=[("feat", HeuristicFeaturizer())]),
            "text",
        ),
    ],
)
# Preprocessor for the heuristic-only model: just the hand-crafted branch.
prep_heur_only = ColumnTransformer(
    sparse_threshold=1.0,
    remainder="drop",
    transformers=[
        ("heur", Pipeline(steps=[("feat", HeuristicFeaturizer())]), "text"),
    ],
)
# --------- Models ----------
# Models A and B each get their own clone of the word+char+heuristic
# preprocessor. Sharing one ColumnTransformer instance between two pipelines
# means that fitting either pipeline directly would overwrite the fitted
# preprocessor the other one relies on.

# Model A: logistic regression on word+char TF-IDF plus heuristic features.
model_A = Pipeline([
    ("prep", clone(prep_wordchar_heur)),
    ("clf", LogisticRegression(max_iter=1000, C=2.0))
])

# Model B: linear SVM trained with SGD on the same features. Hinge loss yields
# no probabilities, so the classifier is wrapped in a sigmoid calibrator
# fitted with 3-fold internal CV.
model_B = Pipeline([
    ("prep", clone(prep_wordchar_heur)),
    ("clf", CalibratedClassifierCV(
        estimator=SGDClassifier(
            loss="hinge",             # SVM hinge loss
            alpha=1e-5,               # L2 regularization strength (tuning knob)
            max_iter=3000,
            tol=1e-3,
            random_state=42
        ),
        cv=3, method="sigmoid"
    ))
])
# Model C: gradient boosting on the heuristic features only.
model_C = Pipeline([
    ("prep", prep_heur_only),
    ("clf", GradientBoostingClassifier(random_state=42))
])
# Keys are reused verbatim by the blending code further down.
models = {"A_logreg": model_A, "B_sgd_svm_cal": model_B, "C_gb_heur": model_C}
# --------- OOF predictions for weight search ----------
# Out-of-fold class-1 probabilities for every model, all produced with the
# same 5-fold stratified split so they can be blended sample by sample.
# (Status message is Korean for "Generating OOF predictions...".)
print("OOF 예측을 생성 중입니다...")
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof = {}
for name, pipe in models.items():
    fold_proba = cross_val_predict(pipe, X_train, y, cv=skf, method="predict_proba")
    # Column 1 holds the probability of the positive class.
    oof[name] = fold_proba[:, 1]
    print(f"{name} OOF AUC: {roc_auc_score(y, oof[name]):.4f}")
# --------- Simple grid search for blend weights (clamped) ----------
# Exhaustive search over blend weights (w1, w2, w3) on a 0.05 grid with
# w1 + w2 + w3 = 1, keeping the first combination with the highest OOF AUC.
# (Status message is Korean for "Searching for the optimal blending weights...".)
print("최적의 블렌딩 가중치를 찾는 중입니다...")
best_auc = -1.0
best_w = (1.0, 0.0, 0.0)
weights = np.arange(0.0, 1.01, 0.05)
oof_a = oof["A_logreg"]
oof_b = oof["B_sgd_svm_cal"]
oof_c = oof["C_gb_heur"]
for w1 in weights:
    for w2 in weights:
        w3 = 1.0 - (w1 + w2)
        # Only combinations whose third weight is non-negative (up to float
        # noise) are feasible.
        if w3 >= -1e-9:
            # Round-off can leave a tiny negative remainder; clamp it to zero.
            w3 = max(0.0, w3)
            blend = w1*oof_a + w2*oof_b + w3*oof_c
            auc = roc_auc_score(y, blend)
            if auc > best_auc:
                best_auc = auc
                best_w = (float(w1), float(w2), float(w3))
print(f"Best blend AUC: {best_auc:.4f} with weights (A,B,C) = {best_w}")
# --------- Fit all on full data & predict test ----------
# Refit a fresh clone of every pipeline on the full training set, score the
# test set, and blend the class-1 probabilities with the weights found above.
# (Status message is Korean for "Training models on the full data and
# predicting the test data...".)
print("전체 데이터로 모델을 학습하고 테스트 데이터를 예측하는 중입니다...")
fitted = {}
for name, pipe in models.items():
    fitted[name] = clone(pipe).fit(X_train, y)
test_pred = {}
for name, est in fitted.items():
    test_pred[name] = est.predict_proba(X_test)[:, 1]
w1, w2, w3 = best_w
responses = w1*test_pred["A_logreg"] + w2*test_pred["B_sgd_svm_cal"] + w3*test_pred["C_gb_heur"]
# Submission frame: one blended probability per test row.
my_submission = pd.DataFrame({
    'row_id': test['row_id'],
    'rule_violation': responses
})
# Writing the file is disabled; uncomment the two lines below to enable it.
#my_submission.to_csv('submission.csv', index=False)
#print("submission.csv 파일 생성이 완료되었습니다!")

OOF 예측을 생성 중입니다...
A_logreg OOF AUC: 0.8385
B_sgd_svm_cal OOF AUC: 0.7905
C_gb_heur OOF AUC: 0.7246
최적의 블렌딩 가중치를 찾는 중입니다...
Best blend AUC: 0.8385 with weights (A,B,C) = (1.0, 0.0, 0.0)
전체 데이터로 모델을 학습하고 테스트 데이터를 예측하는 중입니다...
