### train.set
* body- 댓글의 텍스트
* rule- 해당 댓글이 위반했다고 판단되는 규칙
* subreddit- 댓글이 작성된 포럼
* positive_example_{1,2}- 규칙을 위반하는 댓글의 예
* negative_example_{1,2}- 규칙을 위반하지 않는 댓글의 예
* rule_violation- 이진 타겟

In [1]:
# Faster baseline (<=60s): HashingVectorizer + SGDClassifier (One-vs-Rest if multilabel)
# - Uses decision_function scores for AUC and submission
# - Avoids heavy TF-IDF fitting

import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score

# 1) Load data
# Both CSV files are read from the current working directory, train first.
train, test = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

# 2) Auto-detect text & label columns
def detect_text_col(df: pd.DataFrame):
    """Pick the column of ``df`` that most likely holds the free-text input.

    Resolution order:

    1. The first name in a fixed priority list (``text``, ``comment_text``,
       ``comment``, ``body``, ``content``) that matches a column label
       case-insensitively.
    2. Otherwise, the string-like column (``object`` or pandas string dtype)
       whose values have the greatest mean length once rendered with ``str``.
       Ties are broken in favour of the lexicographically larger label.
    3. Otherwise, the first column of the frame.

    Args:
        df: Frame to inspect.

    Returns:
        The label of the chosen column, exactly as it appears in
        ``df.columns``.

    Raises:
        IndexError: If ``df`` has no columns at all.
    """
    priority = ("text", "comment_text", "comment", "body", "content")

    # Map lower-cased label -> first column carrying it. str() keeps
    # non-string labels (e.g. integer column names) from failing on .lower().
    by_lower = {}
    for col in df.columns:
        by_lower.setdefault(str(col).lower(), col)
    for name in priority:
        if name in by_lower:
            return by_lower[name]

    # Use the dtype helpers instead of comparing against "object" so columns
    # stored with the pandas string extension dtype are also considered.
    dtype_api = pd.api.types
    text_cols = [
        c
        for c in df.columns
        if dtype_api.is_object_dtype(df[c].dtype)
        or dtype_api.is_string_dtype(df[c].dtype)
    ]
    if text_cols:
        return max(
            text_cols,
            key=lambda c: (df[c].astype(str).str.len().mean(), str(c)),
        )
    return df.columns[0]

def is_binary_series(s: pd.Series):
    """Return whether ``s`` looks like a binary (0/1, yes/no) indicator.

    A series qualifies when it has at least one non-null value and either

    * its dtype is boolean (NumPy ``bool`` or pandas nullable ``boolean``),
    * its dtype is numeric and every non-null value equals 0 or 1, or
    * every non-null value, rendered with ``str`` and lower-cased, is one of
      ``"0"``, ``"1"``, ``"true"``, ``"false"``, ``"yes"``, ``"no"``.

    Args:
        s: Column to classify.

    Returns:
        ``True`` if the column is binary-like, ``False`` otherwise.
    """
    present = s.dropna()
    # A column with no observed values carries no evidence of being binary;
    # without this guard the subset checks below would be vacuously true.
    if present.empty:
        return False

    # pandas' dtype helpers understand extension dtypes ("string", "boolean",
    # "category", ...) on which np.issubdtype raises TypeError.
    dtype_api = pd.api.types
    if dtype_api.is_bool_dtype(s.dtype):
        return True
    if dtype_api.is_numeric_dtype(s.dtype):
        return set(present.unique()).issubset({0, 1})

    tokens = present.astype(str).str.lower().unique()
    return set(tokens).issubset({"0", "1", "true", "false", "yes", "no"})

# Identify the free-text input column, then treat every *other* column whose
# values are all binary-like as a prediction target.
text_col = detect_text_col(train)
label_cols = [
    c
    for c in train.columns
    if c != text_col and is_binary_series(train[c])
]

# The scan above already covers the last column, so no separate "try the last
# column" fallback is needed. Raise explicitly (instead of `assert`) so the
# check is not stripped when Python runs with -O.
if not label_cols:
    raise ValueError("No binary label columns detected in train.csv")
print(f"[Info] Text column: {text_col}")
print(f"[Info] Label columns: {label_cols}")

# 3) Prepare data
# Missing comments become empty strings; everything is coerced to str.
X_text = train[text_col].fillna("").astype(str)
y_df = train[label_cols].copy()

# Bring each label column to integer form. Numeric columns are left as they
# are; booleans are cast; anything else is mapped from its lower-cased text.
token_to_int = {"1": 1, "0": 0, "true": 1, "false": 0, "yes": 1, "no": 0}
for col in label_cols:
    col_dtype = y_df[col].dtype
    if col_dtype == bool:
        y_df[col] = y_df[col].astype(int)
        continue
    if np.issubdtype(col_dtype, np.number):
        continue
    lowered = y_df[col].astype(str).str.lower()
    # Tokens absent from the mapping end up as 0.
    y_df[col] = lowered.map(token_to_int).fillna(0).astype(int)

# 80/20 hold-out split with a fixed seed.
if len(label_cols) != 1:
    # Several targets: plain random split over the 2-D label matrix.
    X_tr, X_va, y_tr, y_va = train_test_split(
        X_text, y_df.values, test_size=0.2, random_state=42
    )
else:
    y = y_df[label_cols[0]].values
    # Stratify only when more than one distinct label value is present.
    strat_labels = y if len(np.unique(y)) > 1 else None
    X_tr, X_va, y_tr, y_va = train_test_split(
        X_text, y, test_size=0.2, random_state=42, stratify=strat_labels
    )

# 4) Fast feature + model
# Options shared by both hashing vectorizers.
hash_common = dict(
    alternate_sign=False,
    lowercase=True,
    strip_accents="unicode",
    norm="l2",
)
# Word 1-2 grams in 2**18 buckets, plus char_wb 3-5 grams in 2**17 buckets.
word_hash = HashingVectorizer(
    analyzer="word", ngram_range=(1, 2), n_features=2**18, **hash_common
)
char_hash = HashingVectorizer(
    analyzer="char_wb", ngram_range=(3, 5), n_features=2**17, **hash_common
)
feats = FeatureUnion([("word", word_hash), ("char", char_hash)])

base_clf = SGDClassifier(loss="log_loss", alpha=1e-5, max_iter=5, random_state=42)

# A single target uses the classifier directly; several targets wrap it in
# one-vs-rest so each label column gets its own binary model.
final_clf = base_clf if len(label_cols) == 1 else OneVsRestClassifier(base_clf)
model = Pipeline([("feats", feats), ("clf", final_clf)])

# 5) Train
model.fit(X_tr, y_tr)

# 6) Evaluate (use decision_function as ranking score for ROC AUC)
def safe_auc(y_true, scores):
    """Compute ROC AUC, returning NaN when the metric cannot be computed.

    Args:
        y_true: Ground-truth labels for a single target.
        scores: Ranking scores aligned with ``y_true``.

    Returns:
        The result of ``roc_auc_score(y_true, scores)``, or ``np.nan`` when
        ``y_true`` holds fewer than two distinct values or the metric call
        raises.

    Warns:
        RuntimeWarning: When the metric call raises; the message carries the
            underlying error so the failure is visible rather than silent.
    """
    import warnings

    if len(np.unique(y_true)) < 2:
        return np.nan
    try:
        return roc_auc_score(y_true, scores)
    except Exception as exc:
        # Deliberately best-effort: one bad column must not abort the whole
        # evaluation, but the reason is surfaced instead of being swallowed.
        warnings.warn(
            f"ROC AUC could not be computed: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return np.nan

# Score the hold-out split once, then compute one AUC per label column.
val_scores = model.decision_function(X_va)
if len(label_cols) == 1:
    # Single target: scores and labels are both 1-D.
    metrics = {label_cols[0]: safe_auc(y_va, val_scores)}
else:
    # Several targets: column i of the scores pairs with column i of y_va.
    metrics = {
        col: safe_auc(y_va[:, idx], val_scores[:, idx])
        for idx, col in enumerate(label_cols)
    }

# Average only the AUCs that could actually be computed.
usable_aucs = [value for value in metrics.values() if not pd.isna(value)]
avg_auc = np.nanmean(usable_aucs)

metrics_df = pd.DataFrame(
    {"label": list(metrics), "valid_ROC_AUC": list(metrics.values())}
)
metrics_df = metrics_df.sort_values("label").reset_index(drop=True)

print("\n=== Validation AUCs ===")
print(metrics_df.to_string(index=False))
print(f"\nColumn-averaged AUC: {avg_auc:.5f} (NaNs ignored)")

# 7) Predict test and build submission (decision_function scores)
if text_col in test.columns:
    X_test = test[text_col].fillna("").astype(str)
else:
    # Fall back to the first test column. Apply the same NaN handling as the
    # main path so missing values do not turn into the literal text "nan".
    X_test = test.iloc[:, 0].fillna("").astype(str)

# Raw decision_function margins are written out (not probabilities).
test_scores = model.decision_function(X_test)
if len(label_cols) == 1:
    pred_df = pd.DataFrame({label_cols[0]: test_scores})
else:
    pred_df = pd.DataFrame(test_scores, columns=label_cols)

# Reuse the test file's own id column when one is recognisable; otherwise
# synthesise a 0..n-1 "id" column.
# NOTE(review): "row_id" was added on the assumption that the test file keys
# rows by it -- confirm against test.csv and the expected submission format.
id_names = {"id", "row_id", "comment_id", "post_id", "index"}
id_candidates = [c for c in test.columns if str(c).lower() in id_names]
if id_candidates:
    id_col = id_candidates[0]
    submission = pd.concat(
        [test[[id_col]].reset_index(drop=True), pred_df.reset_index(drop=True)],
        axis=1,
    )
else:
    submission = pred_df.copy()
    submission.insert(0, "id", np.arange(len(submission)))

save_path = "/mnt/data/baseline_submission_fast.csv"
# Create the target directory first so to_csv does not fail when it is absent.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
submission.to_csv(save_path, index=False)

print(f"\nSaved submission to: {save_path}")
# Bare expression: displayed as the cell's result when run in a notebook.
save_path


[Info] Text column: body
[Info] Label columns: ['rule_violation']

=== Validation AUCs ===
         label  valid_ROC_AUC
rule_violation       0.824442

Column-averaged AUC: 0.82444 (NaNs ignored)

Saved submission to: /mnt/data/baseline_submission_fast.csv




'/mnt/data/baseline_submission_fast.csv'