# 사전학습 모델 cardiffnlp/twitter-roberta-base-sentiment 사용

In [1]:
!pip install transformers torch tqdm





[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: C:\Users\MYNOTE\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:
import os, re, gc
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch.nn.functional import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# ===============================
# 0) Config
# ===============================
# Checkpoint id handed to the tokenizer/model from_pretrained() calls further down.
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"
MAX_LEN_CTX = 320          # token length limit when the few-shot context is included
MAX_LEN_PLAIN = 256        # token length limit when only body + subreddit are included
BATCH_SIZE = 16
USE_BLEND = True           # weighted average of the context and plain probabilities
BLEND_W_CTX = 0.7          # 0.7*ctx + 0.3*plain

INPUT_TRAIN = "train.csv"
INPUT_TEST  = "test.csv"

# ===============================
# 1) Load
# ===============================
train = pd.read_csv(INPUT_TRAIN)
test  = pd.read_csv(INPUT_TEST)

# Columns that must be present in BOTH files; the loop below fails fast on the
# first missing one. The (Korean) messages read "<file> requires column '<c>'".
REQUIRED = ["body", "subreddit",
            "positive_example_1", "positive_example_2",
            "negative_example_1", "negative_example_2"]
for c in REQUIRED:
    if c not in train.columns:
        raise ValueError(f"train.csv에 '{c}' 컬럼이 필요합니다.")
    if c not in test.columns:
        # test may lack rule_violation, rule, etc., but the example columns are assumed to exist
        raise ValueError(f"test.csv에 '{c}' 컬럼이 필요합니다.")

def _clean(s):
    """Normalize a single cell value to a one-line string.

    Values that ``pd.isna`` reports as missing become ``""``. Anything else is
    converted with ``str``, stripped of leading/trailing whitespace, and has
    every run of whitespace collapsed to a single space. No further
    normalization is applied, so Twitter/Reddit-style text is kept as written.
    """
    if pd.isna(s):
        return ""
    stripped = str(s).strip()
    return re.sub(r"\s+", " ", stripped)

def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with its text columns cleaned by ``_clean``.

    The columns ``body``, ``subreddit`` and the four example columns are each
    mapped through ``_clean``; all other columns are carried over unchanged.
    The input frame itself is not modified.
    """
    text_columns = (
        "body", "subreddit",
        "positive_example_1", "positive_example_2",
        "negative_example_1", "negative_example_2",
    )
    # assign() works on a copy, so the caller's frame stays untouched.
    return df.assign(**{col: df[col].map(_clean) for col in text_columns})

# Replace both frames with their cleaned copies.
train = preprocess_df(train)
test  = preprocess_df(test)

# ===============================
# 2) Automatic selection of the most similar example (per row)
#    - Compute char n-gram TF-IDF similarity between the body and each example,
#      then keep the single best positive and the single best negative example
# ===============================
def pick_best_example(body: str, ex1: str, ex2: str) -> str:
    """Return whichever of *ex1* / *ex2* is most similar to *body*.

    Similarity is the cosine between character n-gram TF-IDF vectors
    (``char_wb`` analyzer, n = 3..5) fitted on just these three strings.
    Ties, including the case where every similarity is zero, resolve to *ex1*.

    Args:
        body: Text to compare against. ``None`` is treated as ``""``.
        ex1: First candidate example. ``None`` is treated as ``""``.
        ex2: Second candidate example. ``None`` is treated as ``""``.

    Returns:
        The selected candidate string (``""`` if the chosen candidate was
        missing).
    """
    body = body or ""
    cands = [ex1 or "", ex2 or ""]
    # With a blank body, or with both candidates blank, every similarity is 0
    # and the first candidate wins. Short-circuiting here also avoids the
    # "empty vocabulary" ValueError that TfidfVectorizer raises when all three
    # texts are blank.
    if not body.strip() or not any(c.strip() for c in cands):
        return cands[0]
    # char_wb n-grams are simple but fairly robust to URLs, typos and emoji.
    vec = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5), min_df=1)
    X = vec.fit_transform([body] + cands)
    sims = cosine_similarity(X[0], X[1:]).flatten()
    return cands[int(np.argmax(sims))]

def build_context_text(subreddit: str, body: str,
                       pos_ex: str, neg_ex: str) -> str:
    """Build the few-shot style input string for one row.

    The result is four ``label: value`` segments (subreddit, body,
    pos_example, neg_example) joined by the literal text `` [SEP] ``.

    NOTE(review): ``[SEP]`` is inserted as plain text here; RoBERTa-style
    tokenizers normally use ``</s>`` as their separator token, so confirm
    that a literal ``[SEP]`` is what is intended.
    """
    segments = (
        f"subreddit: {subreddit}",
        f"body: {body}",
        f"pos_example: {pos_ex}",
        f"neg_example: {neg_ex}",
    )
    return " [SEP] ".join(segments)

def build_plain_text(subreddit: str, body: str) -> str:
    """Build the context-free input string: subreddit and body only."""
    return "subreddit: {} [SEP] body: {}".format(subreddit, body)

def make_text_inputs(df: pd.DataFrame):
    """Build the two model input strings for every row of *df*.

    For each row the better-matching positive and negative example are
    chosen with ``pick_best_example`` and combined into a context string;
    a plain string (subreddit + body only) is built alongside it.

    Returns:
        ``(ctx_texts, plain_texts)``: two lists of strings in row order.
    """
    ctx_texts = []
    plain_texts = []
    rows = tqdm(df.iterrows(), total=len(df), desc="Selecting examples")
    for _, row in rows:
        body, subreddit = row["body"], row["subreddit"]
        best_pos = pick_best_example(body, row["positive_example_1"], row["positive_example_2"])
        best_neg = pick_best_example(body, row["negative_example_1"], row["negative_example_2"])
        ctx_texts.append(build_context_text(subreddit, body, best_pos, best_neg))
        plain_texts.append(build_plain_text(subreddit, body))
    return ctx_texts, plain_texts

# Build the context and plain input strings for both splits.
train_ctx, train_plain = make_text_inputs(train)
test_ctx,  test_plain  = make_text_inputs(test)

# ===============================
# 3) Load the model
# ===============================
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(device)
# The model is only used for inference, so switch it to evaluation mode.
model.eval()
# Class names indexed by output position.
# NOTE(review): assumes the checkpoint orders its logits negative/neutral/positive;
# confirm against the model card.
labels = np.array(["negative","neutral","positive"])

# ===============================
# 4) Batched inference function
# ===============================
@torch.no_grad()
def predict_texts(texts, max_len) -> np.ndarray:
    """Run the sentiment model over *texts* and return class probabilities.

    Texts are processed in chunks of ``BATCH_SIZE``. Each chunk is tokenized
    with padding and truncation to *max_len* tokens, moved to ``device`` and
    passed through ``model``; the logits are turned into probabilities with a
    softmax over the class dimension. Gradients are disabled by the decorator.

    Args:
        texts: Sliceable sequence of input strings (a ``list[str]``).
        max_len: Maximum token length passed to the tokenizer.

    Returns:
        Array of shape ``(len(texts), n_classes)``, one row per input in
        order. For empty input an array of shape
        ``(0, model.config.num_labels)`` is returned.
    """
    # np.vstack([]) raises ValueError, so an empty input needs its own path.
    if len(texts) == 0:
        return np.empty((0, model.config.num_labels), dtype=np.float32)
    probs_all = []
    for i in tqdm(range(0, len(texts), BATCH_SIZE), desc="Inferring"):
        batch = texts[i:i+BATCH_SIZE]
        inputs = tokenizer(
            batch, return_tensors="pt",
            padding=True, truncation=True, max_length=max_len
        ).to(device)
        logits = model(**inputs).logits
        probs = softmax(logits, dim=1).cpu().numpy()
        probs_all.append(probs)
        # Drop references to this batch's tensors before the next iteration
        # and, on GPU, hand cached blocks back to the driver.
        del inputs, logits
        if device == "cuda":
            torch.cuda.empty_cache()
    return np.vstack(probs_all)

# Run inference for both the context and the plain inputs
train_probs_ctx  = predict_texts(train_ctx,  MAX_LEN_CTX)
test_probs_ctx   = predict_texts(test_ctx,   MAX_LEN_CTX)

# NOTE(review): the plain probabilities are only consumed when USE_BLEND is True,
# yet they are computed unconditionally here.
train_probs_plain = predict_texts(train_plain, MAX_LEN_PLAIN)
test_probs_plain  = predict_texts(test_plain,  MAX_LEN_PLAIN)

# ===============================
# 5) Blending
# ===============================
def blend(p_ctx, p_plain, w_ctx=0.7):
    """Return the weighted average ``w_ctx * p_ctx + (1 - w_ctx) * p_plain``."""
    w_plain = 1.0 - w_ctx
    return w_ctx * p_ctx + w_plain * p_plain

# Final probabilities: blended when USE_BLEND is set, otherwise context-only.
if not USE_BLEND:
    train_probs, test_probs = train_probs_ctx, test_probs_ctx
else:
    train_probs = blend(train_probs_ctx, train_probs_plain, BLEND_W_CTX)
    test_probs  = blend(test_probs_ctx,  test_probs_plain,  BLEND_W_CTX)

# Predicted class index, its label, and the confidence
# (the probability of the winning class).
train_pred_idx, test_pred_idx = train_probs.argmax(1), test_probs.argmax(1)
train_pred, test_pred = labels[train_pred_idx], labels[test_pred_idx]
train_conf, test_conf = train_probs.max(1), test_probs.max(1)

# ===============================
# 6) Save
# ===============================
# Copies of the cleaned frames with the prediction columns appended.
train_out = train.assign(sentiment=train_pred, confidence=train_conf)
test_out = test.assign(sentiment=test_pred, confidence=test_conf)
# One probability column per class, in label order.
for i, name in enumerate(labels):
    train_out[f"prob_{name}"] = train_probs[:, i]
    test_out[f"prob_{name}"] = test_probs[:, i]

train_save, test_save = "train_with_sentiment_ctx.csv", "test_with_sentiment_ctx.csv"
train_out.to_csv(train_save, index=False)
test_out.to_csv(test_save, index=False)

print(f"[Saved] {train_save} (shape={train_out.shape})")
print(f"[Saved] {test_save}  (shape={test_out.shape})")


Selecting examples: 100%|██████████| 2029/2029 [00:09<00:00, 216.00it/s]
Selecting examples: 100%|██████████| 10/10 [00:00<00:00, 223.30it/s]


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Inferring:   2%|▏         | 2/127 [00:06<06:37,  3.18s/it]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Inferring:   2%|▏         | 3/127 [00:09<06:46,  3.28s/it]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Inferring: 100%|██████████| 127/127 [09:08<00:00,  4.32s/it]
Inferring: 100%|██████████| 1/1 [00:03<00:00,  3.06s/it]
Inferring: 100%|██████████| 127/127 [04:32<00:00,  2.14s/it]
Inferring: 100%|██████████| 1/1 [00:01<00:00,  1.35s/it]

[Saved] train_with_sentiment_ctx.csv (shape=(2029, 14))
[Saved] test_with_sentiment_ctx.csv  (shape=(10, 13))





In [4]:
# Re-run AUC evaluation now that the user has uploaded the needed CSVs.
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.linear_model import LogisticRegression


# 1) Load data
import os  # imported here so this cell also runs on its own in a fresh kernel

train_raw = pd.read_csv("train.csv")
train_pred = pd.read_csv("train_with_sentiment_ctx.csv")
test_pred  = pd.read_csv("test_with_sentiment_ctx.csv")

# 2) Sanity checks
# Raised explicitly rather than via `assert`, which is stripped under `python -O`.
need = ["prob_negative","prob_neutral","prob_positive"]
if "row_id" not in train_raw.columns or "row_id" not in train_pred.columns:
    raise ValueError("row_id column required")
if "row_id" not in test_pred.columns:
    # row_id is needed below to build the submission frame.
    raise ValueError("row_id column required in test predictions")
if "rule_violation" not in train_raw.columns:
    # rule_violation is the target used for every AUC below.
    raise ValueError("rule_violation column required in train.csv")
for c in need:
    if c not in train_pred.columns:
        raise ValueError(f"Missing column in train predictions: {c}")
    if c not in test_pred.columns:
        raise ValueError(f"Missing column in test predictions: {c}")

# 3) Merge & prepare labels
# Only row_id and the probability columns are taken from the prediction file,
# so the merge adds exactly the three prob_* columns to the raw training frame.
df = train_raw.merge(train_pred[["row_id"]+need], on="row_id", how="inner").copy()
y = df["rule_violation"].astype(int).values

# 4) Heuristic scores and their AUCs
# Each score is meant to increase with the likelihood of a rule violation.
# NOTE: when the three probabilities sum to 1, neg_plus_halfneu equals
# 0.5 + 0.5 * (neg - pos), so it ranks rows exactly like neg_minus_pos and
# yields the same AUC.
scores = {
    "neg_only":            df["prob_negative"].values,
    "1-pos_only":          1.0 - df["prob_positive"].values,
    "neg_minus_pos":       (df["prob_negative"] - df["prob_positive"]).values,
    "neg_plus_halfneu":    (df["prob_negative"] + 0.5*df["prob_neutral"]).values,
}

auc_rows = []
for name, s in scores.items():
    auc = roc_auc_score(y, s)
    auc_rows.append({"score_name": name, "valid_ROC_AUC": auc})

# 5) Meta-model (logistic regression) with OOF AUC
# The AUC is computed on out-of-fold predictions from a 5-fold stratified split.
X = df[need].values
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
meta = LogisticRegression(max_iter=1000, solver="liblinear")
oof_proba = cross_val_predict(meta, X, y, cv=cv, method="predict_proba")[:,1]
oof_auc = roc_auc_score(y, oof_proba)
auc_rows.append({"score_name": "meta_logreg_OOF", "valid_ROC_AUC": oof_auc})

auc_table = pd.DataFrame(auc_rows).sort_values("valid_ROC_AUC", ascending=False).reset_index(drop=True)

# 6) Fit on full train and predict test, save submission
meta.fit(X, y)
X_test = test_pred[need].values
test_rule_violation = meta.predict_proba(X_test)[:,1]

submission = pd.DataFrame({
    "row_id": test_pred["row_id"],
    "rule_violation": test_rule_violation
})
sub_path = "/mnt/data/submission_from_sentiment.csv"
# to_csv fails if the target directory is missing, so create it first.
sub_dir = os.path.dirname(sub_path)
if sub_dir:
    os.makedirs(sub_dir, exist_ok=True)
submission.to_csv(sub_path, index=False)

# 7) Show the AUC table to the user and print path

print("=== AUC Summary ===")
print(auc_table)
print(f"\n[Saved] submission: {sub_path}")


=== AUC Summary ===
         score_name  valid_ROC_AUC
0   meta_logreg_OOF       0.575256
1          neg_only       0.569192
2     neg_minus_pos       0.568118
3  neg_plus_halfneu       0.568118
4        1-pos_only       0.563397

[Saved] submission: /mnt/data/submission_from_sentiment.csv
