In [1]:
# 필수 라이브러리
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm

# 모델 불러오기 (영어 기준. 한국어면 klue/bert-base 계열로 바꿔도 됨)
model = SentenceTransformer("all-MiniLM-L6-v2")

# 데이터 불러오기
df = pd.read_csv("train.csv")  # 경로는 상황에 맞게 수정

# tqdm 설정 (Jupyter에서)
tqdm.pandas()

# ========================
# 🔹 Step 1: 유사도 Feature 생성
# ========================
def compute_similarities(row):
    body_vec = model.encode(row['body'], convert_to_tensor=True)
    rule_vec = model.encode(row['rule'], convert_to_tensor=True)
    
    pos_sim_1 = util.cos_sim(body_vec, model.encode(row['positive_example_1'], convert_to_tensor=True)).item()
    pos_sim_2 = util.cos_sim(body_vec, model.encode(row['positive_example_2'], convert_to_tensor=True)).item()
    neg_sim_1 = util.cos_sim(body_vec, model.encode(row['negative_example_1'], convert_to_tensor=True)).item()
    neg_sim_2 = util.cos_sim(body_vec, model.encode(row['negative_example_2'], convert_to_tensor=True)).item()
    
    return pd.Series({
        'body_rule_similarity': util.cos_sim(body_vec, rule_vec).item(),
        'pos_sim': (pos_sim_1 + pos_sim_2) / 2,
        'neg_sim': (neg_sim_1 + neg_sim_2) / 2
    })

# 계산 적용 (시간 다소 걸릴 수 있음)
df[['body_rule_similarity', 'pos_sim', 'neg_sim']] = df.progress_apply(compute_similarities, axis=1)

# ========================
# 🔹 Step 2: BERT Fine-Tuning용 텍스트 포맷
# ========================
df['bert_input'] = "[RULE] " + df['rule'] + " [SEP] " + df['body']

# ========================
# 🔹 Step 3: 필요한 컬럼만 정리
# ========================
df_final = df[['bert_input', 'body_rule_similarity', 'pos_sim', 'neg_sim', 'rule_violation']]

# ========================
# 🔹 Step 4: 저장
# ========================
df_final.to_csv("preprocessed_dataset.csv", index=False)


100%|██████████| 2029/2029 [03:38<00:00,  9.29it/s]


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

X = df[['body_rule_similarity', 'pos_sim', 'neg_sim']]
y = df['rule_violation']

model = LogisticRegression()
model.fit(X, y)

y_pred = model.predict(X)
y_prob = model.predict_proba(X)[:, 1]

print("ROC AUC:", roc_auc_score(y, y_prob))
print("Classification Report:")
print(classification_report(y, y_pred))


ROC AUC: 0.701274517998169
Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.63      0.63       998
           1       0.64      0.65      0.65      1031

    accuracy                           0.64      2029
   macro avg       0.64      0.64      0.64      2029
weighted avg       0.64      0.64      0.64      2029



# install

In [4]:
!pip install transformers datasets accelerate


Collecting accelerate
  Downloading accelerate-1.10.1-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.10.1-py3-none-any.whl (374 kB)
   ---------------------------------------- 0.0/374.9 kB ? eta -:--:--
   ---- ---------------------------------- 41.0/374.9 kB 960.0 kB/s eta 0:00:01
   ---------------------------------------  368.6/374.9 kB 5.7 MB/s eta 0:00:01
   ---------------------------------------- 374.9/374.9 kB 4.7 MB/s eta 0:00:00
Installing collected packages: accelerate
Successfully installed accelerate-1.10.1



[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: C:\Users\MYNOTE\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


# 데이터 불러오기

In [5]:

import pandas as pd
df = pd.read_csv("preprocessed_dataset.csv")

# 입력/라벨 분리

In [6]:

from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(df['bert_input'], df['rule_violation'], test_size=0.2, random_state=42)

# Tokenizer 준비

In [7]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=256)
val_encodings = tokenizer(X_val.tolist(), truncation=True, padding=True, max_length=256)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

#  Dataset 클래스 정의

In [8]:

import torch

class RedditDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels.tolist()

    def __getitem__(self, idx):
        return {
            key: torch.tensor(val[idx]) for key, val in self.encodings.items()
        } | {'labels': torch.tensor(self.labels[idx])}

    def __len__(self):
        return len(self.labels)

train_dataset = RedditDataset(train_encodings, y_train)
val_dataset = RedditDataset(val_encodings, y_val)

# 모델 + Trainer 설정

In [14]:
!pip install -U transformers accelerate





[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: C:\Users\MYNOTE\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [15]:
from transformers import BertForSequenceClassification, TrainingArguments, Trainer

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# 예측 확률 및 클래스 예측
preds_output = trainer.predict(val_dataset)
y_pred = preds_output.predictions.argmax(-1)
y_true = y_val.values
probs = preds_output.predictions[:, 1]

# 평가 출력
print("ROC AUC:", roc_auc_score(y_true, probs))
print(classification_report(y_true, y_pred))
