In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset
from sklearn.metrics import *

In [3]:
def evaluate(val_ds, model, device, tokenizer):
    # 입력 데이터셋 토크나이징 (attention_mask 포함)
    inputs = tokenizer(val_ds['text'], return_tensors="pt", padding=True,
                       truncation=True, max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # 입력 텐서를 동일한 디바이스로 이동

    model.eval()
    model = model.to(device)

    with torch.no_grad():
        outputs = model(**inputs)  # attention_mask를 포함해 입력

    # 다중 라벨 분류를 위한 sigmoid 함수 적용
    probabilities = torch.sigmoid(outputs.logits)

    if probabilities.is_cuda:
        probabilities = probabilities.cpu().detach().numpy()
    else:
        probabilities = probabilities.detach().numpy()

    # 확률에 대해 threshold 적용하여 다중 라벨 예측 (0 또는 1)
    threshold = 0.5
    y_pred = (probabilities > threshold).astype(int)

    # 최소 1개 이상 클래스 예측 보장 (argmax 적용)
    for i in range(y_pred.shape[0]):
        if y_pred[i].sum() == 0:  # 만약 선택된 클래스가 없으면
            max_idx = np.argmax(probabilities[i])  # 가장 높은 확률을 가진 클래스를 선택
            y_pred[i, max_idx] = 1  # 강제 예측

    del inputs
    torch.cuda.empty_cache()

    return y_pred, probabilities

In [4]:
def predict(text, model, tokenizer, device='cpu'):
    # 입력 문장 토크나이징
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # 각 텐서를 GPU로 이동

    model = model.to(device)
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    probabilities = torch.sigmoid(logits)  # Sigmoid로 확률 계산
    probabilities = probabilities.squeeze()  # 단일 예측에 대해 차원 축소
    threshold = 0.5
    y_pred = (probabilities > threshold).astype(int)

    # 최소 1개 이상 클래스 예측 보장 (argmax 적용)
    if y_pred.sum() == 0:  # 만약 선택된 클래스가 없으면
        max_idx = np.argmax(probabilities)  # 가장 높은 확률을 가진 클래스를 선택
        y_pred[max_idx] = 1  # 강제 예측

    return y_pred, probabilities

In [5]:
data = pd.read_csv('데이터_최종본_1000개.csv')
print(data.head())

                                     text label
0              스님들 사람들 안보이는데서는 고기 먹는거 아냐?   [0]
1         요즘 계속 체한것 때문에 고생이래 음식을 먹을 수 있을까   [0]
2  땀 구멍이 좀 크네요.. 눈이 작으니 아이라인을 크게 그려야 겠어요.   [0]
3          확실히 광주보다는 대구가 훠얼씬 살기 좋은거 같아 ㅋㅋ   [0]
4                     내가 머리 기르면 여자처럼 보일까.   [0]


In [6]:
data.shape

(12895, 2)

In [7]:
train, val = train_test_split(data, test_size=0.2, random_state=42)

In [8]:
# 텐서 데이터셋으로 변환
train_ds = Dataset.from_pandas(train)
val_ds = Dataset.from_pandas(val)

In [9]:
# 모델과 토크나이저 불러오기
model_name = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [10]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

class CustomDataCollator:
    def __call__(self, batch):
        # input_ids, attention_mask를 패딩
        input_ids = [torch.tensor(item['input_ids'], dtype=torch.long) for item in batch]
        attention_mask = [torch.tensor(item['attention_mask'], dtype=torch.long) for item in batch]

        # labels는 그대로 텐서로 변환
        labels = [torch.tensor(item['labels'], dtype=torch.float32) for item in batch]

        return {
            'input_ids': pad_sequence(input_ids, batch_first=True, padding_value=0),
            'attention_mask': pad_sequence(attention_mask, batch_first=True, padding_value=0),
            'labels': torch.stack(labels),
        }

In [11]:
import ast

# 토큰화 함수 생성 및 적용
def preprocess_function(data):
    # 텍스트 토큰화
    inputs = tokenizer(data['text'], truncation=True, padding=True, max_length=512)

    num_labels = 12  # 클래스 수
    # 'label'을 원-핫 인코딩
    one_hot_labels = []
    for label in data['label']:
        # label이 문자열로 저장된 리스트라면 이를 파싱
        parsed_labels = ast.literal_eval(label) if isinstance(label, str) else label
        one_hot = [0] * num_labels  # 0으로 초기화된 원-핫 벡터
        for l in parsed_labels:
            one_hot[int(l)] = 1
        one_hot_labels.append(one_hot)

    # 원-핫 인코딩된 레이블을 텐서로 변환
    inputs['labels'] = torch.tensor(one_hot_labels, dtype=torch.float32)
    return inputs

train_ds = train_ds.map(preprocess_function, batched=True)
val_ds = val_ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/10316 [00:00<?, ? examples/s]

Map:   0%|          | 0/2579 [00:00<?, ? examples/s]

In [12]:
# 텐서로 변환
train_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [13]:
# GPU 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [14]:
# 모델 설정 (다중 분류를 위해 num_labels 지정)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 12, problem_type="multi_label_classification").to(device)

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# TrainingArguments
training_args = TrainingArguments(
    output_dir = './results',          # 출력 디렉토리
    eval_strategy = "epoch",            # 에폭마다 평가
    save_strategy = "epoch",           # 에폭마다 체크포인트 저장
    learning_rate = 1e-5,              # <--[조정가능]학습률
    per_device_train_batch_size = 32,  # <--[조정가능]학습 배치 사이즈
    per_device_eval_batch_size = 32,   # <--[조정가능]평가 배치 사이즈
    num_train_epochs  = 20,              # <--[조정가능]에폭 수
    weight_decay = 0.01,               # <--[조정가능]weight decay
    load_best_model_at_end = True,     # 가장 좋은 모델을 마지막에 로드
    logging_dir ='./logs',            # 로깅 디렉토리
    logging_steps = 10,                # 로깅 스텝
    report_to="tensorboard"          # TensorBoard에 로깅
)

In [16]:
# Trainer 설정
data_collator = CustomDataCollator()

trainer = Trainer(
    model=model,                         # 학습할 모델
    args=training_args,                  # TrainingArguments
    train_dataset = train_ds,
    eval_dataset = val_ds,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],# 조기 종료
    data_collator=data_collator,
)

  trainer = Trainer(


In [17]:
# 모델 학습
trainer.train()

  input_ids = [torch.tensor(item['input_ids'], dtype=torch.long) for item in batch]
  attention_mask = [torch.tensor(item['attention_mask'], dtype=torch.long) for item in batch]
  labels = [torch.tensor(item['labels'], dtype=torch.float32) for item in batch]


Epoch,Training Loss,Validation Loss
1,0.1891,0.182532
2,0.1231,0.122415
3,0.0813,0.107408
4,0.0706,0.096874
5,0.0631,0.097686
6,0.0443,0.099021
7,0.0295,0.097312


  input_ids = [torch.tensor(item['input_ids'], dtype=torch.long) for item in batch]
  attention_mask = [torch.tensor(item['attention_mask'], dtype=torch.long) for item in batch]
  labels = [torch.tensor(item['labels'], dtype=torch.float32) for item in batch]
  input_ids = [torch.tensor(item['input_ids'], dtype=torch.long) for item in batch]
  attention_mask = [torch.tensor(item['attention_mask'], dtype=torch.long) for item in batch]
  labels = [torch.tensor(item['labels'], dtype=torch.float32) for item in batch]
  input_ids = [torch.tensor(item['input_ids'], dtype=torch.long) for item in batch]
  attention_mask = [torch.tensor(item['attention_mask'], dtype=torch.long) for item in batch]
  labels = [torch.tensor(item['labels'], dtype=torch.float32) for item in batch]
  input_ids = [torch.tensor(item['input_ids'], dtype=torch.long) for item in batch]
  attention_mask = [torch.tensor(item['attention_mask'], dtype=torch.long) for item in batch]
  labels = [torch.tensor(item['labels'], dtyp

TrainOutput(global_step=2261, training_loss=0.10660842738300891, metrics={'train_runtime': 2994.7836, 'train_samples_per_second': 68.893, 'train_steps_per_second': 2.157, 'total_flos': 1.900148144283648e+16, 'train_loss': 0.10660842738300891, 'epoch': 7.0})

In [18]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# 모델과 토크나이저 저장 경로
output_dir = "./fine_tuned_model"

# 모델 저장
trainer.save_model(output_dir)

# 토크나이저 저장 (토크나이저가 필요하면 함께 저장)
tokenizer.save_pretrained(output_dir)

('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.txt',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')

In [19]:
# 모델 평가
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

  input_ids = [torch.tensor(item['input_ids'], dtype=torch.long) for item in batch]
  attention_mask = [torch.tensor(item['attention_mask'], dtype=torch.long) for item in batch]
  labels = [torch.tensor(item['labels'], dtype=torch.float32) for item in batch]


Evaluation results: {'eval_loss': 0.09687440097332001, 'eval_runtime': 36.553, 'eval_samples_per_second': 70.555, 'eval_steps_per_second': 2.216, 'epoch': 7.0}


In [20]:
pred, prob = evaluate(val_ds, model, device, tokenizer)

In [21]:
from sklearn.preprocessing import MultiLabelBinarizer

# 다중 레이블을 리스트로 변환
val_df = val_ds.to_pandas()
val_df['label'] = val_df['label'].apply(lambda x: list(map(int, x.strip("[]").split(","))))

# 멀티라벨 인코딩
mlb = MultiLabelBinarizer()
y_true = mlb.fit_transform(val_df['label'])

y_pred = pred

# 평가
from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred, target_names=[str(cls) for cls in mlb.classes_]))

              precision    recall  f1-score   support

           0       0.76      0.88      0.81       227
           1       0.66      0.68      0.67       250
           2       0.72      0.77      0.74       217
           3       0.85      0.85      0.85       245
           4       0.86      0.86      0.86       189
           5       0.83      0.74      0.79       303
           6       0.92      0.89      0.91       221
           7       0.94      0.88      0.91       214
           8       0.88      0.87      0.87       221
           9       0.88      0.79      0.83       212
          10       0.86      0.74      0.79       190
          11       0.76      0.76      0.76       190

   micro avg       0.82      0.81      0.81      2679
   macro avg       0.83      0.81      0.82      2679
weighted avg       0.82      0.81      0.81      2679
 samples avg       0.82      0.81      0.82      2679



In [40]:
def split_text(text, max_length, tokenizer):
    """텍스트를 토큰 길이를 기준으로 나눔"""
    tokens = tokenizer(text, truncation=False, padding=False)
    input_ids = tokens["input_ids"]
    chunks = [input_ids[i:i + max_length] for i in range(0, len(input_ids), max_length)]
    # 토큰 ID를 다시 텍스트로 디코딩
    return [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunks if len(chunk) > 0]

# 모델과 토크나이저 로드 (순화 전 예측)
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")
model = AutoModelForSequenceClassification.from_pretrained("./fine_tuned_model")

# 입력 텍스트
input_text = """
개 빻은 한녀 메갈퇴지냔들 몰려와서 쿵쾅 ㄷㄷ
"""

# 텍스트 조각 나누기 ([CLS]와 [SEP]를 고려하여 max_length - 2)
max_length = 512
chunks = split_text(input_text, max_length - 2, tokenizer)

results = []
model.eval()
for chunk in chunks:
    try:
        inputs = tokenizer(chunk, truncation=True, padding="max_length", max_length=max_length, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            probabilities = torch.sigmoid(logits).squeeze()  # Sigmoid로 확률 계산
            results.append(probabilities)
    except Exception as e:
        print(f"Error processing chunk: {chunk}")
        print(f"Error details: {e}")

# 결과 병합 및 예측
if results:
    final_probabilities = torch.mean(torch.stack(results), dim=0)  # 평균 확률
    threshold = 0.5
    predicted_labels = (final_probabilities > threshold).nonzero(as_tuple=True)[0].tolist()

    # 라벨 매핑
    label_mapping = {
        0: "정상",
        1: "악성",
        2: "욕설",
        3: "외모",
        4: "장애인",
        5: "인종",
        6: "종교",
        7: "지역",
        8: "성차별",
        9: "나이",
        10: "협박",
        11: "성희롱",
    }

    # 클래스별 확률 출력 (디버깅용)
    print("클래스별 확률:")
    for i, prob in enumerate(final_probabilities.tolist()):
        print(f"{label_mapping[i]}: {prob:.4f}")

    # 결과 출력
    predicted_labels_text = [label_mapping[label] for label in predicted_labels]
    print(f"입력 텍스트: {input_text}")
    print(f"예측 라벨: {predicted_labels_text}")
else:
    print("텍스트 조각 처리 중 문제가 발생하여 결과를 생성할 수 없습니다.")

클래스별 확률:
정상: 0.0020
악성: 0.0147
욕설: 0.1601
외모: 0.0094
장애인: 0.0062
인종: 0.0211
종교: 0.0080
지역: 0.0069
성차별: 0.0448
나이: 0.0071
협박: 0.0930
성희롱: 0.3617
입력 텍스트: 
개 빻은 한녀 메갈퇴지냔들 몰려와서 쿵쾅 ㄷㄷ

예측 라벨: []
