In [3]:
import pandas as pd
import matplotlib.pyplot as plt

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

df = pd.read_csv("./Data/emotion.tsv", sep="\t")

print(df.columns)

null_idx = df[df["document"].isnull()].index
df.loc[null_idx]

train_data = df.sample(frac=0.8, random_state=42)
test_data = df.drop(train_data.index)

print("중복 제거 전 학습 데이터셋: {}".format(len(train_data)))
print("중복 제거 전 테스트 데이터셋: {}".format(len(test_data)))

train_data = train_data.drop_duplicates(["document"])
test_data = test_data.drop_duplicates(["document"])

print("중복 제거 후 학습 데이터셋: {}".format(len(train_data)))
print("중복 제거 후 테스트 데이터셋: {}".format(len(test_data)))

MODEL_NAME = "beomi/KcELECTRA-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenizer_train_sentences = tokenizer(
    list(train_data['document']),
    return_tensors="pt",
    max_length=128,
    padding=True,
    truncation=True,
    add_special_tokens=True,
)

tokenizer_test_sentences = tokenizer(
    list(test_data['document']),
    return_tensors="pt",
    max_length=128,
    padding=True,
    truncation=True,
    add_special_tokens=True,
)


class CurseDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
    
train_label = train_data['label'].values
test_label = test_data['label'].values

train_dataset = CurseDataset(tokenizer_train_sentences, train_label)
test_dataset = CurseDataset(tokenizer_test_sentences, test_label)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3).to(device)

traning_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="wegihted")
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall,
}

trainer = Trainer(
    model=model,
    args=traning_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

Index(['document', 'label'], dtype='object')
중복 제거 전 학습 데이터셋: 240525
중복 제거 전 테스트 데이터셋: 60131
중복 제거 후 학습 데이터셋: 234389
중복 제거 후 테스트 데이터셋: 59363


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
trainer.train()

# trainer.evaluate(eval_dataset=test_dataset)

  0%|          | 0/36625 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.6509, 'grad_norm': 7.891141891479492, 'learning_rate': 5e-05, 'epoch': 0.07}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.3521, 'grad_norm': 5.184708595275879, 'learning_rate': 4.930795847750865e-05, 'epoch': 0.14}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.3074, 'grad_norm': 3.114440679550171, 'learning_rate': 4.86159169550173e-05, 'epoch': 0.2}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.2916, 'grad_norm': 2.187145471572876, 'learning_rate': 4.792387543252596e-05, 'epoch': 0.27}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.2827, 'grad_norm': 3.090001106262207, 'learning_rate': 4.723183391003461e-05, 'epoch': 0.34}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.271, 'grad_norm': 4.794856548309326, 'learning_rate': 4.653979238754326e-05, 'epoch': 0.41}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.272, 'grad_norm': 0.9918420910835266, 'learning_rate': 4.58477508650519e-05, 'epoch': 0.48}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.2732, 'grad_norm': 2.2657313346862793, 'learning_rate': 4.515570934256055e-05, 'epoch': 0.55}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.263, 'grad_norm': 3.221052885055542, 'learning_rate': 4.446366782006921e-05, 'epoch': 0.61}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.2596, 'grad_norm': 2.4557912349700928, 'learning_rate': 4.377162629757786e-05, 'epoch': 0.68}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.2534, 'grad_norm': 3.4732131958007812, 'learning_rate': 4.307958477508651e-05, 'epoch': 0.75}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.2404, 'grad_norm': 2.3014650344848633, 'learning_rate': 4.238754325259516e-05, 'epoch': 0.82}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.2438, 'grad_norm': 2.244197368621826, 'learning_rate': 4.1695501730103807e-05, 'epoch': 0.89}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.2511, 'grad_norm': 8.412864685058594, 'learning_rate': 4.1003460207612456e-05, 'epoch': 0.96}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.2239, 'grad_norm': 3.001281499862671, 'learning_rate': 4.031141868512111e-05, 'epoch': 1.02}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1771, 'grad_norm': 3.6980762481689453, 'learning_rate': 3.961937716262976e-05, 'epoch': 1.09}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1694, 'grad_norm': 4.234084606170654, 'learning_rate': 3.892733564013841e-05, 'epoch': 1.16}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1712, 'grad_norm': 6.605740070343018, 'learning_rate': 3.8235294117647055e-05, 'epoch': 1.23}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1742, 'grad_norm': 4.390466213226318, 'learning_rate': 3.754325259515571e-05, 'epoch': 1.3}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1784, 'grad_norm': 4.017687797546387, 'learning_rate': 3.685121107266436e-05, 'epoch': 1.37}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1674, 'grad_norm': 1.8282699584960938, 'learning_rate': 3.615916955017301e-05, 'epoch': 1.43}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1693, 'grad_norm': 2.221276044845581, 'learning_rate': 3.546712802768166e-05, 'epoch': 1.5}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1746, 'grad_norm': 6.127838611602783, 'learning_rate': 3.477508650519031e-05, 'epoch': 1.57}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1782, 'grad_norm': 1.9996906518936157, 'learning_rate': 3.408304498269896e-05, 'epoch': 1.64}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1783, 'grad_norm': 4.158753395080566, 'learning_rate': 3.339100346020762e-05, 'epoch': 1.71}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1688, 'grad_norm': 2.4919679164886475, 'learning_rate': 3.269896193771627e-05, 'epoch': 1.77}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1763, 'grad_norm': 2.1880476474761963, 'learning_rate': 3.200692041522492e-05, 'epoch': 1.84}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.169, 'grad_norm': 1.9571857452392578, 'learning_rate': 3.131487889273357e-05, 'epoch': 1.91}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1645, 'grad_norm': 2.675015926361084, 'learning_rate': 3.062283737024222e-05, 'epoch': 1.98}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1185, 'grad_norm': 1.9621556997299194, 'learning_rate': 2.9930795847750863e-05, 'epoch': 2.05}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.11, 'grad_norm': 1.3612946271896362, 'learning_rate': 2.9238754325259516e-05, 'epoch': 2.12}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1083, 'grad_norm': 2.0220096111297607, 'learning_rate': 2.8546712802768166e-05, 'epoch': 2.18}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1031, 'grad_norm': 2.0721023082733154, 'learning_rate': 2.7854671280276816e-05, 'epoch': 2.25}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1148, 'grad_norm': 3.596491575241089, 'learning_rate': 2.716262975778547e-05, 'epoch': 2.32}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0973, 'grad_norm': 9.330855369567871, 'learning_rate': 2.647058823529412e-05, 'epoch': 2.39}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1081, 'grad_norm': 7.6221466064453125, 'learning_rate': 2.5778546712802772e-05, 'epoch': 2.46}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1066, 'grad_norm': 2.689392566680908, 'learning_rate': 2.508650519031142e-05, 'epoch': 2.53}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1105, 'grad_norm': 2.607438325881958, 'learning_rate': 2.4394463667820068e-05, 'epoch': 2.59}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1085, 'grad_norm': 8.500758171081543, 'learning_rate': 2.370242214532872e-05, 'epoch': 2.66}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1102, 'grad_norm': 10.508438110351562, 'learning_rate': 2.301038062283737e-05, 'epoch': 2.73}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1118, 'grad_norm': 9.002562522888184, 'learning_rate': 2.231833910034602e-05, 'epoch': 2.8}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1148, 'grad_norm': 6.548465251922607, 'learning_rate': 2.1626297577854674e-05, 'epoch': 2.87}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.108, 'grad_norm': 0.8940153121948242, 'learning_rate': 2.093425605536332e-05, 'epoch': 2.94}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1061, 'grad_norm': 4.98014497756958, 'learning_rate': 2.0242214532871974e-05, 'epoch': 3.0}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0678, 'grad_norm': 11.943294525146484, 'learning_rate': 1.9550173010380623e-05, 'epoch': 3.07}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0637, 'grad_norm': 0.68489670753479, 'learning_rate': 1.8858131487889273e-05, 'epoch': 3.14}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0671, 'grad_norm': 0.15851539373397827, 'learning_rate': 1.8166089965397926e-05, 'epoch': 3.21}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0697, 'grad_norm': 17.311885833740234, 'learning_rate': 1.7474048442906576e-05, 'epoch': 3.28}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0648, 'grad_norm': 6.606149196624756, 'learning_rate': 1.6782006920415226e-05, 'epoch': 3.34}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0696, 'grad_norm': 1.8201674222946167, 'learning_rate': 1.6089965397923876e-05, 'epoch': 3.41}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0697, 'grad_norm': 7.413010120391846, 'learning_rate': 1.5397923875432525e-05, 'epoch': 3.48}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0678, 'grad_norm': 12.93771743774414, 'learning_rate': 1.4705882352941177e-05, 'epoch': 3.55}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0636, 'grad_norm': 0.731560468673706, 'learning_rate': 1.4013840830449828e-05, 'epoch': 3.62}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.069, 'grad_norm': 0.1551363617181778, 'learning_rate': 1.3321799307958476e-05, 'epoch': 3.69}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0729, 'grad_norm': 0.13257451355457306, 'learning_rate': 1.2629757785467128e-05, 'epoch': 3.75}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0684, 'grad_norm': 27.303743362426758, 'learning_rate': 1.193771626297578e-05, 'epoch': 3.82}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0673, 'grad_norm': 0.8962357640266418, 'learning_rate': 1.124567474048443e-05, 'epoch': 3.89}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.063, 'grad_norm': 18.54599952697754, 'learning_rate': 1.0553633217993079e-05, 'epoch': 3.96}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0492, 'grad_norm': 14.361431121826172, 'learning_rate': 9.86159169550173e-06, 'epoch': 4.03}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0401, 'grad_norm': 2.0835936069488525, 'learning_rate': 9.169550173010382e-06, 'epoch': 4.1}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0417, 'grad_norm': 0.08062437176704407, 'learning_rate': 8.477508650519032e-06, 'epoch': 4.16}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0359, 'grad_norm': 2.076699733734131, 'learning_rate': 7.785467128027681e-06, 'epoch': 4.23}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0381, 'grad_norm': 39.59562301635742, 'learning_rate': 7.093425605536333e-06, 'epoch': 4.3}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0436, 'grad_norm': 16.893592834472656, 'learning_rate': 6.401384083044983e-06, 'epoch': 4.37}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0427, 'grad_norm': 6.579185485839844, 'learning_rate': 5.709342560553633e-06, 'epoch': 4.44}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0406, 'grad_norm': 0.12138231098651886, 'learning_rate': 5.017301038062284e-06, 'epoch': 4.51}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0398, 'grad_norm': 0.07780066132545471, 'learning_rate': 4.325259515570934e-06, 'epoch': 4.57}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0415, 'grad_norm': 0.061079930514097214, 'learning_rate': 3.6332179930795853e-06, 'epoch': 4.64}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0352, 'grad_norm': 0.2656605839729309, 'learning_rate': 2.9411764705882355e-06, 'epoch': 4.71}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0378, 'grad_norm': 0.06815166026353836, 'learning_rate': 2.249134948096886e-06, 'epoch': 4.78}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0369, 'grad_norm': 0.9932782649993896, 'learning_rate': 1.5570934256055363e-06, 'epoch': 4.85}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0375, 'grad_norm': 0.03582540154457092, 'learning_rate': 8.650519031141869e-07, 'epoch': 4.91}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.037, 'grad_norm': 3.1320888996124268, 'learning_rate': 1.730103806228374e-07, 'epoch': 4.98}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'train_runtime': 15891.714, 'train_samples_per_second': 73.746, 'train_steps_per_second': 2.305, 'train_loss': 0.13708669007597524, 'epoch': 5.0}


TrainOutput(global_step=36625, training_loss=0.13708669007597524, metrics={'train_runtime': 15891.714, 'train_samples_per_second': 73.746, 'train_steps_per_second': 2.305, 'total_flos': 7.708861358481024e+16, 'train_loss': 0.13708669007597524, 'epoch': 5.0})

In [5]:
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")

('./saved_model\\tokenizer_config.json',
 './saved_model\\special_tokens_map.json',
 './saved_model\\tokenizer.json')

In [6]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# gpu 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:

# 저장된 모델과 토크나이저 불러오기
model = AutoModelForSequenceClassification.from_pretrained("./saved_model")
tokenizer = AutoTokenizer.from_pretrained("./saved_model")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def sentence_predict(sent):
    model.eval()
    tokenized_sent = tokenizer(
        [sent],
        return_tensors="pt",
        max_length=128,
        padding=True,
        truncation=True,
        add_special_tokens=True,
    )
    tokenized_sent = {k: v.to(device) for k, v in tokenized_sent.items()}
    with torch.no_grad():
        outputs = model(
            input_ids=tokenized_sent["input_ids"],
            attention_mask=tokenized_sent["attention_mask"],
        )
    logits = outputs[0]
    logits = logits.detach().cpu()
    result = logits.argmax(-1).numpy()[0]
    return result

# 데이터 로드 및 샘플링
data = pd.read_csv("Data/emotion.tsv", sep='\t')
sampled_data = data.sample(n=1000)

# 정확도 계산을 위한 변수 초기화
correct_predictions = 0

for index, row in sampled_data.iterrows():
    sentence = row['document']
    true_label = row['label']
    predicted_label = sentence_predict(sentence)
    print(f"True Label: {true_label}, Predicted Label: {predicted_label}")
    if true_label == predicted_label:
        correct_predictions += 1

accuracy = correct_predictions / len(sampled_data)
print(f"정확도: {accuracy * 100:.2f}%")

True Label: 2, Predicted Label: 2
True Label: 0, Predicted Label: 0
True Label: 1, Predicted Label: 1
True Label: 1, Predicted Label: 1
True Label: 1, Predicted Label: 1
True Label: 2, Predicted Label: 2
True Label: 2, Predicted Label: 2
True Label: 2, Predicted Label: 2
True Label: 1, Predicted Label: 1
True Label: 1, Predicted Label: 1
True Label: 2, Predicted Label: 2
True Label: 2, Predicted Label: 2
True Label: 0, Predicted Label: 0
True Label: 1, Predicted Label: 1
True Label: 0, Predicted Label: 0
True Label: 0, Predicted Label: 0
True Label: 2, Predicted Label: 2
True Label: 0, Predicted Label: 0
True Label: 0, Predicted Label: 0
True Label: 0, Predicted Label: 0
True Label: 1, Predicted Label: 1
True Label: 0, Predicted Label: 0
True Label: 1, Predicted Label: 1
True Label: 0, Predicted Label: 0
True Label: 1, Predicted Label: 1
True Label: 0, Predicted Label: 0
True Label: 0, Predicted Label: 0
True Label: 1, Predicted Label: 1
True Label: 2, Predicted Label: 2
True Label: 1,

In [8]:

# 저장된 모델과 토크나이저 불러오기
model = AutoModelForSequenceClassification.from_pretrained("./saved_model")
tokenizer = AutoTokenizer.from_pretrained("./saved_model")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def sentence_predict(sent):
    model.eval()
    tokenized_sent = tokenizer(
        [sent],
        return_tensors="pt",
        max_length=128,
        padding=True,
        truncation=True,
        add_special_tokens=True,
    )
    tokenized_sent = {k: v.to(device) for k, v in tokenized_sent.items()}
    with torch.no_grad():
        outputs = model(**tokenized_sent)
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1)
    predicted_label = torch.argmax(probabilities, dim=1).item()
    return predicted_label, probabilities.tolist()[0]

# 데이터 로드 및 샘플링
data = pd.read_csv("Data/emotion.tsv", sep='\t')
sampled_data = data.sample(n=1000)

# 정확도 계산을 위한 변수 초기화
correct_predictions = 0

for index, row in sampled_data.iterrows():
    sentence = row['document']
    true_label = row['label']
    predicted_label, probabilities = sentence_predict(sentence)
    print(f"True Label: {true_label}, Predicted Label: {predicted_label}, Probabilities: {probabilities}")
    if true_label == predicted_label:
        correct_predictions += 1

accuracy = correct_predictions / len(sampled_data)
print(f"정확도: {accuracy * 100:.2f}%")

True Label: 0, Predicted Label: 0, Probabilities: [0.9995220899581909, 0.00046339459368027747, 1.459473332943162e-05]
True Label: 2, Predicted Label: 2, Probabilities: [0.0007370602688752115, 0.00032469132565893233, 0.998938262462616]
True Label: 1, Predicted Label: 1, Probabilities: [0.0005135929677635431, 0.9994556307792664, 3.073038169532083e-05]
True Label: 1, Predicted Label: 1, Probabilities: [0.0013506972463801503, 0.9985790252685547, 7.03043770045042e-05]
True Label: 0, Predicted Label: 0, Probabilities: [0.999535083770752, 0.00044909186544828117, 1.589075691299513e-05]
True Label: 1, Predicted Label: 1, Probabilities: [0.00042689903057180345, 0.9995551705360413, 1.7881375242723152e-05]
True Label: 2, Predicted Label: 2, Probabilities: [5.797890844405629e-05, 2.134776696038898e-05, 0.9999207258224487]
True Label: 1, Predicted Label: 1, Probabilities: [0.0004726641927845776, 0.999512791633606, 1.4578280570276547e-05]
True Label: 1, Predicted Label: 1, Probabilities: [0.000438148

In [9]:

# 저장된 모델과 토크나이저 불러오기
model = AutoModelForSequenceClassification.from_pretrained("./saved_model")
tokenizer = AutoTokenizer.from_pretrained("./saved_model")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def sentence_predict(sent):
    model.eval()
    tokenized_sent = tokenizer(
        [sent],
        return_tensors="pt",
        max_length=128,
        padding=True,
        truncation=True,
        add_special_tokens=True,
    )
    tokenized_sent = {k: v.to(device) for k, v in tokenized_sent.items()}
    with torch.no_grad():
        outputs = model(**tokenized_sent)
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1)
    predicted_label = torch.argmax(probabilities, dim=1).item()
    return predicted_label, probabilities.tolist()[0]

corrected_texts = [
    "초등학교는 완전히 끝난거 맞죠?",
    "초등학교 해당되는 강사분들 답변바랍니다",
    "제가알기론 13일날 한번 남은걸로 알고있습니다.!",
    "추가로 스마트팜 조립은 프레임 빼고 다 해놨고 오늘 학생 안와서 5개만 되어있어요.",
    "각 조립에 학생들 이름 붙여놨으니 구별할 수 있습니다.",
    "그냥 그대로 하루에 2차시씩 한거 맞나요?",
    "결과보고서 및 기증문서 대부분은 제가 다 작성해놓았으니 서명받고 사업단에 제출하시면 됩니다",
    "네. 원래 일정상 13일이 마지막 날입니다",
    "안녕하세요, 11월 말일까지 진행했던 해봄학교 교육활동 출석부만 먼저 제출 부탁드립니다~ ",
    "현재 다 완료되지 않은 관계로 활동결과보고서 전체가 아닌 서명부 부탁드립니다. ",
    "지금 현재 본인이 담당하고 있는 클래스 주 강사는 현재까지 '출석부' 날짜 잘 보이게 사진 찍어서 내일 10:00시까지 보내주세요.",
    "출석부 12월 1일까지의 출결사항 및 근무일자 기재 후 필히 보내주시기 바랍니다.",
    "학교 담당자입니다.",
    "내일 회식 참석 전",
    "또는 귀가 전",
    "시발",
    "난 너가 너무 싫어",
    "혐오스러워",
    "진짜 세상 다 안망하네~ 인생 좆같네 진짜",
    "싫다 진자 전부 다 귀찮다 이젠",
    "너가 너무 미워서 정말 시러",
    "죽어",
    "살아",
    "네가 너무 좋아",
    "네가 너무 싫어",
    "행복한 감정",
    "불행한 감정",
    "하... 슬프다 진짜",
]

for text in corrected_texts:
    probabilities = sentence_predict(text)
    print(f"Text: {text}, Probabilities: {probabilities}")

Text: 초등학교는 완전히 끝난거 맞죠?, Probabilities: (2, [0.0009647451806813478, 0.0003934853884857148, 0.9986417889595032])
Text: 초등학교 해당되는 강사분들 답변바랍니다, Probabilities: (2, [0.0006567123928107321, 0.00032969730091281235, 0.9990136623382568])
Text: 제가알기론 13일날 한번 남은걸로 알고있습니다.!, Probabilities: (1, [0.0005109758931212127, 0.9994677901268005, 2.126005529135e-05])
Text: 추가로 스마트팜 조립은 프레임 빼고 다 해놨고 오늘 학생 안와서 5개만 되어있어요., Probabilities: (2, [0.00010992071474902332, 3.600112540880218e-05, 0.9998540878295898])
Text: 각 조립에 학생들 이름 붙여놨으니 구별할 수 있습니다., Probabilities: (2, [9.724044502945617e-05, 3.0096547561697662e-05, 0.9998725652694702])
Text: 그냥 그대로 하루에 2차시씩 한거 맞나요?, Probabilities: (2, [0.0001431971468264237, 4.2154966649832204e-05, 0.9998146891593933])
Text: 결과보고서 및 기증문서 대부분은 제가 다 작성해놓았으니 서명받고 사업단에 제출하시면 됩니다, Probabilities: (2, [0.00017101774574257433, 5.873238478670828e-05, 0.9997702240943909])
Text: 네. 원래 일정상 13일이 마지막 날입니다, Probabilities: (2, [0.00015919224824756384, 5.482942651724443e-05, 0.9997859597206116])
