In [2]:
import pandas as pd
import matplotlib.pyplot as plt

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# gpu 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
df = pd.read_csv("./Data/emotion3.tsv", sep="\t")

In [4]:
null_idx = df[df["document"].isnull()].index
df.loc[null_idx]

Unnamed: 0,document,label


In [5]:
train_data = df.sample(frac=0.8, random_state=42)
test_data = df.drop(train_data.index)

In [6]:
print("중복 제거 전 학습 데이터셋: {}".format(len(train_data)))
print("중복 제거 전 테스트 데이터셋: {}".format(len(test_data)))

# 중복 데이터 제거
train_data = train_data.drop_duplicates(["document"])
test_data = test_data.drop_duplicates(["document"])

# 데이터셋 갯수 확인
print("중복 제거 후 학습 데이터셋: {}".format(len(train_data)))
print("중복 제거 후 테스트 데이터셋: {}".format(len(test_data)))

중복 제거 전 학습 데이터셋: 64770
중복 제거 전 테스트 데이터셋: 16193
중복 제거 후 학습 데이터셋: 59709
중복 제거 후 테스트 데이터셋: 15350


In [7]:
MODEL_NAME = "beomi/KcELECTRA-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [8]:
tokenizer_train_sentences = tokenizer(
    list(train_data['document']),
    return_tensors="pt",
    max_length=128,
    padding=True,
    truncation=True,
    add_special_tokens=True,
)

In [9]:
tokenizer_test_sentences = tokenizer(
    list(test_data['document']),
    return_tensors="pt",
    max_length=128,
    padding=True,
    truncation=True,
    add_special_tokens=True,
)

In [10]:
class CurseDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
train_label = train_data['label'].values
test_label = test_data['label'].values

train_dataset = CurseDataset(tokenizer_train_sentences, train_label)
test_dataset = CurseDataset(tokenizer_test_sentences, test_label)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=7).to(device)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
traning_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
)
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }
trainer = Trainer(
    model=model,
    args=traning_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [12]:
trainer.train()

  0%|          | 0/5598 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 1.5789, 'grad_norm': 5.94741678237915, 'learning_rate': 5e-05, 'epoch': 0.27}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 1.0394, 'grad_norm': 5.980869770050049, 'learning_rate': 4.509611612397019e-05, 'epoch': 0.54}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.8662, 'grad_norm': 5.266754627227783, 'learning_rate': 4.019223224794037e-05, 'epoch': 0.8}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.7737, 'grad_norm': 9.293607711791992, 'learning_rate': 3.5288348371910554e-05, 'epoch': 1.07}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.6404, 'grad_norm': 4.210756778717041, 'learning_rate': 3.0384464495880736e-05, 'epoch': 1.34}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.6349, 'grad_norm': 6.529935359954834, 'learning_rate': 2.5480580619850925e-05, 'epoch': 1.61}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.6191, 'grad_norm': 5.791209697723389, 'learning_rate': 2.057669674382111e-05, 'epoch': 1.88}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.5156, 'grad_norm': 11.794272422790527, 'learning_rate': 1.5672812867791292e-05, 'epoch': 2.14}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.4199, 'grad_norm': 9.613369941711426, 'learning_rate': 1.0768928991761477e-05, 'epoch': 2.41}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.4173, 'grad_norm': 5.846056938171387, 'learning_rate': 5.86504511573166e-06, 'epoch': 2.68}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.4126, 'grad_norm': 7.528611660003662, 'learning_rate': 9.61161239701844e-07, 'epoch': 2.95}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'train_runtime': 1852.7001, 'train_samples_per_second': 96.684, 'train_steps_per_second': 3.022, 'train_loss': 0.7142258848876517, 'epoch': 3.0}


TrainOutput(global_step=5598, training_loss=0.7142258848876517, metrics={'train_runtime': 1852.7001, 'train_samples_per_second': 96.684, 'train_steps_per_second': 3.022, 'total_flos': 1.17831024582336e+16, 'train_loss': 0.7142258848876517, 'epoch': 3.0})

In [13]:
# trainer.evaluate(eval_dataset=test_dataset)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


  0%|          | 0/480 [00:00<?, ?it/s]

ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [14]:
# 모델과 토크나이저 저장
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")

('./saved_model\\tokenizer_config.json',
 './saved_model\\special_tokens_map.json',
 './saved_model\\tokenizer.json')

In [15]:
import pandas as pd
import matplotlib.pyplot as plt

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# gpu 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# 테스트 데이터로 정확도 검증
model = AutoModelForSequenceClassification.from_pretrained("./saved_model")
tokenizer = AutoTokenizer.from_pretrained("./saved_model")
model.to(device)

def sentence_predict(sent):
    model.eval()
    tokenized_sent = tokenizer(
        [sent],
        return_tensors="pt",
        max_length=128,
        padding=True,
        truncation=True,
        add_special_tokens=True,
    )
    tokenized_sent = {k: v.to(device) for k, v in tokenized_sent.items()}
    with torch.no_grad():
        outputs = model(
            input_ids=tokenized_sent["input_ids"],
            attention_mask=tokenized_sent["attention_mask"],
        )
    logits = outputs[0]
    logits = logits.detach().cpu()
    result = logits.argmax(-1).numpy()[0]
    return result

data = pd.read_csv("Data/emotion3.csv")
sampled_data = data.sample(n=1000)

correct_predictions = 0

for index, row in sampled_data.iterrows():
    sentence = row['발화문']
    true_label = row['상황']
    predicted_label = sentence_predict(sentence)
    print(f"True Label: {true_label}, Predicted Label: {predicted_label}")
    if true_label == predicted_label:
        correct_predictions += 1

accuracy = correct_predictions / len(sampled_data)
print(f"정확도: {accuracy * 100:.2f}%")


True Label: 1, Predicted Label: 1
True Label: 2, Predicted Label: 2
True Label: 0, Predicted Label: 0
True Label: 5, Predicted Label: 5
True Label: 2, Predicted Label: 2
True Label: 5, Predicted Label: 5
True Label: 6, Predicted Label: 6
True Label: 3, Predicted Label: 3
True Label: 2, Predicted Label: 2
True Label: 6, Predicted Label: 6
True Label: 2, Predicted Label: 2
True Label: 0, Predicted Label: 0
True Label: 2, Predicted Label: 2
True Label: 0, Predicted Label: 0
True Label: 3, Predicted Label: 3
True Label: 2, Predicted Label: 2
True Label: 1, Predicted Label: 4
True Label: 2, Predicted Label: 2
True Label: 5, Predicted Label: 5
True Label: 3, Predicted Label: 3
True Label: 6, Predicted Label: 6
True Label: 5, Predicted Label: 5
True Label: 6, Predicted Label: 6
True Label: 3, Predicted Label: 3
True Label: 3, Predicted Label: 3
True Label: 3, Predicted Label: 3
True Label: 1, Predicted Label: 4
True Label: 2, Predicted Label: 6
True Label: 6, Predicted Label: 6
True Label: 3,