In [1]:
import pandas as pd
import matplotlib.pyplot as plt

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

df = pd.read_csv("./Data/emotion.tsv", sep="\t")

print(df.columns)

null_idx = df[df["document"].isnull()].index
df.loc[null_idx]

train_data = df.sample(frac=0.8, random_state=42)
test_data = df.drop(train_data.index)

print("중복 제거 전 학습 데이터셋: {}".format(len(train_data)))
print("중복 제거 전 테스트 데이터셋: {}".format(len(test_data)))

train_data = train_data.drop_duplicates(["document"])
test_data = test_data.drop_duplicates(["document"])

print("중복 제거 후 학습 데이터셋: {}".format(len(train_data)))
print("중복 제거 후 테스트 데이터셋: {}".format(len(test_data)))

MODEL_NAME = "beomi/KcELECTRA-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenizer_train_sentences = tokenizer(
    list(train_data['document']),
    return_tensors="pt",
    max_length=128,
    padding=True,
    truncation=True,
    add_special_tokens=True,
)

tokenizer_test_sentences = tokenizer(
    list(test_data['document']),
    return_tensors="pt",
    max_length=128,
    padding=True,
    truncation=True,
    add_special_tokens=True,
)


class CurseDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
    
train_label = train_data['label'].values
test_label = test_data['label'].values

train_dataset = CurseDataset(tokenizer_train_sentences, train_label)
test_dataset = CurseDataset(tokenizer_test_sentences, test_label)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3).to(device)

traning_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="wegihted")
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall,
}

trainer = Trainer(
    model=model,
    args=traning_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.evaluate(eval_dataset=test_dataset)

model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")

Index(['document', 'label'], dtype='object')
중복 제거 전 학습 데이터셋: 240525
중복 제거 전 테스트 데이터셋: 60131
중복 제거 후 학습 데이터셋: 234389
중복 제거 후 테스트 데이터셋: 59363


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/21975 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.6037, 'grad_norm': 7.523480415344238, 'learning_rate': 5e-05, 'epoch': 0.07}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.3341, 'grad_norm': 3.6211578845977783, 'learning_rate': 4.883585564610012e-05, 'epoch': 0.14}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.2984, 'grad_norm': 3.861325263977051, 'learning_rate': 4.7671711292200234e-05, 'epoch': 0.2}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.2836, 'grad_norm': 2.0585293769836426, 'learning_rate': 4.650756693830035e-05, 'epoch': 0.27}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.2765, 'grad_norm': 2.6176772117614746, 'learning_rate': 4.534342258440047e-05, 'epoch': 0.34}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.2691, 'grad_norm': 4.452495574951172, 'learning_rate': 4.417927823050058e-05, 'epoch': 0.41}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.2685, 'grad_norm': 2.4995338916778564, 'learning_rate': 4.30151338766007e-05, 'epoch': 0.48}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.2608, 'grad_norm': 2.7208056449890137, 'learning_rate': 4.185098952270082e-05, 'epoch': 0.55}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.258, 'grad_norm': 3.186955213546753, 'learning_rate': 4.0686845168800935e-05, 'epoch': 0.61}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.2553, 'grad_norm': 2.454832077026367, 'learning_rate': 3.952270081490105e-05, 'epoch': 0.68}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.2479, 'grad_norm': 4.095009803771973, 'learning_rate': 3.8358556461001167e-05, 'epoch': 0.75}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.2349, 'grad_norm': 3.2459750175476074, 'learning_rate': 3.719441210710128e-05, 'epoch': 0.82}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.235, 'grad_norm': 3.2268450260162354, 'learning_rate': 3.60302677532014e-05, 'epoch': 0.89}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.2483, 'grad_norm': 6.0911149978637695, 'learning_rate': 3.4866123399301514e-05, 'epoch': 0.96}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.216, 'grad_norm': 3.288362503051758, 'learning_rate': 3.370197904540163e-05, 'epoch': 1.02}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1657, 'grad_norm': 4.361172676086426, 'learning_rate': 3.2537834691501745e-05, 'epoch': 1.09}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1616, 'grad_norm': 5.535955429077148, 'learning_rate': 3.137369033760186e-05, 'epoch': 1.16}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1595, 'grad_norm': 5.408161640167236, 'learning_rate': 3.020954598370198e-05, 'epoch': 1.23}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1612, 'grad_norm': 1.5199769735336304, 'learning_rate': 2.90454016298021e-05, 'epoch': 1.3}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1651, 'grad_norm': 2.562037229537964, 'learning_rate': 2.788125727590221e-05, 'epoch': 1.37}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.154, 'grad_norm': 2.3707187175750732, 'learning_rate': 2.671711292200233e-05, 'epoch': 1.43}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1535, 'grad_norm': 4.005970001220703, 'learning_rate': 2.5552968568102446e-05, 'epoch': 1.5}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1578, 'grad_norm': 3.121469020843506, 'learning_rate': 2.4388824214202562e-05, 'epoch': 1.57}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1611, 'grad_norm': 1.795736312866211, 'learning_rate': 2.3224679860302678e-05, 'epoch': 1.64}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1613, 'grad_norm': 2.5851545333862305, 'learning_rate': 2.2060535506402797e-05, 'epoch': 1.71}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1511, 'grad_norm': 2.2494540214538574, 'learning_rate': 2.0896391152502913e-05, 'epoch': 1.77}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1599, 'grad_norm': 2.19327449798584, 'learning_rate': 1.973224679860303e-05, 'epoch': 1.84}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1536, 'grad_norm': 2.693197011947632, 'learning_rate': 1.8568102444703144e-05, 'epoch': 1.91}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1486, 'grad_norm': 6.933287620544434, 'learning_rate': 1.740395809080326e-05, 'epoch': 1.98}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.1061, 'grad_norm': 2.5567169189453125, 'learning_rate': 1.623981373690338e-05, 'epoch': 2.05}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0943, 'grad_norm': 0.8684496283531189, 'learning_rate': 1.5075669383003493e-05, 'epoch': 2.12}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0921, 'grad_norm': 9.099031448364258, 'learning_rate': 1.3911525029103609e-05, 'epoch': 2.18}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0812, 'grad_norm': 7.876699924468994, 'learning_rate': 1.2747380675203726e-05, 'epoch': 2.25}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0927, 'grad_norm': 3.748180389404297, 'learning_rate': 1.1583236321303842e-05, 'epoch': 2.32}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0844, 'grad_norm': 1.142109751701355, 'learning_rate': 1.041909196740396e-05, 'epoch': 2.39}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0866, 'grad_norm': 1.211368203163147, 'learning_rate': 9.254947613504075e-06, 'epoch': 2.46}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.09, 'grad_norm': 5.889827728271484, 'learning_rate': 8.090803259604193e-06, 'epoch': 2.53}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0887, 'grad_norm': 2.6724436283111572, 'learning_rate': 6.9266589057043075e-06, 'epoch': 2.59}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0874, 'grad_norm': 6.308252334594727, 'learning_rate': 5.762514551804424e-06, 'epoch': 2.66}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0828, 'grad_norm': 1.7279094457626343, 'learning_rate': 4.598370197904541e-06, 'epoch': 2.73}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.087, 'grad_norm': 4.288417339324951, 'learning_rate': 3.4342258440046572e-06, 'epoch': 2.8}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0876, 'grad_norm': 11.970422744750977, 'learning_rate': 2.270081490104773e-06, 'epoch': 2.87}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0835, 'grad_norm': 3.3764729499816895, 'learning_rate': 1.1059371362048895e-06, 'epoch': 2.94}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'train_runtime': 7327.1291, 'train_samples_per_second': 95.968, 'train_steps_per_second': 2.999, 'train_loss': 0.17812271031367766, 'epoch': 3.0}


  0%|          | 0/1856 [00:00<?, ?it/s]

InvalidParameterError: The 'average' parameter of precision_recall_fscore_support must be a str among {'binary', 'micro', 'weighted', 'samples', 'macro'} or None. Got 'wegihted' instead.

In [3]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# gpu 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:

# 저장된 모델과 토크나이저 불러오기
model = AutoModelForSequenceClassification.from_pretrained("./saved_model")
tokenizer = AutoTokenizer.from_pretrained("./saved_model")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def sentence_predict(sent):
    model.eval()
    tokenized_sent = tokenizer(
        [sent],
        return_tensors="pt",
        max_length=128,
        padding=True,
        truncation=True,
        add_special_tokens=True,
    )
    tokenized_sent = {k: v.to(device) for k, v in tokenized_sent.items()}
    with torch.no_grad():
        outputs = model(
            input_ids=tokenized_sent["input_ids"],
            attention_mask=tokenized_sent["attention_mask"],
        )
    logits = outputs[0]
    logits = logits.detach().cpu()
    result = logits.argmax(-1).numpy()[0]
    return result

# 데이터 로드 및 샘플링
data = pd.read_csv("Data/emotion.tsv", sep='\t')
sampled_data = data.sample(n=1000)

# 정확도 계산을 위한 변수 초기화
correct_predictions = 0

for index, row in sampled_data.iterrows():
    sentence = row['document']
    true_label = row['label']
    predicted_label = sentence_predict(sentence)
    print(f"True Label: {true_label}, Predicted Label: {predicted_label}")
    if true_label == predicted_label:
        correct_predictions += 1

accuracy = correct_predictions / len(sampled_data)
print(f"정확도: {accuracy * 100:.2f}%")

True Label: 2, Predicted Label: 2
True Label: 0, Predicted Label: 0
True Label: 1, Predicted Label: 1
True Label: 1, Predicted Label: 1
True Label: 1, Predicted Label: 1
True Label: 2, Predicted Label: 2
True Label: 2, Predicted Label: 2
True Label: 2, Predicted Label: 2
True Label: 1, Predicted Label: 1
True Label: 1, Predicted Label: 1
True Label: 2, Predicted Label: 2
True Label: 2, Predicted Label: 2
True Label: 0, Predicted Label: 0
True Label: 1, Predicted Label: 1
True Label: 0, Predicted Label: 0
True Label: 0, Predicted Label: 0
True Label: 2, Predicted Label: 2
True Label: 0, Predicted Label: 0
True Label: 0, Predicted Label: 0
True Label: 0, Predicted Label: 0
True Label: 1, Predicted Label: 1
True Label: 0, Predicted Label: 0
True Label: 1, Predicted Label: 1
True Label: 0, Predicted Label: 0
True Label: 1, Predicted Label: 1
True Label: 0, Predicted Label: 0
True Label: 0, Predicted Label: 0
True Label: 1, Predicted Label: 1
True Label: 2, Predicted Label: 2
True Label: 1,

In [5]:

# 저장된 모델과 토크나이저 불러오기
model = AutoModelForSequenceClassification.from_pretrained("./saved_model")
tokenizer = AutoTokenizer.from_pretrained("./saved_model")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def sentence_predict(sent):
    model.eval()
    tokenized_sent = tokenizer(
        [sent],
        return_tensors="pt",
        max_length=128,
        padding=True,
        truncation=True,
        add_special_tokens=True,
    )
    tokenized_sent = {k: v.to(device) for k, v in tokenized_sent.items()}
    with torch.no_grad():
        outputs = model(**tokenized_sent)
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1)
    predicted_label = torch.argmax(probabilities, dim=1).item()
    return predicted_label, probabilities.tolist()[0]

# 데이터 로드 및 샘플링
data = pd.read_csv("Data/emotion.tsv", sep='\t')
sampled_data = data.sample(n=1000)

# 정확도 계산을 위한 변수 초기화
correct_predictions = 0

for index, row in sampled_data.iterrows():
    sentence = row['document']
    true_label = row['label']
    predicted_label, probabilities = sentence_predict(sentence)
    print(f"True Label: {true_label}, Predicted Label: {predicted_label}, Probabilities: {probabilities}")
    if true_label == predicted_label:
        correct_predictions += 1

accuracy = correct_predictions / len(sampled_data)
print(f"정확도: {accuracy * 100:.2f}%")

True Label: 0, Predicted Label: 0, Probabilities: [0.9993261098861694, 0.0005724413204006851, 0.00010144706902792677]
True Label: 2, Predicted Label: 2, Probabilities: [0.011039125733077526, 0.004734841175377369, 0.9842260479927063]
True Label: 1, Predicted Label: 1, Probabilities: [0.0011056532384827733, 0.9988265633583069, 6.773103086743504e-05]
True Label: 1, Predicted Label: 1, Probabilities: [0.015341976657509804, 0.9841603636741638, 0.0004976809723302722]
True Label: 0, Predicted Label: 0, Probabilities: [0.9993076324462891, 0.0005849307053722441, 0.00010743358143372461]
True Label: 1, Predicted Label: 1, Probabilities: [0.000939542253036052, 0.9990046620368958, 5.581815275945701e-05]
True Label: 2, Predicted Label: 2, Probabilities: [4.3718078813981265e-05, 1.2181264537503012e-05, 0.9999440908432007]
True Label: 1, Predicted Label: 1, Probabilities: [0.0013498173793777823, 0.9986364245414734, 1.3754033716395497e-05]
True Label: 1, Predicted Label: 1, Probabilities: [0.0009494688

In [6]:

# 저장된 모델과 토크나이저 불러오기
model = AutoModelForSequenceClassification.from_pretrained("./saved_model")
tokenizer = AutoTokenizer.from_pretrained("./saved_model")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def sentence_predict(sent):
    model.eval()
    tokenized_sent = tokenizer(
        [sent],
        return_tensors="pt",
        max_length=128,
        padding=True,
        truncation=True,
        add_special_tokens=True,
    )
    tokenized_sent = {k: v.to(device) for k, v in tokenized_sent.items()}
    with torch.no_grad():
        outputs = model(**tokenized_sent)
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1)
    predicted_label = torch.argmax(probabilities, dim=1).item()
    return predicted_label, probabilities.tolist()[0]

corrected_texts = [
    "초등학교는 완전히 끝난거 맞죠?",
    "초등학교 해당되는 강사분들 답변바랍니다",
    "제가알기론 13일날 한번 남은걸로 알고있습니다.!",
    "추가로 스마트팜 조립은 프레임 빼고 다 해놨고 오늘 학생 안와서 5개만 되어있어요.",
    "각 조립에 학생들 이름 붙여놨으니 구별할 수 있습니다.",
    "그냥 그대로 하루에 2차시씩 한거 맞나요?",
    "결과보고서 및 기증문서 대부분은 제가 다 작성해놓았으니 서명받고 사업단에 제출하시면 됩니다",
    "네. 원래 일정상 13일이 마지막 날입니다",
    "안녕하세요, 11월 말일까지 진행했던 해봄학교 교육활동 출석부만 먼저 제출 부탁드립니다~ ",
    "현재 다 완료되지 않은 관계로 활동결과보고서 전체가 아닌 서명부 부탁드립니다. ",
    "지금 현재 본인이 담당하고 있는 클래스 주 강사는 현재까지 '출석부' 날짜 잘 보이게 사진 찍어서 내일 10:00시까지 보내주세요.",
    "출석부 12월 1일까지의 출결사항 및 근무일자 기재 후 필히 보내주시기 바랍니다.",
    "학교 담당자입니다.",
    "내일 회식 참석 전",
    "또는 귀가 전",
    "시발",
    "난 너가 너무 싫어",
    "혐오스러워",
    "진짜 세상 다 안망하네~ 인생 좆같네 진짜",
    "싫다 진자 전부 다 귀찮다 이젠",
    "너가 너무 미워서 정말 시러",
    "죽어",
    "살아",
    "네가 너무 좋아",
    "네가 너무 싫어",
    "행복한 감정",
    "불행한 감정",
    "하... 슬프다 진짜",
]

for text in corrected_texts:
    probabilities = sentence_predict(text)
    print(f"Text: {text}, Probabilities: {probabilities}")

Text: 초등학교는 완전히 끝난거 맞죠?, Probabilities: (2, [0.011781834065914154, 0.007961065508425236, 0.9802570939064026])
Text: 초등학교 해당되는 강사분들 답변바랍니다, Probabilities: (2, [0.00020253067486919463, 4.499606802710332e-05, 0.9997524619102478])
Text: 제가알기론 13일날 한번 남은걸로 알고있습니다.!, Probabilities: (1, [0.003124585608020425, 0.9967798590660095, 9.563820640323684e-05])
Text: 추가로 스마트팜 조립은 프레임 빼고 다 해놨고 오늘 학생 안와서 5개만 되어있어요., Probabilities: (2, [4.1370662074768916e-05, 2.0740928448503837e-05, 0.999937891960144])
Text: 각 조립에 학생들 이름 붙여놨으니 구별할 수 있습니다., Probabilities: (2, [5.492604395840317e-05, 1.3132258573023137e-05, 0.9999319314956665])
Text: 그냥 그대로 하루에 2차시씩 한거 맞나요?, Probabilities: (2, [5.376265835366212e-05, 1.1768489457608666e-05, 0.9999344348907471])
Text: 결과보고서 및 기증문서 대부분은 제가 다 작성해놓았으니 서명받고 사업단에 제출하시면 됩니다, Probabilities: (2, [0.00010052478319266811, 3.162130087730475e-05, 0.9998679161071777])
Text: 네. 원래 일정상 13일이 마지막 날입니다, Probabilities: (2, [0.00011547724716365337, 4.021577115054242e-05, 0.9998443126678467])
