In [1]:
import pandas as pd
import matplotlib.pyplot as plt

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# gpu 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
df = pd.read_csv("./Data/ratings.tsv", sep="\t")

# document 열에서 NaN값을 가진 행 삭제
#df = df.dropna()

# 결과를 새로운 tsv로 저장
#df.to_csv("./Data/ratings.tsv", sep="\t", index=False)

In [3]:
# 여기에 document와 label에 값이 있다면 데이터 오류있는 거니까 위의 코드 주석 풀고 하고 올 것
null_idx = df[df["document"].isnull()].index
df.loc[null_idx]

Unnamed: 0,id,document,label


In [4]:
train_data = df.sample(frac=0.8, random_state=42)
test_data = df.drop(train_data.index)

In [5]:
print("중복 제거 전 학습 데이터셋: {}".format(len(train_data)))
print("중복 제거 전 테스트 데이터셋: {}".format(len(test_data)))

# 중복 데이터 제거
train_data = train_data.drop_duplicates(["document"])
test_data = test_data.drop_duplicates(["document"])

# 데이터셋 갯수 확인
print("중복 제거 후 학습 데이터셋: {}".format(len(train_data)))
print("중복 제거 후 테스트 데이터셋: {}".format(len(test_data)))

중복 제거 전 학습 데이터셋: 159994
중복 제거 전 테스트 데이터셋: 39998
중복 제거 후 학습 데이터셋: 155916
중복 제거 후 테스트 데이터셋: 39348


In [6]:
MODEL_NAME = "beomi/KcELECTRA-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [7]:
tokenizer_train_sentences = tokenizer(
    list(train_data['document']),
    return_tensors="pt",
    max_length=128,
    padding=True,
    truncation=True,
    add_special_tokens=True,
)

In [8]:
#print(tokenizer_train_sentences[0])
#print(tokenizer_train_sentences[0].tokens)
#print(tokenizer_train_sentences[0].ids)
#print(tokenizer_train_sentences[0].attention_mask)

In [9]:
tokenizer_test_sentences = tokenizer(
    list(test_data['document']),
    return_tensors="pt",
    max_length=128,
    padding=True,
    truncation=True,
    add_special_tokens=True,
)

In [10]:
class CurseDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [11]:
train_label = train_data['label'].values
test_label = test_data['label'].values

train_dataset = CurseDataset(tokenizer_train_sentences, train_label)
test_dataset = CurseDataset(tokenizer_test_sentences, test_label)

In [12]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
traning_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
)

In [14]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

In [15]:
trainer = Trainer(
    model=model,
    args=traning_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [16]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjhy9732[0m ([33myamiyami[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011288888886984852, max=1.0…

  0%|          | 0/4873 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.4514, 'grad_norm': 2.778188467025757, 'learning_rate': 5e-05, 'epoch': 0.1}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.3063, 'grad_norm': 2.8127267360687256, 'learning_rate': 4.428310084610108e-05, 'epoch': 0.21}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.2847, 'grad_norm': 5.167689323425293, 'learning_rate': 3.8566201692202155e-05, 'epoch': 0.31}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.2758, 'grad_norm': 2.9037749767303467, 'learning_rate': 3.284930253830323e-05, 'epoch': 0.41}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.2588, 'grad_norm': 2.278263568878174, 'learning_rate': 2.7132403384404296e-05, 'epoch': 0.51}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.2518, 'grad_norm': 4.706526756286621, 'learning_rate': 2.1415504230505376e-05, 'epoch': 0.62}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.2354, 'grad_norm': 3.249265193939209, 'learning_rate': 1.569860507660645e-05, 'epoch': 0.72}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.2459, 'grad_norm': 2.7226595878601074, 'learning_rate': 9.981705922707523e-06, 'epoch': 0.82}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.234, 'grad_norm': 5.719998359680176, 'learning_rate': 4.264806768808598e-06, 'epoch': 0.92}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'train_runtime': 1592.9715, 'train_samples_per_second': 97.877, 'train_steps_per_second': 3.059, 'train_loss': 0.27809603679554434, 'epoch': 1.0}


TrainOutput(global_step=4873, training_loss=0.27809603679554434, metrics={'train_runtime': 1592.9715, 'train_samples_per_second': 97.877, 'train_steps_per_second': 3.059, 'total_flos': 1.025580582687744e+16, 'train_loss': 0.27809603679554434, 'epoch': 1.0})

In [17]:
trainer.evaluate(eval_dataset=test_dataset)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


  0%|          | 0/1230 [00:00<?, ?it/s]

{'eval_loss': 0.2243654578924179,
 'eval_accuracy': 0.9094744332621735,
 'eval_f1': 0.9099458967487486,
 'eval_precision': 0.9059605316149819,
 'eval_recall': 0.9139664804469274,
 'eval_runtime': 128.9791,
 'eval_samples_per_second': 305.073,
 'eval_steps_per_second': 9.536,
 'epoch': 1.0}

In [18]:
# 모델과 토크나이저 저장
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")


('./saved_model\\tokenizer_config.json',
 './saved_model\\special_tokens_map.json',
 './saved_model\\tokenizer.json')

In [19]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# def sentence_predict(sent):
#     model = AutoModelForSequenceClassification.from_pretrained("./saved_model").to(device)
#     tokenizer = AutoTokenizer.from_pretrained("./saved_model")

#     inputs = tokenizer(
#         sent,
#         return_tensors="pt",
#         truncation=True,
#         max_length=128,
#         add_special_tokens=True,
#     )
#     inputs.to(device)

#     model.eval()
#     with torch.no_grad():
#         outputs = model(
#             inputs["input_ids"],
#             attention_mask=inputs["attention_mask"],
#             token_type_ids=inputs["token_type_ids"],
#         )
#     logits = outputs[0]
#     logits = logits.detach().cpu()
#     result = logits.argmax(-1)
#     if result == 0:
#         result = "부정"
#     else:
#         result = "긍정"
#     return result

# while True:
#     sent = input("문장을 입력하세요: ")
#     if sent == "exit":
#         break
#     logits = sentence_predict(sent)
#     print(logits)

긍정
부정
긍정
부정
부정


In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 저장된 모델과 토크나이저 불러오기
model = AutoModelForSequenceClassification.from_pretrained("./saved_model")
tokenizer = AutoTokenizer.from_pretrained("./saved_model")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def sentence_predict(sent):
    model.eval()
    tokenized_sent = tokenizer(
        [sent],
        return_tensors="pt",
        max_length=128,
        padding=True,
        truncation=True,
        add_special_tokens=True,
    )
    tokenized_sent = {k: v.to(device) for k, v in tokenized_sent.items()}
    with torch.no_grad():
        outputs = model(
            input_ids=tokenized_sent["input_ids"],
            attention_mask=tokenized_sent["attention_mask"],
        )
    logits = outputs[0]
    logits = logits.detach().cpu()
    result = logits.argmax(-1).numpy()[0]
    return result

# 데이터 로드 및 샘플링
data = pd.read_csv("Data/ratings_train2.tsv", sep='\t')
sampled_data = data.sample(n=1000)

# 정확도 계산을 위한 변수 초기화
correct_predictions = 0

for index, row in sampled_data.iterrows():
    sentence = row['document']
    true_label = row['label']
    predicted_label = sentence_predict(sentence)
    print(f"True Label: {true_label}, Predicted Label: {predicted_label}")
    if true_label == predicted_label:
        correct_predictions += 1

accuracy = correct_predictions / len(sampled_data)
print(f"정확도: {accuracy * 100:.2f}%")


True Label: 1, Predicted Label: 1
True Label: 0, Predicted Label: 1
True Label: 0, Predicted Label: 0
True Label: 1, Predicted Label: 1
True Label: 0, Predicted Label: 0
True Label: 1, Predicted Label: 1
True Label: 1, Predicted Label: 1
True Label: 0, Predicted Label: 0
True Label: 1, Predicted Label: 1
True Label: 1, Predicted Label: 0
True Label: 1, Predicted Label: 1
True Label: 1, Predicted Label: 1
True Label: 0, Predicted Label: 0
True Label: 1, Predicted Label: 1
True Label: 1, Predicted Label: 1
True Label: 1, Predicted Label: 1
True Label: 1, Predicted Label: 1
True Label: 1, Predicted Label: 1
True Label: 1, Predicted Label: 1
True Label: 0, Predicted Label: 0
True Label: 0, Predicted Label: 0
True Label: 1, Predicted Label: 1
True Label: 0, Predicted Label: 0
True Label: 0, Predicted Label: 0
True Label: 1, Predicted Label: 1
True Label: 1, Predicted Label: 1
True Label: 0, Predicted Label: 0
True Label: 0, Predicted Label: 0
True Label: 1, Predicted Label: 1
True Label: 1,