In [None]:
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master
!pip install torch transformers pandas sentencepiece

Collecting git+https://****@github.com/SKTBrain/KoBERT.git@master
  Cloning https://****@github.com/SKTBrain/KoBERT.git (to revision master) to /tmp/pip-req-build-xaud3_ux
  Running command git clone --filter=blob:none --quiet 'https://****@github.com/SKTBrain/KoBERT.git' /tmp/pip-req-build-xaud3_ux
  Resolved https://****@github.com/SKTBrain/KoBERT.git to commit 47a69af87928fc24e20f571fe10c3cc9dd9af9a3
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting boto3<=1.15.18 (from kobert==0.2.3)
  Using cached boto3-1.15.18-py2.py3-none-any.whl (129 kB)
Collecting gluonnlp<=0.10.0,>=0.6.0 (from kobert==0.2.3)
  Using cached gluonnlp-0.10.0.tar.gz (344 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mxnet<=1.7.0.post2,>=1.4.0 (from kobert==0.2.3)
  Using cached mxnet-1.7.0.post2-py2.py3-none-manylinux2014_x86_64.whl (54.7 MB)
INFO: pip is looking at multiple versions of kobert to determine which version is compatible with other requirements. This could take a w

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pathtrain = '/content/drive/MyDrive/combined_data_new_train.json'
pathtest = '/content/drive/MyDrive/combined_data_new_val.json'

In [None]:
pathtrain2 = '/content/drive/MyDrive/combined_data_final_train.json'
pathtest2 = '/content/drive/MyDrive/combined_data_final_test.json'

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import BertTokenizer, BertModel
import json
from tqdm import tqdm
from sklearn.metrics import accuracy_score, mean_absolute_error

In [None]:
class ReviewDataset(Dataset):
    def __init__(self, data_path, tokenizer, max_length):
        self.data = []
        self.tokenizer = tokenizer
        self.max_length = max_length

        with open(data_path, 'r', encoding='utf-8') as f:
            raw_data = json.load(f)
            for item in raw_data:
                if item["GeneralPolarity"] is not None and item["ReviewScore"] is not None:
                    self.data.append(item)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        inputs = self.tokenizer(
            item["RawText"],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        input_ids = inputs["input_ids"].squeeze(0)
        attention_mask = inputs["attention_mask"].squeeze(0)

        review_score = torch.tensor(float(item["ReviewScore"]) / 100.0, dtype=torch.float32)
        general_polarity = torch.tensor(int(item["GeneralPolarity"]) + 1, dtype=torch.long)

        return input_ids, attention_mask, review_score, general_polarity

In [None]:
class MultiTaskModel(nn.Module):
    def __init__(self, bert_model_name, num_polarity_classes=3):
        super(MultiTaskModel, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_polarity_classes)
        self.regressor = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)

        polarity_logits = self.classifier(pooled_output)
        review_score = self.regressor(pooled_output).squeeze(-1)

        return polarity_logits, review_score

In [None]:
def train(model, dataloader, optimizer, criterion_classification, criterion_regression, device):
    model.train()
    total_loss = 0.0

    for batch in tqdm(dataloader):
        input_ids, attention_mask, review_score, general_polarity = [x.to(device) for x in batch]

        optimizer.zero_grad()

        polarity_logits, predicted_score = model(input_ids, attention_mask)

        loss_classification = criterion_classification(polarity_logits, general_polarity)
        loss_regression = criterion_regression(predicted_score, review_score)

        loss = loss_classification + loss_regression
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [None]:
def evaluate(model, dataloader, criterion_classification, criterion_regression, device):
    model.eval()
    total_loss = 0.0
    all_predictions = []
    all_scores = []
    all_labels = []
    all_review_scores = []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids, attention_mask, review_score, general_polarity = [x.to(device) for x in batch]

            polarity_logits, predicted_score = model(input_ids, attention_mask)

            loss_classification = criterion_classification(polarity_logits, general_polarity)
            loss_regression = criterion_regression(predicted_score, review_score)

            loss = loss_classification + loss_regression
            total_loss += loss.item()

            all_predictions.extend(polarity_logits.argmax(dim=1).cpu().numpy())
            all_scores.extend(predicted_score.cpu().numpy())
            all_labels.extend(general_polarity.cpu().numpy())
            all_review_scores.extend(review_score.cpu().numpy())

    return total_loss / len(dataloader), all_predictions, all_scores, all_labels, all_review_scores

def calculate_metrics(predictions, scores, labels, review_scores):
    # 감성 분석 정확도
    accuracy = accuracy_score(labels, predictions)
    # 별점 예측 MAE
    mae = mean_absolute_error([score * 100 for score in review_scores], [score * 100 for score in scores])
    return accuracy, mae

In [None]:
# Device 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 모델 및 토크나이저 로드
model = MultiTaskModel('monologg/kobert').to(device)
tokenizer = BertTokenizer.from_pretrained('monologg/kobert')

# Train 데이터셋 로드 및 분할
train_dataset = ReviewDataset(pathtrain2, tokenizer, max_length=128)
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_subset, val_subset = random_split(train_dataset, [train_size, val_size])

train_dataloader = DataLoader(train_subset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_subset, batch_size=32, shuffle=False)

# Validation 데이터셋을 Test 데이터셋으로 사용
test_dataset = ReviewDataset(pathtest2, tokenizer, max_length=128)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 손실 함수 정의
criterion_classification = nn.CrossEntropyLoss()
criterion_regression = nn.MSELoss()

# Optimizer 설정
optimizer = optim.AdamW(model.parameters(), lr=3e-5)

In [None]:

# 모델 훈련
num_epochs = 5
for epoch in range(num_epochs):
    train_loss = train(model, train_dataloader, optimizer, criterion_classification, criterion_regression, device)
    val_loss, val_predictions, val_scores, _, _ = evaluate(model, val_dataloader, criterion_classification, criterion_regression, device)

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation Loss: {val_loss:.4f}")

# 모델 평가
test_loss, test_predictions, test_scores, test_labels, test_review_scores = evaluate(model, test_dataloader, criterion_classification, criterion_regression, device)
accuracy, mae = calculate_metrics(test_predictions, test_scores, test_labels, test_review_scores)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy (Polarity): {accuracy:.4f}")
print(f"Test MAE (Review Score): {mae:.4f}")



100%|██████████| 4113/4113 [20:09<00:00,  3.40it/s]
100%|██████████| 1029/1029 [01:50<00:00,  9.32it/s]


Epoch 1/5
Train Loss: 0.9272
Validation Loss: 0.8704


100%|██████████| 4113/4113 [20:08<00:00,  3.40it/s]
100%|██████████| 1029/1029 [01:50<00:00,  9.30it/s]


Epoch 2/5
Train Loss: 0.8626
Validation Loss: 0.8607


100%|██████████| 4113/4113 [20:10<00:00,  3.40it/s]
100%|██████████| 1029/1029 [01:51<00:00,  9.24it/s]


Epoch 3/5
Train Loss: 0.8338
Validation Loss: 0.8579


100%|██████████| 4113/4113 [20:10<00:00,  3.40it/s]
100%|██████████| 1029/1029 [01:51<00:00,  9.23it/s]


Epoch 4/5
Train Loss: 0.8042
Validation Loss: 0.8493


100%|██████████| 4113/4113 [20:10<00:00,  3.40it/s]
100%|██████████| 1029/1029 [01:50<00:00,  9.28it/s]


Epoch 5/5
Train Loss: 0.7734
Validation Loss: 0.8925


100%|██████████| 697/697 [01:15<00:00,  9.26it/s]


Test Loss: 0.9385
Test Accuracy (Polarity): 0.6710
Test MAE (Review Score): 24.1453
Text: [CLS] 좋은 [UNK] [UNK] [UNK] [UNK] [UNK] 되어 [UNK] [UNK]. [UNK] 처럼 [UNK] [UNK] [UNK] [UNK] [UNK] [UNK]. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
Predicted Polarity: 1, Actual Polarity: 1
Predicted Score: 10.25, Actual Score: 100.00
Text: [CLS] [UNK] [UNK] [UNK] [UNK] [UNK]. [UNK] [UNK] 보니 [UNK] [UNK] [UNK

In [None]:
import torch

# 모델을 저장할 경로 지정
MODEL_PATH = "multi_task_model.pth"

# 모델의 state_dict 저장
torch.save(model.state_dict(), MODEL_PATH)
print(f"Model saved to {MODEL_PATH}")


Model saved to multi_task_model2.pth
