In [1]:
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
from pathlib import Path

import evaluate

import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments




In [2]:
result_path = str(Path.cwd().parent / 'exercisebook_large_data' / 'Transformers' / 'results')
result_path

'c:\\Coding\\Local\\exercisebook_large_data\\Transformers\\results'

In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [4]:
df = pd.read_csv('./data/daum_movie_review.csv')
df.head()

Unnamed: 0,review,rating,date,title
0,돈 들인건 티가 나지만 보는 내내 하품만,1,2018.10.29,인피니티 워
1,몰입할수밖에 없다. 어렵게 생각할 필요없다. 내가 전투에 참여한듯 손에 땀이남.,10,2018.10.26,인피니티 워
2,이전 작품에 비해 더 화려하고 스케일도 커졌지만.... 전국 맛집의 음식들을 한데 ...,8,2018.10.24,인피니티 워
3,이 정도면 볼만하다고 할 수 있음!,8,2018.10.22,인피니티 워
4,재미있다,10,2018.10.20,인피니티 워


In [5]:
y = (df
 .assign(result=lambda df: np.where(df['rating'] <= 4, 'bad', np.where(df['rating'] >= 8, 'good', 'soso')))
 ['result']
 .replace({'bad':0, 'soso':1, 'good':2})
)
print(y)

0        0
1        2
2        2
3        2
4        2
        ..
14720    2
14721    1
14722    2
14723    2
14724    2
Name: result, Length: 14725, dtype: int64


  .replace({'bad':0, 'soso':1, 'good':2})


In [6]:
X_train_val, X_test, y_train_val, y_test = train_test_split(df.review.tolist(), y.values, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.3, random_state=0)
len(X_train), len(X_val), len(X_test)

(7730, 3313, 3682)

In [7]:
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [8]:
class OurDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels
        super().__init__()

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.inputs.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)

In [9]:
model_name = 'monologg/kobert'

tokenizer = BertTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)
model = model.to(device)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
train_input = tokenizer(X_train, truncation=True, padding=True, return_tensors='pt')
val_input = tokenizer(X_val, truncation=True, padding=True, return_tensors='pt')
test_input = tokenizer(X_test, truncation=True, padding=True, return_tensors='pt')

In [11]:
train_dataset = OurDataset(train_input, y_train)
val_dataset = OurDataset(val_input, y_val)
test_dataset = OurDataset(test_input, y_test)

In [12]:
training_args = TrainingArguments(
    output_dir=result_path,
    logging_dir='./logs',
    num_train_epochs=10,
    per_device_train_batch_size=16, per_device_eval_batch_size=16,
    eval_strategy='steps',
    eval_steps=100,
    gradient_accumulation_steps=4,
    #bf16=True,
    warmup_steps=200,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [13]:
trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.inputs.items()}


Step,Training Loss,Validation Loss,Accuracy
100,No log,0.868343,0.657108
200,No log,0.867306,0.657108
300,No log,0.882157,0.657108
400,No log,0.858246,0.654694
500,0.878900,0.843399,0.656505
600,0.878900,0.846599,0.657108
700,0.878900,0.832626,0.658316
800,0.878900,0.840471,0.650166
900,0.878900,0.843907,0.666465
1000,0.809000,0.848278,0.662843


  item = {key: torch.tensor(val[idx]) for key, val in self.inputs.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.inputs.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.inputs.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.inputs.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.inputs.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.inputs.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.inputs.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.inputs.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.inputs.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.inputs.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.inputs.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.inputs.items()}


TrainOutput(global_step=1210, training_loss=0.8247725053267045, metrics={'train_runtime': 537.8355, 'train_samples_per_second': 143.724, 'train_steps_per_second': 2.25, 'total_flos': 6276385578279600.0, 'train_loss': 0.8247725053267045, 'epoch': 10.0})

In [14]:
trainer.evaluate(eval_dataset=test_dataset)

  item = {key: torch.tensor(val[idx]) for key, val in self.inputs.items()}


{'eval_loss': 0.8424486517906189,
 'eval_accuracy': 0.6662140141227594,
 'eval_runtime': 7.2284,
 'eval_samples_per_second': 509.379,
 'eval_steps_per_second': 31.957,
 'epoch': 10.0}

In [15]:
del model
del trainer
torch.cuda.empty_cache()