In [1]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertTokenizerFast, BertForSequenceClassification, BertModel, ElectraForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from kobert_tokenizer import KoBERTTokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import nltk
from nltk.corpus import movie_reviews
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from datasets import load_metric
import evaluate

In [2]:
df = pd.read_csv('./data/daum_movie_review.csv')
df = df.loc[::5, :].reset_index(drop=True)
df.head()

Unnamed: 0,review,rating,date,title
0,돈 들인건 티가 나지만 보는 내내 하품만,1,2018.10.29,인피니티 워
1,나는 재밌게 봄,10,2018.10.14,인피니티 워
2,개연성은 무시해라 액션을 즐겨라 스타로드가 이끌어준다 각각의 영웅들을 즐겨라 그리고...,8,2018.10.01,인피니티 워
3,마지막에 누구한테 연락한거지? 궁금,9,2018.09.26,인피니티 워
4,이제는 지겨워서 못보겠다,5,2018.09.26,인피니티 워


In [3]:
df.shape

(2945, 4)

In [4]:
y = np.where(df.rating > 7, 2, np.where(df.rating > 4, 1, 0))

In [5]:
X_train_val, X_test, y_train_val, y_test = train_test_split(df.review.tolist(), y, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, random_state=0)
len(X_train), len(X_val), len(X_test)

(1656, 552, 737)

In [28]:
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
model = AutoModelForSequenceClassification.from_pretrained('skt/kobert-base-v1', num_labels=3)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

loading file spiece.model from cache at C:\Users\admin/.cache\huggingface\hub\models--skt--kobert-base-v1\snapshots\a9f5849fce18fb088f0cd0f9b29ec3f756958464\spiece.model
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at C:\Users\admin/.cache\huggingface\hub\models--skt--kobert-base-v1\snapshots\a9f5849fce18fb088f0cd0f9b29ec3f756958464\special_tokens_map.json
loading file tokenizer_config.json from cache at C:\Users\admin/.cache\huggingface\hub\models--skt--kobert-base-v1\snapshots\a9f5849fce18fb088f0cd0f9b29ec3f756958464\tokenizer_config.json
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.
loading configuration file config.json from cache at C:\Users\admin/.cache\huggingface\hub\models--skt--kobert

In [29]:
type(tokenizer), type(model)

(kobert_tokenizer.kobert_tokenizer.KoBERTTokenizer,
 transformers.models.bert.modeling_bert.BertForSequenceClassification)

In [30]:
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [31]:
train_input = tokenizer(X_train, truncation=True, padding=True, return_tensors='pt')
val_input = tokenizer(X_val, truncation=True, padding=True, return_tensors='pt')
test_input = tokenizer(X_test, truncation=True, padding=True, return_tensors='pt')

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [32]:
class OurDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]).clone().detach() for key, val in self.inputs.items()}
        item['labels'] = torch.tensor(self.labels[idx]).clone().detach().long()
        return item
    def __len__(self):
        return len(self.labels)

In [33]:
train_dataset = OurDataset(train_input, y_train)
val_dataset = OurDataset(val_input, y_val)
test_dataset = OurDataset(test_input, y_test)

In [34]:
training_args = TrainingArguments(output_dir='./results',
                                  num_train_epochs=4,
                                  evaluation_strategy='steps', eval_steps=10,
                                  per_device_train_batch_size=8, per_device_eval_batch_size=8,
                                  warmup_steps=100,
                                  weight_decay=0.01)
trainer = Trainer(model=model, args=training_args,
                  train_dataset=train_dataset, eval_dataset=val_dataset, compute_metrics=compute_metrics)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [35]:
trainer.train()

***** Running training *****
  Num examples = 1656
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 828
  Number of trainable parameters = 92189187


  0%|          | 0/828 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]).clone().detach() for key, val in self.inputs.items()}


In [None]:
trainer.predict(test_dataset=test_dataset)

737