In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from datasets import load_metric
from transformers import BertTokenizer, BertForSequenceClassification, BertModel
from transformers import Trainer, TrainingArguments
from transformers import get_linear_schedule_with_warmup
from transformers import AdamW
from kobert_tokenizer import KoBERTTokenizer

In [2]:
df = pd.read_csv('./data/daum_movie_review.csv')
df = df.loc[::5, :]
df.head()

Unnamed: 0,review,rating,date,title
0,돈 들인건 티가 나지만 보는 내내 하품만,1,2018.10.29,인피니티 워
5,나는 재밌게 봄,10,2018.10.14,인피니티 워
10,개연성은 무시해라 액션을 즐겨라 스타로드가 이끌어준다 각각의 영웅들을 즐겨라 그리고...,8,2018.10.01,인피니티 워
15,마지막에 누구한테 연락한거지? 궁금,9,2018.09.26,인피니티 워
20,이제는 지겨워서 못보겠다,5,2018.09.26,인피니티 워


In [3]:
y = [0 if rate < 6 else 1 for rate in df.rating]

In [4]:
X_train_val, X_test, y_train_val, y_test = train_test_split(df.review.tolist(), y, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, random_state=0)

In [5]:
metric = load_metric('accuracy')

  metric = load_metric('accuracy')


In [6]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions, references=labels)

In [14]:
class OurDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.inputs.items()}
        item['labels'] = torch.tensor(self.labels[idx]).clone().detach()
        return item
    
    def __len__(self):
        return len(self.labels)

In [15]:
sentence = '안녕하세요. 반갑습니다.'
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
inputs = tokenizer(sentence)
print(tokenizer.tokenize(sentence), inputs)

loading file vocab.txt from cache at C:\Users\admin/.cache\huggingface\hub\models--bert-base-multilingual-cased\snapshots\fdfce55e83dbed325647a63e7e1f5de19f0382ba\vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at C:\Users\admin/.cache\huggingface\hub\models--bert-base-multilingual-cased\snapshots\fdfce55e83dbed325647a63e7e1f5de19f0382ba\tokenizer_config.json
loading configuration file config.json from cache at C:\Users\admin/.cache\huggingface\hub\models--bert-base-multilingual-cased\snapshots\fdfce55e83dbed325647a63e7e1f5de19f0382ba\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-multilingual-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "int

['안', '##녕', '##하', '##세', '##요', '.', '반', '##갑', '##습', '##니다', '.'] {'input_ids': [101, 9521, 118741, 35506, 24982, 48549, 119, 9321, 118610, 119081, 48345, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [16]:
train_input = tokenizer(X_train, truncation=True, padding=True, return_tensors='pt')
val_input = tokenizer(X_val, truncation=True, padding=True, return_tensors='pt')
test_input = tokenizer(X_test, truncation=True, padding=True, return_tensors='pt')

In [23]:
train_input['input_ids'].shape

torch.Size([1656, 310])

In [17]:
train_dataset = OurDataset(train_input, y_train)
val_dataset = OurDataset(val_input, y_train)
test_dataset = OurDataset(test_input, y_train)

In [34]:
train_dataset[0]['input_ids'].shape, train_dataset[0]['token_type_ids'].shape, train_dataset[0]['attention_mask'].shape, train_dataset[0]['labels'].shape

(torch.Size([310]), torch.Size([310]), torch.Size([310]), torch.Size([]))

In [18]:
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased')

loading configuration file config.json from cache at C:\Users\admin/.cache\huggingface\hub\models--bert-base-multilingual-cased\snapshots\fdfce55e83dbed325647a63e7e1f5de19f0382ba\config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 119547
}

loading weights file pytorch_mo

In [19]:
training_args = TrainingArguments(output_dir='./results',
                                  num_train_epochs=1,
                                  evaluation_strategy='steps',
                                  eval_steps=10,
                                  per_device_train_batch_size=32,
                                  per_device_eval_batch_size=32,
                                  warmup_steps=100,
                                  weight_decay=0.01)
trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=train_dataset,
                  eval_dataset=val_dataset,
                  compute_metrics=compute_metrics)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [20]:
trainer.train()

***** Running training *****
  Num examples = 1656
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 52
  Number of trainable parameters = 177854978


  0%|          | 0/52 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 1656
  Batch size = 32


  0%|          | 0/52 [00:00<?, ?it/s]

IndexError: index 552 is out of bounds for dimension 0 with size 552

In [None]:
trainer.evaluate(eval_dataset=test_dataset)

In [None]:
del model
del trainer
torch.cuda.empty_cache()

In [None]:
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bert_model = BertModel.from_pretrained('skt/kobert-base-v1')

(…)kobert-base-v1/resolve/main/spiece.model:   0%|          | 0.00/371k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


(…)-v1/resolve/main/special_tokens_map.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

(…)se-v1/resolve/main/tokenizer_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

loading file spiece.model from cache at C:\Users\admin/.cache\huggingface\hub\models--skt--kobert-base-v1\snapshots\a9f5849fce18fb088f0cd0f9b29ec3f756958464\spiece.model
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at C:\Users\admin/.cache\huggingface\hub\models--skt--kobert-base-v1\snapshots\a9f5849fce18fb088f0cd0f9b29ec3f756958464\special_tokens_map.json
loading file tokenizer_config.json from cache at C:\Users\admin/.cache\huggingface\hub\models--skt--kobert-base-v1\snapshots\a9f5849fce18fb088f0cd0f9b29ec3f756958464\tokenizer_config.json
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


(…)/kobert-base-v1/resolve/main/config.json:   0%|          | 0.00/535 [00:00<?, ?B/s]

loading configuration file config.json from cache at C:\Users\admin/.cache\huggingface\hub\models--skt--kobert-base-v1\snapshots\a9f5849fce18fb088f0cd0f9b29ec3f756958464\config.json
Model config BertConfig {
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "author": "Heewon Jeon(madjakarta@gmail.com)",
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "kobert_version": 1.0,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 8002
}



pytorch_model.bin:   0%|          | 0.00/369M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at C:\Users\admin/.cache\huggingface\hub\models--skt--kobert-base-v1\snapshots\a9f5849fce18fb088f0cd0f9b29ec3f756958464\pytorch_model.bin
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at skt/kobert-base-v1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.


In [None]:
train_input = tokenizer(X_train, truncation=True, padding=True, return_tensors='pt')
val_input = tokenizer(X_val, truncation=True, padding=True, return_tensors='pt')
test_input = tokenizer(X_test, truncation=True, padding=True, return_tensors='pt')

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
train_dataset = OurDataset(train_input, y_train)
val_dataset = OurDataset(val_input, y_train)
test_dataset = OurDataset(test_input, y_train)

In [None]:
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
val_loader = DataLoader(val_dataset, shuffle=True, batch_size=32)
test_loader = DataLoader(test_dataset, shuffle=True, batch_size=32)

In [None]:
class MyModel(nn.Module):
    def __init__(self, pretrained_model, token_size, num_labels):
        super(MyModel, self).__init__()
        self.token_size = token_size
        self.num_labels = num_labels
        self.pretrained_model = pretrained_model
        self.classifier = nn.Linear(self.token_size, self.num_labels)
    
    def forward(self, inputs):
        outputs = self.pretrained_model(**inputs)
        bert_clf_token = outputs.last_hidden_state[:, 0, :]
        return self.classifier(bert_clf_token)

In [None]:
model = MyModel(bert_model, num_labels=2, token_size=bert_model.config.hidden_size)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)
model.train()

MyModel(
  (pretrained_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [None]:
optim = AdamW(model.parameters(), lr=1e-3, weight_decay=0.01)
loss_function = nn.CrossEntropyLoss()
num_epochs = 1
total_training_steps = num_epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(optimizer=optim, num_training_steps=total_training_steps, num_warmup_steps=200)

In [None]:
train_loss = 0
eval_steps = 10
step = 0

In [None]:
for epoch in range(num_epochs):
    for step, batch in enumerate(train_loader):
        model.train()
        optim.zero_grad()
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model(inputs)

        loss = loss_function(outputs, F.one_hot(labels, num_classes=2).float())
        train_loss += loss
        loss.backward()
        optim.step()
        scheduler.step()

        if (step+1) % eval_steps == 0:
            with torch.no_grad():
                val_loss = 0
                model.eval()
                for batch in val_loader:
                    inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
                    labels = batch['labels'].to(device)
                    outputs = model(inputs)

                    loss = loss_function(outputs, F.one_hot(labels, num_classes=2).float())
                    val_loss += loss

                avg_val_loss = val_loss / len(val_loader)
            avg_train_loss = train_loss / eval_steps
            print(f"Step: {step+1}, train loss: {avg_train_loss}, validation loss: {avg_val_loss}")


  item = {key: torch.tensor(val[idx]) for key, val in self.inputs.items()}


IndexError: index 568 is out of bounds for dimension 0 with size 552

In [None]:
metric = load_metric('accuracy')
model.eval()
for batch in test_loader:
    inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
    labels = batch['labels'].to(device)

    with torch.no_grad():
        outputs = model(inputs)

    predictions = torch.argmax(outputs, dim=-1)
    metric.add_batch(predictions=predictions, references=labels)

In [None]:
metric.compute()