In [6]:
# !pip install mxnet
# !pip install gluonnlp
# !pip install sentencepiece
# !pip install git+https://git@github.com/SKTBrain/KoBERT.git@master
# !pip install transformers==3.0.2

In [12]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import transformers
from tqdm import tqdm, tqdm_notebook
from torch import nn
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset, load_metric, load_from_disk
from transformers import AutoTokenizer, AutoModel
from transformers import AdamW, DataCollatorWithPadding
from transformers.optimization import get_cosine_schedule_with_warmup

# from kobert.utils import get_tokenizer
# from kobert.pytorch_kobert import get_pytorch_kobert_model

In [2]:
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = True  # type: ignore

seed_everything(42)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


---

In [33]:
model = AutoModel.from_pretrained('skt/kobert-base-v1')
tokenizer = AutoTokenizer.from_pretrained('skt/kobert-base-v1')

In [10]:
print(model)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(8002, 768, padding_idx=1)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )

In [23]:
bertmodel, vocab = get_pytorch_kobert_model()

/home/piai/hustar/Ainalyst/.cache/kobert_v1.zip[██████████████████████████████████████████████████]
/home/piai/hustar/Ainalyst/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece[██████████████████████████████████████████████████]


In [34]:
MAX_LEN = 512

def tokenized_fn(data):
    outputs = tokenizer(data["article"], padding=True, max_length=MAX_LEN, truncation=True)
    if 'label' in data:
        outputs["labels"] = data["label"]
    return outputs

train_dataset = load_dataset("csv", data_files="./data/train_report.csv")["train"]
valid_dataset = load_dataset("csv", data_files="./data/valid_report.csv")["train"]

train_dataset = train_dataset.map(tokenized_fn, remove_columns=['filename', 'article', 'length', 'label'])
valid_dataset = valid_dataset.map(tokenized_fn, remove_columns=['filename', 'article', 'length', 'label'])

Using custom data configuration default-9e3ed0e0e28cd7c9
Reusing dataset csv (/home/piai/.cache/huggingface/datasets/csv/default-9e3ed0e0e28cd7c9/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-1bdad492cbff101c
Reusing dataset csv (/home/piai/.cache/huggingface/datasets/csv/default-1bdad492cbff101c/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /home/piai/.cache/huggingface/datasets/csv/default-9e3ed0e0e28cd7c9/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-266a0de23b5f1ca0.arrow


  0%|          | 0/550 [00:00<?, ?ex/s]

In [35]:
class ClassificationHead(nn.Module):
    def __init__(self):
        super().__init__()
        # self.dense1 = torch.nn.Linear(768, 3072)
        self.dropout = torch.nn.Dropout(0.25)
        # self.dense2 = torch.nn.Linear(3072, 768)
        self.output = torch.nn.Linear(768, 2)
    
    def forward(self, features):
        # 보통 분류기에선 start 토큰에 분류 결과를 담음
        x = features[:, 0, :]    # take <s> token (equiv. to [CLS])
        x = x.reshape(-1, x.size(-1))
        x = self.dropout(x)
        
#         x = self.dense1(x)
#         x = torch.relu(x)
#         x = self.dropout(x)
        
#         x = self.dense2(x)
#         x = torch.tanh(x)
#         x = self.dropout(x)
        x = self.output(x)
        return x

class KobertForAinalyst(nn.Module):
    def __init__(self, model):
        super(KobertForAinalyst, self).__init__()
        self.model = model
        self.classifier = ClassificationHead()
    
    def forward(self, input_ids=None, attention_mask=None, labels=None):
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            # labels=labels
        )
        self.labels = labels
        logits = self.classifier(outputs["last_hidden_state"])
        # prob = torch.nn.functional.softmax(logits, dim=-1)
        
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)
            return logits, loss
        else:
            return logits

In [36]:
weight = torch.tensor([0.5, 0.5]).to(device)
loss_fct = torch.nn.CrossEntropyLoss(weight=weight)

In [37]:
model = KobertForAinalyst(model=model)
model.to(device)
model = torch.nn.DataParallel(model)
isParallel = True

In [38]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [39]:
batch_size = 8
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    sampler = torch.utils.data.RandomSampler(train_dataset),
    batch_size = batch_size,
    collate_fn = data_collator,
)

valid_dataloader = torch.utils.data.DataLoader(
    valid_dataset,
    sampler = torch.utils.data.SequentialSampler(valid_dataset),
    batch_size = batch_size,
    collate_fn = data_collator,
)

In [40]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

In [41]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [42]:
epochs = 5
for epoch in range(epochs):
    print(f"====== Epoch {epoch+1}/{epochs} ======")
    print("Training...")
    
    total_train_loss = 0
    model.train()
    
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch_input_ids = batch["input_ids"].to(device)
        batch_attention_mask = batch["attention_mask"].to(device)
        batch_labels = batch["labels"].to(device)
        
        model.zero_grad()
        
        logits, loss = model(
            input_ids = batch_input_ids,
            attention_mask = batch_attention_mask,
            labels = batch_labels,
        )
        
        # # output 모양 출력해보기
        # outputs = model(
        #     input_ids = batch_input_ids,
        #     attention_mask = batch_attention_mask,
        #     labels = batch_labels,
        # )
        # print("print outputs : ")
        # print(outputs)
        # print(outputs.last_hidden_state.shape)
        # break
    
        if isParallel:
            loss = loss.mean()

        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
    
        if step % 1000 == 0 and not step == 0:
            print("step : {:>5,} of {:>5,} loss: {:.5f}".format(step, len(train_dataloader), loss.item()))
    
    avg_train_loss = total_train_loss / len(train_dataloader)
    print()
    print(" Average training loss: {0:.5f}".format(avg_train_loss))
    
    # Validation
    print()
    print("Running Validation...")
    
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    
    for step, batch in enumerate(tqdm(valid_dataloader)):
        batch_input_ids = batch["input_ids"].to(device)
        batch_attention_mask = batch["attention_mask"].to(device)
        batch_labels = batch["labels"].to(device)
    
        with torch.no_grad():
            logits, loss = model(
                input_ids = batch_input_ids,
                attention_mask = batch_attention_mask,
                labels = batch_labels,
            )
            
            if isParallel:
                loss = loss.mean()
            
            total_eval_loss += loss.item()
            logits = logits.detach().cpu().numpy()
            label_ids = batch_labels.to("cpu").numpy()
            total_eval_accuracy += flat_accuracy(logits, label_ids)

    avg_val_accuracy = total_eval_accuracy / len(valid_dataloader)
    print("Accuracy: {0:.5f}".format(avg_val_accuracy))

Training...





 Average training loss: 0.68584

Running Validation...
Accuracy: 0.59360
Training...

 Average training loss: 0.65627

Running Validation...
Accuracy: 0.74638
Training...

 Average training loss: 0.54111

Running Validation...
Accuracy: 0.71981
Training...

 Average training loss: 0.54420

Running Validation...
Accuracy: 0.73370
Training...

 Average training loss: 0.49660

Running Validation...
Accuracy: 0.72705
