Last Update @ 2020.12.04

- Huggingface Transformers 4.0.0  버전 반영

# Package 설치 & 데이터 받기

In [None]:
!pip install -q transformers pytorch_lightning emoji soynlp

In [None]:
!git clone https://github.com/e9t/nsmc

In [None]:
!head nsmc/ratings_train.txt

# 패키지 import & 기본 Args 설정

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import os
import pandas as pd

from pprint import pprint

import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.optim.lr_scheduler import ExponentialLR

from pytorch_lightning import LightningModule, Trainer, seed_everything

from transformers import BertForSequenceClassification, BertTokenizer, AdamW

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import re
import emoji
from soynlp.normalizer import repeat_normalize

## 기본 학습 Arguments

In [6]:
TRAIN_PATH = "/content/drive/MyDrive/train.tsv"
VAL_PATH = "/content/drive/MyDrive/valid.tsv"


class Arg:
    random_seed: int = 42  # Random Seed
    pretrained_model: str = 'beomi/kcbert-large'  # Transformers PLM name
    pretrained_tokenizer: str = ''  # Optional, Transformers Tokenizer Name. Overrides `pretrained_model`
    auto_batch_size: str = 'power'  # Let PyTorch Lightening find the best batch size 
    batch_size: int = 0  # Optional, Train/Eval Batch Size. Overrides `auto_batch_size` 
    lr: float = 5e-6  # Starting Learning Rate
    epochs: int = 5  # Max Epochs
    max_length: int = 150  # Max Length input size
    report_cycle: int = 100  # Report (Train Metrics) Cycle
    train_data_path: str = TRAIN_PATH  # Train Dataset file 
    val_data_path: str = VAL_PATH  # Validation Dataset file 
    cpu_workers: int = os.cpu_count()  # Multi cpu workers
    test_mode: bool = False  # Test Mode enables `fast_dev_run`
    optimizer: str = 'AdamW'  # AdamW vs AdamP
    lr_scheduler: str = 'exp'  # ExponentialLR vs CosineAnnealingWarmRestarts
    fp16: bool = False  # Enable train on FP16
    tpu_cores: int = 0  # Enable TPU with 1 core or 8 cores

args = Arg()

## 기본값을 Override 하고싶은 경우 아래와 같이 수정

In [7]:
!nvidia-smi

Wed Jul 13 02:48:39 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P0    29W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

위에서 GPU가 V100/P100이면 아래 `batch_size`  를 32 이상으로 하셔도 됩니다.

In [8]:
# args.tpu_cores = 8  # Enables TPU
args.fp16 = True  # Enables GPU FP16
args.batch_size = 16  # Force setup batch_size

# Model 만들기 with Pytorch Lightning

In [9]:
class Model(LightningModule):
    def __init__(self, options):
        super().__init__()
        self.args = options
        self.bert = BertForSequenceClassification.from_pretrained(self.args.pretrained_model)
        #self.config.num_labels = 5 # 라벨링 종류가 바뀔 때 해당 값을 변경
        self.tokenizer = BertTokenizer.from_pretrained(
            self.args.pretrained_tokenizer
            if self.args.pretrained_tokenizer
            else self.args.pretrained_model
        )

    def forward(self, **kwargs):
        return self.bert(**kwargs)

    def training_step(self, batch, batch_idx):
        data, labels = batch
        output = self(input_ids=data, labels=labels)

        # Transformers 4.0.0+
        loss = output.loss
        logits = output.logits
        
        preds = logits.argmax(dim=-1)

        y_true = labels.cpu().numpy()
        y_pred = preds.cpu().numpy()

        # Acc, Precision, Recall, F1
        metrics = [
            metric(y_true=y_true, y_pred=y_pred)
            for metric in
            (accuracy_score, precision_score, recall_score, f1_score)
        ]

        tensorboard_logs = {
            'train_loss': loss.cpu().detach().numpy().tolist(),
            'train_acc': metrics[0],
            'train_precision': metrics[1],
            'train_recall': metrics[2],
            'train_f1': metrics[3],
        }
        if (batch_idx % self.args.report_cycle) == 0:
            print()
            pprint(tensorboard_logs)
        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        data, labels = batch
        output = self(input_ids=data, labels=labels)

        # Transformers 4.0.0+
        loss = output.loss
        logits = output.logits

        preds = logits.argmax(dim=-1)

        y_true = list(labels.cpu().numpy())
        y_pred = list(preds.cpu().numpy())

        return {
            'loss': loss,
            'y_true': y_true,
            'y_pred': y_pred,
        }

    def validation_epoch_end(self, outputs):
        loss = torch.tensor(0, dtype=torch.float)
        for i in outputs:
            loss += i['loss'].cpu().detach()
        _loss = loss / len(outputs)

        loss = float(_loss)
        y_true = []
        y_pred = []

        for i in outputs:
            y_true += i['y_true']
            y_pred += i['y_pred']

        # Acc, Precision, Recall, F1
        metrics = [
            metric(y_true=y_true, y_pred=y_pred)
            for metric in
            (accuracy_score, precision_score, recall_score, f1_score)
        ]

        tensorboard_logs = {
            'val_loss': loss,
            'val_acc': metrics[0],
            'val_precision': metrics[1],
            'val_recall': metrics[2],
            'val_f1': metrics[3],
        }

        print()
        pprint(tensorboard_logs)
        return {'loss': _loss, 'log': tensorboard_logs}

    def configure_optimizers(self):
        if self.args.optimizer == 'AdamW':
            optimizer = AdamW(self.parameters(), lr=self.args.lr)
        elif self.args.optimizer == 'AdamP':
            from adamp import AdamP
            optimizer = AdamP(self.parameters(), lr=self.args.lr)
        else:
            raise NotImplementedError('Only AdamW and AdamP is Supported!')
        if self.args.lr_scheduler == 'cos':
            scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=1, T_mult=2)
        elif self.args.lr_scheduler == 'exp':
            scheduler = ExponentialLR(optimizer, gamma=0.5)
        else:
            raise NotImplementedError('Only cos and exp lr scheduler is Supported!')
        return {
            'optimizer': optimizer,
            'scheduler': scheduler,
        }

    def read_data(self, path):
        if path.endswith('xlsx'):
            return pd.read_excel(path)
        elif path.endswith('csv'):
            return pd.read_csv(path)
        elif path.endswith('tsv') or path.endswith('txt'):
            return pd.read_csv(path, sep='\t')
        else:
            raise NotImplementedError('Only Excel(xlsx)/Csv/Tsv(txt) are Supported')

    def preprocess_dataframe(self, df):
        emojis = ''.join(emoji.UNICODE_EMOJI.keys())
        pattern = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-힣{emojis}]+')
        url_pattern = re.compile(
            r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')

        def clean(x):
            x = pattern.sub(' ', x)
            x = url_pattern.sub('', x)
            x = x.strip()
            x = repeat_normalize(x, num_repeats=2)
            return x

        df['문장'] = df['문장'].map(lambda x: self.tokenizer.encode(
            clean(str(x)),
            padding='max_length',
            max_length=self.args.max_length,
            truncation=True,
        ))
        return df

    def train_dataloader(self):
        df = self.read_data(self.args.train_data_path)
        df = self.preprocess_dataframe(df)

        dataset = TensorDataset(
            torch.tensor(df['문장'].to_list(), dtype=torch.long),
            torch.tensor(df['악플'].to_list(), dtype=torch.long),
        )
        return DataLoader(
            dataset,
            batch_size=self.args.batch_size or self.batch_size,
            shuffle=True,
            num_workers=self.args.cpu_workers,
        )

    def val_dataloader(self):
        df = self.read_data(self.args.val_data_path)
        df = self.preprocess_dataframe(df)

        dataset = TensorDataset(
            torch.tensor(df['문장'].to_list(), dtype=torch.long),
            torch.tensor(df['악플'].to_list(), dtype=torch.long),
        )
        return DataLoader(
            dataset,
            batch_size=self.args.batch_size or self.batch_size,
            shuffle=False,
            num_workers=self.args.cpu_workers,
        )


In [10]:
model = Model(args)
trainer = Trainer(
        max_epochs=args.epochs,
        fast_dev_run=args.test_mode,
        num_sanity_val_steps=None if args.test_mode else 0,
        auto_scale_batch_size=args.auto_batch_size if args.auto_batch_size and not args.batch_size else False,
        # For GPU Setup
        deterministic=torch.cuda.is_available(),
        gpus=-1 if torch.cuda.is_available() else None,
        precision=16 if args.fp16 else 32,
        # For TPU Setup
        # tpu_cores=args.tpu_cores if args.tpu_cores else None,
    )


def main():
    print("Using PyTorch Ver", torch.__version__)
    print("Fix Seed:", args.random_seed)
    seed_everything(args.random_seed)

    print(":: Start Training ::")
    trainer.fit(model)

Downloading:   0%|          | 0.00/672 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Some weights of the model checkpoint at beomi/kcbert-large were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initial

Downloading:   0%|          | 0.00/244k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


# 학습!

> 주의: 1epoch별로 GPU-P100기준 약 2~3시간, GPU V100기준 ~40분이 걸립니다.

> Update @ 2020.09.01
> 최근 Colab Pro에서 V100이 배정됩니다.

```python
# 1epoch 기준 아래 score가 나옵니다.
{'val_acc': 0.90522,
 'val_f1': 0.9049023739289227,
 'val_loss': 0.23429009318351746,
 'val_precision': 0.9143146796431468,
 'val_recall': 0.8956818813808446}
```

In [11]:
main()

Global seed set to 42
Missing logger folder: /content/lightning_logs


Using PyTorch Ver 1.12.0+cu113
Fix Seed: 42
:: Start Training ::


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name | Type                          | Params
-------------------------------------------------------
0 | bert | BertForSequenceClassification | 334 M 
-------------------------------------------------------
334 M     Trainable params
0         Non-trainable params
334 M     Total params
668.785   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]


{'train_acc': 0.75,
 'train_f1': 0.8571428571428571,
 'train_loss': 0.59259033203125,
 'train_precision': 0.8571428571428571,
 'train_recall': 0.8571428571428571}

{'train_acc': 0.75,
 'train_f1': 0.8571428571428571,
 'train_loss': 0.456085205078125,
 'train_precision': 0.75,
 'train_recall': 1.0}

{'train_acc': 0.875,
 'train_f1': 0.9333333333333333,
 'train_loss': 0.3185920715332031,
 'train_precision': 0.875,
 'train_recall': 1.0}

{'train_acc': 0.8125,
 'train_f1': 0.888888888888889,
 'train_loss': 0.5354824066162109,
 'train_precision': 0.8,
 'train_recall': 1.0}

{'train_acc': 0.875,
 'train_f1': 0.9230769230769231,
 'train_loss': 0.3078315258026123,
 'train_precision': 0.9230769230769231,
 'train_recall': 0.9230769230769231}

{'train_acc': 0.75,
 'train_f1': 0.8181818181818182,
 'train_loss': 0.5087013244628906,
 'train_precision': 0.8181818181818182,
 'train_recall': 0.8181818181818182}

{'train_acc': 0.9375,
 'train_f1': 0.967741935483871,
 'train_loss': 0.25256967544555664,


Validation: 0it [00:00, ?it/s]


{'val_acc': 0.8648648648648649,
 'val_f1': 0.914217767963309,
 'val_loss': 0.31254303455352783,
 'val_precision': 0.8722852512155591,
 'val_recall': 0.9603854389721628}

{'train_acc': 0.8125,
 'train_f1': 0.888888888888889,
 'train_loss': 0.45171046257019043,
 'train_precision': 0.8571428571428571,
 'train_recall': 0.9230769230769231}

{'train_acc': 0.75,
 'train_f1': 0.8571428571428571,
 'train_loss': 0.3593275547027588,
 'train_precision': 0.8571428571428571,
 'train_recall': 0.8571428571428571}

{'train_acc': 0.9375,
 'train_f1': 0.9565217391304348,
 'train_loss': 0.18567895889282227,
 'train_precision': 1.0,
 'train_recall': 0.9166666666666666}

{'train_acc': 0.875,
 'train_f1': 0.888888888888889,
 'train_loss': 0.421451210975647,
 'train_precision': 0.8,
 'train_recall': 1.0}

{'train_acc': 1.0,
 'train_f1': 1.0,
 'train_loss': 0.031863272190093994,
 'train_precision': 1.0,
 'train_recall': 1.0}

{'train_acc': 0.9375,
 'train_f1': 0.9565217391304348,
 'train_loss': 0.278629302978

Validation: 0it [00:00, ?it/s]


{'val_acc': 0.8753010436178753,
 'val_f1': 0.9197105444521021,
 'val_loss': 0.2961452305316925,
 'val_precision': 0.8890739506995337,
 'val_recall': 0.9525339043540328}

{'train_acc': 1.0,
 'train_f1': 1.0,
 'train_loss': 0.09228825569152832,
 'train_precision': 1.0,
 'train_recall': 1.0}

{'train_acc': 0.8125,
 'train_f1': 0.88,
 'train_loss': 0.4053474962711334,
 'train_precision': 1.0,
 'train_recall': 0.7857142857142857}

{'train_acc': 0.875,
 'train_f1': 0.9166666666666666,
 'train_loss': 0.21148312091827393,
 'train_precision': 0.8461538461538461,
 'train_recall': 1.0}

{'train_acc': 0.9375,
 'train_f1': 0.9565217391304348,
 'train_loss': 0.1695912480354309,
 'train_precision': 0.9166666666666666,
 'train_recall': 1.0}

{'train_acc': 0.9375,
 'train_f1': 0.9523809523809523,
 'train_loss': 0.08769187331199646,
 'train_precision': 0.9090909090909091,
 'train_recall': 1.0}

{'train_acc': 1.0,
 'train_f1': 1.0,
 'train_loss': 0.11972793936729431,
 'train_precision': 1.0,
 'train_rec

Validation: 0it [00:00, ?it/s]


{'val_acc': 0.877976986887878,
 'val_f1': 0.9173913043478261,
 'val_loss': 0.30854693055152893,
 'val_precision': 0.9315673289183223,
 'val_recall': 0.9036402569593148}

{'train_acc': 1.0,
 'train_f1': 1.0,
 'train_loss': 0.05322951078414917,
 'train_precision': 1.0,
 'train_recall': 1.0}

{'train_acc': 0.9375,
 'train_f1': 0.9523809523809523,
 'train_loss': 0.1248534619808197,
 'train_precision': 0.9090909090909091,
 'train_recall': 1.0}

{'train_acc': 0.9375,
 'train_f1': 0.962962962962963,
 'train_loss': 0.27305060625076294,
 'train_precision': 0.9285714285714286,
 'train_recall': 1.0}

{'train_acc': 1.0,
 'train_f1': 1.0,
 'train_loss': 0.10006323456764221,
 'train_precision': 1.0,
 'train_recall': 1.0}

{'train_acc': 0.9375,
 'train_f1': 0.9473684210526316,
 'train_loss': 0.16430658102035522,
 'train_precision': 1.0,
 'train_recall': 0.9}

{'train_acc': 1.0,
 'train_f1': 1.0,
 'train_loss': 0.02290957421064377,
 'train_precision': 1.0,
 'train_recall': 1.0}

{'train_acc': 1.0,
 '

Validation: 0it [00:00, ?it/s]


{'val_acc': 0.8774417982338775,
 'val_f1': 0.9195361911454674,
 'val_loss': 0.35777124762535095,
 'val_precision': 0.9055363321799308,
 'val_recall': 0.9339757316202713}

{'train_acc': 1.0,
 'train_f1': 1.0,
 'train_loss': 0.03529820591211319,
 'train_precision': 1.0,
 'train_recall': 1.0}

{'train_acc': 0.9375,
 'train_f1': 0.9600000000000001,
 'train_loss': 0.13119028508663177,
 'train_precision': 1.0,
 'train_recall': 0.9230769230769231}

{'train_acc': 1.0,
 'train_f1': 1.0,
 'train_loss': 0.003913357853889465,
 'train_precision': 1.0,
 'train_recall': 1.0}

{'train_acc': 1.0,
 'train_f1': 1.0,
 'train_loss': 0.07202080637216568,
 'train_precision': 1.0,
 'train_recall': 1.0}

{'train_acc': 1.0,
 'train_f1': 1.0,
 'train_loss': 0.020320482552051544,
 'train_precision': 1.0,
 'train_recall': 1.0}

{'train_acc': 1.0,
 'train_f1': 1.0,
 'train_loss': 0.024102605879306793,
 'train_precision': 1.0,
 'train_recall': 1.0}

{'train_acc': 1.0,
 'train_f1': 1.0,
 'train_loss': 0.003933630883

Validation: 0it [00:00, ?it/s]


{'val_acc': 0.8766390152528767,
 'val_f1': 0.9180152943268719,
 'val_loss': 0.43592190742492676,
 'val_precision': 0.9149237858915278,
 'val_recall': 0.9211277658815132}


In [12]:
def infer(x):
    return torch.softmax(
        model(**model.tokenizer(x, return_tensors='pt')
    ).logits, dim=-1)


print(infer('아 좆같다 씨발'))

tensor([[3.3025e-04, 9.9967e-01]], grad_fn=<SoftmaxBackward0>)


In [14]:
loop = True;

while loop: 
  sentence = input("하고싶은 말을 입력해주세요 : ") 
  if sentence == 0: 
    break;
  print(infer(sentence))

하고싶은 말을 입력해주세요 : 왜사냐
tensor([[0.0313, 0.9687]], grad_fn=<SoftmaxBackward0>)
하고싶은 말을 입력해주세요 : 오..
tensor([[0.9983, 0.0017]], grad_fn=<SoftmaxBackward0>)
하고싶은 말을 입력해주세요 : 진짜 개새끼 좆같이 게임하네
tensor([[3.9815e-04, 9.9960e-01]], grad_fn=<SoftmaxBackward0>)
하고싶은 말을 입력해주세요 : 와 저렇게 하려고해도 힘들꺼 같은데
tensor([[0.9883, 0.0117]], grad_fn=<SoftmaxBackward0>)
하고싶은 말을 입력해주세요 : 손가락 놀리는 꼬라지바
tensor([[0.0055, 0.9945]], grad_fn=<SoftmaxBackward0>)


KeyboardInterrupt: ignored

In [None]:
def preprocess_hoho(test_sentence):
        emojis = ''.join(emoji.UNICODE_EMOJI.keys())
        pattern = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-힣{emojis}]+')
        url_pattern = re.compile(
            r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
        
        tokenizer = BertTokenizer.from_pretrained(
            args.pretrained_tokenizer
            if args.pretrained_tokenizer
            else args.pretrained_model
        )


        def clean(x):
            x = pattern.sub(' ', x)
            x = url_pattern.sub('', x)
            x = x.strip()
            x = repeat_normalize(x, num_repeats=2)
            return x

        sentence = tokenizer.encode(
            clean(str(test_sentence)),
            padding='max_length',
            max_length=args.max_length,
            truncation=True,
        )

        return sentence


In [None]:
loop = True;

while loop:
  sentence = input("하고싶은 말을 입력해주세요 : ")
  if sentence == 0 :
        break
  vector = preprocess_hoho(sentence) # 단어 백터를 받고서 모델에 넘겨주기
  #모델에서 문장을 받고서 처리를해서 라벨링을 진행
  print(torch.tensor([vector], dtype=torch.long).shape)
  
  model.forward(sentence)

  dataset = TensorDataset(
            torch.tensor([vector], dtype=torch.long),
            torch.tensor([0], dtype=torch.long),
  )


  temp = DataLoader(
              dataset,
              batch_size=args.batch_size,
              shuffle=False,
              num_workers=args.cpu_workers,
  )


  proto = trainer.validate(dataloaders=temp)
  
  print(proto)
  # Transformers 4.0.0+


KeyboardInterrupt: ignored