In [71]:
!pip install transformers
!pip install pytorch_lightning



In [72]:
import numpy as np
import pandas as pd
import torch

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.core.lightning import LightningModule
from torch.utils.data import DataLoader, Dataset

from transformers.optimization import AdamW, get_cosine_schedule_with_warmup
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel

In [73]:
import logging
# Grab the root logger and lower its threshold to INFO so that the
# `logging.info(...)` calls made later in the notebook are emitted.
logger = logging.getLogger()
logger.setLevel("INFO")

In [88]:
import gc
# Force a garbage-collection pass; as the last expression of the cell, the
# value returned by gc.collect() is what the notebook displays.
gc.collect()

3482

In [89]:
# --- Special tokens -------------------------------------------------------
# Speaker markers placed in front of the question and the answer.
USER_TOKEN, SYSTEM_TOKEN = '<usr>', '<sys>'

# Sequence boundaries share one token string; unknown-token marker.
BOS_TOKEN = EOS_TOKEN = '</s>'
UNK_TOKEN = '<unk>'

# Label placeholder, question/sentiment separator, and padding token.
MASK_TOKEN, SENT_TOKEN = '<unused0>', '<unused1>'
PAD_TOKEN = '<pad>'

# --- Data ------------------------------------------------------------------
batch_size, max_len = 96, 32

# --- Optimisation ----------------------------------------------------------
lr, warmup_ratio = 5e-5, 0.1

# --- Trainer ---------------------------------------------------------------
gpus, max_epochs = 1, 2

In [90]:
# Shared tokenizer for the 'skt/kogpt2-base-v2' checkpoint. The special tokens
# are passed explicitly from the constants defined in the configuration cell.
TOKENIZER = PreTrainedTokenizerFast.from_pretrained(
    'skt/kogpt2-base-v2',
    bos_token=BOS_TOKEN,
    eos_token=EOS_TOKEN,
    unk_token=UNK_TOKEN,
    pad_token=PAD_TOKEN,
    mask_token=MASK_TOKEN,
)

In [91]:
# Load the pretrained 'skt/kogpt2-base-v2' weights into a GPT2LMHeadModel.
# CustomModel stores this same module-level instance as `self.kogpt2`.
KOGPT2 = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')

In [92]:
!git clone https://github.com/haven-jeon/Chatbot_data.git

fatal: destination path 'Chatbot_data' already exists and is not an empty directory.


In [93]:
class CustomDataset(Dataset):
    """Turns question/answer rows into fixed-length language-model samples.

    Every row is rendered as two token runs:

    * question run: ``USER_TOKEN + Q + SENT_TOKEN + label``
    * answer run:   ``SYSTEM_TOKEN + A + EOS_TOKEN``

    The concatenation is truncated/padded to exactly ``max_len`` tokens.
    """

    def __init__(self, chats, max_len=32):
        """
        Args:
            chats: table accessed positionally through ``.iloc``; each row must
                provide ``'Q'``, ``'A'`` and ``'label'`` entries.
            max_len: length of every sequence returned by ``__getitem__``.
        """
        self._data = chats
        self.first = True  # not read anywhere inside this class
        self.q_token = USER_TOKEN
        self.a_token = SYSTEM_TOKEN

        self.sent_token = SENT_TOKEN
        self.bos = BOS_TOKEN
        self.eos = EOS_TOKEN
        self.mask = MASK_TOKEN
        self.pad = PAD_TOKEN
        self.max_len = max_len
        self.tokenizer = TOKENIZER

    def __len__(self):
        """Number of rows in the underlying table."""
        return len(self._data)

    def __getitem__(self, idx):
        """Build one training sample.

        Args:
            idx: positional row index.

        Returns:
            tuple ``(token_ids, mask, labels_ids)``:

            * ``token_ids`` - list of ``max_len`` ids for question + answer
              tokens, right-padded with the tokenizer's pad id.
            * ``mask`` - ``numpy`` array of ``max_len`` ints, 1 on answer
              positions and 0 on question and padding positions.
            * ``labels_ids`` - list of ``max_len`` ids: the mask token on every
              question position, then the answer tokens shifted left by one
              (each answer position is labelled with its following token),
              right-padded with the pad id.

        Raises:
            AssertionError: if no room is left for the answer even after the
                question has been cut down to ``max_len // 2`` tokens.
        """
        turn = self._data.iloc[idx]

        q = turn['Q']
        a = turn['A']
        sentiment = str(turn['label'])

        q_toked = self.tokenizer.tokenize(self.q_token + q + self.sent_token + sentiment)
        a_toked = self.tokenizer.tokenize(self.a_token + a + self.eos)
        q_len = len(q_toked)
        a_len = len(a_toked)

        if q_len + a_len > self.max_len:
            # Give the answer whatever space the question leaves free.
            a_len = self.max_len - q_len

            if a_len <= 0:
                # The question alone fills the window: keep only its tail
                # (half of the window) so the answer gets the other half.
                q_toked = q_toked[-(self.max_len // 2):]
                q_len = len(q_toked)
                a_len = self.max_len - q_len
                assert a_len > 0

            # When the answer is too long, only its leading tokens are kept.
            a_toked = a_toked[:a_len]
            a_len = len(a_toked)

        # Question positions carry the mask token as label; the answer labels
        # drop the leading system token so position i is labelled with token i+1.
        labels = [self.mask] * q_len + a_toked[1:]

        mask = [0] * q_len + [1] * a_len + [0] * (self.max_len - q_len - a_len)

        pad_id = self.tokenizer.pad_token_id

        labels_ids = self.tokenizer.convert_tokens_to_ids(labels)
        labels_ids += [pad_id] * (self.max_len - len(labels_ids))

        token_ids = self.tokenizer.convert_tokens_to_ids(q_toked + a_toked)
        token_ids += [pad_id] * (self.max_len - len(token_ids))

        return token_ids, np.array(mask), labels_ids

In [94]:
# training_step - validation_step - validation_epoch_end
class CustomModel(LightningModule):
    """LightningModule that fine-tunes the module-level ``KOGPT2`` model on
    question/answer pairs and offers greedy decoding helpers for chatting.
    """

    # Upper bound on the number of tokens generated for a single reply, so
    # decoding always terminates even if the end-of-sequence token never wins.
    MAX_GEN_TOKENS = 64

    def __init__(self):
        super().__init__()

        # Kept so existing code reading this attribute keeps working;
        # training_step no longer uses it.
        self.neg = -1e18
        self.kogpt2 = KOGPT2
        # Per-token losses are needed so that they can be masked before averaging.
        self.criterion = torch.nn.CrossEntropyLoss(reduction='none')

        self.max_epochs = max_epochs
        self.warmup_ratio = warmup_ratio
        self.lr = lr

    def _collate_fn(self, batch):
        """Regroup a list of ``(token_ids, mask, labels_ids)`` samples into
        three LongTensors, each stacking one field over the batch."""
        data = [item[0] for item in batch]
        mask = [item[1] for item in batch]
        label = [item[2] for item in batch]
        return torch.LongTensor(data), torch.LongTensor(mask), torch.LongTensor(label)

    def train_dataloader(self):
        """Create the training DataLoader over ``Chatbot_data/ChatbotData.csv``."""
        data_df = pd.read_csv('Chatbot_data/ChatbotData.csv')
        dataset = CustomDataset(data_df, max_len=max_len)
        dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=2, shuffle=True, collate_fn=self._collate_fn)

        return dataloader

    def forward(self, x):
        """Run the wrapped language model on token ids ``x`` and return its logits."""
        out = self.kogpt2(x)
        return out.logits

    def training_step(self, batch, batch_idx):
        """Mean cross-entropy over the positions where ``mask == 1``.

        Args:
            batch: ``(train_ids, mask, labels_ids)`` as produced by
                ``_collate_fn``; each is expected to be (batch_size, max_len).
            batch_idx: index of the batch (unused).

        Returns:
            Scalar loss tensor, also logged as ``'train_loss'``.
        """
        train_ids, mask, labels_ids = batch

        out = self(train_ids)
        # CrossEntropyLoss expects the class dimension second, hence the
        # transpose of the last two axes of the logits.
        loss = self.criterion(out.transpose(2, 1), labels_ids)

        # Zero the loss outside the answer span instead of overwriting the
        # logits there: masked positions then contribute nothing to the
        # reported value (nor to the gradient), and no vocabulary-sized mask
        # tensor has to be materialised.
        weights = mask.to(loss.dtype)
        # clamp guards against a batch whose mask is entirely zero.
        loss_avg = (loss * weights).sum() / weights.sum().clamp(min=1)

        self.log('train_loss', loss_avg)
        return loss_avg

    # def validation_step(self, batch, batch_idx):
    #     pass

    def configure_optimizers(self):
        """AdamW with two weight-decay groups plus a per-step cosine schedule
        with linear warm-up."""
        param_optimizer = list(self.named_parameters())

        # Parameters excluded from weight decay. The wrapped GPT-2 model names
        # its LayerNorm modules ln_1 / ln_2 / ln_f, so the 'LayerNorm.*'
        # patterns alone never match; the ln_* weight patterns cover them.
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight',
                    'ln_1.weight', 'ln_2.weight', 'ln_f.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.lr, correct_bias=False)

        # Learning-rate warm-up. Note: this builds a second dataloader (and
        # re-reads the CSV) only to count the number of steps per epoch.
        num_train_steps = len(self.train_dataloader()) * self.max_epochs
        num_warmup_steps = int(num_train_steps * self.warmup_ratio)
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps)
        # NOTE(review): training_step only logs 'train_loss'; the 'loss' name
        # below matches no logged metric — confirm whether 'monitor' is
        # consulted at all for this scheduler type.
        lr_scheduler = {'scheduler': scheduler, 'name': 'cosine_schedule_with_warmup',
                        'monitor': 'loss', 'interval': 'step',
                        'frequency': 1}
        return [optimizer], [lr_scheduler]

    def _generate_answer(self, q, sent):
        """Greedy-decode a reply to question ``q`` with sentiment string ``sent``.

        The prompt plus the reply generated so far is re-encoded on every step
        and the highest-scoring token at the last position is appended, until
        ``EOS_TOKEN`` is produced or ``MAX_GEN_TOKENS`` tokens were generated.
        """
        tok = TOKENIZER
        prompt = USER_TOKEN + q + SENT_TOKEN + sent + SYSTEM_TOKEN
        a = ''
        # Inference only: do not record autograd history.
        with torch.no_grad():
            for _ in range(self.MAX_GEN_TOKENS):
                input_ids = torch.LongTensor(tok.encode(prompt + a)).unsqueeze(dim=0)
                # Feed the ids on whatever device the module currently lives on.
                pred = self(input_ids.to(self.device))
                next_id = torch.argmax(pred[0, -1], dim=-1).item()
                gen = tok.convert_ids_to_tokens([next_id])[0]
                if gen == EOS_TOKEN:
                    break
                # '▁' is the tokenizer's word-boundary marker.
                a += gen.replace('▁', ' ')
        return a.strip()

    def query(self, q, sent='0'):
        """Return the model's reply to a single question.

        Args:
            q: question text.
            sent: sentiment label as a string, appended after ``SENT_TOKEN``.

        Returns:
            The generated answer with surrounding whitespace stripped.
        """
        return self._generate_answer(q, sent)

    def chat(self, sent='0'):
        """Interactive console loop: read questions from stdin and print the
        replies until the user types ``quit``.

        Args:
            sent: sentiment label as a string, used for every question.
        """
        while 1:
            q = input('User > ').strip()
            if q == 'quit':
                break
            print("System > {}".format(self._generate_answer(q, sent)))

In [95]:
# Release cached CUDA memory left over from earlier runs in this kernel.
torch.cuda.empty_cache()

# Keep the checkpoint with the lowest logged 'train_loss', plus the last one.
checkpoint_callback = ModelCheckpoint(
    dirpath='./model_chp/',
    filename='{epoch:02d}-{train_loss:.2f}',
    verbose=True,
    save_last=True,
    monitor='train_loss',
    mode='min',
)

model = CustomModel()
model.train()

# The ModelCheckpoint instance is registered through `callbacks`; the
# `checkpoint_callback` Trainer argument is a flag, not a slot for a callback
# object.
trainer = Trainer(gpus=gpus,
                  max_epochs=max_epochs,
                  callbacks=[checkpoint_callback],
                  gradient_clip_val=1.0)
trainer.fit(model)
logging.info('best model path {}'.format(checkpoint_callback.best_model_path))

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | kogpt2    | GPT2LMHeadModel  | 125 M 
1 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
125 M     Trainable params
0         Non-trainable params
125 M     Total params
500.656   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




INFO:root:best model path 


In [96]:
# Write the trainer's current state to 'model.ckpt'; the serving cell further
# down reloads this file via CustomModel.load_from_checkpoint.
trainer.save_checkpoint("model.ckpt")

In [97]:
!pip install flask-ngrok
!pip install flask_cors



In [98]:
from flask_ngrok import run_with_ngrok
from flask import Flask, jsonify, request
from flask_cors import CORS

# Reload the fine-tuned weights and switch the module to evaluation mode.
model = CustomModel.load_from_checkpoint('model.ckpt')
model.eval()

app = Flask(__name__)

# Expose the local server through an ngrok tunnel and allow cross-origin calls.
run_with_ngrok(app)
CORS(app)

@app.route('/api/chat', methods=['POST'])
def test_router():
    """POST /api/chat - answer one chat message.

    Form fields:
        query: the user's message (required).
        sent:  sentiment label as a string; optional, defaults to '0' to match
               the default of ``CustomModel.query``.

    Returns:
        JSON object ``{"system": <generated reply>}``.
    """
    query = request.form['query']
    sent = request.form.get('sent', '0')
    # Pure inference: avoid recording autograd history on every request.
    with torch.no_grad():
        system = model.query(query, sent)
    return jsonify({
        'system': system
    })

app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


INFO:werkzeug: * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://b71f8842b845.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040
