# Classification using KoGPT2

In [None]:
!nvidia-smi

Wed Jul 20 00:46:27 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   55C    P0    28W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!pip install mxnet
!pip install gluonnlp
!pip install sentencepiece
!pip install lightning-flash
!pip install transformers
!pip install pytorch_lightning
!pip install soynlp

In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import os
os.chdir('/content/drive/MyDrive/shopping')

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from transformers import PreTrainedTokenizerFast, GPT2ForSequenceClassification

from sklearn.utils import shuffle
from flash.core.optimizers import LAMB

from tqdm.notebook import tqdm

## Load Data

In [None]:
# Read the competition CSVs from ./data (relative to the working directory).
# From the training file only the review text and its label are kept.
# NOTE(review): "train_aug" presumably means an augmented training set — confirm.
train = pd.read_csv("./data/train_aug.csv")[['reviews', 'target']]
test_df = pd.read_csv("./data/test.csv")
submission = pd.read_csv("./data/sample_submission.csv")

## Data Preprocessing

In [None]:
from soynlp.normalizer import emoticon_normalize, only_text
import re

def normalise(df):
    """Return a cleaned copy of every review in ``df.reviews``.

    Each review goes through these steps, in order:

    1. Runs of ``!`` (with any dots directly before them) collapse to one ``!``.
    2. Runs of ``?`` (with any dots directly before them) collapse to one ``?``.
    3. Runs of ``.`` collapse to a single ``.``; runs of ``,`` to a single ``,``.
    4. Every character that is not an ASCII letter or digit, a Hangul
       syllable, whitespace, or one of ``[ ] . , ! ? " '`` is removed.
    5. The text is passed through soynlp's ``only_text`` and then
       ``emoticon_normalize(..., num_repeats=1)``.
    6. Runs of whitespace collapse to a single space.

    Args:
        df: Object exposing a ``reviews`` column whose ``to_list()`` yields
            strings (a pandas DataFrame in this notebook).

    Returns:
        list[str]: The cleaned reviews, in the same order as ``df.reviews``.
    """
    # Raw strings keep the regex escapes (\. \s \[) intact without relying on
    # Python passing unknown escape sequences through, which is deprecated.
    # The patterns are compiled once here instead of once per review.
    substitutions = (
        (re.compile(r'\.*!+'), '!'),
        (re.compile(r'\.*\?+'), '?'),
        (re.compile(r'\.+'), '.'),
        (re.compile(r',+'), ','),
        (re.compile(r'''[^A-Za-z0-9가-힣\s\[\].,!?"']'''), ''),
    )
    whitespace_run = re.compile(r'\s+')

    clean = []
    for text in df.reviews.to_list():
        for pattern, replacement in substitutions:
            text = pattern.sub(replacement, text)
        text = emoticon_normalize(only_text(text), num_repeats=1)
        clean.append(whitespace_run.sub(' ', text))
    return clean

# Clean both splits first, then attach the cleaned text as a new column on
# each frame. The printed column lists confirm the new columns were added.
clean_train = normalise(train)
clean_test = normalise(test_df)

train['clean_train'] = clean_train
test_df['clean_test'] = clean_test

print(train.columns)
print(test_df.columns)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training rows for validation, stratified on the label so
# both splits keep the same class balance. A fixed random_state makes the split
# (and therefore the monitored val_loss) reproducible across runs.
train_df, val_df = train_test_split(
    train,
    test_size=0.1,
    stratify=train.target,
    shuffle=True,
    random_state=42,
)

# Reset to a 0..n-1 index so the splits can be addressed by row number.
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

In [None]:
# Tokenizer for the skt/kogpt2-base-v2 checkpoint, with its special tokens
# declared explicitly (BOS and EOS share the same '</s>' token).
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "skt/kogpt2-base-v2",
    bos_token='</s>',
    eos_token='</s>',
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>',
)

## Custom Dataset

In [None]:
class CompDataset(Dataset):
    """Training/validation dataset: one tokenized review plus its label vector.

    Each item is a dict with:

    * ``input_ids`` / ``attention_mask`` – 1-D tensors of length ``max_length``
      (the text is truncated or padded to exactly that length);
    * ``labels`` – float vector of length 4, one-hot over the target values
      ``(1, 2, 4, 5)`` in that order.

    Args:
        df: DataFrame with a ``target`` column and a ``clean_train`` and/or
            ``reviews`` text column.
        tokenizer: Callable tokenizer (e.g. ``PreTrainedTokenizerFast``).
        clean: Read text from ``clean_train`` when True, else from ``reviews``.
        max_length: Fixed token length every item is padded/truncated to.
    """

    # Target values that form the four output classes, in label order.
    TARGET_VALUES = (1, 2, 4, 5)

    def __init__(self, df, tokenizer, clean: bool = True, max_length: int = 512):
        self.df_data = df
        self.tokenizer = tokenizer
        self.clean = clean
        self.max_length = max_length

        # Lookup table built once: row t is the label vector for target t.
        # Targets inside the range but not in TARGET_VALUES map to all zeros.
        values = list(self.TARGET_VALUES)
        identity = F.one_hot(torch.arange(max(values) + 1))
        self._label_table = identity[:, values].to(torch.float)

    def __getitem__(self, index):
        column = 'clean_train' if self.clean else 'reviews'
        # Positional lookup: samplers hand out 0..len-1, so this also works
        # when the DataFrame index was not reset after a split or filter.
        text = self.df_data[column].iloc[index]
        target = int(self.df_data['target'].iloc[index])

        # `padding='max_length'` replaces the deprecated `pad_to_max_length`.
        encoded = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # The tokenizer returns a batch of one; drop the batch dimension.
            'input_ids': encoded['input_ids'][0],
            'attention_mask': encoded['attention_mask'][0],
            # clone() so callers never mutate the shared lookup table.
            'labels': self._label_table[target].clone(),
        }

    def __len__(self):
        return len(self.df_data)


class TestDataset(Dataset):
    """Inference dataset: returns the tokenizer output for one review.

    Items are unpadded batches of size one (``return_tensors="pt"``), so they
    are meant to be fed to the model one at a time.

    Args:
        df: DataFrame with a ``clean_test`` and/or ``reviews`` text column.
        tokenizer: Callable tokenizer (e.g. ``PreTrainedTokenizerFast``).
        clean: Read text from ``clean_test`` when True, else from ``reviews``.
        max_length: Inputs longer than this many tokens are truncated. The
            default matches the 512-token limit used for the training inputs.
    """

    def __init__(self, df, tokenizer, clean: bool = True, max_length: int = 512):
        self.df_data = df
        self.tokenizer = tokenizer
        self.clean = clean
        self.max_length = max_length

    def __getitem__(self, index):
        column = 'clean_test' if self.clean else 'reviews'
        # Positional lookup so a non-default DataFrame index cannot break it.
        text = self.df_data[column].iloc[index]

        # Use the tokenizer given to this dataset rather than a notebook-level
        # global, and truncate so an over-long review cannot exceed the
        # model's context window.
        return self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )  # out: input_ids, attention_mask

    def __len__(self):
        return len(self.df_data)

In [None]:
# Wrap each split in its Dataset. `clean` is left at its default (True), so
# the normalised text columns are the ones that get tokenized.
train_dataset = CompDataset(df=train_df, tokenizer=tokenizer)
val_dataset = CompDataset(df=val_df, tokenizer=tokenizer)
test_dataset = TestDataset(df=test_df, tokenizer=tokenizer)

## Modeling

In [None]:
class GPT2Model(pl.LightningModule):
    """LightningModule that fine-tunes KoGPT2 with a 4-output classification head.

    The wrapped ``GPT2ForSequenceClassification`` is configured with
    ``problem_type='multi_label_classification'``, so ``labels`` must be float
    vectors of length 4 (per the transformers docs this problem type trains
    with ``BCEWithLogitsLoss``).

    Args:
        train_data: Dataset served by ``train_dataloader``.
        val_data: Dataset served by ``val_dataloader``.
        batch_size: Batch size for both dataloaders.
        lr: Learning rate handed to the LAMB optimizer.
    """

    def __init__(self,
                 train_data,
                 val_data,
                 batch_size,
                 lr=1e-4):
        # The default is 1e-4 because that is the rate the optimizer actually
        # used before: `lr` was stored but ignored, and 0.0001 was hard-coded
        # in configure_optimizers. Existing callers therefore train unchanged.
        super().__init__()
        self.model = GPT2ForSequenceClassification.from_pretrained(
            'skt/kogpt2-base-v2',
            num_labels=4,
            return_dict=True,
            problem_type='multi_label_classification')
        self.train_data = train_data
        self.val_data = val_data
        self.batch_size = batch_size
        self.my_learning_rate = lr

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model; ``labels`` may be omitted for inference."""
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )

    def _loss_for_batch(self, batch, log_name):
        """Forward one batch, log its loss under ``log_name`` and return it."""
        outputs = self(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=batch['labels'],
        )
        self.log(log_name, outputs.loss, prog_bar=True, logger=True)
        return outputs.loss

    def training_step(self, batch, batch_idx):
        return {'loss': self._loss_for_batch(batch, 'train_loss')}

    def validation_step(self, batch, batch_idx):
        return {'val_loss': self._loss_for_batch(batch, 'val_loss')}

    def train_dataloader(self):
        return DataLoader(self.train_data,
                          batch_size=self.batch_size,
                          shuffle=True)

    def val_dataloader(self):
        # Validation order does not need to be random; a fixed order keeps
        # per-batch validation numbers comparable between epochs.
        return DataLoader(self.val_data,
                          batch_size=self.batch_size,
                          shuffle=False)

    def configure_optimizers(self):
        # Use the learning rate given to the constructor instead of a
        # hard-coded constant.
        return LAMB(self.model.parameters(),
                    lr=self.my_learning_rate,
                    amsgrad=True)

In [None]:
# Build the LightningModule around the two datasets; batches of 8 reviews.
model = GPT2Model(
    train_data=train_dataset,
    val_data=val_dataset,
    batch_size=8,
)

## Set Checkpoint

In [None]:
# TensorBoard logs go under ./lightning_logs/GPT2/version_1.
log_dir = './lightning_logs'
# exist_ok=True replaces the separate exists() check: no error on re-runs and
# no gap between checking for the directory and creating it.
os.makedirs(log_dir, exist_ok=True)
logger = TensorBoardLogger(save_dir=log_dir, version=1, name='GPT2')

# Keep the three checkpoints with the lowest validation loss. 'val_loss' is
# the name logged by GPT2Model.validation_step.
checkpoint_callback = ModelCheckpoint(
    dirpath='checkpoints',
    filename='best-checkpoint',
    verbose=True,
    monitor='val_loss',
    mode='min',
    save_top_k=3,
)

## Load Trainer

In [None]:
N_EPOCHS = 3

# Callback instances belong in `callbacks`. `checkpoint_callback` is a boolean
# on/off flag in PyTorch Lightning, and passing a ModelCheckpoint object to it
# is rejected by later releases.
# NOTE(review): `gpus=-1` (use every visible GPU) is kept as-is; newer
# Lightning releases spell this `accelerator='gpu', devices=-1` — confirm
# against the installed version before changing it.
trainer = pl.Trainer(
    callbacks=[checkpoint_callback],
    max_epochs=N_EPOCHS,
    gpus=-1,
    logger=logger)

## Tensorboard

In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir ./lightning_logs

## Train

In [None]:
# Run training. No dataloaders are passed here: they come from the
# train_dataloader()/val_dataloader() hooks defined on the model.
trainer.fit(model)

## Inference on the Test Set

In [None]:
# Freeze the fine-tuned model before inference. Per the Lightning docs,
# LightningModule.freeze() disables gradients on all parameters and puts the
# module in eval mode.
model.freeze()

In [None]:
# Predict one class index (0-3) per test review, in test_df order.
# Falls back to CPU when no GPU is visible instead of failing on 'cuda:0'.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

model.to(device)
# Explicit eval() so dropout is off even if the freeze() cell was skipped.
model.eval()
result = []

# no_grad() is scoped to this loop. The previous torch.set_grad_enabled(False)
# call left autograd switched off for every cell run afterwards.
with torch.no_grad():
    for idx in tqdm(range(len(test_dataset))):
        # Each dataset item is an unpadded batch of one review.
        batch = test_dataset[idx].to(device)
        output = model.model(**batch)

        answer = torch.argmax(output.logits, dim=1)
        result.append(answer.item())

## Write the Submission File

In [None]:
# Map the model's class indices (0-3) back to the original target values.
# The column is assigned in one step: `submission.target.replace(...,
# inplace=True)` mutates a column accessed through the frame, which does not
# reliably update `submission` under pandas Copy-on-Write.
index_to_target = {0: 1, 1: 2, 2: 4, 3: 5}
submission['target'] = [index_to_target[class_idx] for class_idx in result]
submission.to_csv('kogpt2.csv', index=False)