# Classification using RoBERTa

In [None]:
!nvidia-smi

Sat Jul 23 10:58:35 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Mount Google Drive so the project folder is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Work from the project directory: the relative paths used below (./data, ./saved) resolve against it.
import os
os.chdir('/content/drive/My Drive/shopping')

# Install the Hugging Face libraries, then wipe the cell output to hide the pip log.
# NOTE(review): versions are not pinned, and clearing the output also hides any install error.
from IPython.display import clear_output
!pip install transformers datasets
clear_output()

In [2]:
import os
import random
import re
import warnings
from typing import Callable, Dict, List

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from datasets import load_dataset
from datasets.arrow_dataset import Dataset
from scipy.special import softmax
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import EarlyStoppingCallback, TrainingArguments, Trainer
from transformers.tokenization_utils_base import BatchEncoding
from transformers.trainer_utils import EvalPrediction

In [3]:
SEED = 42

# Exported for hash seeding. The variable is read at interpreter start-up, so setting it
# here can only influence Python processes launched after this point.
os.environ['PYTHONHASHSEED'] = str(SEED)

# Seed the Python, NumPy and PyTorch random number generators with the same value.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)

# Ask cuDNN for deterministic kernels and switch off its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [4]:
# --- Hyperparameters ---
MAX_LEN = 128          # token length every review is padded/truncated to
VALID_SPLIT = 0.1      # fraction of the training data held out for validation
BATCH_SIZE = 32        # per-device batch size for training, evaluation and inference
EPOCHS = 10            # maximum number of training epochs
LEARNING_RATE = 1e-5
DR_RATE = 0.3          # NOTE(review): not referenced by any cell visible in this notebook
WARMUP_STEPS = 500
WEIGHT_DECAY = 0.01
METRIC = 'accuracy'    # metric used to select the best checkpoint


def MODEL_NAME(x):
    """Return the run name ``roberta_<x>`` used for checkpoint, log and submission paths.

    Defined with ``def`` rather than as a lambda bound to a name so that it carries
    a real ``__name__`` in tracebacks; the call interface is unchanged.
    """
    return f'roberta_{x}'


MODEL_PATH = 'klue/roberta-large'   # pretrained checkpoint identifier passed to from_pretrained
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Load Data

In [5]:
# Read the raw train/test tables as pandas frames (used for inspection and, later, for
# the raw test review texts).
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

# Class values expected in the `target` column, plus lookups between each class value
# and its position in LABELS (the index the model works with).
LABELS = [1,2,4,5]
id2label = dict(enumerate(LABELS))
label2id = {label: idx for idx, label in id2label.items()}
print(train.shape)
train.head()

(39988, 3)


Unnamed: 0,id,reviews,target
0,0,조아요 처음구입 싸게햇어요,2
1,1,생각보다 잘 안돼요 매지 바른지 하루밖에 안됐는데ㅠㅠ 25천원가량 주고 사기 너무 ...,1
2,2,디자인은괜찮은데 상품이 금이가서 교환했는데 두번째받은상품도 까져있고 안쪽에 금이가져...,2
3,3,기전에 이 제품말고 이마트 트레이더스에서만 팔던 프리미엄 제품을 사용했었습니다. 샘...,2
4,4,튼튼하고 손목을 잘 받쳐주네요~,5


In [6]:
# Load the training CSV as a Hugging Face dataset and hold out VALID_SPLIT of it for validation.
# The seed is passed explicitly so the partition does not depend on the state of the global
# random generators at the moment this cell runs: re-executing the cell gives the same split.
train_dataset = load_dataset('csv', data_files='./data/train.csv', split='train')
train_dataset = train_dataset.train_test_split(test_size=VALID_SPLIT, seed=SEED)
clear_output()  # hide the dataset loading output
print(train_dataset['train'].shape, train_dataset['test'].shape)

(22500, 3) (2500, 3) (25000, 2)


## Data Preprocess

In [13]:
def tokenize(model_path: str) -> Callable[[Dataset],BatchEncoding]:
    """Build the tokenizing function applied to batches of examples.

    Loads the tokenizer for ``model_path``, clears the notebook output, and returns
    a callable that tokenizes ``examples['reviews']`` with every sequence padded or
    truncated to ``MAX_LEN`` tokens.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path, problem_type='multi_label_classification')
    clear_output()

    def encode(examples):
        return tokenizer(examples['reviews'], max_length=MAX_LEN, padding='max_length', truncation=True)

    return encode

def one_hot(examples: Dataset) -> Dict[str,np.ndarray]:
    """Convert one example's ``target`` into a one-hot ``labels`` vector.

    The vector has ``len(LABELS)`` entries with a 1.0 at the position given by
    ``label2id``. A target missing from ``label2id`` raises ``KeyError``.
    """
    class_index = label2id[examples['target']]
    identity = np.eye(len(LABELS))
    return {'labels': identity[class_index]}

def preprocess(data: Dataset, model_path: str, labeled=True) -> Dataset:
    """Tokenize ``data`` and, when it is labeled, attach one-hot ``labels``.

    Args:
        data: dataset exposing ``id`` and ``reviews`` columns (and ``target`` when labeled).
        model_path: model identifier or path whose tokenizer is used.
        labeled: when true, ``target`` is replaced by a one-hot ``labels`` column.

    Returns:
        The encoded dataset with its output format set to ``'torch'``.
    """
    # Both maps bypass the on-disk cache so the result is always recomputed.
    encoded = data.map(
        tokenize(model_path),
        batched=True,
        remove_columns=['id','reviews'],
        load_from_cache_file=False,
    )
    if labeled:
        encoded = encoded.map(one_hot, remove_columns=['target'], load_from_cache_file=False)
    encoded.set_format('torch')
    return encoded

## Load Model

In [14]:
def model(model_path: str) -> AutoModelForSequenceClassification:
    """Load a sequence-classification model from ``model_path``.

    The model is configured with one output per entry of ``LABELS``, the notebook's
    ``id2label``/``label2id`` mappings and the multi-label problem type. The
    notebook output is cleared after loading.
    """
    load_options = dict(
        problem_type='multi_label_classification',
        ignore_mismatched_sizes=True,
        num_labels=len(LABELS),
        id2label=id2label,
        label2id=label2id,
    )
    network = AutoModelForSequenceClassification.from_pretrained(model_path, **load_options)
    clear_output()

    return network

In [15]:
def compute_metrics(p: EvalPrediction) -> Dict[str,float]:
    """Compute accuracy for an evaluation pass.

    ``p`` unpacks into ``(predictions, labels)``. Both are reduced to class indices
    with an argmax over axis 1 before being compared.
    """
    predictions, labels = p
    predicted_classes = np.argmax(predictions, axis=1)
    true_classes = np.argmax(labels, axis=1)
    accuracy = accuracy_score(y_true=true_classes, y_pred=predicted_classes)
    return {'accuracy': accuracy}

## Load Trainer

In [16]:
def make_dirs(name: str) -> Dict[str,str]:
    """Create the output folders for run ``name`` and return their paths.

    Args:
        name: run name; used as the leaf folder under the model and logger roots.

    Returns:
        Dict with the keys ``root``, ``model_root``, ``model_dir``, ``logging_root``
        and ``logging_dir`` mapped to paths relative to the current working directory.

    Directories that already exist are left untouched. Missing parents are created
    as well, so the result does not depend on the order of the dict entries and a
    ``name`` containing a path separator also works.
    """
    model_root = './saved/models'
    logging_root = './saved/logger'
    required_dirs = {
        'root': './saved',
        'model_root': model_root,
        'model_dir': f'{model_root}/{name}',
        'logging_root': logging_root,
        'logging_dir': f'{logging_root}/{name}',
    }
    for path in required_dirs.values():
        os.makedirs(path, exist_ok=True)
    return required_dirs

def training_args(model_name: str) -> TrainingArguments:
    """Build the ``TrainingArguments`` for the run called ``model_name``.

    Creates the run's output folders via ``make_dirs`` and configures per-epoch
    evaluation and checkpointing. The best checkpoint according to ``METRIC`` is
    reloaded at the end of training.
    """
    required_dirs = make_dirs(model_name)
    return TrainingArguments(
        output_dir=required_dirs['model_dir'],
        overwrite_output_dir=True,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        warmup_steps=WARMUP_STEPS,
        weight_decay=WEIGHT_DECAY,
        logging_dir=required_dirs['logging_dir'],
        load_best_model_at_end=True,
        metric_for_best_model=METRIC,
        # Pass the notebook-level seed explicitly so that changing SEED also changes the
        # seed the Trainer uses, instead of silently falling back to its own default.
        seed=SEED,
    )

def trainer(dataset: Dataset, model_name: str, model_path: str, labeled=True) -> Trainer:
    """Assemble a ``Trainer`` for ``dataset``.

    Args:
        dataset: collection with ``'train'`` and ``'test'`` splits; the latter is used for evaluation.
        model_name: run name, which determines the output and logging folders.
        model_path: pretrained model identifier or path for both tokenizer and model.
        labeled: forwarded to ``preprocess``.

    Returns:
        A ``Trainer`` with accuracy reporting and early stopping (patience of 3).
    """
    encoded_splits = preprocess(dataset, model_path, labeled)
    network = model(model_path)
    arguments = training_args(model_name)
    early_stopping = EarlyStoppingCallback(3)

    return Trainer(
        model=network,
        args=arguments,
        train_dataset=encoded_splits['train'],
        eval_dataset=encoded_splits['test'],
        compute_metrics=compute_metrics,
        callbacks=[early_stopping],
    )

In [None]:
# Tag identifying this run; through MODEL_NAME it names the checkpoint and logging folders.
distinguisher = 'epoch10'
# Tokenizes the train/validation splits, loads the pretrained model and wires up the Trainer.
roberta_trainer = trainer(train_dataset, MODEL_NAME(distinguisher), MODEL_PATH)

## Train

In [None]:
# Fine-tune with the arguments built by training_args (per-epoch evaluation and checkpointing, early stopping).
roberta_trainer.train()

## Evaluate

In [None]:
# Evaluate on the held-out split that was passed to the Trainer as eval_dataset.
roberta_trainer.evaluate()

In [None]:
# Drop the notebook's reference to the trainer.
# NOTE(review): presumably done to release memory before the checkpoints are reloaded for inference — confirm.
del roberta_trainer

## Predict

In [None]:
from transformers import pipeline

# Find the checkpoint with the highest step number inside this run's output folder.
# Only names of the exact form checkpoint-<digits> are considered, so stray entries
# cannot break the integer parsing of the step.
output_dir = os.path.join('./saved/models', MODEL_NAME(distinguisher))
checkpoints = [ckp for ckp in os.listdir(output_dir) if re.fullmatch(r'checkpoint-[0-9]+', ckp)]
if not checkpoints:
    raise FileNotFoundError(f'No checkpoint-<step> directory found in {output_dir}')
recent_checkpoint = max(checkpoints, key=lambda x: int(x.split('-')[1]))

# The tokenizer comes from the base model, the fine-tuned weights from the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, problem_type='multi_label_classification')
roberta = AutoModelForSequenceClassification.from_pretrained(
    os.path.join(output_dir, recent_checkpoint),
    problem_type='multi_label_classification',
    ignore_mismatched_sizes=True,
    num_labels=len(LABELS),
    id2label=id2label,
    label2id=label2id,
)

# Use the first GPU when one is available and fall back to CPU (-1) otherwise,
# mirroring the CUDA check used for `device` in the configuration cell.
pipeline_device = 0 if torch.cuda.is_available() else -1
classifier = pipeline('sentiment-analysis', model=roberta, tokenizer=tokenizer, device=pipeline_device)
clear_output()

In [None]:
# Classify every test review with the single-checkpoint pipeline.
# Reviews are truncated to MAX_LEN tokens, the same limit applied when the training data
# was tokenized, so long reviews are handled the same way at inference time.
result = classifier(test['reviews'].tolist(), truncation=True, max_length=MAX_LEN)
# Keep only the predicted label of each review.
preds = [pred['label'] for pred in result]

## Ensemble

In [None]:
# Tokenize the unlabeled test set and wrap it in a loader that keeps the original row order.
test_dataset = load_dataset('csv', data_files='./data/test.csv', split='train')
test_dataset = preprocess(test_dataset, MODEL_PATH, labeled=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Checkpoints are read from output_dir; per-checkpoint probabilities are written to submission_dir.
output_dir = os.path.join('./saved/models', MODEL_NAME(distinguisher))
submission_dir = os.path.join('./data/samples', MODEL_NAME(distinguisher))
# makedirs also creates ./data/samples when it is missing and is a no-op if the folder exists.
os.makedirs(submission_dir, exist_ok=True)

In [None]:
def predict_proba(checkpoint: str, dataloader: DataLoader) -> List[List[float]]:
    """Return softmax class probabilities from one checkpoint for every sample in ``dataloader``.

    Args:
        checkpoint: name of a checkpoint folder inside ``output_dir``.
        dataloader: loader yielding dict batches of tensors accepted by the model's forward pass.

    Returns:
        One list of ``len(LABELS)`` probabilities per sample, in loader order.
    """
    pretrained_path = os.path.join(output_dir, checkpoint)
    pretrained_model = model(pretrained_path).to(device)
    pretrained_model.eval()  # inference only: make sure dropout is disabled
    proba = list()

    with torch.no_grad():
        for samples in tqdm(dataloader):
            # Build a new dict rather than mutating the batch object yielded by the loader.
            inputs = {key: value.to(device) for key, value in samples.items()}
            outputs = pretrained_model(**inputs)
            # Normalise over the class axis explicitly; relying on the implicit
            # softmax dimension is deprecated and emits a warning.
            proba.extend(F.softmax(outputs.logits, dim=-1).tolist())

    return proba

In [None]:
# Keep only checkpoint-<step> folders and order them by training step. A plain string sort
# would place e.g. checkpoint-10125 before checkpoint-1125, so the i in epoch{i}.csv would
# not follow the order in which the checkpoints were saved.
checkpoint_names = sorted(
    (entry for entry in os.listdir(output_dir) if re.fullmatch(r'checkpoint-[0-9]+', entry)),
    key=lambda entry: int(entry.split('-')[1]),
)

# Write one probability table per checkpoint, with one column per class in LABELS.
for i,checkpoint in enumerate(checkpoint_names, start=1):
    proba = predict_proba(checkpoint, test_loader)
    proba = pd.DataFrame(proba, columns=LABELS)
    proba.to_csv(f'{submission_dir}/epoch{i}.csv', index=False)

In [None]:
# Gather every per-checkpoint probability table stored as a .csv in submission_dir.
csv_paths = [
    os.path.join(submission_dir, sample)
    for sample in sorted(os.listdir(submission_dir))
    if os.path.splitext(sample)[-1] == '.csv'
]
proba_list = [pd.read_csv(csv_path) for csv_path in csv_paths]

# Add the tables element-wise, then renormalise each row with a softmax.
ensembled = sum(proba_list)
ensembled = ensembled.apply(softmax, axis=1)
ensembled.head()

Unnamed: 0,1,2,4,5
0,0.000519,0.998819,0.000336,0.000326
1,0.988167,0.009231,0.001285,0.001317
2,0.000206,0.000209,0.000287,0.999298
3,0.994319,0.003921,0.000867,0.000893
4,0.994662,0.003624,0.000843,0.000871


## Submission

In [None]:
def _row_to_label(row):
    """Return the class label whose position holds the row's largest ensembled score."""
    return id2label[np.argmax(row)]


# Fill the submission template with the ensembled prediction for each row.
# NOTE(review): assumes the template rows are in the same order as test.csv — verify.
submission = pd.read_csv('./data/sample_submission.csv')
preds = ensembled.apply(_row_to_label, axis=1)
submission['target'] = preds
submission.head()

Unnamed: 0,id,target
0,0,2
1,1,1
2,2,5
3,3,1
4,4,1


In [None]:
# Write the final submission next to the per-checkpoint folder, named after the run, without the index column.
submission.to_csv(f'./data/samples/{MODEL_NAME(distinguisher)}.csv', index=False)