# Predict Probability

In [None]:
!nvidia-smi

Fri Jul 29 03:48:15 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/My Drive/shopping')

from IPython.display import clear_output
!pip install transformers datasets
clear_output()

In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import random
from typing import Callable, Dict, List
from tqdm.notebook import tqdm

import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers.tokenization_utils_base import BatchEncoding
from datasets import load_dataset
from datasets.arrow_dataset import Dataset
from scipy.special import softmax

In [None]:
SEED = 42


def _seed_everything(seed: int) -> None:
    """Seed the Python, NumPy and PyTorch RNGs and pin the cuDNN flags."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    # Disable cuDNN autotuning and request deterministic kernels so repeated
    # runs use the same algorithms.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


_seed_everything(SEED)

In [None]:
# Tokenisation / batching settings.
MAX_LEN = 128
BATCH_SIZE = 64
VALID_SPLIT = 0.1

# Training hyper-parameters. None of these is referenced by the code visible
# in this notebook — presumably carried over from the training notebook.
EPOCHS = 10
LEARNING_RATE = 1e-5
WARMUP_STEPS = 500
WEIGHT_DECAY = 0.01
DR_RATE = 0.3
METRIC = 'accuracy'

# Class values the classifier predicts; note that 3 is not among them
# (presumably review ratings — TODO confirm against the data).
LABELS = [1, 2, 4, 5]
# Bidirectional mapping between the class index (0..3) and the class value.
id2label = dict(enumerate(LABELS))
label2id = {label: idx for idx, label in id2label.items()}

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def tokenize(model_path: str) -> Callable[[Dataset],BatchEncoding]:
    """Build a batch tokenising function for the checkpoint at ``model_path``.

    Args:
        model_path: Hub id or local path handed to ``AutoTokenizer``.

    Returns:
        A callable that tokenises ``examples['reviews']``, padding and
        truncating every sequence to ``MAX_LEN`` tokens.
    """
    # NOTE(review): ``problem_type`` is normally a model-config option; the
    # tokenizer presumably ignores it — confirm before relying on it.
    tokenizer = AutoTokenizer.from_pretrained(model_path, problem_type='multi_label_classification')
    clear_output()  # clear whatever the load printed in the notebook cell

    def encode(examples: Dataset) -> BatchEncoding:
        return tokenizer(
            examples['reviews'],
            max_length=MAX_LEN,
            padding='max_length',
            truncation=True,
        )

    return encode

def one_hot(examples: Dataset) -> Dict[str,np.ndarray]:
    """Turn one example's ``target`` value into a one-hot ``labels`` vector.

    The vector has ``len(LABELS)`` entries, with a 1.0 at the index that
    ``label2id`` assigns to the target. A target missing from ``label2id``
    raises ``KeyError``.
    """
    identity = np.eye(len(LABELS))
    class_index = label2id[examples['target']]
    return {'labels': identity[class_index]}

def preprocess(data: Dataset, model_path: str, labeled=True) -> Dataset:
    """Tokenise a dataset and, when labeled, attach one-hot labels.

    Args:
        data: Dataset with ``id`` and ``reviews`` columns (plus ``target``
            when ``labeled`` is true).
        model_path: Checkpoint whose tokenizer is used.
        labeled: When true, ``target`` is replaced by a one-hot ``labels``
            column; when false the label step is skipped.

    Returns:
        The encoded dataset with its output format set to ``'torch'``.
    """
    encoded = data.map(
        tokenize(model_path),
        batched=True,
        remove_columns=['id', 'reviews'],
        load_from_cache_file=False,
    )
    if labeled:
        encoded = encoded.map(
            one_hot,
            remove_columns=['target'],
            load_from_cache_file=False,
        )
    encoded.set_format('torch')
    return encoded

In [None]:
def model(model_path: str) -> AutoModelForSequenceClassification:
    """Load a sequence-classification model configured for ``LABELS``.

    Args:
        model_path: Hub id or local checkpoint directory to load from.

    Returns:
        The loaded model, with ``len(LABELS)`` outputs and the notebook's
        ``id2label`` / ``label2id`` mappings stored in its config.
    """
    load_kwargs = dict(
        problem_type='multi_label_classification',
        # Allow loading when the checkpoint's weight shapes differ from the
        # requested configuration.
        ignore_mismatched_sizes=True,
        num_labels=len(LABELS),
        id2label=id2label,
        label2id=label2id,
    )
    classifier = AutoModelForSequenceClassification.from_pretrained(model_path, **load_kwargs)
    clear_output()  # clear whatever the load printed in the notebook cell
    return classifier

## Load Test Data

In [None]:
# Checkpoint whose tokenizer encodes the test sets; the commented line is the
# alternative for the KoELECTRA run.
# MODEL_PATH = 'jaehyeong/koelectra-base-v3-generalized-sentiment-analysis' # KoELECTRA
MODEL_PATH = 'klue/roberta-large' # RoBERTa
BATCH_SIZE = 32  # overrides the value set in the configuration cell


def _load_test_split(csv_path):
    """Return ``(dataset, loader)`` for an unlabeled test CSV, unshuffled."""
    dataset = load_dataset('csv', data_files=csv_path, split='train')
    dataset = preprocess(dataset, MODEL_PATH, labeled=False)
    return dataset, DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)


test_vanilla_dataset, test_vanilla_loader = _load_test_split('./data/test.csv')
test_cleaned_dataset, test_cleaned_loader = _load_test_split('./data/test_cleaned.csv')

clear_output()
# Number of batches per pass over the test set (shown as the cell output).
len(test_vanilla_loader)

782

## Select Model

In [None]:
# Select the KoELECTRA run: checkpoints are read from `output_dir`, and the
# per-epoch probabilities are written to one folder per test-set variant.
MODEL_NAME = 'koelectra_epoch10'
output_dir = os.path.join('./saved/models', MODEL_NAME)
submission_vanilla_dir = os.path.join('./data/samples', MODEL_NAME+'_vanilla')
submission_cleaned_dir = os.path.join('./data/samples', MODEL_NAME+'_cleaned')
# makedirs(exist_ok=True) also creates a missing './data/samples' parent
# (plain os.mkdir would raise) and does nothing when the folder exists.
for submission_path in (submission_vanilla_dir, submission_cleaned_dir):
    os.makedirs(submission_path, exist_ok=True)

In [None]:
# Select the RoBERTa run: checkpoints are read from `output_dir`, and the
# per-epoch probabilities are written to one folder per test-set variant.
MODEL_NAME = 'roberta_epoch10'
output_dir = os.path.join('./saved/models', MODEL_NAME)
submission_vanilla_dir = os.path.join('./data/samples', MODEL_NAME+'_vanilla')
submission_cleaned_dir = os.path.join('./data/samples', MODEL_NAME+'_cleaned')
# makedirs(exist_ok=True) also creates a missing './data/samples' parent
# (plain os.mkdir would raise) and does nothing when the folder exists.
for submission_path in (submission_vanilla_dir, submission_cleaned_dir):
    os.makedirs(submission_path, exist_ok=True)

In [None]:
# Single hand-picked checkpoint and the folder its predictions are written to.
# NOTE(review): predict_proba joins its `checkpoint` argument onto
# `output_dir`; confirm that this full relative path resolves as intended.
checkpoint = './saved/models/roberta_large/checkpoint-1408'
submission_dir = os.path.join('./data/samples', MODEL_NAME)
# makedirs(exist_ok=True) also creates a missing './data/samples' parent
# (plain os.mkdir would raise) and does nothing when the folder exists.
os.makedirs(submission_dir, exist_ok=True)

## Predict Proba

In [None]:
def predict_proba(checkpoint: str, dataloader: DataLoader) -> List[List[float]]:
    """Run one saved checkpoint over a data loader and return class probabilities.

    Args:
        checkpoint: Checkpoint folder name, joined onto the module-level
            ``output_dir`` to locate the saved model.
        dataloader: Yields dict batches of tensors that are passed to the
            model as keyword arguments.

    Returns:
        One list of floats per input row: the softmax over that row's logits,
        in the order rows are yielded by ``dataloader``.
    """
    pretrained_path = os.path.join(output_dir, checkpoint)
    pretrained_model = model(pretrained_path).to(device)
    # Make inference mode explicit so dropout cannot affect the predictions.
    pretrained_model.eval()
    proba = []

    with torch.no_grad():
        for samples in tqdm(dataloader):
            # Build a new dict rather than mutating the batch the loader yielded.
            batch = {key: value.to(device) for key, value in samples.items()}
            logits = pretrained_model(**batch).logits
            # Normalise over the class axis explicitly; calling softmax
            # without `dim` relies on deprecated implicit-dimension behaviour.
            proba.extend(F.softmax(logits, dim=-1).tolist())

    return proba

In [None]:
def _checkpoint_order(name: str):
    """Sort key: numeric suffix after the last '-' first, then the name itself."""
    suffix = name.rsplit('-', 1)[-1]
    return (int(suffix) if suffix.isdecimal() else -1, name)


# Order checkpoints by their numeric step suffix. A plain lexicographic sort
# would put e.g. 'checkpoint-10000' before 'checkpoint-2000' and so attach the
# wrong checkpoint to an `epoch{i}.csv` file.
# NOTE(review): assumes one saved checkpoint per epoch — confirm.
checkpoints = sorted(os.listdir(output_dir), key=_checkpoint_order)

# Same procedure for both test-set variants; only loader and target folder differ.
for loader, target_dir in (
    (test_vanilla_loader, submission_vanilla_dir),
    (test_cleaned_loader, submission_cleaned_dir),
):
    for i, checkpoint in enumerate(checkpoints, start=1):
        proba = predict_proba(checkpoint, loader)
        proba = pd.DataFrame(proba, columns=LABELS)
        proba.to_csv(f'{target_dir}/epoch{i}.csv', index=False)

In [None]:
# Predict with the single selected `checkpoint` on both test-set variants.
# predict_proba iterates batches, so it must be given the DataLoaders; passing
# the Dataset objects would feed the model one unbatched row at a time.
# NOTE(review): predict_proba joins `checkpoint` onto `output_dir`; confirm the
# value of `checkpoint` in scope here resolves to the intended folder.
proba = predict_proba(checkpoint, test_vanilla_loader)
proba = pd.DataFrame(proba, columns=LABELS)
proba.to_csv(f'{submission_dir}/vanilla.csv', index=False)

proba = predict_proba(checkpoint, test_cleaned_loader)
proba = pd.DataFrame(proba, columns=LABELS)
proba.to_csv(f'{submission_dir}/cleaned.csv', index=False)