# Setting

라이브러리 다운로드

In [None]:
#!pip install mxnet
#!pip install gluonnlp pandas tqdm
#!pip install sentencepiece
#!pip install transformers==3
#!pip install torch
#!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

라이브러리 불러오기

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import pandas as pd
import numpy as np
import re
import tarfile
import pickle as pickle
from tqdm import tqdm
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from sklearn.model_selection import train_test_split,StratifiedKFold

from transformers import *
import tqdm

GPU 설정

In [None]:
# Use the first CUDA GPU when one is usable, otherwise fall back to the CPU.
# `is_available` has to be *called*: the bare function object is always
# truthy, which would select "cuda:0" even on a machine without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

선택된 디바이스 확인

In [None]:
# Echo the selected torch.device so the cell output shows which one was chosen.
device

# Preprocessing

In [None]:
def load_data(dataset_dir, label_type_path='/opt/ml/input/data/label_type.pkl'):
    """Read a tab-separated, header-less data file and return the model-ready frame.

    Args:
        dataset_dir: Path to the TSV file to read.
        label_type_path: Path to the pickled mapping from relation name to
            integer class id. Defaults to the location the notebook has
            always used, so existing one-argument calls behave as before.

    Returns:
        The DataFrame produced by ``preprocessing_dataset`` with the columns
        ``sentence``, ``entity_01``, ``entity_02`` and ``label``.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only point
    # label_type_path at a file from a trusted source.
    with open(label_type_path, 'rb') as f:
        label_type = pickle.load(f)
    dataset = pd.read_csv(dataset_dir, delimiter='\t', header=None)
    dataset = preprocessing_dataset(dataset, label_type)
    return dataset

def preprocessing_dataset(dataset, label_type):
    """Reduce the raw frame to sentence, entity and numeric-label columns.

    Column 8 of ``dataset`` holds the relation name. The name ``'blind'`` is
    mapped to the placeholder id 100; every other name is looked up in
    ``label_type`` (a missing name raises ``KeyError``).

    Args:
        dataset: Raw frame with integer column labels; columns 1, 2, 5 and 8
            are read.
        label_type: Mapping from relation name to integer class id.

    Returns:
        A new DataFrame with columns ``sentence``, ``entity_01``,
        ``entity_02`` and ``label``.
    """
    relation_ids = [
        100 if relation == 'blind' else label_type[relation]
        for relation in dataset[8]
    ]
    selected_columns = {
        'sentence': dataset[1],
        'entity_01': dataset[2],
        'entity_02': dataset[5],
        'label': relation_ids,
    }
    return pd.DataFrame(selected_columns)

In [None]:
# Load the training TSV into a frame with sentence / entity / label columns.
dataset_path = r"/opt/ml/input/data/train/train.tsv"
dataset = load_data(dataset_path)

# Rewrite each sentence as "<entity_01> [SEP] <entity_02> [SEP] <sentence>"
# so both entities are part of the text the model sees.
# NOTE(review): ' [SEP] ' is inserted as literal text. The tokenizer used
# later is XLMRobertaTokenizer, whose separator token is presumably '</s>'
# rather than '[SEP]' — confirm the marker is tokenized as intended.
entity_prefix = dataset['entity_01'] + ' [SEP] ' + dataset['entity_02']
dataset['sentence'] = entity_prefix + ' [SEP] ' + dataset['sentence']

In [None]:
# Stratified 5-fold partition of the data, in file order (no shuffling).
skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
fold_splits = list(skf.split(dataset['sentence'], dataset['label']))

# Only one fold is used as the train/validation split: the last of the five.
# Picking it explicitly avoids a loop that overwrites train/vali four times.
train_index, test_index = fold_splits[-1]

# split() yields positional indices, so select rows by position (.iloc);
# label-based .loc only agrees with it while the frame has a default
# RangeIndex.
train = dataset.iloc[train_index]
vali = dataset.iloc[test_index]

In [None]:
# Disabled alternative: a single random 80/20 split.
#train, vali = train_test_split(dataset, test_size=0.2, random_state=42)

# Write the sentence/label columns of each split to a tab-separated file
# with a header row and no index column.
# NOTE(review): to_csv applies its default quoting, while these files are
# later read with nlp.data.TSVDataset; if that reader does not undo CSV
# quoting, sentences containing quote characters or tabs would come back
# altered — confirm.
for split_frame, split_path in (
    (train, "/opt/ml/input/data/train/train_train.txt"),
    (vali, "/opt/ml/input/data/train/train_vali.txt"),
):
    split_frame[['sentence','label']].to_csv(split_path, sep='\t', index=False)

In [None]:
# Load the train / validation split files as gluonnlp TSV datasets.
# field_indices=[0,1] keeps the first two tab-separated fields of each line
# and num_discard_samples=1 drops the first line (the header row that the
# export step writes).
dataset_train = nlp.data.TSVDataset("/opt/ml/input/data/train/train_train.txt", field_indices=[0,1], num_discard_samples=1)
dataset_vali = nlp.data.TSVDataset("/opt/ml/input/data/train/train_vali.txt", field_indices=[0,1], num_discard_samples=1)

In [None]:
#import os
#import sentencepiece as spm
#vocab_size = 32000
#sp_model_root='sentencepiece'
#sp_model_name = 'tokenizer_%d' % (vocab_size)
#sp_model_path = os.path.join(sp_model_root, sp_model_name)
#sp = spm.SentencePieceProcessor()
#sp.Load('{}.model'.format(sp_model_path))

In [None]:
# Wildcard import supplies the XLMRoberta* (and Electra*) names used below.
from transformers import *
# Load the pretrained XLM-RoBERTa-large sequence classifier.
model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-large")
# Replace the final projection of the classification head with a fresh
# 42-output linear layer that keeps the original input width.
input_size = model.classifier.out_proj.in_features
model.classifier.out_proj = nn.Linear(in_features=input_size, out_features=42, bias=True)
# Bare expression; it is not the last line of the cell, so presumably it
# produces no visible output.
model.classifier

# Tokenizer and configuration for the same checkpoint.
# NOTE(review): config is loaded separately from the model, so it presumably
# does not reflect the replaced 42-way head — confirm before relying on it.
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-large")
config = XLMRobertaConfig.from_pretrained("xlm-roberta-large")

# Disabled alternative backbone:
# KoELECTRA-Small
#model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-discriminator")
#model.classifier.out_proj = nn.Linear(in_features=256, out_features=42, bias=True)#

#tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-small-discriminator")
#config = ElectraConfig.from_pretrained("monologg/koelectra-small-discriminator")



In [None]:
# Echo the loaded checkpoint configuration for inspection.
config

In [None]:
class BERTDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per item on demand.

    Args:
        dataset: Indexable collection of records; each record holds the
            sentence at ``sent_idx`` and the label at ``label_idx``.
        sent_idx: Position of the sentence text inside a record.
        label_idx: Position of the label inside a record. Labels are
            converted with ``np.int32`` once, at construction time.
        tokenizer: Callable accepting ``(text, max_length=...,
            pad_to_max_length=..., truncation=...)`` and returning a mapping
            with ``'input_ids'`` and ``'attention_mask'``.
        max_len: Length every sequence is padded / truncated to.
        pad: Stored for interface compatibility; padding to ``max_len`` is
            always applied.
        pair: Stored for interface compatibility; not used.
    """

    def __init__(self, dataset, sent_idx, label_idx, tokenizer, max_len, pad, pair):
        self.sentences = dataset
        # Keep the constructor arguments on the instance so __getitem__ does
        # not depend on notebook-level globals named `tokenizer` / `max_len`.
        self.sent_idx = sent_idx
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.pad = pad
        self.pair = pair
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        """Return ``(input_ids, attention_mask, label)`` for record ``i``."""
        sentence = self.tokenizer(
            self.sentences[i][self.sent_idx],
            max_length=self.max_len,
            pad_to_max_length=True,
            truncation=True,
        )
        label = self.labels[i]
        return (np.array(sentence['input_ids']), np.array(sentence['attention_mask']), label)

    def __len__(self):
        """Number of records (one label per record)."""
        return len(self.labels)

In [None]:
# Training hyperparameters.
max_len = 128          # sequence length used when tokenizing (pad / truncate target)
batch_size = 32        # batch size for the train, validation and test DataLoaders
warmup_ratio = 0.01    # fraction of all optimizer steps spent in scheduler warm-up
num_epochs = 100       # upper bound on epochs; the training loop may stop earlier
max_grad_norm = 1      # gradient-norm clipping threshold
log_interval = 50      # not referenced anywhere else in the visible notebook
learning_rate =1e-5    # learning rate passed to AdamW

In [None]:
# Wrap the TSV datasets for PyTorch: sentence in field 0, label in field 1.
# The trailing True / False are the `pad` and `pair` constructor arguments.
data_train = BERTDataset(dataset_train, 0, 1, tokenizer, max_len, True, False)
data_vali = BERTDataset(dataset_vali, 0, 1, tokenizer, max_len, True, False)

In [None]:
# Shuffle the training batches every epoch: the training rows come from an
# unshuffled fold split and are otherwise always visited in file order.
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, shuffle=True, num_workers=5)
# Validation order does not affect the metric, so keep it deterministic.
vali_dataloader = torch.utils.data.DataLoader(data_vali, batch_size=batch_size, shuffle=False, num_workers=5)

# Classification

In [None]:
# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']

# Partition the model parameters in a single pass, keeping their order.
decay_params = []
no_decay_params = []
for param_name, param in model.named_parameters():
    if any(nd in param_name for nd in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

# First group: weight decay 0.01; second group: decay disabled.
optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': 0.01},
    {'params': no_decay_params, 'weight_decay': 0.0},
]

In [None]:
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a label-smoothed target distribution.

    The true class receives probability ``1 - smoothing``; the remaining
    ``smoothing`` mass is divided evenly over the other ``classes - 1``
    classes.

    Args:
        classes: Number of target classes.
        smoothing: Probability mass moved away from the true class.
        dim: Dimension over which log-softmax and the class sum are taken.
    """

    def __init__(self, classes=42, smoothing=0.0, dim=-1):
        super().__init__()
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        """Return the mean smoothed cross-entropy over the batch.

        Args:
            pred: Raw class scores.
            target: Integer class indices, one per row of ``pred``.
        """
        log_probs = pred.log_softmax(dim=self.dim)
        # The target distribution is a constant; build it outside autograd.
        with torch.no_grad():
            off_value = self.smoothing / (self.cls - 1)
            soft_targets = torch.zeros_like(log_probs).fill_(off_value)
            # Classes are scattered along dimension 1.
            soft_targets.scatter_(1, target.data.unsqueeze(1), self.confidence)
        per_sample = torch.sum(-soft_targets * log_probs, dim=self.dim)
        return torch.mean(per_sample)

In [None]:
# AdamW over the two parameter groups (with / without weight decay).
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
# Label-smoothed cross-entropy over 42 classes; with smoothing=0.5 the true
# class keeps probability 0.5 and the other 41 classes share the rest.
loss_fn = LabelSmoothingLoss(classes=42, smoothing=0.5)

In [None]:
# Total optimizer steps if every epoch runs: batches per epoch * epochs.
t_total = len(train_dataloader) * num_epochs
# Number of those steps used for learning-rate warm-up.
warmup_step = int(t_total * warmup_ratio)

In [None]:
# Cosine learning-rate schedule with warm-up, sized for the full t_total
# steps (an early stop ends training part-way through the schedule).
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [None]:
def calc_accuracy(X,Y):
    """Fraction of rows of ``X`` whose highest-scoring column equals ``Y``.

    Args:
        X: 2-D tensor of per-class scores, one row per sample.
        Y: 1-D tensor of integer class indices, one per row of ``X``.

    Returns:
        The accuracy as a NumPy scalar.
    """
    _, predicted = torch.max(X, 1)
    n_samples = predicted.size()[0]
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    return n_correct / n_samples

In [None]:
# Move the model's parameters and buffers to the selected device.
model = model.to(device)

In [None]:
# Imported here under an alias on purpose: the import cell finishes with
# `import tqdm`, which rebinds the name `tqdm` to the module, so calling
# `tqdm(...)` directly would raise "TypeError: 'module' object is not callable".
from tqdm import tqdm as progress_bar

cnt = 0          # consecutive epochs without a validation improvement
best_acc = 0.0   # best validation accuracy seen so far

for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    # ---- training pass ----
    model.train()
    for batch_id, (token_ids, attention_mask, label) in enumerate(progress_bar(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        attention_mask = attention_mask.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, attention_mask)[0]
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # the schedule advances once per batch
        train_acc += calc_accuracy(out, label)
    # Accuracy is averaged per batch (batch_id + 1 == number of batches).
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # ---- validation pass ----
    model.eval()
    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time.
    with torch.no_grad():
        for batch_id, (token_ids, attention_mask, label) in enumerate(vali_dataloader):
            token_ids = token_ids.long().to(device)
            attention_mask = attention_mask.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, attention_mask)[0]
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

    test_acc = test_acc / (batch_id+1)

    # Checkpoint on improvement; stop after 10 epochs in a row without one.
    if test_acc > best_acc:
        cnt = 0
        best_acc = test_acc
        torch.save(model.state_dict(), "/opt/ml/model/xlm-roberta-large.pt")
    else:
        cnt+=1
        if cnt == 10:
            # Report the 1-based epoch number, matching the messages above.
            print('EarlyStop: '+str(e+1)+' Epochs')
            break
print('Best Score: ', best_acc)

# Predict

In [None]:
# Load the test TSV. From here on `dataset_path` and `dataset` refer to the
# test data and no longer to the training data.
dataset_path = r"/opt/ml/input/data/test/test.tsv"
dataset = load_data(dataset_path)

# Same input layout as for training:
# "<entity_01> [SEP] <entity_02> [SEP] <sentence>".
entity_prefix = dataset['entity_01'] + ' [SEP] ' + dataset['entity_02']
dataset['sentence'] = entity_prefix + ' [SEP] ' + dataset['sentence']

# Write the sentence/label columns to a tab-separated file with a header
# row and no index column.
test_columns = dataset[['sentence','label']]
test_columns.to_csv("/opt/ml/input/data/test/test.txt", sep='\t', index=False)

In [None]:
# Load the exported test file: keep the first two tab-separated fields and
# drop the header line.
dataset_test = nlp.data.TSVDataset("/opt/ml/input/data/test/test.txt", field_indices=[0,1], num_discard_samples=1)

# Same wrapping as for the training data: sentence in field 0, label in field 1.
data_test = BERTDataset(dataset_test, 0, 1, tokenizer, max_len, True, False)

# No shuffling is requested, so batches follow the order of the test file.
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

In [None]:
# Restore the checkpoint saved at the best validation accuracy.
# map_location lets it load on whichever device is in use, including a
# CPU-only machine when the checkpoint was written from a GPU.
model.load_state_dict(torch.load("/opt/ml/model/xlm-roberta-large.pt", map_location=device))

model.eval()

Predict = []

# Inference only: no_grad avoids building autograd graphs.
with torch.no_grad():
    # The label in each batch is not needed for prediction, so it is not
    # moved to the device.
    for batch_id, (token_ids, attention_mask, label) in enumerate(test_dataloader):
        token_ids = token_ids.long().to(device)
        attention_mask = attention_mask.long().to(device)
        out = model(token_ids, attention_mask)[0]
        # Predicted class = index of the highest score in each row.
        _, predict = torch.max(out,1)
        Predict.extend(predict.tolist())

In [None]:
# One predicted class id per test row, in a single 'pred' column, written
# as CSV without the index column.
output = pd.DataFrame(Predict, columns=['pred'])
output.to_csv('/opt/ml/result/xlm_roberta_large_stratified.csv', index=False)