# Setting

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import os
os.chdir('/content/drive/MyDrive/Stage2/code') 

In [3]:
os.getcwd()

'/content/drive/MyDrive/Stage2/code'

라이브러리 다운로드

In [4]:
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers==3
!pip install torch
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

Collecting mxnet
[?25l  Downloading https://files.pythonhosted.org/packages/30/07/66174e78c12a3048db9039aaa09553e35035ef3a008ba3e0ed8d2aa3c47b/mxnet-1.8.0.post0-py2.py3-none-manylinux2014_x86_64.whl (46.9MB)
[K     |████████████████████████████████| 46.9MB 112kB/s 
[?25hCollecting graphviz<0.9.0,>=0.8.1
  Downloading https://files.pythonhosted.org/packages/53/39/4ab213673844e0c004bed8a0781a0721a3f6bb23eb8854ee75c236428892/graphviz-0.8.4-py2.py3-none-any.whl
Installing collected packages: graphviz, mxnet
  Found existing installation: graphviz 0.10.1
    Uninstalling graphviz-0.10.1:
      Successfully uninstalled graphviz-0.10.1
Successfully installed graphviz-0.8.4 mxnet-1.8.0.post0
Collecting gluonnlp
[?25l  Downloading https://files.pythonhosted.org/packages/9c/81/a238e47ccba0d7a61dcef4e0b4a7fd4473cb86bed3d84dd4fe28d45a0905/gluonnlp-0.10.0.tar.gz (344kB)
[K     |████████████████████████████████| 348kB 4.1MB/s 
Building wheels for collected packages: gluonnlp
  Building wheel fo

라이브러리 불러오기

In [5]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import pandas as pd
import numpy as np
import re
import tarfile
import pickle as pickle
from tqdm import tqdm
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from sklearn.model_selection import train_test_split,StratifiedKFold

from transformers import *
from tqdm import tqdm

GPU 설정

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available else "cpu")

kobert 불러오기

In [7]:
device

device(type='cuda', index=0)

# Preprocessing

In [8]:
data_path = "/content/drive/MyDrive/Stage2/input/data/"

In [10]:
def load_data(dataset_dir):
    with open('/content/drive/MyDrive/Stage2/input/data/label_type.pkl', 'rb') as f:
        label_type = pickle.load(f)
    dataset = pd.read_csv(dataset_dir, delimiter='\t', header=None)
    dataset = preprocessing_dataset(dataset, label_type)
    return dataset

def preprocessing_dataset(dataset, label_type):
    label = []
    for i in dataset[8]:
        if i == 'blind':
            label.append(100)
        else:
            label.append(label_type[i])
    out_dataset = pd.DataFrame({'sentence':dataset[1],'entity_01':dataset[2],'entity_02':dataset[5],'label':label,})
    return out_dataset

In [11]:
dataset_path = r"/content/drive/MyDrive/Stage2/input/data/train/new_train.tsv"

dataset = load_data(dataset_path)

dataset['sentence'] = dataset['entity_01'] + ' [SEP] ' + dataset['entity_02'] + ' [SEP] ' + dataset['sentence']

In [12]:
dataset

Unnamed: 0,sentence,entity_01,entity_02,label
0,랜드로버 [SEP] 자동차 [SEP] 영국에서 사용되는 스포츠 유틸리티 ∀∮ARTI...,랜드로버,자동차,17
1,민주당 [SEP] 27석 [SEP] 선거에서 ∏∑ORGANIZATION∑민주당∏은 ...,민주당,27석,0
2,유럽 축구 연맹 [SEP] UEFA [SEP] ∏∑ORGANIZATION∑유럽 축구...,유럽 축구 연맹,UEFA,6
3,강수일 [SEP] 공격수 [SEP] 용병 ∀∮CIVILIZATION∮공격수∀ 챠디의...,강수일,공격수,2
4,람캄행 [SEP] 퍼쿤 씨 인트라팃 [SEP] ∏∑LOCATION∑람캄행∏ 왕은 1...,람캄행,퍼쿤 씨 인트라팃,8
...,...,...,...,...
8995,사우디아라비아 [SEP] 2002년 [SEP] ∀∮DATE∮2002년∀ FIFA 월...,사우디아라비아,2002년,0
8996,토요타 [SEP] 일본 [SEP] ∀∮COUNTRY∮일본∀의 2대 메이커인 ∏∑OR...,토요타,일본,9
8997,방덕룡 [SEP] 선무원종공신(宣武原從功臣) [SEP] 방호의의 손자 ∏∑PERSO...,방덕룡,선무원종공신(宣武原從功臣),2
8998,LG전자 [SEP] 북미 [SEP] ∏∑ORGANIZATION∑LG전자∏는 올해 초...,LG전자,북미,0


In [13]:
skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
for train_index, test_index in skf.split(dataset['sentence'], dataset['label']):
    vali = dataset.loc[test_index]
    train = dataset.loc[train_index]



In [14]:
#train, vali = train_test_split(dataset, test_size=0.2, random_state=42)
train[['sentence','label']].to_csv(data_path+"train/train_train.txt", sep='\t', index=False)
vali[['sentence','label']].to_csv(data_path+"train/train_vali.txt", sep='\t', index=False)

In [15]:
dataset_train = nlp.data.TSVDataset(data_path+"train/train_train.txt", field_indices=[0,1], num_discard_samples=1)
dataset_vali = nlp.data.TSVDataset(data_path+"train/train_vali.txt", field_indices=[0,1], num_discard_samples=1)

In [16]:
#import os
#import sentencepiece as spm
#vocab_size = 32000
#sp_model_root='sentencepiece'
#sp_model_name = 'tokenizer_%d' % (vocab_size)
#sp_model_path = os.path.join(sp_model_root, sp_model_name)
#sp = spm.SentencePieceProcessor()
#sp.Load('{}.model'.format(sp_model_path))

In [17]:
from transformers import *
model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-large")
input_size = model.classifier.out_proj.in_features
model.classifier.out_proj = nn.Linear(in_features=input_size, out_features=42, bias=True)
model.classifier

tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-large")
config = XLMRobertaConfig.from_pretrained("xlm-roberta-large")

# KoELECTRA-Small
#model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-discriminator")
#model.classifier.out_proj = nn.Linear(in_features=256, out_features=42, bias=True)#

#tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-small-discriminator")
#config = ElectraConfig.from_pretrained("monologg/koelectra-small-discriminator")



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=513.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2244861551.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.we

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




In [18]:
tokenizer.add_special_tokens({"additional_special_tokens":['∮','∀','∏','∑']})

0

In [22]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, tokenizer, max_len, pad, pair):
        #transform = nlp.data.BERTSentenceTransform(bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)
        self.sentences = dataset
        #self.token =  [tokenizer.conver_tokens_to_ids(i[0]) for i in self.sentences]
        
        self.labels = [np.int32(i[label_idx]) for i in dataset]
        #self.sentences = [sp.encode_as_ids(i[sent_idx]) for i in dataset]
        #self.labels = [np.int32(i[label_idx]) for i in dataset]
        
    def __getitem__(self, i):
        sentence = tokenizer(self.sentences[i][0], max_length=max_len, pad_to_max_length=True, truncation=True)
        label = self.labels[i]
        #return (self.sentences[i] + (self.labels[i], ))
        return (np.array(sentence['input_ids']),np.array(sentence['attention_mask']),label)
    def __len__(self):
        return (len(self.labels))

In [23]:
max_len = 128
batch_size = 8
warmup_ratio = 0.01
num_epochs = 100
max_grad_norm = 1
log_interval = 50
learning_rate =1e-5

In [24]:
data_train = BERTDataset(dataset_train, 0, 1, tokenizer, max_len, True, False)
data_vali = BERTDataset(dataset_vali, 0, 1, tokenizer, max_len, True, False)

In [25]:
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=2)
vali_dataloader = torch.utils.data.DataLoader(data_vali, batch_size=batch_size, num_workers=2)

# Classification

In [26]:
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [27]:
class LabelSmoothingLoss(nn.Module):
    def __init__(self, classes=42, smoothing=0.0, dim=-1):
        super(LabelSmoothingLoss, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        pred = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            true_dist = torch.zeros_like(pred)
            true_dist.fill_(self.smoothing / (self.cls - 1))
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))

In [28]:
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = LabelSmoothingLoss(classes=42, smoothing=0.5)

In [29]:
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

In [30]:
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [31]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [32]:
model = model.to(device)

In [33]:
cnt = 0
best_acc = 0.0

for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, attention_mask, label) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        attention_mask = attention_mask.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, attention_mask)[0]
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()
        train_acc += calc_accuracy(out, label)
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    for batch_id, (token_ids, attention_mask, label) in enumerate(vali_dataloader):
        token_ids = token_ids.long().to(device)
        attention_mask = attention_mask.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, attention_mask)[0]
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))
    
    test_acc = test_acc / (batch_id+1)
    
    if test_acc > best_acc:
        cnt = 0
        best_acc = test_acc
        torch.save(model.state_dict(), "/content/drive/MyDrive/Stage2/model/xlm-roberta-large.pt")
    else:
        cnt+=1
        if cnt == 10:
            print('EarlyStop: '+str(e)+' Epochs')
            break
print('Best Score: ', best_acc)

100%|██████████| 900/900 [04:06<00:00,  3.65it/s]

epoch 1 train acc 0.4795833333333333





epoch 1 test acc 0.42055555555555557


100%|██████████| 900/900 [04:08<00:00,  3.62it/s]

epoch 2 train acc 0.6566666666666666





epoch 2 test acc 0.6177777777777778


100%|██████████| 900/900 [04:08<00:00,  3.62it/s]

epoch 3 train acc 0.7404166666666666





epoch 3 test acc 0.6794444444444444


100%|██████████| 900/900 [04:08<00:00,  3.62it/s]

epoch 4 train acc 0.7961111111111111



  0%|          | 0/900 [00:00<?, ?it/s]

epoch 4 test acc 0.6638888888888889


100%|██████████| 900/900 [04:06<00:00,  3.64it/s]

epoch 5 train acc 0.8431944444444445





epoch 5 test acc 0.6927777777777778


100%|██████████| 900/900 [04:07<00:00,  3.63it/s]

epoch 6 train acc 0.8843055555555556





epoch 6 test acc 0.7277777777777777


100%|██████████| 900/900 [04:07<00:00,  3.63it/s]

epoch 7 train acc 0.91375





epoch 7 test acc 0.7644444444444445


100%|██████████| 900/900 [04:06<00:00,  3.65it/s]

epoch 8 train acc 0.9279166666666666



  0%|          | 0/900 [00:00<?, ?it/s]

epoch 8 test acc 0.7294444444444445


100%|██████████| 900/900 [04:05<00:00,  3.66it/s]

epoch 9 train acc 0.9425



  0%|          | 0/900 [00:00<?, ?it/s]

epoch 9 test acc 0.7516666666666667


100%|██████████| 900/900 [04:04<00:00,  3.68it/s]

epoch 10 train acc 0.9473611111111111





epoch 10 test acc 0.7661111111111111


100%|██████████| 900/900 [04:05<00:00,  3.67it/s]

epoch 11 train acc 0.9586111111111111



  0%|          | 0/900 [00:00<?, ?it/s]

epoch 11 test acc 0.7577777777777778


100%|██████████| 900/900 [04:04<00:00,  3.69it/s]

epoch 12 train acc 0.9634722222222222



  0%|          | 0/900 [00:00<?, ?it/s]

epoch 12 test acc 0.7577777777777778


100%|██████████| 900/900 [04:04<00:00,  3.69it/s]

epoch 13 train acc 0.9706944444444444



  0%|          | 0/900 [00:00<?, ?it/s]

epoch 13 test acc 0.7633333333333333


100%|██████████| 900/900 [04:03<00:00,  3.69it/s]

epoch 14 train acc 0.9669444444444445



  0%|          | 0/900 [00:00<?, ?it/s]

epoch 14 test acc 0.7638888888888888


100%|██████████| 900/900 [04:03<00:00,  3.70it/s]

epoch 15 train acc 0.9716666666666667





epoch 15 test acc 0.7705555555555555


100%|██████████| 900/900 [04:04<00:00,  3.68it/s]

epoch 16 train acc 0.9780555555555556



  0%|          | 0/900 [00:00<?, ?it/s]

epoch 16 test acc 0.74


100%|██████████| 900/900 [04:03<00:00,  3.70it/s]

epoch 17 train acc 0.9798611111111111



  0%|          | 0/900 [00:00<?, ?it/s]

epoch 17 test acc 0.7466666666666667


100%|██████████| 900/900 [04:03<00:00,  3.70it/s]

epoch 18 train acc 0.9806944444444444





epoch 18 test acc 0.7772222222222223


100%|██████████| 900/900 [04:03<00:00,  3.69it/s]

epoch 19 train acc 0.9838888888888889



  0%|          | 0/900 [00:00<?, ?it/s]

epoch 19 test acc 0.7566666666666667


100%|██████████| 900/900 [04:02<00:00,  3.71it/s]

epoch 20 train acc 0.9856944444444444



  0%|          | 0/900 [00:00<?, ?it/s]

epoch 20 test acc 0.7544444444444445


100%|██████████| 900/900 [04:02<00:00,  3.71it/s]

epoch 21 train acc 0.9852777777777778



  0%|          | 0/900 [00:00<?, ?it/s]

epoch 21 test acc 0.7722222222222223


100%|██████████| 900/900 [04:02<00:00,  3.71it/s]

epoch 22 train acc 0.9875



  0%|          | 0/900 [00:00<?, ?it/s]

epoch 22 test acc 0.7622222222222222


100%|██████████| 900/900 [04:02<00:00,  3.71it/s]

epoch 23 train acc 0.9876388888888888



  0%|          | 0/900 [00:00<?, ?it/s]

epoch 23 test acc 0.7477777777777778


100%|██████████| 900/900 [04:02<00:00,  3.72it/s]

epoch 24 train acc 0.9897222222222222



  0%|          | 0/900 [00:00<?, ?it/s]

epoch 24 test acc 0.75


100%|██████████| 900/900 [04:02<00:00,  3.71it/s]

epoch 25 train acc 0.9891666666666666



  0%|          | 0/900 [00:00<?, ?it/s]

epoch 25 test acc 0.75


100%|██████████| 900/900 [04:02<00:00,  3.71it/s]

epoch 26 train acc 0.9905555555555555



  0%|          | 0/900 [00:00<?, ?it/s]

epoch 26 test acc 0.7666666666666667


100%|██████████| 900/900 [04:02<00:00,  3.71it/s]

epoch 27 train acc 0.9933333333333333



  0%|          | 0/900 [00:00<?, ?it/s]

epoch 27 test acc 0.7638888888888888


100%|██████████| 900/900 [04:02<00:00,  3.72it/s]

epoch 28 train acc 0.9926388888888888





epoch 28 test acc 0.7666666666666667
EarlyStop: 27 Epochs
Best Score:  0.7772222222222223


# Predict

In [None]:
dataset_path = r"/content/drive/MyDrive/Stage2/input/data/test/test.tsv"

dataset = load_data(dataset_path)

dataset['sentence'] = dataset['entity_01'] + ' [SEP] ' + dataset['entity_02'] + ' [SEP] ' + dataset['sentence']

dataset[['sentence','label']].to_csv(data_path+"test/test.txt", sep='\t', index=False)

In [None]:
dataset_test = nlp.data.TSVDataset(data_path+"test/test.txt", field_indices=[0,1], num_discard_samples=1)

data_test = BERTDataset(dataset_test, 0, 1, tokenizer, max_len, True, False)

test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

  cpuset_checked))


In [None]:
model.load_state_dict(torch.load("/content/drive/MyDrive/Stage2/model/xlm-roberta-large.pt"))

model.eval()

Predict = []

for batch_id, (token_ids, attention_mask, label) in enumerate(test_dataloader):
    token_ids = token_ids.long().to(device)
    attention_mask = attention_mask.long().to(device)
    label = label.long().to(device)
    out = model(token_ids, attention_mask)[0]
    _, predict = torch.max(out,1)
    Predict.extend(predict.tolist())

  cpuset_checked))


In [None]:
output = pd.DataFrame(Predict, columns=['pred'])
output.to_csv('/content/drive/MyDrive/Stage2/result/xlm_roberta_large_stratified.csv', index=False)