## Notebook for saving logits

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import re
import tarfile
import pickle as pickle
from tqdm import tqdm
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Using KoELECTRA Model
from transformers import ElectraModel, ElectraTokenizer, ElectraForSequenceClassification
from transformers import *

# Added by Me
import os
from tqdm import tqdm
from ohsuz.utils import *
from ohsuz.loss import *
from ohsuz.config import *
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau, CosineAnnealingLR

There are 1 GPU(s) available.
We will use the GPU: Tesla V100-PCIE-32GB
There are 1 GPU(s) available.
We will use the GPU: Tesla V100-PCIE-32GB


In [2]:
batch_size = 32
epochs = 10
log_interval = 50
lr = 1e-5

In [3]:
seed_everything(2021)

이 실험은 seed 2021로 고정되었습니다.


In [4]:
error_label_0 = ['wikitree-12599-4-108-111-4-7',
                 'wikipedia-25967-115-24-26-35-37',
                 'wikipedia-16427-6-14-17-20-22',
                 'wikipedia-16427-8-0-3-26-28',
                 'wikitree-19765-5-30-33-6-8',
                 'wikitree-58702-0-18-20-22-24',
                 'wikitree-71638-8-21-23-15-17',
                 'wikipedia-257-0-0-1-53-57',
                 'wikipedia-13649-28-66-70-14-24',
                 'wikipedia-6017-8-20-26-4-7']
error_label_1 = ['wikitree-55837-4-0-2-10-11']
error_label_2 = ['wikitree-62775-3-3-7-0-2']
error_label_3 = ['wikipedia-23188-0-74-86-41-42']

### 1. Dataset & DataLoader 준비

In [5]:
def data_truncation(data):
    padding_length = 50

    entity_min_index = min(data[3], data[6])
    entity_max_index = max(data[4], data[7])

    min_entity_start, min_entity_end = entity_min_index-padding_length, entity_min_index+padding_length
    max_entity_start, max_entity_end = entity_max_index-padding_length, entity_max_index+padding_length

    if min_entity_end < max_entity_start:
        min_entity_start = max(min_entity_start, 0)
        max_entity_end = min(max_entity_end, len(data[1]))
        return data[1][min_entity_start:min_entity_end] + data[1][max_entity_start:max_entity_end] # </s> 제거
    else:
        min_entity_start = max(min_entity_start, 0)
        max_entity_end = min(max_entity_end, len(data[1]))
        return data[1][min_entity_start:max_entity_end]

In [6]:
def add_entity_tokens(sentence, a1, a2, b1, b2):
    new_sentence = None
    special_tokens = special_tokens_dict['additional_special_tokens']
    
    if a1 > b1: # b1 먼저
        new_sentence = sentence[:b1] + special_tokens[2] + sentence[b1:b2+1] + special_tokens[3] + sentence[b2+1:a1] + special_tokens[0] + sentence[a1:a2+1] + special_tokens[1] + sentence[a2+1:]
        #new_sentence = sentence[:b1] + "$" + sentence[b1:b2+1] + "$" + sentence[b2+1:a1] + "#" + sentence[a1:a2+1] + "#" + sentence[a2+1:]
    else: # a1 먼저
        new_sentence = sentence[:a1] + special_tokens[0] + sentence[a1:a2+1] + special_tokens[1] + sentence[a2+1:b1] + special_tokens[2] + sentence[b1:b2+1] + special_tokens[3] + sentence[b2+1:]
    return new_sentence

In [7]:
def load_data(dataset_dir, add_entity=True, truncation=True):
    with open('/opt/ml/input/data/label_type.pkl', 'rb') as f:
        label_type = pickle.load(f)
    dataset = pd.read_csv(dataset_dir, delimiter='\t', header=None)
    if truncation:
        dataset[1] = dataset.apply(data_truncation, axis=1)
    dataset = preprocessing_dataset(dataset, label_type, add_entity)
    return dataset


def preprocessing_dataset(dataset, label_type, add_entity):
    label = []
    for ID, i in zip(dataset[0], dataset[8]):
        if i == 'blind':
            label.append(100)
        elif ID in error_label_0:
            label.append(label_type['관계_없음'])
        elif ID in error_label_1:
            label.append(label_type['단체:구성원'])
        elif ID in error_label_2:
            label.append(label_type['단체:본사_도시'])
        elif ID in error_label_3:
            label.append(label_type['단체:하위_단체'])
        else:
            label.append(label_type[i])
    
    if add_entity:
        ### 이 부분을 더 효율적으로 고치려면???
        sentences = [add_entity_tokens(dataset[1][i], dataset[3][i], dataset[4][i], dataset[6][i], dataset[7][i]) for i in tqdm(range(len(dataset)))]
    else:
        sentences = dataset[1]

    out_dataset = pd.DataFrame({'sentence':sentences,'entity_01':dataset[2],'entity_02':dataset[5],'label':label})

    return out_dataset

In [8]:
class KlueDataset(Dataset):
    def __init__(self, tsv_file, add_entity=True, threshold=0.1):
        self.dataset = load_data(tsv_file, add_entity)
        self.dataset['sentence'] = self.dataset['entity_01'] + ' RELATION ' + self.dataset['entity_02'] + ' </s></s> ' + self.dataset['sentence']
        self.sentences = list(self.dataset['sentence'])
        self.labels = list(self.dataset['label'])
        self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
        
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        sentence, label = self.sentences[idx], self.labels[idx]
        inputs = self.tokenizer(
            sentence,
            return_tensors='pt',
            truncation=True,
            max_length=200,
            pad_to_max_length=True,
            add_special_tokens=True
        )
            
        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]
        
        return input_ids, attention_mask, label

In [9]:
test_dataset = KlueDataset(os.path.join(test_dir, 'test.tsv'))

100%|██████████| 1000/1000 [00:00<00:00, 25313.71it/s]


In [10]:
test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=5)

### 2. Model 준비

In [11]:
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-large', num_labels=42).to(device)

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.den

In [103]:
trunc_0 = np.load('/opt/ml/logits/trunc_0.npy')
trunc_1 = np.load('/opt/ml/logits/trunc_1.npy')
trunc_2 = np.load('/opt/ml/logits/trunc_2.npy')
trunc_3 = np.load('/opt/ml/logits/trunc_3.npy')
trunc_4 = np.load('/opt/ml/logits/trunc_4.npy')
trunc_5 = np.load('/opt/ml/logits/trunc_5.npy')
trunc_6 = np.load('/opt/ml/logits/trunc_6.npy')
trunc_7 = np.load('/opt/ml/logits/trunc_7.npy')
trunc_8 = np.load('/opt/ml/logits/trunc_8.npy')
trunc_9 = np.load('/opt/ml/logits/trunc_9.npy')
#_0418 = np.load('/opt/ml/logits/0418.npy') * 0.8
#_0420_ver4 = np.load('/opt/ml/logits/0420_ver4.npy') * 1.1
#_0420_ver5 = np.load('/opt/ml/logits/0420_ver5.npy') * 1.1

mask_0 = np.load('/opt/ml/logits/mask_0.npy')
mask_1 = np.load('/opt/ml/logits/mask_1.npy')
mask_2 = np.load('/opt/ml/logits/mask_2.npy')
mask_3 = np.load('/opt/ml/logits/mask_3.npy')
mask_4 = np.load('/opt/ml/logits/mask_4.npy')
mask_5 = np.load('/opt/ml/logits/mask_5.npy')
mask_6 = np.load('/opt/ml/logits/mask_6.npy')
mask_7 = np.load('/opt/ml/logits/mask_7.npy')
mask_8 = np.load('/opt/ml/logits/mask_8.npy')
mask_9 = np.load('/opt/ml/logits/mask_9.npy')

final_0 = np.load('/opt/ml/logits/final_0.npy')
final_1 = np.load('/opt/ml/logits/final_1.npy')
final_2 = np.load('/opt/ml/logits/final_2.npy')
final_3 = np.load('/opt/ml/logits/final_3.npy')
final_4 = np.load('/opt/ml/logits/final_4.npy')

ko1_0 = np.load('/opt/ml/logits/koelectra1_0.npy')
ko1_1 = np.load('/opt/ml/logits/koelectra1_1.npy')
ko1_2 = np.load('/opt/ml/logits/koelectra1_2.npy')
ko1_3 = np.load('/opt/ml/logits/koelectra1_3.npy')
ko1_4 = np.load('/opt/ml/logits/koelectra1_4.npy')
ko1_5 = np.load('/opt/ml/logits/koelectra1_5.npy')
ko1_6 = np.load('/opt/ml/logits/koelectra1_6.npy')
ko1_7 = np.load('/opt/ml/logits/koelectra1_7.npy')
ko1_8 = np.load('/opt/ml/logits/koelectra1_8.npy')
ko1_9 = np.load('/opt/ml/logits/koelectra1_9.npy')

ko2_0 = np.load('/opt/ml/logits/koelectra2_0.npy')
ko2_1 = np.load('/opt/ml/logits/koelectra2_1.npy')
ko2_2 = np.load('/opt/ml/logits/koelectra2_2.npy')
ko2_3 = np.load('/opt/ml/logits/koelectra2_3.npy')
ko2_4 = np.load('/opt/ml/logits/koelectra2_4.npy')
ko2_5 = np.load('/opt/ml/logits/koelectra2_5.npy')
ko2_6 = np.load('/opt/ml/logits/koelectra2_6.npy')
ko2_7 = np.load('/opt/ml/logits/koelectra2_7.npy')
ko2_8 = np.load('/opt/ml/logits/koelectra2_8.npy')
ko2_9 = np.load('/opt/ml/logits/koelectra2_9.npy')

In [130]:
predictions = []

trunc = trunc_0 + trunc_1 + trunc_2 + trunc_3 + trunc_4 + trunc_5 + trunc_6 + trunc_7 + trunc_8 + trunc_9
mask = mask_0 + mask_1 + mask_2 + mask_3 + mask_4 + mask_5 + mask_6 + mask_7 + mask_8 + mask_9
ko1 = ko1_0 + ko1_1 + ko1_2 + ko1_3 + ko1_4 + ko1_5 + ko1_6 + ko1_7 + ko1_8 + ko1_9
ko2 = ko2_0 + ko2_1 + ko2_2 + ko2_3 + ko2_4 + ko2_5 + ko2_6 + ko2_7 + ko2_8 + ko2_9

total = trunc + mask + ko1 + ko2

pred = torch.argmax(torch.from_numpy(ko1+ko2), dim=-1)
predictions.extend(pred.tolist())
"""
for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    _, predict = torch.max(y_pred, 1)
    pred = torch.argmax(torch.from_numpy(total_logits), dim=-1)
    predictions.extend(predict.tolist())
    """

'\nfor input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):\n    y_batch = y_batch.to(device)\n    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]\n    _, predict = torch.max(y_pred, 1)\n    pred = torch.argmax(torch.from_numpy(total_logits), dim=-1)\n    predictions.extend(predict.tolist())\n    '

In [131]:
submission = pd.DataFrame(predictions, columns=['pred'])
submission.to_csv(os.path.join(submission_dir, '0422_ko12.csv'), index=False)

### **하드보팅**

In [34]:
from collections import Counter

In [168]:
output1 = pd.read_csv(os.path.join(submission_dir, "0421_submission_ver1_trunc_kfold.csv"))
output2 = pd.read_csv(os.path.join(submission_dir, "0422_ver1_soft_voting.csv")) # mask kfold
output3 = pd.read_csv(os.path.join(submission_dir, "0422_ko12.csv"))
output4 = pd.read_csv(os.path.join(submission_dir, "0422_ver5_soft_voting_5.csv"))

#output5 = pd.read_csv(os.path.join(submission_dir, "0418_submission_1.csv"))
#output6 = pd.read_csv(os.path.join(submission_dir, "0419_submission_1.csv"))
#output7 = pd.read_csv(os.path.join(conf.submission_dir, "0406_efficientnet_64.csv"))
#output8 = pd.read_csv(os.path.join(conf.submission_dir, "0408_last_soft_voting.csv"))
#output9 = pd.read_csv(os.path.join(conf.submission_dir, "0408_ensemble.csv"))

all_predictions_v1 = []
all_predictions_v2 = []

for i in range(len(output1)):
    outputs_v1 = [output1["pred"][i], output2["pred"][i], output3["pred"][i], output4["pred"][i]]
    outputs_v2 = [output1["pred"][i], output2["pred"][i], output3["pred"][i]]

    common_v1 = Counter(outputs_v1).most_common()
    common_v2 = Counter(outputs_v2).most_common()
    
    if len(common_v1) == 1:
        ans = Counter(outputs_v1).most_common(1)
        all_predictions_v1.append(ans[0][0])
    else:
        if common_v1[0][1] == common_v1[1][1]:
            ans = output4["pred"][i]
            all_predictions_v1.append(ans)
        else:
            ans = Counter(outputs_v1).most_common(1)
            all_predictions_v1.append(ans[0][0])
    
    if len(common_v2) == 1:
        ans = Counter(outputs_v2).most_common(1)
        all_predictions_v2.append(ans[0][0])
    else:
        if common_v2[0][1] == common_v2[1][1]:
            ans = output1["pred"][i]
            all_predictions_v2.append(ans)
        else:
            ans = Counter(outputs_v2).most_common(1)
            all_predictions_v2.append(ans[0][0])

In [169]:
for i in range(1000):
    if all_predictions_v1[i] != all_predictions_v2[i]:
        print(all_predictions_v1[i], ', ', all_predictions_v2[i], ', ', i)

7 ,  0 ,  17
0 ,  23 ,  34
4 ,  0 ,  64
0 ,  5 ,  90
0 ,  17 ,  93
11 ,  0 ,  94
0 ,  5 ,  113
0 ,  17 ,  161
39 ,  4 ,  182
22 ,  17 ,  184
10 ,  12 ,  210
0 ,  4 ,  236
7 ,  0 ,  248
0 ,  10 ,  275
4 ,  0 ,  280
2 ,  0 ,  356
12 ,  10 ,  366
0 ,  15 ,  384
0 ,  10 ,  389
8 ,  0 ,  416
15 ,  3 ,  432
0 ,  8 ,  703
0 ,  5 ,  717
2 ,  0 ,  733
0 ,  5 ,  753
0 ,  21 ,  787
0 ,  4 ,  806
0 ,  4 ,  807
0 ,  9 ,  810
0 ,  20 ,  815
5 ,  0 ,  854
5 ,  0 ,  881
0 ,  2 ,  915


In [84]:
for i in range(1000):
    if all_predictions_v1[i] != pred_fix[i]:
        print(all_predictions_v1[i], ', ', pred_fix[i], ', ', i)

28 ,  0 ,  7
7 ,  0 ,  17
0 ,  23 ,  34
1 ,  21 ,  36
15 ,  9 ,  99
10 ,  0 ,  102
20 ,  0 ,  173
22 ,  17 ,  184
1 ,  0 ,  215
7 ,  0 ,  248
10 ,  0 ,  250
11 ,  8 ,  255
10 ,  0 ,  284
0 ,  4 ,  303
7 ,  0 ,  363
8 ,  11 ,  409
0 ,  4 ,  438
4 ,  0 ,  491
0 ,  22 ,  493
32 ,  0 ,  565
9 ,  0 ,  567
0 ,  17 ,  601
23 ,  15 ,  618
0 ,  9 ,  681
0 ,  15 ,  685
20 ,  0 ,  720
0 ,  23 ,  750
0 ,  2 ,  772
21 ,  0 ,  794
0 ,  4 ,  807
10 ,  0 ,  923
0 ,  25 ,  943


In [157]:
submission = pd.DataFrame(all_predictions_v1, columns=['pred'])
submission.to_csv(os.path.join(submission_dir, '0422_real_final.csv'), index=False)

### **예측**

In [21]:
# row 생략 없이 출력
pd.set_option('display.max_rows', None)
# col 생략 없이 출력
pd.set_option('display.max_columns', None)

In [163]:
one_df = pd.read_csv(submission_dir+"/0422_real_final.csv")
two_df = pd.read_csv(submission_dir+"/0421_submission_ver1_trunc_kfold.csv")
test_df = pd.read_csv(submission_dir+"/test.csv")

one = list(one_df['pred'])
two = list(two_df['pred'])

In [164]:
idx = []
ans1 = []
ans2 = []
for i, val in enumerate(one):
    if val != two[i]:
        idx.append(i)
        ans1.append(val)
        ans2.append(two[i])
    else:
        ans1.append(0)
        ans2.append(0)

In [165]:
test_df['ans1'] = ans1
test_df['ans2'] = ans2

In [166]:
len(idx)

39

In [167]:
test_df.iloc[idx, :]

Unnamed: 0,sentence,entity_01,entity_02,label,ans1,ans2
6,카를로스 4세 RELATION 부르봉 왕가 </s></s> 카를로스 4세(Carlo...,카를로스 4세,부르봉 왕가,100,7,4
17,찰스 1세 RELATION 스튜어트 </s></s> 스튜어트 왕가의 남자 후손은 없...,찰스 1세,스튜어트,100,7,0
34,광주광역시 RELATION 전남대병원 </s></s> 이번 설명회는 광주광역시와 (...,광주광역시,전남대병원,100,0,23
64,"존 하지 RELATION 미국 육군 </s></s> 여운형이 암살당한뒤 2개월 뒤,...",존 하지,미국 육군,100,4,0
90,문재인 RELATION 김정은 </s></s> 문재인 대통령이 김정은 북한 국무위원...,문재인,김정은,100,0,5
94,비다쓰 천황 RELATION 긴메이 천황 </s></s> 585년 8월 비다쓰 천황...,비다쓰 천황,긴메이 천황,100,11,0
100,조선민족청년단 RELATION 자유당 </s></s> 이때 자유당의 실력자 1순위에...,조선민족청년단,자유당,100,0,15
113,콘돌리자 라이스 RELATION 조지 W. 부시 </s></s> 스팀슨 센터 출신으...,콘돌리자 라이스,조지 W. 부시,100,0,5
161,삼성전기 RELATION MLCC </s></s> 그는 “일본의 수출 규제가 MLC...,삼성전기,MLCC,100,0,17
166,경제기획원 RELATION 대한민국 </s></s> 경제기획원에 따르면 인구의 79...,경제기획원,대한민국,100,9,0
