In [2]:
import pandas as pd
import numpy as np
import os
import time
import glob
import random
import argparse
from pathlib import Path
import re
import tarfile
import pickle as pickle
from tqdm import tqdm
from pororo import Pororo

import torch
from torch import nn
import torch.tensor as tensor
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau, CosineAnnealingLR
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import AdamW
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from transformers import *
import transformers
import wandb
import warnings
from collections import Counter

# import modules made by me
import os
from ohsuz.utils import *
from ohsuz.loss import *
from ohsuz.config import *
from ohsuz.mectric import *

There are 1 GPU(s) available.
We will use the GPU: Tesla V100-PCIE-32GB
There are 1 GPU(s) available.
We will use the GPU: Tesla V100-PCIE-32GB


In [15]:
# row 생략 없이 출력
pd.set_option('display.max_rows', None)
# col 생략 없이 출력
pd.set_option('display.max_columns', None)

### **1. logit을 담을 틀을 만듭시다**

In [24]:
train = pd.read_csv(os.path.join(train_dir, 'real_ner.tsv'), sep='\t')
test = pd.read_csv(os.path.join(test_dir, 'real_test.tsv'), sep='\t')

In [25]:
train.head()

Unnamed: 0,sentence,entity_01,entity_02,label
0,영국에서 사용되는 스포츠 유틸리티 @ α ARTIFACT α 자동차@의 브랜드로는 ...,랜드로버,자동차,17
1,선거에서 # β ORGANIZATION β 민주당#은 해산 전 의석인 230석에 한...,민주당,27석,0
2,# β ORGANIZATION β 유럽 축구 연맹#(@ α ORGANIZATION ...,유럽 축구 연맹,UEFA,6
3,용병 @ α CIVILIZATION α 공격수@ 챠디의 부진과 시즌 초 활약한 # ...,강수일,공격수,2
4,# β LOCATION β 람캄행# 왕은 1237년에서 1247년 사이 수코타이의 ...,람캄행,퍼쿤 씨 인트라팃,8


In [26]:
train_e1, train_e2, test_e1, test_e2 = [], [], [], []

In [27]:
train_sent = list(train['sentence'])
test_sent = list(test['sentence'])

In [28]:
text = train_sent[0]

In [29]:
text

'영국에서 사용되는 스포츠 유틸리티 @ α ARTIFACT α 자동차@의 브랜드로는 # β ORGANIZATION β 랜드로버#(Land Rover)와 지프(Jeep)가 있으며, 이 브랜드들은 자동차의 종류를 일컫는 말로 사용되기도 한다.'

In [30]:
text.split('α')[1].strip()

'ARTIFACT'

In [31]:
for sent in train_sent:
    train_e1.append(sent.split('α')[1].strip())
    train_e2.append(sent.split('β')[1].strip())

In [32]:
for sent in test_sent:
    test_e1.append(sent.split('α')[1].strip())
    test_e2.append(sent.split('β')[1].strip())

In [33]:
train_df = pd.DataFrame({'e1':train_e1, 'e2':train_e2})
test_df = pd.DataFrame({'e1':test_e1, 'e2':test_e2})

In [34]:
ner = set(train_e1 + train_e2 + test_e1 + test_e2)

In [35]:
print(ner)

{'LOCATION', 'PLANT', 'STUDY_FIELD', 'OCCUPATION', 'CIVILIZATION', 'ANIMAL', 'MATERIAL', 'PERSON', 'QUANTITY', 'DISEASE', 'O', 'ARTIFACT', 'DATE', 'ORGANIZATION', 'TIME', 'CITY', 'COUNTRY', 'TERM', 'THEORY', 'EVENT'}


In [36]:
ner_dict = {}

In [37]:
from itertools import permutations

ner_dict_key = list(permutations(ner, 2))
ner_dict_value = [[0]*42 for _ in range(len(ner_dict_key))]

In [38]:
e1 = [key[0] for key in ner_dict_key]
e2 = [key[1] for key in ner_dict_key]

In [39]:
ner_dict = {'e1': e1, 'e2': e2, 'logit': ner_dict_value}

In [40]:
ner_df = pd.DataFrame(ner_dict)

In [41]:
ner_df

Unnamed: 0,e1,e2,logit
0,LOCATION,PLANT,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,LOCATION,STUDY_FIELD,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,LOCATION,OCCUPATION,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,LOCATION,CIVILIZATION,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,LOCATION,ANIMAL,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,LOCATION,MATERIAL,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6,LOCATION,PERSON,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7,LOCATION,QUANTITY,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
8,LOCATION,DISEASE,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
9,LOCATION,O,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


### **2. 직접 거를까 도구를 쓸까**

In [17]:
label_dict = {'관계_없음': 0, '인물:배우자': 1, '인물:직업/직함': 2, '단체:모회사': 3, '인물:소속단체': 4, '인물:동료': 5, '단체:별칭': 6, '인물:출신성분/국적': 7, '인물:부모님': 8, '단체:본사_국가': 9, '단체:구성원': 10, '인물:기타_친족': 11, '단체:창립자': 12, '단체:주주': 13, '인물:사망_일시': 14, '단체:상위_단체': 15, '단체:본사_주(도)': 16, '단체:제작': 17, '인물:사망_원인': 18, '인물:출생_도시': 19, '단체:본사_도시': 20, '인물:자녀': 21, '인물:제작': 22, '단체:하위_단체': 23, '인물:별칭': 24, '인물:형제/자매/남매': 25, '인물:출생_국가': 26, '인물:출생_일시': 27, '단체:구성원_수': 28, '단체:자회사': 29, '인물:거주_주(도)': 30, '단체:해산일': 31, '인물:거주_도시': 32, '단체:창립일': 33, '인물:종교': 34, '인물:거주_국가': 35, '인물:용의자': 36, '인물:사망_도시': 37, '단체:정치/종교성향': 38, '인물:학교': 39, '인물:사망_국가': 40, '인물:나이': 41}

In [19]:
label_dict_1 = {'tag':list(label_dict.keys()), 'value':list(label_dict.values())}

In [21]:
label_df = pd.DataFrame(label_dict_1)

In [22]:
label_df

Unnamed: 0,tag,value
0,관계_없음,0
1,인물:배우자,1
2,인물:직업/직함,2
3,단체:모회사,3
4,인물:소속단체,4
5,인물:동료,5
6,단체:별칭,6
7,인물:출신성분/국적,7
8,인물:부모님,8
9,단체:본사_국가,9


In [21]:
labels = list(label_dict.keys())

In [24]:
print(labels)

['관계_없음', '인물:배우자', '인물:직업/직함', '단체:모회사', '인물:소속단체', '인물:동료', '단체:별칭', '인물:출신성분/국적', '인물:부모님', '단체:본사_국가', '단체:구성원', '인물:기타_친족', '단체:창립자', '단체:주주', '인물:사망_일시', '단체:상위_단체', '단체:본사_주(도)', '단체:제작', '인물:사망_원인', '인물:출생_도시', '단체:본사_도시', '인물:자녀', '인물:제작', '단체:하위_단체', '인물:별칭', '인물:형제/자매/남매', '인물:출생_국가', '인물:출생_일시', '단체:구성원_수', '단체:자회사', '인물:거주_주(도)', '단체:해산일', '인물:거주_도시', '단체:창립일', '인물:종교', '인물:거주_국가', '인물:용의자', '인물:사망_도시', '단체:정치/종교성향', '인물:학교', '인물:사망_국가', '인물:나이']


In [23]:
Pororo.available_tasks() # 쓸만한 것 -> similarity, w2v, wordvec, word2vec, word_vector, word_embedding, machine_translation, translation

"Available tasks are ['mrc', 'rc', 'qa', 'question_answering', 'machine_reading_comprehension', 'reading_comprehension', 'sentiment', 'sentiment_analysis', 'nli', 'natural_language_inference', 'inference', 'fill', 'fill_in_blank', 'fib', 'para', 'pi', 'cse', 'contextual_subword_embedding', 'similarity', 'sts', 'semantic_textual_similarity', 'sentence_similarity', 'sentvec', 'sentence_embedding', 'sentence_vector', 'se', 'inflection', 'morphological_inflection', 'g2p', 'grapheme_to_phoneme', 'grapheme_to_phoneme_conversion', 'w2v', 'wordvec', 'word2vec', 'word_vector', 'word_embedding', 'tokenize', 'tokenise', 'tokenization', 'tokenisation', 'tok', 'segmentation', 'seg', 'mt', 'machine_translation', 'translation', 'pos', 'tag', 'pos_tagging', 'tagging', 'const', 'constituency', 'constituency_parsing', 'cp', 'pg', 'collocation', 'collocate', 'col', 'word_translation', 'wt', 'summarization', 'summarisation', 'text_summarization', 'text_summarisation', 'summary', 'gec', 'review', 'review_s