In [1]:
import json
import sys
import glob
import torch
sys.path.append('../')
import os
from transformers import *
from frameBERT.src import utils
from frameBERT.src import dataio
from frameBERT.src import eval_fn
from frameBERT import frame_parser
from frameBERT.src.modeling import BertForJointShallowSemanticParsing
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
from torch import nn
from torch.optim import Adam
from tqdm import tqdm, trange

from pprint import pprint

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
if device != "cpu":
    torch.cuda.set_device(0)
import pickle

import numpy as np
import random
np.random.seed(0)   
random.seed(0)

from torch import autograd
torch.cuda.empty_cache()

import argparse

Using TensorFlow backend.



###DEVICE: cuda

###DEVICE: cuda


In [2]:
try:
    dir_path = os.path.dirname(os.path.abspath( __file__ ))
except:
    dir_path = '.'
    
# 실행시간 측정 함수
import time

_start_time = time.time()

def tic():
    global _start_time 
    _start_time = time.time()

def tac():
    t_sec = round(time.time() - _start_time)
    (t_min, t_sec) = divmod(t_sec,60)
    (t_hour,t_min) = divmod(t_min,60)
    
    result = '{}hour:{}min:{}sec'.format(t_hour,t_min,t_sec)
    return result

# Define task

In [3]:
srl = 'framenet'
language = 'multilingual'
fnversion = '1.2'

parser = argparse.ArgumentParser()
parser.add_argument('--model', required=False, help='모델 폴더', default='/disk/frameBERT/cltl_eval/models/efn_ekfn_multitask/34')
parser.add_argument('--domain', required=True, help='도메인')
parser.add_argument('--result', required=False, help='결과 저장 폴더', default=False)
args = parser.parse_args()

print('#####')
print('\ttask:', srl)
print('\tlanguage:', language)
print('\tfn_version:', fnversion)
bert_io = utils.for_BERT(mode='train', language=language, masking=True, fnversion=fnversion)

#####
	task: framenet
	language: multilingual
	fn_version: 1.2
used dictionary:
	 /disk/frameBERT/frameBERT/src/../koreanframenet/resource/info/mul_lu2idx.json
	 /disk/frameBERT/frameBERT/src/../koreanframenet/resource/info/mul_lufrmap.json
	 /disk/frameBERT/frameBERT/src/../koreanframenet/resource/info/mul_bio_frargmap.json


# Load data

In [4]:
from koreanframenet import koreanframenet
kfn = koreanframenet.interface(version=fnversion)

en_trn, en_dev, en_tst = dataio.load_data(srl=srl, language='en')

ekfn_trn_d, ekfn_tst_d = kfn.load_data(source='efn')
jkfn_trn_d, jkfn_tst_d = kfn.load_data(source='jfn')
skfn_trn_d, skfn_unlabel_d, skfn_tst_d = kfn.load_data(source='sejong')
pkfn_trn_d, pkfn_unlabel_d, pkfn_tst_d = kfn.load_data(source='propbank')

ekfn_trn = dataio.data2tgt_data(ekfn_trn_d, mode='train')
ekfn_tst = dataio.data2tgt_data(ekfn_tst_d, mode='train')

jkfn_trn = dataio.data2tgt_data(jkfn_trn_d, mode='train')
jkfn_tst = dataio.data2tgt_data(jkfn_tst_d, mode='train')

skfn_trn = dataio.data2tgt_data(skfn_trn_d, mode='train')
skfn_unlabel = dataio.data2tgt_data(skfn_unlabel_d, mode='train')
skfn_tst = dataio.data2tgt_data(skfn_tst_d, mode='train')

pkfn_trn = dataio.data2tgt_data(pkfn_trn_d, mode='train')
pkfn_unlabel = dataio.data2tgt_data(pkfn_unlabel_d, mode='train')
pkfn_tst = dataio.data2tgt_data(pkfn_tst_d, mode='train')

# of instances in trn: 19391
# of instances in dev: 2272
# of instances in tst: 6714
data example: [['Greece', 'wildfires', 'force', 'thousands', 'to', '<tgt>', 'evacuate', '</tgt>'], ['_', '_', '_', '_', '_', '_', 'evacuate.v', '_'], ['_', '_', '_', '_', '_', '_', 'Escaping', '_'], ['O', 'O', 'O', 'B-Escapee', 'O', 'X', 'O', 'X']]

### loading Korean FrameNet (from efn )
tuples: (trn, tst)
10647 3550

### loading Korean FrameNet (from jfn )
tuples: (trn, tst)
2200 1000

### loading Korean FrameNet (from sejong )
tuples: (trn, unlabel_data, tst)
500 4212 1000

### loading Korean FrameNet (from propbank )
tuples: (trn, unlabel_data, tst)
500 852 1000


# Define Dataset

In [5]:
trn_data = {}
trn_data['ekfn'] = ekfn_trn
trn_data['jkfn'] = jkfn_trn
trn_data['skfn'] = skfn_trn
trn_data['pkfn'] = pkfn_trn

tst_data = {}
tst_data['ekfn'] = ekfn_tst
tst_data['jkfn'] = jkfn_tst
tst_data['skfn'] = skfn_tst
tst_data['pkfn'] = pkfn_tst

unlabeled_data = {}
unlabeled_data['ekfn'] = ekfn_trn
unlabeled_data['jkfn'] = jkfn_trn
unlabeled_data['skfn'] = skfn_trn + skfn_unlabel
unlabeled_data['pkfn'] = pkfn_trn + pkfn_unlabel

# Pre-trained Model

In [6]:
model_path = '/disk/frameBERT/cltl_eval/models/efn_ekfn_multitask/34'
model_name = 'efn_ekfn_multitask/34'
pretrained_model = args.model
# pretrained_model = '/disk/frameBERT/models/enModel-fn17/2'
print('pretrained_model:', pretrained_model)

pretrained_model: /disk/frameBERT/cltl_eval/models/efn_ekfn_finetune_pkfn/best


# Parsing Unlabeld data

In [7]:
def parsing_unlabeled_data(model_path, masking=True, language='ko', data='ekfn', threshold=0.7, added_list=[]):
#     torch.cuda.set_device(device)
    model = frame_parser.FrameParser(srl=srl,gold_pred=True, model_path=model_path, masking=masking, language=language, info=False)    
    result = []
    for i in range(len(unlabeled_data[data])):
        instance = unlabeled_data[data][i]
        
        if i not in added_list:

            parsed = model.parser(instance, result_format='all')        
            conll = parsed['conll'][0]
            frame_score = parsed['topk']['targets'][0]['frame_candidates'][0][-1]

            if frame_score >= float(threshold):
                parsed_result = conll
                result.append(parsed_result)
                added_list.append(i)
            
    added_list.sort()
        
    return result, added_list

In [8]:
def train(model_path="bert-base-multilingual-cased",
          model_saved_path=False, epochs=10, batch_size=6, 
          trn=False): 
            
    if not os.path.exists(model_saved_path):
        os.makedirs(model_saved_path)
    print('### START TRAINING:', model_saved_path)
    # load a pre-trained model first
    model = BertForJointShallowSemanticParsing.from_pretrained(model_path, 
                                                               num_senses = len(bert_io.sense2idx), 
                                                               num_args = len(bert_io.bio_arg2idx),
                                                               lufrmap=bert_io.lufrmap, 
                                                               frargmap = bert_io.bio_frargmap)
    model.to(device)
    
    print('\nconverting data to BERT input...')
    print('# of instances:', len(trn))
    trn_data = bert_io.convert_to_bert_input_JointShallowSemanticParsing(trn)
    sampler = RandomSampler(trn)
    trn_dataloader = DataLoader(trn_data, sampler=sampler, batch_size=batch_size)
    
    # load optimizer
    FULL_FINETUNING = True
    if FULL_FINETUNING:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0}
        ]
    else:
        param_optimizer = list(model.classifier.named_parameters()) 
        optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
    optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)
    
    max_grad_norm = 1.0
    
    for _ in trange(epochs, desc="Epoch"):
        
        # TRAIN loop
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(trn_dataloader):
            model.train()
            # add batch to gpu
            torch.cuda.set_device(device)
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_orig_tok_to_maps, b_input_lus, b_input_senses, b_input_args, b_token_type_ids, b_input_masks = batch            
            loss = model(b_input_ids, lus=b_input_lus, senses=b_input_senses, args=b_input_args,
                     token_type_ids=b_token_type_ids, attention_mask=b_input_masks)
            
            # backward pass
            loss.backward()
            # track train loss
            tr_loss += loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1
            
            # gradient clipping
            torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
            
            # update parameters
            optimizer.step()
            model.zero_grad()            

    # save your model at 10 epochs
    model.save_pretrained(model_saved_path)
    print('... TRAINNG is DONE')

In [None]:
model_saved_dir = '/disk/frameBERT/cltl_eval/models/'

if args.result:
    result_dir = args.result
else:
    result_dir = 'self_'+ args.domain +'_using_'+ args.model
model_saved_dir = model_saved_dir + result_dir

if model_saved_dir[-1] != '/':
    model_saved_dir = model_saved_dir+'/'
    
if not os.path.exists(model_saved_dir):
    os.makedirs(model_saved_dir)
print('your models are saved to', model_saved_path)
    
iters = 5
threshold = 0.9
instance = []
added_list = []
batch_size = 6

for _ in trange(iters, desc="Iteration"):
    iteration = _ + 1    
    
    if iteration == 1:
        pre_model = BertForJointShallowSemanticParsing.from_pretrained(pretrained_model, 
                                                               num_senses = len(bert_io.sense2idx), 
                                                               num_args = len(bert_io.bio_arg2idx),
                                                               lufrmap=bert_io.lufrmap, 
                                                               frargmap = bert_io.bio_frargmap)
        pre_model.to(device)
        
        model_saved_path = model_saved_dir+'0/'
        if not os.path.exists(model_saved_path):
            os.makedirs(model_saved_path)
        pre_model.save_pretrained(model_saved_path)
        
    
        
        
    parsing_model_path = model_saved_dir + str(iteration-1) +'/'
    model_saved_path = model_saved_dir+str(iteration)+'/'
    if not os.path.exists(model_saved_path):
        os.makedirs(model_saved_path)
    
    print('\n### ITERATION:', str(iteration))
    trn = trn_data['ekfn']
    print('### PARSING START...')
    parsed_result, added_list = parsing_unlabeled_data(parsing_model_path, data=args.domain, 
                                                       masking=True, 
                                                       threshold=threshold, added_list=added_list)
    instance += parsed_result
    print('... is done')
    
    # training process
    trn_instance = trn + instance
    
    print('\n# of original training data:', len(trn))
    print('# of all unlabeled data:', len(unlabeled_data[args.domain]))
    print('# of psuedo labeled data:', len(instance), '('+str((round(len(instance)/len(unlabeled_data[args.domain])*100), 2))+'%)')
    print('Total Training Instance:', len(trn_instance), '\n') 
    
    train(model_path=parsing_model_path, model_saved_path=model_saved_path, trn=trn_instance)    

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]


### ITERATION: 1
### PARSING START...


  pred_logits = sm(masked_logit).view(1,-1)
  pred_logits = sm(masked_logit).view(1,-1)


... is done

# of original training data: 500
# of all unlabeled data: 852
# of psuedo labeled data: 755 ((89, 2))
Total Training Instance: 1255 

[['증권전문가들은', '이틀', '연속', '대형우량주에', '매수세가', '형성되면서', '반등에', '<tgt>', '성공했으나', '</tgt>', '기관과', '개인투자자의', '매물공세가', '계속되는', '한', '불안정한', '흐름이', '이어질', '수밖에', '없다고', '내다봤다.'], ['_', '_', '_', '_', '_', '_', '_', '_', '성공하다.v', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_', 'Success_or_failure', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_'], ['B-Agent', 'O', 'O', 'O', 'O', 'O', 'O', 'X', 'O', 'X', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
[['레딩', '집행위원은', '유럽의', '학교간', '정보통신망', '구축을', '위해', '시험', '운용', '중인', '유럽학교통신망(EUN)도', '강화할', '필요가', '있다고', '밝혔다.'], ['_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '강화하다.v', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', 'Cause_change_of_strength', '_', '_', '_'], ['B-Agent', 'I-Agent', 'O', 'O',


Epoch:   0%|          | 0/10 [00:00<?, ?it/s][A
Epoch:  10%|█         | 1/10 [01:20<12:08, 80.90s/it][A
Epoch:  20%|██        | 2/10 [02:44<10:53, 81.72s/it][A
Epoch:  30%|███       | 3/10 [04:13<09:46, 83.85s/it][A
Epoch:  40%|████      | 4/10 [05:42<08:33, 85.50s/it][A
Epoch:  50%|█████     | 5/10 [07:12<07:13, 86.74s/it][A
Epoch:  60%|██████    | 6/10 [08:41<05:50, 87.60s/it][A
Epoch:  70%|███████   | 7/10 [10:11<04:24, 88.24s/it][A
Epoch:  80%|████████  | 8/10 [11:41<02:57, 88.70s/it][A
Epoch:  90%|█████████ | 9/10 [13:11<01:29, 89.06s/it][A
Epoch: 100%|██████████| 10/10 [14:41<00:00, 88.14s/it][A
Iteration:  10%|█         | 1/10 [15:57<2:23:41, 957.98s/it]

... TRAINNG is DONE

### ITERATION: 2
### PARSING START...
... is done

# of original training data: 500
# of all unlabeled data: 852
# of psuedo labeled data: 802 ((94, 2))
Total Training Instance: 1302 

[['증권전문가들은', '이틀', '연속', '대형우량주에', '매수세가', '형성되면서', '반등에', '<tgt>', '성공했으나', '</tgt>', '기관과', '개인투자자의', '매물공세가', '계속되는', '한', '불안정한', '흐름이', '이어질', '수밖에', '없다고', '내다봤다.'], ['_', '_', '_', '_', '_', '_', '_', '_', '성공하다.v', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_', 'Success_or_failure', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_'], ['B-Agent', 'O', 'O', 'O', 'O', 'O', 'O', 'X', 'O', 'X', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
[['레딩', '집행위원은', '유럽의', '학교간', '정보통신망', '구축을', '위해', '시험', '운용', '중인', '유럽학교통신망(EUN)도', '강화할', '필요가', '있다고', '밝혔다.'], ['_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '강화하다.v', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', 'Cause_change_of_


Epoch:   0%|          | 0/10 [00:00<?, ?it/s][A
Epoch:  10%|█         | 1/10 [01:30<13:37, 90.86s/it][A
Epoch:  20%|██        | 2/10 [03:04<12:12, 91.55s/it][A
Epoch:  30%|███       | 3/10 [04:37<10:44, 92.03s/it][A
Epoch:  40%|████      | 4/10 [06:10<09:14, 92.36s/it][A
Epoch:  50%|█████     | 5/10 [07:43<07:43, 92.62s/it][A
Epoch:  60%|██████    | 6/10 [09:16<06:11, 92.81s/it][A
Epoch:  70%|███████   | 7/10 [10:50<04:38, 92.96s/it][A
Epoch:  80%|████████  | 8/10 [12:23<03:06, 93.03s/it][A
Epoch:  90%|█████████ | 9/10 [13:56<01:33, 93.11s/it][A
Epoch: 100%|██████████| 10/10 [15:29<00:00, 92.98s/it][A
Iteration:  20%|██        | 2/10 [31:44<2:07:16, 954.54s/it]

... TRAINNG is DONE

### ITERATION: 3
### PARSING START...
... is done

# of original training data: 500
# of all unlabeled data: 852
# of psuedo labeled data: 824 ((97, 2))
Total Training Instance: 1324 

[['증권전문가들은', '이틀', '연속', '대형우량주에', '매수세가', '형성되면서', '반등에', '<tgt>', '성공했으나', '</tgt>', '기관과', '개인투자자의', '매물공세가', '계속되는', '한', '불안정한', '흐름이', '이어질', '수밖에', '없다고', '내다봤다.'], ['_', '_', '_', '_', '_', '_', '_', '_', '성공하다.v', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_', 'Success_or_failure', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_'], ['B-Agent', 'O', 'O', 'O', 'O', 'O', 'O', 'X', 'O', 'X', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
[['레딩', '집행위원은', '유럽의', '학교간', '정보통신망', '구축을', '위해', '시험', '운용', '중인', '유럽학교통신망(EUN)도', '강화할', '필요가', '있다고', '밝혔다.'], ['_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '강화하다.v', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', 'Cause_change_of_


Epoch:   0%|          | 0/10 [00:00<?, ?it/s][A
Epoch:  10%|█         | 1/10 [01:33<13:57, 93.03s/it][A
Epoch:  20%|██        | 2/10 [03:08<12:28, 93.62s/it][A

# Training