# Task 6. predict document order using cosine distance

## objective
- cos_distance (query, most recent) = 1
- cos_distance (query, else) = 0

## Note
- Task 5 requires a GPU with 24 GB or more of VRAM

In [None]:
from __future__ import absolute_import, division, print_function

import argparse
import csv
import json
import logging
import os
import random
import sys

import numpy as np
import torch
import torch.nn.functional as F

import torch.multiprocessing as mp
import torch.utils.data.distributed

from transformers import (WEIGHTS_NAME, AdamW, BertConfig,
                          BertForTokenClassification, BertTokenizer,
                          get_linear_schedule_with_warmup, 
                          BertPreTrainedModel, BertModel)

from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss


from torch import nn
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from tqdm import tqdm, trange

from seqeval.metrics import classification_report
from seqeval.metrics import sequence_labeling 
# Configure the root logger (timestamp, level, logger name, message; INFO and
# above) and create the module-level logger used by the training code.
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)
import codecs

try:
    import cPickle as pickle
except ModuleNotFoundError:
    import pickle
    
import glob
import time

import easydict

import six
import tqdm
from tqdm import tqdm, trange
import collections

from seqeval.metrics import classification_report

# Same logging setup as earlier in this cell, repeated verbatim.
# NOTE(review): logging.basicConfig does nothing once the root logger already
# has handlers, so this second call is presumably redundant - confirm before
# removing.
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)

import re

import shutil


import math
from datetime import datetime
import os
import gzip
import csv

## data class

In [None]:
class TrainingInstance_ext(object):
    """One training example holding two encoded inputs and their label.

    Each of the two inputs (suffix ``_0`` and ``_1``) is stored as three
    parallel fields: token ids, attention mask and segment ids.  The values
    are stored exactly as given; no copying or validation is performed.
    """
    def __init__(self, input_ids_0, input_mask_0, segment_ids_0,
                 input_ids_1, input_mask_1, segment_ids_1, label_ids):
        # Encoding of the first input (index 0).
        self.input_ids_0, self.input_mask_0, self.segment_ids_0 = (
            input_ids_0, input_mask_0, segment_ids_0)

        # Encoding of the second input (index 1).
        self.input_ids_1, self.input_mask_1, self.segment_ids_1 = (
            input_ids_1, input_mask_1, segment_ids_1)

        # Target value(s) for this pair.
        self.label_ids = label_ids
        

## BERT with cos distance class

In [None]:
class SBERT(BertPreTrainedModel):
    """BERT encoder with two projection heads trained on cosine distance.

    One shared ``BertModel`` encodes both inputs.  The first-token embedding
    of each input goes through its own two-layer feed-forward head (one head
    for the query, one for the candidate), is L2-normalised, and the cosine
    distance between the two results is regressed onto the label with an MSE
    loss.
    """

    def __init__(self, config):
        super().__init__(config)

        self.bert = BertModel(config, add_pooling_layer=False)

        # Query head: hidden -> hidden // 2 -> hidden.
        self.ffnn_qury_0 = nn.Linear(config.hidden_size, config.hidden_size//2)
        self.ffnn_qury_1 = nn.Linear(config.hidden_size//2, config.hidden_size)

        # Candidate head, same layout as the query head but separate weights.
        self.ffnn_cand_0 = nn.Linear(config.hidden_size, config.hidden_size//2)
        self.ffnn_cand_1 = nn.Linear(config.hidden_size//2, config.hidden_size)

        self.act1 = nn.ReLU()
        # act2 is not used by forward(); it is kept so the module layout is
        # unchanged for existing code that may reference it.
        self.act2 = nn.ReLU()

        self.init_weights()
        self.loss = nn.MSELoss()

    def _embed(self, features, ffnn_0, ffnn_1):
        """Encode one input and return its normalised projected embedding.

        Args:
            features: dict with the keys ``'input_ids'``, ``'attention_mask'``
                and ``'token_type_ids'``; the values are passed to BERT as is.
            ffnn_0: first linear layer of the head to apply.
            ffnn_1: second linear layer of the head to apply.

        Returns:
            One unit-length row per batch element (an all-zero row stays zero).
        """
        outputs = self.bert(
            input_ids=features['input_ids'],
            attention_mask=features['attention_mask'],
            token_type_ids=features['token_type_ids'],
            position_ids=None, head_mask=None, inputs_embeds=None,
            output_attentions=None, output_hidden_states=None, return_dict=None,
        )
        sequence_output = outputs[0]

        # Embedding at position 0 of every sequence (the [CLS] slot).
        cls_embeds = sequence_output[:, 0]

        # Two linear layers, each followed by ReLU (removes negatives).
        hidden = self.act1(ffnn_0(cls_embeds))
        projected = self.act1(ffnn_1(hidden))

        # F.normalize divides by max(norm, eps), so a row that the ReLU turned
        # into all zeros stays zero instead of becoming NaN (0 / 0).
        return F.normalize(projected, p=2, dim=-1)

    def forward(self, sentence_features, label_ids):
        """Compute the cosine distance between query and candidate, and its loss.

        Args:
            sentence_features: sequence of two feature dicts; index 0 is
                encoded with the query head, index 1 with the candidate head.
            label_ids: per-example labels indexed as ``label_ids[b][0]``; only
                the first entry of each row is used as the target distance.

        Returns:
            ``(loss, cos_distance)`` where ``cos_distance`` holds
            ``1 - cosine_similarity`` per batch element and ``loss`` is the
            MSE between those distances and the labels.
        """
        query = self._embed(sentence_features[0], self.ffnn_qury_0, self.ffnn_qury_1)
        candidate = self._embed(sentence_features[1], self.ffnn_cand_0, self.ffnn_cand_1)

        # cosine distance = 1 - CosineSimilarity
        cos_sim_fn = nn.CosineSimilarity(dim=1, eps=1e-6)
        cos_distance = 1 - cos_sim_fn(query, candidate)

        if not isinstance(label_ids, torch.Tensor):
            label_ids = torch.as_tensor(label_ids)
        # First label of every row, as floats on the same device as the output.
        labels = label_ids[:, 0].to(device=cos_distance.device, dtype=cos_distance.dtype)

        loss = self.loss(cos_distance, labels)
        return loss, cos_distance

In [None]:
def train(args):
    """Fine-tune ``SBERT`` on cached sentence-pair features and save the model.

    Loads every pickled feature file matching ``args.data_dir``, builds one
    sequential ``DataLoader`` over them, loads ``SBERT`` from
    ``args.checkpoint``, trains for ``args.num_train_epochs`` epochs and
    finally saves the model under ``<args.output_dir>/<global_step>``.  The
    loss of every optimizer step is appended to ``<args.output_dir>/loss.txt``;
    ``args.output_dir`` must already exist.

    Args:
        args: attribute-style configuration.  Fields read here: ``server_ip``,
            ``server_port``, ``local_rank``, ``no_cuda``, ``fp16``,
            ``fp16_opt_level``, ``gradient_accumulation_steps``,
            ``train_batch_size`` (overwritten with the per-step batch size),
            ``data_dir`` (comma-separated glob patterns), ``checkpoint``,
            ``num_train_epochs``, ``weight_decay``, ``warmup_proportion``,
            ``learning_rate``, ``adam_epsilon``, ``max_grad_norm`` and
            ``output_dir``.

    Raises:
        ValueError: if ``args.gradient_accumulation_steps`` is below 1.
        FileNotFoundError: if no file matches ``args.data_dir``.
        ImportError: if ``args.fp16`` is set but apex is not installed.
    """
    ############################################################################
    ##              Multi-GPUs, Distributed settings
    ############################################################################
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        # Only count GPUs when actually running on CUDA; otherwise a CPU model
        # would be wrapped in DataParallel below on a multi-GPU machine.
        n_gpu = torch.cuda.device_count() if device.type == "cuda" else 0
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    print("n_gpu: ", n_gpu)

    ############################################################################
    ##              batch size, gradient_accumulation_steps
    ############################################################################
    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))
    # From here on train_batch_size is the size of one forward/backward pass.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
    print("args.train_batch_size: ", args.train_batch_size)

    ###############################################################
    #                Functions for Dataload (load cache data)
    ###############################################################
    def get_file_arrays(data_path):
        """Expand comma-separated glob patterns into one sorted file list."""
        data_path = data_path.split(",")
        print("data_path: ", data_path)

        cache_files_all = []
        for pattern in data_path:
            cache_files_all.extend(glob.glob(pattern.strip()))
        cache_files_all.sort()

        print("Got " + str(len(cache_files_all)) + " cache_files")

        return cache_files_all

    def split_file_array(cache_files, num_files_on_memory):
        """Split a file list into consecutive groups of at most N files.

        Not called anywhere in this function at present.
        """
        return [cache_files[i:i + num_files_on_memory]
                for i in range(0, len(cache_files), num_files_on_memory)]

    def read_files(cache_files_pieces):
        """Load the pickled features and wrap them in a DataLoader.

        Returns ``(train_dataloader, num_train_examples)``.
        """
        train_features = []
        for cache_path in cache_files_pieces:
            # SECURITY: pickle.load can execute arbitrary code; only point
            # data_dir at cache files that come from a trusted source.
            with open(cache_path, 'rb') as cache_file:
                train_features.extend(pickle.load(cache_file))

        num_train_examples = len(train_features)

        def as_long_tensor(values):
            # NOTE(review): squeeze() without a dim drops every size-1 axis,
            # including the example axis when only one example is loaded -
            # confirm the cached feature layout before changing this.
            return torch.squeeze(torch.tensor(values, dtype=torch.long))

        all_input_ids_0 = as_long_tensor([f.input_ids_0 for f in train_features])
        all_input_mask_0 = as_long_tensor([f.input_mask_0 for f in train_features])
        all_segment_ids_0 = as_long_tensor([f.segment_ids_0 for f in train_features])

        all_input_ids_1 = as_long_tensor([f.input_ids_1 for f in train_features])
        all_input_mask_1 = as_long_tensor([f.input_mask_1 for f in train_features])
        all_segment_ids_1 = as_long_tensor([f.segment_ids_1 for f in train_features])

        all_label_ids = as_long_tensor([f.label_ids for f in train_features])

        train_data = TensorDataset(
            all_input_ids_0, all_input_mask_0, all_segment_ids_0,
            all_input_ids_1, all_input_mask_1, all_segment_ids_1,
            all_label_ids
        )

        if args.local_rank == -1:
            # Since one query document must be compared with multiple documents, dont mix the order.
            train_sampler = SequentialSampler(train_data)
        else:
            # Referenced through the module the file already imports;
            # the bare name DistributedSampler is never imported.
            train_sampler = torch.utils.data.distributed.DistributedSampler(train_data)

        print("len(train_data): ", len(train_data))
        print("args.train_batch_size: ", args.train_batch_size)

        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        print("len(train_dataloader): ", len(train_dataloader))

        return train_dataloader, num_train_examples

    ############################################################################
    ##                    Load Training data
    ############################################################################
    label_list = ["0", "1"] # not extraction, extraction
    num_labels = len(label_list) # no "+ 1" here; in NER it was added because of [PAD]

    ###############################################################
    #            Dataload (load cache data) - Train data
    ###############################################################
    # Collect the list of training cache files.
    cache_files = get_file_arrays(args.data_dir)
    print("cache_files: ", cache_files)
    if not cache_files:
        # Without this check the loop below would run zero steps and an
        # untrained copy of the checkpoint would be saved.
        raise FileNotFoundError("No cache files matched data_dir: {}".format(args.data_dir))
    train_dataloader, num_train_examples = read_files(cache_files)

    ############################################################################
    ##                      Prepare model
    ############################################################################
    print("loading weights from checkpoint (", args.checkpoint, ")")
    config = BertConfig.from_pretrained(args.checkpoint, num_labels=num_labels)
    model = SBERT.from_pretrained(args.checkpoint,
              from_tf = False,
              config = config)
    print("loaded weights from checkpoint (", args.checkpoint, ")")

    # NOTE(review): rank 0 waits here and the other ranks wait at the barrier
    # further down, i.e. after every rank has already loaded the model - this
    # looks different from the usual "only rank 0 downloads" arrangement;
    # confirm the intended ordering.
    if args.local_rank == 0:
        torch.distributed.barrier()
    print("device: ", device)

    model.to(device)

    ############################################################################
    ##                        Optimizer
    ############################################################################
    num_train_optimization_steps = int(
        num_train_examples / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # counterpart of the rank-0 barrier above

    # Biases and LayerNorm weights are excluded from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias','LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    warmup_steps = int(args.warmup_proportion * num_train_optimization_steps)
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_train_optimization_steps)

    ############################################################################
    ##              MultiGPU & Distributed settings
    ############################################################################
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)

    ############################################################################
    ##              save model
    ############################################################################
    def save_model(args, global_step):
        """Save the (unwrapped) model under ``<output_dir>/<global_step>``."""
        save_dir = args.output_dir+"/"+str(global_step)
        os.makedirs(save_dir, exist_ok=True)

        # Unwrap DataParallel / DistributedDataParallel before saving.
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(save_dir)

    def make_sentence_feature(input_ids, input_mask, segment_ids):
        """Bundle one encoded input into the dict layout SBERT.forward reads."""
        return {
            'input_ids': input_ids,
            'attention_mask': input_mask,
            'token_type_ids': segment_ids,
        }

    ############################################################################
    ##              Start training
    ############################################################################
    # Starts at -1 so that the first optimizer step is numbered 0.
    global_step = -1

    # Loss log: header now, one row per optimizer step below.
    loss_path = args.output_dir+"/loss.txt"
    with open(loss_path, "w") as loss_file:
        loss_file.write("globalstep\tepoch\tstep\tloss\n")

    model.train()
    for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            # make sentence features
            batch = tuple(t.to(device) for t in batch)
            input_ids_0, input_mask_0, segment_ids_0, \
            input_ids_1, input_mask_1, segment_ids_1, label_ids = batch

            sentence_features = [
                make_sentence_feature(input_ids_0, input_mask_0, segment_ids_0),
                make_sentence_feature(input_ids_1, input_mask_1, segment_ids_1),
            ]

            # The second return value (cosine distances) is not needed here.
            loss, _ = model(sentence_features, label_ids)

            if n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu.

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            if (step + 1) % args.gradient_accumulation_steps == 0:
                # Log the loss of the batch that triggers this optimizer step.
                with open(loss_path, "a") as loss_file:
                    loss_file.write(str(global_step+1)+"\t"+str(epoch)+"\t"+str(step)+"\t"+str(loss.item())+"\n")

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

    # Single save at the very end, named after the last optimizer step.
    save_model(args, global_step)
    
    
    

In [None]:
def main(args):
    """Validate the run settings, seed the RNGs and run training/evaluation.

    Args:
        args: attribute-style configuration.  Fields read here: ``output_dir``,
            ``do_train``, ``do_eval`` and ``seed``; the whole object is passed
            on to ``train`` / ``evaluate``.

    Raises:
        ValueError: if neither ``do_train`` nor ``do_eval`` is set, or if
            ``do_train`` is set and ``output_dir`` already exists and is not
            empty.
    """
    print("args.output_dir: ", args.output_dir)

    # check directories
    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    # Seed every RNG used in this file so runs are repeatable.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if args.do_train:
        train(args)

    if args.do_eval:
        # NOTE(review): no `evaluate` function is defined in the visible part
        # of this file; confirm it exists before enabling do_eval.
        evaluate(args)

# finetune

In [None]:
# Glob patterns of the cached training features, one entry per run.
data_dirs = [
    './cache/bertbase_cased/train/*.cache',
    './cache/mbert_cased/train/*.cache',
    './cache/biobert/train/*.cache',
    './cache/kobert/train/*.cache',

]

# Checkpoint to fine-tune for each entry of data_dirs (same position).
checkpoints = [
    '../../pretrainBERT/multigpu/ver9.1.4/521121_epoch2',
    '../../pretrainBERT/multigpu/ver8.1.4/1142642_epoch2',
    '../../pretrainBERT/multigpu/ver11.1.4/521079_epoch2',
    '../../pretrainBERT/multigpu/ver12.1.4/407013_epoch2',

]

if len(data_dirs) != len(checkpoints):
    raise ValueError("data_dirs and checkpoints must have the same number of entries")

# Extra settings copied into args, keyed by the cache directory name (third
# "/"-separated component of the data_dir pattern).  Each value is
# (text printed when selected, or None; settings dict).
#
# NOTE(review): with the data_dirs above the directory names are
# "bertbase_cased" and "mbert_cased", which match neither "bertbase" nor
# "multilingualbert", so those two runs use _DEFAULT_BERT_SETTINGS ("MY BERT").
# The matching is kept exactly as it was - confirm whether that is intended.
# train() in this file builds its model config from the checkpoint and does
# not read these values.
_BERT_SETTINGS_BY_NAME = {
    "multilingualbert": ("multilingualbert configs", {
        # multilingual bert config
        "attention_probs_dropout_prob": 0.1,
        "directionality": "bidi",
        "hidden_act": "gelu",
        "hidden_dropout_prob": 0.1,
        "hidden_size": 768,
        "initializer_range": 0.02,
        "intermediate_size": 3072,
        "max_position_embeddings": 512,
        "num_attention_heads": 12,
        "num_hidden_layers": 12,
        "pooler_fc_size": 768,
        "pooler_num_attention_heads": 12,
        "pooler_num_fc_layers": 3,
        "pooler_size_per_head": 128,
        "pooler_type": "first_token_transform",
        "type_vocab_size": 2,
        "vocab_size": 119547,
        "cls_id": 101,
        "sep_id": 102,
    }),
    "biobert": ("biobert configs", {
        # BioBERT
        "attention_probs_dropout_prob": 0.1,
        "hidden_act": "gelu",
        "hidden_dropout_prob": 0.1,
        "hidden_size": 768,
        "initializer_range": 0.02,
        "intermediate_size": 3072,
        "max_position_embeddings": 512,
        "num_attention_heads": 12,
        "num_hidden_layers": 12,
        "type_vocab_size": 2,
        "vocab_size": 28996,
        "cls_id": 101,
        "sep_id": 102,
    }),
    "kobert": ("kobert configs", {
        "attention_probs_dropout_prob": 0.1,
        "gradient_checkpointing": False,
        "hidden_act": "gelu",
        "hidden_dropout_prob": 0.1,
        "hidden_size": 768,
        "initializer_range": 0.02,
        "intermediate_size": 3072,
        "layer_norm_eps": 1e-12,
        "max_position_embeddings": 512,
        "model_type": "bert",
        "num_attention_heads": 12,
        "num_hidden_layers": 12,
        "pad_token_id": 1,
        "type_vocab_size": 2,
        "vocab_size": 8002,
        "author": "Heewon Jeon(madjakarta@gmail.com)",
        "kobert_version": 1.0,
        "cls_id": 2,
        "sep_id": 3,
    }),
    "bertbase": (None, {
        "vocab_size": 30522,  # bert-base-uncased
        "hidden_size": 768,
        "num_hidden_layers": 12,
        "num_attention_heads": 12,
        "hidden_act": "gelu",
        "intermediate_size": 3072,
        "hidden_dropout_prob": 0.1,
        "attention_probs_dropout_prob": 0.1,
        "max_position_embeddings": 512,
        "type_vocab_size": 2,
        "initializer_range": 0.02,
        "layer_norm_eps": 1e-12,
        "gradient_checkpointing": None,
        "position_embedding_type": None,
        "use_cache": None,
        "classifier_dropout": None,
        "cls_id": 101,
        "sep_id": 102,
    }),
}

# Used when the directory name has no entry in the table above.
_DEFAULT_BERT_SETTINGS = ("MY BERT", {
    "vocab_size": 30014,
    "hidden_size": 768,
    "num_hidden_layers": 12,
    "num_attention_heads": 12,
    "hidden_act": "gelu",
    "intermediate_size": 3072,
    "hidden_dropout_prob": 0.1,
    "attention_probs_dropout_prob": 0.1,
    "max_position_embeddings": 512,
    "type_vocab_size": 2,
    "initializer_range": 0.02,
    "layer_norm_eps": 1e-12,
    "gradient_checkpointing": None,
    "position_embedding_type": None,
    "use_cache": None,
    "classifier_dropout": None,
    "cls_id": 4,
    "sep_id": 5,
})

# One fine-tuning run per (data_dir, checkpoint) pair.
for d, (data_dir, checkpoint) in enumerate(zip(data_dirs, checkpoints)):
    # Cache directory name, e.g. "biobert" for './cache/biobert/train/*.cache'.
    otherbert = data_dir.split("/")[2]

    # Output folder is named after the last two components of the checkpoint path.
    checkpoint_parts = checkpoint.split("/")
    output_dir = "./finetuned/"+str(checkpoint_parts[-2])+"_"+str(checkpoint_parts[-1])

    print("output_dir: ", output_dir)

    args = easydict.EasyDict({
        'data_dir':data_dir,
        'train_batch_size':40,
        'gradient_accumulation_steps':1,
        'save_globalstep':1,
        'checkpoint':checkpoint,
        'output_dir':output_dir,
        'save_step':1,
        'num_files_on_memory':2,

        'do_train':True,
        'do_eval':False,
        'do_lower_case':True,
        'fp16_allreduce':False,
        'seed':42,
        'no_cuda':False,
        'use_adasum':False,
        'learning_rate':3e-05,
        'warmup_proportion':1.0,
        'num_train_epochs':2,
        'save_file_limit':1,

        'weight_decay':0.01,
        'adam_epsilon':1e-8,
        'gradient_predivide_factor':1.0,
        'fp16':False,
        'max_grad_norm':1.0,
        'local_rank':-1,
        'fp16_opt_level':'O1',
        'loss_scale':0,

        'server_ip':None,
        'server_port':None,
        'cuda':True,
    })

    # configs
    banner, bert_settings = _BERT_SETTINGS_BY_NAME.get(otherbert, _DEFAULT_BERT_SETTINGS)
    if banner is not None:
        print(banner)
    for key, value in bert_settings.items():
        args[key] = value

    main(args)