# scores

In [None]:
import glob
import os

from seqeval.metrics import classification_report
from seqeval.metrics import sequence_labeling
# bert
from tokenization import BertTokenizer
import tokenization as tokenization
# kobert tokenizer
import gluonnlp as nlp
import sentencepiece as spm
import torch

# Each entry pairs a vocabulary directory with a prediction directory. The two
# lists derived below are consumed in lockstep (same index = same run).
_RUNS = [
    ("../otherberts/bertbase_cased",
     "./data/08_preds/ver9.1.4_521121_epoch2/test_pred"),
    ("../otherberts/mbert_cased",
     "./data/08_preds/ver8.1.4_1142642_epoch2/test_pred"),
    ("../otherberts/bioBERT",
     "./data/08_preds/ver11.1.4_521079_epoch2/test_pred"),
    ("../otherberts/KoBERT",
     "./data/08_preds/ver12.1.4_407013_epoch2/test_pred"),
]

vocab_paths = [vocab_dir for vocab_dir, _ in _RUNS]
preds_paths = [preds_dir for _, preds_dir in _RUNS]

# Sanity check: every vocabulary directory has a matching prediction directory.
assert len(vocab_paths) == len(preds_paths)


# vocab_words
class Vocab_words(object):
    """Two-way lookup between vocabulary ids and token strings.

    The vocabulary file holds one token per line; a token's id is its
    zero-based line number.
    """

    def __init__(self, vocab_file):
        """Build both lookup tables from ``vocab_file``.

        Args:
            vocab_file: path to a text file with one token per line.
        """
        self.i_to_w = {}  # id (line number) -> token
        self.w_to_i = {}  # token -> id (line number)
        self.getvocab(vocab_file)

    def getvocab(self, vocab_file):
        """Read ``vocab_file`` and fill ``i_to_w`` and ``w_to_i``.

        Only the newline is stripped from each line, so any other whitespace
        is kept as part of the token. If a token occurs on several lines,
        ``w_to_i`` keeps the last line number.

        Args:
            vocab_file: path to a UTF-8 text file with one token per line.

        Raises:
            OSError: if the file cannot be opened.
        """
        # Text mode with an explicit encoding already yields ``str`` lines, so
        # no separate unicode-conversion step is needed. The context manager
        # guarantees the handle is closed.
        with open(vocab_file, "r", encoding="utf-8") as vocab_handle:
            for index, raw_line in enumerate(vocab_handle):
                term = raw_line.strip("\n")
                self.i_to_w[index] = term
                self.w_to_i[term] = index
            
def convert_to_unicode(text):
    """Converts `text` to Unicode (if it's not already), assuming utf-8 input.

    Args:
        text: a text string or a byte string.

    Returns:
        The text string unchanged, or the byte string decoded as UTF-8 with
        undecodable bytes dropped.

    Raises:
        ValueError: if `text` is neither a text string nor a byte string.
    """
    # ``type(u"")`` is ``str`` on Python 3 and ``unicode`` on Python 2, and
    # ``bytes`` is an alias of ``str`` on Python 2. These two checks therefore
    # cover both interpreters without needing the ``six`` package.
    text_type = type(u"")
    if isinstance(text, text_type):
        return text
    if isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    raise ValueError("Unsupported string type: %s" % (type(text)))

In [None]:
def read_documents(targetfile, mode):
    """Parse a sample file into per-question document texts and labels.

    Reads ``./data/05_samples/<mode>/<targetfile>``. Blank lines are skipped.
    A question starts at a ``[START_QUESTION]`` line and ends at an
    ``[END_QUESTION]`` line. Every other line is a tab-separated record whose
    second field is the integer document order and whose fourth field is the
    document text.

    Args:
        targetfile: file name inside the ``mode`` directory.
        mode: sub-directory of ``./data/05_samples`` (e.g. ``"test"``).

    Returns:
        A ``(questions, labels)`` tuple of parallel lists with one entry per
        question. ``questions[i]`` is the list of document texts and
        ``labels[i]`` holds 1 for the document whose order is 4 and 0 for
        every other document.

    Raises:
        AssertionError: if a question does not hold exactly five documents.
        OSError: if the sample file cannot be opened.
    """
    sample_path = "./data/05_samples/" + str(mode) + "/" + str(targetfile)
    with open(sample_path, "r", encoding="utf-8") as sample_file:
        lines = sample_file.readlines()

    questions = []
    labels = []
    for raw_line in lines:
        line = raw_line.strip("\n")
        if line == "":
            continue

        if line == "[START_QUESTION]":
            # Marker line: open a fresh question.
            quest = []
            label_q = []
        elif line == "[END_QUESTION]":
            assert len(quest) == 5
            assert len(quest) == len(label_q)

            questions.append(quest)
            labels.append(label_q)
        else:
            # Record line: fields are name, document order, date, content.
            fields = line.split("\t")
            doc_order = int(fields[1])
            content = fields[3]

            quest.append(content)
            label_q.append(1 if doc_order == 4 else 0)

    return questions, labels


In [None]:
# Evaluation 1: treat each group of four candidate documents as a multiple-choice problem and check whether the top-scored candidate is the labelled one.
def _load_id_to_token(vocab_path):
    """Return an object that maps an integer token id to its token string.

    The result is either a list (BertTokenizer vocabularies) or a dict
    (``Vocab_words.i_to_w``). Both are indexed as ``mapping[token_id]``, so
    callers do not need to know which loader was used.
    """
    lowered = vocab_path.lower()
    if "mbert_" in lowered:
        tokenizer = BertTokenizer(vocab_file=vocab_path + '/vocab.txt',
                                  do_lower_case=False, max_len=512)
        return list(tokenizer.vocab.keys())
    if "biobert_" in lowered or "bertbase_" in lowered:
        tokenizer = BertTokenizer(vocab_file=vocab_path + '/vocab.txt',
                                  do_lower_case=True, max_len=512)
        return list(tokenizer.vocab.keys())
    if "kobert" in lowered:
        return Vocab_words("../otherberts/KoBERT/models/vocab.txt").i_to_w
    return Vocab_words(vocab_path + '/vocab.txt').i_to_w


def _decode_ids(id_field, id_to_token):
    """Turn a comma-separated id string into token strings, dropping [PAD]."""
    tokens = (id_to_token[int(token_id)] for token_id in id_field.split(","))
    return [token for token in tokens if token != "[PAD]"]


def prediction(filepath, vocab_path, collect_tokens=True, group_size=4):
    """Score one prediction file as a pick-one-of-``group_size`` task.

    Each line of ``filepath`` is tab-separated: probability, integer label,
    and (only read when ``collect_tokens`` is true) comma-separated query
    token ids and comma-separated value token ids. Consecutive lines are
    evaluated in groups of ``group_size``. A group counts as correct when the
    line with the highest probability is also the line with the highest label.

    Args:
        filepath: path of the prediction file.
        vocab_path: vocabulary directory used to decode token ids. Only read
            when ``collect_tokens`` is true.
        collect_tokens: when true, also build a readable dump with one row
            per prediction line. This additionally reads the sample file that
            has the same base name via ``read_documents``.
        group_size: number of consecutive lines that form one question.

    Returns:
        ``(outtext, evals)``. ``outtext`` is the dump joined by newlines, or
        ``""`` when ``collect_tokens`` is false. The row chosen in each group
        is prefixed with ``"SELECT\\t"`` and every other row with ``"\\t"``.
        ``evals`` holds one 0/1 entry per group.

    Raises:
        OSError: if a required file cannot be opened.
        ValueError: if a probability, label or token id cannot be parsed.
    """
    with open(filepath, "r") as pred_file:
        lines = pred_file.readlines()

    probs = []
    labels = []
    outtext = []

    if collect_tokens:
        # The sample file shares the prediction file's base name.
        filename = filepath.split("/")[-1].split(".")[0]
        # NOTE(review): rows are indexed by prediction line number below, so
        # this assumes at least one parsed question per prediction line.
        fileinfo = read_documents(filename + ".txt", mode="test")[0]
        id_to_token = _load_id_to_token(vocab_path)

    for p, raw_line in enumerate(lines):
        fields = raw_line.strip("\n").split("\t")
        probs.append(float(fields[0]))
        labels.append(int(fields[1]))

        if collect_tokens:
            tokens_q = _decode_ids(fields[2], id_to_token)
            tokens_v = _decode_ids(fields[3], id_to_token)
            outtext.append(str(fileinfo[p]) + "\t" + fields[0] + "\t" + fields[1] + "\t"
                           + " ".join(tokens_q) + "\t" + " ".join(tokens_v))

    evals = []
    for start in range(0, len(probs), group_size):
        # Only the lines of the current group compete with each other.
        group_probs = probs[start:start + group_size]
        group_labels = labels[start:start + group_size]

        pred_idx = torch.argmax(torch.tensor(group_probs)).item()
        label_idx = torch.argmax(torch.tensor(group_labels)).item()

        if collect_tokens:
            # Prefix each row of this group exactly once.
            for offset in range(len(group_probs)):
                marker = "SELECT\t" if offset == pred_idx else "\t"
                outtext[start + offset] = marker + outtext[start + offset]

        evals.append(1 if pred_idx == label_idx else 0)

    return "\n".join(outtext), evals
    


In [None]:
def change_char(tag):
    """Translate 0/1 labels into single-element tag lists.

    An entry equal to 1 becomes ``["B-entail"]`` and an entry equal to 0
    becomes ``["B-notent"]``. Entries equal to neither are left out.

    Args:
        tag: indexable sequence of labels.

    Returns:
        A list of one-element lists, in the order of the kept entries.
    """
    def _tag_name(value):
        # Mirror the 1-then-0 comparison order; None marks "skip this entry".
        if value == 1:
            return "B-entail"
        if value == 0:
            return "B-notent"
        return None

    tag_names = (_tag_name(tag[position]) for position in range(len(tag)))
    return [[tag_name] for tag_name in tag_names if tag_name is not None]


In [None]:
# Evaluation 1: for every prediction run, compute the accuracy of picking the correct document out of four candidates.
# For every (predictions directory, vocabulary directory) pair: score all
# prediction files of the run and write the run's accuracy to
# ./data/scores/test_acc/<run name>_eval1.txt.
for preds_path, vocab_path in zip(preds_paths, vocab_paths):
    files = sorted(glob.glob(preds_path + "/*.txt"))

    # The run name is the directory that contains the "test_pred" folder.
    run_name = str(preds_path.split("/")[-2])

    # Where the prediction results are written.
    outpath = "./data/scores/test_acc"
    print("outpath: ", outpath)
    os.makedirs(outpath, exist_ok=True)
    output_eval_preds = os.path.join(outpath, run_name)
    print("output_eval_preds: ", output_eval_preds)

    evaluations = []

    for f, pred_file in enumerate(files):
        if f % 100 == 0:
            print(str(f) + "/" + str(len(files)))

        filename = os.path.basename(pred_file)
        outtext, evals = prediction(pred_file, vocab_path, collect_tokens=False)

        if len(outtext) > 0:
            # The per-run dump directory must exist before a dump is written.
            os.makedirs(output_eval_preds, exist_ok=True)
            with open(output_eval_preds + "/" + filename, "w", encoding="utf-8") as file:
                file.write(outtext)

        evaluations.extend(evals)

    if not evaluations:
        # Nothing was scored, so an accuracy is undefined; skip this run
        # instead of dividing by zero.
        print("no predictions found for " + preds_path + "; skipping accuracy")
        continue

    accuracy = sum(evaluations) / len(evaluations)
    output_eval_file = os.path.join(outpath, run_name + "_eval1.txt")
    with open(output_eval_file, "w") as writer:
        writer.write("accuracy: " + str(accuracy))

print("Done")


## Merge the per-run accuracy files into a single table

In [None]:
import glob

# Collect every per-run accuracy file and merge them into one tab-separated
# table at ./data/scores/eval1.txt.
filepaths = glob.glob("./data/scores/test_acc/*_eval1.txt")
filepaths.sort()

# Parallel lists: a name is appended together with each score, so the two
# lists always have the same length.
names = []
scores_doc_acc = []
for filepath in filepaths:
    with open(filepath, "r") as file:
        lines = file.readlines()

    # File name without directory and without its last extension.
    filename = filepath.split("/")[-1].rsplit(".", 1)[0]

    for raw_line in lines:
        line = raw_line.strip("\n").replace("    ", "\t").strip()
        if not line:
            # A blank line carries no score.
            continue
        # The score is whatever follows the last colon ("accuracy: 0.5").
        names.append(filename)
        scores_doc_acc.append(line.split(":")[-1].strip())

print(len(names))
print(len(scores_doc_acc))

outtext = ["name\tscores_doc_acc"]
for name, score in zip(names, scores_doc_acc):
    row = name + "\t" + score
    print(row)
    outtext.append(row)

with open("./data/scores/eval1.txt", "w") as file:
    file.write("\n".join(outtext))
    