# 04-05. Make samples for knowledge inference task


# 04. Make candidate entities & document links

- get top10 entities within a department
- Each entity has one file to link documents which contain the entity.
- example of output
- filename: C0020538,Hypertensive disease,[dsyn],3588_valid.txt
- contents:
<pre>
Patient_fakeid.txt/docid    sentenceid/begin_of_entity/end_of_entity/semantic type/entity name
00000120_data.txt/00000000	00000004/00000002/00000005/[dsyn]/Hypertensive disease
00000330_data.txt/00000004	00000048/00000002/00000005/[dsyn]/Hypertensive disease
00000330_data.txt/00000005	00000061/00000002/00000005/[dsyn]/Hypertensive disease
00000330_data.txt/00000009	00000150/00000000/00000003/[dsyn]/Hypertensive disease
...
</pre>


## a. Get Dictionaries for entity locations, corpus locations

#### Entity_loc_map
- Entity -map- file name/document number/sentence number/begin/end
    - key: cui_id
    - value: [File name/document number/sentence number/begin/end/semantics/preferred_term]

#### Corpus_map
- File name + document number -map- document content
    - key: File name + document number
    - value: [line, line, ...], where each line is (document number, sentence number, original sentence number, section, content)
    - original sentence number: while labeling metamap, some sentences were split into sub sentences

In [None]:
import os
import glob
import time

# "00001" -> 1
def recover_to_int(strnumber):
    """Convert a zero-padded numeric string to an int.

    Args:
        strnumber: String of digits, possibly with leading zeros.

    Returns:
        The integer value, or 0 for an empty string.

    Raises:
        ValueError: If the string is not a valid integer literal.
    """
    # The previous loop compared each character with the integer 0, which is
    # never equal, so it always converted the whole string on its first pass.
    # int() already ignores leading zeros, so the scan is unnecessary.
    if len(strnumber) == 0:
        return 0
    return int(strnumber)

# Entity map: CUI -> every location at which that entity was annotated
def make_ent_map(targetdir, filename, ent_map):
    """Add the entity annotations of one data file to ``ent_map``.

    Lines are ignored until one containing ``#### entities`` has been seen.
    After that, every tab-separated row with at least four columns
    (``doc_id/sent_id/begin/end``, CUI, semantic type, term) is recorded.

    Args:
        targetdir: Path of the data file to read (a file path, despite the
            parameter name).
        filename: Name stored as the first component of every location.
        ent_map: Dict updated in place. Maps a CUI to a list of
            ``filename/doc_id/sent_id/begin/end/semantic type/term`` strings.

    Returns:
        The same ``ent_map`` object that was passed in.
    """
    with open(targetdir, "r") as file:
        lines = file.readlines()

    ent_mode = False
    for raw_line in lines:
        line = raw_line.strip()

        if "#### entities" in line:
            ent_mode = True

        if not ent_mode:
            continue

        # A bare "none" ends the entity section. This check has to run before
        # the column-count filter below, otherwise it can never be reached.
        if line.lower() == "none":
            break

        fields = line.split("\t")
        # Four columns are read below, so rows with fewer are skipped (the
        # previous "< 3" guard let 3-column rows through to an IndexError).
        if len(fields) < 4:
            continue

        key, cui, sem, term = fields[0], fields[1], fields[2], fields[3]

        key_parts = key.split("/")
        doc_id = key_parts[0]
        sent_id = key_parts[1]
        begin = key_parts[2]
        end = key_parts[3]

        location = filename+"/"+doc_id+"/"+sent_id+"/"+begin+"/"+end+"/"+sem+"/"+term
        ent_map.setdefault(cui, []).append(location)

    return ent_map
    


def make_corpus_map(targetdir, filename, corpus_map):
    """Add the document lines of one data file to ``corpus_map``.

    Reading stops at the first line containing ``#### entities``. Every
    earlier tab-separated row is grouped under ``filename/<first column>``,
    where the first column is the document id.

    Args:
        targetdir: Path of the data file to read (a file path, despite the
            parameter name).
        filename: Name used as the first half of each map key.
        corpus_map: Dict updated in place. Maps ``filename/doc_id`` to the
            list of stripped lines belonging to that document, in file order.

    Returns:
        The same ``corpus_map`` object that was passed in.
    """
    with open(targetdir, "r") as file:
        lines = file.readlines()

    for raw_line in lines:
        line = raw_line.strip()

        if "#### entities" in line:
            break

        fields = line.split("\t")
        # Blank lines and rows without a second column are not document rows;
        # skip them rather than failing on the missing column.
        if len(fields) < 2:
            continue

        dockey = str(filename)+"/"+str(fields[0])
        corpus_map.setdefault(dockey, []).append(line)

    return corpus_map


def get_map_data(department="감염내과", mode="train"):
    """Build the entity map and the corpus map for one department and split.

    Every ``*_data.txt`` file under
    ``./data/03_entities_task7_fixed/<mode>/<department>/`` is read in sorted
    order and passed to ``make_ent_map`` and ``make_corpus_map``.

    Args:
        department: Department directory name.
        mode: Split directory name (the callers in this file pass "train" or
            "test").

    Returns:
        Tuple ``(ent_map, corpus_map)``. Both are empty dicts when no data
        file matches.
    """
    pattern = "./data/03_entities_task7_fixed/"+str(mode)+"/"+str(department)+"/*_data.txt"

    ent_map = {}
    corpus_map = {}
    for path in sorted(glob.glob(pattern)):
        # Only the bare file name is used inside the map keys.
        filename = os.path.basename(path)
        ent_map = make_ent_map(path, filename, ent_map)
        corpus_map = make_corpus_map(path, filename, corpus_map)

    return ent_map, corpus_map

##  b. Get top-10 entities within each department

In [None]:
def get_target_entities(top=10, department="감염내과", mode="train"):
    """Read the first ``top`` entity rows of a department's distribution file.

    The file ``./data/ent_dist/<mode>_<department>task7_dist.txt`` is read;
    its first line is skipped and the following lines are returned stripped,
    stopping once ``top`` of them have been collected.

    Args:
        top: Number of rows to return after the skipped first line.
        department: Department name used in the file name.
        mode: Split name used in the file name.

    Returns:
        List of stripped lines, at most ``top`` long.
    """
    read_path = "./data/ent_dist/"+str(mode)+"_"+str(department)+"task7_dist.txt"
    with open(read_path, "r") as handle:
        rows = handle.readlines()

    selected = []
    for position, row in enumerate(rows):
        if position == 0:
            # first line of the file is not an entity row
            continue
        if position == top + 1:
            break
        selected.append(row.strip())

    return selected

## c. Create samples

### Determination of candidate documents for target entity
- target_loc_map = {}
- Iteration of (target entities)
    - In entity_loc_map, obtain multiple values “file name/document number/sentence number/begin/end” according to target entity as key
    - Since we decided to [MASK] only the entity that appears first in the document, we went through the following process.
    - Sort by “file name/document number/sentence number/begin/end”
    - Obtain only one smallest “sentence number/begin/end” for each “file name/document number”
    - (target_loc_map["file name/document number"] = smallest "sentence number/begin/end")

### Obtain candidate documents for target entity
- Iteration of (target_loc_map)
    - By using 'Corpus_map', obtain the document (input key: “file name/document number”)
    - Corpus_map["file name/document number"] == Document content
    - Replace the relevant entity in the document content with the [MASK] token
    - The document goes through the conversion process below.
        - Before conversion: (document number, sentence number, original sentence number, section, content)
        - After conversion: Document consisting of "section (enter) content"


In [None]:
def target_ent_candidate_docs(target_ent, ent_map):
    """Find the first occurrence of a target entity in each document.

    Args:
        target_ent: Tab-separated entity row whose first column is the CUI.
        ent_map: Dict mapping a CUI to a list of
            ``filename/doc_id/sent_id/begin/end/semantic type/term`` strings.
            It is not modified.

    Returns:
        Dict mapping ``filename/doc_id`` to
        ``sent_id/begin/end/semantic type/term`` for the smallest location of
        the entity in that document. Empty when the CUI is not in ``ent_map``.
        Keys are inserted in sorted location order.
    """
    cui = target_ent.split("\t")[0]

    # Only the first entity per document is kept.
    firstparts = {}
    if cui not in ent_map:
        return firstparts

    # Only the first occurrence in a document gets masked, so walk the
    # locations in ascending order and keep the first one seen per document.
    # sorted() leaves the list stored in ent_map untouched.
    for entry in sorted(ent_map[cui]):
        # maxsplit=6 keeps a term that itself contains "/" in one piece.
        filename, docid, sentid, begin, end, sem, term = entry.strip().split("/", 6)

        document_id = str(filename)+"/"+str(docid)
        if document_id not in firstparts:
            firstparts[document_id] = sentid+"/"+begin+"/"+end+"/"+sem+"/"+term

    return firstparts

def write_target_ents(target_ents, ent_map, mode="train", department="감염내과"):
    """Write one location file per target entity.

    For each entity row, the per-document first occurrences returned by
    ``target_ent_candidate_docs`` are written, one ``key<TAB>location`` line
    each, to ``./data/04_ents_location/<mode>/<department>/<entity>.txt``.
    ``<entity>`` is the entity row with its tabs replaced by commas. The
    output path is printed before the file is written.

    Args:
        target_ents: List of tab-separated entity rows.
        ent_map: Entity location map passed on to ``target_ent_candidate_docs``.
        mode: Split name used in the output directory.
        department: Department name used in the output directory.
    """
    for target in target_ents:
        first_hits = target_ent_candidate_docs(target, ent_map)

        outdir = "./data/04_ents_location/"+str(mode)+"/"+str(department)
        if not os.path.exists(outdir):
            os.makedirs(outdir)

        rows = [doc_key+"\t"+location for doc_key, location in first_hits.items()]

        outfilename = target.replace("\t", ",")
        destination = outdir+"/"+str(outfilename)+".txt"
        print(destination)

        with open(destination, "w") as file:
            file.write("\n".join(rows))

# Marks the end of this cell: printed once the definitions above have been evaluated.
print("Done")

## d. length filter

In [None]:
from tokenization import BertTokenizer
import tokenization as tokenization

def orgarnize_document(target_doc):
    """Regroup the rows of one document by section.

    Each row is a tab-separated string; column 3 is the section header and
    column 4 the content. The first row only contributes its content column,
    which is used as the date. Every later row has a header shaped like
    ``DOCTYPE [ <doc type> ] SECTION [ <section name> ]``.

    Args:
        target_doc: List of tab-separated row strings of one document.

    Returns:
        List starting with the date and the document type (taken from the
        last parsed row), followed, for each section in first-seen order, by
        the section name and then all contents of that section.
    """
    header_date = ""
    doc_kind = ""
    contents_by_section = {}

    for position, row in enumerate(target_doc):
        columns = row.split("\t")
        if position == 0:
            header_date = columns[4]
            continue

        raw_header = columns[3]
        body = columns[4]

        # "DOCTYPE [ <doc type> ] SECTION [ <section name> ]"
        header_parts = raw_header.split("] SECTION [")
        doc_kind = header_parts[0].replace("DOCTYPE [", "").strip()

        # drop the closing "]" of the section part
        section_name = header_parts[1].strip()[:-1].strip()

        contents_by_section.setdefault(section_name, []).append(body)

    # Reassemble the document
    rebuilt = [header_date, doc_kind]
    for section_name, bodies in contents_by_section.items():
        rebuilt.append(section_name)
        rebuilt.extend(bodies)

    return rebuilt


# Check a document against the maximum input length of BERT
def check_token_length(documents):
    """Report whether a document is too long for any configured tokenizer.

    The strings in ``documents`` are tokenized with each vocabulary listed
    below and the token counts are summed per vocabulary.

    Args:
        documents: List of strings that together make up one document.

    Returns:
        True if the total token count exceeds 510 (512 minus [CLS] and [SEP])
        for at least one vocabulary, otherwise False.
    """
    max_len = 512 - 2 # [CLS], [SEP]

    # (vocab file, lower-case the text first?, tokenizer max_len)
    tokenizer_configs = [
        ("../otherberts/bertbase_cased/vocab.txt", False, 512),
        ("../otherberts/mbert_cased/vocab.txt", False, 512),
    ]

    # NOTE(review): the tokenizers are rebuilt on every call; if this shows up
    # in profiling, constructing them once outside this function would help.
    for vocab_path, lower, tokenizer_max_len in tokenizer_configs:
        tokenizer = BertTokenizer(vocab_file=vocab_path, do_lower_case=lower, max_len=tokenizer_max_len)

        # Only the number of tokens matters, so count them instead of
        # concatenating every token list.
        token_count = 0
        for content in documents:
            if lower:
                content = content.lower()
            token_count += len(tokenizer.tokenize(content))

        if token_count > max_len:
            return True

    return False


def _is_derived_location_file(path):
    """Return True for the ``*_valid.txt`` / ``*_rand<N>.txt`` files written by this pipeline."""
    stem = os.path.basename(path)
    if stem.endswith(".txt"):
        stem = stem[:-len(".txt")]
    if stem.endswith("_valid"):
        return True
    _, marker, suffix = stem.rpartition("_rand")
    return bool(marker) and suffix.isdigit()


# Keep only the candidate documents that fit the maximum input length of BERT
def corpus_length_filter(corpus_map, department, mode):
    """Write a ``*_valid.txt`` file next to every entity location file.

    For each ``.txt`` file in
    ``./data/04_ents_location/<mode>/<department>/``, every line whose
    document passes ``check_token_length`` is copied to
    ``<same name>_valid.txt``. The output path is printed before writing.

    Args:
        corpus_map: Dict mapping ``filename/<document number>`` to the list of
            rows of that document. The document number in the key is the
            integer form produced by ``recover_to_int``.
        department: Department directory name.
        mode: Split directory name.

    Raises:
        KeyError: If a candidate line refers to a document missing from
            ``corpus_map``.
    """
    path = "./data/04_ents_location/"+str(mode)+"/"+str(department)+"/*.txt"

    for entfile in sorted(glob.glob(path)):
        # "*.txt" also matches the *_valid.txt and *_rand<N>.txt outputs of an
        # earlier run. Filtering those again would create *_valid_valid.txt
        # files that later steps pick up as extra entities.
        if _is_derived_location_file(entfile):
            continue

        with open(entfile, "r") as file:
            lines = file.readlines()

        # candidate locations -> candidates whose document is short enough
        valid_candidate = []
        for raw_line in lines:
            line = raw_line.strip()
            filename_docidx = line.split("\t")[0]
            filename = filename_docidx.split("/")[0]
            docidx = recover_to_int(filename_docidx.split("/")[1])
            key = str(filename)+"/"+str(docidx)

            # Rows of "section <TAB> sentence" are regrouped into
            # "section, sentence1, sentence2, ..." before measuring.
            target_doc = orgarnize_document(corpus_map[key])

            # drop the candidate when its document is too long
            if check_token_length(target_doc):
                continue
            valid_candidate.append(line)

        outdir = "/".join(entfile.split("/")[:-1])
        ent = entfile.split("/")[-1].replace(".txt", "")
        outpath = outdir+"/"+str(ent)+"_valid.txt"

        print(outpath)
        with open(outpath, "w") as file:
            file.write("\n".join(valid_candidate))

# Pipeline
- Function to implement a~d
- a. Get Dictionaries for entity locations, corpus locations
- b. Get top-10 entities within each department
- c. Create samples
- d. length filter

In [None]:
# Runs steps a-d above in a single call
def run_bundle_get_entity_locations(department="감염내과", mode="train"):
    """Run steps a-d for one department and split.

    Note that the top-10 entity list is always read with ``mode="train"``,
    whatever ``mode`` is passed here; the other three steps use ``mode``.

    Args:
        department: Department name forwarded to every step.
        mode: Split name forwarded to steps a, c and d.
    """
    # a. dictionaries for entity locations and corpus locations
    entity_locations, documents = get_map_data(department=department, mode=mode)

    # b. top-10 entities of the department
    top_entities = get_target_entities(top=10, department=department, mode="train")

    # c. one location file per target entity
    write_target_ents(top_entities, entity_locations, mode=mode, department=department)

    # d. length filter
    corpus_length_filter(documents, department, mode)

# Run pipelines
- example of output
- filename: C0020538,Hypertensive disease,[dsyn],3588_valid.txt
- contents:
<pre>
Patient_fakeid.txt/docid    sentenceid/begin_of_entity/end_of_entity/semantic type/entity name
00000120_data.txt/00000000	00000004/00000002/00000005/[dsyn]/Hypertensive disease
00000330_data.txt/00000004	00000048/00000002/00000005/[dsyn]/Hypertensive disease
00000330_data.txt/00000005	00000061/00000002/00000005/[dsyn]/Hypertensive disease
00000330_data.txt/00000009	00000150/00000000/00000003/[dsyn]/Hypertensive disease
...
</pre>

In [None]:
# Run the entity-location pipeline for every department, train split first.
departments = [
    "감염내과",
    "내분비대사내과",
    "류마티스내과",
    "소화기내과",
    "순환기내과",
    "신장내과",
    "알레르기내과",
    "호흡기내과",
]

for mode in ("train", "test"):
    for department in departments:
        run_bundle_get_entity_locations(department=department, mode=mode)

print("Complete")

## Entity types used in top-10 of departments

In [None]:
import glob
def get_labels():
    """Collect the distinct entity labels used across all train departments.

    Every ``*_valid.txt`` file under ``./data/04_ents_location/train/*/`` is
    visited. The label is the file name without its last comma-separated
    part, e.g. ``C0020538,Hypertensive disease,[dsyn],2066_valid.txt`` gives
    ``C0020538,Hypertensive disease,[dsyn]``.

    Returns:
        List of labels without duplicates, in first-seen order. Directories
        and files are visited in sorted order so the result does not depend
        on the order in which the file system lists them.
    """
    target_entities_all = []
    seen = set()

    for department_dir in sorted(glob.glob("./data/04_ents_location/train/*")):
        for entfile in sorted(glob.glob(department_dir+"/*_valid.txt")):
            name = entfile.split("/")[-1]
            label = ",".join(name.split(",")[:-1])
            if label not in seen:
                seen.add(label)
                target_entities_all.append(label)

    return target_entities_all

# Collect the label set and store it, one label per line.
labels = get_labels()
print("len(labels): ", len(labels))


with open("./data/labels.txt", "w") as file:
    file.write("\n".join(labels))

<hr></hr>

# 05 sampling

- Get the links of documents to be used in actual task
- example of output
- filename: C0020538,Hypertensive disease,[dsyn],3588_rand30.txt
- contents:
<pre>
Patient_fakeid.txt/docid    sentenceid/begin_of_entity/end_of_entity    CUI/entity name/semantic type    department
00003108_data.txt/00000001	00000035/00000002/00000014	C0020538/Hypertensive disease/[dsyn]	신장내과
00003820_data.txt/00000000	00000004/00000003/00000006	C0020538/Hypertensive disease/[dsyn]	신장내과
00012070_data.txt/00000002	00000040/00000001/00000004	C0020538/Hypertensive disease/[dsyn]	신장내과
00025519_data.txt/00000004	00000115/00000000/00000003	C0020538/Hypertensive disease/[dsyn]	신장내과
...
</pre>



# Random sample
- Select k random documents mapped to target entity

In [None]:
import random

def random_sample(department="감염내과", mode = "train"):
    """Randomly pick candidate documents for every entity of a department.

    Each ``*_valid.txt`` file in
    ``./data/04_ents_location/<mode>/<department>/`` is shuffled, cut to 30
    (train) or 10 (test) lines, sorted, and written to
    ``<entity>_rand<N>.txt`` in the same directory. Each output line is
    ``file/doc <TAB> sentence/begin/end <TAB> cui/term/semantic type <TAB>
    department``, where the third column is the file-name label with its
    commas turned into slashes.

    Args:
        department: Department directory name.
        mode: "train" or "test".

    Raises:
        ValueError: If ``mode`` is neither "train" nor "test".
    """
    samples_per_mode = {"train": 30, "test": 10}
    if mode not in samples_per_mode:
        # Previously this surfaced later as an unbound-variable error.
        raise ValueError("mode must be 'train' or 'test', got %r" % (mode,))
    max_samples = samples_per_mode[mode]

    path = "./data/04_ents_location/"+str(mode)+"/"+str(department)+"/*_valid.txt"
    outdir = "./data/04_ents_location/"+str(mode)+"/"+str(department)

    # Sorted so that a seeded run draws the same samples for the same files.
    for entfile in sorted(glob.glob(path)):
        ent = entfile.split("/")[-1]
        ent = ent.replace("_valid.txt", "") # label
        ent_info = "/".join(ent.split(",")[:-1])
        print("ent_info: ", ent_info)

        with open(entfile, "r") as file:
            stripped = [raw_line.strip() for raw_line in file]
        # blank lines carry no location and cannot be sampled
        candidates_locs = [line for line in stripped if line]

        random.shuffle(candidates_locs)
        chosen = sorted(candidates_locs[:max_samples])

        rows = []
        for candidate in chosen:
            columns = candidate.split("\t")
            # keep only sentence/begin/end of the location column
            sample_info = "/".join(columns[1].split("/")[:3])
            rows.append(columns[0]+"\t"+sample_info+"\t"+ent_info+"\t"+str(department))

        with open(outdir+"/"+str(ent)+"_rand"+str(max_samples)+".txt", "w") as file:
            file.write("\n".join(rows))
        
        

# Draw the random samples for every department, train split first.
departments = [
    "감염내과",
    "내분비대사내과",
    "류마티스내과",
    "소화기내과",
    "순환기내과",
    "신장내과",
    "알레르기내과",
    "호흡기내과",
]

for mode in ("train", "test"):
    for department in departments:
        random_sample(department=department, mode = mode)

# 05 get text
- Obtain the original text and create a sample

In [None]:
import glob
from datamanager_task7 import get_map_data, recover_to_int, orgarnize_document

import glob
import os
import random

# Replace the target entity span in one sentence of a document with [MASK]
def mask_target_entity(target_doc, target_sentidx, begin, end):
    """Return a copy of a document with one entity span replaced by ``[MASK]``.

    Each row is a tab-separated string; column 1 is the sentence number and
    column 4 the content. In every row whose sentence number equals
    ``target_sentidx``, the characters ``content[begin:end]`` are replaced by
    ``[MASK]``. Rows are rebuilt from their first five columns only.

    Args:
        target_doc: List of tab-separated row strings of one document.
        target_sentidx: Sentence number (int) of the row to mask.
        begin: Start offset of the span within the content.
        end: End offset (exclusive) of the span within the content.

    Returns:
        New list of row strings; the input list is left unchanged.
    """
    masked_sample = []
    for row in target_doc:
        fields = row.split("\t")
        content = fields[4]

        # mask
        if target_sentidx == recover_to_int(fields[1]):
            content = content[:begin] + "[MASK]" + content[end:]

        masked_sample.append("\t".join(fields[0:4]) + "\t" + content)

    return masked_sample

# Obtain the original text and create the samples of one department
def get_text(corpus_map, department, mode):
    """Build the masked text samples of one department and write them out.

    Every ``*_rand<N>.txt`` file (N is 30 for train, 10 for test) in
    ``./data/04_ents_location/<mode>/<department>/`` is read. For each line,
    the referenced document is looked up in ``corpus_map``, the entity span
    is masked with ``mask_target_entity``, and the document is regrouped with
    ``orgarnize_document``. The sample is the original line, a tab, and the
    regrouped document joined with spaces. All samples are written to
    ``./data/05_samples/<mode>/<department>.txt``.

    Args:
        corpus_map: Dict mapping ``filename/<document number>`` to the rows of
            that document. The document number in the key is the integer form
            produced by ``recover_to_int``.
        department: Department name.
        mode: "train" or "test".

    Raises:
        ValueError: If ``mode`` is neither "train" nor "test".
        KeyError: If a line refers to a document missing from ``corpus_map``.
    """
    samples_per_mode = {"train": 30, "test": 10}
    if mode not in samples_per_mode:
        # Previously this surfaced later as an unbound-variable error.
        raise ValueError("mode must be 'train' or 'test', got %r" % (mode,))
    max_samples = samples_per_mode[mode]

    path = "./data/04_ents_location/"+str(mode)+"/"+str(department)+"/*_rand"+str(max_samples)+".txt"

    sampled = []
    # Sorted so the sample order does not depend on directory listing order.
    for entfile in sorted(glob.glob(path)):
        with open(entfile, "r") as file:
            lines = file.readlines()

        # line: location and entity information of one document mapped to the entity
        for raw_line in lines:
            line = raw_line.strip()
            columns = line.split("\t")

            # column 0: which document
            filename_docidx = columns[0]
            filename = filename_docidx.split("/")[0]
            docidx = recover_to_int(filename_docidx.split("/")[1])
            key = str(filename)+"/"+str(docidx)

            # column 1: where the entity sits inside that document
            locinfo = columns[1].split("/")
            sentidx = recover_to_int(locinfo[0])
            begin = recover_to_int(locinfo[1])
            end = recover_to_int(locinfo[2])

            # replace the entity with the [MASK] token, then regroup by section
            target_doc = mask_target_entity(corpus_map[key], sentidx, begin, end)
            arranged_doc = orgarnize_document(target_doc)

            sampled.append(line+"\t"+" ".join(arranged_doc))

    outdir = "./data/05_samples/"+str(mode)
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    with open(outdir+"/"+str(department)+".txt", "w") as file:
        file.write("\n".join(sampled))
    

# Build the samples of one department and split
def get_text_run(department, mode):
    """Load the corpus map of a department and write its text samples.

    Prints ``department`` and ``mode``, loads the maps with ``get_map_data``
    (only the corpus map is used), and hands the corpus map to ``get_text``.

    Args:
        department: Department name.
        mode: Split name.
    """
    print(department, mode)
    _, corpus_map = get_map_data(department=department, mode=mode)
    get_text(corpus_map, department=department, mode=mode)

    
# Create the text samples for every department, train split first.
departments = [
    "감염내과",
    "내분비대사내과",
    "류마티스내과",
    "소화기내과",
    "순환기내과",
    "신장내과",
    "알레르기내과",
    "호흡기내과",
]

for mode in ("train", "test"):
    for department in departments:
        get_text_run(department=department, mode=mode)

print("Done")


In [None]:
# Read all samples, mix them, and print them
import random
def all_gather(mode, is_shuffle=True):
    """Merge the per-department sample files of one split into a single file.

    All ``./data/05_samples/<mode>/*.txt`` files are read in sorted order,
    their stripped lines concatenated, optionally shuffled, and written to
    ``./data/05_samples/<mode>.txt``.

    Args:
        mode: Split name.
        is_shuffle: Shuffle the merged lines with ``random.shuffle`` when
            True. When False the output order is files in sorted order, each
            in its own line order.
    """
    path = "./data/05_samples/"+str(mode)+"/*.txt"

    sampled = []
    # Sorted so that the unshuffled output is the same on every run.
    for entfile in sorted(glob.glob(path)):
        with open(entfile, "r") as file:
            sampled.extend(line.strip() for line in file)

    if is_shuffle==True:
        random.shuffle(sampled)

    with open("./data/05_samples/"+str(mode)+".txt", "w") as file:
        file.write("\n".join(sampled))

# Merge the department files of both splits, keeping their order (no shuffle).
for split in ("train", "test"):
    all_gather(mode=split, is_shuffle=False)


print("Done")