# Entity distribution within departments

- Count how many entities there are in each department.
- Output to ./data/ent_dist
- The created file is used to select the top-10 entities for each department in step04_05_entities_to_ptline.ipynb.
- In step04_05_entities_to_ptline.ipynb, train/test samples are created using the top-10 entities for each department (total max 80 entities)

In [None]:
import os
import glob
import time

def readentfile(targetdir):
    """Parse the entity section of one data file into three lookup dicts.

    Lines are ignored until a line containing the marker ``#### entities``
    has been seen. From that line onwards, every line that has at least four
    tab-separated fields (after stripping surrounding whitespace) is recorded,
    keyed by its first field. If the same key occurs more than once, the last
    occurrence wins.

    Args:
        targetdir: Path of the file to read (despite the name, this is a
            file path, not a directory).

    Returns:
        A tuple ``(pos_cui, pos_sem, pos_term)`` of dicts mapping the first
        field of each recorded line to its second, third and fourth field
        respectively.
    """
    pos_cui = {}
    pos_sem = {}
    pos_term = {}

    record_mode = False
    # ``with`` guarantees the handle is closed even if parsing raises.
    with open(targetdir, "r") as file:
        for raw_line in file:
            line = raw_line.strip()

            if "#### entities" in line:
                record_mode = True

            fields = line.split("\t")
            # Four fields are read below, so anything shorter is skipped
            # (a three-field line previously raised IndexError).
            if len(fields) < 4:
                continue

            if record_mode:
                key = fields[0]
                pos_cui[key] = fields[1]
                pos_sem[key] = fields[2]
                pos_term[key] = fields[3]

    return pos_cui, pos_sem, pos_term
    
def count_entities(pos_cui, pos_sem, pos_term, ent_count, cui_to_term, cui_to_sem):
    """Fold one file's entities into the running per-CUI tallies.

    For every key of ``pos_cui`` the associated CUI's counter in
    ``ent_count`` is incremented by one, and the CUI's entries in
    ``cui_to_term`` / ``cui_to_sem`` are overwritten with the values found
    under the same key in ``pos_term`` / ``pos_sem``.

    The three accumulator dicts are updated in place and also returned.

    Args:
        pos_cui: Mapping of key -> CUI.
        pos_sem: Mapping of key -> semantic value (same keys as ``pos_cui``).
        pos_term: Mapping of key -> term (same keys as ``pos_cui``).
        ent_count: Accumulator mapping CUI -> occurrence count.
        cui_to_term: Accumulator mapping CUI -> last seen term.
        cui_to_sem: Accumulator mapping CUI -> last seen semantic value.

    Returns:
        The tuple ``(ent_count, cui_to_term, cui_to_sem)``.
    """
    for position, cui in pos_cui.items():
        term = pos_term[position]
        sem = pos_sem[position]

        ent_count[cui] = ent_count.get(cui, 0) + 1

        # Later occurrences overwrite earlier ones.
        cui_to_term[cui] = term
        cui_to_sem[cui] = sem

    return ent_count, cui_to_term, cui_to_sem


def file_loop(datapath_input):
    """Count entities over every group of one department and write the result.

    Every ``*_data.txt`` file found one level below ``datapath_input`` (i.e.
    ``<datapath_input>/<group>/*_data.txt``) is parsed with ``readentfile``
    and tallied with ``count_entities``. The tallies are sorted by count in
    descending order, printed, and written as tab-separated lines
    ``CUI<TAB>term<TAB>semantic<TAB>count`` to
    ``./data/ent_dist/<department>entity_dist.txt``, where ``<department>``
    is the last path component of ``datapath_input``.

    Args:
        datapath_input: Department directory, using ``/`` as separator. A
            trailing ``/`` is tolerated.
    """
    cui_to_term = {}
    cui_to_sem = {}
    ent_count = {}

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("./data/ent_dist", exist_ok=True)

    start_time = time.time()

    # Strip trailing slashes first; otherwise the last component is empty
    # and the output file would lose its department prefix.
    department = datapath_input.rstrip("/").split("/")[-1]

    groups_inputs = sorted(glob.glob(datapath_input + "/*"))

    for g, group_dir in enumerate(groups_inputs):
        files_input = sorted(glob.glob(group_dir + "/*_data.txt"))

        for f, file_path in enumerate(files_input):
            # Progress: file index within group ... group index within department.
            print(f, "/", len(files_input), "...", g, "/", len(groups_inputs))

            pos_cui, pos_sem, pos_term = readentfile(targetdir=file_path)
            ent_count, cui_to_term, cui_to_sem = count_entities(
                pos_cui, pos_sem, pos_term, ent_count, cui_to_term, cui_to_sem
            )

    # Most frequent CUIs first; the sort is stable, so ties keep first-seen order.
    ent_count = dict(sorted(ent_count.items(), key=lambda item: item[1], reverse=True))

    outtext = []
    for cui_id, count in ent_count.items():
        cui_element = "\t".join(
            [str(cui_id), str(cui_to_term[cui_id]), str(cui_to_sem[cui_id]), str(count)]
        )
        print(cui_element)
        outtext.append(cui_element)

    end = time.time()
    print("run time: ", str(end - start_time))

    # NOTE: the file name has no separator between department and suffix;
    # kept as-is because downstream steps may rely on this exact name.
    with open("./data/ent_dist/" + str(department) + "entity_dist.txt", "w") as file:
        file.write("\n".join(outtext) + "\n")

    

def file_loop_targeted(department="감염내과", mode="train"):
    """Count entities for one department of one data split and write the result.

    Every file matching
    ``./data/02_entities_task7/<mode>/<department>/*_data.txt`` is parsed
    with ``readentfile`` and tallied with ``count_entities``. The output file
    ``./data/ent_dist/<mode>_<department>task7_dist.txt`` starts with a
    header line giving the number of input files (labelled "환자 수", i.e.
    "number of patients"), followed by tab-separated lines
    ``CUI<TAB>term<TAB>semantic<TAB>count`` sorted by count in descending
    order. CUIs whose term contains ``MYOTONIC DYSTROPHY`` are omitted.

    Args:
        department: Name of the department sub-directory.
        mode: Name of the split sub-directory (the script uses ``"train"``
            and ``"test"``).
    """
    start_time = time.time()

    cui_to_term = {}
    cui_to_sem = {}
    ent_count = {}

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("./data/ent_dist", exist_ok=True)

    files_input = sorted(
        glob.glob("./data/02_entities_task7/" + str(mode) + "/" + str(department) + "/*_data.txt")
    )
    print("files_input: ", files_input)
    print("환자 수: ", len(files_input))

    for f, file_path in enumerate(files_input):
        print(f, "/", len(files_input))

        pos_cui, pos_sem, pos_term = readentfile(targetdir=file_path)
        ent_count, cui_to_term, cui_to_sem = count_entities(
            pos_cui, pos_sem, pos_term, ent_count, cui_to_term, cui_to_sem
        )

    # Most frequent CUIs first; the sort is stable, so ties keep first-seen order.
    ent_count = dict(sorted(ent_count.items(), key=lambda item: item[1], reverse=True))

    # MYOTONIC DYSTROPHY (e.g. C3250443, MYOTONIC DYSTROPHY 1) is excluded
    # because the abbreviation "DM" for diabetes is too often mis-tagged as
    # myotonic dystrophy.
    outtext = ["환자 수: " + str(len(files_input))]
    for cui_id, count in ent_count.items():
        if "MYOTONIC DYSTROPHY" in cui_to_term[cui_id]:
            continue

        cui_element = "\t".join(
            [str(cui_id), str(cui_to_term[cui_id]), str(cui_to_sem[cui_id]), str(count)]
        )
        print(cui_element)
        outtext.append(cui_element)

    end = time.time()
    print("run time: ", str(end - start_time))

    # NOTE: the file name has no separator between department and suffix;
    # kept as-is because downstream steps may rely on this exact name.
    out_path = "./data/ent_dist/" + str(mode) + "_" + str(department) + "task7_dist.txt"
    with open(out_path, "w") as file:
        file.write("\n".join(outtext) + "\n")

    
##################################################
## Finetuning data
##################################################
# Build the entity distribution of every department for each data split.
# All "train" departments are processed first, then all "test" departments.
_DEPARTMENTS = (
    "감염내과",
    "내분비대사내과",
    "류마티스내과",
    "소화기내과",
    "순환기내과",
    "신장내과",
    "알레르기내과",
    "호흡기내과",
)

for mode in ("train", "test"):
    for _department in _DEPARTMENTS:
        file_loop_targeted(department=_department, mode=mode)