# Labeling Entities using Metamap

## Note
- This code extracts disease names from documents using MetaMap
- We used the following python wrapper
    - https://github.com/AnthonyMRios/pymetamap
- Download the code and place the `pymetamap` folder in the task 7 folder
- Be sure to <b>run the MetaMap (MetaMap2020) server before running this code</b>
- Set your mmpath='metamap installation location'

## Install MetaMap on your system
- To run the MetaMap server, you must first install Java and the MetaMap thesaurus.
- Before downloading the MetaMap thesaurus, sign up and obtain your UMLS Metathesaurus License from the National Library of Medicine.
    - https://uts.nlm.nih.gov/uts/signup-login
- To download Metamap thesaurus visit 
    - https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/run-locally/MainDownload.html
- How to install Metamap: 
    - https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/documentation/Installation.html
- After installing them, start MetaMap by running the following commands at an Ubuntu prompt:
    
## Start metamap server
<pre>
cd /path/installed/mmap2020/public_mm
./bin/skrmedpostctl start
./bin/wsdserverctl start
#./bin/skrmedpostctl stop
#./bin/wsdserverctl stop
./bin/metamap
</pre>

In [None]:
from pymetamap import MetaMap

# Path to the MetaMap executable; replace it with your own installation location.
# Example: mmpath = '/home/kkm/anaconda_home/mmap2020/public_mm/bin/metamap'
mmpath = '/path/installed/mmap2020/public_mm/bin/metamap'
# Shared MetaMap client; the cells below call mm.extract_concepts(...) on it.
mm = MetaMap.get_instance(mmpath)
# call_mmp keeps a concept only when its semtypes value is one of these entries.
# NOTE(review): '[dsyn]' is presumably the "Disease or Syndrome" semantic type - confirm.
semantic_filter = ['[dsyn]']

## Test your input works
- Get entity locations at the character level

In [None]:
# Smoke test: query two short sentences (ids 1 and 2) and show, for every
# returned concept, the sentence it came from and the matched substring.
sents = ['Heart Attack', 'John had a huge heart attack']
concepts, error = mm.extract_concepts(sents, [1, 2])
for concept in concepts:
    print(concept)
    print(concept.pos_info)

    # The ids passed above start at 1, so shift back to a list index.
    line_idx = int(concept.index) - 1
    # pos_info is read here as "start/length" with a 1-based start.
    fields = concept.pos_info.split("/")
    begin = int(fields[0]) - 1
    stop = begin + int(fields[1])

    # extraction result
    matched_sentence = sents[line_idx]
    print(matched_sentence)
    print(matched_sentence[begin:stop])
    print()

### apply

In [None]:
import glob
import os
from task7reader import read_patient_documents
import time

# Characters that break MetaMap's text matching. Each one is swapped for a
# single comma, so the sentence length (and therefore every character offset
# reported back) is unchanged.
_MMP_UNSAFE_CHARS = str.maketrans({ch: "," for ch in "/|\\≥-()"})


def _mmp_score_value(score):
    """Return a MetaMap score as a float; unparsable scores rank lowest."""
    try:
        return float(score)
    except (TypeError, ValueError):
        return float("-inf")


def call_mmp(sents, sents_ids_for_query, pos_score, pos_cui, pos_sem, pos_term, startidx, found_concepts):
    """Query MetaMap for one chunk of sentences and record entity positions.

    Args:
        sents: sentences of this chunk.
        sents_ids_for_query: ids sent to MetaMap, one per sentence; results
            report the id back as ``concept.index``.
        pos_score, pos_cui, pos_sem, pos_term: dicts keyed by the zero-padded
            string ``"line/start/end"``; updated in place and returned.
        startidx: offset of this chunk inside the full sentence list. Unused
            here because keys are built from ``concept.index`` directly; kept
            so existing callers keep working.
        found_concepts: list that receives ``str(concept)`` for every concept
            that has ``semtypes``. Entries already in the list are kept, so
            results accumulate across chunks. ``None`` starts a new list.

    Returns:
        ``(pos_score, pos_cui, pos_sem, pos_term, found_concepts)``.
    """
    if found_concepts is None:
        found_concepts = []

    # Search on a sanitized copy; the original text causes matching errors.
    sents_for_search = [sent.translate(_MMP_UNSAFE_CHARS) for sent in sents]
    concepts, _error = mm.extract_concepts(sents_for_search, sents_ids_for_query)

    for concept in concepts:
        # ConceptAA results carry no semtypes attribute; skip them.
        try:
            semantic = concept.semtypes
        except AttributeError:
            print("conceptAA has no semantic")
            continue
        found_concepts.append(str(concept))

        # The filter does not depend on the position, so test it once per concept.
        if semantic not in semantic_filter:
            continue

        target_line_num = int(concept.index)
        cui = str(concept.cui)
        sem = str(semantic)
        score = str(concept.score)
        preferred_name = str(concept.preferred_name)

        # pos_info may list several "start/length" pairs, e.g.
        #   '[5/3],[12/3],[62/3]'  or  '122/2;[94/2],[107/2]'
        positions = (
            concept.pos_info.replace("[", "").replace("]", "").replace(";", ",").split(",")
        )
        for position in positions:
            fields = position.split("/")
            pos_s = int(fields[0]) - 1
            pos_e = pos_s + int(fields[1])

            # A reported start of 0 yields -1 here; such positions are skipped.
            if pos_s == -1:
                continue

            # Zero-pad so that plain string sorting follows numeric order.
            key = "/".join(
                str(value).rjust(8, "0") for value in (target_line_num, pos_s, pos_e)
            )

            # Keep the highest-scoring concept per position. Scores arrive as
            # text, so compare them numerically, and remember the winning
            # score so later candidates are measured against it.
            if key not in pos_score or _mmp_score_value(score) > _mmp_score_value(pos_score[key]):
                pos_score[key] = score
                pos_cui[key] = cui
                pos_sem[key] = sem
                pos_term[key] = preferred_name

    return pos_score, pos_cui, pos_sem, pos_term, found_concepts
    
    
def make_sents_ids_for_query(segment_ids):
    """Build the lookup tables between query sentence ids and segment ids.

    ``segment_ids[i]`` is the segment id of the i-th query sentence.

    Returns:
        ``(seg_to_sent, sent_to_seg, sents_ids_for_query)`` where
        ``seg_to_sent`` maps a segment id to the index of the last sentence
        carrying it, ``sent_to_seg`` maps every sentence index to its segment
        id, and ``sents_ids_for_query`` is ``[0, 1, ..., len(segment_ids) - 1]``.
    """
    query_ids = list(range(len(segment_ids)))
    sent_to_seg = dict(enumerate(segment_ids))
    # Later sentences overwrite earlier ones that share a segment id.
    seg_to_sent = {seg: idx for idx, seg in enumerate(segment_ids)}
    return seg_to_sent, sent_to_seg, query_ids


# result text parser 1
def recover_to_int(strnumber):
    """Convert a zero-padded decimal string to an int, e.g. "00001" -> 1.

    An empty string yields 0. ``int()`` already ignores leading zeros, so no
    manual scan for the first non-zero digit is needed.
    """
    if not strnumber:
        return 0
    return int(strnumber)

# result text parser 2
def sort_entities(pos_cui, pos_sem, pos_term, reverse=False):
    """Return copies of the three entity dicts with entries ordered by key.

    ``reverse=True`` gives descending key order. The inputs are not modified.
    """
    def ordered(table):
        # Sort the entities.
        result = {}
        for key, value in sorted(table.items(), reverse=reverse):
            result[key] = value
        return result

    return ordered(pos_cui), ordered(pos_sem), ordered(pos_term)

# result text parser 3
def make_entity_no_dup_area(pos_cui, pos_sem, pos_term):
    """Remove overlapping entities, keeping the longest one of each clash.

    Keys have the form ``"line/start/end"`` (zero-padded numbers). Two
    entities clash when they are on the same line and their ``[start, end]``
    ranges intersect, both ends counted as inclusive. The shorter entity of a
    clashing pair is dropped; when both have the same length the one that
    comes first in ``pos_cui`` is kept, so a tie no longer removes both.

    The three dicts are modified in place and also returned.
    """
    def parse(key):
        parts = key.split("/")
        return tuple(int(part) if part else 0 for part in parts[:3])

    spans = [(key,) + parse(key) for key in pos_cui]

    doomed = set()
    for i, (key, line, start, end) in enumerate(spans):
        for other, other_line, other_start, other_end in spans[i + 1:]:
            if other_line != line:
                continue
            # No intersection when one range ends before the other begins.
            if other_start > end or start > other_end:
                continue
            if (other_end - other_start) > (end - start):
                doomed.add(key)
            else:
                # Shorter or equally long: the earlier entity wins.
                doomed.add(other)

    # Delete unnecessary entities
    for key, _line, _start, _end in spans:
        if key in doomed:
            del pos_cui[key]
            pos_sem.pop(key, None)
            pos_term.pop(key, None)

    return pos_cui, pos_sem, pos_term


# send queries (documents) to MetaMap
def doc_iter(documents, output_directory, output_dir_entall):
    """Run MetaMap over one patient's documents and write the two result files.

    Args:
        documents: documents as accepted by ``organize_text``.
        output_directory: path of the file that receives the sentence table
            followed by the selected entities.
        output_dir_entall: path of the file that receives every concept
            returned by MetaMap, one per line.
    """
    interval = 1000
    pos_score = {}
    pos_cui = {}
    pos_sem = {}
    pos_term = {}
    found_concepts = []

    # segment_ids: for each split sentence, the id of the original sentence
    # sentid_to_sec: original sentence id -> section name
    # sentid_to_docid: original sentence id -> document id
    sents, segment_ids, sentid_to_sec, sentid_to_docid = organize_text(documents)

    # ids sent to MetaMap (one per split sentence)
    _seg_to_sent, _sent_to_seg, sents_ids_for_query = make_sents_ids_for_query(segment_ids)

    # MetaMap fails when all documents of one patient are sent in a single
    # query, so the sentences are sent in chunks of at most `interval`.
    for i in range(0, len(sents), interval):
        # Each call gets a fresh list and its result is collected here, so the
        # concepts of every chunk are kept rather than only the last chunk's.
        pos_score, pos_cui, pos_sem, pos_term, chunk_concepts = call_mmp(
            sents[i:i + interval], sents_ids_for_query[i:i + interval],
            pos_score, pos_cui, pos_sem, pos_term, i, [])
        found_concepts.extend(chunk_concepts)

    # sort entities
    pos_cui, pos_sem, pos_term = sort_entities(pos_cui, pos_sem, pos_term, reverse=False)

    # If the same semantics or terms appear in succession, they are bundled into one entity.
    pos_cui, pos_sem, pos_term = post_process_entities(sents, pos_cui, pos_sem, pos_term)

    # If entities overlap, keep the longest matching entity.
    pos_cui, pos_sem, pos_term = make_entity_no_dup_area(pos_cui, pos_sem, pos_term)

    # One row per split sentence: index, document id, original sentence id,
    # section name, text.
    outtext = []
    for s, (sent, restored_sent_id) in enumerate(zip(sents, segment_ids)):
        doc_id = sentid_to_docid[restored_sent_id]
        sectionname = sentid_to_sec[restored_sent_id]
        outtext.append(
            str(s) + "\t" + str(doc_id) + "\t" + str(restored_sent_id) + "\t"
            + str(sectionname) + "\t" + str(sent)
        )

    output_entities(output_directory, outtext, pos_cui, pos_sem, pos_term)

    # The context manager closes the file even if the write fails.
    with open(output_dir_entall, "w", encoding="utf-8") as file:
        file.write("\n".join(found_concepts))
    

# Prepare the text in a format suitable for mmp search
def organize_text(documents):
    """Flatten documents into short sentences ready for a MetaMap query.

    Each document is indexed by section name. Every section holds a list of
    sentences, except ``"date"``, whose single value is treated as one
    sentence. Each sentence is split with ``make_short_sents`` (100-character
    limit).

    The documents are only read; they are not modified.

    Returns:
        ``(sents, sent_ids, sentid_to_sec, sentid_to_docid)`` where ``sents``
        are the split sentences, ``sent_ids[i]`` is the id of the original
        sentence that ``sents[i]`` came from, and the two dicts map an
        original sentence id to its section name and its document index.
    """
    sents = []
    sent_ids = []
    sentid_to_sec = {}
    sentid_to_docid = {}
    sent_id = 0
    for doc_id, doc in enumerate(documents):
        for sec in doc:
            # Wrap the date value locally instead of overwriting doc[sec], so
            # the same documents can be processed again.
            section_sents = [doc[sec]] if sec == "date" else doc[sec]

            for sent in section_sents:
                split_sents, segment_ids = make_short_sents(text=sent, seg_id=sent_id, linechar_limit=100)
                sents.extend(split_sents)
                sent_ids.extend(segment_ids)
                sentid_to_sec[sent_id] = sec
                sentid_to_docid[sent_id] = doc_id
                sent_id += 1

    return sents, sent_ids, sentid_to_sec, sentid_to_docid
    

# MetaMap fails to analyze very long sentences, so the original text is
# segmented into multiple shorter sentences before it is queried.
def make_short_sents(text, seg_id=0, linechar_limit=100):
    """Split ``text`` on spaces into pieces of at most ``linechar_limit`` characters.

    Tokens are never cut: a single token longer than the limit becomes a
    piece of its own. No empty piece is emitted in front of such a token.

    Args:
        text: sentence to split.
        seg_id: id recorded for every piece.
        linechar_limit: maximum number of characters per piece.

    Returns:
        ``(split_sents, segment_ids)`` where ``segment_ids`` is ``seg_id``
        repeated once per piece.
    """
    split_sents = []
    current = []
    current_len = 0  # length of " ".join(current)
    for token in text.split(" "):
        # Flush only when something is buffered; otherwise an over-long first
        # token would produce an empty sentence.
        if current and current_len + 1 + len(token) > linechar_limit:
            split_sents.append(" ".join(current))
            current = []
            current_len = 0
        current_len = current_len + 1 + len(token) if current else len(token)
        current.append(token)

    if current:
        split_sents.append(" ".join(current))

    return split_sents, [seg_id] * len(split_sents)



def output_entities(output_directory, outtext, pos_cui, pos_sem, pos_term):
    """Write the sentence table followed by the entity list to a file.

    Args:
        output_directory: path of the file to write (despite the name, a file
            path, not a directory).
        outtext: lines written first; the list itself is left unchanged.
        pos_cui, pos_sem, pos_term: entity dicts sharing the same keys. One
            tab-separated line ``key, cui, semtype, term`` is written per key
            of ``pos_cui``, or the single line ``None`` when it is empty.
    """
    lines = list(outtext)
    lines.append("\n")
    # Header text means "entities (sentence number/begin/end)".
    lines.append("#### entities (문장번호/begin/end) ####")
    if pos_cui:
        lines.extend(
            str(key) + "\t" + str(pos_cui[key]) + "\t" + str(pos_sem[key]) + "\t" + str(pos_term[key])
            for key in pos_cui
        )
    else:
        lines.append("None")

    # The header contains non-ASCII text, so the encoding is fixed explicitly;
    # the context manager closes the file even if the write fails.
    with open(output_directory, "w", encoding="utf-8") as file:
        file.write("\n".join(lines) + "\n\n")
        


# If the same semantics or terms appear in succession, they should be treated as one entity.
# e.g)
# key:  0/4/5  /  C2603358  /  [clna]  /  R prime
# key:  0/10/15  /  C5203670  /  [dsyn]  /  COVID-19
# key:  0/17/19  /  C5203670  /  [dsyn]  /  COVID-19
import re
def post_process_entities(sents, pos_cui, pos_sem, pos_term):
    """Merge runs of the same entity that follow each other closely.

    The keys of ``pos_cui`` (``"line/start/end"``, zero-padded) are visited in
    dict order. An entry joins the current run when it is on the same line,
    has the same CUI, semantic type and term as the previous entry, and
    starts fewer than 5 characters after the previous entry's end. Each run
    becomes one entity spanning the smallest start to the largest end.

    Args:
        sents: not used; kept so existing callers keep working.
        pos_cui, pos_sem, pos_term: entity dicts sharing the same keys. They
            are not modified.

    Returns:
        ``(pos_cui_p, pos_sem_p, pos_term_p)``: new dicts of merged entities.
    """
    pos_cui_p = {}
    pos_sem_p = {}
    pos_term_p = {}

    def parse(key):
        parts = key.split("/")
        return tuple(int(part) if part else 0 for part in parts[:3])

    def flush(line, begins, ends, label):
        # Zero-pad each number to 8 characters, the same format as the input keys.
        newkey = "/".join(
            str(value).rjust(8, "0") for value in (line, min(begins), max(ends))
        )
        pos_cui_p[newkey], pos_sem_p[newkey], pos_term_p[newkey] = label

    run_line = None
    run_label = None
    last_end = None
    begin_cands = []
    end_cands = []

    for key in pos_cui:
        line, begin_current, end_current = parse(key)
        label = (pos_cui[key], pos_sem[key], pos_term[key])

        if begin_cands:
            # connect entities only when the gap between them is below 5
            joins_run = (
                line == run_line
                and label == run_label
                and begin_current - last_end < 5
            )
            if not joins_run:
                # Conditions changed: emit the collected run and start a new one.
                flush(run_line, begin_cands, end_cands, run_label)
                begin_cands = []
                end_cands = []

        begin_cands.append(begin_current)
        end_cands.append(end_current)
        run_line, run_label, last_end = line, label, end_current

    if begin_cands:
        flush(run_line, begin_cands, end_cands, run_label)

    return pos_cui_p, pos_sem_p, pos_term_p


def file_loop(datapath_input):
    """Process every ``*.txt`` file found under ``datapath_input/<group>/``.

    The last component of ``datapath_input`` is used as the department name.
    For each input file, ``<name>_data.txt`` and ``<name>_ents.txt`` are
    written to ``./data/02_entities/<department>/<group>/``. The total run
    time is printed and appended to ``runtime.txt`` in the working directory.
    """
    start_time = time.time()

    # normpath drops a trailing slash, which would otherwise give an empty name.
    department = os.path.basename(os.path.normpath(datapath_input))

    groups_inputs = sorted(glob.glob(datapath_input + "/*"))

    for g, group_dir in enumerate(groups_inputs):
        files_input = sorted(glob.glob(group_dir + "/*.txt"))
        group = os.path.basename(group_dir)
        output_path = "./data/02_entities/" + str(department) + "/" + str(group)

        for f, input_file in enumerate(files_input):
            filename = os.path.basename(input_file)
            stem = os.path.splitext(filename)[0]
            output_path_file = output_path + "/" + stem + "_data.txt"
            output_path_ents = output_path + "/" + stem + "_ents.txt"
            print(output_path, " / ", f, "/", len(files_input), "/", filename, "...", g, "/", len(groups_inputs))
            # exist_ok avoids the gap between checking for and creating the directory.
            os.makedirs(output_path, exist_ok=True)

            documents = read_patient_documents(input_file)
            doc_iter(documents=documents,
                     output_directory=output_path_file,
                     output_dir_entall=output_path_ents)

    end = time.time()
    print("run time: ", str(end - start_time))

    with open("runtime.txt", "a", encoding="utf-8") as file:
        file.write(str(datapath_input) + "\t" + str(end - start_time) + "\n")
    
    
##################################################
## Finetuning data
##################################################
def file_loop_targeted(department, mode="train"):
    """Process the held-out patient files of one department.

    The file list for ``mode`` (``"train"`` or ``"test"``) is read from the
    matching held-out list. The last two path components of every listed
    line are looked up under ``./data/visits_2011to2020/<department>/``, and
    files that do not exist are skipped. For each file, ``<name>_data.txt``
    and ``<name>_ents.txt`` are written to
    ``./data/02_entities_task7/<mode>/<department>/``. The total run time is
    printed and appended to ``runtime.txt`` in the working directory.

    Any other ``mode`` prints a message and returns without doing work.
    """
    print("department: ", department, ", mode: ", str(mode))
    start_time = time.time()

    def get_heldout_paths(heldoutlist, department):
        # Map each listed path to its location under the visits directory.
        with open(heldoutlist, "r") as file:
            lines = file.readlines()

        input_paths = [
            "./data/visits_2011to2020/" + str(department) + "/"
            + "/".join(line.strip().split("/")[-2:])
            for line in lines
        ]

        print("len(input_paths): ", len(input_paths))

        return input_paths

    if mode == "train":
        heldoutlist = "../preprocessing/01_data4finetune/pts_SNUH_visit_2011to2020_heldout_train.txt"
    elif mode == "test":
        heldoutlist = "../preprocessing/01_data4finetune/pts_SNUH_visit_2011to2020_heldout_test.txt"
    else:
        print("need to define mode")
        return

    datapath_input = get_heldout_paths(heldoutlist=heldoutlist, department=department)
    output_path = "./data/02_entities_task7/" + str(mode) + "/" + str(department)

    # run
    for f, input_file in enumerate(datapath_input):
        # Skip listed files that are not present before creating any output.
        if not os.path.isfile(input_file):
            continue

        filename = os.path.basename(input_file)
        # Strip the extension only when it is really there.
        stem = filename[:-len(".txt")] if filename.endswith(".txt") else filename
        output_path_file = output_path + "/" + stem + "_data.txt"
        output_path_ents = output_path + "/" + stem + "_ents.txt"
        os.makedirs(output_path, exist_ok=True)

        print(output_path, " / ", f, "/", len(datapath_input), "/", filename)
        documents = read_patient_documents(input_file)
        doc_iter(documents=documents,
                 output_directory=output_path_file,
                 output_dir_entall=output_path_ents)

    end = time.time()
    print("run time: ", str(end - start_time))

    with open("runtime.txt", "a", encoding="utf-8") as file:
        file.write(str(department) + "\t" + str(mode) + "\t" + str(end - start_time) + "\n")
    


# Run code
- Output (two files per patient)
    - ./data/02_entities_task7/train/department/patient_fakeid_data.txt and patient_fakeid_ents.txt
    - ./data/02_entities_task7/test/department/patient_fakeid_data.txt and patient_fakeid_ents.txt


- Departments:
    - 감염내과 (Infectious Diseases)
    - 내분비대사내과 (Endocrinology and Metabolism)
    - 류마티스내과 (Rheumatology)
    - 소화기내과 (Gastroenterology)
    - 순환기내과 (Cardiology)
    - 신장내과 (Nephrology)
    - 알레르기내과 (Allergy and Immunology)
    - 호흡기내과 (Pulmonology)

In [None]:
# Process every department: the whole training split first, then the test split.
departments = [
    "감염내과",
    "내분비대사내과",
    "류마티스내과",
    "소화기내과",
    "순환기내과",
    "신장내과",
    "알레르기내과",
    "호흡기내과",
]

for split in ("train", "test"):
    for department in departments:
        file_loop_targeted(department=department, mode=split)
