# index the Sections

## 주의점
- 의료문서는 history, P/E & Lab, Assessment, Plan 섹션으로 구성되어 있다.
- Task 4 and 5 의 목적은 assessment 의 구간을 찾아내는 task 이다.
- Task 4 는 섹션의 순서를 섞고, task 5 에서는 섹션의 순서가 고정된다.
- 주의점: task 4 의 섹션의 순서를 섞는 내용은 이 코드에 반영되지 않았다. 우리가 병원정보시스템에서 획득한 데이터는 섹션의 순서가 이미 섞여있었기 때문이다. 만일 당신의 데이터가 섹션의 순서를 섞는 것을 원한다면 이 코드에 입력되는 데이터의 섹션을 미리 섞거나 아래 코드에서 섹션을 섞는 코드를 추가해야한다. 


## NOTE
- Clinical notes consist of History, P/E & Lab, Assessment, and Plan sections.
- The purpose of Tasks 4 and 5 is to find the line range of the Assessment section.
- In Task 4 the order of the sections is randomized, whereas in Task 5 the order of the sections is fixed.
- <b>Note: Shuffling the order of sections for Task 4 is not implemented in this code, because the sections in the data we obtained from the hospital information system were already in shuffled order. If you want the sections of your data to be shuffled, you must either pre-shuffle the sections before feeding the data to this code or add shuffling code to the code below.</b>


## output
- In the code below, the order of sections is indexed as follows.
    - samplenumber \t startline1/endline1/[DOC]document type/[SEC]section1 type \t startline2/endline2/[DOC]document type/[SEC]section2 type ...

## heldouts

In [None]:
# Directory that the held-out list files are read from.
heldout_path = "../preprocessing/01_data4finetune/"

# List files naming the patients used for the train index.
train_patients = ["pts_SNUH_visit_2011to2020_heldout_train.txt"]

# List files naming the patients used for the test index.
test_patients = ["pts_SNUH_visit_2011to2020_heldout_test.txt"]


# heldouts
def get_heldouts(heldout_array_files, base_path=None):
    """Read held-out patient lists and return the patient ids they name.

    Every non-blank line of a list file is reduced to the text after its last
    "/" and before the first "." of that remainder, so ``some/dir/00012.txt``
    becomes ``00012``. Surrounding whitespace (including the line terminator)
    is removed, and blank lines are ignored.

    Args:
        heldout_array_files: File names of the list files, relative to
            ``base_path``.
        base_path: Directory containing the list files. Defaults to the
            module-level ``heldout_path``.

    Returns:
        A list with one inner list per input file, holding that file's
        patient ids in file order.
    """
    if base_path is None:
        base_path = heldout_path

    heldouts = []
    for name in heldout_array_files:
        # The context manager closes the handle even if reading fails.
        with open(base_path + "/" + name) as handle:
            entries = [raw.strip() for raw in handle]
        heldouts.append([
            entry.split("/")[-1].split(".")[0]
            for entry in entries
            if entry  # a blank line names no patient
        ])
    return heldouts

# Resolve both splits to patient ids (one inner list per list file).
train_pts = get_heldouts(train_patients)
test_pts = get_heldouts(test_patients)

# Report how many patients the first list file of each split contains.
for caption, split_pts in (
    ("len(train_pts[0]): ", train_pts),
    ("len(test_pts[0]): ", test_pts),
):
    print(caption, len(split_pts[0]))

## section names

In [None]:
# Tab-separated files whose rows are: document type, section name, label.
section_types_paths = ["./data/02_type_to_file_links/sections_visits_2011to2020_task4.txt"]

# Maps "doctype\tsection" to either "assessment" or "else".
section_types = {}
for section_types_path in section_types_paths:
    # The context manager closes the file even if a row fails to parse.
    with open(section_types_path, "r") as handle:
        for raw in handle:
            fields = raw.rstrip("\n").split("\t")
            # Skip blank lines and rows lacking the three expected columns
            # rather than failing on them with an IndexError.
            if len(fields) < 3:
                continue
            doctype, section, val = fields[0], fields[1], fields[2]
            key = doctype + "\t" + section

            # Any label mentioning "assessment" (case-insensitively) is the
            # target class; every other label is collapsed into "else".
            if "assessment" in val.lower():
                section_types[key] = "assessment"
            else:
                section_types[key] = "else"
print(section_types)

## indexing

- output 
    - samplenumber \t startline1/endline1/[DOC]document type/[SEC]section1 type \t startline2/endline2/[DOC]document type/[SEC]section2 type ...

In [None]:
import os

def _format_section_range(start, end, doctype, section):
    """Return one ``start/end/[DOC]doctype/[SEC]section`` index entry."""
    return str(start) + "/" + str(end) + "/[DOC]" + str(doctype) + "/[SEC]" + str(section)


def _format_document_record(doc_id, section_ranges):
    """Return the 4-digit zero-padded document id followed by its tab-separated ranges."""
    return str(doc_id).zfill(4) + "\t" + "\t".join(section_ranges)


def linenum_indexing(path, out_folder):
    """Index the line ranges of every section of every document in a file.

    The input is read as tab-separated rows of ``date, doctype, section,
    content``; rows with fewer than four columns are skipped. Consecutive rows
    sharing date and doctype form one document, and within a document
    consecutive rows sharing the section name form one section.

    For each document one record is produced::

        <doc id, zero-padded to 4 digits> \\t <start>/<end>/[DOC]<doctype>/[SEC]<section> \\t ...

    ``start`` and ``end`` are 0-based, inclusive line numbers of the input
    file. A range ends on the line before the next section starts, so skipped
    rows that follow a section are absorbed into it; the last range ends on
    the last line of the file. Skipped rows before the first valid row belong
    to no range.

    Args:
        path: Path of the input file. The text after its last "/" is reused
            as the output file name.
        out_folder: Existing directory the index file is written into.

    Returns:
        The list of document records, in file order. It is empty when the
        input holds no valid row (an empty index file is still written).
    """
    # Read everything up front and release the handle immediately.
    with open(path, "r") as source:
        lines = source.readlines()

    filename = path.split("/")[-1]

    documentidx = []       # finished records, one per document
    documentidx_line = []  # section ranges of the document being built

    date_last = ""
    doctype_last = ""
    section_last = ""
    seen_valid_line = False

    start = 0
    doc_id = 0
    for l, raw in enumerate(lines):
        fields = raw.strip("\n").split("\t")
        if len(fields) < 4:
            continue
        date, doctype, section = fields[0], fields[1], fields[2]

        if not seen_valid_line:
            # The first valid row opens the first section; nothing to close
            # yet, even when it is not line 0 of the file.
            seen_valid_line = True
            start = l
        else:
            document_switched = date_last != date or doctype_last != doctype

            # switched section (a document switch always closes a section too)
            if document_switched or section_last != section:
                documentidx_line.append(
                    _format_section_range(start, l - 1, doctype_last, section_last)
                )
                start = l

            # switched document
            if document_switched:
                documentidx.append(_format_document_record(doc_id, documentidx_line))
                doc_id += 1
                documentidx_line = []

        date_last = date
        doctype_last = doctype
        section_last = section

    # rest of data: close the section and document that were still open.
    # The range runs to the last line of the file (inclusive).
    if seen_valid_line:
        documentidx_line.append(
            _format_section_range(start, len(lines) - 1, doctype_last, section_last)
        )
        documentidx.append(_format_document_record(doc_id, documentidx_line))

    with open(out_folder + "/" + filename, "w") as sink:
        sink.write("\n".join(documentidx))

    return documentidx

## Loop

In [None]:
import glob
import os

# Folder the train index files are written to. Create it (and any missing
# parents) in one call that tolerates an already existing folder, so there is
# no gap between checking for the folder and creating it.
out_folder = "./data/03_soap_index/train"
os.makedirs(out_folder, exist_ok=True)

### index data for train set

In [None]:
# Bucket width used to derive a patient's sub-folder name from the patient id.
docperfile = 50

categories = ["visits_2011to2020"]

for c, category in enumerate(categories):
    for target_patient in train_pts[c]:
        print("target_patient: ", target_patient)

        # Drop the zero padding before parsing; an id made only of zeros is
        # parsed as it stands.
        digits = target_patient.lstrip("0") or target_patient
        pt_id = int(digits)

        # The sub-folder is named after the upper bound of the id's bucket,
        # e.g. ids 0-49 -> "50" and ids 50-99 -> "100" for a width of 50.
        bucket_upper = (pt_id // docperfile + 1) * docperfile
        groupname = str(bucket_upper)

        relative_parts = [str(category), groupname, str(target_patient) + ".txt"]
        target_path = "../task2/data/" + "/".join(relative_parts)
        print("target_path :", target_path)

        linenum_indexing(target_path, out_folder)
        

### index data for test set

In [None]:
import glob
import os

# Folder the test index files are written to. Create it (and any missing
# parents) in one call that tolerates an already existing folder, so there is
# no gap between checking for the folder and creating it.
out_folder = "./data/03_soap_index/test"
os.makedirs(out_folder, exist_ok=True)
    
# Bucket width used to derive a patient's sub-folder name from the patient id.
docperfile = 50

categories = ["visits_2011to2020"]

for c, category in enumerate(categories):
    for target_patient in test_pts[c]:
        print("target_patient: ", target_patient)

        # Drop the zero padding before parsing; an id made only of zeros is
        # parsed as it stands.
        digits = target_patient.lstrip("0") or target_patient
        pt_id = int(digits)

        # The sub-folder is named after the upper bound of the id's bucket,
        # e.g. ids 0-49 -> "50" and ids 50-99 -> "100" for a width of 50.
        bucket_upper = (pt_id // docperfile + 1) * docperfile
        groupname = str(bucket_upper)

        relative_parts = [str(category), groupname, str(target_patient) + ".txt"]
        target_path = "../task2/data/" + "/".join(relative_parts)
        print("target_path :", target_path)

        linenum_indexing(target_path, out_folder)
        