# Sampling

## sampling rules
- A sample contains one patient's records and is a concatenation of n documents from that patient's records.
- Select 2 documents for each patient
- Select the AP of the document: starting from the Assessment section, look to its left and right and add the neighboring non-Assessment sections to compose the new document.
- If the resulting document is too long, truncate it.

In [None]:
import glob
import random

## section names

In [None]:
section_types_paths = ["./data/02_type_to_file_links/sections_visits_2011to2020_task4.txt"]

# Maps "<doctype>\t<section>" to either "assessment" or "else".
section_types = {}
for section_types_path in section_types_paths:
    # Use a context manager so the handle is closed even when a malformed
    # line (fewer than three tab-separated columns) raises IndexError.
    with open(section_types_path, "r") as file:
        for raw_line in file:
            # Lines consisting only of a newline are separators; skip them.
            if raw_line == "\n":
                continue

            # Columns: doctype, section name, section-type annotation.
            fields = raw_line.split("\t")
            doctype = fields[0].strip("\n")
            section = fields[1].strip("\n")
            val = fields[2].strip("\n")
            key = doctype + "\t" + section

            # Definition of val:
            # any annotation containing "assessment" (case-insensitive) marks
            # an assessment section; every other annotation is stored as
            # "else". extract_sample later turns these into labels 1 and 0.
            if "assessment" in val.lower():
                section_types[key] = "assessment"
            else:
                section_types[key] = "else"


In [None]:
def get_valid_documents(path, doc_samples, rng):
    """Sample index lines whose document has both assessment and other sections.

    Each line of the index file is tab-separated. The first field is kept
    untouched (callers use it as the document id); every following field is a
    span that embeds ``/[DOC]<doctype>/[SEC]<section>``. A span is classified
    by looking up ``doctype + TAB + section`` in the module-level
    ``section_types`` mapping. Spans whose doctype and section are both empty
    are ignored.

    Args:
        path: Path of the index file to read.
        doc_samples: Maximum number of lines to return.
        rng: A ``random.Random`` instance to shuffle with, which makes the
            sampling reproducible. Any other value (the notebook passes a
            float) falls back to the module-level ``random`` generator.

    Returns:
        Up to ``doc_samples`` randomly selected qualifying lines (trailing
        newline removed), sorted lexicographically.

    Raises:
        IndexError: If a span lacks the ``/[DOC]`` or ``/[SEC]`` marker.
        KeyError: If a doctype/section pair is missing from ``section_types``.
    """
    with open(path, "r") as handle:
        lines = handle.readlines()

    # Each line is the location information of the original document and sentence.
    # Obtain only documents where both AP and SO exist
    valid_docs = []
    for raw_line in lines:
        line = raw_line.strip("\n")

        # Decide AP / SO membership from the section names.
        has_assessment = False
        has_other = False
        for span in line.split("\t")[1:]:
            docinfo = span.split("/[DOC]")[1]
            sec_parts = docinfo.split("/[SEC]")
            doctype = sec_parts[0]
            section = sec_parts[1]
            if not doctype and not section:
                continue

            kind = section_types[doctype + "\t" + section]
            if kind == "assessment":
                has_assessment = True
            elif kind == "else":
                has_other = True

        # Only keep documents that contain both kinds of section.
        if has_assessment and has_other:
            valid_docs.append(line)

    # Honour a caller-supplied generator; otherwise keep the historical
    # behaviour of shuffling with the global random state.
    shuffler = rng if isinstance(rng, random.Random) else random
    shuffler.shuffle(valid_docs)

    return sorted(valid_docs[:doc_samples])

def read_text(path):
    """Load the text column of the source document that an index file refers to.

    Only the base name of ``path`` is used. Its last four characters (the
    ``.txt`` suffix) are dropped and the rest is parsed as an integer, which
    selects the group folder: the upper bound of the ``docperfile``-sized
    bucket the number falls in. The document is then read from
    ``../task2/data/<category>/<group>/<filename>``.

    ``docperfile`` and ``category`` are module-level settings and must be
    defined before this function is called.

    Args:
        path: Path of the index file, using ``/`` as separator.

    Returns:
        A list holding the last tab-separated field of every line of the
        source document, with the trailing newline removed.

    Raises:
        ValueError: If the file name without its suffix is not an integer.
    """
    # Use the argument rather than a module-level loop variable so the
    # function works for any caller.
    filename = path.split("/")[-1]

    # int() already ignores leading zeros, so no manual stripping is needed.
    file_number = int(filename[:-len(".txt")])

    groupname = str(((file_number // docperfile) + 1) * docperfile)

    target_path = "../task2/data/" + str(category) + "/" + groupname + "/" + str(filename)

    # Columns are: date, doctype, section, contents -- keep the contents only.
    with open(target_path, "r") as handle:
        return [line.strip("\n").split("\t")[-1] for line in handle]

# Obtain sentences at the indexed location from the document
def extract_sample(got_valid_docs, textlines):
    """Collect the text lines referenced by each index line, with metadata.

    Every entry of ``got_valid_docs`` is a tab-separated index line: the first
    field is the document id and each further field is a span of the form
    ``<start>/<end>/[DOC]<doctype>/[SEC]<section>`` where ``end`` is
    inclusive. Spans whose doctype and section are both empty are skipped.
    The section type is looked up in the module-level ``section_types``
    mapping and converted to a label: 0 for "else", 1 for "assessment".

    Args:
        got_valid_docs: Index lines, e.g. as returned by get_valid_documents.
        textlines: List of text lines of the source document.

    Returns:
        A tuple of six equally long lists: document ids, sentence indices
        (ints), doctypes, section names, extracted text lines and labels
        (as strings).

    Raises:
        IndexError: If a span lacks a ``/``, ``/[DOC]`` or ``/[SEC]`` part.
        KeyError: If a doctype/section pair is missing from ``section_types``
            or maps to an unknown section type.
    """
    doc_id_lines = []
    sent_id_lines = []
    doctype_lines = []
    section_lines = []
    labels = []
    extractedlines = []

    # An unknown section type now fails loudly instead of reusing the label
    # of the previous span.
    label_by_type = {"else": 0, "assessment": 1}

    for docinfo in got_valid_docs:
        doc_id, *spans = docinfo.split("\t")
        print("doc_id: ", doc_id)

        # Sentence number, start/end/label...
        for span in spans:
            bounds = span.split("/")
            startline = int(bounds[0])
            endline = int(bounds[1]) + 1  # stored end index is inclusive

            sec_parts = span.split("/[DOC]")[1].split("/[SEC]")
            doctype = sec_parts[0]
            section = sec_parts[1]
            if not doctype and not section:
                continue

            label = label_by_type[section_types[doctype + "\t" + section]]

            targetlines = textlines[startline:endline]
            count = len(targetlines)

            extractedlines.extend(targetlines)
            labels.extend([str(label)] * count)
            doc_id_lines.extend([str(doc_id)] * count)
            # Number only the lines actually extracted: a span that runs past
            # the end of textlines must not produce surplus sentence ids,
            # which would shift every later row out of alignment.
            sent_id_lines.extend(range(startline, startline + count))
            doctype_lines.extend([str(doctype)] * count)
            section_lines.extend([str(section)] * count)

    assert len(extractedlines) == len(labels) == len(sent_id_lines)

    return doc_id_lines, sent_id_lines, doctype_lines, section_lines, extractedlines, labels


def write_sample(doc_ids, sent_id_lines, doctype_lines, section_lines, 
                 extractedlines, labels, out_folder, filename):
    """Append one tab-separated row per extracted line to an output file.

    Row columns, in order: document id, sentence id, doctype, section,
    label, text. The file ``out_folder/filename`` is opened in append mode,
    so repeated calls with the same name accumulate rows. Nothing is written
    (and no file is created) when ``extractedlines`` is empty.

    Args:
        doc_ids: Document id for each row.
        sent_id_lines: Sentence index for each row.
        doctype_lines: Doctype for each row.
        section_lines: Section name for each row.
        extractedlines: Text for each row; its length sets the row count.
        labels: Label for each row.
        out_folder: Existing directory to write into.
        filename: Name of the output file inside ``out_folder``.

    Raises:
        IndexError: If any column is shorter than ``extractedlines``.
    """
    columns = (doc_ids, sent_id_lines, doctype_lines, section_lines,
               labels, extractedlines)
    # Index explicitly (rather than zip) so a short column raises instead of
    # silently dropping rows.
    outtext = [
        "\t".join(str(column[row]) for column in columns)
        for row in range(len(extractedlines))
    ]

    if not outtext:
        return

    # The context manager closes the handle even if the write fails.
    with open(out_folder + "/" + filename, "a") as handle:
        handle.write("\n".join(outtext) + "\n")

    

## trainset

In [None]:
import glob
import os

# Sampling configuration for the training split.
# NOTE: docperfile, category, filepaths and the loop index `i` are kept as
# module-level names; helpers defined earlier in this notebook may read them
# as globals.
docperfile = 50
doc_samples = 20  # How many documents will be used per patient?
rng = random.random()
dup_factor = 1

category = "visits_2011to2020"

index_folder = "./data/03_soap_index/train"
filepaths = sorted(glob.glob(index_folder + "/*.txt"))
print("len(filepaths): ", len(filepaths))

out_folder = "./data/04_sampled/train"
if not os.path.exists(out_folder):
    os.makedirs(out_folder)

total_files = len(filepaths)
# Each pass samples every index file again; write_sample appends, so
# dup_factor > 1 adds further samples to the same output files.
for d in range(dup_factor):
    for i, index_path in enumerate(filepaths):
        # Progress report every 100 files.
        if i % 100 == 0:
            print("{}/{}".format(i, total_files))

        filename = index_path.split("/")[-1]
        got_valid_docs = get_valid_documents(
            path=index_path, doc_samples=doc_samples, rng=rng
        )
        textlines = read_text(path=index_path)
        sample_columns = extract_sample(
            got_valid_docs=got_valid_docs, textlines=textlines
        )
        (doc_id_lines, sent_id_lines, doctype_lines,
         section_lines, extractedlines, labels) = sample_columns
        write_sample(
            doc_ids=doc_id_lines,
            sent_id_lines=sent_id_lines,
            doctype_lines=doctype_lines,
            section_lines=section_lines,
            extractedlines=extractedlines,
            labels=labels,
            out_folder=out_folder,
            filename=filename,
        )
    
#     break

## test set

In [None]:
import glob
import os

# Sampling configuration for the test split.
# NOTE: docperfile, category, filepaths and the loop index `i` are kept as
# module-level names; helpers defined earlier in this notebook may read them
# as globals.
docperfile = 50
doc_samples = 5
rng = random.random()
dup_factor = 1

category = "visits_2011to2020"

index_folder = "./data/03_soap_index/test"
filepaths = sorted(glob.glob(index_folder + "/*.txt"))
print("len(filepaths): ", len(filepaths))

out_folder = "./data/04_sampled/test"
if not os.path.exists(out_folder):
    os.makedirs(out_folder)

total_files = len(filepaths)
# Each pass samples every index file again; write_sample appends, so
# dup_factor > 1 adds further samples to the same output files.
for d in range(dup_factor):
    for i, index_path in enumerate(filepaths):
        print("filepaths[i]: ", index_path)
        # Progress report every 100 files.
        if i % 100 == 0:
            print("{}/{}".format(i, total_files))

        filename = index_path.split("/")[-1]
        got_valid_docs = get_valid_documents(
            path=index_path, doc_samples=doc_samples, rng=rng
        )
        textlines = read_text(path=index_path)
        sample_columns = extract_sample(
            got_valid_docs=got_valid_docs, textlines=textlines
        )
        (doc_id_lines, sent_id_lines, doctype_lines,
         section_lines, extractedlines, labels) = sample_columns
        write_sample(
            doc_ids=doc_id_lines,
            sent_id_lines=sent_id_lines,
            doctype_lines=doctype_lines,
            section_lines=section_lines,
            extractedlines=extractedlines,
            labels=labels,
            out_folder=out_folder,
            filename=filename,
        )

print("Complete")