# Make pair samples

- procedure
    - Data sampling
    - Select up to 10 random documents from one patient
    - For each sampled document, extract the AP and SO parts separately.

- output
    - The AP part is grouped from [START_AP] to [END_AP]
    - The SO part is written one sentence per line, as label + tab + sentence

In [None]:
# #1
# ap_doc_num: IDs of documents whose AP part is non-empty
# so_doc_num: IDs of documents whose SO part is non-empty
# none_ap_doc_num: IDs of documents that are not in ap_doc_num (no AP text)
# soap_doc_num: List of documents containing all soap (not returned by get_soap)
# docs_ap: AP part text of each document
# docs_so: SO part text of each document
def get_soap(target_path):
    """Split one patient's record file into per-document AP and SO parts.

    The file is scanned line by line. Lines collected up to an ``[END_AP]``
    marker form an AP part; lines collected between ``[END_AP]`` and
    ``[END_DOC]`` form an SO part. A blank line closes the current document
    and advances the document counter. The final line of the file does the
    same when it is not one of the three markers (its text is discarded).

    Args:
        target_path: Path of the text file to parse.

    Returns:
        A 5-tuple ``(ap_doc_num, so_doc_num, none_ap_doc_num, docs_ap, docs_so)``:

        * ``ap_doc_num``: document counters whose AP part is non-empty.
        * ``so_doc_num``: document counters whose SO part is non-empty.
        * ``none_ap_doc_num``: counters in ``range(doc_count)`` that are not
          in ``ap_doc_num``.
        * ``docs_ap``: one list of lines per ``[END_AP]`` marker seen.
        * ``docs_so``: one list of lines per ``[END_DOC]`` marker seen.
    """
    START_AP = "[START_AP]"
    END_AP = "[END_AP]"
    END_DOC = "[END_DOC]"

    docs_ap = []
    docs_so = []
    ap_doc_num = []  # document counters with a non-empty AP part
    so_doc_num = []  # document counters with a non-empty SO part

    # The context manager closes the handle even if reading fails.
    with open(target_path, "r") as file:
        lines = file.readlines()

    last_index = len(lines) - 1
    doc_id = 0
    document = []
    for index, raw_line in enumerate(lines):
        line = raw_line.strip("\n")

        if line == START_AP:
            # Start of an AP part: drop anything collected so far.
            document = []
        elif line == END_AP:
            # End of the AP part: store it, even when empty, so that docs_ap
            # gets one entry per marker.
            if document:
                ap_doc_num.append(doc_id)
            docs_ap.append(document)
            document = []
        elif line == END_DOC:
            # End of the SO part, handled the same way as the AP part.
            if document:
                so_doc_num.append(doc_id)
            docs_so.append(document)
            document = []
        elif line == "" or index == last_index:
            # Document separator (or a trailing non-marker line): move on to
            # the next document counter.
            document = []
            doc_id += 1
        else:
            document.append(line)

    # Set lookup keeps the complement computation linear.
    ap_ids = set(ap_doc_num)
    none_ap_doc_num = [d for d in range(doc_id) if d not in ap_ids]

    return ap_doc_num, so_doc_num, none_ap_doc_num, docs_ap, docs_so


In [None]:
# #2
import random
def random_docs(limit, doc_nums):
    """Pick up to ``limit`` entries of ``doc_nums`` at random, without replacement.

    Args:
        limit: Maximum number of entries to pick. May be a float; a fractional
            value is rounded up. A value of 0 or less selects nothing.
        doc_nums: List of document numbers to choose from.

    Returns:
        ``doc_nums`` itself (same object, original order) when ``limit`` is at
        least ``len(doc_nums)``; otherwise a new list holding the selected
        entries in ascending order.
    """
    # Not more candidates than requested: use the whole contents.
    if limit >= len(doc_nums):
        return doc_nums

    # Round a fractional limit up, so that at least ``limit`` entries are taken.
    count = int(limit)
    if count < limit:
        count += 1

    # Nothing requested. Without this guard one entry would still be drawn.
    if count <= 0:
        return []

    # random.sample chooses distinct positions, so duplicates in doc_nums are
    # treated as separate candidates.
    return sorted(random.sample(doc_nums, count))

# #3
# Select m random AP documents and n SO documents from one patient
# After that, separate the ap and so parts to obtain them
def random_m_ap_n_so(target_path, so_only_proportion=0.5, sample_num_docs=10):
    """Sample documents of one patient and return their AP and SO sentences.

    Args:
        target_path: Path of the patient's record file, parsed with ``get_soap``.
        so_only_proportion: Share of ``sample_num_docs`` reserved for documents
            that have no AP part.
        sample_num_docs: Target number of documents to sample.

    Returns:
        ``(ap_sentences, so_sentences)``: the AP lines of the sampled AP
        documents and the SO lines of all sampled documents, both in ascending
        document order. ``([], [])`` is returned when the file has no document
        with AP text or no document with SO text.
    """
    ap_ids, so_ids, non_ap_ids, ap_texts, so_texts = get_soap(target_path)

    # Quotas: AP documents first; when there are too few of them, the
    # remaining slots go to documents without an AP part.
    ap_quota = sample_num_docs * (1 - so_only_proportion)
    if len(ap_ids) < ap_quota:
        ap_quota = len(ap_ids)
    non_ap_quota = sample_num_docs - ap_quota

    # Rule 1.1: without any AP document or any SO document, nothing is sampled.
    if not ap_ids or not so_ids:
        return [], []

    if len(ap_ids) > sample_num_docs:
        # Rule 1.2: enough AP documents, so SO-only documents are not needed.
        picked_ap = random_docs(sample_num_docs, ap_ids)
        picked_non_ap = []
    else:
        # Rules 1.3-1.5: sample each side up to its quota; a side that has
        # fewer documents than its quota contributes all of them.
        picked_ap = random_docs(ap_quota, ap_ids)
        picked_non_ap = random_docs(non_ap_quota, non_ap_ids)

    # SO sentences come from every sampled document, AP sentences only from
    # the sampled AP documents.
    so_selection = sorted(picked_ap + picked_non_ap)

    ap_sentences = []
    for doc_id in picked_ap:
        ap_sentences.extend(ap_texts[doc_id])

    so_sentences = []
    for doc_id in so_selection:
        so_sentences.extend(so_texts[doc_id])

    return ap_sentences, so_sentences


# Sampling

- Select two different patients
- Sample data from 2 patients
- Sample example:
    - 1 AP section <> 1 SO part (drawn from the data sampled from the 2 patients)

In [None]:
import random

def shuffle_patients(targets, so_only_proportion=0.1, sample_num_docs=10):
    """Build one AP/SO pair sample from the record files in ``targets``.

    The AP section always comes from ``targets[0]``. The SO section comes
    either from ``targets[0]`` (label 0) or from the other patients (label 1);
    the label is chosen at random.

    Args:
        targets: Record-file paths; must contain at least two entries.
        so_only_proportion: Forwarded to ``random_m_ap_n_so``.
        sample_num_docs: Forwarded to ``random_m_ap_n_so``.

    Returns:
        * ``None`` when the sample of ``targets[0]`` has no AP or no SO sentences.
        * ``2`` when the sample of another patient has no SO sentences.
        * Otherwise the sample text: the AP lines wrapped in
          ``[START_AP]``/``[END_AP]``, followed, for every SO section carrying
          the chosen label, by a ``[CLS]<tab>label`` line, one
          ``label<tab>sentence`` line per sentence and a ``[SEP]`` line.
    """
    assert len(targets)>1

    ap_open = "[START_AP]"
    ap_close = "[END_AP]"
    max_sentences = 20  # upper bound on SO sentences kept per section

    so_sections = []  # one list of SO sentences per sampled section
    owner_flags = []  # 0: section from targets[0], 1: section from another patient

    # Sections of the anchor patient, one per additional target.
    anchor_path = targets[0]
    for _ in targets[1:]:
        print("target_path1: ", anchor_path)
        ap_sentences_1, so_sentences_1 = random_m_ap_n_so(
            anchor_path,
            so_only_proportion=so_only_proportion,
            sample_num_docs=sample_num_docs,
        )

        # Without AP or SO text the anchor patient cannot be sampled.
        if not ap_sentences_1:
            print("ap_sentences_1 len is 0")
            return None
        if not so_sentences_1:
            print("so_sentences_1 len is 0")
            return None

        so_sections.append(so_sentences_1)
        owner_flags.append(0)

    # Sections of the other patients; only their SO sentences are used.
    for other_path in targets[1:]:
        print("target_path2: ", other_path)
        _, other_so = random_m_ap_n_so(
            other_path,
            so_only_proportion=so_only_proportion,
            sample_num_docs=sample_num_docs,
        )

        # Code 2 asks the caller to retry with a different patient.
        if not other_so:
            return 2

        so_sections.append(other_so)
        owner_flags.append(1)

    ap_text = ap_open + "\n" + "\n".join(ap_sentences_1) + "\n" + ap_close

    print("len(so_sentences_all): ", len(so_sections))
    print("doc_nums: ", owner_flags)

    # Trim each section to max_sentences by dropping a sentence from a
    # randomly chosen end, once per surplus sentence.
    for section in so_sections:
        surplus = len(section) - max_sentences
        for _ in range(surplus):
            if random.random() >= 0.5:
                section.pop()
            else:
                section.pop(0)

    # Random order in which the sections are visited.
    document_order = list(range(len(so_sections)))
    random.shuffle(document_order)
    print("document_order: ", document_order)

    # Draw the label of this sample (0 or 1).
    target_label = 1 if random.randint(0, 9) >= 5 else 0
    print("target_label: ", target_label)

    # Emit only the sections whose owner flag equals the drawn label.
    so_lines = []
    for section_index in document_order:
        flag = owner_flags[section_index]
        if flag != target_label:
            continue
        tag = str(flag)
        so_lines.append("[CLS]" + "\t" + tag)
        for sentence in so_sections[section_index]:
            so_lines.append(tag + "\t" + "".join(sentence))
        so_lines.append("[SEP]")

    return ap_text + "\n" + "\n".join(so_lines)
    

## heldouts

In [None]:
# Directory that holds the held-out patient listing files.
heldout_path = "../preprocessing/01_data4finetune/"

# Listing files of the train and test splits (file names relative to heldout_path).
train_patients = ["pts_SNUH_visit_2011to2020_heldout_train.txt"]
test_patients = ["pts_SNUH_visit_2011to2020_heldout_test.txt"]


# heldouts
def get_heldouts(heldout_array_files, base_dir=None):
    """Read patient IDs from held-out listing files.

    Every non-blank line of a listing is reduced to a patient ID: the text
    after the last "/" is kept and then cut at its first ".". Surrounding
    whitespace, including the line terminator, is removed first, so an entry
    without an extension does not keep its trailing newline.

    Args:
        heldout_array_files: File names of the listings, relative to the base
            directory.
        base_dir: Directory that contains the listings. Defaults to the
            module-level ``heldout_path``.

    Returns:
        A list with one inner list of patient-ID strings per listing file, in
        the order of ``heldout_array_files``.
    """
    if base_dir is None:
        base_dir = heldout_path

    heldouts = []
    for file_name in heldout_array_files:
        patient_ids = []
        # The context manager closes each listing after it has been read.
        with open(base_dir + "/" + file_name) as listing:
            for line in listing:
                entry = line.strip()
                # Blank lines carry no patient ID.
                if not entry:
                    continue
                patient_ids.append(entry.split("/")[-1].split(".")[0])
        heldouts.append(patient_ids)
    return heldouts

# Parse the held-out listings: one inner list of patient IDs per listing file.
train_pts, test_pts = get_heldouts(train_patients), get_heldouts(test_patients)

# Report how many patient IDs the first listing of each split contains.
print("len(train_pts[0]): ", len(train_pts[0]))
print("len(test_pts[0]): ", len(test_pts[0]))

## Loop

In [None]:
import os
import glob

# Record categories to sample from, in processing order.
categories = ["visits_2011to2020"]

In [None]:
def get_patients(category, pts):
    """Build the record-file path of every patient ID in ``pts``.

    Files are grouped in directories named after the upper bound of a block of
    50 patient numbers: numbers 0-49 map to ``50``, 50-99 to ``100``, and so on.

    Args:
        category: Name of the category directory below ``./data/03_docsplit``.
        pts: Patient IDs as strings, possibly zero-padded.

    Returns:
        A list of paths of the form
        ``./data/03_docsplit/<category>/<group>/<patient id>.txt``, in the
        order of ``pts``.
    """
    bucket_size = 50
    base_dir = "./data/03_docsplit/"

    paths = []
    for patient in pts:
        # Parse the number without its zero padding. An ID made only of zeros
        # has nothing left after stripping, so it is parsed as written.
        numeric_id = int(patient.lstrip("0") or patient)
        bucket = (numeric_id // bucket_size + 1) * bucket_size
        paths.append(base_dir + "/".join([str(category), str(bucket), str(patient)]) + ".txt")
    return paths


In [None]:
# Directory that receives the generated sample files.
output_directory = "./data/04_samples"
# exist_ok=True creates the directory only when it is missing, without the
# race of a separate os.path.exists check.
os.makedirs(output_directory, exist_ok=True)

def make_samples(pts_nums, categories, mode, dup_factor=2):
    """Generate pair samples and append them to ``<output_directory>/<mode>.txt``.

    For every category, each patient is paired ``dup_factor`` times with a
    randomly chosen different patient and the pair is passed to
    ``shuffle_patients``. Samples are separated by one blank line in the
    output file. After each category the running total is written to
    ``<output_directory>/info_<mode>.txt``.

    Args:
        pts_nums: Indexable collection; ``pts_nums[c]`` holds the patient IDs
            for ``categories[c]``.
        categories: Category names passed to ``get_patients``.
        mode: Name of the split; used in the output file names.
        dup_factor: Number of passes over the patient list of each category.

    Returns:
        None.
    """
    sample_path = output_directory + "/" + str(mode) + ".txt"
    info_path = output_directory + "/info_" + str(mode) + ".txt"

    # Number of samples written so far, over all categories.
    write_count = 0

    # Accumulated text of the info file.
    writeinfo = ""

    for c, category in enumerate(categories):
        # Obtain the paths of the patient record files.
        target_patients = get_patients(category, pts_nums[c])
        print("len(target_patients): ", len(target_patients))
        target_patients.sort()

        if len(set(target_patients)) < 2:
            # A pair needs two different patients; without this guard the
            # partner search below would never terminate.
            print("not enough distinct patients to build pairs: ", category)
        else:
            for _ in range(dup_factor):
                for pat1 in target_patients:
                    # Draw partners until one differs from pat1 and
                    # shuffle_patients does not ask for a retry (code 2).
                    # NOTE: this keeps looping for as long as every drawn
                    # partner yields code 2.
                    while True:
                        pat2 = target_patients[random.randint(0, len(target_patients) - 1)]
                        if pat1 == pat2:
                            continue
                        outtext = shuffle_patients(targets=[pat1, pat2],
                                                   so_only_proportion=0.1, sample_num_docs=10)
                        if outtext != 2:
                            break

                    # None means pat1 could not be sampled; nothing is written.
                    if outtext is not None:
                        with open(sample_path, "a") as sample_file:
                            if write_count == 0:
                                sample_file.write(outtext)
                            else:
                                sample_file.write("\n\n" + outtext)
                        write_count += 1

        # write_count is already cumulative, so it is the running total.
        # Adding it to the previous total would double-count earlier categories.
        num_samples = write_count
        print("num_samples: ", num_samples)
        writeinfo = writeinfo + "\n" + str(mode) + " samples: " + str(num_samples)

        with open(info_path, "w") as info_file:
            info_file.write(writeinfo)

# Build the train and test sample files; each patient list is traversed
# 4 times (dup_factor=4).
make_samples(pts_nums=train_pts, categories=categories, mode="train", dup_factor=4)
make_samples(pts_nums=test_pts,  categories=categories, mode="test", dup_factor=4)