In [1]:
import pickle

# NOTE: pickle.load can execute arbitrary code while deserialising, so this
# file must come from a trusted source. The path is absolute and therefore
# specific to the machine the notebook was written on.
dataset_path = '/data/pj20/exp_data/icd9cm_icd9proc/drugrec_dataset_umls.pkl'
with open(dataset_path, 'rb') as f:
    sample_dataset = pickle.load(f)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import csv

condition_mapping_file = "../../resources/ICD9CM.csv"
procedure_mapping_file = "../../resources/ICD9PROC.csv"
drug_file = "../../resources/ATC.csv"


def load_code_names(mapping_file, level=None):
    """Build a ``code -> lower-cased name`` lookup from a mapping CSV.

    The CSV is read with ``csv.DictReader`` and must provide ``code`` and
    ``name`` columns. Every '.' is removed from the code before it is used as
    a key (a code written as ``774.2`` would be stored as ``7742``).

    Args:
        mapping_file: Path of the CSV file to read.
        level: When given, only rows whose ``level`` column equals this
            string are kept. When ``None`` every row is kept and the
            ``level`` column is never read.

    Returns:
        dict mapping the dot-free code to the lower-cased name. If the same
        dot-free code appears on several rows, the last row wins.
    """
    code_names = {}
    with open(mapping_file, newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            if level is not None and row['level'] != level:
                continue
            code_names[row['code'].replace('.', '')] = row['name'].lower()
    return code_names


condition_dict = load_code_names(condition_mapping_file)
procedure_dict = load_code_names(procedure_mapping_file)
# Only ATC rows whose `level` column is the string '3.0' are kept.
drug_dict = load_code_names(drug_file, level='3.0')


In [3]:
def flatten(lst):
    """Return a new flat list with the leaves of an arbitrarily nested list.

    Only ``list`` instances are descended into; any other element (strings,
    tuples, numbers, ...) is kept as a single item, in its original order.
    """
    def _leaves(items):
        # Depth-first, left-to-right walk over the nested lists.
        for element in items:
            if isinstance(element, list):
                yield from _leaves(element)
            else:
                yield element

    return list(_leaves(lst))

In [4]:
# Name lookups restricted to the codes that occur in the first 101 samples.
condition_dict_sample = {}
procedure_dict_sample = {}

for i in range(101):
    sample = sample_dataset[i]
    for condition in flatten(sample['conditions']):
        if condition not in condition_dict_sample:
            condition_dict_sample[condition] = condition_dict[condition]
    for procedure in flatten(sample['procedures']):
        if procedure not in procedure_dict_sample:
            try:
                procedure_dict_sample[procedure] = procedure_dict[procedure]
            except KeyError:
                # The full code is missing from the mapping table: retry with
                # the last character dropped. The entry is stored under the
                # shortened code, so readers must apply the same fallback.
                # Only KeyError is handled; a shortened code that is also
                # missing still raises KeyError, and unrelated errors are no
                # longer swallowed.
                procedure_dict_sample[procedure[:-1]] = procedure_dict[procedure[:-1]]

In [5]:
# import os

# for i in range(1, 101):
#     folder_name = f"../../graphs/patient_samples/{i}"
#     os.mkdir(folder_name)

In [6]:
# Display the first record to inspect the fields a sample carries.
sample_dataset[0]

{'visit_id': '184167',
 'patient_id': '10',
 'conditions': [['V3000', '7742', '76525', '76515', 'V290']],
 'procedures': [['9983', '9915', '966']],
 'drugs': ['J01C', 'J01G', 'V06D', 'B05X', 'B03A'],
 'drugs_all': [['J01C', 'J01G', 'V06D', 'B05X', 'B03A']],
 'drugs_ind': tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
         0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,


In [14]:
import json

# Lookup applied later to the head and tail fields of the UMLS triples.
ent2word_path = '../../graphs/condition/ICD9CM_base_umls/ent2word.json'
with open(ent2word_path, 'r') as f:
    umls_ent2word = json.load(f)

In [17]:
# Number of entries in the lookup loaded from ent2word.json.
len(umls_ent2word)

297927

In [21]:
import os

from tqdm import tqdm

patient_sample_path = "../../graphs/patient_samples"


def _numbered_listing(names):
    """Join names as '0: first,\\n1: second,...' with no trailing separator."""
    return ",\n".join(f"{idx}: {name}" for idx, name in enumerate(names))


def _procedure_name(code):
    """Look up a procedure name, retrying with the last character of the code dropped."""
    try:
        return procedure_dict_sample[code]
    except KeyError:
        return procedure_dict_sample[code[:-1]]


def _read_triples(path):
    """Return the (head, relation, tail) triples of a tab-separated graph file.

    Lines that do not split into exactly three tab-separated fields are skipped.
    """
    triples = []
    with open(path, 'r') as f:
        for line in f:
            items = line.split('\t')
            if len(items) == 3:
                h, r, t = items
                # Remove only the line terminator. Slicing off the last
                # character unconditionally would eat a real character when
                # the final line of the file has no trailing newline.
                triples.append((h, r, t.rstrip('\n')))
    return triples


def _read_umls_triples(path):
    """Read a UMLS graph file, skipping 'self' relations and translating the
    head and tail of every remaining triple through ``umls_ent2word``."""
    return [
        (umls_ent2word[h], r, umls_ent2word[t])
        for h, r, t in _read_triples(path)
        if r != "self"
    ]


def _write_graph_csv(path, triples):
    """Write triples to a CSV file with a 'head,relation,tail' header row."""
    with open(path, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["head", "relation", "tail"])
        writer.writerows(triples)


# 101 matches the number of samples used to build condition_dict_sample and
# procedure_dict_sample.
for j in tqdm(range(101)):
    # Index with the outer loop variable `j`. The previous `sample_dataset[i]`
    # read a stale index left over from other loops, so the exported sample
    # did not correspond to the folder it was written to.
    sample = sample_dataset[j]
    patient_path = f"{patient_sample_path}/{j}"
    patient_desp = f"{patient_path}/description.txt"
    patient_gpt_graph = f"{patient_path}/gpt_graph.csv"
    patient_umls_graph = f"{patient_path}/umls_graph.csv"

    # Create the per-patient output folder if needed so the cell also works
    # on a fresh checkout.
    os.makedirs(patient_path, exist_ok=True)

    triple_set_gpt = set()
    triple_set_umls = set()

    conditions = flatten(sample['conditions'])
    procedures = flatten(sample['procedures'])
    drugs = flatten(sample['drugs'])

    ### BEGIN Write description
    desp_condition = _numbered_listing(condition_dict_sample[code] for code in conditions)
    desp_procedure = _numbered_listing(_procedure_name(code) for code in procedures)
    desp_drug = _numbered_listing(drug_dict[code] for code in drugs)

    # "Patient ID" is the loop index (the folder name), not sample['patient_id'].
    desp_all = f"Patient ID: {j}\nConditions:\n[\n{desp_condition}\n]\nProcedures:\n[\n{desp_procedure}\n]\nDrugs:\n[\n{desp_drug}\n]"
    with open(patient_desp, 'w') as f:
        f.write(desp_all)
    ### END Write description


    ### BEGIN Write graph
    for condition in conditions:
        cond_file_gpt = f'../../graphs/condition/ICD9CM_base_gpt/{condition}.txt'
        cond_file_umls = f'../../graphs/condition/ICD9CM_base_umls/{condition}.txt'
        triple_set_gpt.update(_read_triples(cond_file_gpt))
        triple_set_umls.update(_read_umls_triples(cond_file_umls))

    for procedure in procedures:
        proc_file_gpt = f'../../graphs/procedure/ICD9PROC_base_gpt/{procedure}.txt'
        # NOTE(review): this reads the procedure code from the *condition*
        # UMLS directory and resolves it through the condition ent2word
        # lookup. It looks like a copy-paste from the condition loop above;
        # confirm whether a procedure-specific UMLS graph should be used.
        proc_file_umls = f'../../graphs/condition/ICD9CM_base_umls/{procedure}.txt'
        try:
            gpt_triples = _read_triples(proc_file_gpt)
        except FileNotFoundError:
            # No file for the full code: retry with the last character dropped.
            proc_file_gpt = f'../../graphs/procedure/ICD9PROC_base_gpt/{procedure[:-1]}.txt'
            gpt_triples = _read_triples(proc_file_gpt)
        triple_set_gpt.update(gpt_triples)
        triple_set_umls.update(_read_umls_triples(proc_file_umls))

    triple_list_gpt = [*triple_set_gpt]
    triple_list_umls = [*triple_set_umls]

    _write_graph_csv(patient_gpt_graph, triple_list_gpt)
    _write_graph_csv(patient_umls_graph, triple_list_umls)

    ### END Write graph

100%|██████████| 101/101 [00:00<00:00, 139.40it/s]
