In [None]:
import rdflib
from rdflib.namespace import SKOS, RDF, RDFS, Namespace

# Load the Turtle file
graph = rdflib.Graph()
graph.parse("ICD9CM.ttl", format="turtle")

# Define the UMLS namespace
umls = Namespace("http://bioportal.bioontology.org/ontologies/umls/")

# Iterate through the graph to find ICD9 codes and UMLS CUIs
icd9_to_cui_map = {}
for subject in graph.subjects(RDF.type, rdflib.OWL.Class):
    icd9_code = graph.value(subject, SKOS.notation)
    umls_cui = graph.value(subject, umls.cui)

    if icd9_code and umls_cui:
        icd9_to_cui_map[str(icd9_code)] = str(umls_cui)



In [3]:
import json

# Print the mapping
with open("icd9_to_umls_cui.json", 'w') as f:
    json.dump(icd9_to_cui_map, f, indent=6)

In [24]:
# Load the Turtle file
graph = rdflib.Graph()
graph.parse("ATC.ttl", format="turtle")

# Define the UMLS namespace
umls = Namespace("http://bioportal.bioontology.org/ontologies/umls/")

# Iterate through the graph to find ATC codes and UMLS CUIs
atc_to_cui_map = {}
for subject in graph.subjects(RDF.type, rdflib.OWL.Class):
    atc_code = graph.value(subject, SKOS.notation)
    umls_cui = graph.value(subject, umls.cui)

    if atc_code and umls_cui:
        atc_to_cui_map[str(atc_code)] = str(umls_cui)

In [5]:
import json

# Print the mapping
with open("atc_to_umls_cui.json", 'w') as f:
    json.dump(atc_to_cui_map, f, indent=6)

In [12]:
# Load the Turtle file
graph = rdflib.Graph()
graph.parse("RXNORM.ttl", format="turtle")

# Define the UMLS namespace
umls = Namespace("http://bioportal.bioontology.org/ontologies/umls/")

# Iterate through the graph to find RxNorm codes and UMLS CUIs
rxnorm_to_cui_map = {}
for subject in graph.subjects(RDF.type, rdflib.OWL.Class):
    rxnorm_code = graph.value(subject, SKOS.notation)
    umls_cui = graph.value(subject, umls.cui)

    if rxnorm_code and umls_cui:
        rxnorm_to_cui_map[str(rxnorm_code)] = str(umls_cui)

In [13]:
import json

# Print the mapping
with open("rxnorm_to_umls_cui.json", 'w') as f:
    json.dump(rxnorm_to_cui_map, f, indent=6)

In [2]:
import pickle
import pandas as pd

with open('./umls/umls.graph', 'rb') as f:
    umls_g = pickle.load(f)


In [3]:
from torch_geometric.utils import to_networkx, from_networkx
G_tg = from_networkx(umls_g) 

In [60]:
len(umls_g.edges)

2341070

In [44]:
with open('./umls/umls.csv', 'r') as f:
    lines_1 = f.readlines()

# with open('./graph.txt', 'r') as f:
#     lines_2 = f.readlines()

In [45]:
triple_set = set()
tuple_set = set()
node_set = set()

for line in lines_1:
    items = line.split('\t')
    e1 = items[1]
    r = items[0]
    e2 = items[2]
    triple_set.add((e1, r, e2))
    tuple_set.add((e1, e2))
    node_set.add(e1)
    node_set.add(e2)
    

# for line in lines_2:
#     items = line.split('\t')
#     e1 = items[0]
#     r = items[1]
#     e2 = items[2][:-1]
#     if (e1, e2) not in tuple_set and (e2, e1) not in tuple_set:
#         tuple_set.add((e1, e2))
#         triple_set.add((e1, r, e2))

In [46]:
len(triple_set), len(node_set)

(1212586, 297927)

In [36]:
out_str = ""

for triple in triple_set:
    out_str += triple[0] + '\t' + triple[1] + '\t' + triple[2] + '\n'

with open('./umls_graph.txt', 'w') as f:
    f.write(out_str)

In [1]:
import numpy as np

ent_emb = np.load('./umls/ent_emb.npy')

In [4]:
len(ent_emb[0])

1024

In [4]:
import rdflib
from rdflib.namespace import SKOS, RDF, RDFS, Namespace
import csv
from collections import defaultdict

# Load the Turtle file
graph = rdflib.Graph()
graph.parse("ICD9CM.ttl", format="turtle")

# Define the UMLS namespace
umls = Namespace("http://bioportal.bioontology.org/ontologies/umls/")

# Iterate through the graph to find ICD9 codes and UMLS CUIs
icd9_to_cui_map = defaultdict(list)
for subject in graph.subjects(RDF.type, rdflib.OWL.Class):
    icd9_code = graph.value(subject, SKOS.notation)
    umls_cui = graph.value(subject, umls.cui)

    if icd9_code and umls_cui:
        icd9_to_cui_map['ICD9CM'].append(str(icd9_code))
        icd9_to_cui_map['UMLS'].append(str(umls_cui))

with open('ICD9CM_to_UMLS.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(icd9_to_cui_map.keys())
    writer.writerows(zip(*icd9_to_cui_map.values()))


In [7]:
# Load the Turtle file
graph = rdflib.Graph()
graph.parse("RXNORM.ttl", format="turtle")

# Define the UMLS namespace
umls = Namespace("http://bioportal.bioontology.org/ontologies/umls/")

# Iterate through the graph to find RxNorm codes and UMLS CUIs
rxnorm_to_cui_map = defaultdict(list)
for subject in graph.subjects(RDF.type, rdflib.OWL.Class):
    rxnorm_code = graph.value(subject, SKOS.notation)
    umls_cui = graph.value(subject, umls.cui)

    if rxnorm_code and umls_cui:
        rxnorm_to_cui_map['RxNorm'].append(str(rxnorm_code))
        rxnorm_to_cui_map['UMLS'].append(str(umls_cui))

with open('RxNorm_to_UMLS.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(rxnorm_to_cui_map.keys())
    writer.writerows(zip(*rxnorm_to_cui_map.values()))

In [8]:
# Load the Turtle file
graph = rdflib.Graph()
graph.parse("ATC.ttl", format="turtle")

# Define the UMLS namespace
umls = Namespace("http://bioportal.bioontology.org/ontologies/umls/")

# Iterate through the graph to find ATC codes and UMLS CUIs
atc_to_cui_map = defaultdict(list)
for subject in graph.subjects(RDF.type, rdflib.OWL.Class):
    atc_code = graph.value(subject, SKOS.notation)
    umls_cui = graph.value(subject, umls.cui)

    if atc_code and umls_cui:
        atc_to_cui_map['ATC'].append(str(atc_code))
        atc_to_cui_map['UMLS'].append(str(umls_cui))

with open('ATC_to_UMLS.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(atc_to_cui_map.keys())
    writer.writerows(zip(*atc_to_cui_map.values()))

In [12]:
with open('./RxNorm_to_ATC.csv', 'r') as f:
    lines = f.readlines()
    lines = lines[1:]

rxnorm_to_atc_map = {}
for line in lines:
    rxnorm, atc = line.split(',')
    atc = atc[:-1]
    rxnorm_to_atc_map[rxnorm] = atc

In [None]:
from pyhealth.medcode import CrossMap
from collections import Counter
# Load the Turtle file
graph = rdflib.Graph()
graph.parse("RXNORM.ttl", format="turtle")

# Define the UMLS namespace
umls = Namespace("http://bioportal.bioontology.org/ontologies/umls/")

# Iterate through the graph to find RxNorm codes and UMLS CUIs
rxnorm_to_cui_map = defaultdict(list)
mapping = CrossMap("RxNorm", "ATC")
for subject in graph.subjects(RDF.type, rdflib.OWL.Class):
    rxnorm_code = graph.value(subject, SKOS.notation)
    umls_cui = graph.value(subject, umls.cui)

    if rxnorm_code and umls_cui:
        rxnorm_to_cui_map['RxNorm'].append(str(rxnorm_code))
        rxnorm_to_cui_map['UMLS'].append(str(umls_cui))

with open('RxNorm_to_ATCUMLS.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(rxnorm_to_cui_map.keys())
    writer.writerows(zip(*rxnorm_to_cui_map.values()))

In [36]:
from tqdm import tqdm

rxnorm_to_atc3_to_cui_map = defaultdict(list)

mapping = CrossMap("RxNorm", "ATC")
for rxnorm_code in tqdm(rxnorm_to_cui_map['RxNorm']):
    try:
        atc3_code = Counter(mapping.map(str(rxnorm_code), target_kwargs={"level": 3})).most_common(1)[0][0]
    except:
        continue
    umls_cui = atc_to_cui_map[atc3_code]
    rxnorm_to_atc3_to_cui_map['RxNorm'].append(rxnorm_code)
    rxnorm_to_atc3_to_cui_map['UMLS'].append(umls_cui)


with open('RxNorm_to_ATC3_UMLS.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(rxnorm_to_atc3_to_cui_map.keys())
    writer.writerows(zip(*rxnorm_to_atc3_to_cui_map.values()))


100%|██████████| 104819/104819 [00:00<00:00, 261568.96it/s]


In [44]:
with open("./umls_graph.txt", 'r') as f:
    lines = f.readlines()

triple_set = set()
for line in lines:
    h, r, t = line.split('\t')
    t = t[:-1]
    triple_set.add((h,r,t))

In [46]:
from tqdm import tqdm

entity_set = set()

for idx in tqdm(range(len(icd9_to_cui_map['ICD9CM']))):
    icd9_code_name = icd9_to_cui_map['ICD9CM'][idx].replace('.', '')
    out_file = f'../graphs/condition/ICD9CM_large/{icd9_code_name}.txt'
    out_str = ""
    umls_cui = icd9_to_cui_map['UMLS'][idx]
    for triple in triple_set:
        if umls_cui in triple:
            out_str += triple[0] + '\t' + triple[1] + '\t' + triple[2] + '\n'
    if out_str != "":
        with open(out_file, 'w') as f:
            f.write(out_str)

    

100%|██████████| 22406/22406 [3:37:08<00:00,  1.72it/s]  


In [53]:
import json

node2idx = {}
edge2idx = {}

node_idx = 0
edge_idx = 0
for idx in tqdm(range(len(icd9_to_cui_map['ICD9CM']))):
    icd9_code_name = icd9_to_cui_map['ICD9CM'][idx].replace('.', '')
    feat_file = f'../graphs/condition/ICD9CM_large/{icd9_code_name}.txt'
    try:
        with open(feat_file, 'r') as f:
            lines = f.readlines()
    except:
        continue

    for line in lines:
        h, r, t = line[:-1].split('\t')
        if h not in node2idx.keys():
            node2idx[h] = node_idx
            node_idx += 1
        if t not in node2idx.keys():
            node2idx[t] = node_idx
            node_idx += 1
        if r not in edge2idx.keys():
            edge2idx[r] = edge_idx
            edge_idx += 1


out_file = f'../graphs/condition/ICD9CM_large/ent2id.json'
with open(out_file, 'w') as f:
    json.dump(node2idx, f, indent=6)

out_file = f'../graphs/condition/ICD9CM_large/rel2id.json'
with open(out_file, 'w') as f:
    json.dump(edge2idx, f, indent=6)


100%|██████████| 22406/22406 [00:00<00:00, 37044.05it/s]


In [1]:
import numpy as np

with open('./umls/umls.csv', 'r') as f:
    lines_1 = f.readlines()

triple_set = set()
tuple_set = set()

for line in lines_1:
    items = line.split('\t')
    e1 = items[1]
    r = items[0]
    e2 = items[2]
    triple_set.add((e1, r, e2))
    tuple_set.add((e1, e2))

# ent_emb = np.load('./umls/ent_emb.npy')

In [5]:
from tqdm import tqdm

entity_set = set()

for idx in tqdm(range(len(icd9_to_cui_map['ICD9CM']))):
    icd9_code_name = icd9_to_cui_map['ICD9CM'][idx].replace('.', '')
    out_file = f'../graphs/condition/ICD9CM_base/{icd9_code_name}.txt'
    out_str = ""
    umls_cui = icd9_to_cui_map['UMLS'][idx]
    for triple in triple_set:
        if umls_cui in triple:
            out_str += triple[0] + '\t' + triple[1] + '\t' + triple[2] + '\n'
    out_str += umls_cui + '\t' + 'self' + '\t' + umls_cui + '\n'
    with open(out_file, 'w') as f:
        f.write(out_str)

    

100%|██████████| 22406/22406 [1:59:43<00:00,  3.12it/s]  


In [8]:
import json

with open('./umls/concepts.txt', 'r') as f:
    lines = f.readlines()

ent2id = {}
id_ = 0
for line in lines:
    ent2id[line[:-1]] = id_
    id_ += 1

with open('./umls/relations.txt', 'r') as f:
    lines = f.readlines()

rel2id = {}
id_ = 0
for line in lines:
    rel2id[line[:-1]] = id_
    id_ += 1


out_file = f'../graphs/condition/ICD9CM_base/ent2id.json'
with open(out_file, 'w') as f:
    json.dump(ent2id, f, indent=6)

out_file = f'../graphs/condition/ICD9CM_base/rel2id.json'
with open(out_file, 'w') as f:
    json.dump(rel2id, f, indent=6)

In [15]:
ent2word = {}

with open('./umls/concept_names.txt', 'r') as f:
    lines = f.readlines()

for line in lines:
    ent, word = line[:-1].split('\t')
    ent2word[ent] = word

with open('../graphs/condition/ICD9CM_base/ent2word.json', 'w') as f:
    json.dump(ent2word, f, indent=6)

In [4]:
import json
import numpy as np

with open('../graphs/condition/ICD9CM_base/ent2id.json', 'r') as f:
    ent2id = json.load(f)

with open('../graphs/condition/ICD9CM_base/rel2id.json', 'r') as f:
    rel2id = json.load(f)

ent_emb = np.load('../KG_mapping/umls/ent_emb.npy')

with open('../KG_mapping/ICD9CM_to_UMLS.csv', 'r') as f:
    lines = f.readlines()
    lines = lines[1:]

with open('../KG_mapping/umls/umls.csv', 'r') as f:
    lines_1 = f.readlines()

triple_set = set()
tuple_set = set()

for line in lines_1:
    items = line.split('\t')
    e1 = items[1]
    r = items[0]
    e2 = items[2]
    triple_set.add((e1, r, e2))

icd9_to_umls = {}
for line in lines:
    icd9cm, umls = line.split(',')
    umls = umls[:-1]
    icd9_to_umls[icd9cm.replace('.', '')] = umls

In [4]:
import glob
import os
from tqdm import tqdm

triple_files = glob.glob('../graphs/condition/ICD9CM_base/*.txt')
node_set_all = set()
edge_set_all = set()
for triple_file in tqdm(triple_files):
    # file_name = '../../../../data/pj20/graphs/condition/ICD9CM_base_ext/' + triple_file.split('/')[-1]
    # if os.path.exists(file_name) == False:
        # triple_str = ""
    node_set = set()
    with open(triple_file, 'r') as f:
        lines = f.readlines()
    for line in lines:
        h, r, t = line[:-1].split('\t')
        # node_set.add(h)
        # node_set.add(t)
        node_set_all.add(h)
        edge_set_all.add(r)
        node_set_all.add(t)
        
        # for node in node_set:
        #     for triple in triple_set:
        #         if node in triple:
        #             h, r, t = triple
        #             triple_str += h + '\t' + r + '\t' + t + '\n'
        #             node_set_all.add(h)
        #             node_set_all.add(t)
        #             edge_set_all.add(r)
        
        # with open(file_name, 'w') as f:
        #     f.write(triple_str)

100%|██████████| 19340/19340 [00:00<00:00, 36287.55it/s]


In [5]:
len(node_set_all)

73629

In [7]:
ent_emb_new = []
ent2id_new = {}
rel2id_new = {}

idx = 0
for node in node_set_all:
    try:
        ent_emb_new.append(ent_emb[ent2id[node]])
        ent2id_new[node] = idx
        idx += 1
    except:
        continue

ent_emb_new = np.array(ent_emb_new)

idx = 0
for edge in edge_set_all:
    rel2id_new[edge] = idx
    idx += 1


with open('../graphs/condition/ICD9CM_base/ent2id_new.json', 'w') as f:
    json.dump(ent2id_new, f, indent=6)

with open('../graphs/condition/ICD9CM_base/rel2id_new.json', 'w') as f:
    json.dump(rel2id_new, f, indent=6)

np.save(arr=ent_emb_new, file='../graphs/condition/ICD9CM_base/ent_emb_new.npy')

In [12]:
id2ent_new = {value: key for key, value in ent2id_new.items()}

In [8]:
len(ent2id_new), len(ent_emb_new), len(rel2id_new)

(62798, 62798, 65)

In [14]:
with open('../graphs/condition/ICD9CM_base/id2ent_new.json', 'w') as f:
    json.dump(id2ent_new, f, indent=6)

In [None]:
# import glob
# import os
# from tqdm import tqdm
# from multiprocessing import Pool, cpu_count


# def process_triple_file(triple_file):
#     file_name = '../../../../data/pj20/graphs/condition/ICD9CM_base_ext/' + os.path.basename(triple_file)
#     if not os.path.exists(file_name):
#         node_set = {h for h, r, t in triple_set} | {t for h, r, t in triple_set}
#         triple_str = '\n'.join([f'{h}\t{r}\t{t}' for h, r, t in triple_set if h in node_set and t in node_set])
#         with open(file_name, 'w') as f:
#             f.write(triple_str)
#         return (node_set, {r for h, r, t in triple_set if h in node_set and t in node_set})
#     else:
#         return (set(), set())

# triple_files = glob.glob('../../../../data/pj20/graphs/condition/ICD9CM_base/*.txt')
# with Pool(cpu_count() - 1) as pool:
#     results = list(tqdm(pool.imap(process_triple_file, triple_files), total=len(triple_files)))

# node_set_all = set().union(*[r[0] for r in results])
# edge_set_all = set().union(*[r[1] for r in results])

# print(f'Number of unique nodes: {len(node_set_all)}')
# print(f'Number of unique edges: {len(edge_set_all)}')


In [10]:
import json
from collections import defaultdict

with open('../graphs/condition/ICD9CM_base/id2ent_new.json', 'r') as f:
    id2ent_new = json.load(f)

triple_files = glob.glob('../graphs/condition/ICD9CM_base/*.txt')
store_dir = "/data/pj20/graphs/umls_icd9_2hop/"

for triple_file in tqdm(triple_files):
    triple_set_ = set()
    node_triple_dict = defaultdict(list)
    out_file = store_dir + triple_file.replace('../graphs/condition/ICD9CM_base/', '')
    out_str = ""

    with open(triple_file, 'r') as f:
        lines = f.readlines()
    for line in lines:
        h, r, t = line[:-1].split('\t')
        triple = (h, r, t)
        triple_set_.add(triple)
        node_triple_dict[h].append(triple)
        node_triple_dict[t].append(triple)
    
    for key in node_triple_dict.keys():
        i = 0

        ## limit the extended 2-hop triples to 5
        while len(node_triple_dict[key]) <= 5 and i <= len(triple_set):
            for triple in triple_set:
                if (key in triple) and (triple not in node_triple_dict[key]):
                    node_triple_dict[key].append(triple)
                i += 1

    for key, triple_list in node_triple_dict.items():
        for triple in triple_list:
            h, r, t = triple
            out_str += h + '\t' + r + '\t' + t + '\n'
        
    
    with open(out_file, 'w') as f:
        f.write(out_str)


        


  0%|          | 15/19340 [00:28<10:22:12,  1.93s/it]


KeyboardInterrupt: 

In [8]:
import glob
from tqdm import tqdm

triple_files = glob.glob('../graphs/condition/ICD9CM_base/*.txt')
for triple_file in tqdm(triple_files):
    print(triple_file.replace('../graphs/condition/ICD9CM_base/', ''))
    break

  0%|          | 0/19340 [00:00<?, ?it/s]

36004.txt



