In [1]:
from rdflib import Graph, Namespace, Literal
from rdflib.plugins.sparql import prepareQuery
from rdflib.namespace import SKOS
import glob
import gzip
from tqdm import tqdm

dir_ = '/data/pj20/compound/general/'

In [2]:
g = Graph()
g.parse(dir_ + 'pc_disease.ttl', format='ttl')

corpus = []
for s, p, o in g:
    if p == SKOS.prefLabel or p == SKOS.altLabel:
        label = str(o)
        dzid = str(s).split(':')[-1].replace('//rdf.ncbi.nlm.nih.gov/pubchem/disease/', '')
        corpus.append([label, dzid])


In [3]:
corpus

[['Cyclic Leucopenia', 'DZID965'],
 ['Deficiency Syndrome, Immunologic', 'DZID8153'],
 ['Tinnitus, Spontaneous Oto-Acoustic Emission', 'DZID9437'],
 ['shubo-kyofu', 'DZID64'],
 ['White forelock and leukoderma with neurological impairment', 'DZID1577'],
 ['X-linked VACTERL-H syndrome', 'DZID4298'],
 ['Brief, Resolved, Unexplained Event', 'DZID11236'],
 ['Rubber Allergy', 'DZID10549'],
 ['Gait, Stumbling', 'DZID10505'],
 ['Natural Killer Cell Deficiency, Familial Isolated', 'DZID5438'],
 ['Cleft Lip with or without Cleft Palate, Nonsyndromic, 8', 'DZID4493'],
 ['Monckeberg Medial Calcific Sclerosis', 'DZID10901'],
 ['Neurologic Symptoms', 'DZID8652'],
 ['Epidermolysis Bullosa, Junctional', 'DZID9826'],
 ['Nevi, Melanocytic', 'DZID8664'],
 ['Nphs3', 'DZID567'],
 ['Duplication 8p', 'DZID2482'],
 ['Deafness, Autosomal Recessive 10', 'DZID4688'],
 ['Syndactyly, type v', 'DZID2596'],
 ['Gastrointestinal Stromal Sarcoma', 'DZID10833'],
 ['WT3', 'DZID5118'],
 ['High red cell phosphatidylcholine

In [7]:
from rank_bm25 import BM25Okapi
import nltk

def preprocess_text(text):
    # Tokenize the text
    tokens = nltk.word_tokenize(text.lower())

    # Remove stopwords
    stopwords = set(nltk.corpus.stopwords.words('english'))
    tokens = [token for token in tokens if not token in stopwords]

    # Join the tokens back into a string
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

In [8]:
bm25 = BM25Okapi([doc[0].split() for doc in corpus])

In [19]:
import json

with open('./primekg_id_mapping.json', 'r') as f:
    id2name_disease = json.load(f)['id2name_disease']

In [20]:
id2name_disease

{'13924_12592_14672_13460_12591_12536_30861_8146_8148_32846_13459_44329_14544_9805_49223_9804_14086_8147_13515_14029_12581_19019': 'osteogenesis imperfecta',
 '11160_13119_13978_12060_12327_12670_13210_11067_12903_12293_12376_12375_11767_10965_12460_10967_11602_12002_11762_13386_14363_10933_12452_13365_13250_13826_12445_12326_11360_11392_13985_14739_11351_13489_12421_9076_13738_11279_14675_11286_13249_12485_10986_12420_14428_12170_12091_12442_11364_13984_12418_14237_13010_12355_912_14469_12273_13269_12602_11774_10807_12977_12003_12370_11192_10987_11991_12333_10860_13929_13471_11912_13537_13963_11799_13215_11553_14182_19588_14849': 'autosomal recessive nonsyndromic deafness',
 '8099_12497_12498': 'congenital stationary night blindness autosomal dominant',
 '14854_14293_14470_12380_11832_14603_14853_11761_11032_14594_12975_10973_12090_14740_12902_10915_11058_14283_11519_12083_7424_11673_11389_13632_11103_11226_11102_12974_12086_11159_11074_11031_10963_13823_11660_11893_13305_11708_11994_

In [21]:
from tqdm import tqdm

primekg_disease_id_to_dzid = {}

for key in tqdm(id2name_disease.keys()):
    query = id2name_disease[key]
    tokenized_query = query.split()
    doc_scores = bm25.get_scores(tokenized_query)
    max_score = max(doc_scores)

    # Get the indices of the documents with the highest score
    max_score_indices = [i for i, score in enumerate(doc_scores) if score == max_score]

    # Get the DZIDs of the matching documents
    dzid = [corpus[i][1] for i in max_score_indices][0]

    primekg_disease_id_to_dzid[key] = dzid


100%|██████████| 17080/17080 [05:03<00:00, 56.33it/s]


In [23]:
with open('./primekg_disease_id_to_dzid.json', 'w') as f:
    json.dump(primekg_disease_id_to_dzid, f, indent=6)