In [1]:
import pandas as pd
from collections import defaultdict
from wikidataintegrator.wdi_core import WDItemEngine
from wikidataintegrator.wdi_config import config

# Set wikidataintegrator's BACKOFF_MAX_TRIES option to 1
# (presumably so a failing request is attempted only once -- TODO confirm against wdi_config).
config['BACKOFF_MAX_TRIES'] = 1
# Short alias for the SPARQL helper that the query cell calls.
eq = WDItemEngine.execute_sparql_query

In [2]:
# Property paths to compare, keyed by a short abbreviation:
#   i          direct instance of (P31)
#   s          subclass of, zero or more hops (P279*)
#   i_or_s     either of the two above
#   i_path_s   instance of something that is a (transitive) subclass
#   s_path_i   (transitive) subclass of something that is an instance
#   s1_path_i  like s_path_i, but with at most one subclass hop
ps = {
    'i': 'wdt:P31',
    's': 'wdt:P279*',
    'i_or_s': 'wdt:P31|wdt:P279*',
    'i_path_s': 'wdt:P31/wdt:P279*',
    's_path_i': 'wdt:P279*/wdt:P31',
    's1_path_i': 'wdt:P279?/wdt:P31',
}

# SPARQL template: `{p}` is the property path, `{q}` the target QID.
# Doubled braces are literal braces after str.format().
# The label-service placeholder is spelled `[AUTO_LANGUAGE]`; `en` is the explicit fallback.
query_text = """
SELECT DISTINCT ?item ?itemLabel
WHERE {{
    ?item {p} wd:{q}
    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }}
}}
"""

In [3]:
# Root class whose instances / subclasses are enumerated.
qid = 'Q12136'
qlabel = 'disease'

# One result frame per property path in `ps`, each tagged with the path that produced it.
results = []
for abbv, p in ps.items():
    r = eq(query_text.format(q=qid, p=p), as_dataframe=True)
    # Keep only the text after the last '/' of each item value.
    r['item'] = r['item'].map(lambda uri: uri.rsplit('/', 1)[-1])
    r['p'] = abbv
    r['query_p'] = p
    results.append(r)

In [4]:
# Stack the per-path frames into one long table with a fresh integer index.
res = pd.concat(
    results,
    sort=False,
    ignore_index=True,
)

In [5]:
# Eyeball ten random rows of the combined table (no random_state is set, so rows differ per run).
res.sample(10)

Unnamed: 0,item,itemLabel,p,query_p
53940,Q55786185,hypogonadotropic hypogonadism-retinitis pigmen...,i_or_s,wdt:P31|wdt:P279*
124743,Q60609927,Q60609927,s1_path_i,wdt:P279?/wdt:P31
26913,Q55781948,encephalopathy due to hydroxykynureninuria,s,wdt:P279*
21245,Q734123,Asomatognosia,s,wdt:P279*
19207,Q6583504,Ribose-5-phosphate isomerase deficiency,s,wdt:P279*
112162,Q55781996,"internal carotid arteries, hypoplasia of",s1_path_i,wdt:P279?/wdt:P31
105087,Q2866610,hyposalivation,s_path_i,wdt:P279*/wdt:P31
51653,Q55782187,"pallidal degeneration, progressive, with retin...",i_or_s,wdt:P31|wdt:P279*
108516,Q21110052,temporary loss of hair,s1_path_i,wdt:P279?/wdt:P31
75643,Q18558262,lymph node disease,i_path_s,wdt:P31/wdt:P279*


In [6]:
# Number of result rows contributed by each property path.
res['query_p'].value_counts()

wdt:P31|wdt:P279*    25699
wdt:P279*/wdt:P31    25426
wdt:P279*            24789
wdt:P279?/wdt:P31    20952
wdt:P31/wdt:P279*    17131
wdt:P31              12983
Name: query_p, dtype: int64

In [8]:
# Distinct item ids returned by each property path; the comparison cells filter against these.
instance = res.loc[res['p'] == 'i', 'item'].unique()
subclass = res.loc[res['p'] == 's', 'item'].unique()
ips = res.loc[res['p'] == 'i_path_s', 'item'].unique()
spi = res.loc[res['p'] == 's_path_i', 'item'].unique()
s1pi = res.loc[res['p'] == 's1_path_i', 'item'].unique()


In [9]:
# Rows of the P279* result whose item is absent from the P31 result.
qr = res[res['p'].eq('s') & ~res['item'].isin(instance)]

print("Subclass and not Instance: {:,}".format(len(qr)))
qr.sample(10)

Subclass and not Instance: 12,716


Unnamed: 0,item,itemLabel,p,query_p
17800,Q18975593,respirovirus infectious disease,s,wdt:P279*
19640,Q55784162,cutis laxa-marfanoid syndrome,s,wdt:P279*
35061,Q28194839,Occipital epilepsy,s,wdt:P279*
30475,Q56014097,"pyruvate carboxylase deficiency, infantile form",s,wdt:P279*
20289,Q7168925,Periungual wart,s,wdt:P279*
19677,Q55787242,carbon monoxide-induced parkinsonism,s,wdt:P279*
14326,Q55780436,colonic varices without portal hypertension,s,wdt:P279*
23547,Q20730592,coffee rust,s,wdt:P279*
24392,Q2097389,Pityriasis,s,wdt:P279*
19509,Q1512812,Rotor syndrome,s,wdt:P279*


In [10]:
# Rows of the P31 result whose item is absent from the P279* result.
qr = res.query('p == "i" and item not in @subclass')

print("Instance and not Subclass: {:,}".format(len(qr)))
qr.sample(10)

Instance and not Subclass: 910


Unnamed: 0,item,itemLabel,p,query_p
10660,Q40739448,CDKL5 Disorder,i,wdt:P31
931,Q740731,Jarisch-Herxheimer reaction,i,wdt:P31
11507,Q55762556,"epilepsy, nocturnal frontal lobe",i,wdt:P31
10653,Q42404343,transtentorial herniation,i,wdt:P31
11704,Q55783367,"microcephaly 2, primary, autosomal recessive, ...",i,wdt:P31
12418,Q56070785,open fracture of an unspecified trochanteric s...,i,wdt:P31
12265,Q55785140,"aortic aneurysm, familial thoracic 10",i,wdt:P31
12954,Q69637370,Cowden syndrome 4,i,wdt:P31
12323,Q56002844,"Meckel syndrome, type 10",i,wdt:P31
3519,Q4875174,Bazex–Dupré–Christol syndrome,i,wdt:P31


In [11]:
# Sanity check: every P31 item should also be found by P31/P279* (expected count: 0).
qr = res[res['p'].eq('i') & ~res['item'].isin(ips)]

print("Instance and not P31/P279*: {:,}".format(len(qr)))

Instance and not P31/P279*: 0


In [12]:
# Sanity check: every P31 item should also be found by P279*/P31 (expected count: 0).
qr = res[res['p'].eq('i') & ~res['item'].isin(spi)]

print("Instance and not P279*/P31: {:,}".format(len(qr)))

Instance and not P279*/P31: 0


In [13]:
# Rows of the P31/P279* result whose item is absent from the P279* result.
qr = res[res['p'].eq('i_path_s') & ~res['item'].isin(subclass)]

print("P31/P279* and not Subclass: {:,}".format(len(qr)))
qr.sample(10)

P31/P279* and not Subclass: 2,550


Unnamed: 0,item,itemLabel,p,query_p
68784,Q4982380,Buccal bifurcation cyst,i_path_s,wdt:P31/wdt:P279*
79149,Q56013638,polycystic liver disease 2; PCLD2,i_path_s,wdt:P31/wdt:P279*
80345,Q3023833,Paederus dermatitis,i_path_s,wdt:P31/wdt:P279*
76096,Q53716241,mouse neuroblastoma,i_path_s,wdt:P31/wdt:P279*
69625,Q55786584,partial deletion of chromosome 19,i_path_s,wdt:P31/wdt:P279*
66133,Q1916694,medullary cystic kidney disease,i_path_s,wdt:P31/wdt:P279*
70633,Q55787679,mosaic genome-wide paternal uniparental disomy,i_path_s,wdt:P31/wdt:P279*
71497,Q55790014,"chromosome 22, monosome mosaic",i_path_s,wdt:P31/wdt:P279*
79919,Q1087749,Chromosome instability syndrome,i_path_s,wdt:P31/wdt:P279*
77969,Q53844335,midgut carcinoid tumor,i_path_s,wdt:P31/wdt:P279*


In [14]:
# Rows of the P279*/P31 result whose item is absent from the P279* result.
qr = res[res['p'].eq('s_path_i') & ~res['item'].isin(subclass)]

print("P279*/P31 and not Subclass:  {:,}".format(len(qr)))
qr.sample(10)

P279*/P31 and not Subclass:  1,166


Unnamed: 0,item,itemLabel,p,query_p
83066,Q2060353,Bulbar palsy,s_path_i,wdt:P279*/wdt:P31
105246,Q7535419,skin pop scar,s_path_i,wdt:P279*/wdt:P31
92718,Q55785029,"intellectual disability, autosomal recessive 52",s_path_i,wdt:P279*/wdt:P31
92112,Q55780299,pyruvate kinase hyperactivity,s_path_i,wdt:P279*/wdt:P31
93782,Q56027188,hood symptom,s_path_i,wdt:P279*/wdt:P31
83346,Q3150707,osteoarticular infection,s_path_i,wdt:P279*/wdt:P31
80759,Q33121,serum sickness,s_path_i,wdt:P279*/wdt:P31
105682,Q6556287,Lipedematous alopecia,s_path_i,wdt:P279*/wdt:P31
93069,Q56014255,Bosley-Salih-Alorainy syndrome,s_path_i,wdt:P279*/wdt:P31
82439,Q1363741,erythroderma,s_path_i,wdt:P279*/wdt:P31


In [15]:
# Rows of the P31/P279* result whose item is absent from the P31 result.
qr = res[res['p'].eq('i_path_s') & ~res['item'].isin(instance)]

print("P31/P279* and not Instance: {:,}".format(len(qr)))
qr.sample(10)

P31/P279* and not Instance: 4,148


Unnamed: 0,item,itemLabel,p,query_p
70389,Q1383693,exercise-induced collapse,i_path_s,wdt:P31/wdt:P279*
69216,Q55785608,FASTKD2-related infantile mitochondrial enceph...,i_path_s,wdt:P31/wdt:P279*
70667,Q55787714,trigonocephaly-broad thumbs syndrome,i_path_s,wdt:P31/wdt:P279*
70079,Q12042146,avian ochratoxicosis,i_path_s,wdt:P31/wdt:P279*
69623,Q55786567,Okihiro syndrome due to a point mutation,i_path_s,wdt:P31/wdt:P279*
80316,Q2257746,head for heights,i_path_s,wdt:P31/wdt:P279*
70360,Q1347819,Pituitary pars intermedia dysfunction,i_path_s,wdt:P31/wdt:P279*
68093,Q55782429,X-linked myotubular myopathy-abnormal genitali...,i_path_s,wdt:P31/wdt:P279*
67843,Q55781775,craniofacial dyssynostosis,i_path_s,wdt:P31/wdt:P279*
80506,Q1034711,Heberden's node,i_path_s,wdt:P31/wdt:P279*


In [16]:
# Rows of the P31/P279* result whose item appears in neither the P31 nor the P279* result.
qr = res.query('p == "i_path_s" and item not in @instance and item not in @subclass')

print("P31/P279* and not Instance or Subclass: {:,}".format(len(qr)))
qr.sample(10)

P31/P279* and not Instance or Subclass: 1,640


Unnamed: 0,item,itemLabel,p,query_p
71520,Q55789792,acrofacial dysostosis preis type,i_path_s,wdt:P31/wdt:P279*
70704,Q4951681,Boxer cardiomyopathy,i_path_s,wdt:P31/wdt:P279*
71360,Q55788996,paternal uniparental disomy of chromosome 6,i_path_s,wdt:P31/wdt:P279*
80343,Q2260565,broiler ascites syndrome,i_path_s,wdt:P31/wdt:P279*
71239,Q749342,noble rot,i_path_s,wdt:P31/wdt:P279*
69805,Q4747466,amoebic gill disease,i_path_s,wdt:P31/wdt:P279*
79954,Q51001240,White rust of chrysanthemum,i_path_s,wdt:P31/wdt:P279*
75819,Q1525945,Giacomo and Giovanni Battista Tocci,i_path_s,wdt:P31/wdt:P279*
80331,Q16708635,Q16708635,i_path_s,wdt:P31/wdt:P279*
79915,Q2364376,Bletting,i_path_s,wdt:P31/wdt:P279*


In [17]:
# Rows of the P279*/P31 result whose item is absent from the P31 result.
qr = res[res['p'].eq('s_path_i') & ~res['item'].isin(instance)]

print("P279*/P31 and not Instance: {:,}".format(len(qr)))
qr.sample(10)

P279*/P31 and not Instance: 12,443


Unnamed: 0,item,itemLabel,p,query_p
96888,Q55785708,non-rhizomelic chondrodysplasia punctata,s_path_i,wdt:P279*/wdt:P31
99119,Q55788077,partial corpus callosum agenesis-cerebellar ve...,s_path_i,wdt:P279*/wdt:P31
97296,Q3144528,heboidophrenia,s_path_i,wdt:P279*/wdt:P31
104418,Q55789752,eyelid seborrheic keratosis,s_path_i,wdt:P279*/wdt:P31
99088,Q55781702,"camptodactyly syndrome, Guadalajara type 1",s_path_i,wdt:P279*/wdt:P31
94764,Q55781344,"Parotidomegaly, hereditary bilateral",s_path_i,wdt:P279*/wdt:P31
97275,Q55785456,cerebrofacial arteriovenous metameric syndrome...,s_path_i,wdt:P279*/wdt:P31
104639,Q29589522,salmon-colored rash,s_path_i,wdt:P279*/wdt:P31
104020,Q30150924,Sensorimotor neuropathy,s_path_i,wdt:P279*/wdt:P31
95979,Q55782556,hemophilia A with vascular abnormality,s_path_i,wdt:P279*/wdt:P31


In [18]:
# Rows of the P279*/P31 result whose item is absent from the P279* result.
qr = res[res['p'].eq('s_path_i') & ~res['item'].isin(subclass)]

print("P279*/P31 and not Subclass: {:,}".format(len(qr)))
qr.sample(10)

P279*/P31 and not Subclass: 1,166


Unnamed: 0,item,itemLabel,p,query_p
105559,Q25464242,cryptogenic drop attacks,s_path_i,wdt:P279*/wdt:P31
82171,Q991897,sacbrood,s_path_i,wdt:P279*/wdt:P31
93510,Q67247674,Q67247674,s_path_i,wdt:P279*/wdt:P31
92517,Q55784385,C3 glomerulonephritis,s_path_i,wdt:P279*/wdt:P31
92210,Q55782505,Diamond-Blackfan anemia 14 with mandibulofacia...,s_path_i,wdt:P279*/wdt:P31
91723,Q53996650,carbonic anhydrase I deficiency,s_path_i,wdt:P279*/wdt:P31
93070,Q56014325,hypotrichosis simplex of the scalp,s_path_i,wdt:P279*/wdt:P31
92637,Q55784924,cone-rod dystrophy 21,s_path_i,wdt:P279*/wdt:P31
83191,Q2853082,anthropozoonosis,s_path_i,wdt:P279*/wdt:P31
84471,Q5396513,Erythrotelangiectatic rosacea,s_path_i,wdt:P279*/wdt:P31


In [19]:
# Rows of the P279*/P31 result whose item appears in neither the P279* nor the P31 result.
qr = res.query('p == "s_path_i" and item not in @subclass and item not in @instance')

print("P279*/P31 and not Subclass or Instance: {:,}".format(len(qr)))
qr.sample(10)

P279*/P31 and not Subclass or Instance: 256


Unnamed: 0,item,itemLabel,p,query_p
98718,Q55785478,ring chromosome 19,s_path_i,wdt:P279*/wdt:P31
105662,Q7049209,Noncicatricial alopecia,s_path_i,wdt:P279*/wdt:P31
105902,Q4331898,millipede burn,s_path_i,wdt:P279*/wdt:P31
93851,Q193840,asphyxia,s_path_i,wdt:P279*/wdt:P31
98775,Q55790289,giant mammary hamartoma,s_path_i,wdt:P279*/wdt:P31
103138,Q945225,ventricular hypertrophy,s_path_i,wdt:P279*/wdt:P31
105886,Q55784012,ectodermal dysplasia-cutaneous syndactyly synd...,s_path_i,wdt:P279*/wdt:P31
94213,Q48207191,Urogenital fistula,s_path_i,wdt:P279*/wdt:P31
98724,Q55785477,ring chromosome 17,s_path_i,wdt:P279*/wdt:P31
105275,Q7240506,Premorbidity,s_path_i,wdt:P279*/wdt:P31


In [20]:
# Rows of the P279*/P31 result whose item is absent from the P31/P279* result.
qr = res[res['p'].eq('s_path_i') & ~res['item'].isin(ips)]

print("P279*/P31 and not P31/P279*: {:,}".format(len(qr)))
qr.sample(10)

P279*/P31 and not P31/P279*: 9,955


Unnamed: 0,item,itemLabel,p,query_p
101490,Q5280344,Direct inguinal hernia,s_path_i,wdt:P279*/wdt:P31
96553,Q55999634,preaxial Hallucal polydactyly,s_path_i,wdt:P279*/wdt:P31
102804,Q25091271,Birth injury,s_path_i,wdt:P279*/wdt:P31
99967,Q66124188,spinocerebellar degenerations,s_path_i,wdt:P279*/wdt:P31
101193,Q55790454,genetic motor neuron disease,s_path_i,wdt:P279*/wdt:P31
104462,Q18975385,Human T-lymphotropic virus 1 infectious disease,s_path_i,wdt:P279*/wdt:P31
98964,Q6692784,Low-set ears,s_path_i,wdt:P279*/wdt:P31
102863,Q18966637,renal syphilis,s_path_i,wdt:P279*/wdt:P31
104870,Q55780397,neurofibromatosis type 6,s_path_i,wdt:P279*/wdt:P31
101577,Q3958669,sphérophakia,s_path_i,wdt:P279*/wdt:P31


In [21]:
# Rows of the P31/P279* result whose item is absent from the P279*/P31 result.
qr = res[res['p'].eq('i_path_s') & ~res['item'].isin(spi)]

print("P31/P279* and not P279*/P31: {:,}".format(len(qr)))
qr.sample(10)

P31/P279* and not P279*/P31: 1,660


Unnamed: 0,item,itemLabel,p,query_p
68092,Q55782398,"whistling face syndrome, recessive form",i_path_s,wdt:P31/wdt:P279*
69260,Q55785913,congenital aortopulmonary window,i_path_s,wdt:P31/wdt:P279*
64971,Q366886,psychopathy,i_path_s,wdt:P31/wdt:P279*
76295,Q53844702,rat endometrial stromal sarcoma,i_path_s,wdt:P31/wdt:P279*
79956,Q1679678,hydrops fetalis,i_path_s,wdt:P31/wdt:P279*
69581,Q55786614,partial deletion of the long arm of chromosome 14,i_path_s,wdt:P31/wdt:P279*
70049,Q12049343,Q12049343,i_path_s,wdt:P31/wdt:P279*
75812,Q2290285,Sisi syndrome,i_path_s,wdt:P31/wdt:P279*
76301,Q53869257,mouse gastric carcinoma,i_path_s,wdt:P31/wdt:P279*
76292,Q53843927,mouse skin hemangioma,i_path_s,wdt:P31/wdt:P279*


In [22]:
# Rows of the P31/P279* result whose item is absent from the P279?/P31 result
# (`s1pi` is the at-most-one-subclass-hop set, so the label names P279?, not P279*).
qr = res.query('p == "i_path_s" and item not in @s1pi')

print("P31/P279* and not P279?/P31: {:,}".format(len(qr)))
qr.sample(10)

P31/P279* and not P279?/P31: 3,034


Unnamed: 0,item,itemLabel,p,query_p
79700,Q1797858,LAMB Syndrome,i_path_s,wdt:P31/wdt:P279*
80594,Q54911773,P388 leukemia,i_path_s,wdt:P31/wdt:P279*
70396,Q53678342,budgerigar fibrosarcoma,i_path_s,wdt:P31/wdt:P279*
67687,Q55780856,cavernous hemangiomas of face-supraumbilical m...,i_path_s,wdt:P31/wdt:P279*
69552,Q55786454,mitochondrial oxidative phosphorylation disord...,i_path_s,wdt:P31/wdt:P279*
69487,Q55786336,Melhem-Fahl syndrome,i_path_s,wdt:P31/wdt:P279*
80596,Q41792180,Swedish pregnancy category 3,i_path_s,wdt:P31/wdt:P279*
67277,Q51728071,Edinburgh malformation syndrome,i_path_s,wdt:P31/wdt:P279*
69146,Q1024563,CVBD,i_path_s,wdt:P31/wdt:P279*
73766,Q55789178,malignant tumor of palpebral epidermis,i_path_s,wdt:P31/wdt:P279*


In [23]:
# Rows of the P279?/P31 result whose item is absent from the P31/P279* result.
qr = res[res['p'].eq('s1_path_i') & ~res['item'].isin(ips)]

print("P279?/P31 and not P31/P279*: {:,}".format(len(qr)))
qr.sample(10)

P279?/P31 and not P31/P279*: 6,855


Unnamed: 0,item,itemLabel,p,query_p
126972,Q55788365,autosomal recessive limb-girdle muscular dystr...,s1_path_i,wdt:P279?/wdt:P31
122862,Q55785261,rare metabolic liver disease,s1_path_i,wdt:P279?/wdt:P31
112520,Q5260908,Depression in childhood and adolescence,s1_path_i,wdt:P279?/wdt:P31
109672,Q48996667,Premature greying of hair,s1_path_i,wdt:P279?/wdt:P31
107292,Q5684276,bronchial adenoma,s1_path_i,wdt:P279?/wdt:P31
113282,Q1413991,reperfusion injury,s1_path_i,wdt:P279?/wdt:P31
111099,Q18967487,transsexuality with heterosexual history,s1_path_i,wdt:P279?/wdt:P31
125228,Q55789948,CDG syndrome type 4,s1_path_i,wdt:P279?/wdt:P31
118183,Q55785220,neurogenic thoracic outlet syndrome,s1_path_i,wdt:P279?/wdt:P31
125345,Q55788544,inborn disorder of fatty acid oxidation and ke...,s1_path_i,wdt:P279?/wdt:P31


In [24]:
# Rows of the P279*/P31 result whose item is absent from the P279?/P31 result.
qr = res[res['p'].eq('s_path_i') & ~res['item'].isin(s1pi)]

print("P279*/P31 and not P279?/P31: {:,}".format(len(qr)))
qr.sample(10)

P279*/P31 and not P279?/P31: 4,474


Unnamed: 0,item,itemLabel,p,query_p
104209,Q56014291,florid cemento-osseous dysplasia,s_path_i,wdt:P279*/wdt:P31
96384,Q55786076,rare lymphatic system malformation,s_path_i,wdt:P279*/wdt:P31
103210,Q2086658,Ghon's complex,s_path_i,wdt:P279*/wdt:P31
104213,Q5196700,Cutaneous lupus mucinosis,s_path_i,wdt:P279*/wdt:P31
103599,Q2276279,Hypermenorrhoea,s_path_i,wdt:P279*/wdt:P31
105201,Q4162067,diseases caused by Diptera,s_path_i,wdt:P279*/wdt:P31
105543,Q5597010,grapefruit drug interactions,s_path_i,wdt:P279*/wdt:P31
96573,Q55785396,cataract-intellectual disability-anal atresia-...,s_path_i,wdt:P279*/wdt:P31
103548,Q55789641,benign neoplasm of pleura,s_path_i,wdt:P279*/wdt:P31
102753,Q16526734,pulmonary agenesis,s_path_i,wdt:P279*/wdt:P31


In [26]:
# Number of distinct items found by either of the two combined paths.
res.loc[res['p'].isin(['s_path_i', 'i_path_s']), 'item'].nunique()

27086

In [27]:
# Write the combined table to a CSV in the working directory, without the integer index column.
res.to_csv('disease_path_results.csv', index=False)