# Angelman Syndrome TF-IDF calculation

The code below walks through the calculation of a TF-IDF score using just one disease, Angelman syndrome. It outlines the process of getting the phenotypes of Angelman syndrome and their respective weights. These are then used to calculate a tf, idf, and tf-idf score for each phenotype. 

The latter half of the code goes through the same process with a phenotype of Angelman syndrome (cerebral cortex atrophy). 

In [2]:
import pandas as pd
from py2neo import Graph
from py2neo import Node

In [5]:
from py2neo import Database

# Bolt endpoint of the disease graph; every Cypher query in this notebook runs
# against the default graph of this database.
bolt_uri = "bolt://disease.ncats.io:80"
db = Database(bolt_uri)
graph = db.default_graph

In [6]:
from matplotlib import pyplot as plt

In [40]:
import numpy as np

In [7]:
# Per-node weight table (columns Node_ID and Weight); it is joined to the query
# results below on the Neo4j node id.
weights_csv = "page_rank_weights.csv"
weights = pd.read_csv(weights_csv)

### Angelman Syndrome Phenotype Table 

In [9]:
# Cypher: take the S_GARD node whose linked DATA node has id 5810 (presumably the
# GARD id of Angelman syndrome -- confirm), follow its 'has_phenotype'
# relationships to S_HP phenotype nodes, and return each phenotype's Neo4j node
# id, name list, code list and the id of its own DATA node.
query = """match (n:S_GARD)--(a:DATA) with n, a 
            match p =(n)-[:R_rel{name: 'has_phenotype'}]-(h:S_HP)--(d:DATA) 
            where a.id = 5810 return ID(h), h.N_Name, h.I_CODE, d.id"""
# Run the query and collect the returned records; the next cell turns them
# into a DataFrame.
d = graph.run(query).data()

In [10]:
# One row per phenotype record returned by the query above.
angelman_df = pd.DataFrame(d)
angelman_df.head(10)

Unnamed: 0,ID(h),h.N_Name,h.I_CODE,d.id
0,119957,"[HYPERTROPHY OF THE TONGUE, ABNORMALLY LARGE T...","[HP:0000203, UMLS:C0024421, SNOMEDCT_US:252730...",HP:0000158
1,116766,[SLEEP-WAKE CYCLE DISTURBANCE],"[UMLS:C1833362, HP:0006979]",HP:0006979
2,66781,"[LOW OR WEAK MUSCLE TONE, MUSCLE HYPOTONIA, MU...","[HP:0001318, SNOMEDCT_US:398152000, UMLS:C0026...",HP:0001252
3,122798,"[OBESITY, HAVING TOO MUCH BODY FAT]","[SNOMEDCT_US:414916001, SNOMEDCT_US:414915002,...",HP:0001513
4,105171,"[PAROXYSMAL LAUGHTER, PAROXYSMAL BURSTS OF LAU...","[UMLS:C1839749, HP:0000749]",HP:0000749
5,68784,"[ABNORMAL CURVING OF THE SPINE, SCOLIOSIS]","[HP:0003317, HP:0003415, HP:0003303, HP:000277...",HP:0002650
6,81475,"[OUTWARD FACING EYE BALL, EXOTROPIA]","[HP:0008033, UMLS:C0015310, MESH:D005099, SNOM...",HP:0000577
7,56758,"[ELECTROENCEPHALOGRAM ABNORMAL, ABNORMAL EEG, ...","[HP:0002429, HP:0001346, HP:0006841, UMLS:C015...",HP:0002353
8,120003,"[LARGE MOUTH, MACROSTOMIA, BROAD MOUTH, WIDE M...","[HP:0002052, HP:0000181, MESH:D008265, SNOMEDC...",HP:0000154
9,110297,"[LIMB TREMOR, INVOLUNTARY SHAKING OF LIMB, TRE...","[MESH:D014202, UMLS:C0235081, HP:0200085]",HP:0200085


In [11]:
# Attach each phenotype's weight by matching its Neo4j node id to Node_ID.
# The inner join keeps only phenotypes that have an entry in the weights table.
angelman_weights = angelman_df.merge(
    weights,
    how="inner",
    left_on="ID(h)",
    right_on="Node_ID",
)

### Angelman Syndrome Phenotype Weights

In [12]:
# Preview the first ten phenotype rows together with their merged weights.
angelman_weights.head(10)

Unnamed: 0,ID(h),h.N_Name,h.I_CODE,d.id,Node_ID,Weight
0,119957,"[HYPERTROPHY OF THE TONGUE, ABNORMALLY LARGE T...","[HP:0000203, UMLS:C0024421, SNOMEDCT_US:252730...",HP:0000158,119957,0.000101
1,116766,[SLEEP-WAKE CYCLE DISTURBANCE],"[UMLS:C1833362, HP:0006979]",HP:0006979,116766,3.6e-05
2,66781,"[LOW OR WEAK MUSCLE TONE, MUSCLE HYPOTONIA, MU...","[HP:0001318, SNOMEDCT_US:398152000, UMLS:C0026...",HP:0001252,66781,0.000595
3,122798,"[OBESITY, HAVING TOO MUCH BODY FAT]","[SNOMEDCT_US:414916001, SNOMEDCT_US:414915002,...",HP:0001513,122798,0.000148
4,105171,"[PAROXYSMAL LAUGHTER, PAROXYSMAL BURSTS OF LAU...","[UMLS:C1839749, HP:0000749]",HP:0000749,105171,1.7e-05
5,68784,"[ABNORMAL CURVING OF THE SPINE, SCOLIOSIS]","[HP:0003317, HP:0003415, HP:0003303, HP:000277...",HP:0002650,68784,0.000634
6,81475,"[OUTWARD FACING EYE BALL, EXOTROPIA]","[HP:0008033, UMLS:C0015310, MESH:D005099, SNOM...",HP:0000577,81475,0.000108
7,56758,"[ELECTROENCEPHALOGRAM ABNORMAL, ABNORMAL EEG, ...","[HP:0002429, HP:0001346, HP:0006841, UMLS:C015...",HP:0002353,56758,0.000208
8,120003,"[LARGE MOUTH, MACROSTOMIA, BROAD MOUTH, WIDE M...","[HP:0002052, HP:0000181, MESH:D008265, SNOMEDC...",HP:0000154,120003,0.000113
9,110297,"[LIMB TREMOR, INVOLUNTARY SHAKING OF LIMB, TRE...","[MESH:D014202, UMLS:C0235081, HP:0200085]",HP:0200085,110297,2e-05


In [13]:
# (rows, columns) of the merged table, i.e. how many phenotypes survived the inner join.
angelman_weights.shape

(42, 6)

In [14]:
# Node_ID duplicates ID(h) after the merge, so remove it.
# errors="ignore" keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the column is already gone.
angelman_weights = angelman_weights.drop(columns=["Node_ID"], errors="ignore")

In [15]:
# Display the phenotypes ranked from highest to lowest weight (not stored back).
angelman_weights.sort_values(by="Weight", ascending=False)

Unnamed: 0,ID(h),h.N_Name,h.I_CODE,d.id,Weight
33,89629,"[AUTOSOMAL DOMINANT, AUTOSOMAL DOMINANT TYPE, ...","[HP:0001463, HP:0001447, HP:0001456, HP:000145...",HP:0000006,0.002004
37,66797,"[SEIZURE, SEIZURES, EPILEPSY]","[HP:0002466, HP:0001303, HP:0002479, HP:000127...",HP:0001250,0.001065
11,66485,"[RETARDED DEVELOPMENT, LACK OF PSYCHOMOTOR DEV...","[HP:0007228, HP:0007174, HP:0001255, HP:000710...",HP:0001263,0.000848
5,68784,"[ABNORMAL CURVING OF THE SPINE, SCOLIOSIS]","[HP:0003317, HP:0003415, HP:0003303, HP:000277...",HP:0002650,0.000634
29,73241,"[INVOLUNTARY, RAPID, RHYTHMIC EYE MOVEMENTS, N...","[UMLS:C0028738, MESH:D009759, SNOMEDCT_US:5630...",HP:0000639,0.000611
2,66781,"[LOW OR WEAK MUSCLE TONE, MUSCLE HYPOTONIA, MU...","[HP:0001318, SNOMEDCT_US:398152000, UMLS:C0026...",HP:0001252,0.000595
34,65644,"[HYPOTONIA, GENERALIZED, GENERALIZED MUSCULAR ...","[UMLS:C1858120, HP:0001290]",HP:0001290,0.000583
16,50210,"[SQUINT, SQUINT EYES, CROSS-EYED, STRABISMUS]","[HP:0000487, UMLS:C0038379, SNOMEDCT_US:220660...",HP:0000486,0.000548
41,99759,"[INCREASED DEEP TENDON REFLEXES, INCREASED REF...","[HP:0001282, HP:0007318, HP:0006820, HP:000718...",HP:0001347,0.000331
32,66247,"[MOTOR RETARDATION, MOTOR DEVELOPMENTAL DELAY,...","[HP:0002130, HP:0007219, HP:0001307, HP:000678...",HP:0001270,0.000285


In [25]:
# Cypher: start from the S_HP phenotype node with Neo4j id 117196 (presumably
# cerebral cortex atrophy, per the notebook introduction -- confirm), follow
# 'has_phenotype' relationships to S_GARD disease nodes, and return each
# disease's Neo4j node id, name list and code list.
query = """match (h:S_HP)--(:DATA) with h 
        match p =(h)-[:R_rel{name: 'has_phenotype'}]-(n:S_GARD)--(d:DATA) 
        where ID(h) = 117196 return ID(n), n.N_Name, n.I_CODE"""
# Run the query and collect the returned records; note this overwrites the `d`
# produced by the earlier Angelman phenotype query.
d = graph.run(query).data()

In [26]:
# One row per disease record returned by the phenotype query above.
cortex_atrophy_df = pd.DataFrame(d)
cortex_atrophy_df.head()

Unnamed: 0,ID(n),n.N_Name,n.I_CODE
0,12320,[MALIGNANT MIGRATING PARTIAL SEIZURES OF INFAN...,"[GARD:0012919, OMIM:614959, ORPHA:293181, ORPH..."
1,11740,"[RFT1-CDG (CDG-IN), CDG SYNDROME TYPE IN, CONG...","[GARD:0012394, OMIM:612015, ORPHA:244310, ORPH..."
2,12577,"[INFANTILE CEREBELLAR RETINAL DEGENERATION, IC...","[GARD:0013264, OMIM:614559, ORPHA:313850, ORPH..."
3,2194,"[ENCEPHALOCRANIOCUTANEOUS LIPOMATOSIS, ECCL, F...","[GARD:0002108, UMLS:C0406612, ORPHA:2396, ORPH..."
4,11738,"[DOLK-CDG (CDG-IM), CONGENITAL DISORDER OF GLY...","[GARD:0012393, OMIM:610768, ORPHA:91131, ORPHA..."


In [27]:
# Attach each disease's weight by matching its Neo4j node id to Node_ID.
# The inner join keeps only diseases that have an entry in the weights table.
cortex_atrophy_weights = cortex_atrophy_df.merge(
    weights,
    how="inner",
    left_on="ID(n)",
    right_on="Node_ID",
)
cortex_atrophy_weights.head()

Unnamed: 0,ID(n),n.N_Name,n.I_CODE,Node_ID,Weight
0,12320,[MALIGNANT MIGRATING PARTIAL SEIZURES OF INFAN...,"[GARD:0012919, OMIM:614959, ORPHA:293181, ORPH...",12320,3.4e-05
1,11740,"[RFT1-CDG (CDG-IN), CDG SYNDROME TYPE IN, CONG...","[GARD:0012394, OMIM:612015, ORPHA:244310, ORPH...",11740,8.4e-05
2,12577,"[INFANTILE CEREBELLAR RETINAL DEGENERATION, IC...","[GARD:0013264, OMIM:614559, ORPHA:313850, ORPH...",12577,4.2e-05
3,2194,"[ENCEPHALOCRANIOCUTANEOUS LIPOMATOSIS, ECCL, F...","[GARD:0002108, UMLS:C0406612, ORPHA:2396, ORPH...",2194,0.000106
4,11738,"[DOLK-CDG (CDG-IM), CONGENITAL DISORDER OF GLY...","[GARD:0012393, OMIM:610768, ORPHA:91131, ORPHA...",11738,5.5e-05


### Cerebral Cortex Atrophy Disease Weights

In [31]:
# Node_ID duplicates ID(n) after the merge, so remove it.
# errors="ignore" keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the column is already gone.
cortex_atrophy_weights = cortex_atrophy_weights.drop(columns=["Node_ID"], errors="ignore")
# Display the diseases ranked from highest to lowest weight (not stored back).
cortex_atrophy_weights.sort_values(by="Weight", ascending=False)

Unnamed: 0,ID(n),n.N_Name,n.I_CODE,Weight
20,7305,"[WILLIAMS SYNDROME, WILLIAMS-BEUREN SYNDROME, ...","[GARD:0007891, OMIM:194050, ORPHA:904, ORPHANE...",0.000452
77,9633,"[CORNELIA DE LANGE SYNDROME, BRACHMANN DE LANG...","[GARD:0010109, OMIM:122470, ORPHA:199, ORPHANE...",0.000267
9,6541,[MITOCHONDRIAL ENCEPHALOMYOPATHY LACTIC ACIDOS...,"[GARD:0007009, OMIM:540000, ORPHA:550, ORPHANE...",0.000250
18,8094,"[CARDIOFACIOCUTANEOUS SYNDROME, CFC SYNDROME, ...","[GARD:0009146, OMIM:115150, ORPHA:1340, ORPHAN...",0.000231
5,7610,"[PETERS PLUS SYNDROME, PETERS ANOMALY WITH SHO...","[GARD:0008422, OMIM:261540, ORPHA:709, ORPHANE...",0.000226
...,...,...,...,...
44,12782,"[EARLY INFANTILE EPILEPTIC ENCEPHALOPATHY-64, ...","[GARD:0013681, OMIM:618004]",0.000039
74,3541,"[MILLER-DIEKER SYNDROME, MILLER-DIEKER LISSENC...","[GARD:0003669, OMIM:247200, ORPHA:531, ORPHANE...",0.000039
0,12320,[MALIGNANT MIGRATING PARTIAL SEIZURES OF INFAN...,"[GARD:0012919, OMIM:614959, ORPHA:293181, ORPH...",0.000034
60,12765,[CHILDHOOD-ONSET NEURODEGENERATION WITH BRAIN ...,"[GARD:0013658, OMIM:617672, ORPHA:500180, ORPH...",0.000031


## TF-IDF for disease and phenotype weights

### tf(d,f) for phenotypes in Angelman syndrome

In [21]:
#get the weight value for angelman syndrome
# 5403 is the Node_ID used for Angelman syndrome in the weights table.
angelman_node_id = 5403
select_disease = weights.loc[weights["Node_ID"] == angelman_node_id]
print(select_disease)
# Read the weight from the row selected by Node_ID instead of a hard-coded row
# position: a position such as 7574 silently points at a different node as soon
# as the weights file is re-generated, re-sorted or filtered.
if select_disease.empty:
    raise ValueError(f"Node_ID {angelman_node_id} not found in the weights table")
angelman_val = select_disease["Weight"].iloc[0]
print(angelman_val)

      Node_ID    Weight
7574     5403  0.000083
8.290000000000001e-05


In [23]:
#tf(d,f) column: each phenotype weight scaled by the angelman weight,
#i.e. tf(d,f) = phenotype weight * angelman weight
angelman_weights["tf(d,f)"] = angelman_weights["Weight"].mul(angelman_val)
angelman_weights.head(10)

Unnamed: 0,ID(h),h.N_Name,h.I_CODE,d.id,Weight,"tf(d,f)"
0,119957,"[HYPERTROPHY OF THE TONGUE, ABNORMALLY LARGE T...","[HP:0000203, UMLS:C0024421, SNOMEDCT_US:252730...",HP:0000158,0.000101,8.352092e-09
1,116766,[SLEEP-WAKE CYCLE DISTURBANCE],"[UMLS:C1833362, HP:0006979]",HP:0006979,3.6e-05,2.9844e-09
2,66781,"[LOW OR WEAK MUSCLE TONE, MUSCLE HYPOTONIA, MU...","[HP:0001318, SNOMEDCT_US:398152000, UMLS:C0026...",HP:0001252,0.000595,4.930527e-08
3,122798,"[OBESITY, HAVING TOO MUCH BODY FAT]","[SNOMEDCT_US:414916001, SNOMEDCT_US:414915002,...",HP:0001513,0.000148,1.23007e-08
4,105171,"[PAROXYSMAL LAUGHTER, PAROXYSMAL BURSTS OF LAU...","[UMLS:C1839749, HP:0000749]",HP:0000749,1.7e-05,1.39272e-09
5,68784,"[ABNORMAL CURVING OF THE SPINE, SCOLIOSIS]","[HP:0003317, HP:0003415, HP:0003303, HP:000277...",HP:0002650,0.000634,5.253e-08
6,81475,"[OUTWARD FACING EYE BALL, EXOTROPIA]","[HP:0008033, UMLS:C0015310, MESH:D005099, SNOM...",HP:0000577,0.000108,8.98379e-09
7,56758,"[ELECTROENCEPHALOGRAM ABNORMAL, ABNORMAL EEG, ...","[HP:0002429, HP:0001346, HP:0006841, UMLS:C015...",HP:0002353,0.000208,1.724511e-08
8,120003,"[LARGE MOUTH, MACROSTOMIA, BROAD MOUTH, WIDE M...","[HP:0002052, HP:0000181, MESH:D008265, SNOMEDC...",HP:0000154,0.000113,9.367368e-09
9,110297,"[LIMB TREMOR, INVOLUNTARY SHAKING OF LIMB, TRE...","[MESH:D014202, UMLS:C0235081, HP:0200085]",HP:0200085,2e-05,1.63313e-09


### tf-idf for phenotypes in Angelman syndrome

In [26]:
#inputs for the idf scores
#idf is the log of (total number of diseases / number of diseases with the specific phenotype),
#so load the disease table and the phenotype table that provide those two counts
# NOTE(review): these paths point into one user's Desktop; confirm the location
# before running on another machine.
gard_distribution_csv = "~/Desktop/neo4j-disease/GARD_Phenotype_Distribution.csv"
phenotype_distribution_csv = "~/Desktop/neo4j-disease/Phenotype_GARD_Distribution.csv"
gard_diseases = pd.read_csv(gard_distribution_csv)
phenotypes = pd.read_csv(phenotype_distribution_csv)

In [35]:
# Row count of the GARD disease table, used as the total number of diseases in
# the idf ratio (assumes one row per disease -- confirm against the CSV).
total_d = len(gard_diseases)

In [34]:
#test looking up the disease count for a single phenotype
is_target_phenotype = phenotypes["d.id"] == "HP:0000749"
select_phenotype = phenotypes.loc[is_target_phenotype]
select_phenotype
#4 diseases associated with this phenotype

Unnamed: 0.1,Unnamed: 0,d.id,n.N_Name,count(s)
525,525,HP:0000749,"['PAROXYSMAL LAUGHTER', 'PAROXYSMAL BURSTS OF ...",4


In [36]:
#attach count(s), the number of associated diseases per phenotype, by joining
#on the d.id column that both tables share
angelman_weights = angelman_weights.merge(phenotypes, how="inner", on="d.id")
angelman_weights.head()

Unnamed: 0.1,ID(h),h.N_Name,h.I_CODE,d.id,Weight,"tf(d,f)",Unnamed: 0,n.N_Name,count(s)
0,119957,"[HYPERTROPHY OF THE TONGUE, ABNORMALLY LARGE T...","[HP:0000203, UMLS:C0024421, SNOMEDCT_US:252730...",HP:0000158,0.000101,8.352092e-09,520,"['HYPERTROPHY OF THE TONGUE', 'ABNORMALLY LARG...",39
1,116766,[SLEEP-WAKE CYCLE DISTURBANCE],"[UMLS:C1833362, HP:0006979]",HP:0006979,3.6e-05,2.9844e-09,4343,['SLEEP-WAKE CYCLE DISTURBANCE'],4
2,66781,"[LOW OR WEAK MUSCLE TONE, MUSCLE HYPOTONIA, MU...","[HP:0001318, SNOMEDCT_US:398152000, UMLS:C0026...",HP:0001252,0.000595,4.930527e-08,64,"['LOW OR WEAK MUSCLE TONE', 'MUSCLE HYPOTONIA'...",300
3,122798,"[OBESITY, HAVING TOO MUCH BODY FAT]","[SNOMEDCT_US:414916001, SNOMEDCT_US:414915002,...",HP:0001513,0.000148,1.23007e-08,548,"['OBESITY', 'HAVING TOO MUCH BODY FAT']",63
4,105171,"[PAROXYSMAL LAUGHTER, PAROXYSMAL BURSTS OF LAU...","[UMLS:C1839749, HP:0000749]",HP:0000749,1.7e-05,1.39272e-09,525,"['PAROXYSMAL LAUGHTER', 'PAROXYSMAL BURSTS OF ...",4


In [38]:
# Remove the two columns the phenotype merge brought along that are not needed
# for scoring: the saved CSV index ("Unnamed: 0") and the duplicate name list.
# errors="ignore" keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the columns are already gone.
angelman_weights = angelman_weights.drop(
    columns=["Unnamed: 0", "n.N_Name"],
    errors="ignore",
)
angelman_weights.head()

Unnamed: 0,ID(h),h.N_Name,h.I_CODE,d.id,Weight,"tf(d,f)",count(s)
0,119957,"[HYPERTROPHY OF THE TONGUE, ABNORMALLY LARGE T...","[HP:0000203, UMLS:C0024421, SNOMEDCT_US:252730...",HP:0000158,0.000101,8.352092e-09,39
1,116766,[SLEEP-WAKE CYCLE DISTURBANCE],"[UMLS:C1833362, HP:0006979]",HP:0006979,3.6e-05,2.9844e-09,4
2,66781,"[LOW OR WEAK MUSCLE TONE, MUSCLE HYPOTONIA, MU...","[HP:0001318, SNOMEDCT_US:398152000, UMLS:C0026...",HP:0001252,0.000595,4.930527e-08,300
3,122798,"[OBESITY, HAVING TOO MUCH BODY FAT]","[SNOMEDCT_US:414916001, SNOMEDCT_US:414915002,...",HP:0001513,0.000148,1.23007e-08,63
4,105171,"[PAROXYSMAL LAUGHTER, PAROXYSMAL BURSTS OF LAU...","[UMLS:C1839749, HP:0000749]",HP:0000749,1.7e-05,1.39272e-09,4


In [45]:
#add the idf(d,f) column: the base-10 log of (total number of diseases /
#number of diseases associated with the phenotype, i.e. count(s))
angelman_weights["idf(d,f)"] = np.log10(total_d / angelman_weights["count(s)"])
angelman_weights.head()

Unnamed: 0,ID(h),h.N_Name,h.I_CODE,d.id,Weight,"tf(d,f)",count(s),"idf(d,f)"
0,119957,"[HYPERTROPHY OF THE TONGUE, ABNORMALLY LARGE T...","[HP:0000203, UMLS:C0024421, SNOMEDCT_US:252730...",HP:0000158,0.000101,8.352092e-09,39,1.761889
1,116766,[SLEEP-WAKE CYCLE DISTURBANCE],"[UMLS:C1833362, HP:0006979]",HP:0006979,3.6e-05,2.9844e-09,4,2.750894
2,66781,"[LOW OR WEAK MUSCLE TONE, MUSCLE HYPOTONIA, MU...","[HP:0001318, SNOMEDCT_US:398152000, UMLS:C0026...",HP:0001252,0.000595,4.930527e-08,300,0.875833
3,122798,"[OBESITY, HAVING TOO MUCH BODY FAT]","[SNOMEDCT_US:414916001, SNOMEDCT_US:414915002,...",HP:0001513,0.000148,1.23007e-08,63,1.553613
4,105171,"[PAROXYSMAL LAUGHTER, PAROXYSMAL BURSTS OF LAU...","[UMLS:C1839749, HP:0000749]",HP:0000749,1.7e-05,1.39272e-09,4,2.750894


In [47]:
#tf-idf column: row-wise product of the tf(d,f) and idf(d,f) columns
tf_scores = angelman_weights["tf(d,f)"]
idf_scores = angelman_weights["idf(d,f)"]
angelman_weights["tf-idf"] = tf_scores * idf_scores
angelman_weights.head(10)

Unnamed: 0,ID(h),h.N_Name,h.I_CODE,d.id,Weight,"tf(d,f)",count(s),"idf(d,f)",tf-idf
0,119957,"[HYPERTROPHY OF THE TONGUE, ABNORMALLY LARGE T...","[HP:0000203, UMLS:C0024421, SNOMEDCT_US:252730...",HP:0000158,0.000101,8.352092e-09,39,1.761889,1.471546e-08
1,116766,[SLEEP-WAKE CYCLE DISTURBANCE],"[UMLS:C1833362, HP:0006979]",HP:0006979,3.6e-05,2.9844e-09,4,2.750894,8.209768e-09
2,66781,"[LOW OR WEAK MUSCLE TONE, MUSCLE HYPOTONIA, MU...","[HP:0001318, SNOMEDCT_US:398152000, UMLS:C0026...",HP:0001252,0.000595,4.930527e-08,300,0.875833,4.318317e-08
3,122798,"[OBESITY, HAVING TOO MUCH BODY FAT]","[SNOMEDCT_US:414916001, SNOMEDCT_US:414915002,...",HP:0001513,0.000148,1.23007e-08,63,1.553613,1.911053e-08
4,105171,"[PAROXYSMAL LAUGHTER, PAROXYSMAL BURSTS OF LAU...","[UMLS:C1839749, HP:0000749]",HP:0000749,1.7e-05,1.39272e-09,4,2.750894,3.831225e-09
5,68784,"[ABNORMAL CURVING OF THE SPINE, SCOLIOSIS]","[HP:0003317, HP:0003415, HP:0003303, HP:000277...",HP:0002650,0.000634,5.253e-08,337,0.825324,4.335427e-08
6,81475,"[OUTWARD FACING EYE BALL, EXOTROPIA]","[HP:0008033, UMLS:C0015310, MESH:D005099, SNOM...",HP:0000577,0.000108,8.98379e-09,20,2.051924,1.843405e-08
7,56758,"[ELECTROENCEPHALOGRAM ABNORMAL, ABNORMAL EEG, ...","[HP:0002429, HP:0001346, HP:0006841, UMLS:C015...",HP:0002353,0.000208,1.724511e-08,71,1.501696,2.58969e-08
8,120003,"[LARGE MOUTH, MACROSTOMIA, BROAD MOUTH, WIDE M...","[HP:0002052, HP:0000181, MESH:D008265, SNOMEDC...",HP:0000154,0.000113,9.367368e-09,60,1.574803,1.475176e-08
9,110297,"[LIMB TREMOR, INVOLUNTARY SHAKING OF LIMB, TRE...","[MESH:D014202, UMLS:C0235081, HP:0200085]",HP:0200085,2e-05,1.63313e-09,2,3.051924,4.984189e-09


In [49]:
# Total of the per-phenotype tf-idf scores for Angelman syndrome.
angelman_weights["tf-idf"].sum()

8.962981015488205e-07