In [3]:
import pandas as pd
from py2neo import Graph
from py2neo import Node

In [2]:
from matplotlib import pyplot as plt
import numpy as np

In [42]:
from py2neo import Database
# Open a Bolt connection to the graph database served at disease.ncats.io and
# work against that server's default graph for every query in this notebook.
NEO4J_URI = "bolt://disease.ncats.io:80"
db = Database(NEO4J_URI)
graph = db.default_graph

In [11]:
# Per-node weights table. Later cells join it on its "Node_ID" column and read
# its "Weight" column (the file name suggests PageRank scores -- TODO confirm).
weights_csv = "page_rank_weights.csv"
weights = pd.read_csv(weights_csv)

In [6]:
# Select every S_GARD node that is attached to a DATA node and has at least one
# "has_phenotype" relationship to an S_HP node that is itself attached to a DATA
# node. Each returned record holds the node id, its N_Name and I_CODE properties
# and count(s) for that node.
# NOTE(review): count(s) counts matched rows rather than distinct S_HP nodes, so
# it would be inflated if a node has more than one DATA neighbour -- confirm
# against the graph before relying on it as a phenotype count.
query = """match (n:S_GARD)--(d:DATA)  with n, d 
            match p =(n)-[:R_rel{name:"has_phenotype"}]
            -(s:S_HP)--(:DATA) return ID(n), n.N_Name, n.I_CODE, count(s)"""
# Run the query and materialise all records; `d` is consumed by the next cell.
d = graph.run(query).data()

In [7]:
# Tabulate the disease records returned by the query (one row per record)
# and preview the first rows.
diseases = pd.DataFrame(d)
diseases.head()

Unnamed: 0,ID(n),count(s),n.I_CODE,n.N_Name
0,1,16,"[GARD:0000001, OMIM:603358, ORPHA:53693, ORPHA...","[GRACILE SYNDROME, FLNMS, FINNISH LACTIC ACIDO..."
1,3,49,"[GARD:0000003, OMIM:200110, ORPHA:920, ORPHANE...","[ABLEPHARON MACROSTOMIA SYNDROME, AMS, CONGENI..."
2,7,14,"[GARD:0000005, OMIM:200100, ORPHA:14, ORPHANET...","[ABETALIPOPROTEINEMIA, BASSEN KORNZWEIG SYNDRO..."
3,11,29,"[GARD:0000007, OMIM:102370, ORPHA:969, ORPHANE...","[ACROMICRIC DYSPLASIA, ACROMICRIC SKELETAL DYS..."
4,15,9,"[GARD:0000011, OMIM:104290]","[ALTERNATING HEMIPLEGIA OF CHILDHOOD, ALTERNAT..."


In [12]:
# Attach a weight to every disease: inner join of the graph node id ("ID(n)")
# against the weights table's "Node_ID". Diseases without a weight are dropped.
diseases = diseases.merge(
    weights,
    how="inner",
    left_on="ID(n)",
    right_on="Node_ID",
)

In [14]:
# After the join "Node_ID" carries the same values as "ID(n)", so discard it.
diseases = diseases.drop(columns="Node_ID")

In [15]:
# Preview the diseases table now that the "Weight" column has been joined in.
diseases.head()

Unnamed: 0,ID(n),count(s),n.I_CODE,n.N_Name,Weight
0,1,16,"[GARD:0000001, OMIM:603358, ORPHA:53693, ORPHA...","[GRACILE SYNDROME, FLNMS, FINNISH LACTIC ACIDO...",4.6e-05
1,3,49,"[GARD:0000003, OMIM:200110, ORPHA:920, ORPHANE...","[ABLEPHARON MACROSTOMIA SYNDROME, AMS, CONGENI...",0.000101
2,7,14,"[GARD:0000005, OMIM:200100, ORPHA:14, ORPHANET...","[ABETALIPOPROTEINEMIA, BASSEN KORNZWEIG SYNDRO...",3.7e-05
3,11,29,"[GARD:0000007, OMIM:102370, ORPHA:969, ORPHANE...","[ACROMICRIC DYSPLASIA, ACROMICRIC SKELETAL DYS...",6e-05
4,15,9,"[GARD:0000011, OMIM:104290]","[ALTERNATING HEMIPLEGIA OF CHILDHOOD, ALTERNAT...",2.3e-05


In [22]:
# Mirror of the disease query, from the phenotype side: every S_HP node that is
# attached to a DATA node and linked to at least one S_GARD node through a
# "has_phenotype" relationship. Records carry the node id, its names, the
# attached DATA node's id, its codes and count(s).
query = """match (n:S_HP)--(d:DATA) with n, d 
        match p =(n)-[:R_rel{name:"has_phenotype"}]-(s:S_GARD) 
        return ID(n), n.N_Name, d.id, n.I_CODE, count(s)"""

# Materialise the records, then tabulate them (one row per record).
d = graph.run(query).data()
phenotypes = pd.DataFrame(d)

In [23]:
# Attach a weight to every phenotype (inner join on the node id), then discard
# the join key that came from the weights table.
phenotypes = (
    phenotypes
    .merge(weights, how="inner", left_on="ID(n)", right_on="Node_ID")
    .drop(columns="Node_ID")
)

In [24]:
# Preview the phenotypes table now that the "Weight" column has been joined in.
phenotypes.head()

Unnamed: 0,ID(n),count(s),d.id,n.I_CODE,n.N_Name,Weight
0,82727,5,HP:0001994,"[MESH:D005198, SNOMEDCT_US:236468006, UMLS:C03...","[RENAL TUBULAR FANCONI SYNDROME, 'DE TONI-FANC...",2.2e-05
1,98209,48,HP:0001394,"[UMLS:C0023890, MESH:D008103, SNOMEDCT_US:1994...","[HEPATIC CIRRHOSIS, SCAR TISSUE REPLACES HEALT...",0.000131
2,59646,207,HP:0000365,"[HP:0008563, HP:0001754, HP:0001728, HP:000040...","[HEARING DEFECT, CONGENITAL DEAFNESS, HEARING ...",0.00047
3,75348,43,HP:0003128,"[HP:0005960, HP:0003255, UMLS:C0347959, SNOMED...","[LACTICACIDOSIS, LACTICACIDEMIA, HYPERLACTICAC...",0.000137
4,78847,2,HP:0012465,"[UMLS:C4022891, HP:0012465]","[INCREASED IRON CONCENTRATION IN LIVER, INCREA...",2e-05


In [25]:
# Row counts of the two weighted tables; used as the numerators of the idf terms.
total_d = len(diseases)
total_p = len(phenotypes)

In [34]:
# Sum of TF-IDF scores for each disease.
#
# For every row of `diseases` the loop asks the graph for the phenotypes linked
# to that disease, joins them to the `phenotypes` table and scores each one as
#     tf     = phenotype Weight * disease Weight
#     idf    = log10(total_d / phenotype count(s))
#     tf-idf = tf * idf
# The per-disease total is appended to `tf_idf_sum`, in `diseases` row order.

# Loop-invariant pieces are built once instead of on every iteration.
# The query uses the `$i` parameter form: the legacy `{i}` form is rejected by
# Neo4j 4.0 and later, while `$i` is accepted by every Bolt-capable server.
# TODO: a single query returning all (disease, phenotype) pairs would avoid one
# network round trip per disease.
disease_phenotype_query = """
    match (n:S_GARD)--(a:DATA) with n, a
    match p = (n)-[:R_rel{name: 'has_phenotype'}]-(h:S_HP)--(d:DATA)
    where ID(n) = $i return ID(h)
"""
# Only these phenotype columns take part in the scoring.
phenotype_stats = phenotypes[["ID(n)", "count(s)", "Weight"]]

tf_idf_sum = []
for index, row in diseases.iterrows():
    d_weight = row["Weight"]
    # int() hands the driver a plain Python integer rather than a NumPy scalar.
    records = graph.run(disease_phenotype_query, i=int(row["ID(n)"])).data()

    if records:
        # One row per returned phenotype, enriched with its count and weight.
        df = pd.merge(
            pd.DataFrame(records),
            phenotype_stats,
            how="inner",
            left_on="ID(h)",
            right_on="ID(n)",
        ).drop(columns=["ID(h)"])

        # Add columns for each phenotype: tf, idf, tf-idf.
        df["tf(d,f)"] = df["Weight"] * d_weight
        df["idf(d,f)"] = np.log10(total_d / df["count(s)"])
        df["tf-idf"] = df["tf(d,f)"] * df["idf(d,f)"]
        score = np.sum(df["tf-idf"])
    else:
        # An empty result has no "ID(h)" column, so the merge would raise
        # KeyError; the sum over zero phenotypes is simply 0.
        score = 0.0

    tf_idf_sum.append(score)
    if index % 100 == 0:
        print("Phenotype df set complete!")

Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!


In [36]:
# Store the per-disease totals as a new "tf_idf_sum" column (the list is in the
# same row order as the table), then preview the first ten rows.
diseases = diseases.assign(tf_idf_sum=tf_idf_sum)
diseases.head(10)

Unnamed: 0,ID(n),count(s),n.I_CODE,n.N_Name,Weight,tf_idf_sum
0,1,16,"[GARD:0000001, OMIM:603358, ORPHA:53693, ORPHA...","[GRACILE SYNDROME, FLNMS, FINNISH LACTIC ACIDO...",4.6e-05,1.382186e-07
1,3,49,"[GARD:0000003, OMIM:200110, ORPHA:920, ORPHANE...","[ABLEPHARON MACROSTOMIA SYNDROME, AMS, CONGENI...",0.000101,1.140221e-06
2,7,14,"[GARD:0000005, OMIM:200100, ORPHA:14, ORPHANET...","[ABETALIPOPROTEINEMIA, BASSEN KORNZWEIG SYNDRO...",3.7e-05,1.303226e-07
3,11,29,"[GARD:0000007, OMIM:102370, ORPHA:969, ORPHANE...","[ACROMICRIC DYSPLASIA, ACROMICRIC SKELETAL DYS...",6e-05,3.964164e-07
4,15,9,"[GARD:0000011, OMIM:104290]","[ALTERNATING HEMIPLEGIA OF CHILDHOOD, ALTERNAT...",2.3e-05,7.009057e-08
5,19,19,"[GARD:0000013, OMIM:206700, ORPHA:1065, ORPHAN...","[GILLESPIE SYNDROME, ANIRIDIA, CEREBELLAR ATAX...",4.1e-05,2.593402e-07
6,25,23,"[GARD:0000019, OMIM:274270, ORPHA:1675, ORPHAN...","[DIHYDROPYRIMIDINE DEHYDROGENASE DEFICIENCY, D...",4.6e-05,3.680357e-07
7,29,12,"[GARD:0000022, OMIM:262000, ORPHA:123, ORPHANE...","[BJORNSTAD SYNDROME, BJS, PILI TORTI AND NERVE...",3e-05,9.747654e-08
8,31,21,"[GARD:0000023, OMIM:110100, ORPHA:126, ORPHANE...",[BLEPHAROPHIMOSIS-EPICANTHUS INVERSUS-PTOSIS S...,4.6e-05,2.400077e-07
9,37,24,"[GARD:0000028, OMIM:302380, ORPHA:1388, ORPHAN...","[CATEL MANZKE SYNDROME, HYPERPHALANGY-CLINODAC...",4.8e-05,3.291469e-07


In [40]:
# Persist the scored diseases table. The DataFrame index is written too, as the
# first (unnamed) column of the file.
diseases.to_csv(path_or_buf="GARD_TF_IDF.csv")

In [62]:
# Sum of TF-IDF scores for each phenotype.
#
# For every row of `phenotypes` the loop asks the graph for the diseases linked
# to that phenotype, joins them to the `diseases` table and scores each one as
#     tf     = disease Weight * phenotype Weight
#     idf    = log10(total_p / disease count(s))
#     tf-idf = tf * idf
# The per-phenotype total is appended to `tf_idf_sum2`, in `phenotypes` row order.

# Loop-invariant pieces are built once instead of on every iteration.
# The query uses the `$i` parameter form: the legacy `{i}` form is rejected by
# Neo4j 4.0 and later, while `$i` is accepted by every Bolt-capable server.
# TODO: a single query returning all (phenotype, disease) pairs would avoid one
# network round trip per phenotype.
phenotype_disease_query = """
    match (n:S_HP)--(a:DATA) with n, a
    match p = (n)-[:R_rel{name: 'has_phenotype'}]-(h:S_GARD)--(d:DATA)
    where ID(n) = $i return ID(h)
"""
# Only these disease columns take part in the scoring.
disease_stats = diseases[["ID(n)", "count(s)", "Weight"]]

tf_idf_sum2 = []
for index, row in phenotypes.iterrows():
    p_weight = row["Weight"]
    # int() hands the driver a plain Python integer rather than a NumPy scalar.
    records = graph.run(phenotype_disease_query, i=int(row["ID(n)"])).data()

    if records:
        # One row per returned disease, enriched with its count and weight.
        df = pd.merge(
            pd.DataFrame(records),
            disease_stats,
            how="inner",
            left_on="ID(h)",
            right_on="ID(n)",
        ).drop(columns=["ID(h)"])

        # Add columns for each disease: tf, idf, tf-idf.
        df["tf(d,f)"] = df["Weight"] * p_weight
        df["idf(d,f)"] = np.log10(total_p / df["count(s)"])
        df["tf-idf"] = df["tf(d,f)"] * df["idf(d,f)"]
        score = np.sum(df["tf-idf"])
    else:
        # This query additionally requires the disease to have a DATA
        # neighbour, so it can come back empty. An empty result has no "ID(h)"
        # column and the merge would raise KeyError; the sum over zero
        # diseases is simply 0.
        score = 0.0

    tf_idf_sum2.append(score)
    if index % 200 == 0:
        print("Disease df set complete!")

Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!


In [65]:
# Store the per-phenotype totals as a new "tf_idf_sum" column (the list is in
# the same row order as the table), then preview the first ten rows.
phenotypes = phenotypes.assign(tf_idf_sum=tf_idf_sum2)
phenotypes.head(10)

Unnamed: 0,ID(n),count(s),d.id,n.I_CODE,n.N_Name,Weight,tf_idf_sum
0,82727,5,HP:0001994,"[MESH:D005198, SNOMEDCT_US:236468006, UMLS:C03...","[RENAL TUBULAR FANCONI SYNDROME, 'DE TONI-FANC...",2.2e-05,2.729152e-08
1,98209,48,HP:0001394,"[UMLS:C0023890, MESH:D008103, SNOMEDCT_US:1994...","[HEPATIC CIRRHOSIS, SCAR TISSUE REPLACES HEALT...",0.000131,1.109996e-06
2,59646,207,HP:0000365,"[HP:0008563, HP:0001754, HP:0001728, HP:000040...","[HEARING DEFECT, CONGENITAL DEAFNESS, HEARING ...",0.00047,1.979341e-05
3,75348,43,HP:0003128,"[HP:0005960, HP:0003255, UMLS:C0347959, SNOMED...","[LACTICACIDOSIS, LACTICACIDEMIA, HYPERLACTICAC...",0.000137,9.326714e-07
4,78847,2,HP:0012465,"[UMLS:C4022891, HP:0012465]","[INCREASED IRON CONCENTRATION IN LIVER, INCREA...",2e-05,5.927394e-09
5,98159,45,HP:0001397,"[HP:0002252, HP:0200121, MESH:D005234, SNOMEDC...","[LIVER STEATOSIS, FATTY LIVER, FATTY INFILTRAT...",0.000126,1.005284e-06
6,125687,10,HP:0100613,"[UMLS:C4022012, HP:0100613]",[DEATH IN EARLY ADULTHOOD],2.9e-05,9.115299e-08
7,78855,1,HP:0012464,"[UMLS:C0919785, HP:0012464]",DECREASED TRANSFERRIN SATURATION,1.6e-05,1.972279e-09
8,111594,3,HP:0004925,"[HP:0004899, UMLS:C1839437, HP:0004925]",CHRONIC LACTIC ACIDOSIS,2.4e-05,1.036932e-08
9,104771,5,HP:0003452,"[UMLS:C0151900, SNOMEDCT_US:165624002, HP:0003...",INCREASED SERUM IRON,2.5e-05,1.753008e-08


In [66]:
# Persist the scored phenotypes table. The DataFrame index is written too, as
# the first (unnamed) column of the file.
phenotypes.to_csv(path_or_buf="HP_TF_IDF.csv")