In [1]:
#get phenotype list input
#get the list of diseases associated with those phenotypes
#for each disease:
    #for each phenotype get the tf-idf score
        #get the tf: product of disease and phenotype weight
        #get idf: log10 of (total number of diseases / number of diseases associated with the phenotype)
    #then add up all of those scores
    #output the ratio (sum of listed phenotype scores over the total)
#do this for each disease, then rank the diseases by that ratio

In [2]:
import pandas as pd
from py2neo import Graph
from py2neo import Node
from matplotlib import pyplot as plt
import numpy as np

In [33]:
from py2neo import Database
# Point py2neo at the disease graph's Bolt endpoint.
db = Database("bolt://disease.ncats.io:80")
# `graph` is the handle the ranking functions use to run Cypher queries (graph.run).
graph = db.default_graph

In [4]:
# Load the two TF-IDF tables from CSV: one row per disease and one row per phenotype.
# Both carry the columns ID(n), count(s), Weight and tf_idf_sum used by the ranking
# functions; the phenotype table additionally has d.id (the HP identifier).
diseases = pd.read_csv("GARD_TF_IDF.csv")
phenotypes = pd.read_csv("HP_TF_IDF.csv")

In [35]:
# Row counts of the two tables. total_d is the numerator of the idf term
# log10(total_d / count(s)) used by the ranking functions.
total_d = diseases.shape[0]
# Fixed: this was computed from `diseases`, so it silently duplicated total_d.
total_p = phenotypes.shape[0]

In [6]:
# Preview the disease table to check the columns the ranking code relies on.
diseases.head()

Unnamed: 0.1,Unnamed: 0,ID(n),count(s),n.I_CODE,n.N_Name,Weight,tf_idf_sum
0,0,1,16,"['GARD:0000001', 'OMIM:603358', 'ORPHA:53693',...","['GRACILE SYNDROME', 'FLNMS', 'FINNISH LACTIC ...",4.6e-05,1.382186e-07
1,1,3,49,"['GARD:0000003', 'OMIM:200110', 'ORPHA:920', '...","['ABLEPHARON MACROSTOMIA SYNDROME', 'AMS', 'CO...",0.000101,1.140221e-06
2,2,7,14,"['GARD:0000005', 'OMIM:200100', 'ORPHA:14', 'O...","['ABETALIPOPROTEINEMIA', 'BASSEN KORNZWEIG SYN...",3.7e-05,1.303226e-07
3,3,11,29,"['GARD:0000007', 'OMIM:102370', 'ORPHA:969', '...","['ACROMICRIC DYSPLASIA', 'ACROMICRIC SKELETAL ...",6e-05,3.964164e-07
4,4,15,9,"['GARD:0000011', 'OMIM:104290']","['ALTERNATING HEMIPLEGIA OF CHILDHOOD', 'ALTER...",2.3e-05,7.009057e-08


In [7]:
# Preview the phenotype table; d.id holds the HP identifier used as function input.
phenotypes.head()

Unnamed: 0.1,Unnamed: 0,ID(n),count(s),d.id,n.I_CODE,n.N_Name,Weight,tf_idf_sum
0,0,82727,5,HP:0001994,"['MESH:D005198', 'SNOMEDCT_US:236468006', 'UML...","['RENAL TUBULAR FANCONI SYNDROME', ""'DE TONI-F...",2.2e-05,2.729152e-08
1,1,98209,48,HP:0001394,"['UMLS:C0023890', 'MESH:D008103', 'SNOMEDCT_US...","['HEPATIC CIRRHOSIS', 'SCAR TISSUE REPLACES HE...",0.000131,1.109996e-06
2,2,59646,207,HP:0000365,"['HP:0008563', 'HP:0001754', 'HP:0001728', 'HP...","['HEARING DEFECT', 'CONGENITAL DEAFNESS', 'HEA...",0.00047,1.979341e-05
3,3,75348,43,HP:0003128,"['HP:0005960', 'HP:0003255', 'UMLS:C0347959', ...","['LACTICACIDOSIS', 'LACTICACIDEMIA', 'HYPERLAC...",0.000137,9.326714e-07
4,4,78847,2,HP:0012465,"['UMLS:C4022891', 'HP:0012465']","['INCREASED IRON CONCENTRATION IN LIVER', 'INC...",2e-05,5.927394e-09


In [45]:
#rank the diseases that are linked to every phenotype in the input list
def get_disease_ranks(phen_list):
    """Score the diseases associated with *all* phenotypes in ``phen_list``.

    For each phenotype the graph is queried for the S_GARD nodes connected to
    it through a ``has_phenotype`` relationship; only diseases present in every
    result set are kept. Each remaining disease gets

        sum over phenotypes of (disease Weight * phenotype Weight) * log10(total_d / count(s))

    divided by the disease's ``tf_idf_sum``.

    Parameters
    ----------
    phen_list : iterable of str
        Phenotype identifiers matched against ``d.id`` (for now treated as HP
        ids). Any iterable works (list, tuple, pandas Series).

    Returns
    -------
    pandas.DataFrame
        Copy of the matching rows of ``diseases`` with an extra ``rankings``
        column. Empty (but with the same columns) when ``phen_list`` is empty
        or no disease is shared by all phenotypes.

    Raises
    ------
    IndexError
        If candidate diseases exist but a phenotype id is missing from the
        ``phenotypes`` table.

    Notes
    -----
    Reads the module-level ``graph``, ``diseases``, ``phenotypes`` and
    ``total_d``.
    """
    # Materialise once so generators and Series are handled uniformly.
    phen_ids = list(phen_list)

    query = """match (n:S_HP)--(d:DATA) where d.id = {HP_ID} with n match p =(n)-
                    [:R_rel{name: 'has_phenotype'}]-(g:S_GARD) return ID(g);"""
    disease_sets = [set(graph.run(query, HP_ID=phen).to_series()) for phen in phen_ids]

    # set.intersection() needs at least one operand; no phenotypes means no candidates.
    shared_ids = set.intersection(*disease_sets) if disease_sets else set()
    d_df = diseases.loc[diseases['ID(n)'].isin(shared_ids)].copy()

    # Accumulate the score per row of d_df so every value stays attached to its
    # own disease. Iterating the id set and assigning positionally (as before)
    # could pair a score with the wrong row, because set order is arbitrary.
    scores = pd.Series(0.0, index=d_df.index)
    if not d_df.empty:
        for phen in phen_ids:
            # Phenotype weight and idf do not depend on the disease: look them up once.
            phen_rows = phenotypes.loc[phenotypes['d.id'] == phen]
            p_weight = phen_rows['Weight'].iloc[0]
            associated = phen_rows['count(s)'].iloc[0]
            idf = np.log10(total_d / associated)
            # Same operation order as the scalar formula: (tf) * idf.
            scores = scores + d_df['Weight'] * p_weight * idf

    d_df["rankings"] = scores / d_df['tf_idf_sum']
    return d_df

In [44]:
#function that scores a slice of the disease table without querying the graph
#does not quite work- it gives a tf-idf score even if the phenotype does not belong to the disease
def get_disease_ranks2(phen_list, start=994, stop=995, verbose=True):
    """Score rows ``start:stop`` of ``diseases`` against ``phen_list``.

    Unlike ``get_disease_ranks`` this does not consult the graph, so a disease
    receives a score for a phenotype even when the two are not linked.

    Parameters
    ----------
    phen_list : iterable of str
        Phenotype identifiers matched against ``phenotypes['d.id']``.
    start, stop : int or None, optional
        Positional slice bounds into ``diseases``. The defaults reproduce the
        single debug row that was previously hard-coded; pass ``None`` for
        both to score every disease.
    verbose : bool, optional
        When true (default) print every intermediate value, as before.

    Returns
    -------
    pandas.DataFrame
        Copy of the selected rows with an extra ``rankings`` column.

    Raises
    ------
    IndexError
        If a phenotype id is not present in the ``phenotypes`` table.

    Notes
    -----
    Reads the module-level ``diseases``, ``phenotypes`` and ``total_d``.
    """
    log = print if verbose else (lambda *args: None)

    # Materialise so a generator input is not exhausted after the first row.
    phen_ids = list(phen_list)

    # Slice first, then copy only the selected rows.
    subset = diseases[start:stop]
    d_df = subset.copy()

    ratios = []
    for index, row in subset.iterrows():
        d_weight = row["Weight"]
        log(d_weight)
        phen_sum = 0
        for phen in phen_ids:
            #get the tf-idf
            phen_rows = phenotypes.loc[phenotypes['d.id'] == phen]
            p_weight = phen_rows['Weight'].iloc[0]
            log(p_weight)
            tf = d_weight*p_weight
            log(tf)
            associated = phen_rows['count(s)'].iloc[0]
            log(associated)
            idf = np.log10(total_d/associated)
            log(idf)
            tf_idf = tf*idf
            log(tf_idf)
            phen_sum += tf_idf
            log(phen_sum)
            log("\n")
        ratios.append(phen_sum/ row["tf_idf_sum"])
    d_df["rankings"] = ratios
    return d_df

## Website considerations
- need to run a query to get the overlapping diseases for the phenotype entry
- connect the neo4j database to web app
- connect python to website

In [46]:
# Example: rank the diseases shared by the first five phenotype ids in the table.
get_disease_ranks(phenotypes["d.id"][:5])

Unnamed: 0.1,Unnamed: 0,ID(n),count(s),n.I_CODE,n.N_Name,Weight,tf_idf_sum,rankings
0,0,1,16,"['GARD:0000001', 'OMIM:603358', 'ORPHA:53693',...","['GRACILE SYNDROME', 'FLNMS', 'FINNISH LACTIC ...",4.6e-05,1.382186e-07,0.356445


In [50]:
# Example: only the first two phenotypes, sorted so the highest ranking comes first.
get_disease_ranks(phenotypes["d.id"][:2]).sort_values(by=["rankings"], ascending = False)

Unnamed: 0.1,Unnamed: 0,ID(n),count(s),n.I_CODE,n.N_Name,Weight,tf_idf_sum,rankings
0,0,1,16,"['GARD:0000001', 'OMIM:603358', 'ORPHA:53693',...","['GRACILE SYNDROME', 'FLNMS', 'FINNISH LACTIC ...",4.6e-05,1.382186e-07,0.093517
403,403,2714,31,"['GARD:0002658', 'OMIM:276700', 'ORPHA:882', '...","['TYROSINEMIA TYPE 1', 'TYROSINEMIA TYPE I', '...",8e-05,4.768549e-07,0.046735
