# TF-IDF model with disease page rank values

The code below uses previusly generated page rank scores for a disease-phenotype network to calculate new values based on a TF-IDF model. Instead of using frequency, the page rank scores are instead utilized to both consider the importance of highly connected phenotypes plus highly specific phenotypes. The tf-idf sum for a single disease is the total of tf-idf scores for it's corresponding phenotypes, and vice versa for a single phenotype. 

In [1]:
import pandas as pd
from py2neo import Graph
from py2neo import Node

In [2]:
from matplotlib import pyplot as plt
import numpy as np

In [42]:
from py2neo import Database
db = Database("bolt://disease.ncats.io:80")
graph = db.default_graph

In [11]:
#creating a new dataframe that merges the page_rank values with disease information
weights = pd.read_csv("page_rank_weights.csv")

In [6]:
query = """match (n:S_GARD)--(d:DATA)  with n, d 
            match p =(n)-[:R_rel{name:"has_phenotype"}]
            -(s:S_HP)--(:DATA) return ID(n), n.N_Name, n.I_CODE, count(s)"""
d = graph.run(query).data()

In [7]:
diseases = pd.DataFrame.from_dict(d)
diseases.head()

Unnamed: 0,ID(n),count(s),n.I_CODE,n.N_Name
0,1,16,"[GARD:0000001, OMIM:603358, ORPHA:53693, ORPHA...","[GRACILE SYNDROME, FLNMS, FINNISH LACTIC ACIDO..."
1,3,49,"[GARD:0000003, OMIM:200110, ORPHA:920, ORPHANE...","[ABLEPHARON MACROSTOMIA SYNDROME, AMS, CONGENI..."
2,7,14,"[GARD:0000005, OMIM:200100, ORPHA:14, ORPHANET...","[ABETALIPOPROTEINEMIA, BASSEN KORNZWEIG SYNDRO..."
3,11,29,"[GARD:0000007, OMIM:102370, ORPHA:969, ORPHANE...","[ACROMICRIC DYSPLASIA, ACROMICRIC SKELETAL DYS..."
4,15,9,"[GARD:0000011, OMIM:104290]","[ALTERNATING HEMIPLEGIA OF CHILDHOOD, ALTERNAT..."


In [12]:
diseases = pd.merge(diseases, weights, how='inner', left_on ="ID(n)", right_on = "Node_ID")

In [14]:
diseases = diseases.drop(columns=["Node_ID"], )

In [15]:
#diseases dataframe with weights
diseases.head()

Unnamed: 0,ID(n),count(s),n.I_CODE,n.N_Name,Weight
0,1,16,"[GARD:0000001, OMIM:603358, ORPHA:53693, ORPHA...","[GRACILE SYNDROME, FLNMS, FINNISH LACTIC ACIDO...",4.6e-05
1,3,49,"[GARD:0000003, OMIM:200110, ORPHA:920, ORPHANE...","[ABLEPHARON MACROSTOMIA SYNDROME, AMS, CONGENI...",0.000101
2,7,14,"[GARD:0000005, OMIM:200100, ORPHA:14, ORPHANET...","[ABETALIPOPROTEINEMIA, BASSEN KORNZWEIG SYNDRO...",3.7e-05
3,11,29,"[GARD:0000007, OMIM:102370, ORPHA:969, ORPHANE...","[ACROMICRIC DYSPLASIA, ACROMICRIC SKELETAL DYS...",6e-05
4,15,9,"[GARD:0000011, OMIM:104290]","[ALTERNATING HEMIPLEGIA OF CHILDHOOD, ALTERNAT...",2.3e-05


In [22]:
#repeating with phenotype dataframe
query = """match (n:S_HP)--(d:DATA) with n, d 
        match p =(n)-[:R_rel{name:"has_phenotype"}]-(s:S_GARD) 
        return ID(n), n.N_Name, d.id, n.I_CODE, count(s)"""
d = graph.run(query).data()
phenotypes = pd.DataFrame.from_dict(d)

In [23]:
phenotypes = pd.merge(phenotypes, weights, how='inner', left_on ="ID(n)", right_on = "Node_ID")
phenotypes = phenotypes.drop(columns=["Node_ID"] )

In [24]:
#phenotype dataframe with weights
phenotypes.head()

Unnamed: 0,ID(n),count(s),d.id,n.I_CODE,n.N_Name,Weight
0,82727,5,HP:0001994,"[MESH:D005198, SNOMEDCT_US:236468006, UMLS:C03...","[RENAL TUBULAR FANCONI SYNDROME, 'DE TONI-FANC...",2.2e-05
1,98209,48,HP:0001394,"[UMLS:C0023890, MESH:D008103, SNOMEDCT_US:1994...","[HEPATIC CIRRHOSIS, SCAR TISSUE REPLACES HEALT...",0.000131
2,59646,207,HP:0000365,"[HP:0008563, HP:0001754, HP:0001728, HP:000040...","[HEARING DEFECT, CONGENITAL DEAFNESS, HEARING ...",0.00047
3,75348,43,HP:0003128,"[HP:0005960, HP:0003255, UMLS:C0347959, SNOMED...","[LACTICACIDOSIS, LACTICACIDEMIA, HYPERLACTICAC...",0.000137
4,78847,2,HP:0012465,"[UMLS:C4022891, HP:0012465]","[INCREASED IRON CONCENTRATION IN LIVER, INCREA...",2e-05


In [25]:
total_d = diseases.shape[0]
total_p = phenotypes.shape[0]

In [34]:
#getting the sum tf-idf scores for each disease
tf_idf_sum = []
for index, row in diseases.iterrows():
    
    #generate the phenotype dataframe for each disease with weights and counts
    query = """match (n:S_GARD)--(a:DATA) with n, a 
            match p =(n)-[:R_rel{name: 'has_phenotype'}]-(h:S_HP)--(d:DATA) 
            where ID(n) = {i} return ID(h)"""
    d_weight = row["Weight"]
    d = graph.run(query, i= row["ID(n)"]).data()
    df = pd.DataFrame.from_dict(d)
    df = pd.merge(df, phenotypes, how='inner', left_on ="ID(h)", right_on = "ID(n)")
    df = df.drop(columns=["ID(h)","d.id", "n.I_CODE", "n.N_Name"])
    
    #add column for each phenotype: tf, idf, tf-idf
    df["tf(d,f)"] = df["Weight"]*d_weight
    df["idf(d,f)"] = np.log10(total_d / df["count(s)"])
    df["tf-idf"] = df["tf(d,f)"]*df["idf(d,f)"]
    
    #get the sum of each tf-idf column and add to list
    tf_idf_sum.append(np.sum(df["tf-idf"]))
    if index%100 == 0:
        print("Phenotype df set complete!")

Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!
Phenotype df set complete!


In [36]:
#final disease df
diseases["tf_idf_sum"] = tf_idf_sum
diseases.head(10)

Unnamed: 0,ID(n),count(s),n.I_CODE,n.N_Name,Weight,tf_idf_sum
0,1,16,"[GARD:0000001, OMIM:603358, ORPHA:53693, ORPHA...","[GRACILE SYNDROME, FLNMS, FINNISH LACTIC ACIDO...",4.6e-05,1.382186e-07
1,3,49,"[GARD:0000003, OMIM:200110, ORPHA:920, ORPHANE...","[ABLEPHARON MACROSTOMIA SYNDROME, AMS, CONGENI...",0.000101,1.140221e-06
2,7,14,"[GARD:0000005, OMIM:200100, ORPHA:14, ORPHANET...","[ABETALIPOPROTEINEMIA, BASSEN KORNZWEIG SYNDRO...",3.7e-05,1.303226e-07
3,11,29,"[GARD:0000007, OMIM:102370, ORPHA:969, ORPHANE...","[ACROMICRIC DYSPLASIA, ACROMICRIC SKELETAL DYS...",6e-05,3.964164e-07
4,15,9,"[GARD:0000011, OMIM:104290]","[ALTERNATING HEMIPLEGIA OF CHILDHOOD, ALTERNAT...",2.3e-05,7.009057e-08
5,19,19,"[GARD:0000013, OMIM:206700, ORPHA:1065, ORPHAN...","[GILLESPIE SYNDROME, ANIRIDIA, CEREBELLAR ATAX...",4.1e-05,2.593402e-07
6,25,23,"[GARD:0000019, OMIM:274270, ORPHA:1675, ORPHAN...","[DIHYDROPYRIMIDINE DEHYDROGENASE DEFICIENCY, D...",4.6e-05,3.680357e-07
7,29,12,"[GARD:0000022, OMIM:262000, ORPHA:123, ORPHANE...","[BJORNSTAD SYNDROME, BJS, PILI TORTI AND NERVE...",3e-05,9.747654e-08
8,31,21,"[GARD:0000023, OMIM:110100, ORPHA:126, ORPHANE...",[BLEPHAROPHIMOSIS-EPICANTHUS INVERSUS-PTOSIS S...,4.6e-05,2.400077e-07
9,37,24,"[GARD:0000028, OMIM:302380, ORPHA:1388, ORPHAN...","[CATEL MANZKE SYNDROME, HYPERPHALANGY-CLINODAC...",4.8e-05,3.291469e-07


In [40]:
diseases.to_csv("GARD_TF_IDF.csv")

In [62]:
#getting the sum tf-idf scores for each phenotype
tf_idf_sum2 = []
for index, row in phenotypes.iterrows():
    
    #generate the disease dataframe for each phenotype with weights and counts
    query = """match (n:S_HP)--(a:DATA) with n, a
            match p =(n)-[:R_rel{name: 'has_phenotype'}]-(h:S_GARD)--(d:DATA) 
            where ID(n) = {i} return ID(h)"""
    p_weight = row["Weight"]
    d = graph.run(query, i= row["ID(n)"]).data()
    df = pd.DataFrame.from_dict(d)
    df = pd.merge(df, diseases, how='inner', left_on ="ID(h)", right_on = "ID(n)")
    df = df.drop(columns=["ID(h)", "n.I_CODE", "n.N_Name"])
    
    #add column for each diseaase: tf, idf, tf-idf
    df["tf(d,f)"] = df["Weight"]*p_weight
    df["idf(d,f)"] = np.log10(total_p / df["count(s)"])
    df["tf-idf"] = df["tf(d,f)"]*df["idf(d,f)"]
    
    #get the sum of each tf-idf column and add to list
    tf_idf_sum2.append(np.sum(df["tf-idf"]))
    if index%200 == 0:
        print("Disease df set complete!")

Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!
Disease df set complete!


In [65]:
phenotypes["tf_idf_sum"] = tf_idf_sum2
phenotypes.head(10)

Unnamed: 0,ID(n),count(s),d.id,n.I_CODE,n.N_Name,Weight,tf_idf_sum
0,82727,5,HP:0001994,"[MESH:D005198, SNOMEDCT_US:236468006, UMLS:C03...","[RENAL TUBULAR FANCONI SYNDROME, 'DE TONI-FANC...",2.2e-05,2.729152e-08
1,98209,48,HP:0001394,"[UMLS:C0023890, MESH:D008103, SNOMEDCT_US:1994...","[HEPATIC CIRRHOSIS, SCAR TISSUE REPLACES HEALT...",0.000131,1.109996e-06
2,59646,207,HP:0000365,"[HP:0008563, HP:0001754, HP:0001728, HP:000040...","[HEARING DEFECT, CONGENITAL DEAFNESS, HEARING ...",0.00047,1.979341e-05
3,75348,43,HP:0003128,"[HP:0005960, HP:0003255, UMLS:C0347959, SNOMED...","[LACTICACIDOSIS, LACTICACIDEMIA, HYPERLACTICAC...",0.000137,9.326714e-07
4,78847,2,HP:0012465,"[UMLS:C4022891, HP:0012465]","[INCREASED IRON CONCENTRATION IN LIVER, INCREA...",2e-05,5.927394e-09
5,98159,45,HP:0001397,"[HP:0002252, HP:0200121, MESH:D005234, SNOMEDC...","[LIVER STEATOSIS, FATTY LIVER, FATTY INFILTRAT...",0.000126,1.005284e-06
6,125687,10,HP:0100613,"[UMLS:C4022012, HP:0100613]",[DEATH IN EARLY ADULTHOOD],2.9e-05,9.115299e-08
7,78855,1,HP:0012464,"[UMLS:C0919785, HP:0012464]",DECREASED TRANSFERRIN SATURATION,1.6e-05,1.972279e-09
8,111594,3,HP:0004925,"[HP:0004899, UMLS:C1839437, HP:0004925]",CHRONIC LACTIC ACIDOSIS,2.4e-05,1.036932e-08
9,104771,5,HP:0003452,"[UMLS:C0151900, SNOMEDCT_US:165624002, HP:0003...",INCREASED SERUM IRON,2.5e-05,1.753008e-08


In [66]:
phenotypes.to_csv("HP_TF_IDF.csv")

In [3]:
diseases = pd.read_csv("GARD_TF_IDF.csv")

In [11]:
diseases.sort_values(by=['Weight'], ascending=False)[:10]

Unnamed: 0.1,Unnamed: 0,ID(n),count(s),n.I_CODE,n.N_Name,Weight,tf_idf_sum
1214,1214,7305,218,"['GARD:0007891', 'OMIM:194050', 'ORPHA:904', '...","['WILLIAMS SYNDROME', 'WILLIAMS-BEUREN SYNDROM...",0.000452,1.8e-05
493,493,3248,141,"['GARD:0003295', 'OMIM:309000', 'ORPHA:534', '...","['LOWE OCULOCEREBRORENAL SYNDROME', 'OCRL', 'O...",0.000305,9e-06
859,859,5266,154,"['GARD:0005683', 'OMIM:270400', 'ORPHA:818', '...","['SMITH-LEMLI-OPITZ SYNDROME', 'SMITH LEMLI OP...",0.000301,1e-05
1130,1130,6786,121,"['GARD:0007305', 'OMIM:146510', 'ORPHA:672', '...","['PALLISTER-HALL SYNDROME', 'PHS', 'HYPOTHALAM...",0.000276,6e-06
1155,1155,6952,115,"['GARD:0007475', 'OMIM:176920', 'ORPHA:744', '...","['PROTEUS SYNDROME', 'HEMIHYPERTROPHY AND MACR...",0.00027,6e-06
984,984,6010,126,"['GARD:0006457', 'OMIM:305600', 'ORPHA:2092', ...","['FOCAL DERMAL HYPOPLASIA', 'DHOF', 'FODH', 'F...",0.000267,7e-06
1641,1641,9633,140,"['GARD:0010109', 'OMIM:122470', 'ORPHA:199', '...","['CORNELIA DE LANGE SYNDROME', 'BRACHMANN DE L...",0.000267,8e-06
10,10,39,126,"['GARD:0000029', 'OMIM:214800', 'ORPHA:138', '...","['CHARGE SYNDROME', 'COLOBOMA, HEART ANOMALY, ...",0.000261,7e-06
1167,1167,7029,122,"['GARD:0007593', 'OMIM:180849']","['RUBINSTEIN-TAYBI SYNDROME', 'RUBINSTEIN SYND...",0.000256,7e-06
977,977,5978,114,"['GARD:0006425', 'OMIM:227650', 'ORPHA:84', 'O...","['FANCONI ANEMIA', 'FANCONI PANCYTOPENIA', ""FA...",0.000254,7e-06


In [7]:
diseases.sort_values(by=['tf_idf_sum'], ascending=False)[:10]

Unnamed: 0.1,Unnamed: 0,ID(n),count(s),n.I_CODE,n.N_Name,Weight,tf_idf_sum
1214,1214,7305,218,"['GARD:0007891', 'OMIM:194050', 'ORPHA:904', '...","['WILLIAMS SYNDROME', 'WILLIAMS-BEUREN SYNDROM...",0.000452,1.8e-05
859,859,5266,154,"['GARD:0005683', 'OMIM:270400', 'ORPHA:818', '...","['SMITH-LEMLI-OPITZ SYNDROME', 'SMITH LEMLI OP...",0.000301,1e-05
493,493,3248,141,"['GARD:0003295', 'OMIM:309000', 'ORPHA:534', '...","['LOWE OCULOCEREBRORENAL SYNDROME', 'OCRL', 'O...",0.000305,9e-06
1641,1641,9633,140,"['GARD:0010109', 'OMIM:122470', 'ORPHA:199', '...","['CORNELIA DE LANGE SYNDROME', 'BRACHMANN DE L...",0.000267,8e-06
10,10,39,126,"['GARD:0000029', 'OMIM:214800', 'ORPHA:138', '...","['CHARGE SYNDROME', 'COLOBOMA, HEART ANOMALY, ...",0.000261,7e-06
1167,1167,7029,122,"['GARD:0007593', 'OMIM:180849']","['RUBINSTEIN-TAYBI SYNDROME', 'RUBINSTEIN SYND...",0.000256,7e-06
977,977,5978,114,"['GARD:0006425', 'OMIM:227650', 'ORPHA:84', 'O...","['FANCONI ANEMIA', 'FANCONI PANCYTOPENIA', ""FA...",0.000254,7e-06
984,984,6010,126,"['GARD:0006457', 'OMIM:305600', 'ORPHA:2092', ...","['FOCAL DERMAL HYPOPLASIA', 'DHOF', 'FODH', 'F...",0.000267,7e-06
1350,1350,8094,119,"['GARD:0009146', 'OMIM:115150', 'ORPHA:1340', ...","['CARDIOFACIOCUTANEOUS SYNDROME', 'CFC SYNDROM...",0.000231,7e-06
1710,1710,9927,119,"['GARD:0010299', 'OMIM:611867', 'ORPHA:567', '...","['22Q11.2 DELETION SYNDROME', 'CHROMOSOME 22Q1...",0.000245,6e-06


In [6]:
diseases.sort_values(by=['tf_idf_sum'])[:10]

Unnamed: 0.1,Unnamed: 0,ID(n),count(s),n.I_CODE,n.N_Name,Weight,tf_idf_sum
1406,1406,8350,1,"['GARD:0009303', 'OMIM:605462', 'UMLS:C1854245']","['BASAL CELL CARCINOMA, MULTIPLE', 'MULTIPLE B...",8e-06,8.647112e-10
1004,1004,6118,1,"['GARD:0006554', 'OMIM:139393', 'ORPHA:2103', ...","['GUILLAIN-BARRE SYNDROME', 'ACUTE AUTOIMMUNE ...",1e-05,9.271684e-10
267,267,1650,1,"['GARD:0001481', 'OMIM:142340', 'ORPHA:2140', ...","['CONGENITAL DIAPHRAGMATIC HERNIA', 'CDH', 'CO...",7e-06,1.250752e-09
994,994,6047,1,"['GARD:0006485', 'OMIM:137245']","['GASTRIC LYMPHOMA', 'FAMILIAL PRIMARY GASTRIC...",1.7e-05,1.364585e-09
894,894,5431,2,"['GARD:0005836', 'OMIM:609135', 'ORPHA:88', 'O...","['APLASTIC ANEMIA', 'ANEMIA APLASTIC', 'APLAST...",1e-05,1.619751e-09
435,435,2918,1,"['GARD:0002929', 'ORPHA:440', 'ORPHANET:440']",HYPOSPADIAS FAMILIAL,7e-06,2.033226e-09
930,930,5689,1,"['GARD:0006102', 'OMIM:139393', 'ORPHA:2932', ...",['CHRONIC INFLAMMATORY DEMYELINATING POLYNEURO...,2.3e-05,2.068299e-09
680,680,4178,1,"['GARD:0004484', 'ORPHA:244', 'ORPHANET:244']","['PRIMARY CILIARY DYSKINESIA', 'CILIARY DYSKIN...",7e-06,2.442349e-09
1355,1355,8112,2,"['GARD:0009159', 'OMIM:300018']","['DOSAGE-SENSITIVE SEX REVERSAL', 'DSS']",1e-05,2.546163e-09
1185,1185,7145,2,"['GARD:0007721', 'OMIM:300813', 'ORPHA:3273', ...","['SYNOVIAL SARCOMA', 'SYNOVIAL CELL SARCOMA', ...",1.4e-05,3.80542e-09


In [8]:
phenotypes = pd.read_csv("HP_TF_IDF.csv")

In [12]:
#final phenotype df
phenotypes.sort_values(by=['Weight'], ascending = False)[:10]

Unnamed: 0.1,Unnamed: 0,ID(n),count(s),d.id,n.I_CODE,n.N_Name,Weight,tf_idf_sum
18,18,89585,1048,HP:0000007,"['HP:0001416', 'HP:0001526', 'SNOMEDCT_US:2582...","['AUTOSOMAL RECESSIVE', 'AUTOSOMAL RECESSIVE F...",0.002253,0.000341
17,17,89629,844,HP:0000006,"['HP:0001463', 'HP:0001447', 'HP:0001456', 'HP...","['AUTOSOMAL DOMINANT', 'AUTOSOMAL DOMINANT TYP...",0.002004,0.000231
127,127,66797,531,HP:0001250,"['HP:0002466', 'HP:0001303', 'HP:0002479', 'HP...","['SEIZURE', 'SEIZURES', 'EPILEPSY']",0.001065,0.000102
103,103,67010,531,HP:0001249,"['HP:0002316', 'HP:0002402', 'HP:0007154', 'HP...","['INTELLECTUAL DISABILITY', 'NONPROGRESSIVE IN...",0.001026,0.000102
53,53,66485,452,HP:0001263,"['HP:0007228', 'HP:0007174', 'HP:0001255', 'HP...","['RETARDED DEVELOPMENT', 'LACK OF PSYCHOMOTOR ...",0.000848,7.1e-05
176,176,55735,411,HP:0004322,"['HP:0003501', 'HP:0001509', 'HP:0003518', 'HP...","['SHORT STATURE', 'DECREASED BODY HEIGHT', 'HE...",0.000798,6.7e-05
173,173,68784,337,HP:0002650,"['HP:0003317', 'HP:0003415', 'HP:0003303', 'HP...","['ABNORMAL CURVING OF THE SPINE', 'SCOLIOSIS']",0.000634,4.5e-05
106,106,73241,291,HP:0000639,"['UMLS:C0028738', 'MESH:D009759', 'SNOMEDCT_US...","['INVOLUNTARY, RAPID, RHYTHMIC EYE MOVEMENTS',...",0.000611,3.2e-05
64,64,66781,300,HP:0001252,"['HP:0001318', 'SNOMEDCT_US:398152000', 'UMLS:...","['LOW OR WEAK MUSCLE TONE', 'MUSCLE HYPOTONIA'...",0.000595,3.5e-05
134,134,65644,312,HP:0001290,"['UMLS:C1858120', 'HP:0001290']","['HYPOTONIA, GENERALIZED', 'GENERALIZED MUSCUL...",0.000583,3.4e-05


In [9]:
phenotypes.sort_values(by=['tf_idf_sum'], ascending = False)[:10]

Unnamed: 0.1,Unnamed: 0,ID(n),count(s),d.id,n.I_CODE,n.N_Name,Weight,tf_idf_sum
18,18,89585,1048,HP:0000007,"['HP:0001416', 'HP:0001526', 'SNOMEDCT_US:2582...","['AUTOSOMAL RECESSIVE', 'AUTOSOMAL RECESSIVE F...",0.002253,0.000341
17,17,89629,844,HP:0000006,"['HP:0001463', 'HP:0001447', 'HP:0001456', 'HP...","['AUTOSOMAL DOMINANT', 'AUTOSOMAL DOMINANT TYP...",0.002004,0.000231
103,103,67010,531,HP:0001249,"['HP:0002316', 'HP:0002402', 'HP:0007154', 'HP...","['INTELLECTUAL DISABILITY', 'NONPROGRESSIVE IN...",0.001026,0.000102
127,127,66797,531,HP:0001250,"['HP:0002466', 'HP:0001303', 'HP:0002479', 'HP...","['SEIZURE', 'SEIZURES', 'EPILEPSY']",0.001065,0.000102
53,53,66485,452,HP:0001263,"['HP:0007228', 'HP:0007174', 'HP:0001255', 'HP...","['RETARDED DEVELOPMENT', 'LACK OF PSYCHOMOTOR ...",0.000848,7.1e-05
176,176,55735,411,HP:0004322,"['HP:0003501', 'HP:0001509', 'HP:0003518', 'HP...","['SHORT STATURE', 'DECREASED BODY HEIGHT', 'HE...",0.000798,6.7e-05
173,173,68784,337,HP:0002650,"['HP:0003317', 'HP:0003415', 'HP:0003303', 'HP...","['ABNORMAL CURVING OF THE SPINE', 'SCOLIOSIS']",0.000634,4.5e-05
64,64,66781,300,HP:0001252,"['HP:0001318', 'SNOMEDCT_US:398152000', 'UMLS:...","['LOW OR WEAK MUSCLE TONE', 'MUSCLE HYPOTONIA'...",0.000595,3.5e-05
134,134,65644,312,HP:0001290,"['UMLS:C1858120', 'HP:0001290']","['HYPOTONIA, GENERALIZED', 'GENERALIZED MUSCUL...",0.000583,3.4e-05
106,106,73241,291,HP:0000639,"['UMLS:C0028738', 'MESH:D009759', 'SNOMEDCT_US...","['INVOLUNTARY, RAPID, RHYTHMIC EYE MOVEMENTS',...",0.000611,3.2e-05


In [10]:
phenotypes.sort_values(by=['tf_idf_sum'])[:10]

Unnamed: 0.1,Unnamed: 0,ID(n),count(s),d.id,n.I_CODE,n.N_Name,Weight,tf_idf_sum
2331,2331,77273,1,HP:0001118,"['UMLS:C0302254', 'SNOMEDCT_US:399336001', 'HP...",JUVENILE CATARACT,1.2e-05,5.061877e-10
5301,5301,102953,1,HP:0004854,"['UMLS:C1839167', 'HP:0004854']",INTERMITTENT THROMBOCYTOPENIA,1.3e-05,5.521535e-10
1852,1852,58776,1,HP:0006311,"['UMLS:C4025065', 'HP:0006311']","['DECREASED WIDTH OF ALL TEETH', 'DECREASED TO...",1.2e-05,5.777107e-10
3790,3790,66181,1,HP:0100959,"['UMLS:C4020919', 'HP:0100959']","['DENSE METAPHYSEAL LINES', 'TRANSVERSE METAPH...",1.3e-05,6.15718e-10
4216,4216,111104,1,HP:0007404,"['UMLS:C1833030', 'MESH:C563422', 'HP:0007404']",NONEPIDERMOLYTIC PALMOPLANTAR KERATODERMA,1.2e-05,6.297146e-10
4553,4553,116503,1,HP:0008005,"['UMLS:C1611195', 'HP:0008005']",CONGENITAL CORNEAL DYSTROPHY,1.4e-05,6.308429e-10
3998,3998,98087,1,HP:0006779,"['SNOMEDCT_US:404053004', 'SNOMEDCT_US:6344900...",ALVEOLAR RHABDOMYOSARCOMA,1.6e-05,6.470843e-10
5651,5651,50367,1,HP:0007690,"['UMLS:C0271285', 'SNOMEDCT_US:32935005', 'HP:...",MAP-DOT-FINGERPRINT CORNEAL DYSTROPHY,1.3e-05,6.509786e-10
6178,6178,51963,1,http://purl.obolibrary.org/obo/HP_0025355,HP:0025355,RETINAL ARTERIAL MACROANEURYSMS,1.2e-05,6.702058e-10
3001,3001,116573,1,HP:0008007,"['UMLS:C1533041', 'SNOMEDCT_US:415176004', 'HP...",PRIMARY CONGENITAL GLAUCOMA,1.8e-05,6.831869e-10
