In [37]:
import pickle
import pandas as pd
from neo4j import GraphDatabase, basic_auth
from dotenv import load_dotenv, find_dotenv
import os


In [38]:
# Read the Neo4j connection settings from ~/.neo4j_config.env and open the
# driver used by the rest of the notebook.
load_dotenv(os.path.join(os.path.expanduser('~'), '.neo4j_config.env'))

SPOKE_USER = os.environ.get('SPOKE_USER')
SPOKE_PASSWORD = os.environ.get('SPOKE_PSW')
URI = os.environ.get('SPOKE_URI')

# Fail fast with a readable message: a missing variable would otherwise reach
# the driver as None and surface as an opaque connection/auth error.
_missing_settings = [
    env_name
    for env_name, env_value in (
        ('SPOKE_USER', SPOKE_USER),
        ('SPOKE_PSW', SPOKE_PASSWORD),
        ('SPOKE_URI', URI),
    )
    if env_value is None
]
if _missing_settings:
    raise RuntimeError(
        'Missing Neo4j settings: ' + ', '.join(_missing_settings)
        + ' (expected in the environment or in ~/.neo4j_config.env)'
    )

auth = basic_auth(SPOKE_USER, SPOKE_PASSWORD)
sdb = GraphDatabase.driver(URI, auth=auth)


In [39]:
# Load the pickled collection of disease names that drives the lookup below.
# NOTE: unpickling executes code embedded in the file -- only load trusted data.
with open('../data/disease_with_relation_to_genes.pickle', 'rb') as pickle_handle:
    disease_names = pickle.loads(pickle_handle.read())

In [40]:
%%time

# Look up the identifier of every Disease node whose name is in
# `disease_names`.
#
# The names are sent as a query parameter ($disease_names) instead of being
# formatted into the Cypher text: string interpolation depends on Python's
# repr() happening to be valid Cypher, is open to injection, and produces a
# different (very long) query string for every input list.
query = '''
    MATCH (d:Disease)
    WHERE d.name IN $disease_names
    RETURN d.identifier AS d_id, d.name AS d_name
'''
with sdb.session() as session:
    with session.begin_transaction() as tx:
        # list(...) so any sequence of names is serialised as a Cypher list.
        result = tx.run(query, disease_names=list(disease_names))
        # Consume the result while the transaction is still open; each entry
        # is an (identifier, name) pair.
        out_list = [(row['d_id'], row['d_name']) for row in result]
            

CPU times: user 176 ms, sys: 23.3 ms, total: 199 ms
Wall time: 607 ms


In [41]:

# (identifier, name) pairs returned by the graph query, as a lookup table.
id_lookup_df = pd.DataFrame(out_list, columns=['disease_id', 'disease_name'])

# One row per requested disease name.
disease_name_df = pd.DataFrame(disease_names, columns=['disease_name'])

# Left join keeps every requested name, including those with no match in the
# query result; exact duplicate rows are then removed.
merged_df = disease_name_df.merge(id_lookup_df, on='disease_name', how='left')
disease_df = merged_df.drop_duplicates()

# Persist the name -> identifier table without the index column.
disease_df.to_csv('../data/disease_name_with_id.csv', index=False)


In [42]:
# Show the merged disease name / identifier table as the cell's output.
disease_df

Unnamed: 0,disease_name,disease_id
0,serum amyloid A amyloidosis,DOID:0080936
1,erythroleukemia,DOID:0080916
2,spondylometaphyseal dysplasia Sedaghatian type,DOID:0112298
3,cerebrooculofacioskeletal syndrome 2,DOID:0080912
4,bilateral frontoparietal polymicrogyria,DOID:0080922
...,...,...
6305,graft-versus-host disease,DOID:0081267
6306,acute myeloid leukemia with maturation,DOID:0081087
6307,frontonasal dysplasia,DOID:0081044
6308,central diabetes insipidus,DOID:0081055
