# BioKnowledge reviewer implementation

The following code uses the BioKnowledge Reviewer framework to retrieve all the data needed to build the knowledge graphs.

In [2]:
import transcriptomics, regulation, curation, monarch, graph, neo4jlib, hypothesis, summary, utils
import pandas as pd



## Subgraphs

### Curation

In [None]:
# Load the manually curated subgraph: the edge table first, then the node table.
curation_edges, curation_nodes = (
    pd.read_csv("curation/data/HD/HD_curated_edges.csv"),
    pd.read_csv("curation/data/HD/HD_curated_nodes.csv"),
)

### Transcriptomics

In [None]:
%%time
# Transcriptomics subgraph: read the differential-expression table, filter it,
# and turn the result into an RNA network.
# NOTE: `data` and `data_edges` are rebound later by the Regulation cell.
# Despite its name, csv_path points at a tab-separated .txt file (hence sep='\t').
csv_path = './transcriptomics/GSE64810_mlhd_DESeq2_diffexp_DESeq2_outlier_trimmed_adjust.txt'
data = transcriptomics.read_data(csv_path, sep='\t')
# clean_data() returns the cleaned table plus two subsets, `up` and `down`
# (presumably up-/down-regulated genes; confirm in transcriptomics.clean_data).
clean_data, up, down = transcriptomics.clean_data(data)
# Log message copied from a previous run of clean_data(): keeps only data with FC > 1.5 and FDR < 5%
data_edges = transcriptomics.prepare_data_edges(up, down)  # only the up/down subsets are passed on
rna_network = transcriptomics.prepare_rna_edges(data_edges)

# build network with graph schema
rna_edges = transcriptomics.build_edges(rna_network)
rna_nodes, node_dict = transcriptomics.build_nodes(rna_network)
# rework_edges() receives the edges wrapped in a DataFrame together with node_dict
# from build_nodes(); its result replaces rna_edges.
rna_edges = transcriptomics.rework_edges(pd.DataFrame(rna_edges), node_dict)

### Regulation

In [None]:
%%time
# Regulation subgraph: prepare the MSigDB gene-set file, load the TF-gene edges,
# normalise gene identifiers, and build the regulation network.
# NOTE: `data` and `data_edges` rebind the names used by the Transcriptomics cell.

# prepare msigdb data
# (.gmt gene-set file; per its name: MSigDB collection c3.tft, v6.1, Entrez gene IDs)
gmt_path = './regulation/msigdb/data/c3.tft.v6.1.entrez.gmt'
regulation.prepare_msigdb_data(gmt_path)

# prepare individual networks
# load_tf_gene_edges() takes no arguments; presumably it reads what
# prepare_msigdb_data() produced (confirm in the regulation module).
data = regulation.load_tf_gene_edges()
dicts = regulation.get_gene_id_normalization_dictionaries(data)
data_edges = regulation.prepare_data_edges(data, dicts)

# prepare regulation network
reg_network = regulation.prepare_regulation_edges(data_edges)

# build network with graph schema
reg_edges = regulation.build_edges(reg_network)
reg_nodes = regulation.build_nodes(reg_network)

### Monarch

In [3]:
%%time
# Monarch subgraph: seed nodes -> first-shell neighbours -> ortholog-phenotypes -> edges,
# then convert the retrieved network to the graph schema.

# Alzheimer's disease seed set. It is not used by any call in this cell.
# (Author's note on further AD genes: 'HGNC:9508' = PSEN1, 'HGNC:9509' = PSEN2.)
seedList_a = [
    'MONDO:0004975',  # AD
    'HGNC:620',       # APP
    'HGNC:2095',      # Clu
    'HGNC:6893',      # MAPT
    'HGNC:933',       # BACE1
    'HGNC:613',       # APOE
]

# Seeds actually used: the HD disease node and the HTT gene.
seedList = ['MONDO:0007739', 'HGNC:4851']

# First shell: neighbours of the seeds.
neighboursList = monarch.get_neighbours_list(seedList)
print(len(neighboursList))

# Animal-model ortholog-phenotypes, first for the seeds, then for the first shell.
seed_orthophenoList = monarch.get_orthopheno_list(seedList)
print(len(seed_orthophenoList))
neighbours_orthophenoList = monarch.get_orthopheno_list(neighboursList)
print(len(neighbours_orthophenoList))

# Network nodes = seeds + first shell + ortholog-phenotypes, concatenated in that order.
geneList = [
    *seedList,
    *neighboursList,
    *seed_orthophenoList,
    *neighbours_orthophenoList,
]
print('genelist: ',len(geneList))

# Retrieve the Monarch edges for those nodes, then save them.
monarch_network = monarch.extract_edges(geneList)
print('network: ',len(monarch_network))
monarch.print_network(monarch_network, 'monarch_connections')

# Convert the network to the graph schema.
monarch_edges = monarch.build_edges(monarch_network)
monarch_nodes = monarch.build_nodes(monarch_network)


The function "get_neighbours_list()" is running. Its runtime may take some minutes. If you interrupt the process, you will lose all the nodes retrieved and you should start over the execution of this function.


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00,  2.66s/it]



Finished get_neighbours_list().

419

The function "get_orthopheno_list()" is running. Its runtime may take some hours. If you interrupt the process, you will lose all the nodes retrieved and you should start over the execution of this function.


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00,  2.87s/it]
0it [00:00, ?it/s]



Finished get_orthopheno_list().

0

The function "get_orthopheno_list()" is running. Its runtime may take some hours. If you interrupt the process, you will lose all the nodes retrieved and you should start over the execution of this function.


100%|████████████████████████████████████████████████████████████████████████████████| 419/419 [18:40<00:00,  2.68s/it]
0it [00:00, ?it/s]



Finished get_orthopheno_list().

0
genelist:  421

The function "extract_edges()" is running. Its runtime may take some hours. If you interrupt the process, you will lose all the edges retrieved and you should start over the execution of this function.


100%|████████████████████████████████████████████████████████████████████████████████| 421/421 [19:40<00:00,  2.80s/it]



Finished extract_edges(). To save the retrieved Monarch edges use the function "print_network()".

network:  10597

Saving Monarch edges at: 'C:\Users\mirei\Desktop\LEIDEN\th\kg\bioknowledge-reviewer-ADHD\bioknowledge_reviewer/monarch/monarch_connections_v2024-05-27.csv'...


The function "build_edges()" is running...
df (10597, 9)

* This is the size of the edges file data structure: (10597, 9)
* These are the edges attributes: Index(['subject_id', 'object_id', 'property_id', 'property_label',
       'property_description', 'property_uri', 'reference_uri',
       'reference_supporting_text', 'reference_date'],
      dtype='object')
* This is the first record:
  subject_id  object_id             property_id          property_label  \
0  HGNC:3272  HGNC:3274  biolink:interacts_with  biolink:interacts_with   

  property_description                                       property_uri  \
0                   NA  http://purl.obolibrary.org/obo/biolink_interac...   

                        

## Merge

In [4]:
# File paths (strings, not DataFrames) of the per-source tables that are merged below.
# NOTE: this rebinds curation_edges/curation_nodes and monarch_edges/monarch_nodes,
# which earlier cells bound to the loaded/built tables, to plain path strings.
# Curation and transcriptomics both point at the Empty_* placeholder files, so those
# two sources contribute no rows to the merge.

# edges
curation_edges = "curation/data/Empty_edges.csv"
monarch_edges = "monarch/monarch_edges_v2024-05-27.csv"
# regulation files are read from graph/ and carry a different date stamp (2024-05-25)
regulation_edges = "graph/regulation_edges_v2024-05-25.csv"
transcriptomics_edges = "curation/data/Empty_edges.csv"  # placeholder: same empty file as curation

# nodes
curation_nodes = "curation/data/Empty_nodes.csv"
monarch_nodes = "monarch/monarch_nodes_v2024-05-27.csv"
regulation_nodes = "graph/regulation_nodes_v2024-05-25.csv"
transcriptomics_nodes = "curation/data/Empty_nodes.csv" # placeholder: same empty file as curation

In [5]:
# Extra connectivity step: collect the nodes of the combined subgraphs.
# graph_nodes_list is what the following cell passes to monarch.extract_edges().
#
# Each *_edges name must hold a CSV path at this point. The files are read in the
# order listed and handed to graph.graph_nodes() as keyword arguments of the same name.
# NOTE(review): the saved output of this cell starts with a warning fragment
# (`exec(code_obj, ...)`); if it is a pandas DtypeWarning from read_csv, consider
# passing explicit dtypes. Confirm by re-running.
graph_nodes_list, reg_graph_edges = graph.graph_nodes(
    **{
        source: pd.read_csv(path)
        for source, path in (
            ("curation", curation_edges),
            ("monarch", monarch_edges),
            ("transcriptomics", transcriptomics_edges),
            ("regulation", regulation_edges),
        )
    }
)

  exec(code_obj, self.user_global_ns, self.user_ns)



The function "graph_nodes()" is running...

Preparing networks...
Curated:
(0, 9)
Index(['subject_id', 'property_id', 'object_id', 'reference_uri',
       'reference_supporting_text', 'reference_date', 'property_label',
       'property_description', 'property_uri'],
      dtype='object')
Monarch:
(10597, 9)
Index(['subject_id', 'property_id', 'object_id', 'reference_uri',
       'reference_supporting_text', 'reference_date', 'property_label',
       'property_description', 'property_uri'],
      dtype='object')
Transcriptomics:
(0, 9)
Index(['subject_id', 'property_id', 'object_id', 'reference_uri',
       'reference_supporting_text', 'reference_date', 'property_label',
       'property_description', 'property_uri'],
      dtype='object')
Regulatory:
(197267, 9)
Index(['subject_id', 'object_id', 'property_id', 'property_label',
       'property_description', 'property_uri', 'reference_uri',
       'reference_supporting_text', 'reference_date'],
      dtype='object')

Concatenating in

In [6]:
# Query Monarch for the edges among all nodes returned by graph.graph_nodes().
# NOTE: very long-running. The saved run covered 17084 nodes in roughly 10 h 44 min.
# It logged per-node errors (AttributeError, IndexError, requests ConnectionError /
# ChunkedEncodingError) but still ran to 100%; nodes that errored may be missing
# from the result (verify in monarch.extract_edges).
monarch_network_graph = monarch.extract_edges(graph_nodes_list)
print('network: ',len(monarch_network_graph))


The function "extract_edges()" is running. Its runtime may take some hours. If you interrupt the process, you will lose all the edges retrieved and you should start over the execution of this function.


  0%|                                                                                        | 0/17084 [00:00<?, ?it/s]

error: <class 'AttributeError'>


  0%|                                                                             | 18/17084 [00:30<8:13:46,  1.74s/it]

error: <class 'IndexError'>


  3%|██▌                                                                         | 584/17084 [20:04<9:45:30,  2.13s/it]

error: <class 'IndexError'>


  4%|██▉                                                                        | 664/17084 [22:52<10:06:33,  2.22s/it]

error: <class 'IndexError'>


  5%|███▋                                                                        | 831/17084 [28:36<8:31:29,  1.89s/it]

error: <class 'IndexError'>


  9%|██████▍                                                                    | 1470/17084 [51:08<7:59:54,  1.84s/it]

error: <class 'IndexError'>


  9%|██████▉                                                                    | 1575/17084 [54:49<7:39:13,  1.78s/it]

error: <class 'IndexError'>


  9%|███████                                                                    | 1601/17084 [55:41<7:09:22,  1.66s/it]

error: <class 'IndexError'>


 12%|████████▌                                                                | 1993/17084 [1:09:26<9:19:37,  2.23s/it]

error: <class 'IndexError'>


 13%|█████████▎                                                               | 2166/17084 [1:15:54<7:17:38,  1.76s/it]

error: <class 'IndexError'>


 13%|█████████▊                                                               | 2296/17084 [1:20:28<7:02:56,  1.72s/it]

error: <class 'IndexError'>


 14%|██████████▏                                                              | 2371/17084 [1:23:08<8:18:07,  2.03s/it]

error: <class 'IndexError'>


 14%|██████████▎                                                              | 2404/17084 [1:24:14<8:13:51,  2.02s/it]

error: <class 'IndexError'>


 15%|██████████▋                                                              | 2494/17084 [1:27:16<9:16:33,  2.29s/it]

error: <class 'IndexError'>


 15%|███████████▎                                                             | 2647/17084 [1:33:07<8:04:50,  2.01s/it]

error: <class 'IndexError'>


 19%|█████████████▌                                                           | 3186/17084 [1:51:36<7:50:16,  2.03s/it]

error: <class 'IndexError'>


 21%|███████████████▌                                                         | 3653/17084 [2:07:06<6:47:05,  1.82s/it]

error: <class 'IndexError'>


 24%|█████████████████▊                                                       | 4163/17084 [2:23:48<8:19:24,  2.32s/it]

error: <class 'IndexError'>


 27%|███████████████████▌                                                     | 4589/17084 [2:37:24<7:14:51,  2.09s/it]

error: <class 'IndexError'>


 29%|████████████████████▉                                                    | 4905/17084 [2:47:47<5:15:49,  1.56s/it]

error: <class 'IndexError'>


 30%|██████████████████████                                                   | 5174/17084 [2:56:39<6:47:59,  2.06s/it]

error: <class 'IndexError'>


 31%|██████████████████████▌                                                  | 5291/17084 [3:00:26<6:20:00,  1.93s/it]

error: <class 'IndexError'>


 34%|████████████████████████▋                                                | 5776/17084 [3:16:22<6:03:37,  1.93s/it]

error: <class 'IndexError'>


 34%|█████████████████████████                                                | 5877/17084 [3:19:45<6:53:47,  2.22s/it]

error: <class 'IndexError'>


 35%|█████████████████████████▎                                               | 5930/17084 [3:21:30<5:31:10,  1.78s/it]

error: <class 'IndexError'>


 36%|██████████████████████████▎                                              | 6145/17084 [3:28:23<4:38:02,  1.53s/it]

error: <class 'IndexError'>


 39%|████████████████████████████▎                                            | 6612/17084 [3:43:03<5:45:40,  1.98s/it]

error: <class 'IndexError'>


 40%|█████████████████████████████▏                                           | 6843/17084 [3:50:48<5:51:14,  2.06s/it]

error: <class 'IndexError'>


 40%|█████████████████████████████▎                                           | 6855/17084 [3:51:05<4:24:56,  1.55s/it]

error: <class 'IndexError'>


 44%|███████████████████████████████▉                                         | 7480/17084 [4:11:02<4:36:17,  1.73s/it]

error: <class 'IndexError'>


 46%|█████████████████████████████████▋                                       | 7871/17084 [4:23:30<5:27:00,  2.13s/it]

error: <class 'IndexError'>


 46%|█████████████████████████████████▊                                       | 7924/17084 [4:25:12<4:37:10,  1.82s/it]

error: <class 'IndexError'>


 47%|██████████████████████████████████▍                                      | 8067/17084 [4:29:33<4:19:06,  1.72s/it]

error: <class 'IndexError'>


 48%|███████████████████████████████████                                      | 8216/17084 [4:34:04<3:25:30,  1.39s/it]

error: <class 'IndexError'>


 51%|█████████████████████████████████████▏                                   | 8705/17084 [4:49:40<4:18:36,  1.85s/it]

error: <class 'IndexError'>


 54%|███████████████████████████████████████▏                                 | 9182/17084 [5:04:39<4:35:33,  2.09s/it]

error: <class 'IndexError'>


 60%|██████████████████████████████████████████▉                             | 10198/17084 [5:36:48<3:39:32,  1.91s/it]

error: <class 'IndexError'>


 64%|█████████████████████████████████████████████▋                          | 10854/17084 [5:57:26<3:08:05,  1.81s/it]

error: <class 'IndexError'>


 64%|█████████████████████████████████████████████▊                          | 10863/17084 [5:57:39<2:17:36,  1.33s/it]

error: <class 'IndexError'>


 66%|████████████████████████████████████████████                       | 11232/17084 [7:53:26<3038:27:48, 1869.18s/it]

error: <class 'requests.exceptions.ChunkedEncodingError'>


 66%|████████████████████████████████████████████                       | 11233/17084 [7:53:27<2126:58:25, 1308.68s/it]

error: <class 'requests.exceptions.ConnectionError'>


 66%|████████████████████████████████████████████▋                       | 11234/17084 [7:53:27<1488:59:14, 916.30s/it]

error: <class 'requests.exceptions.ConnectionError'>


 66%|█████████████████████████████████████████████▍                       | 11236/17084 [7:53:28<729:31:10, 449.09s/it]

error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>


 66%|██████████████████████████████████████████████▋                        | 11247/17084 [7:53:28<98:51:34, 60.97s/it]

error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>


 66%|██████████████████████████████████████████████▊                        | 11267/17084 [7:53:28<24:25:27, 15.12s/it]

error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionError'>
error: <class 'requests.exceptions.ConnectionE

 68%|████████████████████████████████████████████████▉                       | 11604/17084 [8:03:37<3:24:46,  2.24s/it]

error: <class 'IndexError'>


 70%|██████████████████████████████████████████████████                      | 11878/17084 [8:11:55<2:57:01,  2.04s/it]

error: <class 'IndexError'>


 70%|██████████████████████████████████████████████████▌                     | 11995/17084 [8:15:32<2:07:11,  1.50s/it]

error: <class 'IndexError'>


 74%|█████████████████████████████████████████████████████▏                  | 12619/17084 [8:34:39<2:23:04,  1.92s/it]

error: <class 'IndexError'>


 74%|█████████████████████████████████████████████████████▎                  | 12639/17084 [8:35:17<2:40:02,  2.16s/it]

error: <class 'IndexError'>


 77%|███████████████████████████████████████████████████████                 | 13071/17084 [8:48:47<1:49:57,  1.64s/it]

error: <class 'IndexError'>


 81%|██████████████████████████████████████████████████████████▌             | 13888/17084 [9:14:36<1:30:15,  1.69s/it]

error: <class 'IndexError'>


 83%|███████████████████████████████████████████████████████████▊            | 14199/17084 [9:24:23<1:39:53,  2.08s/it]

error: <class 'IndexError'>


 84%|████████████████████████████████████████████████████████████▌           | 14366/17084 [9:29:29<1:28:55,  1.96s/it]

error: <class 'IndexError'>


 84%|████████████████████████████████████████████████████████████▌           | 14375/17084 [9:29:45<1:26:20,  1.91s/it]

error: <class 'IndexError'>


 87%|██████████████████████████████████████████████████████████████▋         | 14873/17084 [9:45:09<1:19:04,  2.15s/it]

error: <class 'IndexError'>


 88%|█████████████████████████████████████████████████████████████████         | 15034/17084 [9:50:10<53:26,  1.56s/it]

error: <class 'IndexError'>


 89%|███████████████████████████████████████████████████████████████▋        | 15121/17084 [9:52:53<1:04:31,  1.97s/it]

error: <class 'IndexError'>


 93%|███████████████████████████████████████████████████████████████████▊     | 15868/17084 [10:14:04<24:45,  1.22s/it]

error: <class 'IndexError'>


 93%|████████████████████████████████████████████████████████████████████▏    | 15962/17084 [10:16:22<25:11,  1.35s/it]

error: <class 'IndexError'>


 95%|█████████████████████████████████████████████████████████████████████    | 16172/17084 [10:21:40<24:27,  1.61s/it]

error: <class 'IndexError'>


 95%|█████████████████████████████████████████████████████████████████████▎   | 16223/17084 [10:22:54<21:06,  1.47s/it]

error: <class 'IndexError'>


100%|████████████████████████████████████████████████████████████████████████▉| 17075/17084 [10:44:15<00:12,  1.39s/it]

error: <class 'IndexError'>


100%|█████████████████████████████████████████████████████████████████████████| 17084/17084 [10:44:25<00:00,  2.26s/it]


Finished extract_edges(). To save the retrieved Monarch edges use the function "print_network()".

network:  1159364





In [7]:
## save Monarch network
# Per the saved output this writes monarch/monarch_connections_graph_v<date>.csv.
monarch.print_network(monarch_network_graph, 'monarch_connections_graph')

## build Monarch network with graph schema
# NOTE(review): the saved outputs suggest build_edges() also writes
# monarch/monarch_edges_v<date>.csv. That file had 10597 rows when read before this
# cell and 1159364 rows when read after it. If so, this overwrites the file produced
# for the first Monarch subgraph; confirm in monarch.build_edges.
monarch_graph_edges = monarch.build_edges(monarch_network_graph)
monarch_graph_nodes = monarch.build_nodes(monarch_network_graph)


Saving Monarch edges at: 'C:\Users\mirei\Desktop\LEIDEN\th\kg\bioknowledge-reviewer-ADHD\bioknowledge_reviewer/monarch/monarch_connections_graph_v2024-05-27.csv'...


The function "build_edges()" is running...
df (1159364, 9)

* This is the size of the edges file data structure: (1159364, 9)
* These are the edges attributes: Index(['subject_id', 'object_id', 'property_id', 'property_label',
       'property_description', 'property_uri', 'reference_uri',
       'reference_supporting_text', 'reference_date'],
      dtype='object')
* This is the first record:
   subject_id   object_id      property_id   property_label  \
0  HGNC:27414  GO:0005515  biolink:enables  biolink:enables   

  property_description                                    property_uri  \
0                   NA  http://purl.obolibrary.org/obo/biolink_enables   

  reference_uri                          reference_supporting_text  \
0            NA  This edge comes from the Monarch Knowledge Gra...   

  reference_date  


In [9]:
# build review graph
# These rebind monarch_graph_edges / monarch_graph_nodes from the built tables to file paths.
# NOTE(review): the paths are identical to the ones used for the first Monarch subgraph
# (monarch_edges / monarch_nodes). The saved output shows 1159364 Monarch rows here, so the
# files presumably now hold the extended network; confirm that this is intended.
monarch_graph_edges = "monarch/monarch_edges_v2024-05-27.csv"
monarch_graph_nodes = "monarch/monarch_nodes_v2024-05-27.csv"

# Merge the four edge sources. input_from_file=True is passed because every argument
# here is a file path rather than an in-memory table.
edges = graph.build_edges(
    curation=curation_edges,
    monarch=monarch_graph_edges, 
    regulation=regulation_edges,
    transcriptomics=transcriptomics_edges, 
    input_from_file=True
)
# Build the node table from the merged edges (`statements`) plus the per-source node files.
nodes = graph.build_nodes(
    statements= edges,
    curation=curation_nodes,
    monarch=monarch_graph_nodes, 
    regulation=regulation_nodes,
    transcriptomics=transcriptomics_nodes, 
    input_from_file=True
)


The function "build_edges()" is running...

Preparing networks...
Curated:
(0, 9)
Index(['subject_id', 'property_id', 'object_id', 'reference_uri',
       'reference_supporting_text', 'reference_date', 'property_label',
       'property_description', 'property_uri'],
      dtype='object')
Monarch:
(1159364, 9)
Index(['subject_id', 'property_id', 'object_id', 'reference_uri',
       'reference_supporting_text', 'reference_date', 'property_label',
       'property_description', 'property_uri'],
      dtype='object')
Transcriptomics:
(0, 9)
Index(['subject_id', 'property_id', 'object_id', 'reference_uri',
       'reference_supporting_text', 'reference_date', 'property_label',
       'property_description', 'property_uri'],
      dtype='object')
Regulatory:
(197267, 9)
Index(['subject_id', 'object_id', 'property_id', 'property_label',
       'property_description', 'property_uri', 'reference_uri',
       'reference_supporting_text', 'reference_date'],
      dtype='object')

Concatenating 

In [None]:
# import to graph database
## prepare the graph to neo4j format
# Convert the merged edges/nodes to DataFrames, then to the statement/concept
# structures that neo4jlib saves.
edges_df = utils.get_dataframe(edges)
nodes_df = utils.get_dataframe(nodes)
statements = neo4jlib.get_statements(edges_df)
concepts = neo4jlib.get_concepts(nodes_df)
print('statements: ', len(statements))
print('concepts: ',len(concepts))

## save files into neo4j import dir
# The path is left empty. The earlier form './{}'.format(neo4j_dir) is disabled, and
# neo4j_dir is not defined in the cells shown. Where save_neo4j_files() writes for an
# empty path is not visible here (confirm in neo4jlib).
neo4j_path = ''
neo4jlib.save_neo4j_files(statements, neo4j_path, file_type = 'statements')
neo4jlib.save_neo4j_files(concepts, neo4j_path, file_type = 'concepts')