In [1]:
import os
import pandas as pd
import numpy as np
from neo4j import Query, GraphDatabase, RoutingControl, Result # Python database driver 5.13 +

## Database connection

In [2]:
DB_ULR = "neo4j://localhost:7687"
DB_USER = "neo4j"
DB_PASS = "test1234"
DB_NAME = "neo4j" # Have to be neo4j for neo4j aura (but keep it, good for testing on local dev env)

In [3]:
driver = GraphDatabase.driver(DB_ULR, auth=(DB_USER, DB_PASS))
driver.verify_connectivity()

## Utility functions

In [4]:
## Utility
def split_dataframe(df, chunk_size = 5000): 
    chunks = list()
    num_chunks = len(df) // chunk_size + 1
    for i in range(num_chunks):
        chunks.append(df[i*chunk_size:(i+1)*chunk_size])
    return chunks

## Data sources

In [9]:
kg_data = pd.read_csv('./data/kg.csv', low_memory=False)

In [10]:
## format the data for neo4j for nodes
node_data = kg_data[["x_id", "x_type", "x_name", "x_source"]].drop_duplicates()
 
# append y entities to node data
y_data = kg_data[["y_id", "y_type", "y_name", "y_source"]].drop_duplicates()
y_data.columns = ["x_id", "x_type", "x_name", "x_source"]
node_data = node_data.append(y_data).drop_duplicates()
 
# rename columns
node_data.columns = ["id", "type", "name", "source"]

  node_data = node_data.append(y_data).drop_duplicates()


In [17]:
node_data.head()

Unnamed: 0,id,type,name,source
0,9796,gene/protein,PHYHIP,NCBI
1,7918,gene/protein,GPANK1,NCBI
2,8233,gene/protein,ZRSR2,NCBI
3,4899,gene/protein,NRF1,NCBI
4,5297,gene/protein,PI4KA,NCBI


In [20]:
node_labels = node_data["type"].drop_duplicates().to_list()
node_labels

['gene/protein',
 'drug',
 'effect/phenotype',
 'disease',
 'biological_process',
 'molecular_function',
 'cellular_component',
 'exposure',
 'pathway',
 'anatomy']

## Define indexes and constraints

In [35]:
for label in node_labels:
    driver.execute_query(
        'create constraint if not exists for (n:`{label}`) require (n.id) is node key'.format(label=label),
        database_=DB_NAME,
        routing_=RoutingControl.WRITE
    )

# Fetch all constraints
schema_result_df  = driver.execute_query(
    'show constraints',
    database_=DB_NAME,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
)
schema_result_df.head(100)

Unnamed: 0,id,name,type,entityType,labelsOrTypes,properties,ownedIndex,propertyType
0,8,constraint_142cd3cc,NODE_KEY,NODE,[effect/phenotype],[id],constraint_142cd3cc,
1,16,constraint_3a36d8ce,NODE_KEY,NODE,[cellular_component],[id],constraint_3a36d8ce,
2,12,constraint_445142fa,NODE_KEY,NODE,[biological_process],[id],constraint_445142fa,
3,20,constraint_6249184,NODE_KEY,NODE,[pathway],[id],constraint_6249184,
4,6,constraint_797fe72a,NODE_KEY,NODE,[drug],[id],constraint_797fe72a,
5,18,constraint_90f86fd7,NODE_KEY,NODE,[exposure],[id],constraint_90f86fd7,
6,4,constraint_91f1fb3e,NODE_KEY,NODE,[gene/protein],[id],constraint_91f1fb3e,
7,22,constraint_a2d568b5,NODE_KEY,NODE,[anatomy],[id],constraint_a2d568b5,
8,10,constraint_a96823a5,NODE_KEY,NODE,[disease],[id],constraint_a96823a5,
9,14,constraint_f14c3bd5,NODE_KEY,NODE,[molecular_function],[id],constraint_f14c3bd5,


## Graph creation

### Nodes

In [36]:
for label in node_labels:
    for chunk in split_dataframe( node_data[ node_data['type'] == label].drop_duplicates(subset='id', keep="last"), 50_000):
        records, summary, keys = driver.execute_query(
            ''' 
                unwind $rows as row
                create (n:`{label}`{{id: row['id']}})
                    set n += {{ 
                        name: row['name'], 
                        source: row['source']
                    }} 
                return count(*) as rows_processed
            '''.format(label = label),
            database_=DB_NAME,
            routing_=RoutingControl.WRITE,
            rows = chunk.to_dict('records')
        )
        print(summary.counters)

{'_contains_updates': True, 'labels_added': 27610, 'nodes_created': 27610, 'properties_set': 82830}
{'_contains_updates': True, 'labels_added': 7957, 'nodes_created': 7957, 'properties_set': 23871}
{'_contains_updates': True, 'labels_added': 15311, 'nodes_created': 15311, 'properties_set': 45933}
{'_contains_updates': True, 'labels_added': 17080, 'nodes_created': 17080, 'properties_set': 51240}
{'_contains_updates': True, 'labels_added': 28642, 'nodes_created': 28642, 'properties_set': 85926}
{'_contains_updates': True, 'labels_added': 11169, 'nodes_created': 11169, 'properties_set': 33507}
{'_contains_updates': True, 'labels_added': 4176, 'nodes_created': 4176, 'properties_set': 12528}
{'_contains_updates': True, 'labels_added': 818, 'nodes_created': 818, 'properties_set': 2454}
{'_contains_updates': True, 'labels_added': 2516, 'nodes_created': 2516, 'properties_set': 7548}
{'_contains_updates': True, 'labels_added': 14033, 'nodes_created': 14033, 'properties_set': 42099}


### Relationships