<a href="https://colab.research.google.com/github/olonok69/LLM_Notebooks/blob/main/neo4j/load_data_via_graph_construction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Graph construct: Import from Pandas

## Prerequisites

Running this notebook requires a Neo4j server with a recent version (2.0+) of GDS installed.
We recommend using Neo4j Desktop with GDS, or AuraDS.

The `graphdatascience` Python library needs to be installed as well. The next cell installs it with `%pip`; see the [client installation instructions](https://neo4j.com/docs/graph-data-science-client/current/installation/) for other options.

In [1]:

%pip install graphdatascience

Collecting graphdatascience
  Downloading graphdatascience-1.10-py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multimethod<2.0,>=1.0 (from graphdatascience)
  Downloading multimethod-1.11.2-py3-none-any.whl (10 kB)
Collecting neo4j<6.0,>=4.4.2 (from graphdatascience)
  Downloading neo4j-5.19.0.tar.gz (202 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m203.0/203.0 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting textdistance<5.0,>=4.0 (from graphdatascience)
  Downloading textdistance-4.6.1-py3-none-any.whl (31 kB)
Building wheels for collected packages: neo4j
  Building wheel for neo4j (pyproject.toml) ... [?25l[?

In [5]:
import os
from graphdatascience import GraphDataScience
import pandas as pd

In [27]:


# Read the Neo4j connection settings from the environment so that no
# credentials are stored in the notebook or its saved outputs.
# Set NEO4J_URI, NEO4J_USER and NEO4J_PASSWORD before running this cell
# (for example with `%env` or through your platform's secrets mechanism).
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")

# Only pass credentials when a password was supplied; auth=None is used for
# servers that have authentication disabled.
NEO4J_AUTH = (NEO4J_USER, NEO4J_PASSWORD) if NEO4J_PASSWORD else None

# Client object through which every GDS call below is made.
gds = GraphDataScience(NEO4J_URI, auth=NEO4J_AUTH)

In [28]:
# Show the GDS library version seen through the client; the prerequisites
# above call for 2.0 or newer.
gds.version()

'2.6.5'

# Load the Cora dataset

The Cora dataset consists of 2708 scientific publications classified into one of seven classes. The citation network consists of 5429 links. Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from the dictionary. The dictionary consists of 1433 unique words.


Dataset description: <https://graphsandnetworks.com/the-cora-dataset/>


In [29]:

# Remote locations of the two Cora files: the per-paper content table and the
# list of citation pairs.
CORA_CONTENT = "https://data.neo4j.com/cora/cora.content"
CORA_CITES = "https://data.neo4j.com/cora/cora.cites"

# Neither file has a header row, so the columns are addressed by position.
content, cites = (
    pd.read_csv(url, header=None) for url in (CORA_CONTENT, CORA_CITES)
)


In [30]:
# Preview the content table: column 0 holds the paper id, column 1 the subject
# name, and the remaining columns the 0/1 word flags.
content.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1425,1426,1427,1428,1429,1430,1431,1432,1433,1434
0,31336,Neural_Networks,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,1061127,Rule_Learning,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,1106406,Reinforcement_Learning,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,13195,Reinforcement_Learning,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,37879,Probabilistic_Methods,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# Preview the first ten citation pairs (two columns of paper ids).
cites.head(10)

Unnamed: 0,0,1
0,35,1033
1,35,103482
2,35,103515
3,35,1050679
4,35,1103960
5,35,1103985
6,35,1109199
7,35,1112911
8,35,1113438
9,35,1113831


In [32]:

# Integer code for each of the seven Cora subject classes; a subject's
# position in the tuple below is the code assigned to it.
SUBJECT_TO_ID = {
    subject: code
    for code, subject in enumerate(
        (
            "Neural_Networks",
            "Rule_Learning",
            "Reinforcement_Learning",
            "Probabilistic_Methods",
            "Theory",
            "Genetic_Algorithms",
            "Case_Based",
        )
    )
}

In [33]:

# Translate each subject name to its integer code. `Series.map` produces a
# numeric column directly, whereas `Series.replace` on an object column relies
# on implicit downcasting that pandas 2.2 deprecates.
subject_codes = content[1].map(SUBJECT_TO_ID)

# `map` leaves NaN where a name has no entry in SUBJECT_TO_ID; fail loudly
# instead of handing a partly-encoded column to the graph construction step.
unknown_subjects = content.loc[subject_codes.isna(), 1].unique()
if len(unknown_subjects):
    raise ValueError(f"Subjects missing from SUBJECT_TO_ID: {list(unknown_subjects)}")

# One row per paper, using the column names `gds.graph.construct` looks for:
#   nodeId   - paper id (column 0 of `content`)
#   labels   - constant node label "Paper"
#   subject  - integer subject code
#   features - the word-flag columns (column 2 onwards) collected into a list
nodes = pd.DataFrame().assign(
    nodeId=content[0],
    labels="Paper",
    subject=subject_codes.astype("int64"),
    features=content.iloc[:, 2:].apply(list, axis=1),
)

In [34]:
# Check the node frame: one row per paper with nodeId, labels, subject and
# features columns.
nodes.head()

Unnamed: 0,nodeId,labels,subject,features
0,31336,Paper,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1061127,Paper,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
2,1106406,Paper,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,13195,Paper,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,37879,Paper,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [35]:

# Citation edges exactly as listed in `cites` (column 0 -> column 1).
dir_relationships = pd.DataFrame(
    {
        "sourceNodeId": cites[0],
        "targetNodeId": cites[1],
        "relationshipType": "CITES",
    }
)
# The same edges with source and target swapped.
inv_relationships = pd.DataFrame(
    {
        "sourceNodeId": cites[1],
        "targetNodeId": cites[0],
        "relationshipType": "CITES",
    }
)

# Stack both directions and keep each (source, target, type) row only once.
both_directions = pd.concat([dir_relationships, inv_relationships])
relationships = both_directions.drop_duplicates()

In [36]:
# Preview the edge frame: sourceNodeId, targetNodeId and relationshipType.
relationships.head()

Unnamed: 0,sourceNodeId,targetNodeId,relationshipType
0,35,1033,CITES
1,35,103482,CITES
2,35,103515,CITES
3,35,1050679,CITES
4,35,1103960,CITES


In [37]:
# Create the GDS graph named "cora-graph" from the node and relationship
# DataFrames; `G` is the handle used by the cells that follow.
G = gds.graph.construct("cora-graph", nodes, relationships)

In [38]:
# List the graphs known to GDS to confirm that cora-graph was created.
gds.graph.list()

Unnamed: 0,degreeDistribution,graphName,database,databaseLocation,memoryUsage,sizeInBytes,nodeCount,relationshipCount,configuration,density,creationTime,modificationTime,schema,schemaWithOrientation
0,"{'min': 1, 'max': 168, 'p90': 7, 'p999': 74, '...",cora-graph,neo4j,local,34 MiB,35685000,2708,10556,"{'readConcurrency': 4, 'undirectedRelationship...",0.00144,2024-04-19T15:54:02.832681756+00:00,2024-04-19T15:54:02.932935342+00:00,"{'graphProperties': {}, 'nodes': {'Paper': {'s...","{'graphProperties': {}, 'nodes': {'Paper': {'s..."


In [39]:
# Node count of the constructed graph, to compare with the row count of `content`.
G.node_count()

2708

In [40]:

# Number of rows (papers) in the source table, for comparison with the graph's
# node count.
len(content)

2708

In [41]:
# Stream the `subject` node property back from the graph and show the first
# ten rows.
gds.graph.nodeProperties.stream(G, ["subject"]).head(10)

Unnamed: 0,nodeId,nodeProperty,propertyValue,nodeLabels
0,31336,subject,0,[]
1,1061127,subject,1,[]
2,1106406,subject,2,[]
3,13195,subject,2,[]
4,37879,subject,3,[]
5,1126012,subject,3,[]
6,1107140,subject,4,[]
7,1102850,subject,0,[]
8,31349,subject,0,[]
9,1106418,subject,4,[]


In [42]:
# Create an ephemeral database named 'gdsdb' from the cora-graph graph.
gds.run_cypher("""CALL gds.ephemeral.database.create('gdsdb', 'cora-graph')""")

Unnamed: 0,dbName,graphName,createMillis
0,gdsdb,cora-graph,106


In [43]:
# Drop the ephemeral 'gdsdb' database again.
gds.run_cypher("""CALL gds.ephemeral.database.drop('gdsdb')""")

Unnamed: 0,dbName,dropMillis
0,gdsdb,253


In [44]:
# Drop the cora-graph graph; the procedure returns the details of the graph
# that was removed.
gds.run_cypher("""call gds.graph.drop("cora-graph")""")

Unnamed: 0,graphName,database,databaseLocation,memoryUsage,sizeInBytes,nodeCount,relationshipCount,configuration,density,creationTime,modificationTime,schema,schemaWithOrientation
0,cora-graph,neo4j,local,,-1,2708,10556,"{'readConcurrency': 4, 'undirectedRelationship...",0.00144,2024-04-19T15:54:02.832681756+00:00,2024-04-19T15:54:02.932935342+00:00,"{'graphProperties': {}, 'nodes': {'Paper': {'s...","{'graphProperties': {}, 'nodes': {'Paper': {'s..."
