# Module objectives
- Creating a graph from structured data input
- Basic graph algorithms
- Text embeddings for semantic analysis
- Feature engineering
- Node embeddings


In [None]:
!pip install graphdatascience neo4j dotenv

# Setup

Import our usual suspects

In [None]:
import os
import pandas as pd
from dotenv import load_dotenv
from graphdatascience import GraphDataScience
from neo4j import Query, GraphDatabase, RoutingControl, Result

Load env variables

In [None]:
# Load settings from the workshop env file. override=True lets values in ws.env
# replace variables that are already set in the process environment.
load_dotenv('ws.env', override=True)
# Neo4j
HOST = os.getenv('HOST')
USERNAME = os.getenv('USERNAME')
PASSWORD = os.getenv('PASSWORD')
DATABASE = os.getenv('DATABASE')

# AI
LLM = os.getenv('LLM')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# os.environ only accepts strings: assigning None raises TypeError, which made
# this cell fail for anyone without an OpenAI key. Export the key only when set.
if OPENAI_API_KEY is not None:
    os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

Connect to neo4j db

In [None]:
# Create the Neo4j driver from the connection settings read from the environment
driver = GraphDatabase.driver(
    HOST,
    auth=(USERNAME, PASSWORD)
)
# Check right away that the server/database can be reached with these settings
driver.verify_connectivity(database=DATABASE)

# Graph creation

In [None]:
## Utility - not needed for this small dataset, but as best practice example
def split_dataframe(df, chunk_size = 50_000):
    """Split ``df`` into consecutive row-wise chunks of at most ``chunk_size`` rows.

    Args:
        df: DataFrame to split; rows are taken by position, in order.
        chunk_size: Maximum number of rows per chunk. Must be positive.

    Returns:
        A list of DataFrame slices that together cover every row exactly once.
        No empty chunk is produced: an empty ``df`` gives an empty list, and a
        length that is an exact multiple of ``chunk_size`` gives exactly
        ``len(df) / chunk_size`` chunks.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # Stepping by chunk_size avoids the trailing empty chunk that
    # `len(df) // chunk_size + 1` produced for exact multiples.
    return [
        df.iloc[start:start + chunk_size]
        for start in range(0, len(df), chunk_size)
    ]

In [None]:
# Load synthetic Skills dataset
# The CSV is read straight from GitHub, so this cell needs network access.
url = "https://raw.githubusercontent.com/Kristof-Neys/Neo4j_demos/main/expanded_skills.csv"
skills_csv = pd.read_csv(url)

# Display the first 30 rows of the DataFrame
skills_csv.head(30)

In [None]:
# Convert skills column from comma separated string to List.
# Only string cells are split, so re-running this cell leaves already-converted
# lists untouched (`.str.split` on a column of lists would turn every row into NaN).
skills_csv['skills'] = skills_csv['skills'].apply(
    lambda skills: skills.split(', ') if isinstance(skills, str) else skills
)
skills_csv.head()

### Schema

In [None]:
# One node key constraint per label: Person is keyed by email, Skill by name.
schema_statements = [
    'create constraint if not exists for (n:Person) require (n.email) is node key',
    'create constraint if not exists for (n:Skill) require (n.name) is node key',
]
for statement in schema_statements:
    driver.execute_query(statement, routing_=RoutingControl.WRITE, database_=DATABASE)

# List every constraint now present in the database as a DataFrame
schema_result_df = driver.execute_query(
    'show constraints',
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=DATABASE,
)
schema_result_df.head(100)


In [None]:
# Create a graph for (:Person)-[:KNOWS]->(:Skill)
# The DataFrame is sent chunk by chunk; each chunk becomes the $rows parameter
# of one write query. Per row the query merges the Person by email, sets its
# name, then merges every Skill in row.skills and the KNOWS relationship to it.
for chunk in split_dataframe(skills_csv):
    records, summary, keys = driver.execute_query(
        '''
            unwind $rows as row
            merge (p:Person{email:row.email})
            set p.name = row.name
            with p, row
            foreach(skill in row.skills | merge (s:Skill{name:skill}) merge (p)-[:KNOWS]->(s) )
            return count(*) as rows_processed
        ''',
        database_=DATABASE,
        routing_=RoutingControl.WRITE,
        rows = chunk.to_dict('records')
    )

# Basic navigation of graph with cypher

In [None]:
# List the name of every Person node (only the first rows are displayed)
driver.execute_query(
    '''
    match (p:Person)
    return p.name as person_name
    ''',
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=DATABASE,
).head()

In [None]:
# Collect, per person, the names of the skills reached through KNOWS
driver.execute_query(
    '''
    match (p:Person)-[:KNOWS]->(s:Skill)
    return p.email as email, p.name as person_name,collect(s.name) as skills
    ''',
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=DATABASE,
).head()

In [None]:
# Rank skills by the number of distinct persons who know them.
# The query keeps the top 10; .head() then displays the first rows of that result.
driver.execute_query(
    '''
    match (p:Person)-[:KNOWS]->(s:Skill)
    return s.name, count(distinct p) as knownByCount order by knownByCount desc limit 10
    ''',
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=DATABASE,
).head()

# Node similarity

Let's investigate Persons that are similar in the graph (based on the skills they share)

In [None]:
# Build the GDS client on top of the existing driver and point it at the workshop database
gds = GraphDataScience.from_neo4j_driver(driver=driver)
gds.set_database(DATABASE)
# Last expression of the cell: displays the GDS version reported by the client
gds.version()

In [None]:
# Project Person and Skill nodes plus the KNOWS relationships into the GDS graph catalogue.
# G is the handle to the projected graph, res the projection summary.
G, res = gds.graph.project(
    "person_skills_projection",  # Graph name
    ["Person", "Skill"],         #  Node projection
    ["KNOWS"]                    #  Relationship projection
)

In [None]:
# Display the summary returned by gds.graph.project
res


Documentation https://neo4j.com/docs/graph-data-science/current/algorithms/node-similarity/

In [None]:
# Stream node similarity on the projection: results come back to the notebook,
# using the OVERLAP metric and keeping at most 3 results per node (topK=3).
gds.nodeSimilarity.stream(
    G,
    similarityMetric='OVERLAP',
    topK=3
)

In [None]:
# Same configuration as the streamed run, but results are written to the database
# as SIMILAR_SKILLSET relationships carrying the score in `sim_score`.
gds.nodeSimilarity.write(
    G,
    similarityMetric='OVERLAP',
    topK=3,
    writeRelationshipType='SIMILAR_SKILLSET',
    writeProperty='sim_score'
)

# Remove symmetric relationships
# When both a->b and b->a exist, delete the one that starts at the node with
# the lower internal id, so a single relationship per pair remains.
gds.run_cypher('''
  match (a:Person)-[r:SIMILAR_SKILLSET]->(b:Person)
    where exists { (b)-[:SIMILAR_SKILLSET]->(a) }
    and   id(a)<id(b)
  delete r
''')


In [None]:
# Drop the projection from the graph catalogue to free up resources
# (the SIMILAR_SKILLSET relationships written to the database are not affected by this call)
G.drop()

Take a minute to explore the SIMILAR_SKILLSET network

# Semantically similar skills

In [None]:
!pip install langchain langchain_openai

In [None]:
# Import langchain open ai
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Embedding model client; it is only called from the commented-out,
# instructor-only embedding cell further down.
# NOTE(review): presumably this needs OPENAI_API_KEY in the environment - confirm.
embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')


In [None]:
# Fetch the name of every Skill node into a DataFrame with a single `skill` column
skills_df = gds.run_cypher('''
  match (s:Skill)
  return s.name as skill
''')
# Display up to the first 35 skills
skills_df.head(35)

## STOP STOP STOP - DO NOT PROCEED (YET)

In [None]:
# only to be run by instructor (or if you have your own api key)
# to-do: Don't do row-by-row, use bulk.

# skills_df['embedding'] = skills_df['skill'].apply( lambda skill: embeddings.embed_documents([skill])[0])
# skills_df.head()

In [None]:
# gds.run_cypher('''
#     unwind $data as row
#     match (s:Skill{name: row.skill})
#     set s.embedding = row.embedding
#     ''',
#     params = { 'data': skills_df.to_dict(orient='records') }
# )

In [None]:
# Todo: Kristof, can you update this so it also has embeddings from text-embedding-ada-002
# Pre-computed skill embeddings. Each Embedding cell is parsed from text of the
# form "[v1, v2, ...]" into a list of floats.
skills_df = pd.read_csv('https://raw.githubusercontent.com/Kristof-Neys/Neo4j_demos/refs/heads/main/skills_embeddings.csv')
skills_df['Embedding'] = skills_df['Embedding'].apply(
    lambda raw: list(map(float, raw.strip("[]").split(", ")))
)
skills_df.head()

In [None]:
# Add embeddings to Skill nodes in database
# Each DataFrame row is matched to the Skill whose name equals row.Name;
# rows without a matching Skill node change nothing (match, not merge).
gds.run_cypher('''
    unwind $data as row
    match (s:Skill{name: row.Name})
    set s.embedding = row.Embedding
    ''',
    params = { 'data': skills_df.to_dict(orient='records') }
)

# Let's use the semantic meaning to find similarities...

In [None]:
# Project only the Skill nodes, together with their `embedding` node property
G, res = gds.graph.project(
    'skill_embedding_projection',
    {
        'Skill': {"properties": 'embedding'},
    },
    ['KNOWS']    # No rels will be projected, but we need to specify something here :)
)
# Display the projection summary
res

***Running K-nearest Neighbours to find semantic similarities...*** [K Nearest Neighbours](https://neo4j.com/docs/graph-data-science/current/algorithms/knn/)

In [None]:
# Run knn
# Compares Skill nodes on their `embedding` property and writes, per node, up to
# 3 SIMILAR_SEMANTIC relationships carrying the score in `sim_score`.
gds.knn.write(
    G,
    nodeLabels=['Skill'],
    nodeProperties=['embedding'],
    topK=3,
    writeRelationshipType='SIMILAR_SEMANTIC',
    writeProperty='sim_score'
)

# Remove symmetric relationships
# Nodes cannot be ordered with `<` in Cypher (the comparison yields null), so a
# bare `a<b` predicate never matches and nothing gets deleted. Compare the
# internal ids instead, exactly like the SIMILAR_SKILLSET clean-up does.
gds.run_cypher('''
  match (a:Skill)-[r:SIMILAR_SEMANTIC]->(b:Skill)
    where exists { (b)-[:SIMILAR_SEMANTIC]->(a) }
    and   id(a)<id(b)
  delete r
''')

In [None]:
# Let's review
# The pattern is undirected, so every SIMILAR_SEMANTIC relationship shows up
# once from each of its two end nodes. Sorted by skill name, best score first.
gds.run_cypher('''
  MATCH (s:Skill)-[r:SIMILAR_SEMANTIC]-(s2)
  RETURN s.name as skill,
         r.sim_score as score,
         s2.name as to_skill
  ORDER by skill asc, score desc
''').head(30)

In [None]:
# Drop the projection from the graph catalogue to free up resources
# (G is reassigned by the next projection cell)
G.drop()

# *Let's do some "Graph Feature Engineering" - learn from our connected data...*

In [None]:
# Project the Skill nodes with their SIMILAR_SEMANTIC relationships,
# keeping each relationship in its stored direction (orientation NATURAL).
G, res = gds.graph.project(
    'skill_BetW_projection',
    ['Skill'],
    {'SIMILAR_SEMANTIC':{'orientation': 'NATURAL'}}
)

In [None]:
# Run betweenness centrality on the projection and write each node's score
# back to the database as the `betweenness` property.
BetWresult = gds.betweenness.write(
    G,
    writeProperty='betweenness'
)
# Display the summary returned by the write call
BetWresult


In [None]:
# Drop the betweenness projection from the graph catalogue; the scores were already written to the nodes
G.drop()

In [None]:
# Read back the betweenness score of every Skill, highest first
bridge_skill = gds.run_cypher('''
    MATCH (s:Skill)
    RETURN s.name as skill, s.betweenness AS betweenness
    ORDER BY betweenness DESC
''')

# Show the 10 skills with the highest betweenness
bridge_skill.head(10)

# *Wait! - and I can vectorise my Graph as well....?*

In [None]:
# Project the Person nodes with their SIMILAR_SKILLSET relationships, treated as undirected
G, res = gds.graph.project(
    "Person_projection",  # Graph name
    ["Person"],         #  Node projection
    {'SIMILAR_SKILLSET': {'orientation': 'UNDIRECTED'}}                    #  Relationship projection
)


***Running a node embedding in a few lines...***

In [None]:
# Compute 128-dimensional FastRP node embeddings and write them to the database
# as the `fastRP_Embedding` node property. randomSeed is fixed for repeatable results.
fastrp_res =  gds.fastRP.write(G,
    embeddingDimension = 128,
    iterationWeights = [0, 0, 1.0, 1.0],
    normalizationStrength = 0.05,
    writeProperty = "fastRP_Embedding",
    randomSeed = 42
)

In [None]:
# Same FastRP configuration as the write call, but in mutate mode: the embedding
# is stored on the in-memory projection G under `fastRP_Embedding`, which is the
# property the K-Means cell reads from G.
fastrp_res =  gds.fastRP.mutate(G,
    embeddingDimension = 128,
    iterationWeights = [0, 0, 1.0, 1.0],
    normalizationStrength = 0.05,
    mutateProperty = "fastRP_Embedding",
    randomSeed = 42
)

*Finding clusters based on their structural and specific attributes...*

In [None]:
# Cluster the projected nodes into k=5 groups on their FastRP embedding and
# write each node's cluster id to the database as `kmeans5_cluster`.
kmeans_result = gds.kmeans.write(
    G,
    nodeProperty='fastRP_Embedding',
    k=5,
    writeProperty='kmeans5_cluster',
    randomSeed=42,
    maxIterations=100
)

In [None]:
# Drop the Person projection from the graph catalogue; embeddings and cluster ids were already written to the nodes
G.drop()

In [None]:
# Group every node that carries a kmeans5_cluster value by its cluster id
# and collect the member names per cluster.
skill_teams = gds.run_cypher('''
    MATCH (n) WHERE (n.kmeans5_cluster) IS NOT NULL
    RETURN n.kmeans5_cluster AS Team, collect(n.name) AS Team_members
''')

# Print up to 6 rows of the result
print("Teams of expertise: \n", skill_teams.head(6))