# Setup

Import our usual suspects

In [66]:
import os
import pandas as pd
from dotenv import load_dotenv
from graphdatascience import GraphDataScience
from neo4j import Query, GraphDatabase, RoutingControl, Result 

Register for a sandbox and create an empty sandbox  https://sandbox.neo4j.com

In [67]:
# Capture connection string and auth info
connectionUrl = 'neo4j://localhost:7687'
username = 'neo4j'
password = 'test1234'
database = 'upm'

In [68]:
driver = GraphDatabase.driver(
    connectionUrl, 
    auth=(username, password)
)
driver.verify_connectivity()

In [69]:
## Utility
def split_dataframe(df, chunk_size = 50_000): 
    chunks = list()
    num_chunks = len(df) // chunk_size + 1
    for i in range(num_chunks):
        chunks.append(df[i*chunk_size:(i+1)*chunk_size])
    return chunks

# Graph creation

In [70]:
skills_csv = pd.read_csv('skills.csv', delimiter='\t')
skills_csv.head(30)

Unnamed: 0,email,name,skills
0,john.smith@test.org,John Smith,"Python, SQL, Data Analysis"
1,alice.johnson@test.org,Alice Johnson,"Java, JavaScript, ReactJS"
2,michael.lee@test.org,Michael Lee,"C++, Machine Learning, TensorFlow"
3,emily.chen@test.org,Emily Chen,"HTML/CSS, WordPress, Graphic Design"
4,david.nguyen@test.org,David Nguyen,"Ruby on Rails, DevOps, AWS"
5,sarah.brown@test.org,Sarah Brown,"PHP, MySQL, Laravel"
6,alex.wang@test.org,Alex Wang,"Swift, iOS Development, UX/UI Design"
7,rachel.kim@test.org,Rachel Kim,"JavaScript, Node.js, MongoDB"
8,kevin.patel@test.org,Kevin Patel,"Java, Spring Boot, Microservices"
9,laura.garcia@test.org,Laura Garcia,"Python, Django, Flask"


In [71]:
skills_csv['skills'] = skills_csv['skills'].str.split(', ')
skills_csv.head()

Unnamed: 0,email,name,skills
0,john.smith@test.org,John Smith,"[Python, SQL, Data Analysis]"
1,alice.johnson@test.org,Alice Johnson,"[Java, JavaScript, ReactJS]"
2,michael.lee@test.org,Michael Lee,"[C++, Machine Learning, TensorFlow]"
3,emily.chen@test.org,Emily Chen,"[HTML/CSS, WordPress, Graphic Design]"
4,david.nguyen@test.org,David Nguyen,"[Ruby on Rails, DevOps, AWS]"


### Schema

In [72]:
schema_statements = [
    'create constraint if not exists for (n:Person) require (n.email) is node key',
    'create constraint if not exists for (n:Skill) require (n.name) is node key',
]
for statement in schema_statements:
    driver.execute_query(
        statement,
        database_=database,
        routing_=RoutingControl.WRITE
    )

# Fetch all constraints
schema_result_df  = driver.execute_query(
    'show constraints',
    database_=database,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
)
schema_result_df.head(100)


Unnamed: 0,id,name,type,entityType,labelsOrTypes,properties,ownedIndex,propertyType
0,7,constraint_63bf11a1,NODE_KEY,NODE,[Skill],[name],constraint_63bf11a1,
1,5,constraint_d3bfd313,NODE_KEY,NODE,[Person],[email],constraint_d3bfd313,
2,3,constraint_f7832722,UNIQUENESS,NODE,[_Bloom_Perspective_],[id],constraint_f7832722,


In [73]:
# Create a graph for (:Person)-[:KNOWS]->(:Skill)
for chunk in split_dataframe(skills_csv):
    records, summary, keys = driver.execute_query(
        ''' 
            unwind $rows as row
            merge (p:Person{email:row.email})
            set p.name = row.name
            with p, row
            foreach(skill in row.skills | merge (s:Skill{name:skill}) merge (p)-[:KNOWS]->(s) )
            return count(*) as rows_processed
        ''',
        database_=database,
        routing_=RoutingControl.WRITE,
        rows = chunk.to_dict('records')
    )

### Create a normalized skill name and index it

In [74]:
with driver.session(database=database) as session:
    session.run(
        ''' 
            match (s:Skill)
            call (s) { 
                set s.norm_name = toLower(s.name)
            } in transactions of 50_000 rows
        '''
    ).consume()
    session.close()

In [75]:
driver.execute_query(
        ''' 
            create text index if not exists for (n:Skill) on (n.norm_name)
        ''',
        database_=database,
        routing_=RoutingControl.WRITE,
    )

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x302ed5f70>, keys=[])

# Basic navigation of graph with cypher

In [76]:
# What persons are in the database?
driver.execute_query(
    ''' 
    match (p:Person)
    return p.name as person_name
    ''',
    database_=database,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
).head()

Unnamed: 0,person_name
0,John Smith
1,Alice Johnson
2,Michael Lee
3,Emily Chen
4,David Nguyen


In [78]:
# What skills does each person know?
driver.execute_query(
    ''' 
    match (p:Person)-[:KNOWS]->(s:Skill)
    return p.email as email, p.name as person_name,collect(s.name) as skills
    ''',
    database_=database,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
).head()

Unnamed: 0,email,person_name,skills
0,alex.wang@test.org,Alex Wang,"[Swift, iOS Development, UX/UI Design]"
1,alice.johnson@test.org,Alice Johnson,"[Java, JavaScript, ReactJS]"
2,david.nguyen@test.org,David Nguyen,"[Ruby on Rails, DevOps, AWS]"
3,emily.chen@test.org,Emily Chen,"[HTML/CSS, WordPress, Graphic Design]"
4,john.smith@test.org,John Smith,"[Python, SQL, Data Analysis]"


In [79]:
# What are the most frequent skills?
driver.execute_query(
    ''' 
    match (p:Person)-[:KNOWS]->(s:Skill)
    return s.name, count(distinct p) as knownByCount order by knownByCount desc limit 10
    ''',
    database_=database,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
).head()

Unnamed: 0,s.name,knownByCount
0,Java,2
1,Python,2
2,JavaScript,2
3,Data Analysis,1
4,ReactJS,1


# Node similarity

Let's get this party started

In [80]:
gds = GraphDataScience(connectionUrl, auth=(username, password))
gds.set_database(database)
gds.version()



'2.13.1'

In [81]:
G, res = gds.graph.project(
    "person_skills_projection",  # Graph name
    ["Person", "Skill"],         #  Node projection
    ["KNOWS"]                    #  Relationship projection
)


In [82]:
res


nodeProjection            {'Skill': {'label': 'Skill', 'properties': {}}...
relationshipProjection    {'KNOWS': {'aggregation': 'DEFAULT', 'orientat...
graphName                                          person_skills_projection
nodeCount                                                                37
relationshipCount                                                        30
projectMillis                                                             8
Name: 0, dtype: object

Documentation https://neo4j.com/docs/graph-data-science/current/algorithms/node-similarity/

In [83]:
gds.nodeSimilarity.stream(
    G,
    similarityMetric='OVERLAP',
    topK=3
)

Unnamed: 0,node1,node2,similarity
0,0,9,0.333333
1,1,8,0.333333
2,1,7,0.333333
3,7,1,0.333333
4,8,1,0.333333
5,9,0,0.333333


In [84]:
gds.nodeSimilarity.write(
    G,
    similarityMetric='OVERLAP',
    topK=3,
    writeRelationshipType='SIMILAR_SKILLSET',
    writeProperty='sim_score'
)

preProcessingMillis                                                       0
computeMillis                                                             3
writeMillis                                                              16
postProcessingMillis                                                      0
nodesCompared                                                            10
relationshipsWritten                                                      6
similarityDistribution    {'min': 0.3333320617675781, 'p5': 0.3333320617...
configuration             {'writeProperty': 'sim_score', 'writeRelations...
Name: 0, dtype: object

In [85]:
# Drop the projection from the graph catalogue to free up resources
G.drop()



graphName                                         person_skills_projection
database                                                               upm
databaseLocation                                                     local
memoryUsage                                                               
sizeInBytes                                                             -1
nodeCount                                                               37
relationshipCount                                                       30
configuration            {'relationshipProjection': {'KNOWS': {'aggrega...
density                                                           0.022523
creationTime                           2025-02-11T10:30:41.797893000+01:00
modificationTime                       2025-02-11T10:30:41.797893000+01:00
schema                   {'graphProperties': {}, 'nodes': {'Skill': {},...
schemaWithOrientation    {'graphProperties': {}, 'nodes': {'Skill': {},...
Name: 0, dtype: object

In [86]:
# Remove symetric relationships
gds.run_cypher('''
  match (a:Person)-[r:SIMILAR_SKILLSET]->(b:Person) 
    where exists { (b)-[:SIMILAR_SKILLSET]->(a) }
    and   id(a)<id(b)
  delete r
''')




# Semantic Similar skill

In [87]:
# Import langchain open ai
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [88]:
skills_df = gds.run_cypher('''
  match (s:Skill)
  return s.name as skill
''')
skills_df.head(30)

Unnamed: 0,skill
0,AWS
1,C++
2,Data Analysis
3,DevOps
4,Django
5,Flask
6,Graphic Design
7,HTML/CSS
8,Java
9,JavaScript


In [89]:
load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
embeddings = OpenAIEmbeddings(
    deployment="your-embeddings-deployment-name",
    model='text-embedding-3-small', # 1536
)

In [90]:
skills_df['embedding'] = skills_df['skill'].apply( lambda skill: embeddings.embed_documents([skill])[0])

In [91]:
skills_df.head()

Unnamed: 0,skill,embedding
0,AWS,"[0.01564984514650839, -0.04042042616952391, 0...."
1,C++,"[0.01211648340819335, 0.017389811568969103, 0...."
2,Data Analysis,"[0.0039946269378510505, 0.014328012291175678, ..."
3,DevOps,"[0.0006287396891707879, 0.031127516921187134, ..."
4,Django,"[-0.003611742476297006, 0.006400906458475529, ..."


In [92]:
# Add embeddings to Skill nodes in database
gds.run_cypher('''
    unwind $data as row
    match (s:Skill{name: row.skill})
    set s.embedding = row.embedding                                                                             
''',
params = { 'data': skills_df.to_dict(orient='records') })

In [93]:
G, res = gds.graph.project(
    'skill_embedding_projection', 
    {
        'Skill': {"properties": 'embedding'},
    },        
    ['KNOWS']    # No rels will be projected, but we need to specify something here :)             
)

In [94]:
res

nodeProjection            {'Skill': {'label': 'Skill', 'properties': {'e...
relationshipProjection    {'KNOWS': {'aggregation': 'DEFAULT', 'orientat...
graphName                                        skill_embedding_projection
nodeCount                                                                27
relationshipCount                                                         0
projectMillis                                                            19
Name: 0, dtype: object

In [95]:
# Run knn
gds.knn.write(
    G,
    nodeLabels=['Skill'],
    nodeProperties=['embedding'],
    topK=3,
    writeRelationshipType='SIMILAR_EMBEDDING',
    writeProperty='sim_score'
)

ranIterations                                                             4
didConverge                                                            True
nodePairsConsidered                                                    1946
preProcessingMillis                                                       0
computeMillis                                                             6
writeMillis                                                              16
postProcessingMillis                                                      0
nodesCompared                                                            27
relationshipsWritten                                                     81
similarityDistribution    {'min': 0.6443901062011719, 'p5': 0.6608734130...
configuration             {'writeProperty': 'sim_score', 'writeRelations...
Name: 0, dtype: object

In [96]:
# Drop the projection from the graph catalogue to free up resources
G.drop()



graphName                                       skill_embedding_projection
database                                                               upm
databaseLocation                                                     local
memoryUsage                                                               
sizeInBytes                                                             -1
nodeCount                                                               27
relationshipCount                                                        0
configuration            {'relationshipProjection': {'KNOWS': {'aggrega...
density                                                                0.0
creationTime                           2025-02-11T10:41:09.359296000+01:00
modificationTime                       2025-02-11T10:41:09.359296000+01:00
schema                   {'graphProperties': {}, 'nodes': {'Skill': {'e...
schemaWithOrientation    {'graphProperties': {}, 'nodes': {'Skill': {'e...
Name: 0, dtype: object

In [97]:
# Again, let's remove symetric relationships
gds.run_cypher('''
  match (a:Skill)-[r:SIMILAR_EMBEDDING]->(b:Skill) 
    where exists { (b)-[:SIMILAR_EMBEDDING]->(a) }
    and   a<b
  delete r
''')

In [None]:
# Let's review
gds.run_cypher('''
  MATCH (s:Skill)-[r:SIMILAR_EMBEDDING]-(s2)
  RETURN s.name as skill, 
         r.sim_score as score,
         s2.name as to_skill
  ORDER by skill asc, score desc
''').head(30)

# Cheat sheet

In [None]:
# Find preson with skill, order by number of matched skills
gds.run_cypher('''
    unwind ['node.js','mongodb','JavaScript'] as wanted_skill
    match (s:Skill) where s.norm_name contains toLower(wanted_skill)
    with s
    match (s)<-[:KNOWS]-(p:Person)
    with p, collect(s) as matched_skills
    return p.name as person, [(p)-[:KNOWS]->(skill) | skill.name] as skills, size(matched_skills) as no_matched_skills
    order by no_matched_skills desc limit 10
''').head(30)

In [98]:
# Find preson with skill or similar skill, order by number of matched skills
gds.run_cypher('''
    unwind ['node.js','mongodb','JavaScript'] as wanted_skill
    match (s:Skill) where s.norm_name contains toLower(wanted_skill)
    with collect(s) as wanted_skills
    with wanted_skills
    unwind wanted_skills as s
    match (s)-[:KNOWS|SIMILAR_EMBEDDING*1..2]-(p:Person)
    with p, collect(distinct s) as matched_skills
    return p.name as person, [(p)-[:KNOWS]->(skill) | skill.name] as skills, size(matched_skills) as no_matched_skills, [x in matched_skills | x.name] as matched_skills
    order by no_matched_skills desc limit 10
''').head(30)

Unnamed: 0,person,skills,no_matched_skills,matched_skills
0,Rachel Kim,"[JavaScript, Node.js, MongoDB]",3,"[Node.js, MongoDB, JavaScript]"
1,Alice Johnson,"[Java, JavaScript, ReactJS]",2,"[Node.js, JavaScript]"
2,Sarah Brown,"[PHP, MySQL, Laravel]",2,"[MongoDB, JavaScript]"
3,John Smith,"[Python, SQL, Data Analysis]",1,[JavaScript]
4,Laura Garcia,"[Python, Django, Flask]",1,[JavaScript]
5,Kevin Patel,"[Java, Spring Boot, Microservices]",1,[JavaScript]
6,Michael Lee,"[C++, Machine Learning, TensorFlow]",1,[JavaScript]
7,Emily Chen,"[HTML/CSS, WordPress, Graphic Design]",1,[JavaScript]
8,Alex Wang,"[Swift, iOS Development, UX/UI Design]",1,[JavaScript]
