# Setup

Import our usual suspects

In [None]:
import os
import pandas as pd
from graphdatascience import GraphDataScience
from neo4j import Query, GraphDatabase, RoutingControl, Result 

Register for a Neo4j sandbox at https://sandbox.neo4j.com and create an empty sandbox, then enter its connection details in the cell below.

In [None]:
# Capture connection string and auth info.
# Each value can be overridden through an environment variable so that real
# credentials (e.g. those of a hosted sandbox) never have to be typed into the
# notebook; the fallbacks are the local-development defaults.
connectionUrl = os.environ.get('NEO4J_URI', 'neo4j://localhost:7687')
username = os.environ.get('NEO4J_USERNAME', 'neo4j')
password = os.environ.get('NEO4J_PASSWORD', 'test1234')
database = os.environ.get('NEO4J_DATABASE', 'skills')

In [None]:
# Build the driver from the connection settings, then check straight away
# that the server can actually be reached with them.
driver = GraphDatabase.driver(connectionUrl, auth=(username, password))
driver.verify_connectivity()

In [None]:
## Utility
def split_dataframe(df, chunk_size=50_000):
    """Split ``df`` into consecutive row-wise chunks of at most ``chunk_size`` rows.

    Args:
        df: The DataFrame (or any sliceable object supporting ``len``) to split.
        chunk_size: Maximum number of rows per chunk; must be positive.

    Returns:
        A list of slices of ``df`` in original row order. Every chunk is
        non-empty: an empty ``df`` yields an empty list, and a length that is
        an exact multiple of ``chunk_size`` produces no trailing empty chunk.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")
    # Stepping the start offset by chunk_size stops exactly at the last row,
    # so no empty remainder chunk is ever generated.
    return [
        df[start:start + chunk_size]
        for start in range(0, len(df), chunk_size)
    ]

# Graph creation

In [None]:
# Load skills.csv (tab-separated) and preview the first rows
skills_csv = pd.read_csv('skills.csv', sep='\t')
skills_csv.head()

In [None]:
# Turn the comma-separated 'skills' text into a list of skill names per row.
# Only string cells are split: cells that already hold a list (this cell was
# run before) and missing values are left untouched, so re-running is safe.
# With the .str accessor a second run would replace every list with NaN.
skills_csv['skills'] = skills_csv['skills'].apply(
    lambda cell: cell.split(', ') if isinstance(cell, str) else cell
)
skills_csv.head()

### Schema

In [None]:
# Node-key constraints on Person.email and Skill.name. "if not exists" lets
# the statements be executed again without failing on an existing constraint.
schema_statements = [
    "create constraint if not exists for (n:Person) require (n.email) is node key",
    "create constraint if not exists for (n:Skill) require (n.name) is node key",
]
for statement in schema_statements:
    driver.execute_query(statement, routing_=RoutingControl.WRITE, database_=database)

# List the constraints now present in the database as a DataFrame
schema_result_df = driver.execute_query(
    "show constraints",
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=database,
)
schema_result_df.head(100)


In [None]:
# Load the CSV rows as (:Person)-[:KNOWS]->(:Skill), one query per batch.
# For every row the query merges the Person by email, sets its name, then
# merges each listed Skill and the KNOWS relationship pointing to it.
for batch in split_dataframe(skills_csv):
    records, summary, keys = driver.execute_query(
        """ 
            unwind $rows as row
            merge (p:Person{email:row.email})
            set p.name = row.name
            with p, row
            foreach(skill in row.skills | merge (s:Skill{name:skill}) merge (p)-[:KNOWS]->(s) )
            return count(*) as rows_processed
        """,
        rows=batch.to_dict(orient='records'),
        routing_=RoutingControl.WRITE,
        database_=database,
    )

# Basic navigation of graph with cypher

In [None]:
# Which persons are in the database? (name of every Person node, first rows shown)
driver.execute_query(
    """ 
    match (p:Person)
    return p.name as person_name
    """,
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=database,
).head()

In [None]:
# Which skills does each person know? (skill names collected into one list per person name)
driver.execute_query(
    """ 
    match (p:Person)-[:KNOWS]->(s:Skill)
    return p.name as person_name,collect(s.name) as skills
    """,
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=database,
).head()

In [None]:
# Which skills are the most frequent? (distinct persons per skill, highest first)
# NOTE(review): the query returns up to 10 rows, but .head() displays only the first 5.
driver.execute_query(
    """ 
    match (p:Person)-[:KNOWS]->(s:Skill)
    return s.name, count(distinct p) as knownByCount order by knownByCount desc limit 10
    """,
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=database,
).head()

# Node similarity

Let's get this party started

In [None]:
# Create the Graph Data Science client with the same URL and credentials as the driver
gds = GraphDataScience(connectionUrl, auth=(username, password))
# Point the client at the database configured above
gds.set_database(database)
# Display the version reported by the client
gds.version()

In [None]:
# Project Person and Skill nodes with their KNOWS relationships into the GDS
# graph catalogue. G is the handle passed to the algorithm calls that follow;
# res holds the result of the projection call.
G, res = gds.graph.project(
    "person_skills_projection",  # Graph name
    ["Person", "Skill"],         # Node projection
    ["KNOWS"]                    # Relationship projection
)


In [None]:
# Display the result returned by the person/skill projection call
res


Node Similarity documentation: https://neo4j.com/docs/graph-data-science/current/algorithms/node-similarity/

In [None]:
# Stream node-similarity results for the projection back to the notebook
# (the following cell writes them to the database instead).
gds.nodeSimilarity.stream(
    G,
    similarityMetric='OVERLAP',  # compare the nodes' neighbour sets with the overlap metric
    topK=3                       # keep at most 3 results per node
)

In [None]:
# Same similarity settings as the streamed preview, but written to the database:
# results are stored as SIMILAR_SKILLSET relationships carrying a sim_score property.
gds.nodeSimilarity.write(
    G,
    similarityMetric='OVERLAP',
    topK=3,
    writeRelationshipType='SIMILAR_SKILLSET',  # type of the relationships that get created
    writeProperty='sim_score'                  # relationship property receiving the score
)

In [None]:
# Drop the person/skill projection from the graph catalogue to free up resources;
# the similarity relationships were already written to the database above.
G.drop()

In [None]:
# Remove symmetric relationships: where two persons point at each other with
# SIMILAR_SKILLSET, delete the one that starts at the node with the smaller id
# so that a single relationship per pair remains.
# NOTE(review): id() is deprecated in recent Neo4j versions in favour of
# elementId() - confirm the target server version before switching.
gds.run_cypher('''
  match (a:Person)-[r:SIMILAR_SKILLSET]->(b:Person) 
    where (b)-[:SIMILAR_SKILLSET]->(a) 
    and   id(a)<id(b)
  delete r
''')


# Semantically similar skills

In [None]:
# Import langchain open ai
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [None]:
# Fetch the name of every Skill node; the result has a single 'skill' column
skills_df = gds.run_cypher('''
  match (s:Skill)
  return s.name as skill
''')
# Preview the first rows
skills_df.head()

In [None]:
# The API key is expected in the environment before the notebook is started:
# export OPENAI_API_KEY="your key"
embeddings = OpenAIEmbeddings(
    # NOTE(review): placeholder value - presumably only relevant together with
    # the Azure settings that are commented out below; confirm before use.
    deployment="your-embeddings-deployment-name",
    model='text-embedding-3-small', # 1536 (length of the vectors this model returns)
    # Azure OpenAI settings, disabled by default:
    #openai_api_base="https://your-endpoint.openai.azure.com/",
    #openai_api_type="azure",
)

In [None]:
# Embed every skill name. The whole column goes through embed_documents in a
# single call (the client splits it into request batches itself) instead of
# one API round trip per row. Results come back in input order, so wrapping
# them in a Series on the frame's own index keeps each vector on its row.
skills_df['embedding'] = pd.Series(
    embeddings.embed_documents(skills_df['skill'].tolist()),
    index=skills_df.index,
    dtype=object,
)

In [None]:
# Preview the skills together with their newly added 'embedding' column
skills_df.head()

In [None]:
# Add embeddings to Skill nodes in database.
# Rows are sent in batches, like the initial graph load: every row carries a
# complete embedding vector, so a single request holding all skills would grow
# without bound as the number of skills increases.
for batch in split_dataframe(skills_df, chunk_size=1_000):
    gds.run_cypher(
        '''
        unwind $data as row
        match (s:Skill{name: row.skill})
        set s.embedding = row.embedding
        ''',
        params={'data': batch.to_dict(orient='records')},
    )

In [None]:
# Project only the Skill nodes, together with their 'embedding' property,
# as input for the kNN run below.
G, res = gds.graph.project(
    'skill_embedding_projection',  # Graph name
    {
        'Skill': {"properties": 'embedding'},
    },
    # KNOWS starts at Person nodes, which are not part of this projection.
    ['KNOWS']    # No rels will be projected, but we need to specify something here :)
)

In [None]:
# Display the result returned by the skill-embedding projection call
res

In [None]:
# Run knn (k-nearest neighbours) on the Skill embeddings and write the result
# to the database as SIMILAR_EMBEDDING relationships carrying a sim_score property.
gds.knn.write(
    G,
    nodeLabels=['Skill'],
    nodeProperties=['embedding'],  # property the neighbours are computed from
    topK=3,                        # keep at most 3 neighbours per skill
    writeRelationshipType='SIMILAR_EMBEDDING',
    writeProperty='sim_score'
)

In [None]:
# Drop the skill-embedding projection from the graph catalogue to free up resources;
# the kNN relationships were already written to the database above.
G.drop()

In [None]:
# Again, let's remove symmetric relationships: where two skills point at each
# other with SIMILAR_EMBEDDING, delete the one that starts at the node with the
# smaller id so that a single relationship per pair remains.
# NOTE(review): id() is deprecated in recent Neo4j versions in favour of
# elementId() - confirm the target server version before switching.
gds.run_cypher("""
  match (a:Skill)-[r:SIMILAR_EMBEDDING]->(b:Skill) 
    where (b)-[:SIMILAR_EMBEDDING]->(a) 
    and   id(a)<id(b)
  delete r
""")

In [None]:
# Let's review: list each skill next to the skills it is linked to through
# SIMILAR_EMBEDDING, with the stored score. The pattern has no direction, so
# every relationship is listed once from each of its two ends.
gds.run_cypher("""
  MATCH (s:Skill)-[r:SIMILAR_EMBEDDING]-(s2)
  RETURN s.name as skill, 
         r.sim_score as score,
         s2.name as to_skill
  ORDER by skill, to_skill, score
""").head(30)