# Module objectives
- Creating a graph from unstructured data input


In [1]:
#%pip install graphdatascience neo4j dotenv pydantic

# Setup

Import our usual suspects

In [2]:
import os
from dotenv import load_dotenv
from neo4j import Query, GraphDatabase, RoutingControl, Result

Load env variables

In [3]:
# Load settings from ws.env. override=True is passed so that values from the
# file take precedence over variables already set in the process environment.
load_dotenv('ws.env', override=True)

# Neo4j connection settings (the database name falls back to 'neo4j').
HOST = os.getenv('NEO4J_URI')
USERNAME = os.getenv('NEO4J_USERNAME')
PASSWORD = os.getenv('NEO4J_PASSWORD')
DATABASE = os.getenv('NEO4J_DATABASE', 'neo4j')

# AI
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Fail fast with a readable message: assigning None to os.environ raises an
# opaque TypeError, and a missing URI would only surface later in the driver.
if not HOST:
    raise RuntimeError("NEO4J_URI is not set; add it to ws.env or the environment")
if not OPENAI_API_KEY:
    raise RuntimeError("OPENAI_API_KEY is not set; add it to ws.env or the environment")

# Make sure the key is visible to libraries that read it from the environment.
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

Connect to the Neo4j database

In [4]:
# Create the shared driver for the configured Neo4j instance, authenticating
# with the username/password pair read from the environment.
driver = GraphDatabase.driver(HOST, auth=(USERNAME, PASSWORD))

# Check the connection up front, against the database this notebook targets.
driver.verify_connectivity(database=DATABASE)

  experimental_warn(


# Unstructured data

In [5]:
# Free-text biographies that serve as the unstructured input for this module.
# Each entry describes one person and mentions a name, a role, some skills and
# an email address. The strings are passed on verbatim, so their whitespace is
# left exactly as written.
list_of_bio = [
    ''' 
    Kristof "speedy gonzales" Neys, Graph Data Science Director at Neo4j. Kristof excel at Machine 
    Learning and has written more Quantified Path Patterns in Cypher than anyone else. Want to know more,
    drop him an email at kristof.neys@neo4j.com
    ''',
    ''' 
    Håkan Löfqvist, Solutions Engineer at Neo4j. Håkan prefer using java 
    over python, but nothing beats hacking cypher queries and using Graph Technology 
    to deliver insane success :) Email: hakan.lofqvist@neo4j.com
    ''',
]
# Printing the list shows the repr of each string, newlines and indentation included.
print(list_of_bio)

[' \n    Kristof "speedy gonzales" Neys, Graph Data Science Director at Neo4j. Kristof excel at Machine \n    Learning and has written more Quantified Path Patterns in Cypher than anyone else. Want to know more,\n    drop him an email at kristof.neys@neo4j.com\n    ', ' \n    Håkan Löfqvist, Solutions Engineer at Neo4j. Håkan prefer using java \n    over python, but nothing beats hacking cypher queries and using Graph Technology \n    to deliver insane success :) Email: hakan.lofqvist@neo4j.com\n    ']


## Define domain model

In [6]:
#%pip install pydantic
#%pip install openai

In [7]:
from typing import List, Optional
from pydantic import BaseModel, Field, validator


# NOTE: pydantic copies the class docstring and the Field descriptions into the
# model's JSON schema, so this text is presumably part of what the LLM sees
# when the model is used as a response format. Keep it accurate and typo-free.
class Skill(BaseModel):
    """
    Represents a professional skill or knowledge of a person.
    """
    name: str = Field(..., description="Shortened name of the skill")
    

# NOTE: pydantic copies the class docstring and the Field descriptions into the
# model's JSON schema, so this text is presumably part of what the LLM sees
# when the model is used as a response format. Keep it accurate and typo-free.
class Person(BaseModel):
    """
    Represents a person with a name, an email address and a list of skills.
    """
    name: str = Field(..., description="Full name of person")
    email: str = Field(..., description="The person's email address")
    skills: List[Skill] = Field(
        ...,
        description="List of skills known by the person"
    )

In [8]:
# System prompt for the extraction call. It tells the model which details to
# pull out of a biography and what to leave out.
system_message = """
You are an expert in extracting structured information from person resumes.
Identify key details such as:
- Name of the person
- Email address of the person
- Skills known by the person

Present the extracted information in a clear, structured format. Be concise, focusing on:
- Key skills
- Full name of person
Ignore nicknames, titles, roles and company information. Be short and concise with skills."""

In [9]:
from openai import OpenAI
import json
# Shared OpenAI client used by extract(). It is created without explicit
# arguments, so it presumably picks up OPENAI_API_KEY from the environment.
client = OpenAI()
def extract(document: str, model: str = "gpt-4o", temperature: float = 0) -> dict:
    """Extract one person's structured details from free text via the OpenAI API.

    Sends ``document`` as the user message, together with the module-level
    ``system_message``, and requests a response in the ``Person`` format.

    Args:
        document: Free-text description of a single person.
        model: Name of the chat model to call.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The JSON content of the first choice, decoded into a dict.

    Raises:
        ValueError: If the model refused the request or returned no content.
    """
    response = client.beta.chat.completions.parse(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": document},
        ],
        response_format=Person,
    )
    message = response.choices[0].message

    # On a refusal the content is empty; surface that explicitly instead of
    # letting json.loads(None) fail with an unrelated TypeError.
    refusal = getattr(message, "refusal", None)
    if refusal:
        raise ValueError(f"Model refused to extract a person: {refusal}")
    if not message.content:
        raise ValueError("Model returned no content to parse")

    return json.loads(message.content)

In [10]:
# Run the extraction once per biography, keeping the results in input order.
rows = [extract(bio) for bio in list_of_bio]
rows

[{'name': 'Kristof Neys',
  'email': 'kristof.neys@neo4j.com',
  'skills': [{'name': 'Machine Learning'}, {'name': 'Cypher'}]},
 {'name': 'Håkan Löfqvist',
  'email': 'hakan.lofqvist@neo4j.com',
  'skills': [{'name': 'Java'},
   {'name': 'Cypher Queries'},
   {'name': 'Graph Technology'}]}]

In [11]:
class PersonList(BaseModel):
    """Container model that holds every extracted person under ``persons``."""

    persons: List[Person]


# Validate the extracted dicts against the domain model before using them,
# then show the validated data as plain Python structures.
structured_data = PersonList(persons=rows)
print(structured_data.model_dump())

{'persons': [{'name': 'Kristof Neys', 'email': 'kristof.neys@neo4j.com', 'skills': [{'name': 'Machine Learning'}, {'name': 'Cypher'}]}, {'name': 'Håkan Löfqvist', 'email': 'hakan.lofqvist@neo4j.com', 'skills': [{'name': 'Java'}, {'name': 'Cypher Queries'}, {'name': 'Graph Technology'}]}]}


# Graph creation
Now that the data is structured and validated, we can save it to the database.

In [12]:
# Write the extracted people to the graph in one query. Each row becomes a
# Person node keyed on email; every skill becomes a Skill node keyed on name,
# linked to the person with a KNOWS relationship. MERGE matches an existing
# node or relationship when there is one and creates it otherwise.
records, summary, keys = driver.execute_query(
    '''
        unwind $rows as row
        merge (p:Person{email:row.email})
        set p.name = row.name
        with p, row
        foreach(skill in row.skills | merge (s:Skill{name:skill.name}) merge (p)-[:KNOWS]->(s) )
        return count(*) as rows_processed
    ''',
    parameters_={'rows': rows},
    routing_=RoutingControl.WRITE,
    database_=DATABASE,
)