# Module 2 - Taming Unstructured Data

This module has the following objectives:
- Creating a graph from Unstructured Data

In [None]:
# %pip install graphdatascience neo4j python-dotenv pydantic openai pandas

Import our usual suspects (and some more...)

In [None]:
import os
import pandas as pd
from dotenv import load_dotenv
from graphdatascience import GraphDataScience
from neo4j import Query, GraphDatabase, RoutingControl, Result
from typing import List, Optional
from pydantic import BaseModel, Field, validator
from openai import OpenAI
import json

# Setup

Load env variables

In [None]:
# Name of the dotenv file holding the workshop's Neo4j and OpenAI settings.
env_file = "ws.env"

In [None]:
# Load settings from the env file when it exists. If it does not, fall back to
# whatever is already exported in the process environment, so the names below
# are always defined and later cells do not fail with a NameError.
if os.path.exists(env_file):
    load_dotenv(env_file, override=True)
else:
    print(f"File {env_file} not found.")

# Neo4j
HOST = os.getenv('NEO4J_URI')
USERNAME = os.getenv('NEO4J_USERNAME')
PASSWORD = os.getenv('NEO4J_PASSWORD')
DATABASE = os.getenv('NEO4J_DATABASE')

# AI
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# os.environ only accepts strings; assigning None would raise a TypeError.
if OPENAI_API_KEY is not None:
    os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
LLM = os.getenv('LLM')
EMBEDDINGS_MODEL = os.getenv('EMBEDDINGS_MODEL')

# Report unset settings here rather than letting a later cell fail obscurely.
_missing_settings = [
    name
    for name in (
        'NEO4J_URI',
        'NEO4J_USERNAME',
        'NEO4J_PASSWORD',
        'NEO4J_DATABASE',
        'OPENAI_API_KEY',
        'LLM',
        'EMBEDDINGS_MODEL',
    )
    if not os.getenv(name)
]
if _missing_settings:
    print(f"Missing settings: {', '.join(_missing_settings)}")

Connect to neo4j db

In [None]:
# Create the Neo4j driver from the URI and username/password loaded above.
driver = GraphDatabase.driver(HOST, auth=(USERNAME, PASSWORD))

Test the connection

In [None]:
# Smoke-test the connection: count all nodes and display the result as a DataFrame.
driver.execute_query(
    """
    MATCH (n) RETURN COUNT(n) as Count
    """,
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=DATABASE,
)

## Unstructured data

Let's define some unstructured data about some of our Neo4j colleagues

In [None]:
# Free-text biographies used as the unstructured input for this module.
# Each entry is passed as-is to the LLM as the user message, so the text
# (including its informal wording) is data, not documentation.
list_of_bio = [
    """ 
    Kristof "speedy gonzales" Neys, Graph Data Science Director at Neo4j. Kristof excel at Machine 
    Learning and has written more Quantified Path Patterns in Cypher than anyone else. Want to know more,
    drop him an email at kristof.neys@neo4j.com
    """,
    """ 
    Håkan Löfqvist, Solutions Engineer at Neo4j. Håkan prefer using java 
    over python, but nothing beats hacking cypher queries and using Graph Technology 
    to deliver insane success :) Email: hakan.lofqvist@neo4j.com
    """,
]

In [None]:
# Show the raw biographies exactly as they will be sent to the model.
for biography in list_of_bio:
    print(biography)

## Define the Domain Model

[Pydantic Models](https://docs.pydantic.dev/latest/api/base_model/) are simply classes which inherit from BaseModel and define fields as annotated attributes.

In [None]:
class Skill(BaseModel):
    """
    Represents a professional skill or knowledge of a person.
    """
    # NOTE(review): the description presumably ends up in the JSON schema sent
    # to the model, so its wording matters - confirm against pydantic/OpenAI docs.
    name: str = Field(..., description="Shortened name of the skill")
    
class Person(BaseModel):
    """
    Represents a person with a name, an email address and a list of skills.
    """
    # All three fields are required (`...`); this model is the `response_format`
    # used for extraction, so every bio must yield a name, an email and skills.
    name: str = Field(..., description="Full name of person")
    email: str = Field(..., description="A person's email address")
    skills: List[Skill] = Field(
        ..., description="List of skills known by the person"
    )
    
# Container model used to validate all extracted persons in one call.
class PersonList(BaseModel):
    persons: List[Person]

In [None]:
# System prompt for the extraction call. The last instruction was a run-on
# sentence with typos; it is now two explicit sentences so the model gets an
# unambiguous rule.
system_message = """
    You are an expert in extracting structured information from person resumes.
    Identify key details such as:
    - Name of the person
    - Email address of the person
    - Skills known by the person

    Present the extracted information in a clear, structured format. Be concise, focusing on:
    - Key skills
    - Full name of person
    Ignore nicknames, titles, roles and company information. Be short and concise with skills.
"""

In [None]:
# No key is passed explicitly; presumably the client picks up the
# OPENAI_API_KEY exported in the setup cell - confirm against the SDK docs.
client = OpenAI()

In [None]:
def extract(document, model=LLM, temperature=0):
    """
    Extract a structured person record from a free-text document.

    Sends `document` as the user message, together with `system_message`, and
    asks the model to answer in the shape of the `Person` schema.

    Args:
        document: Free-text biography/resume to extract from.
        model: Name of the chat model to use (defaults to the configured LLM).
        temperature: Sampling temperature passed to the API (default 0).

    Returns:
        dict: The model's JSON answer decoded into a Python dictionary.

    Raises:
        ValueError: If the model refuses the request, returns no content, or
            returns content that is not valid JSON.
    """
    response = client.beta.chat.completions.parse(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": document},
        ],
        response_format=Person,
    )
    message = response.choices[0].message

    # A refusal or empty completion leaves `content` unset; json.loads(None)
    # would fail with an unhelpful TypeError, so raise an explicit error instead.
    refusal = getattr(message, "refusal", None)
    if refusal:
        raise ValueError(f"Model refused to extract the document: {refusal}")
    if not message.content:
        raise ValueError("Model returned no content to parse.")

    return json.loads(message.content)

In [None]:
# Run the extraction once per biography, keeping the results in input order.
rows = [extract(text) for text in list_of_bio]
rows

In [None]:
# Validate every extracted record against the domain model in one pass.
structured_data = PersonList.model_validate({"persons": rows})

In [None]:
# Pretty-print the validated model: the top-level field name, then each
# person's attributes, with a blank line between persons.
for field_name, people in structured_data.model_dump().items():
    print(f"{field_name}")
    for person in people:
        for attribute, content in person.items():
            print(f"  {attribute}: {content}")
        print()

## Graph creation
Now that the data is structured and validated, we can save it to the database.

In [None]:
# Upsert one Person per row (keyed on email), then merge each skill and
# connect it to the person with a KNOWS relationship.
persons_merge_query = """
        UNWIND $rows AS row
        MERGE (p:Person{email:row.email})
        SET p.name = row.name
        WITH p, row
        FOREACH (skill IN row.skills | MERGE (s:Skill{name:skill.name}) MERGE (p)-[:KNOWS]->(s) )
        RETURN COUNT (*) AS rows_processed
    """

records, summary, keys = driver.execute_query(
    persons_merge_query,
    rows=rows,
    routing_=RoutingControl.WRITE,
    database_=DATABASE,
)

We could also have used [neomodel (OGM)](https://neo4j.com/labs/neomodel/) to update the graph.

Check the browser with the following:

- ```MATCH p=(n:Person {name: "Kristof Neys"})-[:KNOWS]->(:Skill) RETURN p```
- ```MATCH p=(n:Person {name: "Håkan Löfqvist"})-[:KNOWS]->(:Skill) RETURN p```