# LLM Powered Consultancy Graph Generation

### Outline
1. Configuration
2. Helper Functions
3. Defining Prompts
4. Running the pipeline

In [1]:
%%capture
%pip install requirements.txt
%pip install neo4j openai python-dotenv

In [7]:
import os
import openai
from string import Template
import json
from neo4j import GraphDatabase
import glob
from timeit import default_timer as timer
from dotenv import load_dotenv
from time import sleep
import re

### 1. Configuration

In [3]:
# Load environment variables from a local .env file (if one exists) so that the
# os.getenv() lookups in the configuration cells below can find them.
load_dotenv()

False

In [None]:
# OpenAI API configuration
# openai.api_type = "azure"
# openai.api_key = os.getenv("OPENAI_API_KEY")
# openai.api_base = os.getenv("OPENAI_API_BASE")
# openai.api_version = os.getenv("OPENAI_API_VERSION")
# openai_deployment = "chat-gpt35"

In [18]:
# Read the OpenAI key from the environment instead of embedding it in the notebook.
# NOTE: the key that was previously committed in this cell is exposed and must be revoked.
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError("OPENAI_API_KEY is not set; add it to your environment or .env file.")

In [None]:
# Neo4j configuration: connection settings are read from the environment (.env).
neo4j_url = os.getenv("NEO4J_CONNECTION_URL")
neo4j_user = os.getenv("NEO4J_USER")
neo4j_password = os.getenv("NEO4J_PASSWORD")

# Fail early with a readable message instead of handing None to the driver.
_missing_neo4j_settings = [
    name
    for name, value in (
        ("NEO4J_CONNECTION_URL", neo4j_url),
        ("NEO4J_USER", neo4j_user),
        ("NEO4J_PASSWORD", neo4j_password),
    )
    if not value
]
if _missing_neo4j_settings:
    raise RuntimeError(f"Missing Neo4j settings: {', '.join(_missing_neo4j_settings)}")

gds = GraphDatabase.driver(neo4j_url, auth=(neo4j_user, neo4j_password))

In [38]:
# Connection settings come from the environment in the cell above; credentials must
# never be hardcoded in the notebook. The password that was previously committed in
# this cell is exposed and should be rotated.
# Check that the driver built from the environment settings can reach the database.
gds.verify_connectivity()

### 2. Helper Functions

In [154]:
def process_gpt(file_prompt, system_msg):
    """Send one system + user message pair to gpt-3.5-turbo and return the reply text.

    The reply is also printed, so the raw model output shows up in the cell output.
    """
    chat_messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": file_prompt},
    ]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=chat_messages,
        temperature=0.0,
    )
    reply_text = response.choices[0].message.content
    print(reply_text)
    return reply_text

def extract_entities_relationships(prompt_template, context_text):
    """Ask the model to extract entities and relationships from one piece of text.

    Args:
        prompt_template: string.Template text containing a `$ctext` placeholder.
        context_text: the document text substituted for `$ctext`.

    Returns:
        A tuple `(entities, relationships)` taken from the "entities" and
        "relationships" keys of the model's JSON reply (empty lists when a key is absent).

    Raises:
        json.JSONDecodeError: if no JSON object can be parsed from the reply.
    """
    system_msg = "You are a data extraction assistant. You are asked to extract entities and relationships from the given context. Donot include any information that is not present in the context."
    prompt = Template(prompt_template).substitute(ctext=context_text)
    result = process_gpt(prompt, system_msg=system_msg)
    try:
        response = json.loads(result)
    except json.JSONDecodeError:
        # Chat models frequently wrap the JSON in markdown fences or a sentence of
        # prose; retry on the outermost {...} span before giving up.
        start, end = result.find("{"), result.rfind("}")
        if start == -1 or end <= start:
            raise
        response = json.loads(result[start:end + 1])
    entities = response.get("entities", [])
    relationships = response.get("relationships", [])
    return entities, relationships

def generate_cypher(entities, relationships, output_file=None):
    """Build Cypher MERGE statements for extracted entities and relationships.

    Args:
        entities: list of dicts, each with "label" and "id" keys; every other key
            is written as a string property of the node.
        relationships: list of "head_id|REL_TYPE|tail_id" strings.
        output_file: optional path; when given, all statements are written to it,
            one per line.

    Returns:
        Entity statements followed by relationship statements. A relationship is
        skipped when it is not exactly three "|"-separated parts or when its head
        or tail id does not match any entity.
    """
    def normalize_id(raw_id):
        # Ids are compared with dashes and underscores stripped.
        return raw_id.replace("-", "").replace("_", "")

    def quote(value):
        # json.dumps escapes embedded quotes, backslashes and newlines, so a value
        # such as 'He said "hi"' no longer terminates the Cypher string early.
        return json.dumps(str(value), ensure_ascii=False)

    e_statements = []
    r_statements = []
    e_label_map = {}  # normalized id -> label, used to resolve relationship ends

    for entity in entities:
        label = entity["label"]
        node_id = normalize_id(entity["id"])
        properties = {k: v for k, v in entity.items() if k not in ("label", "id")}

        cypher = f'MERGE (n:{label} {{id: {quote(node_id)}}})'
        if properties:
            props_str = ", ".join(f"n.{key} = {quote(val)}" for key, val in properties.items())
            cypher += f" ON CREATE SET {props_str}"
        e_statements.append(cypher)
        e_label_map[node_id] = label

    for rel in relationships:
        parts = rel.split("|")
        if len(parts) != 3:
            # Model output is not guaranteed to be well-formed; one bad triple
            # should not abort the whole batch.
            print(f"Skipping malformed relationship: {rel!r}")
            continue
        head_id, rel_type, tail_id = parts
        head_id = normalize_id(head_id)
        tail_id = normalize_id(tail_id)

        # Only link nodes that were actually declared as entities.
        if head_id in e_label_map and tail_id in e_label_map:
            head_label = e_label_map[head_id]
            tail_label = e_label_map[tail_id]
            cypher = (
                f'MATCH (a:{head_label} {{id: {quote(head_id)}}}), '
                f'(b:{tail_label} {{id: {quote(tail_id)}}}) '
                f'MERGE (a)-[:{rel_type}]->(b)'
            )
            r_statements.append(cypher)

    statements = e_statements + r_statements
    if output_file:
        with open(output_file, "w", encoding="utf-8") as outfile:
            outfile.write("\n".join(statements))

    return statements

def ingestion_pipeline(prompt_template, context_text, output_file=None):
    """Extract entities/relationships from one text and print the resulting Cypher.

    Nothing is printed or written when the extraction yields neither entities
    nor relationships. `output_file` is passed straight to generate_cypher.
    """
    entities, relationships = extract_entities_relationships(prompt_template, context_text)
    if not (entities or relationships):
        return

    cypher_statements = generate_cypher(entities, relationships, output_file)
    # The statements are only printed here; the loop that ran each one through
    # gds.execute_query (logging failures to failed_statements.txt) is left disabled.
    print(cypher_statements)

In [6]:
# Repo - Function to call the OpenAI API
def process_gpt(file_prompt, system_msg):
    """Send a system + user message pair to gpt-3.5-turbo and return the reply text.

    Sleeps for two seconds after every call before returning.
    """
    conversation = [
        dict(role="system", content=system_msg),
        dict(role="user", content=file_prompt),
    ]
    reply = openai.ChatCompletion.create(
        model="gpt-3.5-turbo", messages=conversation, temperature=0
    )
    answer = reply.choices[0].message.content
    # Fixed pause between calls -- presumably to stay under API rate limits; confirm.
    sleep(2)
    return answer


# Function to take folder of files and a prompt template, and return a json-object of all the entities and relationships
def extract_entities_relationships(folder, prompt_template):
    """Run the extraction prompt over every file in ./data/<folder>.

    Each file's text (trailing whitespace stripped) is substituted for `$ctext`
    in `prompt_template`, sent through process_gpt, and the reply parsed as JSON.
    A file that fails at any step is reported with a printed message and skipped.

    Returns:
        List of parsed JSON replies, one per successfully processed file.
    """
    started = timer()
    paths = glob.glob(f"./data/{folder}/*")
    system_msg = "You are a helpful IT-project and account management expert who extracts information from documents."
    print(f"Running pipeline for {len(paths)} files in {folder} folder")

    parsed_replies = []
    for path in paths:
        print(f"Extracting entities and relationships for {path}")
        try:
            with open(path, "r") as handle:
                document = handle.read().rstrip()
                filled_prompt = Template(prompt_template).substitute(ctext=document)
                raw_reply = process_gpt(filled_prompt, system_msg=system_msg)
                parsed_replies.append(json.loads(raw_reply))
        except Exception as err:
            # Best effort: one unreadable file or unparsable reply must not stop the batch.
            print(f"Error processing {path}: {err}")

    print(f"Pipeline completed in {timer() - started} seconds")
    return parsed_replies


# Function to take a json-object of entitites and relationships and generate cypher query for creating those entities
def generate_cypher(json_obj):
    """Turn a list of extraction results into Cypher MERGE statements.

    Args:
        json_obj: list of dicts, one per processed file, each optionally holding
            "entities" (dicts with "label", "id" and extra properties) and
            "relationships" ("src_id|REL_TYPE|tgt_id" strings).

    Returns:
        Entity statements followed by relationship statements. The same list is
        also written to "cyphers.txt", one statement per line.

    Relationships that are malformed, or that reference an id no entity has
    declared so far, are reported with a printed message and skipped instead of
    aborting the whole run.
    """
    def normalize_id(raw_id):
        # Ids are compared with dashes and underscores stripped.
        return raw_id.replace("-", "").replace("_", "")

    def quote(value):
        # json.dumps escapes embedded quotes, backslashes and newlines so that
        # free-text values (summaries, message text) cannot break the statement.
        return json.dumps(str(value), ensure_ascii=False)

    e_statements = []
    r_statements = []
    e_label_map = {}  # normalized id -> label, shared across all files

    for i, obj in enumerate(json_obj):
        print(f"Generating cypher for file {i+1} of {len(json_obj)}")
        for entity in obj.get("entities", []):
            label = entity["label"]
            node_id = normalize_id(entity["id"])
            properties = {k: v for k, v in entity.items() if k not in ("label", "id")}

            cypher = f'MERGE (n:{label} {{id: {quote(node_id)}}})'
            if properties:
                props_str = ", ".join(
                    f"n.{key} = {quote(val)}" for key, val in properties.items()
                )
                cypher += f" ON CREATE SET {props_str}"
            e_statements.append(cypher)
            e_label_map[node_id] = label

        for rs in obj.get("relationships", []):
            parts = rs.split("|")
            if len(parts) != 3:
                print(f"Skipping malformed relationship: {rs!r}")
                continue
            src_id, rs_type, tgt_id = parts
            src_id = normalize_id(src_id)
            tgt_id = normalize_id(tgt_id)

            # The model sometimes references ids it never declared as entities.
            if src_id not in e_label_map or tgt_id not in e_label_map:
                print(f"Skipping relationship with unknown entity: {rs!r}")
                continue
            src_label = e_label_map[src_id]
            tgt_label = e_label_map[tgt_id]

            cypher = (
                f'MERGE (a:{src_label} {{id: {quote(src_id)}}}) '
                f'MERGE (b:{tgt_label} {{id: {quote(tgt_id)}}}) '
                f'MERGE (a)-[:{rs_type}]->(b)'
            )
            r_statements.append(cypher)

    statements = e_statements + r_statements
    with open("cyphers.txt", "w", encoding="utf-8") as outfile:
        outfile.write("\n".join(statements))

    return statements


# Final function to bring all the steps together
def ingestion_pipeline(folders):
    """Extract from every folder, generate Cypher, and execute it against Neo4j.

    Args:
        folders: dict mapping a folder name under ./data to the prompt template
            used for the files in that folder.

    Statements that fail to execute are logged to "failed_statements.txt"; the
    file is rewritten on the first failure of a run and appended to afterwards,
    so it lists every failure of the current run.

    NOTE: this definition reuses the name of the earlier three-argument
    ingestion_pipeline; when the cells run top to bottom it replaces that one.
    """
    # Extract the entities and relationships from each folder into one list
    entities_relationships = []
    for folder, prompt_template in folders.items():
        entities_relationships.extend(extract_entities_relationships(folder, prompt_template))

    # Generate and execute cypher statements
    cypher_statements = generate_cypher(entities_relationships)
    failure_count = 0
    for i, stmt in enumerate(cypher_statements):
        print(f"Executing cypher statement {i+1} of {len(cypher_statements)}")
        try:
            gds.execute_query(stmt)
        except Exception as e:
            # Opening with "w" every time would keep only the last failure.
            mode = "a" if failure_count else "w"
            with open("failed_statements.txt", mode, encoding="utf-8") as f:
                f.write(f"{stmt} - Exception: {e}\n")
            failure_count += 1

### 3. Defining Prompts

In [148]:
# Prompt for the sample school/family data set. `$ctext` is replaced with the
# document text via string.Template before the prompt is sent to the model.
# The relationship types listed here are the only ones the model should emit, so
# the direction example below deliberately uses one of them (hasMother).
sample_prompt_template = """From the text provided, extract the entities and relationships that are present in the context.
You will be provided detailed profiles, school reports, interactions, and other information about the people, subjects, and activities in the context.

1. Begin by identifying the following entity types in the text and generate them as comma-separated entries, adhering to the specified format. Each entity should have unique alphanumeric `id` properties:
   Entity Types:
   - Person: Represents an individual with various attributes such as name, age, date of birth, school name, family relations, strengths, preferences, teaching roles, friendships, and interests.
     Original Properties:
     - id: The name of the person.
     - name: The name of the person.
     - age: The age of the person (in years).
     - dob: The date of birth of the person.
     - schoolName: The name of the school the person attends.
     - hasBrother: ID of the person's brother.
     - hasMother: ID of the person's mother.
     - hasSister: ID of the person's sister.
     - hasFather: ID of the person's father.
     - hasStrengthIn: Area in which the person has strength.
     - prefersSubject: Subject the person prefers.
     - teaches: Subject the person teaches.
     - hasTeacher: ID of the person's teacher.
     - hasFriend: ID of the person's friend.
     - isInterestedIn: Activity the person is interested in.
     - description: Additional information about the person.
   - Subject: Represents an academic subject with a unique name and additional information.
     Original Properties:
     - id: The name of the subject.
     - name: The name of the subject.
     - description: Additional information about the subject.
   - Activity: Represents an extracurricular activity, specifying its importance, attributes, and additional information.
     Original Properties:
     - id: The name of the activity.
     - name: The name of the activity.
     - isImportantFor: Indicates the importance of the activity for the person.
     - attributes: Attributes associated with the activity.
     - description: Additional information about the activity.

2. Subsequently, generate the relationships between entities as triples of head, relationship, and tail. Use the respective `id` properties of the head and tail entities. Each relationship should be defined as follows:
   Relationship Types:
   - person|hasStrengthIn|subject
   - person|prefersSubject|subject
   - person|teaches|subject
   - person|hasTeacher|person
   - person|hasMother|person
   - person|hasFather|person
   - person|hasBrother|person
   - person|hasSister|person
   - person|hasFriend|person
   - person|isInterestedIn|activity
   - person|participatesIn|activity

Generate only meaningful relationships based on the context provided in the text.
If a relationship is not present in the text, do not generate it.
Be very careful about relationships like mother, father, brother, and sister.
They are not symmetric, and the direction of the relationship should be inferred from the context. For example, if the text says "A is the mother of B," the relationship should be generated as "B|hasMother|A" and not "A|hasMother|B".

Strictly follow the output format specified below. The output should follow this format:
{
    "entities": [{"label":"Person","id":string,"name":string, "description":string}],
    "relationships": ["personid|hasStrengthIn|subjectid"]
}

Case Sheet:
$ctext
"""

In [87]:
# Repo - Prompt for processing project briefs.
# Asks for Project, Technology and Client entities plus USES_TECH / HAS_CLIENT
# relationships as JSON; `$ctext` is filled in via string.Template before the call.
project_prompt_template = """
From the Project Brief below, extract the following Entities & relationships described in the mentioned format 
0. ALWAYS FINISH THE OUTPUT. Never send partial responses
1. First, look for these Entity types in the text and generate as comma-separated format similar to entity type.
   `id` property of each entity must be alphanumeric and must be unique among the entities. You will be referring this property to define the relationship between entities. Do not create new entity types that aren't mentioned below. Document must be summarized and stored inside Project entity under `summary` property. You will have to generate as many entities as needed as per the types below:
    Entity Types:
    label:'Project',id:string,name:string;summary:string //Project mentioned in the brief; `id` property is the full name of the project, in lowercase, with no capital letters, special characters, spaces or hyphens; Contents of original document must be summarized inside 'summary' property
    label:'Technology',id:string,name:string //Technology Entity; `id` property is the name of the technology, in camel-case. Identify as many of the technologies used as possible
    label:'Client',id:string,name:string;industry:string //Client that the project was done for; `id` property is the name of the Client, in camel-case; 'industry' is the industry that the client operates in, as mentioned in the project brief.
    
2. Next generate each relationships as triples of head, relationship and tail. To refer the head and tail entity, use their respective `id` property. Relationship property should be mentioned within brackets as comma-separated. They should follow these relationship types below. You will have to generate as many relationships as needed as defined below:
    Relationship types:
    project|USES_TECH|technology 
    project|HAS_CLIENT|client


3. The output should look like :
{
    "entities": [{"label":"Project","id":string,"name":string,"summary":string}],
    "relationships": ["projectid|USES_TECH|technologyid"]
}

Case Sheet:
$ctext
"""


# Prompt for processing people's profiles.
# Asks for Person, Project and Technology entities plus HAS_SKILLS / HAS_PEOPLE
# relationships as JSON; `$ctext` is filled in via string.Template before the call.
people_prompt_template = """From the list of people below, extract the following Entities & relationships described in the mentioned format 
0. ALWAYS FINISH THE OUTPUT. Never send partial responses
1. First, look for these Entity types in the text and generate as comma-separated format similar to entity type.
   `id` property of each entity must be alphanumeric and must be unique among the entities. You will be referring this property to define the relationship between entities. Do not create new entity types that aren't mentioned below. You will have to generate as many entities as needed as per the types below:
    Entity Types:
    label:'Person',id:string,name:string //Person that the data is about. `id` property is the name of the person, in camel-case. 'name' is the person's name, as spelled in the text.
    label:'Project',id:string,name:string;summary:string //Project mentioned in the profile; `id` property is the full lowercase name of the project, with no capital letters, special characters, spaces or hyphens.
    label:'Technology',id:string,name:string //Technology Entity, as listed in the "skills"-section of every person; `id` property is the name of the technology, in camel-case.
    
2. Next generate each relationships as triples of head, relationship and tail. To refer the head and tail entity, use their respective `id` property. Relationship property should be mentioned within brackets as comma-separated. They should follow these relationship types below. You will have to generate as many relationships as needed as defined below:
    Relationship types:
    person|HAS_SKILLS|technology 
    project|HAS_PEOPLE|person


The output should look like :
{
    "entities": [{"label":"Person","id":string,"name":string}],
    "relationships": ["projectid|HAS_PEOPLE|personid"]
}

Case Sheet:
$ctext
"""


# Prompt for processing slack messages

# Asks for Person and SlackMessage entities plus SENT relationships as JSON;
# `$ctext` is filled in via string.Template before the call.
slack_prompt_template = """
From the list of messages below, extract the following Entities & relationships described in the mentioned format 
0. ALWAYS FINISH THE OUTPUT. Never send partial responses
1. First, look for these Entity types in the text and generate as comma-separated format similar to entity type.
   `id` property of each entity must be alphanumeric and must be unique among the entities. You will be referring this property to define the relationship between entities. Do not create new entity types that aren't mentioned below. You will have to generate as many entities as needed as per the types below:
    Entity Types:
    label:'Person',id:string,name:string //Person that sent the message. `id` property is the name of the person, in camel-case; for example, "michaelClark", or "emmaMartinez"; 'name' is the person's name, as spelled in the text.
    label:'SlackMessage',id:string,text:string //The Slack-Message that was sent; 'id' property should be the message id, as spelled in the reference. 'text' property is the text content of the message, as spelled in the reference
    
2. Next generate each relationships as triples of head, relationship and tail. To refer the head and tail entity, use their respective `id` property. Relationship property should be mentioned within brackets as comma-separated. They should follow these relationship types below. You will have to generate as many relationships as needed as defined below:
    Relationship types:
    personid|SENT|slackmessageid

The output should look like :
{
    "entities": [{"label":"SlackMessage","id":string,"text":string}],
    "relationships": ["personid|SENT|messageid"]
}

Case Sheet:
$ctext
"""

In [152]:
# Load the sample interactions file; its text is passed to the pipeline as the
# document to extract from.
with open('test_data/interactions.txt', 'r') as file:
    content = file.read()
    sample_text = content

In [155]:

# Run the single-document pipeline on the sample text, also writing the statements to cyphers.txt.
# NOTE(review): this three-argument call only matches the first ingestion_pipeline
# definition in section 2. A later cell redefines ingestion_pipeline(folders), so on a
# top-to-bottom run this line raises TypeError -- confirm which definition should be active.
ingestion_pipeline(sample_prompt_template, sample_text, "cyphers.txt")

{
    "entities": [
        {"label": "Person", "id": "Dan", "name": "Dan", "description": "Father of Tudor and Daniela"},
        {"label": "Person", "id": "Supersona", "name": "Supersona", "description": "Conversation partner"},
        {"label": "Person", "id": "Tudor", "name": "Tudor", "age": 11, "dob": "June 24, 2012", "schoolName": "Middle School", "hasMother": "Maria", "hasFather": "Dan", "hasSister": "Daniela", "hasStrengthIn": "Logical reasoning", "prefersSubject": "Mathematics", "isInterestedIn": "Chess, Running, Smartphone Games, Coding, Robotics", "description": "Empathetic, competitive spirit, enjoys chess, running, and technology"},
        {"label": "Person", "id": "Daniela", "name": "Daniela", "description": "Daughter of Dan"},
        {"label": "Person", "id": "Maria", "name": "Maria", "description": "Mother of Tudor and Daniela"}
    ],
    "relationships": [
        "Dan|hasChild|Tudor",
        "Dan|hasChild|Daniela",
        "Tudor|hasStrengthIn|Logical reasoning",

In [None]:
# Map each sub-folder of ./data to the prompt template used for its files.
folders = {
    "people_profiles": people_prompt_template,
    "project_briefs": project_prompt_template,
    "slack_messages": slack_prompt_template,
}

# Extract from every folder, then generate and execute the Cypher statements.
ingestion_pipeline(folders)