# Some fun with unstructured text

## Neo4j Setup

In [1]:
import os

import pandas as pd
from neo4j import GraphDatabase, RoutingControl # Python database driver

In [2]:

# Connection settings for the Neo4j instance used by this notebook.
# Each value can be overridden through an environment variable so that real
# credentials never have to be committed with the notebook; the fallbacks are
# the original local test values.
DB_URL = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
DB_ULR = DB_URL  # backward-compatible alias for the original (misspelled) name
DB_USER = os.environ.get("NEO4J_USERNAME", "neo4j")
DB_PASS = os.environ.get("NEO4J_PASSWORD", "test1234")
DB_NAME = os.environ.get("NEO4J_DATABASE", "test")

In [3]:
# Create the shared driver from the connection settings above and check
# straight away that the server can be reached.
driver = GraphDatabase.driver(
    DB_ULR,
    auth=(DB_USER, DB_PASS),
)
driver.verify_connectivity()

## Test Graph Creation

In [4]:
# Test data: a list of section records, each with an "id" (the section number)
# and a "text" (the section body). These are written to Neo4j as :Section nodes
# by the following cell.
# NOTE: the triple-quoted strings keep their embedded newlines and the leading
# indentation of continuation lines, so that whitespace is stored with the text.
sections = [
    {
        "id": "4.3.34", 
        "text":'''Institutions should set out, in their credit risk policies and procedures, the criteria for
            identifying, assessing, approving, monitoring, reporting and mitigating credit risk, and the
            criteria for measuring allowances for both accounting and capital adequacy purposes.
            Institutions should document the framework and update it regularly.'''
    },
    # NOTE(review): "creditgranting" in the next section looks like a hyphen lost
    # during text extraction ("credit-granting") - confirm against the source document.
    {
        "id": "4.3.35", 
        "text":'''The objective followed in credit risk policies and procedures should be to promote a proactive
            approach to monitoring credit quality, identifying deteriorating credit early and managing the
            overall credit quality and associated risk profile of the portfolio, including through new creditgranting activities.'''
    },
    {
        "id": "4.3.36", 
        "text":'''Credit risk policies and procedures should cover all lending activities, asset classes, client
            segments, products and specific credit facilities, credit risk management practices, and
            associated responsibilities and controls.'''
    },
      {
        "id": "4.3.37", 
        "text":'''Credit risk policies and procedures should include specific lending policies and procedures, with
            sufficient granularity to capture the specific business lines of the institution, for different
            sectors, in line with their varying complexities and sizes, and risks of different market segments
            related to the credit facility.'''
    }

]


In [5]:
# Write the sample sections to the database: one :Section node per record,
# merged on its id, with the section text stored as a property.
driver.execute_query(
    '''
    unwind $sections as section
    merge (s:Section{id:section.id})
    set s.text = section.text
    ''',
    sections=sections,
    routing_=RoutingControl.WRITE,
    database_=DB_NAME,
)

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x120492b20>, keys=[])

## Extract graph from summary

In [6]:
from openai import OpenAI
from retry import retry

In [7]:
# No api_key is passed explicitly: by default the client picks the key up
# from the environment.
client = OpenAI()

In [8]:
# Read every section that has no `processed` property yet into a DataFrame
# with the columns `id` and `text`.
df_sections = driver.execute_query(
    '''
    match (s:Section) where s.processed is null
    return s.id as id, s.text as text
    ''',
    None,
    routing_=RoutingControl.READ,
    database_=DB_NAME,
    result_transformer_=lambda result: result.to_df(),
)
# Change the column display width so the long section text is readable in the
# preview (NOTE(review): the effect of 0 depends on the pandas version - confirm).
pd.set_option('display.max_colwidth', 0)
df_sections.head()

Unnamed: 0,id,text
0,4.3.34,"Institutions should set out, in their credit risk policies and procedures, the criteria for\n identifying, assessing, approving, monitoring, reporting and mitigating credit risk, and the\n criteria for measuring allowances for both accounting and capital adequacy purposes.\n Institutions should document the framework and update it regularly."
1,4.3.35,"The objective followed in credit risk policies and procedures should be to promote a proactive\n approach to monitoring credit quality, identifying deteriorating credit early and managing the\n overall credit quality and associated risk profile of the portfolio, including through new creditgranting activities."
2,4.3.36,"Credit risk policies and procedures should cover all lending activities, asset classes, client\n segments, products and specific credit facilities, credit risk management practices, and\n associated responsibilities and controls."
3,4.3.37,"Credit risk policies and procedures should include specific lending policies and procedures, with\n sufficient granularity to capture the specific business lines of the institution, for different\n sectors, in line with their varying complexities and sizes, and risks of different market segments\n related to the credit facility."


In [9]:
# System message that frames the model's role for every request.
system = "You are a regulatory compliance expert helping us extract relevant information."

# Instruction prefix for the chat model; the section text is appended directly
# after it, which is why it ends with a newline.
# The worked example uses the same spelling ("St. Peter") in the entity list and
# in the relationship triple, so the model is shown that relationship endpoints
# must match entity names exactly.
prompt = """#This is a section from a regulatory guideline document. The task is to extract as many entities relevant to risks, requirements and methods as possible.
#Additionally, extract all relevant relationships between identified entities.
#The output of a relationship should be in a form of a triple Head, Relationship, Tail, for example
#Peter, WORKS_AT, Hospital
# An example "St. Peter is located in Paris" should have an output with the following format
entity
St. Peter, person, 0.0
Paris, location, 0.0

relationships
St. Peter, LOCATED_IN, Paris\n"""

In [10]:
def parse_entities_and_relationships(input_str):
    """Split a model response into entity rows and relationship triples.

    The expected layout is an ``entity``/``entities`` header, one
    ``name, type, score`` row per entity, a ``relationships`` header, and one
    ``head, RELATIONSHIP, tail`` row per relationship.

    Headers are recognised by content (case-insensitive, optional trailing
    colon, surrounding whitespace ignored) rather than by position, so a
    response that starts directly with ``relationships`` or uses CRLF line
    endings is still parsed correctly. Rows before any header are treated as
    entities.

    Args:
        input_str: Raw text returned by the model.

    Returns:
        A tuple ``(entities, relationships)``; each is a list of 3-item lists
        of stripped strings. Rows that do not split into exactly three
        non-empty fields on ``", "`` (for example ``n/a``) are skipped.
    """
    entities = []
    relationships = []
    target = entities  # list that receives the rows of the current section

    for raw_line in input_str.split("\n"):
        line = raw_line.strip()  # also drops a trailing "\r" from CRLF input
        if not line:
            continue

        header = line.rstrip(":").strip().lower()
        if header in ("entity", "entities"):
            target = entities
            continue
        if header in ("relationship", "relationships"):
            target = relationships
            continue

        fields = [field.strip() for field in line.split(", ")]
        # The model sometimes answers "n/a" or adds free text; keep only
        # well-formed rows.
        if len(fields) != 3 or not all(fields):
            continue
        target.append(fields)

    return entities, relationships

In [11]:
@retry(tries=3, delay=5)
def process_gpt4(text):
    """Ask the chat model to extract entities and relationships from a section.

    The request combines the module-level ``system`` message with ``prompt``
    followed by ``text``. The raw response is printed and then parsed with
    ``parse_entities_and_relationships``. The ``retry`` decorator re-runs the
    call (3 tries, 5 second delay) when it raises.

    Args:
        text: The section text to analyse.

    Returns:
        The ``(entities, relationships)`` tuple produced by
        ``parse_entities_and_relationships``.

    Raises:
        ValueError: If the response does not contain a ``relationships``
            section (raised after the retries are exhausted).
    """
    completion = client.chat.completions.create(
        model="gpt-4",
        # Try to be as deterministic as possible
        temperature=0,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": prompt + text},
        ],
    )

    # The message content may be None; normalise it to an empty string so the
    # format check below reports the real problem instead of a TypeError.
    nlp_results = completion.choices[0].message.content or ""

    print(nlp_results)

    if "relationships" not in nlp_results:
        raise ValueError(
            "GPT is not being nice and isn't returning results in correct format"
        )

    return parse_entities_and_relationships(nlp_results)

In [12]:
# Test 
# for index, row in df_sections.iterrows():
#     print("processing: {id}".format(id=row.id))
#     entities, relationships = process_gpt4(row.text)
#     print(entities)
#     print(relationships)

In [13]:
# Create (if it does not exist yet) a node key constraint on Entity.name, the
# property the extraction step merges Entity nodes on.
driver.execute_query(
    'create constraint if not exists for (n:Entity) require (n.name) is node key',
    None,
    routing_=RoutingControl.WRITE,
    database_=DB_NAME,
)

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1205a1190>, keys=[])

In [14]:
# For every unprocessed section: extract entities and relationships with the
# model, then write them to the graph in a single query that
#   * stamps the section with the processing time,
#   * merges one :Entity node per extracted entity (type set on creation only)
#     and links the section to it with MENTIONS,
#   * merges both endpoints of every extracted triple and connects them with a
#     relationship whose type is the extracted relationship name.
# The relationship is merged without identifying properties, so each
# (source, type, target) combination is stored once.
for _, row in df_sections.iterrows():
    print("processing: {id}".format(id=row.id))
    entities, relationships = process_gpt4(row.text)
    driver.execute_query(
        '''
    MATCH (p:Section{id:$id}) set p.processed=datetime()
    FOREACH (e in $entities |
        MERGE (entity:Entity {name: e[0]})
        ON CREATE SET entity.type = e[1] 
        MERGE (p)-[:MENTIONS]->(entity)
    )
    WITH p
    UNWIND $relationships AS relation
    MERGE (source:Entity {name: relation[0]})
    MERGE (target:Entity {name: relation[2]})
    with source, target, relation
    CALL apoc.merge.relationship(source, relation[1],
        {},
        {},
        target,
        {}
        )
        YIELD rel
    RETURN true as result
    ''',
        id=row.id,
        relationships=relationships,
        entities=entities,
        routing_=RoutingControl.WRITE,
        database_=DB_NAME,
    )

processing: 4.3.34
entities
Institutions, organization, 0.0
Credit risk policies and procedures, document, 0.0
Criteria, concept, 0.0
Identifying, method, 0.0
Assessing, method, 0.0
Approving, method, 0.0
Monitoring, method, 0.0
Reporting, method, 0.0
Mitigating credit risk, method, 0.0
Measuring allowances, method, 0.0
Accounting, field, 0.0
Capital adequacy purposes, concept, 0.0
Framework, concept, 0.0

relationships
Institutions, SET_OUT, Credit risk policies and procedures
Credit risk policies and procedures, INCLUDE, Criteria
Criteria, FOR, Identifying
Criteria, FOR, Assessing
Criteria, FOR, Approving
Criteria, FOR, Monitoring
Criteria, FOR, Reporting
Criteria, FOR, Mitigating credit risk
Criteria, FOR, Measuring allowances
Measuring allowances, FOR, Accounting
Measuring allowances, FOR, Capital adequacy purposes
Institutions, DOCUMENT, Framework
Institutions, UPDATE, Framework
processing: 4.3.35
entities
credit risk policies and procedures, policy, 0.0
credit quality, risk, 0.0
