In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
from IPython.display import Markdown, display, HTML

import uuid
from tqdm import tqdm
from langchain_community.document_loaders import DataFrameLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

load_dotenv()

True

In [2]:
os.chdir(os.path.dirname(os.getcwd()))

In [3]:
from pathlib import Path

domain_knowledge_path = "omni\prompts\knowledge_bases\cons\domain_knowledge.txt"

with open(domain_knowledge_path, "r") as file:
    domain_knowledge = file.read()

In [4]:
import re

sections = re.split(r'\n# ', domain_knowledge.strip())
sections = [s.strip() for s in sections if s]

data_list = []
for section in sections:
    lines = section.split('\n')
    name = lines[0].replace('# ', '').strip()
    text = ' '.join(lines[1:]).strip()
    data_list.append({'name': name, 'text': text})

services_df = pd.DataFrame(data_list)
services_df['full_description'] = services_df.apply(lambda row: f"{row['name']}: {row['text']}", axis=1)

In [5]:
print(services_df.shape)
services_df.head()

(139, 3)


Unnamed: 0,name,text,full_description
0,Employment Disputes,Handles legal issues concerning current or for...,Employment Disputes: Handles legal issues conc...
1,Coverage Team,Specialized attorneys providing advice on comp...,Coverage Team: Specialized attorneys providing...
2,Field Legal,Manages all litigation matters. Handles cases ...,Field Legal: Manages all litigation matters. H...
3,Compliance Team,Ensures company adherence to legal standards a...,Compliance Team: Ensures company adherence to ...
4,Fraud Investigation Unit,Investigates suspected insurance fraud cases. ...,Fraud Investigation Unit: Investigates suspect...


In [39]:
from typing import List, Dict
from pydantic import BaseModel, ConfigDict, Field

class OntologyLabel(BaseModel):
    """A label describing an ontology item for a given topic or theme."""
    
    category: str = Field(
        ...,
        description="A category label representing an entity type for a given text and theme of interest.",
    )
    description: str = Field(
        ...,
        description="A description or definition for the entity category.",
    )
    
    model_config = ConfigDict(
        json_schema_extra={
            "examples": [
                {   "category": "Department", 
                    "description": "A distinct department or team.."},
                {
                    "category": "Service",
                    "description": "A specific service offering.",
                },
                {
                    "category": "Role",
                    "description": "A role or job title within a department or team.",
                },
                {
                    "category": "Issue",
                    "description": "A specific issue that a team or service can resolve.",
                },
                {
                    "category": "CATEGORY",
                    "description": "DESCRIPTION",
                },
            ]
        }
    )

class Ontology(BaseModel):
    """An ontology for a given text and user specified theme or topic."""
    
    labels: List[OntologyLabel] = Field(
        ..., 
        default_factory=list,
        description="A list of ontology items to be used for downstream entity extraction and knowledge graph creation.",
    )
    model_config = ConfigDict(
        extra="allow",
        arbitrary_types_allowed=True,
    )
    
    @property
    def to_pandas(self):
        ontology_dict = {
            "category": [n.category for n in self.labels],
            "description": [n.description for n in self.labels],

        }
        return pd.DataFrame(ontology_dict)


In [44]:
import instructor
from openai import OpenAI


client = instructor.from_openai(OpenAI())


def user_message(theme: str, text: str) -> str:
    return f"While focusing on the theme **{theme}**, generate a comprehensive and targeted ontology for the following input text: ```\n{text}\n```"


def system_message() -> str:
    return (
        "You are an expert at creating a comprehensive ontology for a given theme or topic. "
        "Users will provide you with a **theme** and an input text delimited by ```. "
        "Extract ALL entity types from the input text that appear relevant to the **theme**. "
        "The goal is to create an ontology to use for downstream knowledge graph construction for the **theme**."
    )


def generate_ontology(theme: str, text: str) -> str:
    return client.chat.completions.create(
        model="gpt-4o",
        max_retries=3,
        messages=[
            {
                "role": "system",
                "content": system_message(),
            },
            {
                "role": "user",
                "content": user_message(theme=theme, text=text),
            },
        ],
        response_model=Ontology,
    )

In [45]:
extracted_ontology = generate_ontology(
    theme="Internal Legal Services",
    text=domain_knowledge,
)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [46]:
o_df = extracted_ontology.to_pandas
Markdown(o_df.to_markdown())

|    | category           | description                                                                                                                                                     |
|---:|:-------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------|
|  0 | Department         | A distinct department or team within the internal legal services.                                                                                               |
|  1 | Service            | A specific service offering within internal legal services.                                                                                                     |
|  2 | Role               | A role or job title within a department or team.                                                                                                                |
|  3 | Issue              | A specific issue that a team or service can resolve.                                                                                                            |
|  4 | Case Type          | A type of legal case handled by the internal legal services.                                                                                                    |
|  5 | Process            | A process or procedure followed by a legal team within the internal legal services.                                                                             |
|  6 | Specialization     | A specific area of expertise within the legal field relevant to internal legal services.                                                                        |
|  7 | Collaboration      | The act of working with other departments, teams, or external entities.                                                                                         |
|  8 | Compliance         | Adherence to laws, regulations, and standards within internal legal services.                                                                                   |
|  9 | Audit              | An internal or external review ensuring adherence to regulations and standards.                                                                                 |
| 10 | Document           | Legal documents or papers handled or produced by the internal legal services.                                                                                   |
| 11 | Guidance           | Advice or recommendations provided by legal experts.                                                                                                            |
| 12 | Litigation         | The action of taking legal proceedings in a court.                                                                                                              |
| 13 | Mediation          | A method of dispute resolution involving a neutral third party.                                                                                                 |
| 14 | Risk Management    | The identification, evaluation, and prioritization of risks followed by coordinated efforts to minimize, control, and monitor the impact of unfortunate events. |
| 15 | Policy             | A course or principle of action adopted or proposed by an organization or individual within the legal sphere.                                                   |
| 16 | Dispute Resolution | The process of resolving a dispute or a conflict.                                                                                                               |
| 17 | Training           | Educational activities aimed at improving legal knowledge and skills.                                                                                           |
| 18 | Regulation         | Rules or directives made and maintained by an authority in the legal context.                                                                                   |

In [47]:
print(extracted_ontology.model_dump_json(indent=4))

{
    "labels": [
        {
            "category": "Department",
            "description": "A distinct department or team within the internal legal services."
        },
        {
            "category": "Service",
            "description": "A specific service offering within internal legal services."
        },
        {
            "category": "Role",
            "description": "A role or job title within a department or team."
        },
        {
            "category": "Issue",
            "description": "A specific issue that a team or service can resolve."
        },
        {
            "category": "Case Type",
            "description": "A type of legal case handled by the internal legal services."
        },
        {
            "category": "Process",
            "description": "A process or procedure followed by a legal team within the internal legal services."
        },
        {
            "category": "Specialization",
            "description": "A specific area of 

In [50]:
from typing import List
from openai import OpenAI
import instructor
from pydantic import BaseModel, Field

client = instructor.from_openai(OpenAI())
model = "gpt-4o"


class Node(BaseModel):
    label: str
    name: str


class Edge(BaseModel):
    node_1: Node
    node_2: Node
    relationship: str
    

class KnowledgeGraph(BaseModel):
    edges: List[Edge] = Field(..., default_factory=list)
    
    @property
    def to_pandas(self):
        kg_dict = {
            "node_1": [n.node_1.name for n in self.edges],
            "node_2": [n.node_2.name for n in self.edges],
            "edge": [n.relationship for n in self.edges],
            "node_1_type": [n.node_1.label for n in self.edges],
            "node_2_type": [n.node_2.label for n in self.edges],
        }
        return pd.DataFrame(kg_dict)


def user_message(text: str) -> str:
    return f"input text: ```\n{text}\n```"

def system_message(ontology: Ontology) -> str:
    return (
        "You are an expert at creating Knowledge Graphs. "
        "Consider the following ontology. \n"
        f"{ontology.model_dump_json(indent=4)} \n"
        "The user will provide you with an input text delimited by ```. "
        "Extract all the entities and relationships from the user-provided text as per the given ontology. Do not use any previous knowledge about the context."
        "Remember there can be multiple direct (explicit) or implied relationships between the same pair of nodes. "
        "Be consistent with the given ontology. Use ONLY the labels and relationships mentioned in the ontology. "
        "Remember to follow the correct format, for example:\n"
        "[\n"
        "   {\n"
        '       node_1: Required, an entity object with attributes: {"label": "as per the ontology", "name": "Name of the entity"},\n'
        '       node_2: Required, an entity object with attributes: {"label": "as per the ontology", "name": "Name of the entity"},\n'
        "       relationship: Describe the relationship between node_1 and node_2 as per the context, in one or two sentences.\n"
        "   },\n"
        "]\n"
    )


def generate_graph(text: str, ontology: Ontology, model: str) -> str:
    return client.chat.completions.create(
        model=model,
        max_retries=3,
        messages=[
            {
                "role": "system",
                "content": system_message(ontology),
            },
            {
                "role": "user",
                "content": user_message(text=text),
            },
        ],
        response_model=KnowledgeGraph,
    )


In [51]:
def df2Graph(df, ontology, model="gpt-4o") -> pd.DataFrame:
    progress_bar = tqdm(total=len(df), desc="Processing chunks")

    def apply_graphPrompt(row):
        result = generate_graph(row.text, ontology, model)
        result_df = result.to_pandas
        result_df["chunk_id"] = row.chunk_id
        progress_bar.update(1)
        return result_df
    
    results = df.apply(apply_graphPrompt, axis=1)
    results_df = pd.concat(results.tolist(), ignore_index=True)
    progress_bar.close()
    
    return results_df

In [52]:
def dataframe2Documents(df: pd.DataFrame, text_column: str):
    loader = DataFrameLoader(df, page_content_column=text_column)
    return loader.load()

def documents2Dataframe(documents) -> pd.DataFrame:
    rows = []
    for chunk in documents:
        row = {
            "text": chunk.page_content,
            **chunk.metadata,
            "chunk_id": uuid.uuid4().hex,
        }
        rows = rows + [row]
    df = pd.DataFrame(rows)
    return df

In [53]:
docs = dataframe2Documents(
    df=services_df[["name", "full_description"]], 
    text_column="full_description",
)
prepped_df = documents2Dataframe(docs)
print(prepped_df.shape)
prepped_df.head(3)

(139, 3)


Unnamed: 0,text,name,chunk_id
0,Employment Disputes: Handles legal issues conc...,Employment Disputes,b9ce1fcf4245475baeeec1740564e622
1,Coverage Team: Specialized attorneys providing...,Coverage Team,22f1ff1b010c4a078b30682768fa6ce1
2,Field Legal: Manages all litigation matters. H...,Field Legal,54ce68e98bfb4a83bcef521071408301


In [54]:
dfg = df2Graph(prepped_df, extracted_ontology)

Processing chunks:   0%|          | 0/139 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Processing chunks:   1%|          | 1/139 [00:02<06:36,  2.87s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Processing chunks:   1%|▏         | 2/139 [00:08<10:50,  4.75s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Processing chunks:   2%|▏         | 3/139 [00:11<08:07,  3.58s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Processing chunks:   3%|▎         | 4/139 [00:13<07:00,  3.12s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Processing chunks:   4%|▎         | 5/139 [00:16<06:42,  3.00s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Processing chunks:   4%|▍         | 6/139 [00:19<07:01,  3.17s/it]

In [55]:
dfg.replace("", np.nan, inplace=True)
dfg.dropna(subset=["node_1", "node_2", 'edge'], inplace=True)
dfg['count'] = 4 

In [56]:
print(dfg.shape)
dfg.head()

(621, 7)


Unnamed: 0,node_1,node_2,edge,node_1_type,node_2_type,chunk_id,count
0,Employment Disputes,wrongful termination,Employment Disputes includes cases related to ...,Case Type,Issue,b9ce1fcf4245475baeeec1740564e622,4
1,Employment Disputes,payment-related disputes,Employment Disputes includes cases related to ...,Case Type,Issue,b9ce1fcf4245475baeeec1740564e622,4
2,Employment Disputes,contractual matters,Employment Disputes includes cases related to ...,Case Type,Issue,b9ce1fcf4245475baeeec1740564e622,4
3,Employment Disputes,discrimination accusations,Employment Disputes does not handle cases rela...,Case Type,Issue,b9ce1fcf4245475baeeec1740564e622,4
4,Coverage Team,Specialized attorneys,Coverage Team consists of Specialized attorneys.,Department,Role,22f1ff1b010c4a078b30682768fa6ce1,4


In [57]:
dfg['node_1'].nunique(), dfg['node_2'].nunique()

(170, 503)

In [58]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    ## Melt the dataframe into a list of nodes
    dfg_long = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    )
    dfg_long.drop(columns=["variable"], inplace=True)
    # Self join with chunk id as the key will create a link between terms occurring in the same text chunk.
    dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))
    # drop self loops
    self_loops_drop = dfg_wide[dfg_wide["node_1"] == dfg_wide["node_2"]].index
    dfg2 = dfg_wide.drop(index=self_loops_drop).reset_index(drop=True)
    ## Group and count edges.
    dfg2 = (
        dfg2.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    dfg2.columns = ["node_1", "node_2", "chunk_id", "count"]
    dfg2.replace("", np.nan, inplace=True)
    dfg2.dropna(subset=["node_1", "node_2"], inplace=True)
    # Drop edges with 1 count
    dfg2 = dfg2[dfg2["count"] != 1]
    dfg2["edge"] = "contextual proximity"
    return dfg2

In [59]:
dfg2 = contextual_proximity(dfg)

In [60]:
dfg2['count'].describe()

count    1678.000000
mean        4.376639
std         2.108840
min         2.000000
25%         3.000000
50%         4.000000
75%         6.000000
max        18.000000
Name: count, dtype: float64

In [61]:
print(dfg2.shape)
dfg2.head()

(1678, 5)


Unnamed: 0,node_1,node_2,chunk_id,count,edge
0,ADR Strategies,Alternative Dispute Resolution Team,"ccb1a3260f904b8d9d687db10380464b,ccb1a3260f904...",6,contextual proximity
7,Addresses instances of insurance fraud,Insurance Fraud Prevention Team,"ecccbc4e02e247c9b476f8bbf6e843be,ecccbc4e02e24...",4,contextual proximity
10,Advertising and Marketing Legal Team,Advertising and marketing issues,"618d341bbe5e41d09f83fd2ec2cec1f4,618d341bbe5e4...",10,contextual proximity
11,Advertising and Marketing Legal Team,Advertising and marketing practices,"618d341bbe5e41d09f83fd2ec2cec1f4,618d341bbe5e4...",5,contextual proximity
12,Advertising and Marketing Legal Team,Advertising regulations and standards,"618d341bbe5e41d09f83fd2ec2cec1f4,618d341bbe5e4...",10,contextual proximity


In [62]:
dfg3 = pd.concat([dfg, dfg2], axis=0)
dfg3 = (
    dfg.groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ','.join, 'count': 'sum'})
    .reset_index()
)
print(dfg3.shape)
dfg3.head()

(599, 5)


Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,Advertising and Marketing Legal Team,Advertising and marketing practices,618d341bbe5e41d09f83fd2ec2cec1f4,Handles legal matters related to,4
1,Advertising and Marketing Legal Team,Advertising regulations and standards,618d341bbe5e41d09f83fd2ec2cec1f4,Provides advice on,4
2,Advertising and Marketing Legal Team,Advice on compliance,618d341bbe5e41d09f83fd2ec2cec1f4,Provides,4
3,Advertising and Marketing Legal Team,Disputes,618d341bbe5e41d09f83fd2ec2cec1f4,Manages,4
4,Advertising and Marketing Legal Team,Litigation,618d341bbe5e41d09f83fd2ec2cec1f4,Manages,4


In [63]:
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
nodes.shape

(622,)

In [70]:
import networkx as nx
G = nx.Graph()

## Add nodes to the graph
for node in nodes:
    G.add_node(
        str(node)
    )

## Add edges to the graph
for index, row in dfg.iterrows():
    G.add_edge(
        str(row["node_1"]),
        str(row["node_2"]),
        title=row["edge"],
        weight=row['count']/4
    )

In [71]:
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  67
[['ADR Strategies', 'Alternative Dispute Resolution Team', 'Arbitration', 'Auto Accidents', 'Auto Claims Legal Team', 'Auto Repair Experts', 'Bad Faith Allegations', 'Bad Faith Allegations Team', 'Claims Adjusters', 'Claims Litigation', 'Claims Litigation Team', 'Defense Strategies', 'Legal Advice', 'Legal Defense', 'Mediation', 'Non-litigation Dispute Resolution', 'Other Departments', 'Support for alternative dispute resolution methods', 'Vehicle Damage', 'Zoning Litigation', 'Zoning Litigation Team', 'Zoning and Land Use Disputes'], ['Addresses instances of insurance fraud', 'Fraud prevention and detection efforts', 'Insurance Fraud Prevention Team', 'Investigates instances of insurance fraud', 'Law enforcement and other agencies'], ['Advertising and Marketing Legal Team', 'Advertising and marketing issues', 'Advertising and marketing practices', 'Advertising regulations and standards', 'Advice on compliance', 'Appellate Legal Team', 'Appellate Litigation 

In [72]:
import random
import seaborn as sns
palette = "hls"

def colors2Community(communities) -> pd.DataFrame:
    p = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(p)
    rows = []
    group = 0
    for community in communities:
        color = p.pop()
        group += 1
        for node in community:
            rows += [{"node": node, "color": color, "group": group}]
    df_colors = pd.DataFrame(rows)
    return df_colors

In [73]:
colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,ADR Strategies,#57aadb,1
1,Alternative Dispute Resolution Team,#57aadb,1
2,Arbitration,#57aadb,1
3,Auto Accidents,#57aadb,1
4,Auto Claims Legal Team,#57aadb,1
...,...,...,...
617,advice on compliance with zoning laws,#db575b,67
618,compliance with zoning laws,#db575b,67
619,disputes involving zoning and land use,#db575b,67
620,litigation involving zoning and land use issues,#db575b,67


In [74]:
for index, row in colors.iterrows():
    G.nodes[row['node']]['group'] = row['group']
    G.nodes[row['node']]['color'] = row['color']
    G.nodes[row['node']]['size'] = G.degree[row['node']]


In [75]:
from pyvis.network import Network

graph_output_directory = "index.html"

net = Network(
    notebook=False,
    bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    font_color="#cccccc",
    filter_menu=True,
)

net.from_nx(G)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
net.show_buttons(filter_=["physics"])

net.show(graph_output_directory, notebook=False)

index.html


In [4]:
from src.parsing.graph_maker import GraphMaker, Document, Ontology

In [76]:
from typing import Literal


ENTITIES = Literal["DEPARTMENT", "TEAM", "ORGANIZATION", "SERVICE", "LEGAL_TOPIC", "INSURANCE_TOPIC"]

RELATIONS = Literal[
    "OVERSEER_OF",
    "RESPONSIBLE_FOR",
    "PROVIDES_GUIDANCE_ON",
    "CONDUCTS",
    "SUPPORTS",
    "REGULATES",
    "REVIEWS",
]



In [88]:
def user_message(text: str) -> str:
    return f"input text: ```\n{text}\n```"

def system_message() -> str:
    return (
        "You are an expert at creating Knowledge Graphs. "
        "The user will provide you with an input text delimited by ```. "
        "Extract all the entities and relationships from the user-provided text. Do not use any previous knowledge about the context."
        "Remember there can be multiple direct (explicit) or implied relationships between the same pair of nodes. "
        "Remember to follow the correct format."
    )


def generate_graph(text: str, model: str = "gpt-4o") -> str:
    return client.chat.completions.create(
        model=model,
        max_retries=3,
        messages=[
            {
                "role": "system",
                "content": system_message(),
            },
            {
                "role": "user",
                "content": user_message(text=text),
            },
        ],
        response_model=List[triplet_cls],
    )

In [89]:
test = generate_graph(domain_knowledge)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


InstructorRetryException: 5 validation errors for Response
content.4.relation.type
  Input should be 'OVERSEER_OF', 'RESPONSIBLE_FOR', 'PROVIDES_GUIDANCE_ON', 'CONDUCTS', 'SUPPORTS', 'REGULATES' or 'REVIEWS' [type=literal_error, input_value='DOES_NOT_HANDLE', input_type=str]
    For further information visit https://errors.pydantic.dev/2.7/v/literal_error
content.12.relation.type
  Input should be 'OVERSEER_OF', 'RESPONSIBLE_FOR', 'PROVIDES_GUIDANCE_ON', 'CONDUCTS', 'SUPPORTS', 'REGULATES' or 'REVIEWS' [type=literal_error, input_value='ENSURES', input_type=str]
    For further information visit https://errors.pydantic.dev/2.7/v/literal_error
content.18.relation.type
  Input should be 'OVERSEER_OF', 'RESPONSIBLE_FOR', 'PROVIDES_GUIDANCE_ON', 'CONDUCTS', 'SUPPORTS', 'REGULATES' or 'REVIEWS' [type=literal_error, input_value='RECOVERS', input_type=str]
    For further information visit https://errors.pydantic.dev/2.7/v/literal_error
content.23.relation.type
  Input should be 'OVERSEER_OF', 'RESPONSIBLE_FOR', 'PROVIDES_GUIDANCE_ON', 'CONDUCTS', 'SUPPORTS', 'REGULATES' or 'REVIEWS' [type=literal_error, input_value='COLLABORATES_WITH', input_type=str]
    For further information visit https://errors.pydantic.dev/2.7/v/literal_error
content.35.relation.type
  Input should be 'OVERSEER_OF', 'RESPONSIBLE_FOR', 'PROVIDES_GUIDANCE_ON', 'CONDUCTS', 'SUPPORTS', 'REGULATES' or 'REVIEWS' [type=literal_error, input_value=' COLLABORATES_WITH', input_type=str]
    For further information visit https://errors.pydantic.dev/2.7/v/literal_error

In [23]:
from typing import List
from openai import OpenAI
import instructor
from pydantic import BaseModel, Field

client = instructor.from_openai(OpenAI())
model = "gpt-4o"


class Node(BaseModel):
    label: str
    name: str


class Edge(BaseModel):
    node_1: Node
    node_2: Node
    relationship: str
    

class KnowledgeGraph(BaseModel):
    edges: List[Edge] = Field(..., default_factory=list)
    
    @property
    def to_pandas(self):
        kg_dict = {
            "node_1": [n.node_1.name for n in self.edges],
            "node_2": [n.node_2.name for n in self.edges],
            "edge": [n.relationship for n in self.edges],
            "node_1_type": [n.node_1.label for n in self.edges],
            "node_2_type": [n.node_2.label for n in self.edges],
        }
        return pd.DataFrame(kg_dict)


def user_message(text: str) -> str:
    return f"input text: ```\n{text}\n```"

def system_message(ontology: Ontology) -> str:
    return (
        "You are an expert at creating Knowledge Graphs. "
        "Consider the following ontology. \n"
        f"{ontology} \n"
        "The user will provide you with an input text delimited by ```. "
        "Extract all the entities and relationships from the user-provided text as per the given ontology. Do not use any previous knowledge about the context."
        "Remember there can be multiple direct (explicit) or implied relationships between the same pair of nodes. "
        "Be consistent with the given ontology. Use ONLY the labels and relationships mentioned in the ontology. "
        "Remember to follow the correct format, for example:\n"
        "[\n"
        "   {\n"
        '       node_1: Required, an entity object with attributes: {"label": "as per the ontology", "name": "Name of the entity"},\n'
        '       node_2: Required, an entity object with attributes: {"label": "as per the ontology", "name": "Name of the entity"},\n'
        "       relationship: Describe the relationship between node_1 and node_2 as per the context, in one or two sentences.\n"
        "   },\n"
        "]\n"
    )


def generate_graph(text: str, ontology: Ontology, model: str) -> str:
    return client.chat.completions.create(
        model=model,
        max_retries=3,
        messages=[
            {
                "role": "system",
                "content": system_message(ontology),
            },
            {
                "role": "user",
                "content": user_message(text=text),
            },
        ],
        response_model=KnowledgeGraph,
    )


In [67]:
# df.head(1)

Unnamed: 0,id,citation,name,name_abbreviation,decision_date,court_id,court_name,court_slug,judges,attorneys,citations,url,head,body,name_contains_lm,body_contains_lm,year,context,context_citation,context_tokens
0,411690,154 Ill. 2d 90,"RICHARD R. JOHNSON, Plaintiff-Appellant and Cr...",Johnson v. Halloran,2000-01-13,8837,Illinois Appellate Court,ill-app-ct,[],"['Wolter, Beeman, Lynch & McIntyre, of Springf...","[{'type': 'official', 'cite': '312 Ill. App. 3...",https://api.case.law/v1/cases/411690/,"RICHARD R. JOHNSON, Plaintiff-Appellant and Cr...",JUSTICE HALL\r\ndelivered the opinion of the c...,False,True,2000,The public defender of Cook County was appoint...,154 Ill. 2d 90,1317


In [8]:
sample_test = df.sample(1)
text_column = 'body'

splitter = RecursiveCharacterTextSplitter(
    chunk_size=10000,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,
)

def dataframe2Documents(df: pd.DataFrame, text_column: str):
    loader = DataFrameLoader(df, page_content_column=text_column)
    return loader.load()

documents = dataframe2Documents(sample_test, text_column)
docs = splitter.split_documents(documents)
len(docs)

4

In [59]:
Markdown(docs[1].page_content)

In Illinois, an insurer’s duty to defend is determined by comparing the allegations in the underlying complaint to the relevant provisions of the insurance policy. Outboard Marine, 154 Ill. 2d at 107-08, 607 N.E.2d at 1212. If the allegations in the underlying complaint fall within or even potentially within the policy’s coverage, the insurer’s duty to defend arises. See, e.g., Cincinnati Cos., 183 Ill. 2d at 323, 701 N.E.2d at 502; Outboard Marine, 154 Ill. 2d at 125, 607 N.E.2d at 1212. Thus, in determining whether a duty to defend has arisen it is the potentiality of coverage that matters. See, e.g., Aetna Casualty & Surety Co. v. Prestige Casualty Co., 195 Ill. App. 3d 660, 664, 553 N.E.2d 39, 41 (1990) (“Unless the complaint, on its face, clearly alleges facts which, if true, would exclude coverage, the potentiality of coverage is present and the insurer has a duty to defend”). The threshold that a complaint must satisfy to present a claim of potential coverage is low. Bituminous Casualty Corp. v. Gust K. Newberg Construction Co., 218 Ill. App. 3d 956, 960, 578 N.E.2d 1006 (1991). For potentiality of coverage to exist, the complaint need present only a possibility of recovery, not a probability of recovery. Bituminous Casualty Corp., 218 Ill. App. 3d at 960, 578 N.E.2d 1006.
Our supreme court has repeatedly and consistently identified the two options available to an insurer that takes the position that a complaint potentially alleging coverage is not covered under its policy: the insurer may either “ ‘defend the suit under a reservation of rights or seek a declaratory judgment that there is no coverage.’ [Citation.]” (Emphasis added.) State Farm Fire & Casualty Co. v. Martin, 186 Ill. 2d 367, 373, 710 N.E.2d 1228, 1231 (1999). The course of action chosen by St. Paul, refusing to do either, has been referred to as a third option by one court. Maneikis v. St. Paul Insurance Co., 655 F.2d 818 (7th Cir. 1981) (noting that the insurer can refuse either to defend or to seek a declaratory judgment at the insurer’s peril that it might later be found to have breached its duty to defend).
Thus, the threshold question in this appeal is whether St. Paul breached its duty to defend. An insurer’s duty to defend is much broader than its duty to indemnify. Crum & Forster Managers Corp. v. Resolution Trust Corp., 156 Ill. 2d 384, 393-94, 620 N.E.2d 1073 (1993). The determination of whether St. Paul breached its duty to defend turns not on whether the claim actually came within the policy (establishing the duty to indemnify) but, rather, on whether the 1990 claim was potentially within the scope of coverage. St. Paul has presented numerous arguments as to why it believes the claim here was not within the policy coverage. In so doing, St. Paul fails to adequately distinguish its duty to defend from its duty to indemnify La Grange. The distinction is important in this appeal. It has long been established that the consequence of an insurer’s breach of its duty to defend is that it is estopped from later raising any policy defenses based on non-coverage. See Sims v. Illinois National Casualty Co., 43 Ill. App. 2d 184, 193 N.E.2d 123 (1963).
Clearly, with respect to the Mundy claim, there was a possibility that the St. Paul policy afforded coverage to La Grange; whether it actually did so was a matter for later investigation by St. Paul. See, e.g., Clemmons v. Travelers Insurance Co., 88 Ill. 2d 469, 476, 430 N.E.2d 1104, 1107 (1981). St. Paul offers no real argument, since there is none, that the 1990 claim did not potentially come within the scope of coverage. We conclude that St. Paul’s duty to defend La Grange in the underlying Mundy claim was established in 1990. St. Paul breached its duty to defend by failing to either seek a declaratory judgment that there was no coverage or defend the claim under a reservation of rights. Thus, since St. Paul breached its duty to defend, in the first instance, it is now estopped from asserting any defenses of noncoverage.
St. Paul’s attempts to draw a distinction between a duty to defend RSMA and a duty to defend La Grange are unavailing. St. Paul wrote this policy to include protection not only to the named insured, but also to third-party beneficiaries, such as La Grange. By drafting this language, St. Paul acknowledged and accepted that its insured would be entering into contracts under which St. Paul would be obligated to provide a defense and a direct benefit to those parties, such as La Grange. St. Paul did not require that its insured get St. Paul’s approval of the contracts or require its insured to disclose the identities of the third parties or require that RSMA name those parties as additional insureds. St. Paul thus assumed the responsibility of providing defenses for certain unknown and unnamed third-party beneficiaries. Since La Grange falls under the definition of the description of the others protected under the agreement, it is a direct beneficiary and is entitled to enforce the terms of the insurance policy. See Altevogt v. Brinkoetter, 85 Ill. 2d 44, 55, 421 N.E.2d 182 (1981) (it is unnecessary that a contract for the benefit of a third-party beneficiary identify him by name; the contract may define the party benefitted by class description); American National Trust Co. v. Kentucky Fried Chicken of Southern California, Inc., 308 Ill. App. 3d 106, 120, 719 N.E.2d 201, 211 (1999) (same); Garcia v. Lovellette, 265 Ill. App. 3d 724, 732, 639 N.E.2d 935 (1994) (same). This principle is even more compelling here where we are dealing with a policy of insurance which is, in addition, strictly construed against the drafter. La Grange is covered by the St. Paul policy because it is a protected person pursuant to the language drafted by St. Paul.
St. Paul, in bringing this appeal, continues to try to put forth its additional policy defense of late notice, contending that the 1990 notice was not adequate and that its duty to defend could not have been triggered until July 25, 1995, the date when St. Paul received notice of the claim directly from La Grange. Illinois law is clear that an insurer’s duty to defend is triggered by actual notice of a claim against the insured. Cincinnati Cos. v. West American Insurance Co., 183 Ill. 2d 317, 701 N.E.2d 499 (1998). An insurer has “actual notice” where it knows both “ ‘that a cause of action has been filed and that the complaint falls within or potentially within the scope of the coverage of one of its policies.’ [Citation.]” (Emphasis added.) Employers Insurance v. Ehlco Liquidating Trust, 186 Ill. 2d 127, 143, 708 N.E.2d 1122, 1131 (1999). “Actual notice” exists where the insurer receives notice “from any source sufficient to permit the insurer to locate and defend its insured.” (Emphasis added.) Insurance Co. v. Federal Kemper Insurance Co., 291 Ill. App. 3d 384, 388, 683 N.E.2d 947 (1997). As we previously explained, on March 27, 1990, RSMA provided notice to St. Paul that La Grange demanded indemnification for Mundy’s claim. This notice included a copy of the letter from La Grange, along with the contracts under which La Grange was claiming RSMA was responsible for Mundy’s claim. The documents sent to St. Paul were listed in a letter that accompanied the documents. On or about April 5, 1990, St. Paul confirmed receipt of this notice. That it was RSMA which provided notice at that time, not La Grange, is irrelevant. The date that La Grange first notified St. Paul is not the issue; what is critical is when St. Paul had actual notice. The trial court concluded, and we agree, that St. Paul had actual notice of this claim in 1990.
Nonetheless, St. Paul asserts in its brief as follows:
“At best, the 1990 notice informed St. Paul that Hilke Mundy, a La Grange employee, had brought a workers’ compensation claim against her employer, and that La Grange wanted RSMA to pay. The 1990 notice did not inform St. Paul that La Grange or RSMA had a contract between them requiring RSMA to make La Grange an insured under the St. Paul Umbrella Excess Liability policy. The 1990 notice did not inform St. Paul that La Grange did not have underlying Workers’ Compensation insurance.” (Emphasis added.)
The “pieces of information” purportedly lacking from RSMA’s notice are not pertinent. RSMA had no duty to include such information in its notice. St. Paul’s contention that the 1990 notice did not provide St. Paul with actual notice of its purported obligations is specious.
Because St. Paul destroyed the policy, the record does not contain the policy’s notice provision. Nonetheless, St. Paul has produced specimens of all of the alternative notice provisions that might have been in effect. First, none of the notice provisions can be interpreted as requiring RSMA to “inform St. Paul that La Grange or RSMA had a contract between them requiring RSMA to make La Grange an insured under the St. Paul Umbrella Excess Liability policy.” Second, none of the notice provisions can be interpreted as requiring RSMA to “inform St. Paul that La Grange did not have underlying Workers’ Compensation insurance.” In fact, St. Paul’s assertion of some requirement that RSMA make La Grange an insured is nothing more than an attempt, after the fact, to inject into the policy St. Paul’s meritless argument that only those that RSMA named as an insured were protected under the policy. RSMA did inform St. Paul that it had a contract with La Grange and provided a copy of the contract, which, when read together with the St. Paul policy, showed that La Grange was a protected entity and showed undeniably that Mundy’s claim was at least potentially within the policy’s coverage. Moreover, notice by an insured to its insurance company is sufficient to charge the insurer on all policies running in the insured’s favor. Casualty Insurance Co. v. E.W. Corrigan Construction Co., 247 Ill. App. 3d 326, 332-33, 617 N.E.2d 228, 233-34 (1993).

In [9]:
def documents2Dataframe(documents) -> pd.DataFrame:
    rows = []
    for chunk in documents:
        row = {
            "text": chunk.page_content,
            **chunk.metadata,
            "chunk_id": uuid.uuid4().hex,
        }
        rows = rows + [row]
    df = pd.DataFrame(rows)
    return df

In [10]:
df = documents2Dataframe(docs)
print(df.shape)
df.head(2)

(4, 21)


Unnamed: 0,text,id,citation,name,name_abbreviation,decision_date,court_id,court_name,court_slug,judges,attorneys,citations,url,head,name_contains_lm,body_contains_lm,year,context,context_citation,context_tokens,chunk_id
0,JUSTICE GALLAGHER\r\ndelivered the opinion of ...,1025915,183 Ill. 2d 317,"LA GRANGE MEMORIAL HOSPITAL, Plaintiff-Appelle...",La Grange Memorial Hospital v. St. Paul Insurance,2000-11-09,8837,Illinois Appellate Court,ill-app-ct,[],"['Piper Marbury Rudnick & Wolfe, of Chicago (K...","[{'type': 'official', 'cite': '317 Ill. App. 3...",https://api.case.law/v1/cases/1025915/,"LA GRANGE MEMORIAL HOSPITAL, Plaintiff-Appelle...",False,True,2000,"At the time, RSMA had a workers’ compensation ...",183 Ill. 2d 317,1546,e4ea5975aaca4e378931a802b8269818
1,"In Illinois, an insurer’s duty to defend is de...",1025915,183 Ill. 2d 317,"LA GRANGE MEMORIAL HOSPITAL, Plaintiff-Appelle...",La Grange Memorial Hospital v. St. Paul Insurance,2000-11-09,8837,Illinois Appellate Court,ill-app-ct,[],"['Piper Marbury Rudnick & Wolfe, of Chicago (K...","[{'type': 'official', 'cite': '317 Ill. App. 3...",https://api.case.law/v1/cases/1025915/,"LA GRANGE MEMORIAL HOSPITAL, Plaintiff-Appelle...",False,True,2000,"At the time, RSMA had a workers’ compensation ...",183 Ill. 2d 317,1546,45f9bd628cd644f09af527819547c567


In [11]:
def df2Ontology(theme: str, df:pd.DataFrame, model: str ="gpt-4o") -> pd.DataFrame:
    progress_bar = tqdm(total=len(df), desc="Processing chunks")

    def apply_ontologyPrompt(row):
        result = generate_ontology(theme=theme, text=row.text, model=model)
        result_df = result.to_pandas
        result_df["chunk_id"] = row.chunk_id
        progress_bar.update(1)
        return result_df
    
    results = df.apply(apply_ontologyPrompt, axis=1)
    results_df = pd.concat(results.tolist(), ignore_index=True)
    progress_bar.close()
    
    return results_df

In [12]:
test = df2Ontology(
    theme="Insurance Coverage",
    df=df)

Processing chunks: 100%|██████████| 4/4 [00:13<00:00,  3.42s/it]


In [13]:
Markdown(test.to_markdown())

|    | category                | description                                                                                                                                | chunk_id                         |
|---:|:------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------|
|  0 | Person                  | A person referenced in the context.                                                                                                        | e4ea5975aaca4e378931a802b8269818 |
|  1 | Organization            | An organization or entity that is referenced in the context.                                                                               | e4ea5975aaca4e378931a802b8269818 |
|  2 | Legal Document          | A formal document referenced in the context, including contracts, policies, or other written agreements.                                   | e4ea5975aaca4e378931a802b8269818 |
|  3 | Insurance Policy        | A specific insurance policy mentioned in the context.                                                                                      | e4ea5975aaca4e378931a802b8269818 |
|  4 | Insurance Claim         | A specific insurance claim mentioned in the context.                                                                                       | e4ea5975aaca4e378931a802b8269818 |
|  5 | Judicial Action         | A legal action, ruling or proceeding referenced in the context.                                                                            | e4ea5975aaca4e378931a802b8269818 |
|  6 | Law or Statute          | A specific law, act, or statute referenced in the text.                                                                                    | e4ea5975aaca4e378931a802b8269818 |
|  7 | Insurer                 | An insurance company referenced in the context.                                                                                            | 45f9bd628cd644f09af527819547c567 |
|  8 | Insurance Coverage      | A specific line of insurance coverage mentioned in the context.                                                                            | 45f9bd628cd644f09af527819547c567 |
|  9 | Duty to Defend          | An insurer's obligation to provide legal defense when a claim is made that potentially falls within the policy's coverage.                 | 45f9bd628cd644f09af527819547c567 |
| 10 | Duty to Indemnify       | An insurer's obligation to cover the costs or damages incurred by the insured if the claim falls within the policy's coverage.             | 45f9bd628cd644f09af527819547c567 |
| 11 | Declaratory Judgment    | A court judgment that determines the rights of parties without ordering any specific action or awarding damages.                           | 45f9bd628cd644f09af527819547c567 |
| 12 | Policy Defense          | Arguments or reasons presented by the insurer to deny coverage.                                                                            | 45f9bd628cd644f09af527819547c567 |
| 13 | Third-party Beneficiary | A party that can benefit from a contract or policy even though they are not the primary insured or policyholder.                           | 45f9bd628cd644f09af527819547c567 |
| 14 | Actual Notice           | The condition where the insurer is made aware of a claim that falls or potentially falls within the policy’s coverage.                     | 45f9bd628cd644f09af527819547c567 |
| 15 | Reservation of Rights   | A declaration made by an insurer to defend a policyholder in a legal action while reserving the right to contest coverage at a later date. | 45f9bd628cd644f09af527819547c567 |
| 16 | Noncoverage             | Situations or claims that are not covered under the insurance policy.                                                                      | 45f9bd628cd644f09af527819547c567 |
| 17 | Case Law                | Specific legal cases cited in the context to support arguments or establish precedents.                                                    | 45f9bd628cd644f09af527819547c567 |
| 18 | Insurance Company       | A company that offers insurance policies to the public.                                                                                    | e49f8433ccec41c7ae0acbbeceadbe5b |
| 19 | Legal Doctrine          | Established principles of law that are authoritative in determining legal outcomes.                                                        | e49f8433ccec41c7ae0acbbeceadbe5b |
| 20 | Legal Action            | Formal proceedings carried out in a court of law.                                                                                          | e49f8433ccec41c7ae0acbbeceadbe5b |
| 21 | Court Ruling            | A decision made by a judge or a court that resolves a legal issue.                                                                         | e49f8433ccec41c7ae0acbbeceadbe5b |
| 22 | Policy Defense          | Arguments or claims made by an insurance company to avoid paying a claim.                                                                  | e49f8433ccec41c7ae0acbbeceadbe5b |
| 23 | Legal Case              | A dispute between parties that is resolved in a court of law.                                                                              | e49f8433ccec41c7ae0acbbeceadbe5b |
| 24 | Reservation of Rights   | A statement by an insurer specifying that they may not cover a claim and that they reserve the right to assert any defenses later.         | e49f8433ccec41c7ae0acbbeceadbe5b |
| 25 | Declaratory Judgment    | A judgment from a court that determines the rights of parties without ordering any specific action or awarding damages.                    | e49f8433ccec41c7ae0acbbeceadbe5b |
| 26 | Penalties and Sanctions | Punitive measures or fines imposed by a court.                                                                                             | e49f8433ccec41c7ae0acbbeceadbe5b |
| 27 | Interest and Damages    | Financial compensation awarded to a party for loss or injury.                                                                              | e49f8433ccec41c7ae0acbbeceadbe5b |
| 28 | Statute or Rule         | A written law passed by a legislative body or a formal regulation.                                                                         | e49f8433ccec41c7ae0acbbeceadbe5b |
| 29 | Insurance Coverage      | The amount and type of protection provided by an insurance policy.                                                                         | e49f8433ccec41c7ae0acbbeceadbe5b |
| 30 | Case Law                | Specific cases mentioned in the context.                                                                                                   | dc432f8b98344f9fa73a9b78b770aa61 |
| 31 | Legal Rule              | Legal rules or statutes referenced in the context.                                                                                         | dc432f8b98344f9fa73a9b78b770aa61 |
| 32 | Insurance Company       | Insurance companies mentioned in the context.                                                                                              | dc432f8b98344f9fa73a9b78b770aa61 |
| 33 | Judicial Outcome        | Descriptions of judgments, affirmations, or dismissals.                                                                                    | dc432f8b98344f9fa73a9b78b770aa61 |
| 34 | Sanction                | Penalties or orders issued by the court.                                                                                                   | dc432f8b98344f9fa73a9b78b770aa61 |
| 35 | Legal Concept           | Legal principles or doctrines referenced in the context.                                                                                   | dc432f8b98344f9fa73a9b78b770aa61 |

In [25]:
client = instructor.from_openai(OpenAI())

def user_message(theme: str, text: str) -> str:
    return f"While focusing on the theme **{theme}**, generate an ontology for the following input text: ```\n{text}\n```"

def system_message() -> str:
    return (
        "You are an expert at creating an ontology for a given theme or topic. "
        "Users will provide you with a **theme** and an input text delimited by ```. "
        "Extract all the entity types from the input text relevant to the **theme**. "
        "The goal is to create an ontology to use for downstream knowledge graph construction for the **theme**."
    )


def generate_ontology(theme: str, text: str) -> str:
    return client.chat.completions.create(
        model="gpt-4o",
        max_retries=3,
        messages=[
            {
                "role": "system",
                "content": system_message(),
            },
            {
                "role": "user",
                "content": user_message(theme=theme, text=text),
            },
        ],
        response_model=Ontology,
    )



In [15]:
text_sample = df['text'].tolist()[0]

In [26]:
extracted_ontology = generate_ontology(
    theme="Insurance Coverage",
    text=text_sample,
)

In [27]:
extracted_ontology.to_pandas

Unnamed: 0,category,description
0,Person,An individual involved in the context of the i...
1,Organization,"A company, hospital, insurance firm or other e..."
2,Insurance Policy,A specific insurance coverage or policy mentio...
3,Legal Document,"Legal documents such as contracts, agreements,..."
4,Law,"Statutes, codes, or regulations relevant to th..."
5,Event,Incidents or occurrences relevant to the insur...
6,Court Decision,Specific court rulings or judicial findings re...
7,Legal Term,Specific legal terms or concepts relevant to t...
8,Claim,Specific claims made under an insurance policy.


In [29]:
from typing import List
from openai import OpenAI
import instructor
from pydantic import BaseModel, Field

client = instructor.from_openai(OpenAI())
model = "gpt-3.5-turbo"


class Node(BaseModel):
    label: str
    name: str


class Edge(BaseModel):
    node_1: Node
    node_2: Node
    relationship: str
    

class KnowledgeGraph(BaseModel):
    edges: List[Edge] = Field(..., default_factory=list)
    
    @property
    def to_pandas(self):
        kg_dict = {
            "node_1": [n.node_1.name for n in self.edges],
            "node_2": [n.node_2.name for n in self.edges],
            "edge": [n.relationship for n in self.edges],
            "node_1_type": [n.node_1.label for n in self.edges],
            "node_2_type": [n.node_2.label for n in self.edges],
        }
        return pd.DataFrame(kg_dict)


def user_message(text: str) -> str:
    return f"input text: ```\n{text}\n```"

def system_message(ontology: Ontology) -> str:
    return (
        "You are an expert at creating Knowledge Graphs. "
        "Consider the following ontology. \n"
        f"{ontology} \n"
        "The user will provide you with an input text delimited by ```. "
        "Extract all the entities and relationships from the user-provided text as per the given ontology. Do not use any previous knowledge about the context."
        "Remember there can be multiple direct (explicit) or implied relationships between the same pair of nodes. "
        "Be consistent with the given ontology. Use ONLY the labels and relationships mentioned in the ontology. "
        "Remember to follow the correct format, for example:\n"
        "[\n"
        "   {\n"
        '       node_1: Required, an entity object with attributes: {"label": "as per the ontology", "name": "Name of the entity"},\n'
        '       node_2: Required, an entity object with attributes: {"label": "as per the ontology", "name": "Name of the entity"},\n'
        "       relationship: Describe the relationship between node_1 and node_2 as per the context, in one or two sentences.\n"
        "   },\n"
        "]\n"
    )


def generate_graph(text: str, ontology: Ontology, model: str) -> str:
    return client.chat.completions.create(
        model=model,
        max_retries=3,
        messages=[
            {
                "role": "system",
                "content": system_message(ontology),
            },
            {
                "role": "user",
                "content": user_message(text=text),
            },
        ],
        response_model=KnowledgeGraph,
    )


In [30]:
def df2Graph(df, ontology, model="gpt-4o") -> pd.DataFrame:
    progress_bar = tqdm(total=len(df), desc="Processing chunks")

    def apply_graphPrompt(row):
        result = generate_graph(row.text, ontology, model)
        result_df = result.to_pandas
        result_df["chunk_id"] = row.chunk_id
        progress_bar.update(1)
        return result_df
    
    results = df.apply(apply_graphPrompt, axis=1)
    results_df = pd.concat(results.tolist(), ignore_index=True)
    progress_bar.close()
    
    return results_df

In [32]:
dfg = df2Graph(df, extracted_ontology)

Processing chunks: 100%|██████████| 4/4 [01:32<00:00, 23.21s/it]


In [33]:
print(dfg.shape)
dfg.head()

(90, 6)


Unnamed: 0,node_1,node_2,edge,node_1_type,node_2_type,chunk_id
0,JUSTICE GALLAGHER,Opinion,JUSTICE GALLAGHER delivered the opinion of the...,Person,Court Decision,e4ea5975aaca4e378931a802b8269818
1,La Grange Memorial Hospital,St. Paul Insurance Company,Involved in an insurance coverage dispute aris...,Organization,Organization,e4ea5975aaca4e378931a802b8269818
2,Hilke Mundy,La Grange Memorial Hospital,Hilke Mundy brought a workers’ compensation cl...,Person,Organization,e4ea5975aaca4e378931a802b8269818
3,La Grange Memorial Hospital,St. Paul Insurance Company,La Grange contended that St. Paul had a duty t...,Organization,Organization,e4ea5975aaca4e378931a802b8269818
4,Rehabilitation Services of Mid-America,St. Paul Insurance Company,St. Paul issued an umbrella excess insurance p...,Organization,Organization,e4ea5975aaca4e378931a802b8269818


In [34]:
dfg['node_1'].nunique(), dfg['node_2'].nunique()

(54, 48)

In [37]:
nodes = pd.concat(
    [dfg["node_1"], dfg["node_2"]], axis=0
).unique()

len(nodes)

87

In [38]:
import networkx as nx

G = nx.Graph()

for node in nodes:
    G.add_node(str(node))
# Add edges to the graph
for _, row in dfg.iterrows():
    G.add_edge(
        str(row["node_1"]),
        str(row["node_2"]),
        title=row["edge"],
        chunk_id=row["chunk_id"],
    )

In [88]:
from cdlib import algorithms

def detect_communities(graph):
    communities = []
    index = 0
    for component in nx.connected_components(graph):
        print(
            f"Component index {index} of {len(list(nx.connected_components(graph)))}:")
        subgraph = graph.subgraph(component)
        if len(subgraph.nodes) > 1:  # Leiden algorithm requires at least 2 nodes
            try:
                sub_communities = algorithms.leiden(subgraph)
                for community in sub_communities.communities:
                    communities.append(list(community))
            except Exception as e:
                print(f"Error processing community {index}: {e}")
        else:
            communities.append(list(subgraph.nodes))
        index += 1
    print("Communities from detect_communities:", communities)
    return communities

In [89]:
test = detect_communities(G)

Component index 0 of 11:
Error processing community 0: invalid literal for int() with base 10: 'JUSTICE GALLAGHER'
Component index 1 of 11:
Component index 2 of 11:
Error processing community 2: invalid literal for int() with base 10: 'underlying complaint'
Component index 3 of 11:
Error processing community 3: invalid literal for int() with base 10: 'State Farm Fire & Casualty Co.'
Component index 4 of 11:
Error processing community 4: invalid literal for int() with base 10: 'American National Trust Co. v. Kentucky Fried Chicken of Southern California, Inc., 308 Ill. App. 3d 106, 120, 719 N.E.2d 201, 211 (1999)'
Component index 5 of 11:
Error processing community 5: invalid literal for int() with base 10: 'Federal Kemper Insurance Co.'
Component index 6 of 11:
Error processing community 6: invalid literal for int() with base 10: 'Casualty Insurance Co. v. E.W. Corrigan Construction Co., 247 Ill. App. 3d 326, 332-33, 617 N.E.2d 228, 233-34 (1993)'
Component index 7 of 11:
Error process

In [90]:
len(test)

6

In [85]:
def detect_communities(G):
    communities_generator = nx.community.girvan_newman(G)
    next_level_communities = next(communities_generator)
    communities = sorted(map(sorted, next_level_communities))
    return communities

In [86]:
communities = detect_communities(G)

StopIteration: 

In [81]:
len(communities)

26

In [82]:
communities

[['1990 claim', 'Mundy', 'RSMA', 'contracts'],
 ['1990 notice',
  'Illinois Supreme Court Rule 341(a)',
  'St. Paul',
  'appeal',
  'destruction of policy',
  'duty to defend RSMA versus La Grange',
  'equitable estoppel',
  'insurance policy',
  'late notice defense',
  'persuasion attempt'],
 ['2015 notice',
  'Illinois Interest Act, Section 2',
  'La Grange',
  'La Grange’s filing of declaratory judgment action',
  'St. Paul Policy',
  'sanctions against St. Paul'],
 ['Altevogt v. Brinkoetter, 85 Ill. 2d 44, 55, 421 N.E.2d 182 (1981)',
  'American National Trust Co. v. Kentucky Fried Chicken of Southern California, Inc., 308 Ill. App. 3d 106, 120, 719 N.E.2d 201, 211 (1999)',
  'Garcia v. Lovellette, 265 Ill. App. 3d 724, 732, 639 N.E.2d 935 (1994)',
  'third-party beneficiary'],
 ['Appeal', 'Jurisdiction'],
 ['Avis Rent A Car System, Inc.', 'Collier'],
 ['Award of penalties', 'Section 155'],
 ['Breach of duty to defend',
  'Illinois Supreme Court Rule 341(g)',
  'Industrial Coating

In [105]:
for index, community in enumerate(communities):
    print(f"Summarize Community index {index} of {len(communities)}:")
    subgraph = G.subgraph(community)
    nodes = list(subgraph.nodes)
    edges = list(subgraph.edges(data=True))
    description = "Entities: " + ", ".join(nodes) + "\nRelationships: "
    relationships = []
    for edge in edges:
        relationships.append(
            f"{edge[0]} -> {edge[2]['title']} -> {edge[1]}")
    description += ", ".join(relationships)
    print(description)

Summarize Community index 0 of 26:
Entities: RSMA, 1990 claim, Mundy, contracts
Relationships: RSMA -> RSMA provided notice of the 1990 claim to St. Paul. -> 1990 claim, 1990 claim -> Mundy presented the 1990 claim. -> Mundy, 1990 claim -> Contracts were part of the documents sent to St. Paul regarding the 1990 claim. -> contracts
Summarize Community index 1 of 26:
Entities: 1990 notice, Illinois Supreme Court Rule 341(a), persuasion attempt, equitable estoppel, late notice defense, St. Paul, duty to defend RSMA versus La Grange, insurance policy, destruction of policy, appeal
Relationships: 1990 notice -> Received notice of Mundy’s claim in 1990. -> St. Paul, Illinois Supreme Court Rule 341(a) -> St. Paul failed to comply with this rule regarding the page limitation of the reply brief. -> St. Paul, persuasion attempt -> St. Paul's attempt to persuade the court that RSMA’s 1990 notice did not constitute 'actual notice' under Illinois law was found unconvincing. -> St. Paul, equitable e

In [53]:
import openai

def summarize_communities(communities, graph):
    client = openai.OpenAI()
    community_summaries = []
    for index, community in enumerate(communities):
        print(f"Summarize Community index {index} of {len(communities)}:")
        subgraph = graph.subgraph(community)
        nodes = list(subgraph.nodes)
        edges = list(subgraph.edges(data=True))
        description = "Entities: " + ", ".join(nodes) + "\nRelationships: "
        relationships = []
        for edge in edges:
            relationships.append(
                f"{edge[0]} -> {edge[2]['title']} -> {edge[1]}")
        description += ", ".join(relationships)

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "Summarize the following community of entities and relationships."},
                {"role": "user", "content": description}
            ]
        )
        summary = response.choices[0].message.content.strip()
        community_summaries.append(summary)
    return community_summaries

In [91]:
summarized_communities = summarize_communities(test, G)

Summarize Community index 0 of 6:
Summarize Community index 1 of 6:
Summarize Community index 2 of 6:
Summarize Community index 3 of 6:
Summarize Community index 4 of 6:
Summarize Community index 5 of 6:


In [104]:
Markdown(summarized_communities[5])

The community of entities and relationships revolves around legal cases and the concept of "actual notice" in the context of insurance law.

1. **Insurance Co. v. Federal Kemper Insurance Co., 291 Ill. App. 3d 384, 388, 683 N.E.2d 947 (1997)**: This case addresses the specification of the existence of actual notice.

2. **Employers Insurance v. Ehlco Liquidating Trust, 186 Ill. 2d 127, 143, 708 N.E.2d 1122, 1131 (1999)**: This case defines actual notice, stating that an insurer has actual notice when it is aware that a cause of action has been filed and the complaint possibly falls within the policy coverage.

3. **Cincinnati Cos. v. West American Insurance Co., 183 Ill. 2d 317, 701 N.E.2d 499 (1998)**: This case is associated with the principle that an insurer's duty to defend is triggered by its actual notice of a claim against the insured.

These cases collectively elucidate the role of actual notice in determining an insurer's duty to defend.

In [None]:
# 6. Community Summaries → Community Answers → Global Answer
def generate_answers_from_communities(community_summaries, query):
    intermediate_answers = []
    for index, summary in enumerate(community_summaries):
        print(f"Summary index {index} of {len(community_summaries)}:")
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "Answer the following query based on the provided summary."},
                {"role": "user", "content": f"Query: {query} Summary: {summary}"}
            ]
        )
        print("Intermediate answer:", response.choices[0].message.content)
        intermediate_answers.append(
            response.choices[0].message.content)

    final_response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system",
                "content": "Combine these answers into a final, concise response."},
            {"role": "user", "content": f"Intermediate answers: {intermediate_answers}"}
        ]
    )
    final_answer = final_response.choices[0].message.content
    return final_answer

In [None]:
final_answer = generate_answers_from_communities(
    community_summaries=summarized_communities,
    query="",
)

In [29]:
import logging
import random
import seaborn as sns


palette = "hls"
p = sns.color_palette(palette, len(communities)).as_hex()
random.shuffle(p)
rows = []
group = 0
for community in communities:
    color = p.pop()
    group += 1
    for node in community:
        rows += [{"node": node, "color": color, "group": group}]
df_colors = pd.DataFrame(rows)

for _, row in df_colors.iterrows():
    G.nodes[row["node"]]["group"] = row["group"]
    G.nodes[row["node"]]["color"] = row["color"]
    G.nodes[row["node"]]["size"] = G.degree[row["node"]]


In [30]:
import json

graph_data = nx.node_link_data(G)

# Specify the file path where you want to save the JSON
json_file_path = "data/graph_data.json"

# Write the graph data to a JSON file
with open(json_file_path, "w", encoding="utf-8-sig") as json_file:
    json.dump(graph_data, json_file, ensure_ascii=False)

In [31]:
graph_data["links"]

[{'title': 'chunk contextual proximity',
  'weight': 1.75,
  'chunk_id': '0d1336884d5b4d1b9f43d4144b3e7e67,0d1336884d5b4d1b9f43d4144b3e7e67,0d1336884d5b4d1b9f43d4144b3e7e67,0d1336884d5b4d1b9f43d4144b3e7e67,0d1336884d5b4d1b9f43d4144b3e7e67,0d1336884d5b4d1b9f43d4144b3e7e67,0d1336884d5b4d1b9f43d4144b3e7e67',
  'source': '$984,943.15',
  'target': 'Hartford'},
 {'title': 'chunk contextual proximity',
  'weight': 0.5,
  'chunk_id': '0d1336884d5b4d1b9f43d4144b3e7e67,0d1336884d5b4d1b9f43d4144b3e7e67',
  'source': '$984,943.15',
  'target': 'Hartford Casualty'},
 {'title': 'chunk contextual proximity',
  'weight': 0.5,
  'chunk_id': '0d1336884d5b4d1b9f43d4144b3e7e67,0d1336884d5b4d1b9f43d4144b3e7e67',
  'source': '$984,943.15',
  'target': 'January 17, 1995'},
 {'title': 'chunk contextual proximity,The amount included in the judgment in favor of Konami.',
  'weight': 4.25,
  'chunk_id': '0d1336884d5b4d1b9f43d4144b3e7e67,0d1336884d5b4d1b9f43d4144b3e7e67,0d1336884d5b4d1b9f43d4144b3e7e67,0d1336884

In [34]:
def load_graph_from_json(json_file_path):
    try:
        with open(json_file_path, 'r', encoding='utf-8-sig') as file:
            json_data = json.load(file)
    except FileNotFoundError:
        print(f"File not found: {json_file_path}")
        return None
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from the file: {json_file_path}. Error: {e}")
        return None
    except Exception as e:
        print(f"Unexpected error while reading the file: {e}")
        return None

    G = nx.Graph()
    for node in json_data['nodes']:
        G.add_node(node['id'])
    for link in json_data['links']:
        G.add_edge(link['source'], link['target'], weight=link['weight'], title=link['title'])
    return G


# function to load df from json for chunk retrieval based on node and chunk id
def load_chunks_dataframe(json_file_path):
    try:
        with open(json_file_path, 'r', encoding='utf-8-sig') as file:
            data = json.load(file)

        # Extracting chunks and their IDs
        chunks = []
        for link in data['links']:
            chunk_ids = link.get('chunk_id', '').split(',')
            text = link.get('title', '')  # Assuming 'title' contains the text associated with the chunk
            for chunk_id in chunk_ids:
                if chunk_id:
                    chunks.append({'chunk_id': chunk_id, 'text': text})

        return pd.DataFrame(chunks)

    except Exception as e:
        print(f"Error: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of an error

In [35]:
# Load the graph
graph = load_graph_from_json('data/graph_data.json')
# load the chunk_dataframe
chunks_dataframe = load_chunks_dataframe('data/graph_data.json')

In [37]:
chunks_dataframe[chunks_dataframe['text']!='chunk contextual proximity']

Unnamed: 0,chunk_id,text
11,0d1336884d5b4d1b9f43d4144b3e7e67,"chunk contextual proximity,The amount included..."
12,0d1336884d5b4d1b9f43d4144b3e7e67,"chunk contextual proximity,The amount included..."
13,0d1336884d5b4d1b9f43d4144b3e7e67,"chunk contextual proximity,The amount included..."
14,0d1336884d5b4d1b9f43d4144b3e7e67,"chunk contextual proximity,The amount included..."
15,0d1336884d5b4d1b9f43d4144b3e7e67,"chunk contextual proximity,The amount included..."
...,...,...
2070,7673b452763e4cb3b6cbca0d8abc3e83,Direct patent infringement refers to the makin...
2077,7673b452763e4cb3b6cbca0d8abc3e83,"chunk contextual proximity,Some dictionaries d..."
2078,7673b452763e4cb3b6cbca0d8abc3e83,"chunk contextual proximity,Some dictionaries d..."
2079,7673b452763e4cb3b6cbca0d8abc3e83,"chunk contextual proximity,Some dictionaries d..."


In [38]:
import re


def textualize_graph(graph):
    triplets = re.findall(r'\((.*?)\)', graph)
    nodes = {}
    edges = []
    for tri in triplets:
        src, edeg_attr, dst = tri.split(';')
        src = src.lower().strip()
        dst = dst.lower().strip()
        if src not in nodes:
            nodes[src] = len(nodes)
        if dst not in nodes:
            nodes[dst] = len(nodes)
        edges.append({'src': nodes[src], 'edge_attr': edeg_attr.lower().strip(), 'dst': nodes[dst], })

    nodes = pd.DataFrame(nodes.items(), columns=['node_attr', 'node_id'])
    edges = pd.DataFrame(edges)
    return nodes, edges

In [39]:
from src.embedding_models.models import OpenAIEmbeddings
import nest_asyncio

embeddings = OpenAIEmbeddings()
embedding_fn = embeddings.embedding_fn()

In [43]:
node_features = pd.DataFrame(graph_data["nodes"])
node_embeddings = embedding_fn(node_features["id"].tolist())
node_features["embeddings"] = node_embeddings

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [44]:
print(node_features.shape)
node_features.head()

(59, 5)


Unnamed: 0,group,color,size,id,embeddings
0,1,#db5f57,9,"$984,943.15","[0.01951550878584385, -0.01489550992846489, 0...."
1,1,#db5f57,6,935 F. Supp. at 1116,"[-0.00864872895181179, 0.005267801228910685, 0..."
2,1,#db5f57,5,Advertising mode,"[-0.006128218956291676, -0.011504832655191422,..."
3,1,#db5f57,9,"April 16, 1999","[-0.0065999156795442104, -0.031243011355400085..."
4,1,#db5f57,9,"April 23, 1996","[-0.001258106785826385, -0.008358835242688656,..."


In [45]:
edge_features = pd.DataFrame(graph_data["links"])
edge_embeddings = embedding_fn(edge_features["title"].tolist())
node_embeddings = embedding_fn(edge_features["source"].tolist())
edge_features["embeddings"] = edge_embeddings

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [46]:
edge_features

Unnamed: 0,title,weight,chunk_id,source,target,embeddings
0,chunk contextual proximity,1.75,"0d1336884d5b4d1b9f43d4144b3e7e67,0d1336884d5b4...","$984,943.15",Hartford,"[-0.025688467547297478, -0.01057156641036272, ..."
1,chunk contextual proximity,0.50,"0d1336884d5b4d1b9f43d4144b3e7e67,0d1336884d5b4...","$984,943.15",Hartford Casualty,"[-0.025688467547297478, -0.01057156641036272, ..."
2,chunk contextual proximity,0.50,"0d1336884d5b4d1b9f43d4144b3e7e67,0d1336884d5b4...","$984,943.15","January 17, 1995","[-0.025688467547297478, -0.01057156641036272, ..."
3,"chunk contextual proximity,The amount included...",4.25,"0d1336884d5b4d1b9f43d4144b3e7e67,0d1336884d5b4...","$984,943.15",Konami,"[0.0019303852459415793, -0.0123488400131464, 0..."
4,chunk contextual proximity,0.50,"0d1336884d5b4d1b9f43d4144b3e7e67,0d1336884d5b4...","$984,943.15",Land & Sky,"[-0.025688467547297478, -0.01057156641036272, ..."
...,...,...,...,...,...,...
324,"chunk contextual proximity,Some dictionaries d...",0.75,"7673b452763e4cb3b6cbca0d8abc3e83,7673b452763e4...",patent infringement,piracy,"[-0.01839851774275303, -0.02950393036007881, 0..."
325,chunk contextual proximity,0.75,"7673b452763e4cb3b6cbca0d8abc3e83,7673b452763e4...",patent infringement,sale of a patented component,"[-0.025586551055312157, -0.01060118991881609, ..."
326,chunk contextual proximity,0.50,"0d1336884d5b4d1b9f43d4144b3e7e67,0d1336884d5b4...",section 155,summary judgment,"[-0.025688467547297478, -0.01057156641036272, ..."
327,chunk contextual proximity,1.25,"0d1336884d5b4d1b9f43d4144b3e7e67,0d1336884d5b4...",section 155,trial,"[-0.025688467547297478, -0.01057156641036272, ..."


In [55]:
from pyvis.network import Network

net = Network(
    notebook=False,
    bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    font_color="#cccccc",
    filter_menu=False,
)
net.from_nx(G)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
net.show_buttons(filter_=["physics"])
html_output_path = os.path.join("data", "index.html")
html = net.generate_html()
with open(html_output_path, mode="w", encoding="utf-8-sig") as fp:
    fp.write(html)
net.show(html_output_path, notebook=False)

data\index.html


In [56]:
print(dfg_final.shape)
dfg_final.head()

(648, 5)


Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,"$984,943.15",Hartford,"0d1336884d5b4d1b9f43d4144b3e7e67,0d1336884d5b4...",chunk contextual proximity,7.0
1,"$984,943.15",Hartford Casualty,"0d1336884d5b4d1b9f43d4144b3e7e67,0d1336884d5b4...",chunk contextual proximity,2.0
2,"$984,943.15","January 17, 1995","0d1336884d5b4d1b9f43d4144b3e7e67,0d1336884d5b4...",chunk contextual proximity,2.0
3,"$984,943.15",Konami,"0d1336884d5b4d1b9f43d4144b3e7e67,0d1336884d5b4...",chunk contextual proximity,17.0
4,"$984,943.15",Land & Sky,"0d1336884d5b4d1b9f43d4144b3e7e67,0d1336884d5b4...",chunk contextual proximity,2.0


In [57]:
from src.embedding_models.models import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

In [58]:
embeddings = OpenAIEmbeddings()

In [59]:
pd.DataFrame(nodes)

Unnamed: 0,0
0,"$984,943.15"
1,935 F. Supp. at 1116
2,Advertising mode
3,"April 16, 1999"
4,"April 23, 1996"
5,Brochu
6,CGL policy
7,Complaint
8,Device Insertion
9,Du Page County


In [106]:
from openai import AsyncOpenAI
import asyncio
from tqdm.asyncio import tqdm_asyncio

class NodeEntity(BaseModel):
    """The original and resolved entity name from a list of nodes."""
    
    original_name: str = Field(
        ...,
        description="The original entity given by the user.",
    )
    resolved_name: str = Field(
        ...,
        description="The name of the entity such that duplications are resolved and issues with punctuation or capitalization are corrected.",
    )
    
    
class ResolvedEntities(BaseModel):
    """A list of entities."""
    
    entities: List[NodeEntity] = Field(..., default_factory=list)
    
    @property
    def to_pandas(self):
        entity_dict = {
            "original_name": [n.original_name for n in self.entities],
            "resolved_name": [n.resolved_name for n in self.entities],
        }
        return pd.DataFrame(entity_dict)
    
    
def system_message() -> str:
    return (
        "You are an expert entity resolution AI. "
        "Users will provide you with a list of Node names representing entities from a knowledge graph separated by new lines. "
        "Your task is to, for each entity, generate a NodeEntity such that duplicates are removed by determining their resolved name. "
        "Try to infer the base name that uniquely describes the entity as concisely as possible. "
        "For example, 'Liberty Mutual Group', 'Liberty Mutual Insurance Co', 'Liberty Mutual Company of Massachusets', 'Liberty Mutual Casualty Insurance' "
        "should all simply be 'Liberty Mutual'. "
        "Please also correct any punctuation or capitalization issues."
    )


async def resolve_entities(entities: List[str], model: str = 'gpt-4o') -> str:
    entity_list_string = "\n ".join(entities)
    client = instructor.from_openai(AsyncOpenAI())
    return await client.chat.completions.create(
        model=model,
        max_retries=5,
        messages=[
            {
                "role": "system",
                "content": system_message(),
            },
            {
                "role": "user",
                "content": f"Here is the list of entities to resolve:\n\n{entity_list_string}",
            },
        ],
        response_model=ResolvedEntities,
    )
  
  
async def process_batch(df_batch: pd.DataFrame, column_name: str, model: str) -> pd.DataFrame:
    entities = df_batch[column_name].tolist()
    resolved_entities = await resolve_entities(entities, model)
    resolved_df = resolved_entities.to_pandas
    resolved_df.index = df_batch.index
    return df_batch.join(resolved_df.set_index("original_name"), on=column_name)


async def process_dataframe(df: pd.DataFrame, column_name: str, model: str, batch_size: int = 50) -> pd.DataFrame:
    tasks = []
    for start in range(0, len(df), batch_size):
        df_batch = df.iloc[start:start + batch_size]
        tasks.append(process_batch(df_batch, column_name, model))
    
    results = []
    for f in tqdm_asyncio.as_completed(tasks, desc="Processing batches"):
        result = await f
        results.append(result)
    
    return pd.concat(results, ignore_index=True)

# Example usage
# import nest_asyncio
# nest_asyncio.apply()

# column_name = "node"
# model = "gpt-4o"

# resolved_df = asyncio.run(process_dataframe(df, column_name, model))

In [76]:
resolved_entity_nodes = resolve_entities(nodes.tolist())

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [97]:
test_df = pd.DataFrame({'node': nodes})
test_df.head()

Unnamed: 0,node
0,"$984,943.15"
1,935 F. Supp. at 1116
2,Advertising mode
3,"April 16, 1999"
4,"April 23, 1996"


In [110]:
import nest_asyncio
nest_asyncio.apply()

column_name = "node"
model = "gpt-4o"

resolved_df = asyncio.run(process_dataframe(test_df, column_name, model))

Processing batches:   0%|          | 0/2 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Processing batches:  50%|█████     | 1/2 [00:03<00:03,  3.47s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Processing batches: 100%|██████████| 2/2 [00:12<00:00,  6.41s/it]


In [111]:
resolved_df

Unnamed: 0,node,resolved_name
0,"making, using, or selling of a patented invention","Making, Using, or Selling of a Patented Invention"
1,opinion of the court,Opinion of the Court
2,patent infringement,Patent Infringement
3,patent infringement lawsuit,Patent Infringement
4,piracy,Piracy
5,sale of a patented component,Sale of a Patented Component
6,section 155,Section 155
7,summary judgment,Summary Judgment
8,trial,Trial
9,"$984,943.15",984943.15


In [5]:
docs = [Document(text=t) for t in example_text_list]

In [6]:
graph_maker = GraphMaker(ontology=example_ontology, verbose=True)

In [7]:
graph = graph_maker.from_documents(
    docs[:3], 
    delay_s_between=1,
    ) 

[92m[39m
[92m▶︎ GRAPH MAKER LOG - 2024-05-10 16:39:48 - INFO [39m
[92mDocument: 1[39m
[92m[39m
[34m[39m
[34m▶︎ GRAPH MAKER VERBOSE - 2024-05-10 16:39:48 - INFO [39m
[34mUsing Ontology:
labels=[{'Person': 'Person name without any adjectives, Remember a person may be referenced by their name or using a pronoun'}, {'Object': "Do not add the definite article 'the' in the object name"}, {'Event': 'Event event involving multiple people. Do not include qualifiers or verbs like gives, leaves, works etc.'}, 'Place', 'Document', 'Organization', 'Action', {'Miscellaneous': 'Any important concept can not be categorized with any other given label'}] relationships=['Relation between any pair of Entities'][39m
[34m[39m
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[34m[39m
[34m▶︎ GRAPH MAKER VERBOSE - 2024-05-10 16:40:22 - INFO [39m
[34mLLM Response:
[
   {
       "node_1": {"label": "Person", "name": "Bilbo Baggins"},
       "node_2":

In [9]:
graph[0]

[Edge(node_1=Node(label='Person', name='Bilbo Baggins'), node_2=Node(label='Event', name='birthday'), relationship='Bilbo Baggins celebrates his birthday.', metadata=None, order=0),
 Edge(node_1=Node(label='Person', name='Bilbo Baggins'), node_2=Node(label='Person', name='Frodo'), relationship='Bilbo Baggins leaves the Ring to Frodo, who is his heir.', metadata=None, order=0),
 Edge(node_1=Node(label='Person', name='Gandalf'), node_2=Node(label='Object', name='Ring'), relationship='Gandalf suspects and later confirms that the Ring is a Ring of Power.', metadata=None, order=0),
 Edge(node_1=Node(label='Person', name='Ring'), node_2=Node(label='Person', name='Dark Lord Sauron'), relationship='The Ring was lost by Dark Lord Sauron.', metadata=None, order=0),
 Edge(node_1=Node(label='Person', name='Gandalf'), node_2=Node(label='Person', name='Frodo'), relationship='Gandalf counsels Frodo to take the Ring away from the Shire and promises to return.', metadata=None, order=0),
 Edge(node_1=No

___

# instructor KG with iterative updates

In [85]:
from pydantic import BaseModel, Field
from typing import List


class Node(BaseModel):
    id: int
    label: str
    color: str
    
    def __hash__(self) -> int:
        return hash((id, self.label))

class Edge(BaseModel):
    source: int
    target: int
    label: str
    color: str = "black"
    
    def __hash__(self) -> int:
        return hash((self.source, self.target, self.label))

class KnowledgeGraph(BaseModel):
    nodes: List[Node] = Field(..., default_factory=list)
    edges: List[Edge] = Field(..., default_factory=list)

In [89]:
from openai import OpenAI
import instructor

client = instructor.from_openai(OpenAI())

def generate_graph(input) -> KnowledgeGraph:
    return client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": f"Help me understand the following Legal Service by describing it as a detailed knowledge graph: {input}",
            }
        ],
        response_model=KnowledgeGraph,
    )

In [88]:
Markdown(services_df['full_description'].tolist()[0])

Employment Disputes: Handles legal issues concerning current or former employees pursuing legal action against the company. Cases include wrongful termination, payment-related disputes, or contractual matters. Does not handle discrimination accusations.

In [90]:
graph_0 = generate_graph(input=services_df['full_description'].tolist()[0])

In [91]:
from graphviz import Digraph

def visualize_knowledge_graph(kg: KnowledgeGraph):
    dot = Digraph(comment="Knowledge Graph")

    # Add nodes
    for node in kg.nodes:
        dot.node(str(node.id), node.label, color=node.color)

    # Add edges
    for edge in kg.edges:
        dot.edge(str(edge.source), str(edge.target), label=edge.label, color=edge.color)

    # Render the graph
    dot.render("knowledge_graph.gv", view=True)

In [92]:
visualize_knowledge_graph(graph_0)

In [93]:
from graphviz import Digraph
from typing import Optional
from pydantic import BaseModel, Field
from typing import List

class Node(BaseModel):
    id: int
    label: str
    color: str
    
    def __hash__(self) -> int:
        return hash((id, self.label))

class Edge(BaseModel):
    source: int
    target: int
    label: str
    color: str = "black"
    
    def __hash__(self) -> int:
        return hash((self.source, self.target, self.label))


class KnowledgeGraph(BaseModel):
    nodes: Optional[List[Node]] = Field(..., default_factory=list)
    edges: Optional[List[Edge]] = Field(..., default_factory=list)

    def update(self, other: "KnowledgeGraph") -> "KnowledgeGraph":
        """Updates the current graph with the other graph, deduplicating nodes and edges."""
        return KnowledgeGraph(
            nodes=list(set(self.nodes + other.nodes)),
            edges=list(set(self.edges + other.edges)),
        )

    def draw(self, prefix: str = None):
        dot = Digraph(comment="Knowledge Graph")

        for node in self.nodes:  
            dot.node(str(node.id), node.label, color=node.color)

        for edge in self.edges:  
            dot.edge(
                str(edge.source), str(edge.target), label=edge.label, color=edge.color
            )
        dot.render(prefix, format="png", view=True)

In [94]:
from openai import OpenAI
import instructor

client = instructor.from_openai(OpenAI())

def generate_graph(input: List[str]) -> KnowledgeGraph:
    cur_state = KnowledgeGraph()  
    num_iterations = len(input)
    for i, inp in enumerate(input):
        new_updates = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": """You are an iterative knowledge graph builder for a Legal Services front desk.
                    You are given the current state of the graph, and you must append the nodes and edges for a new Service 
                    to it. Do not provide any duplicates and try to reuse nodes as much as possible.""",
                },
                {
                    "role": "user",
                    "content": f"""Extract any new nodes and edges from the following:
                    # Part {i}/{num_iterations} of the input:

                    {inp}""",
                },
                {
                    "role": "user",
                    "content": f"""Here is the current state of the graph:
                    {cur_state.model_dump_json(indent=2)}""",
                },  
            ],
            response_model=KnowledgeGraph,
        )  # type: ignore

        # Update the current state
        cur_state = cur_state.update(new_updates)  
        cur_state.draw(prefix=f"iteration_{i}")
    return cur_state

In [95]:
sample_texts = services_df['full_description'].tolist()[:9]

In [96]:
graph: KnowledgeGraph = generate_graph(sample_texts)
graph.draw(prefix="final")