In [22]:
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
from IPython.display import Markdown, display, HTML

import uuid
from tqdm import tqdm
import instructor
import openai
from langchain_community.document_loaders import DataFrameLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

load_dotenv()

True

In [2]:
os.chdir(os.path.dirname(os.getcwd()))

In [3]:
df = pd.read_parquet('citation_data_with_context.parquet')
df = df.drop_duplicates(subset=['body'], keep='first')

In [38]:
sample_test = df.sample(1)
text_column = 'body'

splitter = RecursiveCharacterTextSplitter(
    chunk_size=6000,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)

def dataframe2Documents(df: pd.DataFrame, text_column: str):
    loader = DataFrameLoader(df, page_content_column=text_column)
    return loader.load()

In [39]:
documents = dataframe2Documents(sample_test, text_column)
docs = splitter.split_documents(documents)
len(docs)

8

In [40]:
def extract_elements_from_chunks(docs):
    client = openai.OpenAI()
    elements = []
    for index, doc in enumerate(docs):
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "Extract entities and relationships from the following text."},
                {"role": "user", "content": doc.page_content}
            ]
        )
        print(f"Chunk index {index} of {len(docs)}:")
        entities_and_relations = response.choices[0].message.content
        elements.append(entities_and_relations)
    return elements

In [41]:
elements = extract_elements_from_chunks(docs)

Chunk index 0 of 8:
Chunk index 1 of 8:
Chunk index 2 of 8:
Chunk index 3 of 8:
Chunk index 4 of 8:
Chunk index 5 of 8:
Chunk index 6 of 8:
Chunk index 7 of 8:


In [42]:
print(elements[0])

Entities:
1. Justice Hartman
2. Illinois National Insurance Company (Illinois National)
3. RLI Insurance Company (RLI)
4. Michael Schneider
5. Haul-away, Inc. (Haulaway)
6. Hyang W Yoo
7. C. Groot Automatic Disposal Company, Inc. (Groot Automatic Disposal)
8. Groot Industries, Inc. (Groot Industries)
9. American International Adjustment Company, Inc.
10. Marvin L. Donaldson (litigation specialist)
11. Rick Dikeman (RLI claims examiner)

Relationships:
1. Justice Hartman - Delivered the opinion of the court
2. Illinois National Insurance Company - Defendant 
3. RLI Insurance Company - Plaintiff
4. Michael Schneider - Employee of Haul-away, Inc.; involved in an accident
5. Haul-away, Inc. - Related to Groot Industries; Michael Schneider's employer
6. Hyang W Yoo - Driver involved in the accident with Michael Schneider
7. C. Groot Automatic Disposal Company, Inc. - Owned the garbage truck involved in the accident
8. Groot Industries, Inc. - Parent company of Haulaway and Groot Automatic D

In [43]:
def summarize_elements(elements):
    client = openai.OpenAI()
    summaries = []
    for _, element in enumerate(elements):
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "Summarize the following entities and relationships in a structured format. Use \"->\" to represent relationships, after the \"Relationships:\" word."},
                {"role": "user", "content": element}
            ]
        )
        print("Element summary:", response.choices[0].message.content)
        summary = response.choices[0].message.content
        summaries.append(summary)
    return summaries

In [44]:
element_summaries = summarize_elements(elements)

Element summary: Entities:
1. Justice Hartman
2. Illinois National Insurance Company (Illinois National)
3. RLI Insurance Company (RLI)
4. Michael Schneider
5. Haul-away, Inc. (Haulaway)
6. Hyang W Yoo
7. C. Groot Automatic Disposal Company, Inc. (Groot Automatic Disposal)
8. Groot Industries, Inc. (Groot Industries)
9. American International Adjustment Company, Inc.
10. Marvin L. Donaldson (litigation specialist)
11. Rick Dikeman (RLI claims examiner)

Relationships:
1. Justice Hartman -> Delivered the opinion of the court
2. Illinois National Insurance Company -> Defendant
3. RLI Insurance Company -> Plaintiff
4. Michael Schneider -> Employee of Haul-away, Inc.; involved in an accident
5. Haul-away, Inc. -> Related to Groot Industries; Michael Schneider's employer
6. Hyang W Yoo -> Driver involved in the accident with Michael Schneider
7. C. Groot Automatic Disposal Company, Inc. -> Owned the garbage truck involved in the accident
8. Groot Industries, Inc. -> Parent company of Haulaw

In [46]:
import networkx as nx

# 4. Element Summaries → Graph Communities
def build_graph_from_summaries(summaries):
    G = nx.Graph()
    for index, summary in enumerate(summaries):
        print(f"Summary index {index} of {len(summaries)}:")
        lines = summary.split("\n")
        entities_section = False
        relationships_section = False
        entities = []
        for line in lines:
            if line.startswith("### Entities:") or line.startswith("**Entities:**"):
                entities_section = True
                relationships_section = False
                continue
            elif line.startswith("### Relationships:") or line.startswith("**Relationships:**"):
                entities_section = False
                relationships_section = True
                continue
            if entities_section and line.strip():
                if line[0].isdigit() and line[1] == ".":
                    line = line.split(".", 1)[1].strip()
                entity = line.strip()
                entity = entity.replace("**", "")
                entities.append(entity)
                G.add_node(entity)
            elif relationships_section and line.strip():
                parts = line.split("->")
                if len(parts) >= 2:
                    source = parts[0].strip()
                    target = parts[-1].strip()
                    relation = " -> ".join(parts[1:-1]).strip()
                    G.add_edge(source, target, label=relation)
    return G

In [47]:
graph = build_graph_from_summaries(element_summaries)

Summary index 0 of 8:
Summary index 1 of 8:
Summary index 2 of 8:
Summary index 3 of 8:
Summary index 4 of 8:
Summary index 5 of 8:
Summary index 6 of 8:
Summary index 7 of 8:


In [48]:
from cdlib import algorithms

def detect_communities(graph):
    communities = []
    index = 0
    for component in nx.connected_components(graph):
        print(
            f"Component index {index} of {len(list(nx.connected_components(graph)))}:")
        subgraph = graph.subgraph(component)
        if len(subgraph.nodes) > 1:  # Leiden algorithm requires at least 2 nodes
            try:
                sub_communities = algorithms.leiden(subgraph)
                for community in sub_communities.communities:
                    communities.append(list(community))
            except Exception as e:
                print(f"Error processing community {index}: {e}")
        else:
            communities.append(list(subgraph.nodes))
        index += 1
    print("Communities from detect_communities:", communities)
    return communities

In [49]:
communities = detect_communities(graph)

Component index 0 of 102:
Component index 1 of 102:
Component index 2 of 102:
Component index 3 of 102:
Component index 4 of 102:
Component index 5 of 102:
Component index 6 of 102:
Component index 7 of 102:
Component index 8 of 102:
Component index 9 of 102:
Component index 10 of 102:
Component index 11 of 102:
Error processing community 11: invalid literal for int() with base 10: '(Susan Warnke - AIG Claim Services, Inc. / AIG): Susan Warnke is an AIG claims representative.'
Component index 12 of 102:
Error processing community 12: invalid literal for int() with base 10: '2. Responsibility Assignment'
Component index 13 of 102:
Error processing community 13: invalid literal for int() with base 10: '3. Representation'
Component index 14 of 102:
Error processing community 14: invalid literal for int() with base 10: '4. Supervision'
Component index 15 of 102:
Error processing community 15: invalid literal for int() with base 10: '5. Interaction and Correspondence'
Component index 16 of 

In [50]:
len(communities)

65

In [51]:
def summarize_communities(communities, graph):
    client = openai.OpenAI()
    community_summaries = []
    for index, community in enumerate(communities):
        print(f"Summarize Community index {index} of {len(communities)}:")
        subgraph = graph.subgraph(community)
        nodes = list(subgraph.nodes)
        edges = list(subgraph.edges(data=True))
        description = "Entities: " + ", ".join(nodes) + "\nRelationships: "
        relationships = []
        for edge in edges:
            relationships.append(
                f"{edge[0]} -> {edge[2]['label']} -> {edge[1]}")
        description += ", ".join(relationships)

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "Summarize the following community of entities and relationships."},
                {"role": "user", "content": description}
            ]
        )
        summary = response.choices[0].message.content.strip()
        community_summaries.append(summary)
    return community_summaries

In [52]:
summarized_communities = summarize_communities(communities, graph)

Summarize Community index 0 of 65:
Summarize Community index 1 of 65:
Summarize Community index 2 of 65:
Summarize Community index 3 of 65:
Summarize Community index 4 of 65:
Summarize Community index 5 of 65:
Summarize Community index 6 of 65:
Summarize Community index 7 of 65:
Summarize Community index 8 of 65:
Summarize Community index 9 of 65:
Summarize Community index 10 of 65:
Summarize Community index 11 of 65:
Summarize Community index 12 of 65:
Summarize Community index 13 of 65:
Summarize Community index 14 of 65:
Summarize Community index 15 of 65:
Summarize Community index 16 of 65:
Summarize Community index 17 of 65:
Summarize Community index 18 of 65:
Summarize Community index 19 of 65:
Summarize Community index 20 of 65:
Summarize Community index 21 of 65:
Summarize Community index 22 of 65:
Summarize Community index 23 of 65:
Summarize Community index 24 of 65:
Summarize Community index 25 of 65:
Summarize Community index 26 of 65:
Summarize Community index 27 of 65:
Su

In [55]:
summarized_communities[3]

'The community involves Schneider, who is the subject of a lawsuit that encompasses various claims. This indicates a legal context where multiple allegations or issues have been raised against Schneider. The primary relationship revolves around Schneider being the central figure in the litigation process with these multiple claims.'

In [None]:
def generate_answers_from_communities(community_summaries, query):
    client = openai.OpenAI()
    intermediate_answers = []
    for index, summary in enumerate(community_summaries):
        print(f"Summary index {index} of {len(community_summaries)}:")
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "Answer the following query based on the provided summary."},
                {"role": "user", "content": f"Query: {query} Summary: {summary}"}
            ]
        )
        print("Intermediate answer:", response.choices[0].message.content)
        intermediate_answers.append(
            response.choices[0].message.content)

    final_response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system",
                "content": "Combine these answers into a final, concise response."},
            {"role": "user", "content": f"Intermediate answers: {intermediate_answers}"}
        ]
    )
    final_answer = final_response.choices[0].message.content
    return final_answer

In [6]:
from typing import List
from openai import OpenAI
import instructor
from pydantic import BaseModel, Field

client = instructor.from_openai(OpenAI())


def user_message(theme: str, text: str) -> str:
    return f"While focusing on the theme **{theme}**, generate an ontology for the following input text: ```\n{text}\n```"

def system_message() -> str:
    return (
        "You are an expert at creating an ontology for a given theme or topic. "
        "Users will provide you with a **theme** and an input text delimited by ```. "
        "Extract all the entity types from the input text relevant to the **theme**. "
        "The goal is to create an ontology to use for downstream knowledge graph construction for the **theme**."
    )


def generate_ontology(theme: str, text: str, model: str) -> str:
    return client.chat.completions.create(
        model=model,
        max_retries=3,
        messages=[
            {
                "role": "system",
                "content": system_message(),
            },
            {
                "role": "user",
                "content": user_message(theme=theme, text=text),
            },
        ],
        response_model=Ontology,
    )
