In [None]:
# Install the notebook's third-party dependencies.
# NOTE(review): no versions are pinned on this line, while later cells re-install
# openai==1.6.1 and pydantic==1.10.13 — confirm the finally resolved versions are
# the intended ones (a kernel restart may be needed after those re-installs).
!pip install langchain neo4j openai wikipedia tiktoken langchain_openai

In [None]:
from langchain.graphs import Neo4jGraph

import os
from getpass import getpass

# Connection settings are read from the environment so that no credential lives in
# the notebook. The password that used to be hardcoded here was exposed in plain
# text and should be rotated.
url = os.environ.get("NEO4J_URI", "neo4j+s://815e74eb.databases.neo4j.io")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
# Fall back to an interactive prompt when NEO4J_PASSWORD is not set.
password = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")
graph = Neo4jGraph(
    url=url,
    username=username,
    password=password,
)

In [None]:
from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.schema import Document
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel

# Key/value pair used as the element type of the `properties` lists declared on
# Node and Relationship below; props_to_dict() later folds these into a dict.
# NOTE(review): the docstring and Field descriptions presumably become part of the
# schema handed to the LLM (KnowledgeGraph is passed to
# create_structured_output_chain) — treat them as prompt text; confirm before rewording.
class Property(BaseModel):
  """A single property consisting of key and value"""
  key: str = Field(..., description="key")
  value: str = Field(..., description="value")

# Extraction-side node model: subclasses the base graph Node and redeclares
# `properties` as an optional list of Property items. map_to_base_node() converts
# instances of this class back into BaseNode objects with a dict of properties.
# NOTE(review): a docstring added here would presumably show up in the schema sent
# to the LLM, so documentation is kept in comments — confirm.
class Node(BaseNode):
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")

# Extraction-side relationship model: subclasses the base graph Relationship and
# redeclares `properties` as an optional list of Property items.
# map_to_base_relationship() converts instances back into BaseRelationship objects.
# NOTE(review): a docstring added here would presumably show up in the schema sent
# to the LLM, so documentation is kept in comments — confirm.
class Relationship(BaseRelationship):
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )

# Top-level structured-output model: get_extraction_chain() passes this class to
# create_structured_output_chain, and extract_and_store_graph() reads `.nodes` and
# `.rels` from the parsed result.
# NOTE(review): the docstring and Field descriptions presumably act as the function
# description shown to the LLM — treat them as prompt text; confirm before rewording.
class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )

In [None]:
def format_property_key(s: str) -> str:
    """Normalise a property key to camelCase.

    Whitespace-separated words are joined: the first word is lowercased and every
    following word is capitalised (``"birth date"`` -> ``"birthDate"``).

    A key made of a single token keeps its internal capitals, so a key that is
    already camelCase or PascalCase survives (``"birthDate"`` and ``"BirthDate"``
    both give ``"birthDate"``). Previously the whole token was lowercased, which
    turned the camelCase keys requested by the extraction prompt into
    ``"birthdate"``.

    Args:
        s: Raw property key.

    Returns:
        The camelCase key, or ``s`` unchanged when it contains no words
        (empty or whitespace-only).
    """
    words = s.split()
    if not words:
        return s
    if len(words) == 1:
        word = words[0]
        # Uniform-case tokens ("NAME", "name") carry no word boundaries to keep.
        if word.isupper() or word.islower():
            return word.lower()
        # Mixed case: only the leading character needs to be lowered.
        return word[0].lower() + word[1:]
    first_word = words[0].lower()
    capitalized_words = [word.capitalize() for word in words[1:]]
    return "".join([first_word] + capitalized_words)

def props_to_dict(props) -> dict:
    """Build a plain dict from a sequence of key/value property objects.

    Each item's ``key`` is passed through ``format_property_key`` and mapped to the
    item's ``value``; when two items produce the same key the later one wins.
    A falsy ``props`` (``None``, empty list, ...) gives a new empty dict.
    """
    return {
        format_property_key(item.key): item.value
        for item in (props or [])
    }

def map_to_base_node(node: Node) -> BaseNode:
    """Convert an extracted Node into a BaseNode.

    The id is title-cased and the type is passed through ``str.capitalize()``.
    The title-cased id is also stored under the ``name`` property (overwriting any
    extracted ``name`` property) for better Cypher statement generation.
    """
    # props_to_dict already returns {} for missing/empty properties.
    attributes = props_to_dict(node.properties)
    display_name = node.id.title()
    attributes["name"] = display_name
    label = node.type.capitalize()
    return BaseNode(id=display_name, type=label, properties=attributes)


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Convert an extracted Relationship into a BaseRelationship.

    Both endpoints go through ``map_to_base_node``; the relationship type is kept
    as extracted and the property list is flattened with ``props_to_dict``.
    """
    return BaseRelationship(
        source=map_to_base_node(rel.source),
        target=map_to_base_node(rel.target),
        type=rel.type,
        # props_to_dict already returns {} for missing/empty properties.
        properties=props_to_dict(rel.properties),
    )

In [None]:
# Pin the OpenAI client to 1.6.1 and print the installed version.
# NOTE(review): the first cell already installed an unpinned `openai`; this re-install
# happens after that, so a kernel restart may be needed for the pin to take
# effect — confirm, or move the pin into the first install cell.
!pip install openai==1.6.1
!pip show openai

In [None]:
# Pin pydantic to 1.10.13 and print the installed version.
# NOTE(review): earlier cells have already imported langchain.pydantic_v1 and defined
# models with it, so this pin presumably only applies after a kernel restart —
# confirm, or move the pin into the first install cell.
!pip install pydantic==1.10.13
!pip show pydantic

In [None]:
import os
from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_structured_output_chain,
)
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# The API key is taken from the environment (or prompted for) instead of being
# embedded in the notebook. The key that used to be hardcoded here was exposed in
# plain text and should be revoked.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
# Shared chat model used by get_extraction_chain().
llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)

def get_extraction_chain(
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None
    ):
    """Build the structured-output chain that extracts a KnowledgeGraph from text.

    Args:
        allowed_nodes: Optional node labels; when given they are listed in the
            system prompt, otherwise that prompt line is left empty.
        allowed_rels: Optional relationship types, handled the same way.

    Returns:
        The chain created by ``create_structured_output_chain`` for the
        ``KnowledgeGraph`` model, the module-level ``llm`` and the prompt built here.

    NOTE(review): the labels are interpolated into the prompt template text, so a
    label containing ``{`` or ``}`` would presumably be read as a template
    variable — confirm if such labels are ever used. The prompt title mentions
    GPT-4 while the module-level ``llm`` is configured with gpt-3.5-turbo-16k.
    """
    # Optional constraint lines; each collapses to an empty line when unset.
    node_label_rule = (
        '- **Allowed Node Labels:**' + ", ".join(allowed_nodes) if allowed_nodes else ""
    )
    rel_type_rule = (
        '- **Allowed Relationship Types**:' + ", ".join(allowed_rels) if allowed_rels else ""
    )
    system_message = f"""# Knowledge Graph Instructions for GPT-4
## 1. Overview
You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
- **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
## 2. Labeling Nodes
- **Consistency**: Ensure you use basic or elementary types for node labels.
  - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
{node_label_rule}
{rel_type_rule}
## 3. Handling Numerical Data and Dates
- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
- **Property Format**: Properties must be in a key-value format.
- **Quotation Marks**: Never use escaped single or double quotes within property values.
- **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
## 4. Coreference Resolution
- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
## 5. Strict Compliance
Adhere to the rules strictly. Non-compliance will result in termination.
          """
    messages = [
        ("system", system_message),
        ("human", "Use the given format to extract information from the following input: {input}"),
        ("human", "Tip: Make sure to answer in the correct format"),
    ]
    prompt = ChatPromptTemplate.from_messages(messages)
    return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)

In [None]:
def extract_and_store_graph(
    document: Document,
    nodes: Optional[List[str]] = None,
    rels: Optional[List[str]] = None,
) -> None:
    """Extract a knowledge graph from one document chunk and write it to Neo4j.

    Args:
        document: Chunk whose ``page_content`` is sent to the extraction chain; it
            is also recorded as the ``source`` of the stored graph document.
        nodes: Optional allowed node labels forwarded to ``get_extraction_chain``.
        rels: Optional allowed relationship types forwarded the same way.
    """
    # Extract graph data using OpenAI functions
    chain = get_extraction_chain(nodes, rels)
    response = chain.invoke(document.page_content)
    extracted = response['function']

    # Convert the extraction models into the base graph types
    base_nodes = [map_to_base_node(item) for item in extracted.nodes]
    base_relationships = [map_to_base_relationship(item) for item in extracted.rels]
    graph_document = GraphDocument(
        nodes=base_nodes,
        relationships=base_relationships,
        source=document,
    )

    # Store information into the module-level graph
    graph.add_graph_documents([graph_document])

In [None]:
import pandas as pd

aila_directory = "/kaggle/input/aila-dataset/AILA_2019_Dataset"

# Space-separated, header-less file; its four columns are named below and the
# "Q0" column is dropped.
query_document_relevance_pairings = pd.read_csv(
    f"{aila_directory}/relevance_judgments_priorcases.txt",
    delimiter=" ",
    header=None,
)
query_document_relevance_pairings.columns = ["Query_Name", "Q0", "Document_Name", "Relevance"]
query_document_relevance_pairings = query_document_relevance_pairings.drop(columns=["Q0"])
# query_document_relevance_pairings.head()

# Unique names of the documents with a Relevance of 1 for at least one query.
is_relevant = query_document_relevance_pairings["Relevance"] == 1
relevant_docs = list(
    set(query_document_relevance_pairings.loc[is_relevant, "Document_Name"])
)
print(sorted(relevant_docs))
print(len(relevant_docs))

In [None]:
from collections import namedtuple
from langchain.text_splitter import TokenTextSplitter


In [None]:
from tqdm import tqdm
import csv

In [None]:
# Node labels offered to the model: this list is passed to extract_and_store_graph()
# and joined into the "Allowed Node Labels" line of the system prompt.
# NOTE(review): map_to_base_node() stores `node.type.capitalize()`, so a label such
# as "High Court" or "FIR" returned verbatim would be written as "High court" /
# "Fir" — confirm that is intended. Several entries overlap (e.g. "Court"/"Courts",
# "District Court"/"District Courts", "Judgement"/"Court Judgements").
allowed_nodes = [
    "Accused",
    "Acts",
    "Advisory Jurisdiction",
    "Apex Court",
    "Appeal",
    "Appellant",
    "Appellant Jurisdiction",
    "Author",
    "Bench",
    "Case",
    "Case Type",
    "Chief Metropolitan Court",
    "City Civil Courts",
    "Civil",
    "Civil Courts",
    "Country",
    "Court Decision",
    "Court",
    "Court Judgements",
    "Courts",
    "Courts of Smaller Causes",
    "Criminal",
    "Criminal Courts",
    "District",
    "District Court",
    "District Courts",
    "Division Bench",
    "Document",
    "Evidence",
    "FIR",
    "Government",
    "Group",
    "High Court",
    "Individual",
    "Investigator",
    "Judge",
    "Judgement",
    "Judicial Magistrate Court (First Class)",
    "Judicial Magistrate Court (Second Class)",
    "Jurisdiction",
    "Larger Bench",
    "Legal Participants",
    "Location",
    "Metropolitan Court",
    "Metropolitan Magistrate Courts",
    "Munsif Court",
    "Non-Legal Participants",
    "Order",
    "Organization",
    "Original Jurisdiction",
    "Others",
    "Participants",
    "Petition",
    "Petitioner",
    "Place",
    "Plaintiff",
    "Precedent Case",
    "Principal Junior Civil Court",
    "Respondent",
    "Review Jurisdiction",
    "Sessions Court",
    "Single Judge",
    "Solicitor",
    "Special Bench",
    "State",
    "Sub Court",
    "Taluka",
    "Tribunal Bench",
    "Tribunals",
    "Witness",
    "Writ Jurisdiction"
]

# Relationship types offered to the model: this list is passed to
# extract_and_store_graph() and joined into the "Allowed Relationship Types" line of
# the system prompt. map_to_base_relationship() stores `rel.type` unchanged.
allowed_relations = [
   "caseBelongsToType",
   "documentType",
   "hasActs",
   "hasAdvisoryJurisdiction",
   "hasAppellantJurisdiction",
   "hasAuthor",
   "hasBench",
   "hasCourtDecision",
   "hasCourts",
   "hasEvidenceLocation",
   "hasEvidences",
   "hasIndividuals",
   "hasLegalParticipants",
   "hasLocation",
   "hasNonLegalParticipants",
   "hasOriginalJurisdiction",
   "hasParticipantType",
   "hasPrecedentCase",
   "hasReviewJurisdiction",
   "hasWritJurisdiction",
   "isA",
]



In [None]:
input_dir = aila_directory
output_dir = "/kaggle/working/"

# Lightweight record exposing the `page_content` and `metadata` attributes handed to
# the splitter. It and the splitter are built once: neither depends on the loop
# variable, yet both used to be re-created for every case document.
document_instance = namedtuple('Document', ['page_content', 'metadata'])
text_splitter = TokenTextSplitter(chunk_size=1024, chunk_overlap=24)

# WARNING: destructive — removes every node and relationship from the connected
# database so that each exported CSV holds the graph of a single case document.
graph.query("MATCH (n) DETACH DELETE n")

for j in range(1, 151):

    # Path of case document number j inside the dataset folder
    filename1 = input_dir + "/Object_casedocs/C" + str(j) + ".txt"

    with open(filename1, 'r', encoding='utf-8') as file:
        document_instances = [
            document_instance(page_content=file.read(), metadata={'filename': filename1})
        ]

    # Split the selected document into chunks
    documents = text_splitter.split_documents(document_instances)

    # One LLM extraction + graph write per chunk
    for d in tqdm(documents, total=len(documents)):
        extract_and_store_graph(d, allowed_nodes, allowed_relations)

    # Export every relationship with both endpoints; nodes that take part in no
    # relationship are not matched by this pattern and so are not exported.
    graph_result = graph.query("""
    MATCH (a)-[r]->(b)
    RETURN labels(a) AS source_labels, a, type(r) AS relationship_type, properties(r) AS relationship_properties, labels(b) AS target_labels, b
    """)

    output_file = output_dir + "KG_C" + str(j) + ".csv"

    # Written as UTF-8 explicitly, matching how the case text is read; relying on the
    # platform default encoding could fail on non-ASCII characters.
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(["n_type", "n", "r_type", "r", "m_type", "m"])
        for record in graph_result:
            csv_writer.writerow([
                record["source_labels"],
                record["a"],
                record["relationship_type"],
                record["relationship_properties"],
                record["target_labels"],
                record["b"],
            ])

    # Delete the graph before processing the next case document
    graph.query("MATCH (n) DETACH DELETE n")
            


In [None]:
# from tqdm import tqdm

# for i, d in tqdm(enumerate(documents), total=len(documents)):
#     extract_and_store_graph(d)
# #     time.sleep(1)

In [None]:
# import os
# import time
# from langchain.chains.openai_functions import (
#     create_openai_fn_chain,
#     create_structured_output_chain,
# )
# from langchain_openai import ChatOpenAI
# from langchain.prompts import ChatPromptTemplate

# # OPENAI_API_KEY must be provided through the environment; the key previously pasted here was redacted and should be revoked.
# llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)

# # Define rate limits
# tokens_per_minute_limit = 40000
# requests_per_minute_limit = 3
# requests_per_day_limit = 200

# tokens_used = 0
# requests_made_today = 0
# last_minute_timestamp = None

# def get_extraction_chain(
#     allowed_nodes: Optional[List[str]] = None,
#     allowed_rels: Optional[List[str]] = None
#     ):
#     prompt = ChatPromptTemplate.from_messages(
#         [(
#           "system",
#           f"""# Knowledge Graph Instructions for GPT-4
# ## 1. Overview
# You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
# - **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
# - The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
# ## 2. Labeling Nodes
# - **Consistency**: Ensure you use basic or elementary types for node labels.
#   - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
# - **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
# {'- **Allowed Node Labels:**' + ", ".join(allowed_nodes) if allowed_nodes else ""}
# {'- **Allowed Relationship Types**:' + ", ".join(allowed_rels) if allowed_rels else ""}
# ## 3. Handling Numerical Data and Dates
# - Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
# - **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
# - **Property Format**: Properties must be in a key-value format.
# - **Quotation Marks**: Never use escaped single or double quotes within property values.
# - **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
# ## 4. Coreference Resolution
# - **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
# If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
# always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
# Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
# ## 5. Strict Compliance
# Adhere to the rules strictly. Non-compliance will result in termination.
#           """),
#             ("human", "Use the given format to extract information from the following input: {input}"),
#             ("human", "Tip: Make sure to answer in the correct format"),
#         ])
#     return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)

# def extract_and_store_graph(document, nodes=None, rels=None):
#     global tokens_used, requests_made_today, last_minute_timestamp

#     # Check tokens used in the last minute
#     current_minute = int(time.time() / 60)
#     if last_minute_timestamp != current_minute:
#         tokens_used = 0
#         last_minute_timestamp = current_minute

#     if tokens_used >= tokens_per_minute_limit:
#         print("Exceeded tokens per minute limit. Waiting...")
#         time.sleep(60 - (time.time() % 60))
#         tokens_used = 0

#     # Check requests made in the last minute
#     if requests_made_today >= requests_per_day_limit:
#         print("Exceeded requests per day limit. Exiting...")
#         return None

#     # Make the API request
#     extract_chain = get_extraction_chain(nodes, rels)
#     data = extract_chain.invoke(document.page_content)['function']

#     # Update usage statistics
#     tokens_used += data.tokens_used
#     requests_made_today += 1

#     # Construct a graph document and store information into a graph
#     graph_document = GraphDocument(
#         nodes=[map_to_base_node(node) for node in data.nodes],
#         relationships=[map_to_base_relationship(rel) for rel in data.rels],
#         source=document
#     )
#     graph.add_graph_documents([graph_document])

# from collections import namedtuple
# from langchain.text_splitter import TokenTextSplitter

# # Specify the path to your dataset folder
# dataset_folder = "/kaggle/input/aila-casedocs/Object_casedocs"

# # Read only one text file from the folder
# selected_document = None
# for filename in os.listdir(dataset_folder):
#     if filename.endswith(".txt"):
#         file_path = os.path.join(dataset_folder, filename)
#         with open(file_path, 'r', encoding='utf-8') as file:
#             selected_document = file.read()
#             break  # Stop after reading the first text file

# # Define a named tuple for your document
# Document = namedtuple('Document', ['page_content', 'metadata'])

# # Create an instance of the Document named tuple
# document_instance = Document(page_content=selected_document, metadata={'filename': filename})

# # Define chunking strategy
# text_splitter = TokenTextSplitter(chunk_size=1024, chunk_overlap=24)

# # Split the selected document into chunks
# documents = text_splitter.split_documents([document_instance])



# from tqdm import tqdm

# for i, d in tqdm(enumerate(documents), total=len(documents)):
#     extract_and_store_graph(d)
#     time.sleep(1)

# # Print usage statistics
# print(f"Tokens used: {tokens_used}")
# print(f"Requests made today: {requests_made_today}")


In [None]:
# import csv

# graph_result = graph.query("""
# MATCH (a)-[r]->(b)
# RETURN labels(a) AS source_labels, a, type(r) AS relationship_type, properties(r) AS relationship_properties, labels(b) AS target_labels, b
# """)

# with open("/kaggle/working/query-1.csv", 'w', newline='') as csvfile:
#     csv_writer = csv.writer(csvfile)
#     csv_writer.writerow(["n_type", "n", "r_type", "r", "m_type", "m"])
#     for record in graph_result:
#         csv_writer.writerow([record["source_labels"], record["a"], record["relationship_type"], record["relationship_properties"], record["target_labels"], record["b"]])

In [None]:
# # Delete the graph
# graph.query("MATCH (n) DETACH DELETE n")