In [1]:
import os
from gatenlp import Document
from gatenlp.corpora import ListCorpus
import requests
import json
import os
import ipywidgets as widgets
from IPython.display import display, Markdown
from gatenlp.lib_spacy import AnnSpacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
import pandas as pd
import instructor

from dotenv import load_dotenv


In [2]:
from pydantic import BaseModel
from typing import List, Optional

class LegalEvent(BaseModel):
    event_type: str  # "event_circumstance" or "event_procedure"
    event_who: Optional[str]
    event_what: Optional[str]
    event_when: Optional[str]

class LegalEventsExtraction(BaseModel):
    events: List[LegalEvent]

In [3]:
load_dotenv()

user_email = os.getenv("USEREMAIL")  # Enter your email here
password = os.getenv("PASSWORD")  # Enter your password here

# Fetch Access Token

# Define the URL for the authentication endpoint
auth_url = "http://localhost:8080/api/v1/auths/signin"

# Define the payload with user credentials
auth_payload = json.dumps({"email": user_email, "password": "admin"})

# Define the headers for the authentication request
auth_headers = {"accept": "application/json", "content-type": "application/json"}

# Make the POST request to fetch the access token
auth_response = requests.post(auth_url, data=auth_payload, headers=auth_headers)

# Extract the access token from the response
access_token = auth_response.json().get("token")

In [9]:
import instructor

def askChatbot(model, role, instruction, content):
    chat_url = "http://localhost:11434/api/chat"
    chat_headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "Authorization": f"Bearer {access_token}",
    }
    chat_payload = json.dumps(
        {
            "stream": False,
            "model": model,
            "temperature": 0.0,
            "messages": [
                {"role": "system", "content": role},
                {"role": "user", "content": f"{instruction}\n\n{content}"},
            ],
        }
    )
    chat_response = requests.post(chat_url, data=chat_payload, headers=chat_headers)
    print(chat_response)
    response_content = chat_response.json().get("message", {}).get("content", "")
    # Use instructor to parse the response into the structured model
    try:
        structured = LegalEventsExtraction.model_validate_json(response_content)
        return structured
    except Exception as e:
        print(f"Parsing error: {e}")
        return None

In [None]:
# Create a new corpus with an empty list
corpus = ListCorpus([])

# Define the base directory
base_dir = "input/annotated"

# Walk through the directory and load each XML file
for root, dirs, files in os.walk(base_dir):
    for file in files:
        if file.endswith(".xml"):
            file_path = os.path.join(root, file)
            doc = Document.load(file_path, fmt="gatexml")
            # Add the document to the corpus
            corpus.append(doc)
            print(f"Loaded {file_path} into corpus")            
                
print("All documents loaded into the corpus.")

In [None]:
len(corpus)

In [None]:
print(doc.features.get("gate.SourceURL").replace("file:/C:/Users/mnavas/", "").replace("%20", " "))

In [None]:
#nlp_spacy = English()
#nlp_spacy.add_pipe('sentencizer')
#tokenize = AnnSpacy(nlp_spacy, add_nounchunks=False, add_deps=False, add_entities=False)

#for doc in corpus:
#    doc = tokenize(doc)
#    doc

In [None]:
from gatenlp.visualization import CorpusViewer

viewer = CorpusViewer(corpus)
viewer.show()

In [10]:
models = [#"gemma3:12b",
          "GandalfBaum/llama3.1/claude3.7:latest",
          "chevalblanc/claude-3-haiku:latest",
          "incept5/llama3.1-claude:latest",
          "llama3.3:latest",
          "deepseek-r1:8b",
          "mistral:latest",
          "llama3-gradient:latest"
]

event_definitions = """
You are an expert in legal text analysis. Here are the definitions of legal events:
- Event: Relates to the extent of text containing contextual event-related information. 
- Event_who: Corresponds to the subject of the event, which can either be a subject, but also an object (i.e., an application). 
    Examples: applicant, respondent, judge, witness
- Event_what: Corresponds to the main verb reflecting the baseline of all the paragraph. Additionally, we include thereto a complementing verb or object whenever the core verb is not self-explicit or requires an extension to attain a sufficient meaning.
    Examples: lodged an application, decided, ordered, dismissed
- Event_when: Refers to the date of the event, or to any temporal reference thereto.
- Event_circumstance: Meaning that the event correspond to the facts under judgment.
- Event_procedure: The events belongs to the procedural dimension of the case.

Events contain the annotations event_who, event_what and event_when. Events can be of type event_circumstance and event_procedure.
"""

#instruction = "Analyze the provided text and extract the legal events. Provide the results in a structured format. Obviously, Event_who, Event_what and Event_when can only appear within an Event. If you find an event, also classify it into an event_circumstance or event_procedure. Do not invent additional information."

instruction = """
Analyze the provided text and extract the legal events. Provide the results in a structured format.
Your output MUST be valid JSON matching this schema:
{
  "events": [
    {
      "event_type": "event_circumstance or event_procedure",
      "event_who": "string or null",
      "event_what": "string or null",
      "event_when": "string or null"
    }
  ]
}
Do not include any explanation or text outside the JSON.
Obviously, Event_who, Event_what and Event_when can only appear within an Event.
If you find an event, also classify it into an event_circumstance or event_procedure.
Do not invent additional information. Only return the JSON, no additional text or something else
"""

In [13]:
combined_procedure_text = """1.  The case originated in an application (no. 11236/09) against the Republic of Turkey lodged with the Court under Article 34 of the Convention for the Protection of Human Rights and Fundamental Freedoms (“the Convention”) by a Turkish national, Mr Mehmet Aytunç Altay (“the applicant”), on 17 February 2006. 
2.  The applicant was represented by Ms G. Tuncer, a lawyer practising in Istanbul. The Turkish Government (“the Government”) were represented by their Agent. 
3.  The applicant alleged, in particular, that the restriction of the privacy of his consultations with his lawyer was incompatible with his rights under Article 8 of the Convention and that the domestic proceedings with respect to this measure had not complied with the requirements of Article 6 § 1 of the Convention. 
4.  On 17 October 2017 notice of the above complaints was given to the Government and the remainder of the application was declared inadmissible pursuant to Rule 54 § 3 of the Rules of Court. """
structured_response = askChatbot(models[0], event_definitions, instruction, combined_procedure_text)

<Response [400]>
Parsing error: 1 validation error for LegalEventsExtraction
  Invalid JSON: EOF while parsing a value at line 1 column 0 [type=json_invalid, input_value='', input_type=str]
    For further information visit https://errors.pydantic.dev/2.10/v/json_invalid


In [None]:
from tqdm import tqdm
results = []
# Iterate over documents and models
for doc in tqdm(corpus, desc="Processing documents"):
    doc_dict = {"Document": doc.features.get("gate.SourceURL")}
    print(f"Processing document: {doc.features.get("gate.SourceURL")}")
    
    # Combine all procedure texts for the document
    procedure_texts = []
    annotations = doc.annset("Section")
    procedure_annotations = annotations.with_type("Procedure")
    for ann in procedure_annotations:
        procedure_text = doc.text[ann.start:ann.end]
        procedure_texts.append(procedure_text)
    combined_procedure_text = " ".join(procedure_texts)
    #print(f"Combined procedure text: {combined_procedure_text}")
    
    # Iterate over models
    for model in models:
        print(f"Using model: {model}")

        # Call the chatbot with role, instruction, and content
        structured_response = askChatbot(model, event_definitions, instruction, combined_procedure_text)
        # Extract and store the response
        doc_dict[model] = structured_response.model_dump() if structured_response else "Parsing failed"
    
    # Append the document dictionary to the results list
    results.append(doc_dict)

# Convert results to a DataFrame and save as CSV
df = pd.DataFrame(results)
df.to_csv("chat_responses_with_instructions.csv", index=False)
df.to_excel("chat_responses_with_instructions.xlsx", index=False)

In [None]:
# Convert the results list to a pandas DataFrame
df = pd.DataFrame(doc_list)

# Display the DataFrame
print(df)

# Save the DataFrame to a CSV file for later analysis
df.to_csv("chat_responses.csv", index=False)