In [1]:
import os
import warnings

import pandas as pd
from dotenv import load_dotenv
from IPython.display import Markdown, display, HTML

# Notebook-wide settings: silence warnings and show every DataFrame column.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

load_dotenv()

# Move to the parent of the kernel's start directory so './data/...' resolves.
# The sentinel makes this cell idempotent: without it, every re-run would climb
# one more directory level and the relative paths below would stop resolving.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
    os.chdir(os.path.dirname(_NOTEBOOK_START_DIR))

df = pd.read_parquet('./data/splade.parquet')
print(f"df shape: {df.shape}")
df.head(1)

df shape: (1987, 24)


Unnamed: 0,index,id,citation,name,name_abbreviation,decision_date,court_id,court_name,court_slug,judges,attorneys,citations,url,head,body,name_contains_lm,body_contains_lm,year,context,context_citation,context_tokens,openai_embeddings,splade_embeddings,state
0,0,411690,154 Ill. 2d 90,"RICHARD R. JOHNSON, Plaintiff-Appellant and Cr...",Johnson v. Halloran,2000-01-13,8837,Illinois Appellate Court,ill-app-ct,[],"['Wolter, Beeman, Lynch & McIntyre, of Springf...","[{'type': 'official', 'cite': '312 Ill. App. 3...",https://api.case.law/v1/cases/411690/,"RICHARD R. JOHNSON, Plaintiff-Appellant and Cr...",JUSTICE HALL\r\ndelivered the opinion of the c...,False,True,2000,The public defender of Cook County was appoint...,154 Ill. 2d 90,1317,"[-0.0017778094625100493, -0.002360282698646187...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",New Mexico


In [3]:
from src.llm.utils import (
    instructor_cache,
    async_retry_with_exponential_backoff,
    retry_with_exponential_backoff,
)

In [58]:
# from openai-cookbook
import asyncio
import functools
import inspect
from itertools import islice
import logging
import random
import time
from typing import Any, Callable, Dict, Iterable, List

import aiohttp
import diskcache
import openai
from pydantic import BaseModel
import requests
import diskcache
import traceback
from src.utils.logging import setup_colored_logging


# On-disk cache used by `instructor_cache` below. The path is relative, so it is
# resolved against whatever the working directory is when this cell runs.
cache = diskcache.Cache("./my_cache_directory")
# Logger returned by the project's `setup_colored_logging` helper for this module.
logger = setup_colored_logging(__name__)


def instructor_cache(func):
    """Cache the Pydantic model returned by ``func`` in the module-level ``cache``.

    Results are stored as JSON (``model_dump_json``) and rebuilt on a cache hit
    with ``model_validate_json`` on the annotated return type. Both plain and
    ``async def`` functions are supported; the returned wrapper matches the
    kind of ``func``.

    Args:
        func: Callable whose return annotation is a ``BaseModel`` subclass.

    Returns:
        A wrapper with the same call signature as ``func``.

    Raises:
        ValueError: If the return annotation is missing or is not a
            ``BaseModel`` subclass.
    """
    return_type = inspect.signature(func).return_annotation
    # `issubclass` raises TypeError for non-class annotations (typing generics,
    # string annotations); test `isclass` first so the caller always gets the
    # intended ValueError.
    if not (inspect.isclass(return_type) and issubclass(return_type, BaseModel)):
        raise ValueError("The return type must be a Pydantic model")

    is_async = inspect.iscoroutinefunction(func)

    def _make_cache_key(args, kwargs):
        """Build the cache key for one call."""
        if not kwargs:
            # Positional-only calls keep the historical key format so entries
            # already on disk remain valid.
            return f"{func.__name__}-{functools._make_key(args, kwargs, typed=False)}"
        # With keyword arguments, `functools._make_key` inserts an `object()`
        # sentinel whose repr embeds a memory address, so the key would differ
        # between processes and the disk cache would never hit. Build a key from
        # reprs instead; sorting by name makes it independent of kwarg order.
        return f"{func.__name__}-{args!r}-{sorted(kwargs.items())!r}"

    def _load(key):
        """Return the cached model for ``key``, or None on a miss."""
        cached = cache.get(key)
        if cached is None:
            return None
        return return_type.model_validate_json(cached)

    def _store(key, result):
        """Serialize ``result`` into the cache and hand it back."""
        cache.set(key, result.model_dump_json())
        return result

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        key = _make_cache_key(args, kwargs)
        if (hit := _load(key)) is not None:
            return hit
        return _store(key, func(*args, **kwargs))

    @functools.wraps(func)
    async def awrapper(*args, **kwargs):
        key = _make_cache_key(args, kwargs)
        if (hit := _load(key)) is not None:
            return hit
        return _store(key, await func(*args, **kwargs))

    return awrapper if is_async else wrapper



In [82]:
from textwrap import fill
from typing import List
from datetime import datetime
from pydantic import BaseModel, Field, model_validator
import hashlib


def hash_text(text: str) -> str:
    """Return the hexadecimal MD5 digest of ``text``.

    The string is encoded with ``str.encode()``'s default encoding before
    being hashed.
    """
    digest = hashlib.md5()
    digest.update(text.encode())
    return digest.hexdigest()


class LegalArgument(BaseModel):
    """Information about a legal argument"""

    # The class docstring and the Field descriptions are part of the model's
    # JSON schema, so their text is kept verbatim.
    legal_question: str = Field(
        ...,
        description="A concise and well-structured legal question. For example: 'Plaintiff slipped and fell in the hotel lobby. Is the hotel liable?'",
    )
    rule: str = Field(
        ...,
        description="A concise legal ruling, decision, or authority. For example: 'If a hotel knows its floors are wet, it has a duty to take reasonable steps to avoid such injury.'",
    )
    application: str = Field(
        ...,
        description="Application, or potential application of the rule. For example: 'The hotel acted negligently'.",
    )

    def __str__(self):
        """Render the argument as three labelled paragraphs.

        Each field is wrapped to 100 columns before its label is prepended, so
        the first line of a paragraph can exceed 100 characters by the label
        length.
        """
        paragraphs = [
            f"Question: {fill(self.legal_question, width=100)}\n\n",
            f"Rule: {fill(self.rule, width=100)}\n\n",
            f"Application: {fill(self.application, width=100)}\n",
        ]
        return "".join(paragraphs)
    
    
class LegalArguments(BaseModel):
    """A list of legal arguments."""

    # The class docstring and the Field description are part of the model's
    # JSON schema, so their text is kept verbatim.
    arguments: List[LegalArgument] = Field(
        default_factory=list,
        description="A list of legal arguments from a court document.",
    )

    @property
    def print_args(self):
        """Return every argument rendered with ``str()``, each followed by a dashed separator.

        Returns an empty string when there are no arguments.
        """
        separator = "\n------------------------------------------------\n\n"
        return "".join(str(arg) + separator for arg in self.arguments)

    @model_validator(mode="before")
    @classmethod
    def generate_tiemstamp(cls, values):
        """Inject a minute-resolution ISO timestamp into the raw input mapping.

        The (misspelled) method name is kept as-is so the class attribute does
        not change.

        Args:
            values: Raw input handed to the model before field validation.

        Returns:
            A new dict with a ``'timestamp'`` key added when ``values`` is a
            dict; otherwise ``values`` unchanged.
        """
        # "before" validators can receive non-dict input (e.g. an existing model
        # instance); item assignment on those would raise TypeError.
        if not isinstance(values, dict):
            return values
        # NOTE(review): no `timestamp` field is declared on this model, so with
        # pydantic's default `extra='ignore'` this key is presumably discarded
        # during validation. Declare the field if the timestamp should be kept.
        stamp = datetime.now().isoformat(timespec='minutes')
        # Build a new dict rather than mutating the caller's mapping.
        return {**values, 'timestamp': stamp}

In [142]:
import instructor
import openai
from tenacity import (
    Retrying,
    AsyncRetrying,
    stop_after_attempt, 
    wait_fixed,
) # for instructor validation

from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
    after_log,
)  # for exponential backoff


# OpenAI clients wrapped by `instructor.from_openai`: one async, one sync.
# The completion helpers below call `chat.completions.create` on these.
aclient = instructor.from_openai(openai.AsyncOpenAI())
client = instructor.from_openai(openai.OpenAI())

@retry(
    wait=wait_random_exponential(min=1, max=60),
    stop=stop_after_attempt(6),
    after=after_log(logger, logging.INFO),
)
async def acompletion_with_backoff(**kwargs):
    """Async chat completion with randomized exponential backoff.

    All keyword arguments are forwarded to ``aclient.chat.completions.create``.
    The call is retried up to 6 attempts, waiting a random exponential interval
    between 1 and 60 seconds, and each failed attempt is logged at INFO level.

    This must be a coroutine function that awaits the request itself. A plain
    ``def`` would return the un-awaited coroutine straight away, so any error
    raised while awaiting it would surface outside ``@retry`` and never be retried.

    Returns:
        Whatever ``aclient.chat.completions.create`` resolves to.
    """
    return await aclient.chat.completions.create(**kwargs)


@retry(
    stop=stop_after_attempt(6),
    wait=wait_random_exponential(min=1, max=60),
    after=after_log(logger, logging.INFO),
)
def completion_with_backoff(**kwargs):
    """Synchronous chat completion with randomized exponential backoff.

    Keyword arguments are passed straight through to
    ``client.chat.completions.create``. At most 6 attempts are made, with a
    random exponential wait between 1 and 60 seconds; each failed attempt is
    logged at INFO level.
    """
    create_completion = client.chat.completions.create
    return create_completion(**kwargs)


@instructor_cache
async def aextract_data(data: str, model_name: str = "gpt-4o") -> LegalArguments:
    """Asynchronously extract ``LegalArguments`` from a piece of text.

    Args:
        data: Text inserted into the user prompt as the context.
        model_name: Model identifier passed as ``model``.

    Returns:
        The ``LegalArguments`` produced by ``acompletion_with_backoff``;
        results are cached by ``instructor_cache``.
    """
    # Retry policy passed to the completion call as `max_retries`:
    # up to 5 attempts, one second apart.
    validation_retries = AsyncRetrying(
        stop=stop_after_attempt(5),
        wait=wait_fixed(1),
    )
    system_message = {
        "role": "system",
        "content": "You are a world class structured data extraction AI.",
    }
    user_message = {
        "role": "user",
        "content": f"Use the context to extract information in the correct format.\nCONTEXT: {data}",
    }
    extracted = await acompletion_with_backoff(
        model=model_name,
        response_model=LegalArguments,
        max_retries=validation_retries,
        messages=[system_message, user_message],
    )
    return extracted
    
    
@instructor_cache
def extract_data(data: str, model_name: str = "gpt-4o") -> LegalArguments:
    """Synchronously extract ``LegalArguments`` from a piece of text.

    Args:
        data: Text inserted into the user prompt as the context.
        model_name: Model identifier passed as ``model``.

    Returns:
        The ``LegalArguments`` produced by ``completion_with_backoff``;
        results are cached by ``instructor_cache``.
    """
    # Retry policy passed to the completion call as `max_retries`:
    # up to 5 attempts, one second apart.
    validation_retries = Retrying(
        stop=stop_after_attempt(5),
        wait=wait_fixed(1),
    )
    prompt_messages = [
        {
            "role": "system",
            "content": "You are a world class structured data extraction AI.",
        },
        {
            "role": "user",
            "content": f"Use the context to extract information in the correct format.\nCONTEXT: {data}",
        },
    ]
    extracted = completion_with_backoff(
        model=model_name,
        response_model=LegalArguments,
        max_retries=validation_retries,
        messages=prompt_messages,
    )
    return extracted

In [143]:
# Pick one opinion body by position as the sample document. `.iloc` reads the
# single element directly instead of converting the whole column to a list.
context = df["body"].iloc[1900]

In [144]:
import nest_asyncio
# Patch asyncio so an already-running event loop (as in a notebook kernel) can be re-entered.
nest_asyncio.apply()

# Run the async extraction on the sample document.
test = await aextract_data(context)

In [145]:
# Same call repeated — presumably to confirm the result now comes from the disk cache.
test = await aextract_data(context)

In [146]:
# Raw repr of the extracted LegalArgument objects.
test.arguments

[LegalArgument(legal_question='Does Liberty Mutual have a duty to defend or indemnify Dometic against the underlying class action complaints?', rule='An insurer has a duty to defend if the claims potentially fall within the indemnity coverage provided by the policies. The duty to indemnify is determined when the underlying litigation is resolved.', application="Liberty Mutual's duty to defend is triggered by the putative class action complaints because the factual allegations in the complaints suggest property damage that falls within the policy's coverage. However, the duty to indemnify is not ripe for determination until the underlying cases are concluded."),
 LegalArgument(legal_question="Does the exhaustion of the 2004-2005 policy affect Liberty Mutual's duty to defend or indemnify?", rule='A policy that is exhausted does not require the insurer to defend or indemnify the insured under that policy.', application="The court granted Liberty Mutual's motion for judgment on the pleadin

In [147]:
# Human-readable rendering of the async result (`print_args` is a property, not a method).
print(test.print_args)

Question: Does Liberty Mutual have a duty to defend or indemnify Dometic against the underlying class action
complaints?

Rule: An insurer has a duty to defend if the claims potentially fall within the indemnity coverage
provided by the policies. The duty to indemnify is determined when the underlying litigation is
resolved.

Application: Liberty Mutual's duty to defend is triggered by the putative class action complaints because the
factual allegations in the complaints suggest property damage that falls within the policy's
coverage. However, the duty to indemnify is not ripe for determination until the underlying cases
are concluded.

------------------------------------------------

Question: Does the exhaustion of the 2004-2005 policy affect Liberty Mutual's duty to defend or indemnify?

Rule: A policy that is exhausted does not require the insurer to defend or indemnify the insured under
that policy.

Application: The court granted Liberty Mutual's motion for judgment on the plead

In [148]:
# Run the synchronous extraction on the same document.
test2 = extract_data(context)

In [149]:
# Same call repeated — presumably to confirm the result now comes from the disk cache.
test2 = extract_data(context)

In [150]:
# Human-readable rendering of the sync result (`print_args` is a property, not a method).
print(test2.print_args)

Question: Liberty Mutual seeks a declaratory judgment that it has no duty to defend or indemnify Dometic
against three putative class action complaints. Does Liberty Mutual have an obligation to defend?

Rule: An insurer has a duty to defend if the allegations in the underlying complaint potentially fall
within the indemnity coverage provided by the policies.

Application: The factual allegations of property damage in each of the underlying putative class action
complaints are sufficient to trigger Liberty Mutual's duty to defend.

------------------------------------------------

Question: Does Liberty Mutual have a duty to indemnify Dometic for the claims from the underlying putative
class action complaints at this stage?

Rule: The duty to indemnify isn't ripe for determination until the underlying claims are resolved.

Application: The underlying actions are still proceeding, so the duty to indemnify claims aren’t ripe for a
decision.

----------------------------------------------