In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
from IPython.display import Markdown, display, HTML

load_dotenv()

# Step up from the notebook's folder to its parent so the relative data path
# below resolves. The original unconditional chdir climbed one more level on
# every re-run of this cell; only climb while the data file is not yet
# reachable from the current working directory.
if not os.path.exists('./data/splade.parquet'):
    os.chdir(os.path.dirname(os.getcwd()))

# `df` is read later as a notebook-level global (e.g. by retrieval_preprocessing).
df = pd.read_parquet('./data/splade.parquet')
print(f"df shape: {df.shape}")
df.head(1)

df shape: (1987, 24)


Unnamed: 0,index,id,citation,name,name_abbreviation,decision_date,court_id,court_name,court_slug,judges,attorneys,citations,url,head,body,name_contains_lm,body_contains_lm,year,context,context_citation,context_tokens,openai_embeddings,splade_embeddings,state
0,0,411690,154 Ill. 2d 90,"RICHARD R. JOHNSON, Plaintiff-Appellant and Cr...",Johnson v. Halloran,2000-01-13,8837,Illinois Appellate Court,ill-app-ct,[],"['Wolter, Beeman, Lynch & McIntyre, of Springf...","[{'type': 'official', 'cite': '312 Ill. App. 3...",https://api.case.law/v1/cases/411690/,"RICHARD R. JOHNSON, Plaintiff-Appellant and Cr...",JUSTICE HALL\r\ndelivered the opinion of the c...,False,True,2000,The public defender of Cook County was appoint...,154 Ill. 2d 90,1317,"[-0.0017778094625100493, -0.002360282698646187...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",New Mexico


In [2]:
# Example legal-research question. Later cells pass this notebook-level `query`
# to the full-text search call and to `SearchRequest`.
query = """
Regarding the pollution exclusion clause under the terms of comprehensive general liability (CGL) insurance, \
how has the California court defined the phrase 'sudden and accidental', in particular for polluting events? \
Also, has there been any consideration for intentional vs unintentional polluting events?
"""

In [3]:
from src.agent.tools.full_text_search import FTSConfig, FTSSearchEngine

# Parquet file backing every search engine in this notebook (relative to the
# current working directory); the first cell reads the same file into `df`.
DATA_PATH = "data/splade.parquet"

# Full-text search is configured over the "body" column.
fts_config = FTSConfig(
    data_path=DATA_PATH,
    text_column="body"
)

# NOTE(review): `create` is invoked on an instance here, whereas the other
# engines in this notebook call `Engine.create(config)` on the class itself —
# confirm which form the API intends.
fts_search = FTSSearchEngine().create(fts_config)

[32m2024-04-14 23:34:41 - INFO - Read in df with shape: (1987, 24)[0m


In [4]:
# Run the example query through full-text search, requesting top_k=20 results,
# then show the shape of the returned frame.
test = fts_search.query_similar_documents(query=query, top_k=20)
test.shape

[32m2024-04-14 23:34:49 - INFO - Full Text Search (FTS) search yielded a DataFrame with 20 rows[0m


(20, 25)

In [5]:
# Peek at the first full-text hit; the recorded output shows a trailing `score` column.
test.head(1)

Unnamed: 0,index,id,citation,name,name_abbreviation,decision_date,court_id,court_name,court_slug,judges,attorneys,citations,url,head,body,name_contains_lm,body_contains_lm,year,context,context_citation,context_tokens,openai_embeddings,splade_embeddings,state,score
0,1725,12489335,144 Ill. 2d 64,"R.T. VANDERBILT COMPANY, INC. v. HARTFORD ACCI...",R.T. Vanderbilt Co. v. Hartford Accident & Ind...,2017-03-07,8801,Connecticut Appellate Court,conn-app-ct,[],"['Elizabeth J. Stewart, New Haven and Jacob M....","[{'type': 'official', 'cite': '156 A.3d 539'},...",https://api.case.law/v1/cases/12489335/,"R.T. VANDERBILT COMPANY, INC.\r\nv.\r\nHARTFOR...","Lavine, Beach and Bear, Js.\r\nLAVINE, BEACH a...",False,True,2017,"""Applying these definitions in a purely litera...",144 Ill. 2d 64,1483,"[-0.00035266715, 0.016778922, 0.029396245, -0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Missouri,40.368801


In [6]:
from src.search.models import (
    SearchQuery, 
    SearchType, 
    SearchRequest,
    DocumentSource,
    BaseFilters,
    Tag
)
from src.search.query_expansion import generate_subquestions
from src.search.query_planning import generate_query_filters, screen_query

# Defaults copied into every SearchQuery built by retrieval_preprocessing.
DEFAULT_TOP_K = 20  # used as SearchQuery.num_hits
DEFAULT_RERANK_TOP_K = 10  # used as SearchQuery.num_rerank
DEFAULT_MAX_LLM_CHUNKS = 10  # used as SearchQuery.max_llm_filter_chunks

from typing import Any
from src.search.models import ChunkMetric

def get_chunk_metrics(df: pd.DataFrame, source: Any):
    """Build one ``ChunkMetric`` per row of a search-result frame.

    Args:
        df: Result frame containing a ``"score"`` column. Rows are consumed in
            their existing order.
        source: Object that produced the results; only its class name is
            recorded on each metric.

    Returns:
        A list with one ``ChunkMetric`` per row. ``document_id`` is the row's
        index label as a string and ``rank`` counts rows from 1 in frame order.
    """
    source_name = source.__class__.__name__
    chunk_metrics = []
    # Rank by position, not by index label: result frames keep the index labels
    # of the rows they were selected from (e.g. 585, 920), so `label + 1` is not
    # a rank, and it raises outright for non-integer labels.
    for rank, (row_label, row) in enumerate(df.iterrows(), start=1):
        chunk_metrics.append(
            ChunkMetric(
                source=source_name,
                # Same value the original obtained via `row.name`, which is the
                # Series name (the index label), not a "name" column.
                document_id=str(row_label),
                score=row["score"],
                rank=rank,
            )
        )
    return chunk_metrics

def retrieval_preprocessing(
    search_request: SearchRequest,
    verbose: bool = False,
) -> SearchQuery:
    """Turn a raw ``SearchRequest`` into a ``SearchQuery``.

    Each step is optional and driven by a flag on ``search_request``:

    1. collect the tags of any human-selected filters;
    2. ``enable_query_screening``: evaluate the query with ``screen_query``;
    3. ``enable_auto_detect_filters``: predict ``state`` filters with
       ``generate_query_filters`` and append them as ``Tag`` objects;
    4. ``enable_auto_subquestions``: expand the query with
       ``generate_subquestions``, asking for the number of sub-questions
       suggested by the screening step, or "2 to 3" when screening was skipped.

    Args:
        search_request: The incoming request (query text, optional human
            filters, feature flags, rerank / LLM-filter switches).
        verbose: Forwarded to ``generate_query_filters``.

    Returns:
        A ``SearchQuery`` whose ``search_type`` is always
        ``SearchType.SEMANTIC`` and whose limits come from the ``DEFAULT_*``
        constants.

    Note:
        Filter prediction reads the notebook-level ``df`` global, so the
        data-loading cell must have been run first.
    """
    query = search_request.query

    tag_list = []
    preset_filters = search_request.human_selected_filters
    # Guard against filters whose `tags` is unset: extending a list with None
    # raises TypeError.
    if preset_filters and preset_filters.tags:
        tag_list.extend(preset_filters.tags)

    query_screening = None
    if search_request.enable_query_screening:
        query_screening = screen_query(query=query)

    if search_request.enable_auto_detect_filters:
        predicted_filters = generate_query_filters(
            search_df=df,
            query=query,
            filter_fields=['state'],
            verbose=verbose,
        )
        # NOTE(review): predicted tags are appended without reconciling them
        # with the human-selected ones, so the list can hold conflicting values
        # for the same key.
        tag_list.extend(
            Tag(tag_key=predicted.filter_key, tag_value=predicted.filter_value)
            for predicted in predicted_filters
        )

    subqueries = None
    if search_request.enable_auto_subquestions:
        n = str(query_screening.n_subquestions) if query_screening else "2 to 3"
        subqueries = generate_subquestions(
            query, n,
        )

    return SearchQuery(
        query=query,
        subquestions=subqueries.questions if subqueries is not None else None,
        search_type=SearchType.SEMANTIC,
        filters=tag_list,
        num_hits=DEFAULT_TOP_K,
        num_rerank=DEFAULT_RERANK_TOP_K,
        max_llm_filter_chunks=DEFAULT_MAX_LLM_CHUNKS,
        rerank=search_request.rerank,
        llm_chunk_filter=search_request.llm_chunk_filter,
        query_eval=query_screening,
    )


In [17]:
# Human-selected filter restricting results to Texas.
# NOTE(review): the example query asks about California, and with auto-detected
# filters enabled the recorded output carries both a Texas and a California
# tag — confirm this conflict is intentional.
state_tag = Tag(
    tag_key="state",
    tag_value="Texas"
)
base_filters = BaseFilters(
    source_type=DocumentSource.OPINIONS,
    document_set="sharepoint",
    tags=[state_tag],
)
# Enable every preprocessing step (screening, filter detection, sub-questions);
# reranking and LLM chunk filtering are switched off.
search_request = SearchRequest(
    query=query,
    human_selected_filters=base_filters,
    enable_query_screening=True,
    enable_auto_detect_filters=True,
    enable_auto_subquestions=True,
    rerank=False,
    llm_chunk_filter=False,
)
preprocessed_search_query = retrieval_preprocessing(
    search_request=search_request,
    verbose=True,
)
# Show the resulting SearchQuery as a plain dict.
preprocessed_search_query.model_dump()

[32m2024-04-14 23:46:27 - INFO - Schema shown to LLM: 
Name of each field, its type and unique values (up to 20):
* state (string);  Values - ['New Mexico' 'Nevada' 'Virginia' 'Vermont' 'Kansas' 'Maryland' 'Arkansas'
 'Massachusetts' 'West Virginia' 'Pennsylvania' 'Texas' 'Maine'
 'Mississippi' 'Hawaii' 'Iowa' 'Kentucky' 'Ohio' 'New Hampshire'
 'Wisconsin' 'North Dakota'], ... 30 more
        [0m


{'query': "\nRegarding the pollution exclusion clause under the terms of comprehensive general liability (CGL) insurance, how has the California court defined the phrase 'sudden and accidental', in particular for polluting events? Also, has there been any consideration for intentional vs unintentional polluting events?\n",
 'filters': [{'tag_key': 'state', 'tag_value': 'Texas'},
  {'tag_key': 'state', 'tag_value': 'California'}],
 'num_hits': 20,
 'search_type': <SearchType.SEMANTIC: 'semantic'>,
 'rerank': False,
 'num_rerank': 10,
 'llm_chunk_filter': False,
 'max_llm_filter_chunks': 10,
 'subquestions': ["How has the California court defined the phrase 'sudden and accidental' in relation to the pollution exclusion clause in comprehensive general liability (CGL) insurance?",
  'In California court cases, has there been any distinction made between intentional and unintentional polluting events when interpreting the pollution exclusion clause in CGL insurance?'],
 'query_eval': {'topi

In [8]:
# The first entry in `filters` is the human-selected tag (those are added
# before any predicted tags); the recorded output shows {'state': 'Texas'}.
base_filter = preprocessed_search_query.filters[0].filter_dict
base_filter

{'state': 'Texas'}

In [13]:
from src.agent.tools.semantic_search import SemanticSearchConfig, SemanticSearchEngine
from src.agent.tools.splade_search import SPLADESearchConfig, SPLADESparseSearch
from src.agent.tools.hybrid_search import HybridSearchConfig, HybridSearchEngine

# Text column handed to each engine's config; all three currently use "context".
SEMANTIC_TEXT_COLUMN = "context"
SPLADE_TEXT_COLUMN = "context"
HYBRID_TEXT_COLUMN = "context"

def retrieve_chunks(search_query: SearchQuery):
    """Run ``search_query`` against the engine selected by its ``search_type``.

    Args:
        search_query: Supplies ``search_type`` (which engine to build),
            ``query`` (the text to search for) and ``num_hits`` (passed to the
            engine as ``top_k``).

    Returns:
        Whatever the selected engine's ``query_similar_documents`` returns.

    Raises:
        ValueError: If ``search_type`` is not SEMANTIC, SPLADE or HYBRID.
            Previously this path failed with an UnboundLocalError on
            ``search_engine``.
    """
    search_type = search_query.search_type
    if search_type == SearchType.SEMANTIC:
        config = SemanticSearchConfig(
            data_path=DATA_PATH,
            text_column=SEMANTIC_TEXT_COLUMN,
        )
        search_engine = SemanticSearchEngine.create(config)
    elif search_type == SearchType.SPLADE:
        config = SPLADESearchConfig(
            data_path=DATA_PATH,
            text_column=SPLADE_TEXT_COLUMN,
        )
        search_engine = SPLADESparseSearch.create(config)
    elif search_type == SearchType.HYBRID:
        config = HybridSearchConfig(
            data_path=DATA_PATH,
            text_column=HYBRID_TEXT_COLUMN,
        )
        search_engine = HybridSearchEngine.create(config)
    else:
        raise ValueError(f"Unsupported search type: {search_type!r}")

    # Search with the query carried by the request object. The original read the
    # notebook-level `query` global, which silently ignored `search_query.query`.
    # NOTE(review): `search_query.filters` and `search_query.subquestions` are
    # not applied here.
    return search_engine.query_similar_documents(
        search_query.query, top_k=search_query.num_hits
    )

In [14]:
retrieval_res = retrieve_chunks(
    search_query=preprocessed_search_query
)

[32m2024-04-14 23:45:17 - INFO - Read in df with shape: (1987, 24)[0m


In [16]:
retrieval_res.head(2)

Unnamed: 0,index,id,citation,name,name_abbreviation,decision_date,court_id,court_name,court_slug,judges,attorneys,citations,url,head,body,name_contains_lm,body_contains_lm,year,context,context_citation,context_tokens,openai_embeddings,splade_embeddings,state,search_type,score
585,756,5454708,268 Ill. App. 3d 598,"EMERSON ELECTRIC COMPANY et al., Plaintiffs-Ap...",Emerson Electric Co. v. Aetna Casualty & Suret...,2004-08-30,8837,Illinois Appellate Court,ill-app-ct,"['McNULTY and McBRIDE, JJ., concur.']","['Stephan G. Weil, Mark H. Kolman, Keisha A. G...","[{'type': 'official', 'cite': '352 Ill. App. 3...",https://api.case.law/v1/cases/5454708/,"EMERSON ELECTRIC COMPANY et al., Plaintiffs-Ap...",JUSTICE GORDON\r\ndelivered the opinion of the...,False,True,2004,"However, as discussed in detail below, such a ...",268 Ill. App. 3d 598,1176,"[-0.019431067630648613, -0.006070500705391169,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Hawaii,vector,0.865989
920,1145,4268691,89 Ill. App. 3d 617,"COUNTRY MUTUAL INSURANCE COMPANY, Plaintiff-Ap...",Country Mutual Insurance v. Carr,2007-03-19,8837,Illinois Appellate Court,ill-app-ct,[],"['Edward H. Rawles (argued), of Rawles, O’Byrn...","[{'type': 'official', 'cite': '372 Ill. App. 3...",https://api.case.law/v1/cases/4268691/,"COUNTRY MUTUAL INSURANCE COMPANY, Plaintiff-Ap...",JUSTICE KNECHT\r\ndelivered the opinion of the...,False,True,2007,Even if only one of several theories of recove...,89 Ill. App. 3d 617,1481,"[-0.021835995838046074, -0.01290853600949049, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Nebraska,vector,0.861436


In [None]:
# NOTE(review): always-false assertion — presumably a deliberate stop so that
# "Run All" halts before the work-in-progress cells below; confirm and replace
# with an explicit, explained stop if so.
assert 1 + 1 == 3

In [28]:
from datetime import datetime

def convert_timestamp_to_datetime(timestamp: str) -> str:
    return datetime.fromtimestamp(int(timestamp)).strftime("%Y-%m-%d")

# Current time as a float number of seconds since the epoch.
timestamp = datetime.timestamp(datetime.now())

# Note that a float, not a str, is passed to the converter here.
timestamp_string = convert_timestamp_to_datetime(timestamp)

'2024-04-15'

In [None]:
from collections.abc import Callable
from collections.abc import Generator
from typing import cast


from src.search.models import (
    SearchQuery, 
    InferenceChunk, 
    RerankMetricsContainer, 
    RetrievalMetricsContainer,
    SearchRequest,
)


class SearchPipeline:
    """Lazily evaluated search pipeline: preprocessing -> retrieval -> post-processing.

    Each stage is exposed as a property that computes its result on first
    access and caches it on the instance; accessing a later stage pulls in the
    earlier ones as needed.

    NOTE(review): this cell has no execution count, and ``search_postprocessing``
    is not defined in the visible part of the notebook, so the post-processing
    properties presumably cannot run yet — confirm.
    """

    def __init__(
        self,
        search_request: SearchRequest,
        retrieval_metrics_callback: Callable[[RetrievalMetricsContainer], None]
        | None = None,
        rerank_metrics_callback: Callable[[RerankMetricsContainer], None] | None = None,
    ):
        """Store the request and callbacks; no search work happens here.

        Args:
            search_request: Request handed to ``retrieval_preprocessing``.
            retrieval_metrics_callback: Optional retrieval-metrics hook. It is
                stored but not invoked by this class.
            rerank_metrics_callback: Optional hook forwarded to
                ``search_postprocessing``.
        """
        self.search_request = search_request
        self.retrieval_metrics_callback = retrieval_metrics_callback
        self.rerank_metrics_callback = rerank_metrics_callback

        # Per-stage caches; None means "not computed yet".
        self._search_query: SearchQuery | None = None

        self._retrieved_docs: list[InferenceChunk] | None = None
        self._reranked_docs: list[InferenceChunk] | None = None
        self._relevant_chunk_indicies: list[int] | None = None

        # generator state
        self._postprocessing_generator: Generator[
            list[InferenceChunk] | list[str], None, None
        ] | None = None

    # ---- Pre-processing ----

    def _run_preprocessing(self) -> None:
        """Build the ``SearchQuery`` from the stored request and cache it."""
        final_search_query = retrieval_preprocessing(
            search_request=self.search_request,
        )
        self._search_query = final_search_query

    @property
    def search_query(self) -> SearchQuery:
        """The preprocessed query, computed on first access."""
        if self._search_query is not None:
            return self._search_query

        self._run_preprocessing()
        return cast(SearchQuery, self._search_query)

    # ---- Retrieval ----

    @property
    def retrieved_docs(self) -> list[InferenceChunk]:
        """Retrieval results for ``search_query``, computed on first access."""
        if self._retrieved_docs is not None:
            return self._retrieved_docs

        # The notebook's retrieve_chunks takes a single `search_query` argument.
        # The original call passed `query=`, `document_index=` and
        # `retrieval_metrics_callback=` (a TypeError) and read
        # `self.document_index`, which is never assigned (an AttributeError).
        # TODO: invoke `self.retrieval_metrics_callback` once retrieval metrics
        # are produced.
        # NOTE(review): the recorded outputs show retrieve_chunks returning a
        # DataFrame, not list[InferenceChunk] — reconcile the annotation or
        # convert the result.
        self._retrieved_docs = retrieve_chunks(
            search_query=self.search_query,
        )

        return cast(list[InferenceChunk], self._retrieved_docs)

    # ---- Post-processing ----

    @property
    def reranked_docs(self) -> list[InferenceChunk]:
        """First value yielded by the post-processing generator (the reranked chunks)."""
        if self._reranked_docs is not None:
            return self._reranked_docs

        # Keep the generator: its second item is consumed by relevant_chunk_indicies.
        self._postprocessing_generator = search_postprocessing(
            search_query=self.search_query,
            retrieved_chunks=self.retrieved_docs,
            rerank_metrics_callback=self.rerank_metrics_callback,
        )
        self._reranked_docs = cast(
            list[InferenceChunk], next(self._postprocessing_generator)
        )
        return self._reranked_docs

    @property
    def relevant_chunk_indicies(self) -> list[int]:
        """Positions within ``reranked_docs`` of the chunks reported as relevant."""
        if self._relevant_chunk_indicies is not None:
            return self._relevant_chunk_indicies

        # run first step of postprocessing generator if not already done
        reranked_docs = self.reranked_docs

        # Second item from the generator: the relevant chunks' unique ids.
        relevant_chunk_ids = next(
            cast(Generator[list[str], None, None], self._postprocessing_generator)
        )
        self._relevant_chunk_indicies = [
            ind
            for ind, chunk in enumerate(reranked_docs)
            if chunk.unique_id in relevant_chunk_ids
        ]
        return self._relevant_chunk_indicies

    @property
    def chunk_relevance_list(self) -> list[bool]:
        """One boolean per reranked chunk: True where the chunk is relevant."""
        # A set gives constant-time membership checks.
        relevant_positions = set(self.relevant_chunk_indicies)
        return [ind in relevant_positions for ind in range(len(self.reranked_docs))]