In [1]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
import re
from dotenv import load_dotenv
import pandas as pd

from IPython.display import Markdown, display, HTML
import warnings
# Silence every warning and let pandas display all columns of a frame.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
load_dotenv()

# Move the working directory up one level, but only once per kernel session.
# Without the guard, re-running this cell climbs one more directory each time
# and the relative "./data/..." path below stops resolving.
if not globals().get("_CWD_MOVED_TO_PARENT", False):
    os.chdir(os.path.dirname(os.getcwd()))
    _CWD_MOVED_TO_PARENT = True

df = pd.read_parquet("./data/splade_embeds.parquet")
# Convert column names to snake_case for compatibility with LanceDB:
# an underscore is inserted before every uppercase letter that is not the
# first character, then the whole name is lower-cased.
original_columns = df.columns
snake_case_columns = {col: re.sub(r'(?<!^)(?=[A-Z])', '_', col).lower() for col in original_columns}
df.rename(columns=snake_case_columns, inplace=True)
df.head(2)

Unnamed: 0,index,created_utc,full_link,id,body,title,text_label,flair_label,embeddings,token_count,llm_title,state,kmeans_label,topic_title,splade_embeddings
0,1078,1575952538,https://www.reddit.com/r/legaladvice/comments/...,e8lsen,I applied for a job and after two interviews I...,"Failed a drug test due to amphetamines, I have...",employment,5,"[9.475638042064453e-05, 0.0005111666301983955,...",493,"""Validity of Schedule II Drug Prescription in ...",PR,8,Employment Legal Concerns and Issues,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2098,1577442453,https://www.reddit.com/r/legaladvice/comments/...,eg9ll2,"Hi everyone, thanks in advance for any guidanc...","Speeding ticket in Tennessee, Georgia Driver's...",driving,4,"[-0.006706413111028856, 0.020911016696181495, ...",252,"""Speeding ticket consequences for out-of-state...",KY,10,Legal Topics in Traffic Violations,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [2]:
# Reorder rows so the largest `token_count` values come first. `inplace=True`
# mutates `df` itself, so every later cell sees the sorted frame.
df.sort_values(by='token_count', ascending=False, inplace=True)

In [4]:
from src.search.query_expansion import segment

# NOTE(review): `test_query` is not assigned in any cell shown in this
# notebook (there is no In [3]); it must already exist in the kernel or this
# call raises NameError on a fresh Restart & Run All.
# `n` is passed as the free-text string '3 to 5' -- presumably the number of
# sub-questions to request; confirm against `segment`.
queries = segment(
    test_query,
    n='3 to 5'
)

[32m2024-03-20 01:37:40 - INFO - Using pre-computed 'body' embeddings from existing column: splade_embeddings[0m
[32m2024-03-20 01:37:40 - INFO - Using pre-computed 'body' embeddings from existing column: splade_embeddings[0m


In [5]:
# Display the generated sub-questions (a list of SubQuestion objects).
queries.searches

[SubQuestion(chain_of_thought="To determine if the user is exempt from overtime payment, it's necessary to understand the criteria for overtime exemption under New York State labor laws, especially for salaried employees and those in managerial positions like 'Director of IT'.", sub_question_topic='Overtime Exemption Criteria in New York', sub_question_query='What are the criteria for overtime exemption for salaried employees in New York State?', sub_question_keywords=['overtime exemption', 'salaried employees', 'New York State labor laws']),
 SubQuestion(chain_of_thought="Given the user's significant increase in work hours to meet a deadline, it's important to explore if there are specific laws or regulations in New York State that address compensation for excessive overtime, particularly for positions like 'Director of IT'.", sub_question_topic='Compensation for Excessive Overtime in New York', sub_question_query='Are there laws in New York State regarding compensation for excessive 

In [6]:
# Execute the searches. The call returns a pair that is unpacked into the
# vector-search results and the keyword-search results; both are indexed by
# sub-question position further down (e.g. `keyword_results[i]`).
vector_results, keyword_results = queries.execute()

[32m2024-03-20 01:38:06 - INFO - 

Thought: To determine if the user is exempt from overtime payment, it's necessary to understand the criteria
for overtime exemption under New York State labor laws, especially for salaried employees and those
in managerial positions like 'Director of IT'.
Search topic: Overtime Exemption Criteria in New York[0m
[32m2024-03-20 01:38:06 - INFO - Running vector (OpenAI) search on: What are the criteria for overtime exemption for salaried employees in New York State?[0m
[32m2024-03-20 01:38:07 - INFO - Running keyword (SPLADE) search on: overtime exemption, salaried employees, New York State labor laws[0m
[32m2024-03-20 01:38:07 - INFO - Returning 10 records from vector search and 10 from keywords[0m
[32m2024-03-20 01:38:07 - INFO - Results contain 15 unique IDs[0m
[32m2024-03-20 01:38:07 - INFO - ---------------------------------------------------------------------------[0m
[32m2024-03-20 01:38:07 - INFO - 

Thought: Given the user's signifi

In [7]:
from src.search.doc_joiner import DocJoinerDF
from src.embedding_models.models import ColbertReranker
from src.search.threadpool import run_functions_tuples_in_parallel

In [8]:
# The two exact replies the LLM is instructed to give. Both are interpolated
# into the prompt below; NONUSEFUL_PAT is also what `llm_eval_chunk` compares
# the model reply against.
USEFUL_PAT = "Yes useful"
NONUSEFUL_PAT = "Not useful"
# This is an f-string: {USEFUL_PAT} and {NONUSEFUL_PAT} are substituted right
# here, while the doubled braces ({{chunk_text}}, {{user_query}}) collapse to
# single-brace placeholders that are filled in later with `str.format`.
CHUNK_FILTER_PROMPT = f"""
Determine if the reference section is USEFUL for answering the user query.
It is good enough for the section to be related or similar to the query, \
it should be relevant information that is USEFUL for answering the query.
If the section contains ANY useful information, that is good enough, \
it does not need to fully answer the every part of the user query.

Reference Section:
```
{{chunk_text}}
```

User Query:
```
{{user_query}}
```

Respond with EXACTLY AND ONLY: "{USEFUL_PAT}" or "{NONUSEFUL_PAT}"
""".strip()

In [9]:
from langchain.schema.messages import AIMessage
from langchain.schema.messages import BaseMessage
from langchain.schema.messages import HumanMessage
from langchain.schema.messages import SystemMessage

def dict_based_prompt_to_langchain_prompt(
    messages: list[dict[str, str]]
) -> list[BaseMessage]:
    """Turn role/content dictionaries into LangChain message objects.

    Args:
        messages: Dictionaries that each carry a ``role`` key ("user",
            "system" or "assistant") and a ``content`` key.

    Returns:
        One message object per input dictionary, in the same order:
        ``HumanMessage`` for "user", ``SystemMessage`` for "system" and
        ``AIMessage`` for "assistant".

    Raises:
        ValueError: If ``role`` or ``content`` is missing or empty, or if
            the role is not one of the three supported values.
    """
    # Checked in this order, mirroring the supported roles.
    role_to_message_cls = (
        ("user", HumanMessage),
        ("system", SystemMessage),
        ("assistant", AIMessage),
    )

    converted: list[BaseMessage] = []
    for message in messages:
        role = message.get("role")
        content = message.get("content")

        # Validate both fields before looking at which role it is.
        if not role:
            raise ValueError(f"Message missing `role`: {message}")
        if not content:
            raise ValueError(f"Message missing `content`: {message}")

        message_cls = next(
            (cls for known_role, cls in role_to_message_cls if role == known_role),
            None,
        )
        if message_cls is None:
            raise ValueError(f"Unknown role: {role}")

        converted.append(message_cls(content=content))

    return converted

In [10]:
from typing import Callable
from langchain_openai import ChatOpenAI

def llm_eval_chunk(query: str, chunk_content: str) -> bool:
    """Ask the LLM whether one chunk is useful for answering ``query``.

    The chunk text and the query are inserted into ``CHUNK_FILTER_PROMPT``,
    sent as a single user message to ``gpt-3.5-turbo``, and the reply is
    compared against ``NONUSEFUL_PAT``.

    Args:
        query: The user query the chunk is judged against.
        chunk_content: Text of the chunk being judged.

    Returns:
        ``False`` only when the model reply equals ``NONUSEFUL_PAT`` (after
        trimming whitespace and surrounding double quotes, case-insensitive);
        ``True`` for every other reply.
    """

    def _get_usefulness_messages() -> list[dict[str, str]]:
        """Build the single-message prompt for this chunk/query pair."""
        messages = [
            {
                "role": "user",
                "content": CHUNK_FILTER_PROMPT.format(
                    chunk_text=chunk_content, user_query=query
                ),
            },
        ]

        return messages

    def _extract_usefulness(model_output: "BaseMessage | str") -> bool:
        """Default 'useful' if the LLM doesn't match pattern exactly.
        This is because it's better to trust the (re)ranking if LLM fails"""
        # `llm.invoke` hands back a message object whose text lives in
        # `.content`; accept a plain string too so the helper matches what it
        # was originally annotated to take.
        reply = getattr(model_output, "content", model_output)
        if not isinstance(reply, str):
            # Non-text content cannot match the pattern, so keep the chunk
            # instead of crashing on `.strip()`.
            return True
        return reply.strip().strip('"').lower() != NONUSEFUL_PAT.lower()

    llm = ChatOpenAI(model='gpt-3.5-turbo')

    messages = _get_usefulness_messages()
    filled_llm_prompt = dict_based_prompt_to_langchain_prompt(messages)
    model_output = llm.invoke(filled_llm_prompt)

    return _extract_usefulness(model_output)


def llm_batch_eval_chunks(
    query: str, chunk_contents: list[str], use_threads: bool = True
) -> list[bool]:
    """Judge the usefulness of several chunks for one query.

    Args:
        query: The user query every chunk is judged against.
        chunk_contents: Chunk texts to evaluate.
        use_threads: When truthy, the evaluations are handed to
            ``run_functions_tuples_in_parallel``; otherwise they run one
            after another in this thread.

    Returns:
        One boolean per chunk, in input order. In the parallel path a
        ``None`` result is replaced by ``True`` so the chunk is kept.
    """
    if not use_threads:
        sequential_verdicts = []
        for chunk_content in chunk_contents:
            sequential_verdicts.append(llm_eval_chunk(query, chunk_content))
        return sequential_verdicts

    # One (function, args) job per chunk for the parallel runner.
    jobs: list[tuple[Callable, tuple]] = []
    for chunk_content in chunk_contents:
        jobs.append((llm_eval_chunk, (query, chunk_content)))

    print(
        "Running LLM usefulness eval in parallel (following logging may be out of order)"
    )
    raw_verdicts = run_functions_tuples_in_parallel(jobs, allow_failures=True)

    # In case of failure/timeout, don't throw out the chunk
    return [verdict if verdict is not None else True for verdict in raw_verdicts]

In [11]:
from llama_index_client import TextNode
from src.search.models import dataframe_to_text_nodes

def filter_chunks(
    query: str,
    chunks_to_filter: list[TextNode],
    max_llm_filter_chunks: int = 20,
) -> list[TextNode]:
    """Keep only the chunks the LLM judged useful for the query.

    Args:
        query (str): Query the chunks are judged against.
        chunks_to_filter (list[TextNode]): Candidate nodes; only the first
            ``max_llm_filter_chunks`` of them are evaluated, the rest are
            dropped from the result.
        max_llm_filter_chunks (int, optional): Upper bound on how many nodes
            are sent to the LLM. Defaults to 20.

    Returns:
        list[TextNode]: The evaluated nodes whose verdict was truthy, in
        their original order.
    """
    candidates = chunks_to_filter[:max_llm_filter_chunks]

    candidate_texts = []
    for candidate in candidates:
        candidate_texts.append(candidate.text)

    keep_flags = llm_batch_eval_chunks(
        query=query,
        chunk_contents=candidate_texts,
    )

    kept = []
    for position, candidate in enumerate(candidates):
        if keep_flags[position]:
            kept.append(candidate)
    return kept

In [12]:
# Metadata to carry onto each node: every DataFrame column except the
# `embeddings` one (`list.remove` raises ValueError if it is absent).
cols = df.columns.tolist()
cols.remove('embeddings')

# Position of the sub-question whose results are inspected in the next cells.
i = 0

# NOTE: `query` holds that sub-question's keyword *list*, not a query string.
query = queries.searches[i].sub_question_keywords
# Convert the keyword-search results for sub-question `i` into text nodes,
# with `index` as the id column and `body` as the text column.
nodes = dataframe_to_text_nodes(
    keyword_results[i],
    id_column='index', 
    text_col='body', 
    metadata_fields=cols
)

In [13]:
from src.search.models import text_nodes_to_dataframe

# Convert the nodes back into a DataFrame and show its first row.
node_df = text_nodes_to_dataframe(nodes)
node_df.head(1)

Unnamed: 0,text,index,created_utc,full_link,id,body,title,text_label,flair_label,token_count,llm_title,state,kmeans_label,topic_title,splade_embeddings
0,Okay so I have 2 questions. One regarding whe...,6076,1472610609,https://www.reddit.com/r/legaladvice/comments/...,50exvs,Okay so I have 2 questions. One regarding whe...,[NY] Exempt from overtime? (Salaried Software ...,employment,5,710,"""Legal Questions: Exempt from Overtime and Cla...",NE,8,Employment Legal Concerns and Issues,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [14]:
# The nodes are judged against the full `test_query`, not against the keyword
# list stored in `query`; `query` is only printed for reference.
filtered_chunks = filter_chunks(test_query, nodes)
print(f"\nReturned {len(filtered_chunks)} nodes")
print(f"{query}\n")
# Print every surviving node followed by a 50-character separator line.
for obj in filtered_chunks:
    print(obj)
    print("-" * 50)


Running LLM usefulness eval in parallel (following logging may be out of order)

Returned 8 nodes
['overtime exemption', 'salaried employees', 'New York State labor laws']

Node ID: 6076
Text: Okay so I have 2 questions.  One regarding whether I am exempt
from overtime payment.  The other regarding the wording of the paid
vacation days clause in my offer letter.  Okay so first off I live in
NY.  I work remotely from home.  The company I work for is also based
out of NY.  (Upstate, not the city, if that makes any difference).
When I w...
--------------------------------------------------
Node ID: 4229
Text: Im an Assistant Manager at a car dealership in Connecticut. We
have been short staffed for 4 months now (I can't hire anyone without
the approval of 2 other people) and I have been made to work unpaid
overtime almost every day to pick up all of the unfinished work.  I've
looked up a few sites about labor law and overtime law, but nothing is
real...
-----------------------------------

In [8]:
# Joiner configured with reciprocal-rank-fusion mode and top_k=20; the same
# instance is reused for the vector and the keyword result sets.
contater = DocJoinerDF(join_mode="reciprocal_rank_fusion", top_k=20)

In [9]:
# Run the joiner separately over the vector results and the keyword results,
# then display both resulting shapes side by side.
vector_res = contater.run(vector_results)
keyword_res = contater.run(keyword_results)
vector_res.shape, keyword_res.shape

((20, 17), (20, 17))

In [None]:
# Reranker instance pointed at the `body` column.
reranker = ColbertReranker(column='body')

In [None]:
# Copies of the joined results without the two embedding columns; these
# copies (not the originals) are what get passed to the reranker.
vec_res = vector_res.drop(columns=['embeddings', 'splade_embeddings'])
kw_res = keyword_res.drop(columns=['embeddings', 'splade_embeddings'])

In [None]:
# Rerank the vector and keyword result frames together against `test_query`.
hybrid_results = reranker.rerank_hybrid(test_query, vec_res, kw_res)

In [None]:
# (rows, columns) of the combined reranked results.
hybrid_results.shape

(40, 15)

In [None]:
# Drop rows whose `body` text repeats an earlier row (pandas keeps the first
# occurrence by default). `inplace=True` mutates `hybrid_results` itself.
hybrid_results.drop_duplicates(subset=['body'], inplace=True)
hybrid_results.shape

(35, 15)

In [None]:
# Count how many of the first 20 rows came from each `search_type`.
hybrid_results.head(20)['search_type'].value_counts()

search_type
splade    11
vector     9
Name: count, dtype: int64

In [None]:
from src.agent.tools.utils import aget_fact_patterns_df
import nest_asyncio
# nest_asyncio patches asyncio so an event loop can be re-entered while one
# is already running (the notebook kernel has its own loop).
# NOTE(review): presumably needed because the helper below drives the loop
# itself -- confirm, otherwise the top-level `await` alone would suffice.
nest_asyncio.apply()

# Arguments: the deduplicated results, then the column names 'body' and "id".
# The frame shown in the next cell carries an extra `summary` column.
prep_df = await aget_fact_patterns_df(hybrid_results, 'body', "id")

Processing summaries: 100%|██████████| 35/35 [00:11<00:00,  2.96it/s]


In [None]:
# Show the first two prepared rows.
prep_df.head(2)

Unnamed: 0,index,created_utc,full_link,id,body,title,text_label,flair_label,token_count,llm_title,state,kmeans_label,topic_title,search_type,_relevance_score,summary
0,5872,1576255862,https://www.reddit.com/r/legaladvice/comments/...,ea6gvy,"This will be a long post, but I really need so...",Who do I contact if I think my landlord is bei...,housing,7,1001,"""Legal issues with landlord: repairs, theft, r...",MH,3,Rental Property and Landlord Matters,splade,0.939139,A tenant in Missouri experiences multiple ongo...
1,9173,1422499769,https://www.reddit.com/r/legaladvice/comments/...,2u1eqk,This is a follow up post to [this post](http:/...,(Florida) Need to know how to dispute refusal ...,housing,7,665,"""Landlord withholding security deposit - How t...",ID,0,Legal Topics in Rental Properties,splade,0.870435,A renter details a series of problematic renta...


In [None]:
from src.agent.tools.utils import create_formatted_input, get_final_answer

# Build the model input from the prepared rows and `test_query`, passing a
# context token limit of 8000.
formatted_input = create_formatted_input(
    prep_df, test_query, context_token_limit=8000
    )

# Request the final answer from the model named below; the result exposes a
# `research_report` attribute that the next cell renders.
response_model = get_final_answer(formatted_input, model_name="gpt-4-turbo-preview")

In [None]:
# Render the report text as Markdown.
Markdown(response_model.research_report)

The new query presents a multifaceted legal issue involving poor maintenance, unresponsive landlords, potential unauthorized entry and theft by the landlord, and a rent payment dispute possibly being leveraged for eviction purposes. The tenant's situation in Missouri shares several parallels with past cases, providing an informative basis for comparison. 

The scenario with the tenant experiencing multiple ongoing issues, including unaddressed maintenance problems and unauthorized entry, closely mirrors the situation described in [Search Result 1](https://www.reddit.com/r/legaladvice/comments/ea6gvy/who_do_i_contact_if_i_think_my_landlord_is_being/). Both highlight the challenges faced by tenants dealing with landlords who are slow or refuse to make necessary repairs, coupled with the suspicion that the landlord is entering their premises without permission. The advice sought in both queries centers on how to handle these complex tenant-landlord relationship issues, especially regarding the landlord's responsibilities for upkeep and respect for tenant privacy. 

The challenge of dealing with a rent payment dispute, where the tenant has made a good faith effort to pay rent on time but faces difficulties due to mail delivery issues, aligns with the primary concern of [Search Result 1](https://www.reddit.com/r/legaladvice/comments/ea6gvy/who_do_i_contact_if_i_think_my_landlord_is_being/) and is further compounded by the landlord's threats and the inherent fear of eviction. This situation also resonates with concerns raised in [Search Result 22](https://www.reddit.com/r/legaladvice/comments/56uax5/missouri_landlord_attempting_to_evict/), where a Missouri resident faces eviction without proper notice and questionable rent increase demands. Both scenarios emphasize the precarious nature of tenant-landlord trust and the legal technicalities surrounding proper eviction processes and rent increase notices. 

Notably, the current query uniquely combines these elements into a single narrative, presenting a multifaceted legal dilemma. The past cases offer valuable insights into addressing each aspect individually but also illustrate the complexity of navigating these issues concurrently. Legal advice and intervention might be necessary to ensure that the tenant's rights are protected, especially when facing potential eviction under dubious circumstances. The incorporation of concerns regarding unauthorized entry, property theft, and exploitation of rent payment issues adds layers to the tenant's predicament, underscoring the importance of seeking comprehensive legal advice tailored to the specificities of the situation.

In [None]:
from src.search.query_expansion import generate_subquestions

# Generate sub-questions for `test_query`; `n` is passed as the free-text
# string 'any number of' rather than a number. Then display the questions.
questions = generate_subquestions(test_query, n='any number of')
questions.questions

['Should I just pay the rent and wait for my refund?',
 'If the post office shows that my original money order was cashed, am I out that money?',
 'What can I do about a landlord who is slow to make repairs?',
 'Is it legal for my landlord to enter my apartment without permission?',
 'Can a landlord raise rent in response to making repairs?',
 'How do I handle disruptive neighbors?',
 'What should I do if I suspect my landlord of stealing from me?',
 'What are my rights as a tenant in Missouri?',
 'Is it legal to use pliers to turn on water in lieu of a broken knob?',
 "What actions can I take if I've been treated unfairly by my landlord?"]

In [None]:
# Token usage recorded on the raw completion response.
# NOTE(review): `_raw_response` is underscore-prefixed, i.e. a private
# attribute that may not be stable across library versions.
questions._raw_response.usage

CompletionUsage(completion_tokens=147, prompt_tokens=1130, total_tokens=1277)

In [8]:
from src.search.query_filter import generate_query_plan, auto_filter_fts_search

In [9]:
# Build a query plan for `test_query`, offering only the `state` column as a
# filterable field, then apply the plan's filter(s) to the full frame.
# NOTE(review): `test_query` is whatever the kernel currently holds; it is
# not assigned in any cell shown in this notebook.
query_plan = generate_query_plan(
    input_df=df,
    query=test_query,
    filter_fields=[
        'state',
    ]
)
filtered_df = query_plan.filter_df(df=df)

[32m2024-03-19 21:42:45 - INFO - Schema shown to LLM: 
Name of each field, its type and unique values (up to 20):
* state (string);  Values - ['NM' 'IN' 'WY' 'NH' 'MP' 'PA' 'MH' 'ID' 'AR' 'MA' 'KS' 'AS' 'ND' 'PR'
 'DE' 'FL' 'LA' 'OR' 'VT' 'PW'], ... 39 more
        [0m


[32m2024-03-19 21:42:55 - INFO - Input DataFrame has 5,000 rows[0m
[32m2024-03-19 21:42:55 - INFO - Applying filter(s): state LIKE '%OR%'[0m
[32m2024-03-19 21:43:03 - INFO - Filtered DataFrame has 86 rows[0m


In [15]:
# Show the first two rows that passed the plan's filter.
filtered_df.head(2)

Unnamed: 0,index,created_utc,full_link,id,body,title,text_label,flair_label,embeddings,token_count,llm_title,state,kmeans_label,topic_title,splade_embeddings
0,2029,1578267399,https://www.reddit.com/r/legaladvice/comments/...,ekl2ef,For context I live in the Philippines. I wont ...,My professor refuses to show us ALL of our tes...,school,9,"[-0.00954271624451715, 0.007157037183387862, 0...",953,"""Unrevealed Grades and Lack of Transparency: S...",OR,9,Legal Consequences of False Accusations,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,3320,1591126549,https://www.reddit.com/r/legaladvice/comments/...,gve3nq,Edit: I live in Washington state.\n\nSo I live...,My landlord has been harassing me about my pet...,housing,7,"[-0.0034782202413053045, 0.00616729225832095, ...",759,"""Legal dispute over pet snake: Landlord threat...",OR,3,Rental Property and Landlord Matters,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [10]:
# Print the query as given, followed by the plan's rephrased version.
print(query_plan.original_query)
print(query_plan.rephrased_query)

Do I have any legal recourse here? I know Oregon is an 'at will' state, but it sounds like there are at LEAST two instances that offer grounds for wrongful termination (just based on my limited knowledge of the ADA, dept of labor, BOLI, etc.). 
legal recourse for wrongful termination in 'at will' employment including issues related to mistreatment, health code violations, improper handling of company money, and potential discrimination due to medical conditions


In [13]:
# Search the full frame for the single keyword 'marijuana', asking for 20
# results and again offering only `state` as a filterable field.
test_res = auto_filter_fts_search(
    df=df,
    query='marijuana',
    top_k=20,
    text_column="body",
    embeddings_column="embeddings",
    filter_fields=[
        'state',
    ])

[32m2024-03-19 21:44:19 - INFO - Schema shown to LLM: 
Name of each field, its type and unique values (up to 20):
* state (string);  Values - ['NM' 'IN' 'WY' 'NH' 'MP' 'PA' 'MH' 'ID' 'AR' 'MA' 'KS' 'AS' 'ND' 'PR'
 'DE' 'FL' 'LA' 'OR' 'VT' 'PW'], ... 39 more
        [0m


[32m2024-03-19 21:44:22 - INFO - No filters were identified for query: marijuana[0m
[32m2024-03-19 21:44:22 - INFO - Revised query: marijuana[0m
[32m2024-03-19 21:44:23 - INFO - Full Text Search (FTS) search yielded a DataFrame with 20 rows[0m


In [14]:
# Render the `body` text of the first returned row as Markdown.
Markdown(test_res['body'].tolist()[0])

I'm a New York State medical marijuana patient. I also work in healthcare. I applied to a new job at a new hospital, and they are discriminating against me for being a medical marijuana patient. I was offered the job and accepted, but when I went to get my pre-employment physical conducted, I gave them my medical marijuana card and informed them that I am a patient. They are now refusing to hire me. Is this legal? I already contacted the division of human rights at the labor department and they said I may or may not have a case.