In [1]:
from langchain_voyageai import VoyageAIEmbeddings
from langchain_community.vectorstores import LanceDB
import os
from dotenv import load_dotenv
load_dotenv()

# Voyage AI credentials come from the environment (load_dotenv() has already run).
voyage_api_key = os.getenv("VOYAGE_API_KEY")

# Location of the LanceDB database. It can be overridden with the LANCE_DB_URI
# environment variable so the notebook is not tied to one machine; when the
# variable is unset the original local path is used, so existing runs behave
# exactly as before.
LANCE_DB_URI = os.getenv(
    "LANCE_DB_URI", "/Users/vaibhavnakrani/usa_laws/uscode/lance"
)

# Embedding model used for the vector store.
embeddings = VoyageAIEmbeddings(
    voyage_api_key=voyage_api_key,
    model="voyage-3-lite",
    show_progress_bar=True,
    truncation=True,
)

# Vector store over the "usa_code" table, queried with cosine distance.
vector_store = LanceDB(
    embedding=embeddings,
    table_name="usa_code",
    uri=LANCE_DB_URI,
    distance="cosine",
)

In [2]:
# Retriever backed by the LanceDB vector store, configured with k=2.
normal_retriever = vector_store.as_retriever(
    search_kwargs={"k": 2},
)

In [3]:
# Build and save the BM25 retriever (one-time step).
# from langchain_core.documents import Document

# tbl = vector_store.get_table()
# pd_df = tbl.to_pandas()
# pd_df.head()

# documents = []
# for _, row in pd_df.iterrows():
#     # Extract text and metadata
#     text = row['text']  # or whatever column contains your document text
#     metadata = row["metadata"]
    
#     # Create Document object
#     doc = Document(page_content=text, metadata=metadata)
#     documents.append(doc)

# from langchain_community.retrievers import BM25Retriever

# bm25_retriever = BM25Retriever.from_documents(documents)
# bm25_retriever.k = 2

# import pickle

# # Save the BM25Retriever
# with open('bm25_retriever.pkl', 'wb') as f:
#     pickle.dump(bm25_retriever, f)

In [4]:
import pickle

# Restore the BM25 retriever from its pickle file in the working directory.
# NOTE: pickle.load can execute arbitrary code during deserialization; only
# load a bm25_retriever.pkl that you produced yourself.
with open("bm25_retriever.pkl", "rb") as pickle_file:
    bm25_retriever = pickle.load(pickle_file)


In [5]:
from langchain.retrievers import EnsembleRetriever

import copy

# The previous version called `.with_config({"kwargs": {"k": 1}})` on each
# retriever. `with_config` takes a RunnableConfig (tags, callbacks,
# configurable, ...); "kwargs" is not one of its keys and neither retriever
# reads it, so both kept their own k. The intended k=1 is applied explicitly
# below instead.

# BM25 exposes its result count as the `k` attribute. Set it on a shallow copy
# so the override is scoped to the ensemble.
# NOTE(review): confirm copy.copy yields an independent instance on the
# installed pydantic version; otherwise this also sets k on bm25_retriever.
bm25_top1 = copy.copy(bm25_retriever)
bm25_top1.k = 1

# The vector-store retriever takes its result count through search_kwargs, so
# build a separate k=1 retriever and leave normal_retriever (k=2) unchanged.
vector_top1 = vector_store.as_retriever(search_kwargs={"k": 1})

# Hybrid retriever: lexical (BM25) and embedding results fused with equal weight.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_top1, vector_top1],
    weights=[0.5, 0.5],
)

In [6]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain_voyageai import VoyageAIRerank

# Voyage AI reranker configured with the "rerank-2-lite" model and top_k=3.
compressor = VoyageAIRerank(
    top_k=3,
    model="rerank-2-lite",
    voyageai_api_key=voyage_api_key,
)

# Wrap the vector retriever so its results are passed through the reranker.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=normal_retriever,
    base_compressor=compressor,
)

In [7]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt text with two placeholders:
#   {context}  - the US Code content (described to the model as XML)
#   {question} - the user's question
# It instructs the model to wrap its answer in a ```markdown fenced block with
# a fixed section layout (Source / Current Law / Legislative History /
# Important Notes).
template = """You are analyzing the United States Code and related legislative documents. Below is relevant content from the US Code in XML format:
{context}

Based solely on the provided content, compose a structured response surrounded by ```markdown``` tags that:

1. Begins with the relevant US Code title and section reference
2. Includes source credit in proper format:
   - For main law: (Pub. L. XX-XXX, §X, Date, XX Stat. XXXX)
   - For amendments: As amended Pub. L. XX-XXX, §X, Date, XX Stat. XXXX
3. Explains provisions clearly with proper markdown formatting
4. Uses the following structure:

```markdown
# [Title Number] U.S.C. § [Section Number] - [Section Title]

## Source
[Source credits in standard legal citation format]

## Current Law
[Main content explanation]

## Legislative History
[Relevant amendments and changes]

## Important Notes
[Relevant findings and additional provisions]
```

Ensure all responses maintain consistent markdown formatting and are wrapped in markdown tags.
Question: {question}
Answer: """

# Build the chat prompt object from the template string above.
prompt = ChatPromptTemplate.from_template(template)

In [8]:
import os
from dotenv import load_dotenv
load_dotenv()

# Make sure GOOGLE_API_KEY is present in the process environment.
# os.getenv returns None when the variable is missing, and assigning None into
# os.environ raises an opaque "str expected, not NoneType" TypeError, so fail
# early with an explicit message instead.
google_api_key = os.getenv("GOOGLE_API_KEY")
if google_api_key is None:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it or add it to the .env file "
        "before running this notebook."
    )
os.environ["GOOGLE_API_KEY"] = google_api_key

In [9]:
from langchain_google_genai import ChatGoogleGenerativeAI
from google.generativeai.types.safety_types import HarmBlockThreshold, HarmCategory

# Safety thresholds passed to the Gemini chat model: both listed categories
# are set to BLOCK_ONLY_HIGH.
safety_settings = {
    HarmCategory.HARM_CATEGORY_UNSPECIFIED: HarmBlockThreshold.BLOCK_ONLY_HIGH,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
}

llm = ChatGoogleGenerativeAI(
    safety_settings=safety_settings,
    model="gemini-1.5-pro-latest",
)

# One-off call to the model; `res` is reassigned by the chain invocation later.
res = llm.invoke("Write me a ballad about LangChain")

In [10]:
import re

def parse_markdown_tags(text: str) -> str:
    """Extract the body of the first ```markdown fenced block in ``text``.

    Args:
        text: Raw model output, expected to contain a block opened with
            ```markdown and closed with ```.

    Returns:
        The block's content with surrounding whitespace stripped. If the
        closing fence is missing (for example, the response was cut off),
        everything after the opening fence is returned. Returns "" when
        ``text`` is empty or contains no ```markdown opening fence.
    """
    if not text:
        return ""

    # The closing fence is optional (``` or end of input) so a truncated
    # response still yields its content instead of an empty string.
    # IGNORECASE tolerates variants such as ```Markdown.
    pattern = r'```markdown\s*(.*?)\s*(?:```|\Z)'
    match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)

    if match:
        return match.group(1).strip()
    return ""

In [11]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from typing import Dict, List
from langchain.globals import set_debug

# Turn on LangChain's global debug mode; chain runs then log each step's
# input and output (the [chain/start] / [chain/end] trace in the cell output).
set_debug(True)


def format_docs(docs):
    """Build the prompt context from the documents' ``xml_content`` metadata.

    Each document's ``metadata["xml_content"]`` is collected; documents whose
    value is missing or empty are skipped. The remaining values are joined
    with a blank line between them.
    """
    xml_chunks = []
    for document in docs:
        xml = document.metadata.get("xml_content", "")
        if xml:
            xml_chunks.append(xml)
    return "\n\n".join(xml_chunks)

# Pipeline: retrieve documents and format their XML into the prompt context,
# pass the question through unchanged, fill the prompt, call the LLM, convert
# the reply to a plain string, then extract the fenced markdown block.
chain_inputs = {
    "context": normal_retriever | format_docs,
    "question": RunnablePassthrough(),
}
chain = chain_inputs | prompt | llm | StrOutputParser() | parse_markdown_tags

In [12]:
# Run the full pipeline on a sample question.
question = "What is the laws related to cosmetics ?"
res = chain.invoke(question)

[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "What is the laws related to cosmetics ?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question>] Entering Chain run with input:
[0m{
  "input": "What is the laws related to cosmetics ?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "What is the laws related to cosmetics ?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > chain:RunnablePassthrough] Entering Chain run with input:
[0m{
  "input": "What is the laws related to cosmetics ?"
}
[36;1m[1;3m[chain/end][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > chain:RunnablePassthrough] [0ms] Exiting Chain run with output:
[0m{
  "output": "What is the laws related 