In [1]:
import os
from dotenv import load_dotenv

os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [2]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

#### INDEXING ####

# Load Documents
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()

# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Embed
vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=OpenAIEmbeddings())

retriever = vectorstore.as_retriever()

#### RETRIEVAL and GENERATION ####

# Prompt
prompt = hub.pull("rlm/rag-prompt")

# LLM
llm = ChatOpenAI(model_name="gpt-4o", temperature=0)

# Post-processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Question
rag_chain.invoke("What is Task Decomposition?")

USER_AGENT environment variable not set, consider setting it to identify your requests.


'Task Decomposition is the process of breaking down a complex task into smaller, more manageable steps. This can be achieved using techniques like Chain of Thought (CoT) or Tree of Thoughts, which help in systematically addressing each part of the task. It can be done through simple prompting, task-specific instructions, or human inputs.'

In [3]:
# Documents
question = "What kinds of pets do I like?"
document = "My favorite pet is a cat."

In [4]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

num_tokens_from_string(question, "cl100k_base")

8

Text embedding models

In [5]:
from langchain_openai import OpenAIEmbeddings
embd = OpenAIEmbeddings()
query_result = embd.embed_query(question)
document_result = embd.embed_query(document)
len(query_result)

1536

cosine similarity

In [6]:
import numpy as np

def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

similarity = cosine_similarity(query_result, document_result)
print("Cosine Similarity:", similarity)

Cosine Similarity: 0.8807044730847644


Loading Document

In [7]:
import fitz  # PyMuPDF
from langchain.document_loaders import TextLoader

def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text()
    return text

In [8]:
contract_text_raptor = extract_text_from_pdf('../data/Evaluation Sets/Raptor Contract.pdf')
qa_text_raptor = extract_text_from_pdf('../data/Evaluation Sets/Raptor Q&A.pdf')
contract_text_robinson = extract_text_from_pdf('../data/Evaluation Sets/Robinson Advisory.pdf')
qa_text_robinson = extract_text_from_pdf('../data/Evaluation Sets/Robinson Q&A.pdf')

In [9]:
# Chunking
def chunk_text(text, chunk_size=500, overlap=50):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = ' '.join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

In [10]:
contract_chunks_raptor = chunk_text(qa_text_raptor, chunk_size=300, overlap=50)

In [11]:
qa_chunks_raptor = chunk_text(contract_text_raptor, chunk_size=300, overlap=50)

In [12]:
documents_raptor = [Document(page_content=chunk) for chunk in contract_chunks_raptor]

In [13]:
documents_raptor_qa = [Document(page_content=chunk) for chunk in qa_chunks_raptor]

Vectorstores

In [14]:
from langchain_community.vectorstores import FAISS

# Create embeddings
embeddings = OpenAIEmbeddings()

vectorstore_raptor = FAISS.from_documents(documents_raptor, embeddings)
vectorstore_raptor_qa = FAISS.from_documents(documents_raptor_qa, embeddings)

## Reranking

In [15]:
from sentence_transformers.cross_encoder import CrossEncoder

# Initialize a cross-encoder model for reranking
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# Define a function to rerank retrieved documents
def rerank_documents(query, retrieved_docs, reranker):
    pairs = [(query, doc.page_content) for doc in retrieved_docs]
    scores = reranker.predict(pairs)
    ranked_docs = sorted(zip(scores, retrieved_docs), key=lambda x: x[0], reverse=True)
    return [doc for _, doc in ranked_docs]

  from tqdm.autonotebook import tqdm, trange


### Retrieval

In [16]:
def retrieve_and_rerank(query, vectorstore, reranker, top_k=5):
    retrieved_docs = vectorstore.similarity_search(query, k=top_k)
    ranked_docs = rerank_documents(query, retrieved_docs, reranker)
    return ranked_docs

In [17]:
from langchain.chains import RetrievalQA
from langchain.chains import LLMChain

In [18]:
qa_pipeline_raptor = RetrievalQA.from_llm(llm=llm, retriever=vectorstore_raptor.as_retriever())

### Prompt Engineering

In [19]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

In [20]:
# LLM
llm = ChatOpenAI(model_name="gpt-4o", temperature=0)

In [21]:
prompt_template = """You are a legal assistant. Answer the question based on the given context.
Context: {context}
Question: {question}
Answer:"""

In [22]:
# Create the RAG pipeline
retrieval_qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore_raptor.as_retriever(),
    return_source_documents=True,
    verbose=True,
    chain_type_kwargs={
        "verbose": True,
        "prompt": prompt
    }
)

### Parse Q&A pairs

In [23]:
import re

def parse_qa_pairs(text):
    # Regular expressions for matching questions and answers
    question_pattern = re.compile(r'Q\d+[a-z]?: (.*?)\n')
    answer_pattern = re.compile(r'A\d+[a-z]?: (.*?)\n')
    
    # Find all questions and answers
    questions = question_pattern.findall(text)
    answers = answer_pattern.findall(text)
    
    # Group questions and answers
    qa_pairs = []
    q_index = 0
    a_index = 0
    
    while q_index < len(questions) and a_index < len(answers):
        question = questions[q_index].strip()
        answer = answers[a_index].strip()
        
        # Check if the next question or answer is a sub-question/sub-answer
        while (q_index + 1 < len(questions) and re.match(r'Q\d+[a-z]:', questions[q_index + 1]) or
               a_index + 1 < len(answers) and re.match(r'A\d+[a-z]:', answers[a_index + 1])):
            sub_questions = []
            sub_answers = []
            
            # Collect sub-questions
            while q_index + 1 < len(questions) and re.match(r'Q\d+[a-z]:', questions[q_index + 1]):
                q_index += 1
                sub_questions.append(questions[q_index].strip())
            
            # Collect sub-answers
            while a_index + 1 < len(answers) and re.match(r'A\d+[a-z]:', answers[a_index + 1]):
                a_index += 1
                sub_answers.append(answers[a_index].strip())
            
            question += ' ' + ' '.join(sub_questions)
            answer += ' ' + ' '.join(sub_answers)
        
        # Append the Q&A pair to the list
        qa_pairs.append({"question": question, "answer": answer})
        
        # Move to the next question and answer
        q_index += 1
        a_index += 1
    
    return qa_pairs

qa_pairs_raptor = parse_qa_pairs(qa_text_raptor)

In [24]:
qa_pairs_raptor

[{'question': 'Under what circumstances and to what extent the Sellers are responsible for a breach of',
  'answer': 'Except in the case of fraud, the Sellers have no liability for breach of representations and'},
 {'question': 'Would the Sellers be responsible if after the closing it is determined that there were',
  'answer': 'No'},
 {'question': 'How much is the escrow amount?',
  'answer': 'The escrow amount is equal to $1,000,000.'},
 {'question': 'Is escrow amount grete then the Retention Amount ?',
  'answer': 'No.'},
 {'question': 'What is the purpose of the escrow?',
  'answer': 'To serve as a recourse of the Buyer in case of post-closing adjustments of the purchase price.'},
 {'question': 'May the Escrow Amount serve as a recourse for the Buyer in case of breach of',
  'answer': 'No'},
 {'question': 'Are there any conditions to the closing?',
  'answer': 'No, as the signing and closing are simultaneous.'},
 {'question': 'Are Change of Control Payments considered a Seller Tran

In [25]:
# Evaluation 
def evaluate_model(qa_pipeline, qa_pairs):
    correct = 0
    total = len(qa_pairs)
    
    for qa in qa_pairs:
        question = qa['question']
        expected_answer = qa['answer']
        
        # Get the generated answer from the RAG pipeline
        generated_answer = qa_pipeline.run(question)
        
        # Compare the generated answer to the expected answer
        if generated_answer.strip().lower() == expected_answer.strip().lower():
            correct += 1
        else:
            print(f"Question: {question}")
            print(f"Expected: {expected_answer}")
            print(f"Generated: {generated_answer}")
            print("---")
    
    accuracy = correct / total
    return accuracy

In [26]:
# Run evaluation
accuracy_raptor = evaluate_model(qa_pipeline_raptor, qa_pairs_raptor)
print(f"Raptor Contract Accuracy: {accuracy_raptor * 100:.2f}%")

  warn_deprecated(


Question: Under what circumstances and to what extent the Sellers are responsible for a breach of
Expected: Except in the case of fraud, the Sellers have no liability for breach of representations and
Generated: The Sellers are responsible for a breach of representations and warranties only in the case of fraud. In all other circumstances, the Sellers have no liability for such breaches.
---
Question: Would the Sellers be responsible if after the closing it is determined that there were
Expected: No
Generated: No, the Sellers would not be responsible if after the closing it is determined that there were inaccuracies in the representation provided by them, even if such inaccuracies are the result of the Sellers’ gross negligence. The Sellers have no liability for breach of representations and warranties except in the case of fraud.
---
Question: Is escrow amount grete then the Retention Amount ?
Expected: No.
Generated: No. The escrow amount is not greater than the Retention Amount.
---