In [1]:
import os
from dotenv import load_dotenv

# Turn on LangChain tracing and point it at the LangSmith endpoint.
os.environ.update({
    'LANGCHAIN_TRACING_V2': 'true',
    'LANGCHAIN_ENDPOINT': 'https://api.smith.langchain.com',
})

# Load variables from a .env file into the process environment, then read the
# API keys from there so that no secret is hardcoded in the notebook.
# Either key is None when the corresponding variable is not defined.
load_dotenv()
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
LANGCHAIN_API_KEY = os.environ.get('LANGCHAIN_API_KEY')

In [2]:
#### INDEXING ####
# Fetch one blog post, cut it into token-sized chunks, embed the chunks into a
# Chroma vector store, and expose that store as `retriever`.
#
# NOTE: running this cell prints a warning when the USER_AGENT environment
# variable is unset (see the cell output); define USER_AGENT, for example in
# .env, to identify the requests this cell makes.

# Imports used by this cell, in their original relative order.
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Load: restrict HTML parsing to the elements carrying the post's content,
# title and header CSS classes.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        ),
    },
)
blog_docs = loader.load()

# Split: the splitter is built from a tiktoken encoder with a chunk size of
# 300 and an overlap of 50.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300,
    chunk_overlap=50,
)
splits = text_splitter.split_documents(blog_docs)

# Index: embed every chunk with OpenAI embeddings and store it in Chroma.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
)

# Retriever over the vector store, created with default settings.
retriever = vectorstore.as_retriever()

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [4]:
import fitz  # PyMuPDF
from langchain.document_loaders import TextLoader

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    The document is opened with PyMuPDF (``fitz``) inside a ``with`` block so
    it is closed again once all pages have been read. Page texts are joined
    in iteration order with no separator added between pages; a document with
    no pages yields an empty string.
    """
    with fitz.open(pdf_path) as pdf:
        page_texts = [page.get_text() for page in pdf]
    return "".join(page_texts)

In [5]:
# Extract the raw text of the four evaluation PDFs: for each of the two sets
# (Raptor and Robinson), the contract itself and its companion Q&A document.
# The paths are relative, so they resolve against the kernel's working directory.
contract_text_raptor = extract_text_from_pdf('../data/Evaluation Sets/Raptor Contract.pdf')
qa_text_raptor = extract_text_from_pdf('../data/Evaluation Sets/Raptor Q&A.pdf')
contract_text_robinson = extract_text_from_pdf('../data/Evaluation Sets/Robinson Advisory.pdf')
qa_text_robinson = extract_text_from_pdf('../data/Evaluation Sets/Robinson Q&A.pdf')

In [6]:
import re

def _group_labelled_lines(text, prefix):
    """Collect the text of ``<prefix><n>[a-z]: ...`` lines, folding sub-entries into their parent.

    Returns one string per group, in order of appearance.
    """
    # The label is captured alongside the text: the lowercase letter suffix is
    # the only thing that marks a sub-entry, and it is not part of the text.
    # `.` never matches a newline, so `(.*)` takes the rest of the line and
    # still matches a final line that has no trailing newline.
    pattern = re.compile(prefix + r'(\d+[a-z]?): (.*)')

    groups = []
    for label, body in pattern.findall(text):
        body = body.strip()
        if groups and label[-1].isalpha():
            # Lettered label (e.g. "2a"): belongs to the entry before it.
            groups[-1].append(body)
        else:
            # Plain label, or a lettered label with nothing to attach to.
            groups.append([body])
    return [' '.join(parts) for parts in groups]


def parse_qa_pairs(text):
    """Parse ``Q<n>: ...`` / ``A<n>: ...`` lines into question/answer pairs.

    Every ``Q<n>: ...`` line is a question and every ``A<n>: ...`` line is an
    answer. An entry whose label carries a lowercase letter suffix (``Q2a``,
    ``A2b``) is treated as a sub-entry: its text is appended, separated by a
    single space, to the entry before it instead of forming a pair of its
    own. Question groups and answer groups are then paired by position.

    Args:
        text: Raw text containing the labelled lines.

    Returns:
        A list of ``{"question": str, "answer": str}`` dicts in order of
        appearance. When the number of question groups and answer groups
        differ, the surplus groups are dropped.

    Note:
        Only the remainder of the label's own line is captured; text that
        wraps onto following lines is not included.
    """
    questions = _group_labelled_lines(text, 'Q')
    answers = _group_labelled_lines(text, 'A')

    # zip() stops at the shorter list, so unmatched trailing groups are ignored.
    return [
        {"question": question, "answer": answer}
        for question, answer in zip(questions, answers)
    ]

# Parse the Raptor Q&A text into a list of {"question": ..., "answer": ...} dicts.
qa_pairs_raptor = parse_qa_pairs(qa_text_raptor)