In [2]:
from dotenv import load_dotenv
load_dotenv()

import warnings
warnings.filterwarnings("ignore")

import os, requests, git, shutil
from collections import defaultdict
from itertools import chain

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores import Chroma
from langchain_community.llms import HuggingFaceEndpoint
from langchain.storage import InMemoryStore

from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

from langchain_community.document_loaders import PyMuPDFLoader


embedding_model_list = ['sentence-transformers/all-MiniLM-L6-v2', 'BAAI/bge-small-en-v1.5', 'BAAI/bge-large-en-v1.5']
inference_model_list = ['google/gemma-2b-it', 'google/gemma-7b-it', 'mistralai/Mixtral-8x7B-Instruct-v0.1']

HF_READ_API_KEY = os.environ["HF_READ_API_KEY"]

# Helper function for extracting text from Langchain's Document object
def get_text(docs):
    return [d.page_content for d in docs]

inference_model_name = inference_model_list[2]

model_parameters = {'max_new_tokens': 512, 'temperature': 0.3, 'top_p': 0.95}

llm_model = HuggingFaceEndpoint(
            repo_id=inference_model_name,                     
            max_new_tokens=model_parameters['max_new_tokens'], 
            temperature=model_parameters['temperature'], 
            top_p=model_parameters['top_p'],
            huggingfacehub_api_token=HF_READ_API_KEY
        )

hf = HuggingFaceEmbeddings(model_name=embedding_model_list[0])

chunking_parameters = {'chunk_size': 512, 'chunk_overlap': 128}

text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunking_parameters['chunk_size'], chunk_overlap=chunking_parameters['chunk_overlap'])

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to C:\Users\playgrdstar\.cache\huggingface\token
Login successful


In [6]:
loader = PyMuPDFLoader("https://www.berkshirehathaway.com/letters/2023ltr.pdf")
docs = loader.load()


In [7]:
texts = text_splitter.split_documents(docs)

In [8]:
query = " What investments are good?"

In [9]:
## Simple Vectorstore Retriever
## To get a standard retriever from a vectorstore, just need to get VectorStoreRetriever from the as_retriever method

In [10]:
vector_db_from_docs = Chroma.from_documents(texts, hf)

In [11]:
simple_retriever = vector_db_from_docs.as_retriever(search_kwargs={"k": 5})
retrieved_docs = simple_retriever.get_relevant_documents(query)
print(get_text(retrieved_docs))

['at a premium. \nThe lesson from Coke and AMEX? When you find a truly wonderful business, stick with \nit. Patience pays, and one wonderful business can offset the many mediocre decisions that \nare inevitable. \n* * * * * * * * * * * * \nThis year, I would like to describe two other investments that we expect to maintain \nindefinitely. Like Coke and AMEX, these commitments are not huge relative to our resources. \nThey are worthwhile, however, and we were able to increase both positions during 2023.', 'think to predict which will be the winners and losers. And those who tell you they know the answer \nare usually either self-delusional or snake-oil salesmen. \nAt Berkshire, we particularly favor the rare enterprise that can deploy additional capital at \nhigh returns in the future. Owning only one of these companies – and simply sitting tight – can \ndeliver wealth almost beyond measure. Even heirs to such a holding can – ugh! – sometimes live \na lifetime of leisure.', 'is now. \nO

In [12]:
## Parent Document Retriever

In [13]:
from langchain.retrievers import ParentDocumentRetriever
parent_text_splitter = child_text_splitter = text_splitter

In [14]:
vector_db = Chroma(collection_name="parent_child", embedding_function=hf)
# The storage layer for the parent documents
store = InMemoryStore()

pr_retriever = ParentDocumentRetriever(
    vectorstore=vector_db,
    docstore=store,
    child_splitter=child_text_splitter,
    parent_splitter=parent_text_splitter,
)
pr_retriever.add_documents(docs)

In [15]:
retrieved_docs = pr_retriever.get_relevant_documents(query)
print(get_text(retrieved_docs))

["But before we get into VAR, we first need to discuss what value we are assessing risk against. What we want to measure would be the change in market prices over a time period (e.g. day to day). So what VAR would then tell us then would be how much we could lose (or gain) due to the change in prices. It's quite common to use lognormal instead of normal returns when computing the change in prices.\n\nUseful links which provide more information on the difference between the two -", '---\nlayout: default\ntitle:  Measuring Market Risk in Python\ndescription: Measuring Market Risk in Python\ndate:   2021-01-26 00:00:00 +0000\npermalink: /market_risk/\ncategory: Finance\n---\n## Measuring Market Risk in Python\n\nVAR is a common term that one would usually come across in finance when it comes to the measurement of market risks.\n\nVAR, or Value At Risk, is basically a measure of the potential losses that one could face, at a specific level of confidence - e.g. 99%.', 'If you recall the bas

In [35]:
# MultiQueryRetriever

In [87]:
def multi_query_retriever(query, llm, retriever):
    DEFAULT_QUERY_PROMPT = PromptTemplate(
        input_variables=["question"],
        template="""You are an AI language model assistant. Your task is 
        to generate 3 different versions of the given user 
        question to retrieve relevant documents from a vector  database. 
        By generating multiple perspectives on the user question, 
        your goal is to help the user overcome some of the limitations 
        of distance-based similarity search. Provide these alternative 
        questions separated by newlines. Original question: {question}""",
    )
    mq_llm_chain = LLMChain(llm=llm, prompt=DEFAULT_QUERY_PROMPT)
    
    generated_queries = parse_lines(mq_llm_chain.invoke(query)['text'])
    all_queries = [query] + generated_queries
    
    all_retrieved_docs = []
    for q in all_queries:
        retrieved_docs = retriever.get_relevant_documents(q)
        all_retrieved_docs.extend(retrieved_docs)
    
    unique_retrieved_docs = [doc for i, doc in enumerate(all_retrieved_docs) if doc not in all_retrieved_docs[:i]]
    
    return get_text(unique_retrieved_docs)

In [88]:
multi_query_retriever(query, llm_model, simple_retriever)

['---\nlayout: default\ntitle:  How normal are you? Checking distributional assumptions.\ndescription: How normal are you? Checking distributional assumptions.\ndate:   2021-01-27 00:00:00 +0000\npermalink: /normality_distribution_test/\ncategory: Finance\n---\n## How normal are you? Checking distributional assumptions.\n\nThe need to understand the underlying distribution of data is critical in most parts of quantitative finance. Statistical tests can be applied for this purpose.',
 '[1]:\thttps://medium.com/quaintitative/data-exploration-in-pandas-f7cd1a3b3594\n[2]:\thttps://medium.com/quaintitative/quickstart-to-visualising-and-analysing-financial-data-with-pandas-bbd835c9c560\n[3]:\thttps://playgrdstar.github.io/portfolio_optimisation_with_tensorflow/\n[4]:\thttps://medium.com/creative-coding-space/3-days-of-hand-coding-visualisations-introduction-64da30d8793f\n[5]:\thttps://github.com/playgrdstar/portfolio_optimisation_with_tensorflow',
 "And since I will almost certainly forget w

In [72]:
## Contextual Compression

In [52]:
from langchain.retrievers.document_compressors import LLMChainExtractor, LLMChainFilter, EmbeddingsFilter


In [56]:
chain_extractor = LLMChainExtractor.from_llm(llm_model)
filter_extractor = LLMChainFilter.from_llm(llm_model)
embeddings_filter = EmbeddingsFilter(embeddings=hf, similarity_threshold=0.75)
retrieved_docs = simple_retriever.get_relevant_documents(query)
compressed_docs = chain_extractor.compress_documents(retrieved_docs, query)


In [90]:
def compressed_retrieval(query, llm, retriever, extractor_type='chain', embedding_model=None):
    retrieved_docs = retriever.get_relevant_documents(query)
    if extractor_type == 'chain':
        extractor = LLMChainExtractor.from_llm(llm)
    elif extractor_type == 'filter':
        extractor = LLMChainFilter.from_llm(llm)
    elif extractor_type == 'embeddings':
        if hf is None:
            raise ValueError("hf (embeddings model) must be provided for the embeddings extractor.")
        extractor = EmbeddingsFilter(embeddings=hf, similarity_threshold=0.75)
    else:
        raise ValueError("Invalid extractor_type. Options are 'chain', 'filter', or 'embeddings'.")
    compressed_docs = extractor.compress_documents(retrieved_docs, query)

    return get_text(compressed_docs)

In [91]:
compressed_retrieval(query, llm_model, simple_retriever, extractor_type='chain')

['- The need to understand the underlying distribution of data is critical in most parts of quantitative finance.',
 '[2]:\thttps://medium.com/quaintitative/quickstart-to-visualising-and-analysing-financial-data-with-pandas-bbd835c9c560\n[5]:\thttps://github.com/playgrdstar/portfolio_optimisation_with_tensorflow',
 'None of the context is relevant to the question.',
 '- Oct. 2021: "Learning Knowledge-Enriched Company Embeddings for Investment Management" accepted by the 2nd ACM International Conference on AI in Finance (ICAIF 2021)\n\nReasoning: The context mentions a paper about "Learning Knowledge-Enriched Company Embeddings for Investment Management" which is relevant to finance as it is about investment management.',
 'Finance\nMonte Carlo Simulation of Value at Risk in Python\n\nReturn:\nFinance\nMonte Carlo Simulation of Value at Risk in Python']

In [82]:
## Ensemble Retriever

In [63]:
from langchain.retrievers import BM25Retriever
# combine with the simple_retriever, pr_retriever, and mq_retriever
bm25_retriever = BM25Retriever.from_documents(docs)
all_retrievers = [simple_retriever, pr_retriever, bm25_retriever]

In [95]:
def ensemble_retriever(query, retrievers_list, c=60):

    retrieved_docs_by_retriever = [retriever.get_relevant_documents(query) for retriever in all_retrievers]
    weights = [1 / len(all_retrievers)] * len(all_retrievers)

    # Calculate RRF scores for all documents
    rrf_score = defaultdict(float)
    for doc_list, weight in zip(retrieved_docs_by_retriever, weights):
        for rank, doc in enumerate(doc_list, start=1):
            rrf_score[doc.page_content] += weight / (rank + c)

    # Chain all document lists into a single iterable
    all_docs = chain.from_iterable(retrieved_docs_by_retriever)

    # Define function to yield unique documents based on a key
    def unique_by_key(iterable, key_func):
        seen = set()
        for element in iterable:
            key = key_func(element)
            if key not in seen:
                seen.add(key)
                yield element

    # Sort documents by RRF score in descending order
    sorted_docs = sorted(
        unique_by_key(all_docs, lambda doc: doc.page_content),
        key=lambda doc: rrf_score[doc.page_content],
        reverse=True
    )

    return get_text(sorted_docs)

In [96]:
ensemble_retriever(query, all_retrievers)

['---\nlayout: default\ntitle:  How normal are you? Checking distributional assumptions.\ndescription: How normal are you? Checking distributional assumptions.\ndate:   2021-01-27 00:00:00 +0000\npermalink: /normality_distribution_test/\ncategory: Finance\n---\n## How normal are you? Checking distributional assumptions.\n\nThe need to understand the underlying distribution of data is critical in most parts of quantitative finance. Statistical tests can be applied for this purpose.',
 '[1]:\thttps://medium.com/quaintitative/data-exploration-in-pandas-f7cd1a3b3594\n[2]:\thttps://medium.com/quaintitative/quickstart-to-visualising-and-analysing-financial-data-with-pandas-bbd835c9c560\n[3]:\thttps://playgrdstar.github.io/portfolio_optimisation_with_tensorflow/\n[4]:\thttps://medium.com/creative-coding-space/3-days-of-hand-coding-visualisations-introduction-64da30d8793f\n[5]:\thttps://github.com/playgrdstar/portfolio_optimisation_with_tensorflow',
 '- Dec. 2021: "Learning User Interface Sema

In [97]:
## Long Context Re-ordering

In [81]:
#  Effect of this function is to alternate elements from the ends towards the center when looking at the original order of docs
def reorder_docs(docs):
    docs.reverse()
    reordered_result = []
    for i, value in enumerate(docs):
        if i % 2 == 1:
            reordered_result.append(value) # append even numbered elements
        else:
            reordered_result.insert(0, value) # add odd numbered elemenats to the start
    return reordered_result
                 
retrieved_docs = simple_retriever.get_relevant_documents(query)
reordered_docs = reorder_docs(docs)

print(get_text(reordered_docs)[:5])

In [99]:
def long_context_reorder_retrieval(query, retriever):
    retrieved_docs = retriever.get_relevant_documents(query)
    retrieved_docs.reverse()
    reordered_results = []
    for i, value in enumerate(retrieved_docs):
        if i % 2 == 1:
            reordered_results.append(value) # append even numbered elements
        else:
            reordered_results.insert(0, value) # add odd numbered elemenats to the start
    return get_text(reordered_results)

In [100]:
long_context_reorder_retrieval(query, simple_retriever)

['---\nlayout: default\ntitle:  How normal are you? Checking distributional assumptions.\ndescription: How normal are you? Checking distributional assumptions.\ndate:   2021-01-27 00:00:00 +0000\npermalink: /normality_distribution_test/\ncategory: Finance\n---\n## How normal are you? Checking distributional assumptions.\n\nThe need to understand the underlying distribution of data is critical in most parts of quantitative finance. Statistical tests can be applied for this purpose.',
 "And since I will almost certainly forget what went through my mind during this process, I decided to jot down some notes on key points that I learnt or went through my mind. \n\nI'm not very hopeful about selling any of my NFTs, given how crowded the marketplaces have become in the short span of a year or so, so this post would probably be my main takeaway.\n\nSome of these points could potentially be wrong, or outdated. If so, please let me know.",
 '---\nlayout: default\ntitle:  Monte Carlo Simulation o